// repotoire 0.9.0

//! SQL Injection detector
//!
//! Detects dangerous SQL patterns that can lead to SQL injection:
//!
//! - f-strings with SQL keywords and variable interpolation
//! - String concatenation in SQL queries
//! - .format() string interpolation in SQL
//! - % formatting in SQL queries
//!
//! CWE-89: Improper Neutralization of Special Elements used in an SQL Command
//!
//! # Phase 2j dual-branch stack structure
//!
//! Phase 2j of the dual-branch migration series migrates this detector
//! to the predictor-aware shape established by 2a–2i. Unlike the
//! `git mv`-based scaffolding moves used by 2g–2i (where the detector
//! file was renamed to a directory in commit 1), this detector is
//! **already** a directory — so the Phase 2j commit 1 is a no-op chore
//! that records the upcoming submodule layout here:
//!
//! - `predict` — weighted-scoring Phase 2j predictor + `SqlApi` /
//!   `UserInputSource` enums + Step 1.5 bidirectional collapses
//!   (D1.a Safe parameterized, D1.b Unsafe string-formatted,
//!   D1.c `.raw()` escape-hatch — the Phase 2j headline distinction).
//! - `annotation` — `sql-safe[<reason>]` / `sql-vulnerable[<source>]`
//!   re-export over the shared parser in `super::dual_branch_annotation`.
//! - `evidence` — AST-driven `tree_sitter` Python extractor that
//!   classifies each `cursor.execute(...)` / `Model.objects.raw(...)`
//!   / `db.execute(text(...), {...})` / SQLAlchemy / Django ORM call
//!   site into one of the `SqlApi` shapes.
//!
//! These submodules land in commits 3 (`predict`+`annotation`), 4
//! (`evidence`), and 5 (integration into [`SQLInjectionDetector::detect`]).
//! See `docs/superpowers/specs/2026-05-09-dual-branch-phase2-sql-injection-decisions.md`
//! for the full decisions doc.

// Phase 2j dual-branch submodules. Wired through `detect()` in
// commit 5 of the dual-branch migration stack. Scaffolded in
// commit 3 (predict + annotation) and commit 4 (evidence) with
// `#![allow(dead_code)]` so they compile without integration.
mod annotation;
mod evidence;
mod predict;

mod patterns;
pub(crate) use patterns::*;

#[cfg(test)]
mod tests;

use crate::detectors::base::{is_test_file, Detector, DetectorConfig};
use crate::detectors::framework_detection::{detect_frameworks, is_safe_orm_pattern};
use crate::detectors::taint::{TaintAnalysisResult, TaintAnalyzer, TaintCategory};
use crate::models::{deterministic_finding_id, Evidence, Finding, Severity, SourceSpan, Tier};
use anyhow::Result;
use regex::Regex;
use std::collections::HashSet;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use tracing::{debug, info};

// Static compiled regex patterns (compiled once, shared across all instances)

/// Python f-string that contains a SQL verb followed by a `{...}` interpolation.
///
/// Case-insensitive. Looks for `f"` / `f'`, then (lazily) one of the listed SQL
/// verbs as a whole word, then (lazily) a non-empty brace interpolation.
static FSTRING_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)f["'].*?\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b.*?\{[^}]+\}"#;
    Regex::new(pattern).expect("valid regex")
});

/// SQL verb followed, later in the text, by a closing quote and a `+` operator.
///
/// Case-insensitive. The verb must be a whole word; optional whitespace is
/// allowed between the quote (`"` or `'`) and the `+`.
static CONCAT_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b.*["']\s*\+"#;
    Regex::new(pattern).expect("valid regex")
});

/// SQL verb followed, later in the text, by a quote directly chained into `.format(`.
///
/// Case-insensitive. The verb must be a whole word; whitespace is tolerated
/// between `.format` and the opening parenthesis.
static FORMAT_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b.*["']\.format\s*\("#;
    Regex::new(pattern).expect("valid regex")
});

/// SQL verb followed by a `%s` / `%d` / `%r` specifier and then a quote applied with `%`.
///
/// Case-insensitive. The verb must be a whole word; optional whitespace is
/// allowed between the closing quote and the `%` operator.
static PERCENT_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b.*%[sdr].*["']\s*%"#;
    Regex::new(pattern).expect("valid regex")
});

/// Backtick-delimited template literal holding a SQL verb and a later `${...}` interpolation.
///
/// Case-insensitive. Both the verb (as a whole word) and the interpolation must
/// sit between the same pair of backticks — no backtick may appear in between.
static JS_TEMPLATE_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)`[^`]*\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b[^`]*\$\{[^}]+\}[^`]*`"#;
    Regex::new(pattern).expect("valid regex")
});

/// `fmt.Sprintf(` call whose string argument holds a SQL verb and a later format verb.
///
/// Case-insensitive. The argument may open with `"`, `'` or a backtick; the SQL
/// verb must be a whole word and be followed by `%` plus one of the listed
/// format letters, and then by another quote or backtick.
static GO_SPRINTF_SQL: LazyLock<Regex> = LazyLock::new(|| {
    let pattern = r#"(?i)fmt\.Sprintf\s*\(\s*["'`].*\b(SELECT|INSERT|UPDATE|DELETE|DROP|CREATE|ALTER|TRUNCATE|EXEC|EXECUTE)\b.*%[svdqxXfFeEgGtTpbcoU].*["'`]"#;
    Regex::new(pattern).expect("valid regex")
});

/// Detects potential SQL injection vulnerabilities
///
/// Construct with [`SQLInjectionDetector::new`],
/// [`SQLInjectionDetector::with_repository_path`] or
/// [`SQLInjectionDetector::with_config`].
pub struct SQLInjectionDetector {
    /// Detector configuration exactly as passed to `with_config`.
    config: DetectorConfig,
    /// Repository path supplied at construction (`new()` uses `"."`).
    repository_path: PathBuf,
    /// Cap on the number of findings the line scanner returns; read from the
    /// `max_findings` option, defaulting to 100.
    max_findings: usize,
    /// Directory names to skip, matched against whole `/`-separated path
    /// components; read from the `exclude_dirs` option, falling back to
    /// `DEFAULT_EXCLUDE_DIRS`.
    exclude_dirs: Vec<String>,
    // Taint analyzer for graph-based data flow
    taint_analyzer: TaintAnalyzer,
    // Pre-computed taint results (set by engine before detect())
    // NOTE(review): judging by the names these hold cross-function and
    // intra-function taint paths respectively — confirm against the engine.
    precomputed_cross: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
    precomputed_intra: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
}

impl SQLInjectionDetector {
    /// Build a detector with a default [`DetectorConfig`] and `"."` as the repository path.
    pub fn new() -> Self {
        let default_config = DetectorConfig::new();
        Self::with_config(default_config, PathBuf::from("."))
    }

    /// Build a detector with a default [`DetectorConfig`] for the given repository path.
    pub fn with_repository_path(repository_path: PathBuf) -> Self {
        let default_config = DetectorConfig::new();
        Self::with_config(default_config, repository_path)
    }

    /// Build a detector from an explicit configuration and repository path.
    ///
    /// Two options are read from `config`:
    /// - `max_findings` — cap on reported findings, defaulting to 100;
    /// - `exclude_dirs` — directory names to skip, defaulting to
    ///   `DEFAULT_EXCLUDE_DIRS` when the option is absent.
    ///
    /// The taint analyzer is freshly created and both pre-computed taint slots
    /// start out empty.
    pub fn with_config(config: DetectorConfig, repository_path: PathBuf) -> Self {
        let max_findings = config.get_option_or("max_findings", 100);

        let exclude_dirs = match config.get_option::<Vec<String>>("exclude_dirs") {
            Some(configured) => configured,
            None => DEFAULT_EXCLUDE_DIRS
                .iter()
                .map(|dir| dir.to_string())
                .collect(),
        };

        Self {
            config,
            repository_path,
            max_findings,
            exclude_dirs,
            taint_analyzer: TaintAnalyzer::new(),
            precomputed_cross: std::sync::OnceLock::new(),
            precomputed_intra: std::sync::OnceLock::new(),
        }
    }

    /// Decide whether `path` is skipped by the line scanner.
    ///
    /// A path is excluded when any of the following holds:
    /// 1. the shared `is_test_file` helper classifies it as a test file;
    /// 2. its (lossy) string form contains one of the ORM/database-internal
    ///    path fragments below — those files *are* the database layer;
    /// 3. one of its `/`-separated components equals an entry of
    ///    `self.exclude_dirs` (whole-component match, not substring).
    fn should_exclude(&self, path: &Path) -> bool {
        const DB_INTERNAL_FRAGMENTS: [&str; 4] = [
            "db/backends/",
            "db/models/sql/",
            "db/models/expressions",
            "core/cache/backends/",
        ];

        if is_test_file(path) {
            return true;
        }

        let rendered = path.to_string_lossy();

        if DB_INTERNAL_FRAGMENTS
            .iter()
            .any(|fragment| rendered.contains(fragment))
        {
            return true;
        }

        self.exclude_dirs
            .iter()
            .any(|excluded| rendered.split('/').any(|component| component == excluded))
    }

    /// Report whether `line` looks like an error, exception or logging statement.
    ///
    /// Such lines build human-readable messages rather than SQL queries, so a
    /// template literal found on them is not treated as a query. Matching is a
    /// plain, case-sensitive substring test against the markers below.
    fn is_error_or_log_context(line: &str) -> bool {
        const NON_QUERY_MARKERS: &[&str] = &[
            "Error(",
            "error(",
            "throw ",
            "throw(",
            "console.",
            "log(",
            "warn(",
            "debug(",
            "info(",
            "trace(",
            "panic!(",
            "bail!(",
            "anyhow!(",
            "mkErr(",
            "reject(",
            "println!(",
            "eprintln!(",
            "tracing::",
            "logger.",
            "Logger.",
            "raise ",
            "Exception(",
        ];

        NON_QUERY_MARKERS
            .iter()
            .any(|marker| line.contains(marker))
    }

    /// Report whether `line` uses one of the known SQL tagged-template prefixes.
    ///
    /// Tagged templates such as sql`...`, Prisma.sql`...` or db.sql`...` are
    /// treated as safe because the tag parameterizes the interpolations
    /// instead of splicing them into the query text.
    fn is_safe_tagged_template(&self, line: &str) -> bool {
        // Tag prefixes, each ending in the opening backtick.
        const SAFE_TAGS: [&str; 5] = [
            "sql`",        // Drizzle, Slonik, postgres.js
            ".sql`",       // db.sql`, Prisma.sql`
            "Prisma.sql`", // Prisma
            "raw`",        // Some ORMs
            "sqlstring`",  // sqlstring library
        ];

        // No tag starts or ends with whitespace, so searching the untrimmed
        // line finds exactly the same occurrences as searching the trimmed
        // line, and a space-prefixed tag is already covered by the plain tag.
        SAFE_TAGS.iter().any(|tag| line.contains(tag))
    }

    /// Report whether a SQL keyword on `line` is really just a JavaScript variable name.
    ///
    /// Example: in `${insert.id}` the word "insert" names a variable, not the SQL
    /// `INSERT` verb. The line is a false positive when, for at least one of the
    /// keywords, an interpolation starts with that keyword (`${insert…`) and
    /// the keyword does not also occur in the text before the first `${`.
    /// Comparison is case-insensitive.
    fn is_variable_name_false_positive(&self, line: &str) -> bool {
        const KEYWORDS: [&str; 4] = ["insert", "update", "delete", "select"];

        let lowered = line.to_lowercase();

        // Everything ahead of the first interpolation; the whole line when
        // there is no `${` at all.
        let before_first_interpolation = lowered.split("${").next().unwrap_or("");

        KEYWORDS.iter().any(|keyword| {
            let as_variable = format!("${{{}", keyword);
            lowered.contains(&as_variable) && !before_first_interpolation.contains(*keyword)
        })
    }

    /// Report whether `line` carries something that looks like a bound-parameter placeholder.
    ///
    /// When interpolation appears next to real placeholders, the interpolated
    /// part is more likely SQL structure than user data. Recognized shapes:
    /// `@name`, `$1`-style positional markers, `:name`, and a `?` that is not
    /// embedded in a word.
    fn has_parameterized_placeholders(&self, line: &str) -> bool {
        static PARAM_PATTERNS: LazyLock<Vec<Regex>> = LazyLock::new(|| {
            [
                r"@\w+",                                  // @paramName
                r"\$\d+",                                 // $1, $2
                r":\w+",                                  // :param
                r"(?:^|[^a-zA-Z0-9])\?(?:[^a-zA-Z0-9]|$)", // ?
            ]
            .iter()
            .map(|source| Regex::new(source).expect("valid regex"))
            .collect()
        });

        if PARAM_PATTERNS.iter().any(|pattern| pattern.is_match(line)) {
            return true;
        }

        // Extra literal checks for a `?` that is not in the middle of a word.
        line.contains(" ?") || line.ends_with("?") || line.contains("?,") || line.contains("= ?")
    }

    /// Report whether `line` builds a list of `?` placeholders rather than splicing data.
    ///
    /// Recognized shapes (matched case-insensitively by substring):
    /// - `.map(… => '?').join(…)` or the same with `"?"`;
    /// - `Array(n).fill('?').join(…)`;
    /// - `new Array(n).fill('?')` (no `.join` required).
    ///
    /// Such expressions can only yield strings like `?,?,?`.
    fn is_placeholder_generation_pattern(&self, line: &str) -> bool {
        let lowered = line.to_lowercase();
        let has = |needle: &str| lowered.contains(needle);

        // `.map(...)` producing a quoted question mark, then joined.
        let quoted_placeholder = has("'?'") || has("\"?\"");
        if has(".map(") && quoted_placeholder && has(".join") {
            return true;
        }

        // `Array(...)` / `new Array(...)` filled with question marks.
        let filled_with_placeholder = has(".fill('?')");
        let joined_array_fill = has("array(") && filled_with_placeholder && has(".join");
        let new_array_fill = has("new array") && filled_with_placeholder;

        joined_array_fill || new_array_fill
    }

    /// Report whether any `${name…}` interpolation on `line` is named like a SQL clause builder.
    ///
    /// Names such as `${where}`, `${orderBy}` or `${columns}` suggest the value
    /// is query structure, not user data. Only the leading identifier of each
    /// interpolation is examined, and it is compared case-insensitively.
    fn is_sql_structure_variable(&self, line: &str) -> bool {
        // Captures the identifier that immediately follows `${`.
        static INTERP_RE: LazyLock<Regex> =
            LazyLock::new(|| Regex::new(r"\$\{(\w+)").expect("valid regex"));

        // Lower-cased variable names commonly used for SQL structure.
        const STRUCTURE_NAMES: &[&str] = &[
            "where",
            "orderby",
            "order_by",
            "sortby",
            "sort_by",
            "columns",
            "fields",
            "select",
            "joins",
            "groupby",
            "group_by",
            "having",
            "limit",
            "offset",
            "tablename",
            "table_name",
            "sortcolumn",
            "sort_column",
            "sortdirection",
            "sort_direction",
            "conditions",
            "clause",
            "clauses",
            "filters",
            "sorts",
            "placeholders",
        ];

        INTERP_RE
            .captures_iter(line)
            .filter_map(|captures| captures.get(1))
            .any(|identifier| {
                let lowered = identifier.as_str().to_lowercase();
                STRUCTURE_NAMES.contains(&lowered.as_str())
            })
    }

    /// Report whether `line` calls one of the known identifier-quoting helpers
    /// (e.g. `quote_name()`), matched case-insensitively by substring.
    fn is_sanitized_value(&self, line: &str) -> bool {
        const SANITIZER_CALLS: [&str; 4] =
            ["quote_name(", "escape_name(", "quote_ident(", "quotename("];

        let lowered = line.to_lowercase();
        SANITIZER_CALLS.iter().any(|call| lowered.contains(call))
    }

    /// Check a line for dangerous SQL string-building patterns.
    ///
    /// `line` is either one source line or several continuation lines that the
    /// caller has joined into one string.
    ///
    /// Returns `Some((pattern_type, is_likely_false_positive))` for the first
    /// pattern that matches, tried in this order: `"f-string"`,
    /// `"concatenation"`, `"format"`, `"percent_format"`, `"js_template"`,
    /// `"go_sprintf"`. The flag is set when the line also carries parameterized
    /// placeholders or interpolates a SQL-structure variable.
    ///
    /// Returns `None` when:
    /// - the trimmed line starts with `#`;
    /// - the line is a logging / error / assertion / test-framework statement;
    /// - a JS template literal matched but only interpolates a generated
    ///   placeholder list (`?,?,?`);
    /// - no pattern matches.
    fn check_line_for_patterns(&self, line: &str) -> Option<(&'static str, bool)> {
        // Lower-case substrings that mark lines which may mention SQL keywords
        // coincidentally (logging, error construction, assertions).
        const NON_SQL_MARKERS: &[&str] = &[
            "console.log",
            "console.error",
            "console.warn",
            "console.info",
            "console.debug",
            "console.trace",
            "console.dir",
            ".log.",
            "log.error",
            "log.info",
            "log.warn",
            "log.debug",
            "logger.",
            // Node.js logging libraries
            "winston.",
            "pino.",
            "bunyan.",
            "log4js.",
            "morgan(",
            "throw new error",
            "throw error",
            "new error(",
            "reject(",
            "assert.",
        ];

        // Test-framework call names. These are short enough to be the tail of
        // unrelated identifiers (`commit(`, `init(`, `split(`, `latest(`), so
        // they only count when they are not preceded by an identifier character.
        const TEST_FRAMEWORK_CALLS: &[&str] = &["expect(", "test(", "describe(", "it("];

        // True when `call` occurs in `haystack` at a position that is not the
        // continuation of a longer identifier.
        fn has_standalone_call(haystack: &str, call: &str) -> bool {
            haystack.match_indices(call).any(|(start, _)| {
                haystack[..start]
                    .chars()
                    .next_back()
                    .map_or(true, |prev| !(prev.is_alphanumeric() || prev == '_'))
            })
        }

        if line.trim().starts_with('#') {
            return None;
        }

        // Skip obvious non-SQL contexts that might contain SQL keywords coincidentally
        let line_lower = line.to_lowercase();
        let is_non_sql_context = NON_SQL_MARKERS
            .iter()
            .any(|marker| line_lower.contains(marker))
            || TEST_FRAMEWORK_CALLS
                .iter()
                .any(|call| has_standalone_call(&line_lower, call));
        if is_non_sql_context {
            return None;
        }

        // Shared false-positive signal for every interpolation pattern.
        let is_likely_fp =
            self.has_parameterized_placeholders(line) || self.is_sql_structure_variable(line);

        // Check f-string pattern
        if FSTRING_SQL.is_match(line) {
            return Some(("f-string", is_likely_fp));
        }

        // Check concatenation pattern
        if CONCAT_SQL.is_match(line) {
            return Some(("concatenation", is_likely_fp));
        }

        // Check .format() pattern
        if FORMAT_SQL.is_match(line) {
            return Some(("format", is_likely_fp));
        }

        // Check % formatting pattern
        if PERCENT_SQL.is_match(line) {
            return Some(("percent_format", is_likely_fp));
        }

        // Check JavaScript template literal pattern, skipping:
        // - safe tagged templates (Drizzle sql``, Prisma.sql``, etc.)
        // - lines where the SQL keyword is actually a variable name (${insert.id})
        // - error/log contexts (never SQL queries)
        if JS_TEMPLATE_SQL.is_match(line)
            && !self.is_safe_tagged_template(line)
            && !self.is_variable_name_false_positive(line)
            && !Self::is_error_or_log_context(line)
        {
            // Complete skip for placeholder generation - this can ONLY produce safe strings
            if self.is_placeholder_generation_pattern(line) {
                return None;
            }
            return Some(("js_template", is_likely_fp));
        }

        // Check Go fmt.Sprintf pattern
        if GO_SPRINTF_SQL.is_match(line) {
            return Some(("go_sprintf", is_likely_fp));
        }

        None
    }

    /// Report whether `line` looks like it executes or builds SQL.
    ///
    /// All checks are case-insensitive substring tests. The line counts as SQL
    /// context when it calls a known sink function (`.<func>(`), touches a known
    /// SQL object (`<obj>.`), or matches one of the Django/SQLAlchemy,
    /// JavaScript/Node.js or Go markers below.
    fn is_sql_context(&self, line: &str) -> bool {
        let lowered = line.to_lowercase();
        let has = |needle: &str| lowered.contains(needle);
        let mentions_dml = || {
            ["select", "insert", "update", "delete"]
                .iter()
                .any(|&keyword| has(keyword))
        };

        // Known SQL sink function calls.
        for func in SQL_SINK_FUNCTIONS {
            if has(&format!(".{}(", func)) {
                return true;
            }
        }

        // Known SQL object receivers.
        for obj in SQL_OBJECT_PATTERNS {
            if has(&format!("{}.", obj)) {
                return true;
            }
        }

        // Django / SQLAlchemy: `.objects.raw(` or `text(` next to a DML keyword.
        let python_orm = has(".objects.raw(") || (has("text(") && mentions_dml());

        // JavaScript / Node.js: generic query/execute calls and library names.
        let js_call = has(".query(") || has(".execute(");
        let js_library = has("mysql.") || has("pg.") || has("sequelize") || has("knex");

        // pool.* and client.* only count together with a SQL-specific method.
        let js_pool_or_client = (has("pool.") || has("client."))
            && [
                ".query",
                ".execute",
                ".prepare",
                ".run",
                ".all(",
                ".get(",
                ".connect",
            ]
            .iter()
            .any(|&method| has(method));

        // Go: database/sql style calls, or fmt.Sprintf next to a DML keyword.
        let go_row_query = has(".queryrow(") || has(".queryrowcontext(");
        let go_db_call =
            has("sql.open") || has("db.query") || has("db.exec") || has("db.prepare");
        let go_sprintf = has("fmt.sprintf") && mentions_dml();

        python_orm
            || js_call
            || js_library
            || js_pool_or_client
            || go_row_query
            || go_db_call
            || go_sprintf
    }

    /// Scan source files for dangerous SQL patterns using the given FileProvider.
    ///
    /// When `skip_python` is true (Phase 2j dual-branch flag-on path),
    /// `.py` files are not scanned by the legacy line-regex pass —
    /// the AST predictor in `scan_python_file_dual_branch` handles
    /// them instead.
    ///
    /// Files with the extensions `py`, `js`, `ts`, `go` and `java` are visited.
    /// A file is skipped when `should_exclude` rejects it, when the provider
    /// has no content for it, or when its content exceeds 500,000 bytes.
    ///
    /// Each remaining line goes through these steps, in order:
    /// 1. suppression-comment check (current and previous line);
    /// 2. continuation joining (up to three following lines);
    /// 3. pattern classification via `check_line_for_patterns`;
    /// 4. safe-ORM and sanitized-identifier filters;
    /// 5. SQL-context requirement (same line, else the line before/after);
    /// 6. likely-false-positive filter.
    ///
    /// Returns the collected findings; scanning stops early as soon as
    /// `self.max_findings` findings have been gathered.
    fn scan_source_files_from_provider_filtered(
        &self,
        fp: &crate::detectors::analysis_context::AnalysisContextFileProvider<'_>,
        skip_python: bool,
    ) -> Vec<Finding> {
        let mut findings = Vec::new();
        // (file path, 1-based line) pairs already reported, to avoid duplicates.
        let mut seen_locations: HashSet<(String, u32)> = HashSet::new();

        // Detect ORMs/frameworks to skip safe parameterized patterns
        let detected_frameworks = detect_frameworks(fp.repo_path());
        debug!(
            "Detected {} frameworks for ORM pattern detection",
            detected_frameworks.len()
        );

        debug!("Scanning for SQL injection via FileProvider");

        // Walk through Python, JavaScript, TypeScript, Go, and Java files via FileProvider
        for path in fp.files_with_extensions(&["py", "js", "ts", "go", "java"]) {
            // Phase 2j: when the dual-branch flag is on, `.py` files
            // are handled by `scan_python_file_dual_branch` in
            // `detect()` — skip them here to avoid duplicate findings.
            if skip_python && path.extension().and_then(|e| e.to_str()) == Some("py") {
                continue;
            }
            if self.should_exclude(path) {
                debug!("Excluding file: {:?}", path);
                continue;
            }

            let rel_path = path.to_string_lossy().to_string();

            let content = match fp.content(path) {
                Some(c) => c,
                None => continue,
            };
            let content = content.as_str();

            // Skip very large files
            if content.len() > 500_000 {
                continue;
            }

            let lines: Vec<&str> = content.lines().collect();
            let mut skip_until = 0usize; // Skip joined continuation lines (#70)
            for (line_no, line) in lines.iter().enumerate() {
                // Lines already folded into an earlier joined line are not
                // examined again on their own.
                if line_no < skip_until {
                    continue;
                }
                // Findings use 1-based line numbers.
                let line_num = (line_no + 1) as u32;

                // Check for suppression comments
                let prev_line = if line_no > 0 {
                    Some(lines[line_no - 1])
                } else {
                    None
                };
                if crate::detectors::is_line_suppressed(line, prev_line) {
                    continue;
                }

                // Join continuation lines for multiline query detection (#26)
                // If the line ends with `+`, `||`, `\`, `..`, or `,`, append up to
                // three following lines (trimmed, space-separated). Joining stops
                // after the first appended line that does not itself end with
                // `+`, `||`, `\`, or `,` (note: `..` is not checked here).
                let check_line = {
                    let trimmed = line.trim_end();
                    if trimmed.ends_with('+')
                        || trimmed.ends_with("||")
                        || trimmed.ends_with('\\')
                        || trimmed.ends_with("..")
                        || trimmed.ends_with(',')
                    {
                        let mut joined = line.to_string();
                        let mut joined_count = 0usize;
                        for next in lines.iter().skip(line_no + 1).take(3) {
                            joined.push(' ');
                            joined.push_str(next.trim());
                            joined_count += 1;
                            let next_trimmed = next.trim_end();
                            if !next_trimmed.ends_with('+')
                                && !next_trimmed.ends_with("||")
                                && !next_trimmed.ends_with('\\')
                                && !next_trimmed.ends_with(',')
                            {
                                break;
                            }
                        }
                        skip_until = line_no + 1 + joined_count; // skip joined lines (#70)
                        joined
                    } else {
                        line.to_string()
                    }
                };

                if let Some((pattern_type, is_likely_fp)) =
                    self.check_line_for_patterns(&check_line)
                {
                    // Skip if line contains a safe ORM pattern (e.g., Prisma, Drizzle parameterized queries)
                    // is_safe_orm_pattern checks for unsafe raw SQL patterns first, then safe patterns
                    // Note: this check sees the original line, not the joined text.
                    if is_safe_orm_pattern(line, &detected_frameworks) {
                        debug!("Skipping safe ORM pattern at {}:{}", rel_path, line_num);
                        continue;
                    }

                    // Skip lines with sanitized SQL identifiers (quote_name, etc.)
                    if self.is_sanitized_value(&check_line) {
                        debug!("Skipping sanitized SQL value at {}:{}", rel_path, line_num);
                        continue;
                    }

                    // go_sprintf patterns are self-evidently SQL (fmt.Sprintf with SQL keyword
                    // is always building a SQL string). js_template is NOT self-evident —
                    // a template literal with "create" could be English ("create succeeded")
                    // rather than SQL. JS template literals must pass the SQL context check
                    // (nearby .query(), .execute(), etc.) to reduce false positives.
                    let is_self_evident_sql = pattern_type == "go_sprintf";

                    // Check if this line directly contains SQL context
                    // (evaluated on the original line, not the joined text)
                    let has_direct_sql_context = is_self_evident_sql || self.is_sql_context(line);

                    // Require SQL context to reduce false positives
                    // "create directory" with f-string is not SQL injection
                    if !has_direct_sql_context {
                        // Check surrounding lines for context: the line immediately
                        // before and the line immediately after, when they exist.
                        let has_surrounding_sql_context = (line_no > 0
                            && self.is_sql_context(lines[line_no - 1]))
                            || (line_no + 1 < lines.len()
                                && self.is_sql_context(lines[line_no + 1]));
                        if !has_surrounding_sql_context {
                            continue;
                        }
                    }

                    let loc = (rel_path.clone(), line_num);
                    if seen_locations.contains(&loc) {
                        continue;
                    }
                    seen_locations.insert(loc);

                    // Skip entirely when is_likely_fp — the query uses parameterized
                    // placeholders alongside interpolation, meaning the interpolated
                    // parts are SQL structure (table/column names from whitelists),
                    // not user input.  False-positive rate is too high to report.
                    if is_likely_fp {
                        debug!(
                            "Skipping likely-FP SQL injection at {}:{} (parameterized + interpolation)",
                            rel_path, line_num
                        );
                        continue;
                    }

                    // `is_likely_fp` is always false here because of the `continue`
                    // above; the snippet is the trimmed original line.
                    findings.push(self.create_finding(
                        &rel_path,
                        line_num,
                        pattern_type,
                        line.trim(),
                        has_direct_sql_context,
                        is_likely_fp,
                    ));

                    // Stop as soon as the configured cap is reached.
                    if findings.len() >= self.max_findings {
                        return findings;
                    }
                }
            }
        }

        findings
    }

    /// Build the [`Finding`] reported for one suspicious line.
    ///
    /// - `file_path` / `line_start`: location; the finding covers that single line.
    /// - `pattern_type`: classifier tag (`"f-string"`, `"concatenation"`,
    ///   `"format"`, `"percent_format"`, `"js_template"`, `"go_sprintf"`); any
    ///   other value is described as "dynamic SQL construction".
    /// - `snippet`: source text embedded in the description's code block.
    /// - `has_direct_sql_context`: whether SQL context was established for the
    ///   flagged line itself rather than only for a neighbouring line.
    /// - `is_likely_fp`: adds a false-positive note and lowers severity/confidence.
    ///
    /// Severity and confidence are assigned together:
    ///
    /// | condition                                              | severity | confidence |
    /// |--------------------------------------------------------|----------|------------|
    /// | `is_likely_fp`                                         | Medium   | 0.50       |
    /// | direct context and pattern is `go_sprintf`/`js_template` | Critical | 0.95       |
    /// | direct context, any other pattern                      | High     | 0.85       |
    /// | otherwise                                              | Medium   | 0.70       |
    fn create_finding(
        &self,
        file_path: &str,
        line_start: u32,
        pattern_type: &str,
        snippet: &str,
        has_direct_sql_context: bool,
        is_likely_fp: bool,
    ) -> Finding {
        // Human-readable label for the matched pattern.
        let pattern_desc = match pattern_type {
            "f-string" => "f-string with variable interpolation in SQL query",
            "concatenation" => "string concatenation in SQL query",
            "format" => ".format() string interpolation in SQL query",
            "percent_format" => "% string formatting in SQL query",
            "js_template" => "JavaScript template literal with interpolation in SQL query",
            "go_sprintf" => "Go fmt.Sprintf with string interpolation in SQL query",
            _ => "dynamic SQL construction",
        };

        // Language drives both code-block highlighting and the fix examples.
        let language = detect_language(file_path);

        let mut description = format!(
            "**Potential SQL Injection Vulnerability**\n\n\
             **Pattern detected**: {}\n\n\
             **Location**: {}:{}\n\n\
             **Code snippet**:\n```{}\n{}\n```\n\n\
             SQL injection occurs when untrusted input is incorporated into SQL queries without\n\
             proper sanitization. An attacker could manipulate the query to:\n\
             - Access unauthorized data\n\
             - Modify or delete database records\n\
             - Execute administrative operations\n\
             - In some cases, execute operating system commands\n\n\
             This vulnerability is classified as **CWE-89: Improper Neutralization of Special\n\
             Elements used in an SQL Command ('SQL Injection')**.",
            pattern_desc, file_path, line_start, language, snippet
        );

        // Tell the reader why severity was lowered.
        if is_likely_fp {
            description.push_str(
                "\n\n**Note**: This query appears to use parameterized placeholders or \
                 interpolate SQL structure (table/column names, WHERE clauses) rather than \
                 user values. If the interpolated values are from a whitelist or hardcoded \
                 strings, this may be a false positive. Severity has been reduced accordingly.",
            );
        }

        let suggested_fix = get_fix_examples(language);

        // Severity and confidence move in lock-step, so pick them as a pair.
        let is_self_evident_sql = pattern_type == "go_sprintf" || pattern_type == "js_template";
        let (severity, confidence) = if is_likely_fp {
            // Likely false positive - reduced severity and confidence
            (Severity::Medium, 0.50)
        } else if has_direct_sql_context && is_self_evident_sql {
            // Direct SQL sink with string interpolation - highest confidence
            (Severity::Critical, 0.95)
        } else if has_direct_sql_context {
            // Direct SQL context, but not a self-evident pattern
            (Severity::High, 0.85)
        } else {
            // SQL context from surrounding lines only
            (Severity::Medium, 0.70)
        };

        let id = deterministic_finding_id(
            "SQLInjectionDetector",
            file_path,
            line_start,
            pattern_type,
        );

        Finding {
            id,
            detector: "SQLInjectionDetector".to_string(),
            severity,
            title: "Potential SQL Injection (CWE-89)".to_string(),
            description,
            affected_files: vec![PathBuf::from(file_path)],
            line_start: Some(line_start),
            line_end: Some(line_start),
            suggested_fix: Some(suggested_fix.to_string()),
            estimated_effort: Some("Medium (1-4 hours)".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-89".to_string()),
            why_it_matters: Some(
                "SQL injection is one of the most dangerous vulnerabilities, allowing attackers \
                 to access, modify, or delete sensitive data in the database."
                    .to_string(),
            ),
            confidence: Some(confidence),
            ..Default::default()
        }
    }

    /// AST-based Python pass for the Phase 2j dual-branch pipeline.
    ///
    /// The file is parsed once. Each SQL sink call site reported by
    /// [`evidence::collect_python_sql_sites`] that is not suppressed is
    /// passed to `build_dual_branch_finding`, producing one dual-branch
    /// finding per site. This pass stands in for the legacy line-regex
    /// scan of `.py` files when the `sql-injection` dual-branch flag is on.
    ///
    /// The result is empty when the content contains a NUL byte, when the
    /// parse fails, or when the collector yields no call sites.
    ///
    /// Mirrors `nosql_injection::scan_python_file_dual_branch`
    /// (Phase 2i) and the 2g–2h predecessors.
    fn scan_python_file_dual_branch(&self, path: &Path, content: &str) -> Vec<Finding> {
        // Content with an embedded NUL byte is not scanned at all.
        if content.contains('\0') {
            return Vec::new();
        }

        let parsed = crate::detectors::ast_fingerprint::parse_root_ext(
            content,
            crate::parsers::lightweight::Language::Python,
            "py",
        );
        let tree = match parsed {
            Some(tree) => tree,
            None => return Vec::new(),
        };

        let module_root = tree.root_node();
        let bytes = content.as_bytes();
        let source_lines: Vec<&str> = content.lines().collect();

        let findings: Vec<Finding> = evidence::collect_python_sql_sites(module_root, bytes)
            .into_iter()
            .filter_map(|site| {
                let row = site.call_node.start_position().row;
                let current = source_lines.get(row).copied();

                // Inline suppressions (`# repotoire: ignore` and friends) are
                // honored exactly like on the legacy path, so a site that was
                // silenced there does not resurface once the flag is enabled.
                if let Some(text) = current {
                    let above = row
                        .checked_sub(1)
                        .and_then(|idx| source_lines.get(idx).copied());
                    if crate::detectors::is_line_suppressed(text, above) {
                        return None;
                    }
                }

                Some(self.build_dual_branch_finding(
                    path,
                    // Rows are zero-based; findings report one-based lines.
                    (row + 1) as u32,
                    site.api,
                    site.callee_label.clone(),
                    current.map(str::trim).unwrap_or(""),
                    site.call_node,
                    module_root,
                    bytes,
                    &source_lines,
                ))
            })
            .collect();

        findings
    }

    /// Build a dual-branch Finding for a single Python SQL call site.
    ///
    /// Mirrors `nosql_injection::build_dual_branch_finding` (Phase 2i):
    /// pull evidence, run the predictor, pick a title / description /
    /// fix per branch label, attach the alternative branch + every
    /// prediction reason + every resolution signal.
    #[allow(clippy::too_many_arguments)]
    fn build_dual_branch_finding(
        &self,
        path: &Path,
        line_num: u32,
        api: predict::SqlApi,
        callee_label: String,
        snippet: &str,
        call_node: tree_sitter::Node<'_>,
        module_root: tree_sitter::Node<'_>,
        source: &[u8],
        lines: &[&str],
    ) -> Finding {
        let file_path_str = path.to_string_lossy().to_string();
        let ev = evidence::extract_python_evidence(
            call_node,
            module_root,
            source,
            lines,
            Some(file_path_str),
            api,
            callee_label.clone(),
        );
        let prediction = predict::predict(&ev);

        let predicted_label = prediction.predicted;
        let predicted_severity = prediction.predicted_severity;
        let predicted_title = match predicted_label {
            crate::dual_branch::BranchLabel::RealBug => {
                format!("Potential SQL injection via `{callee_label}`")
            }
            crate::dual_branch::BranchLabel::Benign => {
                format!("Safe SQL call via `{callee_label}` (informational)")
            }
        };
        let predicted_description = format!(
            "**SQL Injection (dual-branch, CWE-89)**\n\n\
             **API**: `{}`\n\n\
             **Location**: {}:{}\n\n\
             **Code**:\n```python\n{}\n```\n\n\
             {}",
            callee_label,
            path.display(),
            line_num,
            snippet,
            match predicted_label {
                crate::dual_branch::BranchLabel::RealBug => format!(
                    "The `{callee_label}` call site appears to construct \
                     a SQL query via string formatting (f-string, `+`-\
                     concat, `.format()`, or `%`-operator) on user input, \
                     or invokes Django's `.raw()` / `.extra()` escape \
                     hatch with formatted SQL (D1.c — the Phase 2j \
                     headline). The predictor leans RealBug (see \
                     `prediction_reasons`)."
                ),
                crate::dual_branch::BranchLabel::Benign => format!(
                    "The `{callee_label}` call site appears to use a \
                     parameterized SQL API (e.g., `execute(literal, \
                     values)` with a separate values arg), a safe \
                     Django ORM keyword filter (`.filter()` / `.get()` \
                     / `.create()`), or SQLAlchemy `text(<literal>)` \
                     with a bound-params dict. The predictor leans \
                     Benign (see `prediction_reasons`); the alternative \
                     RealBug interpretation is carried in \
                     `alternative_branch` for users who want to inspect \
                     the call regardless."
                ),
            },
        );
        let predicted_fix = match predicted_label {
            crate::dual_branch::BranchLabel::RealBug => Some(
                "Use parameterized queries:\n\
                 ```python\n\
                 # Instead of:\n\
                 cursor.execute(f\"SELECT * FROM u WHERE id = {uid}\")\n\
                 User.objects.raw(\"SELECT * FROM u WHERE id = \" + uid)\n\
                 \n\
                 # Use parameterized SQL:\n\
                 cursor.execute(\"SELECT * FROM u WHERE id = %s\", (uid,))\n\
                 \n\
                 # Or the Django ORM keyword-filter form:\n\
                 User.objects.filter(id=uid)\n\
                 \n\
                 # Or SQLAlchemy text() + bound params:\n\
                 db.execute(text(\"SELECT :id\"), {\"id\": uid})\n\
                 ```\n\n\
                 If the call is intentionally constructing SQL the \
                 predictor cannot trace (cross-statement assembly, \
                 type-cast laundering inside `.format()`, etc.), \
                 annotate the call site with \
                 `# repotoire: sql-safe[<reason>]` to collapse the \
                 finding to Info."
                    .to_string(),
            ),
            crate::dual_branch::BranchLabel::Benign => Some(
                "If you need the predictor to surface this site (e.g. \
                 you're auditing every SQL query regardless), annotate \
                 the line with `# repotoire: sql-vulnerable[<source>]` \
                 where `<source>` is the rationale (e.g. \
                 `helper-built-query`)."
                    .to_string(),
            ),
        };

        let mut finding = Finding {
            id: deterministic_finding_id(
                "SQLInjectionDetector",
                &path.to_string_lossy(),
                line_num,
                "dual_branch",
            ),
            detector: "SQLInjectionDetector".to_string(),
            severity: predicted_severity,
            title: predicted_title,
            description: predicted_description,
            affected_files: vec![path.to_path_buf()],
            line_start: Some(line_num),
            line_end: Some(line_num),
            suggested_fix: predicted_fix,
            estimated_effort: Some("30 minutes".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-89".to_string()),
            why_it_matters: Some(
                "SQL injection is one of the most dangerous \
                 vulnerabilities, allowing attackers to access, modify, \
                 or delete sensitive data in the database."
                    .to_string(),
            ),
            ..Default::default()
        };

        finding = finding.with_alternative_branch(prediction.alternative_branch);
        for reason in prediction.reasons {
            finding = finding.with_prediction_reason(reason);
        }
        for resolution in prediction.resolutions {
            finding = finding.with_resolution_signal(resolution);
        }
        finding
    }
}

impl Default for SQLInjectionDetector {
    fn default() -> Self {
        Self::new()
    }
}

impl Detector for SQLInjectionDetector {
    /// Stable detector identifier; the same string is written into every
    /// finding's `detector` field.
    fn name(&self) -> &'static str {
        "SQLInjectionDetector"
    }

    /// One-line human-readable summary of what the detector looks for.
    fn description(&self) -> &'static str {
        "Detects potential SQL injection vulnerabilities from string interpolation in queries"
    }

    /// Always `true`.
    ///
    /// NOTE(review): presumably opts this detector's findings out of the
    /// shared post-processing stage — confirm against the `Detector` trait.
    fn bypass_postprocessor(&self) -> bool {
        true
    }

    /// Category label for this detector.
    fn category(&self) -> &'static str {
        "security"
    }

    /// Exposes the detector's configuration.
    fn config(&self) -> Option<&DetectorConfig> {
        Some(&self.config)
    }

    /// Stores taint paths computed elsewhere so `detect` can reuse them
    /// instead of running its own analysis.
    ///
    /// The results of both `set` calls are deliberately ignored.
    /// NOTE(review): the cells look like set-once cells (`OnceLock`-style),
    /// in which case a second call keeps the first value — confirm.
    fn set_precomputed_taint(
        &self,
        cross: Vec<crate::detectors::taint::TaintPath>,
        intra: Vec<crate::detectors::taint::TaintPath>,
    ) {
        let _ = self.precomputed_cross.set(cross);
        let _ = self.precomputed_intra.set(intra);
    }

    /// Taint category this detector consumes.
    fn taint_category(&self) -> Option<crate::detectors::taint::TaintCategory> {
        Some(TaintCategory::SqlInjection)
    }

    /// File extensions (without the dot) this detector applies to.
    fn file_extensions(&self) -> &'static [&'static str] {
        &[
            "py", "js", "ts", "jsx", "tsx", "rb", "php", "java", "go", "rs",
        ]
    }

    /// Content flags declared by this detector (`HAS_SQL`).
    fn content_requirements(&self) -> crate::detectors::detector_context::ContentFlags {
        crate::detectors::detector_context::ContentFlags::HAS_SQL
    }

    /// Runs the full SQL injection analysis and returns the findings.
    ///
    /// Pipeline:
    /// 1. Pattern-based scan of the source files.
    /// 2. (flag on) AST-driven dual-branch scan of `.py` files, capped at
    ///    `max_findings` in total.
    /// 3. Cross-function and intra-function taint paths, precomputed when
    ///    available, otherwise computed here.
    /// 4. Findings are adjusted by the first taint path touching their file:
    ///    a sanitized path downgrades to `Info`; an unsanitized path upgrades
    ///    non-dual-branch findings to `Critical` / `Tier::Blocking`.
    ///    Dual-branch findings keep their predictor-chosen severity and only
    ///    receive the explanatory note.
    /// 5. Vulnerable taint paths without a finding at their sink location
    ///    get a dedicated taint finding.
    /// 6. `Info` findings are dropped unless they are dual-branch findings.
    ///
    /// This method itself never produces an `Err`; the `Result` is part of
    /// the trait signature.
    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        debug!("Starting SQL injection detection with taint analysis");

        // Phase 2j dual-branch gate. When `true`, Python `.py` files
        // go through the AST-driven predictor path
        // (`scan_python_file_dual_branch`) and skip the legacy line
        // scanner. Other languages and the flag-off path are
        // unchanged. Symmetric with nosql-injection (Phase 2i),
        // insecure-deserialize (Phase 2h), xxe (Phase 2g),
        // command-injection.
        let flag_on = ctx.dual_branch.is_enabled_for("sql-injection");

        // Step 1: Run pattern-based detection (existing logic)
        let fp = ctx.as_file_provider();
        let mut findings = self.scan_source_files_from_provider_filtered(&fp, flag_on);

        // Step 1.5: Phase 2j AST-driven predictor path for Python
        // when the dual-branch flag is on. The total number of findings
        // never grows past `max_findings`.
        if flag_on {
            for path in fp.files_with_extensions(&["py"]) {
                let remaining = self.max_findings.saturating_sub(findings.len());
                if remaining == 0 {
                    break;
                }
                if self.should_exclude(path) {
                    continue;
                }
                if let Some(content) = fp.content(path) {
                    findings.extend(
                        self.scan_python_file_dual_branch(path, &content)
                            .into_iter()
                            .take(remaining),
                    );
                }
            }
        }

        // Step 2: Run graph-based taint analysis to find data flow paths
        // Use precomputed results if available, otherwise fall back to own analysis
        let mut taint_paths = if let Some(cross) = self.precomputed_cross.get() {
            cross.clone()
        } else {
            self.taint_analyzer
                .trace_taint(graph, TaintCategory::SqlInjection)
        };

        // Step 2.5: Run intra-function data flow analysis for deeper precision
        let intra_paths = if let Some(intra) = self.precomputed_intra.get() {
            intra.clone()
        } else {
            crate::detectors::taint::run_intra_function_taint(
                &self.taint_analyzer,
                graph,
                TaintCategory::SqlInjection,
                &self.repository_path,
            )
        };
        debug!(
            "Intra-function analysis found {} additional taint paths",
            intra_paths.len()
        );
        taint_paths.extend(intra_paths);

        let taint_result = TaintAnalysisResult::from_paths(taint_paths);

        debug!(
            "Taint analysis found {} paths ({} vulnerable, {} sanitized)",
            taint_result.paths.len(),
            taint_result.vulnerable_count,
            taint_result.sanitized_count
        );

        // Step 3: Enhance findings with taint analysis results
        // - If a finding has a taint path with no sanitizer → Critical
        //   (dual-branch findings keep their predictor severity)
        // - If a finding has a taint path with sanitizer → downgrade to Info/skip
        // - If pattern match but no taint path → keep as High/Medium
        for finding in &mut findings {
            if let (Some(file_path), Some(line)) =
                (finding.affected_files.first(), finding.line_start)
            {
                let file_str = file_path.to_string_lossy();

                // Check if there's a taint path that includes this file.
                // NOTE(review): the match is by file only (source or sink);
                // `line` is used for logging, not for matching. The first
                // path touching the file decides the outcome.
                let matching_path = taint_result
                    .paths
                    .iter()
                    .find(|p| p.sink_file == file_str || p.source_file == file_str);

                if let Some(path) = matching_path {
                    if path.is_sanitized {
                        // Sanitizer found in the data flow path - downgrade severity
                        debug!(
                            "Finding at {}:{} has sanitized taint path via '{}'",
                            file_str,
                            line,
                            path.sanitizer.as_deref().unwrap_or("unknown")
                        );
                        finding.severity = Severity::Info;
                        finding.description = format!(
                            "{}\n\n**Taint Analysis Note**: A sanitizer function (`{}`) was found \
                             in the data flow path, which may mitigate this vulnerability. \
                             Please verify the sanitizer is applied correctly.",
                            finding.description,
                            path.sanitizer.as_deref().unwrap_or("unknown")
                        );
                    } else {
                        // Unsanitized taint path confirmed.
                        debug!(
                            "Finding at {}:{} has unsanitized taint path: {}",
                            file_str,
                            line,
                            path.path_string()
                        );
                        finding.description = format!(
                            "{}\n\n**Taint Analysis Confirmed**: Data flow analysis traced a path \
                             from user input to this SQL sink without sanitization:\n\n\
                             `{}`\n\n\
                             This significantly increases confidence that this is a real vulnerability.",
                            finding.description,
                            path.path_string()
                        );
                        // Upgrade severity and promote to Tier::Blocking, but only
                        // for non-dual-branch findings. A dual-branch finding's
                        // predictor already chose a severity and branch; forcing
                        // Critical here would clobber that calibrated output (for
                        // example a Benign "informational" finding would be
                        // reported as Critical). Such findings only get the note
                        // appended above.
                        if !finding.is_dual_branch() {
                            finding.severity = Severity::Critical;
                            if let Some(ev) = Self::sql_exec_evidence(path) {
                                finding.tier = Tier::Blocking;
                                finding.deterministic = true;
                                finding.confidence = Some(0.95);
                                finding.evidence = Some(ev);
                            }
                        }
                    }
                }
            }
        }

        // Step 4: Add findings for taint paths that weren't caught by pattern matching
        for path in taint_result.vulnerable_paths() {
            // Check if we already have a finding for this exact sink file and line
            let already_reported = findings.iter().any(|f| {
                f.affected_files
                    .first()
                    .map(|p| p.to_string_lossy() == path.sink_file)
                    .unwrap_or(false)
                    && f.line_start == Some(path.sink_line)
            });

            if !already_reported {
                findings.push(self.create_taint_finding(path));
            }
        }

        // Filter out Info-severity findings (sanitized paths), but
        // preserve Phase 2j dual-branch Info-severity findings — these
        // are the D1.a Safe-collapse "informational" branch and need
        // to be surfaced so users can see the alternative-branch and
        // resolution-signal metadata.
        findings.retain(|f| f.severity != Severity::Info || f.is_dual_branch());

        info!(
            "SQLInjectionDetector found {} potential vulnerabilities (after taint analysis)",
            findings.len()
        );

        Ok(findings)
    }
}

impl SQLInjectionDetector {
    /// Convert a taint path into `Evidence::TaintPath` with
    /// `sink_kind: "sql_exec"`.
    ///
    /// Sanitized paths yield `None`: with a sanitizer on the path the
    /// finding does not meet the precision bar for `Tier::Blocking`.
    ///
    /// The gate is the `is_sanitized` flag rather than
    /// `sanitizers_on_path.is_empty()`, because the intra-function heuristic
    /// engine reports only the bool, not sanitizer names.
    fn sql_exec_evidence(path: &crate::detectors::taint::TaintPath) -> Option<Evidence> {
        (!path.is_sanitized).then(|| {
            // Both spans cover a single line and carry no snippet.
            let source = SourceSpan {
                file: PathBuf::from(&path.source_file),
                line_start: path.source_line,
                line_end: path.source_line,
                snippet: None,
            };
            let sink = SourceSpan {
                file: PathBuf::from(&path.sink_file),
                line_start: path.sink_line,
                line_end: path.sink_line,
                snippet: None,
            };
            Evidence::TaintPath {
                source,
                sink,
                sink_kind: "sql_exec".to_string(),
                flow: vec![],
                sanitizers_seen: path.sanitizers_on_path.clone(),
            }
        })
    }

    /// Build a finding directly from a taint analysis path.
    ///
    /// When `sql_exec_evidence` produces evidence (unsanitized path) the
    /// finding is `Tier::Blocking`, deterministic, with confidence 0.95 and
    /// the evidence attached. Otherwise it is `Tier::Advisory` with no
    /// evidence and no confidence. Severity is `Critical` in both cases.
    fn create_taint_finding(&self, path: &crate::detectors::taint::TaintPath) -> Finding {
        let evidence = Self::sql_exec_evidence(path);
        let (tier, deterministic, confidence) = match evidence {
            Some(_) => (Tier::Blocking, true, Some(0.95_f64)),
            None => (Tier::Advisory, false, None),
        };

        let fix_examples = get_fix_examples(detect_language(&path.sink_file)).to_string();
        let sink_location = PathBuf::from(&path.sink_file);
        let finding_id = deterministic_finding_id(
            "SQLInjectionDetector",
            &path.sink_file,
            path.sink_line,
            "taint_flow",
        );

        let description = format!(
            "**SQL Injection via Data Flow**\n\n\
             Taint analysis traced a path from user input to a SQL sink:\n\n\
             **Source**: `{}` in `{}`:{}\n\
             **Sink**: `{}` in `{}`:{}\n\
             **Path**: `{}`\n\n\
             This vulnerability was detected through data flow analysis, which traced \
             how user-controlled data propagates through function calls to reach a \
             dangerous SQL operation without proper sanitization.",
            path.source_function,
            path.source_file,
            path.source_line,
            path.sink_function,
            path.sink_file,
            path.sink_line,
            path.path_string()
        );

        Finding {
            id: finding_id,
            detector: "SQLInjectionDetector".to_string(),
            severity: Severity::Critical,
            title: "SQL Injection (Confirmed via Taint Analysis)".to_string(),
            description,
            affected_files: vec![sink_location],
            line_start: Some(path.sink_line),
            line_end: Some(path.sink_line),
            suggested_fix: Some(fix_examples),
            estimated_effort: Some("Medium (1-4 hours)".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-89".to_string()),
            why_it_matters: Some(
                "This SQL injection was confirmed through data flow analysis, tracking user input \
                 from its source to the dangerous SQL operation. This is a high-confidence finding."
                    .to_string(),
            ),
            tier,
            deterministic,
            confidence,
            evidence,
            ..Default::default()
        }
    }
}

impl crate::detectors::RegisteredDetector for SQLInjectionDetector {
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        std::sync::Arc::new(Self::with_repository_path(init.repo_path.to_path_buf()))
    }

    fn max_tier() -> crate::models::Tier {
        crate::models::Tier::Blocking
    }
}