repotoire 0.8.2

//! NoSQL Injection Detector
//!
//! Graph-enhanced detection of NoSQL injection:
//! - Trace user input to MongoDB queries
//! - Detect dangerous operators ($where, $regex, etc.)
//! - Check for sanitization/validation in call chain

// Phase 2i dual-branch submodules. Wired through `detect()` in
// commit 5 of the dual-branch migration stack. Scaffolded in
// commit 3 (predict + annotation) and commit 4 (evidence) with
// `#![allow(dead_code)]` so they compile without integration.
mod annotation;
mod evidence;
mod predict;

use crate::detectors::base::{Detector, DetectorConfig};
use crate::graph::GraphQueryExt;
use crate::models::{deterministic_finding_id, Finding, Severity};
use anyhow::Result;
use regex::Regex;
use std::path::{Path, PathBuf};
use std::sync::LazyLock;
use tracing::info;

/// Matches text that looks like a MongoDB-style query call: one of the listed
/// collection methods (`.find(`, `.findOne(`, `.findById(`, `.updateOne(`,
/// `.updateMany(`, `.deleteOne(`, `.deleteMany(`, `.aggregate(`,
/// `.countDocuments(`) or a `db.<name>.` access.
///
/// The pattern is case-insensitive (`(?i)`) and unanchored, and is compiled
/// once on first use. The `expect` can only fire if the literal itself is
/// edited into an invalid regex.
static NOSQL_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)(\.find\(|\.findOne\(|\.findById\(|\.updateOne\(|\.updateMany\(|\.deleteOne\(|\.deleteMany\(|\.aggregate\(|\.countDocuments\(|db\.\w+\.)").expect("valid regex")
});
/// Matches the operators this detector treats as dangerous on sight:
/// `$where`, `$regex`, `$expr`, `$function` and `$accumulator`.
///
/// Case-sensitive and unanchored; compiled once on first use.
static DANGEROUS_OPS: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(\$where|\$regex|\$expr|\$function|\$accumulator)").expect("valid regex")
});
/// Matches expressions that suggest request-derived data: `req.body`,
/// `req.query`, `req.params`, `req.headers`, `request.body`, `request.query`,
/// `ctx.request`, `ctx.body`, `JSON.parse`, or the bare text `input`.
///
/// Case-sensitive and unanchored, so the `input` alternative also matches
/// inside longer identifiers (e.g. `user_input`, `inputs`). Compiled once on
/// first use.
static USER_INPUT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(req\.(body|query|params|headers)|request\.(body|query)|ctx\.(request|body)|input|JSON\.parse)").expect("valid regex")
});

/// Classify the NoSQL-injection risk visible on a single source line.
///
/// Returns `(risk_type, risk_description)`, where `risk_type` is one of
/// `"where"`, `"regex"`, `"eval"`, `"operator"` or `"query"`. Matching is plain
/// substring search, so `$gt` also covers `$gte` and `$lt` covers `$lte`.
///
/// The expression-evaluating operators (`$expr`, `$function`, `$accumulator`)
/// are tested *before* the comparison operators: an `$expr` body routinely
/// contains `$gt` / `$lt` / `$ne`, and testing comparisons first would hide the
/// more serious classification. `$accumulator` is included because it runs
/// user-defined JavaScript, like `$function`.
fn categorize_risk(line: &str) -> (&'static str, &'static str) {
    const EVAL_OPS: [&str; 3] = ["$expr", "$function", "$accumulator"];
    const COMPARISON_OPS: [&str; 3] = ["$ne", "$gt", "$lt"];

    let has_any = |ops: &[&str]| ops.iter().any(|op| line.contains(op));

    if line.contains("$where") {
        ("where", "$where allows JavaScript execution")
    } else if line.contains("$regex") {
        ("regex", "$regex with user input enables ReDoS")
    } else if has_any(&EVAL_OPS) {
        ("eval", "Expression evaluation can execute arbitrary code")
    } else if has_any(&COMPARISON_OPS) {
        ("operator", "Operator injection can bypass authentication")
    } else {
        ("query", "Query injection")
    }
}

/// Detector for NoSQL (MongoDB-style) injection risks.
///
/// Combines a line-regex heuristic, an AST-driven Python path and
/// intra-function taint results; see the `Detector::detect` implementation.
pub struct NosqlInjectionDetector {
    /// Repository root; handed to the intra-function taint fallback in `detect`.
    repository_path: PathBuf,
    /// Upper bound on the number of findings `detect` collects (`new` sets 50).
    max_findings: usize,
    /// Cache slot for cross-function taint paths. Not read by the code visible
    /// in this file section — presumably filled and consumed through
    /// `impl_taint_precompute!`; confirm against the macro.
    precomputed_cross: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
    /// Cache slot for intra-function taint paths. `detect` uses the cached
    /// value when one is set and otherwise runs the taint analysis itself.
    precomputed_intra: std::sync::OnceLock<Vec<crate::detectors::taint::TaintPath>>,
}

impl NosqlInjectionDetector {
    /// Create a detector rooted at `repository_path`.
    ///
    /// The finding cap starts at 50 and both taint-path cache slots start
    /// unset.
    pub fn new(repository_path: impl Into<PathBuf>) -> Self {
        let root: PathBuf = repository_path.into();
        Self {
            max_findings: 50,
            precomputed_intra: std::sync::OnceLock::new(),
            precomputed_cross: std::sync::OnceLock::new(),
            repository_path: root,
        }
    }

    /// Heuristically decide whether a `.find(`-style call on `line` is ordinary
    /// array usage rather than a MongoDB query.
    ///
    /// Returns `true` when the line contains either a `.find(` call on a
    /// receiver with a typical array name (`items`, `list`, `rows`, ...) or any
    /// marker of an array method chain (`.filter(`, `.map(`, `.some(`,
    /// `.every(`, `Array.`, `[].`). Matching is case-sensitive substring search.
    fn is_array_method(line: &str) -> bool {
        // `.find(` calls on receivers whose names conventionally denote arrays.
        const ARRAY_RECEIVER_CALLS: [&str; 12] = [
            "items.find(",
            "list.find(",
            "array.find(",
            "results.find(",
            "data.find(",
            "options.find(",
            "elements.find(",
            "entries.find(",
            "records.find(",
            "rows.find(",
            "values.find(",
            "keys.find(",
        ];
        // Fragments that indicate an array method chain or array construction.
        const ARRAY_CHAIN_MARKERS: [&str; 6] =
            [".filter(", ".map(", ".some(", ".every(", "Array.", "[]."];

        ARRAY_RECEIVER_CALLS
            .iter()
            .chain(ARRAY_CHAIN_MARKERS.iter())
            .any(|needle| line.contains(needle))
    }

    /// Check whether the query at `current_line` shows signs of input
    /// sanitization or type coercion.
    ///
    /// The window searched is the query line itself plus up to ten lines before
    /// it, lower-cased and searched for a fixed list of marker substrings
    /// (`sanitize`, `validate`, `parseint`, `number(`, ...). The query line is
    /// part of the window so that inline sanitization such as
    /// `find({ name: sanitize(req.body.name) })` is recognised.
    ///
    /// `current_line` is a zero-based index into `lines`. An index past the end
    /// is clamped rather than panicking; an empty `lines` yields `false`.
    fn has_sanitization(lines: &[&str], current_line: usize) -> bool {
        // How many lines above the query are inspected.
        const LOOKBACK: usize = 10;
        // Lower-case markers; the context is lower-cased before matching.
        const MARKERS: [&str; 10] = [
            "sanitize",
            "validate",
            "escape",
            "clean",
            "tostring()",
            "parseint",
            "number(",
            "boolean(",
            "mongo-sanitize",
            "express-mongo-sanitize",
        ];

        // Exclusive end that includes the query line, clamped to the slice.
        let end = current_line.saturating_add(1).min(lines.len());
        let start = current_line.saturating_sub(LOOKBACK).min(end);
        let context = lines[start..end].join(" ").to_lowercase();

        MARKERS.iter().any(|marker| context.contains(marker))
    }

    /// Guess whether a function is a route handler, i.e. one that receives
    /// request data directly.
    ///
    /// The decision is purely name-based and case-insensitive: the function
    /// name contains `handler` / `controller` / `route` / `api`, or starts with
    /// an HTTP verb (`get`, `post`, `put`, `delete`), or the file path contains
    /// `route` / `controller` / `handler`.
    fn is_route_handler(func_name: &str, file_path: &str) -> bool {
        const NAME_FRAGMENTS: [&str; 4] = ["handler", "controller", "route", "api"];
        const NAME_PREFIXES: [&str; 4] = ["get", "post", "put", "delete"];
        const PATH_FRAGMENTS: [&str; 3] = ["route", "controller", "handler"];

        let name = func_name.to_lowercase();
        if NAME_FRAGMENTS.iter().any(|fragment| name.contains(fragment)) {
            return true;
        }
        if NAME_PREFIXES.iter().any(|verb| name.starts_with(verb)) {
            return true;
        }

        let location = file_path.to_lowercase();
        PATH_FRAGMENTS
            .iter()
            .any(|fragment| location.contains(fragment))
    }

    /// AST-driven scan of a single Python file (Phase 2i dual-branch path).
    ///
    /// The file is parsed once. Every call site reported by
    /// [`evidence::collect_python_nosql_sites`] that is not suppressed on its
    /// own line or the line above is turned into a dual-branch finding by
    /// `build_dual_branch_finding`. `detect` uses this in place of the legacy
    /// line-regex pass for `.py` files when the `nosql-injection` dual-branch
    /// flag is on.
    ///
    /// The result is empty when the content contains a NUL byte, when parsing
    /// fails, or when the collector reports no call sites.
    ///
    /// Counterpart of `insecure_deserialize::scan_python_file_dual_branch`
    /// (Phase 2h, commit `b2a98e25`) and the 2g xxe predecessor.
    fn scan_python_file_dual_branch(&self, path: &Path, content: &str) -> Vec<Finding> {
        if content.contains('\0') {
            return Vec::new();
        }
        let parsed = crate::detectors::ast_fingerprint::parse_root_ext(
            content,
            crate::parsers::lightweight::Language::Python,
            "py",
        );
        let tree = match parsed {
            Some(tree) => tree,
            None => return Vec::new(),
        };
        let root = tree.root_node();
        let source = content.as_bytes();
        let lines: Vec<&str> = content.lines().collect();

        let mut findings = Vec::new();
        for site in evidence::collect_python_nosql_sites(root, source) {
            let row = site.call_node.start_position().row;
            let current = lines.get(row);

            // Inline suppressions (`# repotoire: ignore` and friends) must be
            // honored here exactly as in the legacy scanner; otherwise turning
            // the flag on would resurrect findings users already silenced.
            if let Some(text) = current {
                let above = row.checked_sub(1).map(|prev_row| lines[prev_row]);
                if crate::detectors::is_line_suppressed(text, above) {
                    continue;
                }
            }

            let snippet = current.map(|text| text.trim()).unwrap_or("");

            findings.push(self.build_dual_branch_finding(
                path,
                (row + 1) as u32,
                site.api,
                site.callee_label.clone(),
                snippet,
                site.call_node,
                root,
                source,
                &lines,
            ));
        }
        findings
    }

    /// Assemble the dual-branch [`Finding`] for one Python pymongo call site.
    ///
    /// Steps: extract evidence for the call, run [`predict::predict`] on it,
    /// choose title / rationale / suggested fix according to the predicted
    /// branch label, then attach the alternative branch, every prediction
    /// reason and every resolution signal to the finding.
    ///
    /// Same shape as `insecure_deserialize::build_dual_branch_finding`
    /// (Phase 2h); the resulting finding is what `--show-alternatives` renders.
    #[allow(clippy::too_many_arguments)]
    fn build_dual_branch_finding(
        &self,
        path: &Path,
        line_num: u32,
        api: predict::NosqlApi,
        callee_label: String,
        snippet: &str,
        call_node: tree_sitter::Node<'_>,
        module_root: tree_sitter::Node<'_>,
        source: &[u8],
        lines: &[&str],
    ) -> Finding {
        let ev = evidence::extract_python_evidence(
            call_node,
            module_root,
            source,
            lines,
            Some(path.to_string_lossy().to_string()),
            api,
            callee_label.clone(),
        );
        let prediction = predict::predict(&ev);

        // One exhaustive match yields all branch-specific text at once.
        let (title, rationale, fix) = match prediction.predicted {
            crate::dual_branch::BranchLabel::RealBug => (
                format!("Potential NoSQL injection via `{callee_label}`"),
                format!(
                    "The `{callee_label}` call site appears to construct a \
                     MongoDB query with attacker-reachable input flowing \
                     into a dangerous server-side operator (`$where` / \
                     `$function` / `$expr` / `$accumulator`), via dict-\
                     expansion of raw user input, or under weighted \
                     signals indicating operator-injection exposure. \
                     The predictor leans RealBug (see `prediction_reasons`)."
                ),
                "Sanitize the MongoDB query construction:\n\
                 ```python\n\
                 # Instead of:\n\
                 users.find_one({\"$where\": f\"this.name=='{req.form['n']}'\"})\n\
                 users.find_one({**request.get_json()})\n\
                 \n\
                 # Use typed-value queries:\n\
                 users.find_one({\"name\": str(request.form['n'])})\n\
                 users.find_one({\"_id\": ObjectId(request.form['id'])})\n\
                 \n\
                 # Or pydantic-validate the payload first:\n\
                 class Query(BaseModel):\n\
                 \x20   name: str\n\
                 q = Query.model_validate(request.get_json())\n\
                 users.find_one({\"name\": q.name})\n\
                 ```\n\n\
                 If the call is intentionally constructing a complex \
                 query that the predictor cannot trace (cross-statement \
                 assembly, helper-built filter, etc.), annotate the call \
                 site with `# repotoire: nosql-safe[<reason>]` to collapse \
                 the finding to Info."
                    .to_string(),
            ),
            crate::dual_branch::BranchLabel::Benign => (
                format!("Safe pymongo query via `{callee_label}` (informational)"),
                format!(
                    "The `{callee_label}` call site appears to use a \
                     structurally-typed pymongo query (no dangerous \
                     operators, no `**`-expansion, user-input values \
                     cast via `str` / `ObjectId` / `int` / pydantic), \
                     or developer-written operator filters without user \
                     input. The predictor leans Benign (see \
                     `prediction_reasons`); the alternative RealBug \
                     interpretation is carried in `alternative_branch` \
                     for users who want to inspect the call regardless."
                ),
                "If you need the predictor to surface this site (e.g. \
                 you're auditing every pymongo query regardless), \
                 annotate the line with \
                 `# repotoire: nosql-vulnerable[<source>]` where \
                 `<source>` is the rationale (e.g. \
                 `helper-assembled-query`)."
                    .to_string(),
            ),
        };

        let description = format!(
            "**NoSQL Injection (dual-branch, CWE-943)**\n\n\
             **API**: `{}`\n\n\
             **Location**: {}:{}\n\n\
             **Code**:\n```python\n{}\n```\n\n\
             {}",
            callee_label,
            path.display(),
            line_num,
            snippet,
            rationale,
        );

        let base = Finding {
            id: String::new(),
            detector: "NosqlInjectionDetector".to_string(),
            severity: prediction.predicted_severity,
            title,
            description,
            affected_files: vec![path.to_path_buf()],
            line_start: Some(line_num),
            line_end: Some(line_num),
            suggested_fix: Some(fix),
            estimated_effort: Some("30 minutes".to_string()),
            category: Some("security".to_string()),
            cwe_id: Some("CWE-943".to_string()),
            why_it_matters: Some(
                "NoSQL injection can allow attackers to:\n\
                 • Bypass authentication ({ password: { $ne: '' } })\n\
                 • Extract data through $regex probing\n\
                 • Execute arbitrary JavaScript ($where)\n\
                 • Denial of service through ReDoS"
                    .to_string(),
            ),
            ..Default::default()
        };

        // Thread the finding through the builder-style attachers in the same
        // order as before: alternative branch, reasons, then resolutions.
        let with_alternative = base.with_alternative_branch(prediction.alternative_branch);
        let with_reasons = prediction
            .reasons
            .into_iter()
            .fold(with_alternative, |acc, reason| {
                acc.with_prediction_reason(reason)
            });
        prediction
            .resolutions
            .into_iter()
            .fold(with_reasons, |acc, resolution| {
                acc.with_resolution_signal(resolution)
            })
    }
}

impl Detector for NosqlInjectionDetector {
    /// Detector identifier. The same string, `"nosql-injection"`, is the key
    /// `detect` passes to the dual-branch flag lookup.
    fn name(&self) -> &'static str {
        "nosql-injection"
    }
    /// One-line human-readable summary of what this detector looks for.
    fn description(&self) -> &'static str {
        "Detects NoSQL injection risks"
    }

    /// Always `true`: this detector asks to bypass the post-processor.
    ///
    /// NOTE(review): what the post-processor would otherwise do to findings is
    /// not visible from this file — confirm against the `Detector` trait docs.
    fn bypass_postprocessor(&self) -> bool {
        true
    }

    crate::detectors::impl_taint_precompute!();

    /// Taint category associated with this detector.
    ///
    /// Returns `SqlInjection`, the same category `detect` passes to the
    /// intra-function taint fallback. NOTE(review): presumably reused because
    /// no NoSQL-specific category exists — confirm against `TaintCategory`.
    fn taint_category(&self) -> Option<crate::detectors::taint::TaintCategory> {
        Some(crate::detectors::taint::TaintCategory::SqlInjection)
    }

    /// File extensions this detector declares interest in.
    ///
    /// NOTE(review): the file loop in `detect` only iterates `js`, `ts`, `py`,
    /// `rb` and `php`; `jsx`, `tsx`, `java` and `go` are declared here but are
    /// not visited by that loop. Confirm whether the mismatch is intended.
    fn file_extensions(&self) -> &'static [&'static str] {
        &["py", "js", "ts", "jsx", "tsx", "rb", "php", "java", "go"]
    }

    /// Content flags declared by this detector: `HAS_SQL`.
    ///
    /// NOTE(review): presumably used by the engine to pre-filter files before
    /// `detect` runs; confirm that files containing only MongoDB-style queries
    /// are given this flag, otherwise they may never reach the detector.
    fn content_requirements(&self) -> crate::detectors::detector_context::ContentFlags {
        crate::detectors::detector_context::ContentFlags::HAS_SQL
    }

    /// Run the detector over the files exposed by `ctx`.
    ///
    /// Three sources feed one findings list, which never grows beyond
    /// `max_findings`:
    ///
    /// 1. With the `nosql-injection` dual-branch flag on, `.py` files go
    ///    through `scan_python_file_dual_branch` and skip the regex pass.
    /// 2. All other scanned files (and `.py` with the flag off) go through the
    ///    line-regex heuristic: a query-looking line, user input on that line
    ///    or within the five lines above, and no sanitization marker nearby.
    /// 3. Unsanitized intra-function taint paths (precomputed when available,
    ///    computed here otherwise) are appended unless a finding already
    ///    exists at the same `(file, line)`.
    ///
    /// Test files are skipped, and suppressed lines are ignored.
    ///
    /// # Errors
    ///
    /// Nothing in this body produces an error; the result is always `Ok`.
    fn detect(
        &self,
        ctx: &crate::detectors::analysis_context::AnalysisContext,
    ) -> Result<Vec<Finding>> {
        let graph = ctx.graph;
        let files = &ctx.as_file_provider();
        let mut findings = vec![];

        // Phase 2i dual-branch gate. When `true`, Python `.py` files
        // go through the AST-driven predictor path
        // (`scan_python_file_dual_branch`) and skip the legacy line
        // scanner. Other languages and the flag-off path are
        // unchanged. Symmetric with insecure-deserialize (Phase 2h),
        // xxe (Phase 2g), command-injection.
        let flag_on = ctx.dual_branch.is_enabled_for("nosql-injection");

        // NOTE(review): this list is narrower than `file_extensions()`
        // (no jsx/tsx/java/go). Left as-is; confirm whether that is intended.
        for path in files.files_with_extensions(&["js", "ts", "py", "rb", "php"]) {
            if findings.len() >= self.max_findings {
                break;
            }

            let path_str = path.to_string_lossy().to_string();

            // Skip test files
            if crate::detectors::base::is_test_path(&path_str) {
                continue;
            }

            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");

            // Phase 2i: AST-driven predictor path for Python when the
            // dual-branch flag is on. Replaces the legacy regex pass
            // for `.py` files; other languages and the flag-off path
            // fall through to the regex scanner below.
            if flag_on && ext == "py" {
                if let Some(content) = files.content(path) {
                    let dual = self.scan_python_file_dual_branch(path, &content);
                    for finding in dual {
                        findings.push(finding);
                        if findings.len() >= self.max_findings {
                            break;
                        }
                    }
                }
                continue;
            }

            if let Some(content) = files.content(path) {
                let lines: Vec<&str> = content.lines().collect();

                // Check if file has MongoDB context
                let has_mongo = content.contains("mongoose")
                    || content.contains("mongodb")
                    || content.contains("MongoClient")
                    || content.contains("pymongo")
                    || content.contains("Collection");

                if !has_mongo {
                    continue;
                }

                for (i, line) in lines.iter().enumerate() {
                    // Enforce the cap per finding, not just per file, so one
                    // large file cannot push the list past `max_findings`.
                    if findings.len() >= self.max_findings {
                        break;
                    }

                    let prev_line = if i > 0 { Some(lines[i - 1]) } else { None };
                    if crate::detectors::is_line_suppressed(line, prev_line) {
                        continue;
                    }

                    if !NOSQL_PATTERN.is_match(line) {
                        continue;
                    }
                    if Self::is_array_method(line) {
                        continue;
                    }

                    // Check for user input on the line or in the 5 lines above
                    let has_input = USER_INPUT.is_match(line);
                    let start = i.saturating_sub(5);
                    let context = lines[start..i].join(" ");
                    let has_input_nearby = USER_INPUT.is_match(&context);

                    if !has_input && !has_input_nearby {
                        continue;
                    }

                    // Check for sanitization
                    let is_sanitized = Self::has_sanitization(&lines, i);
                    if is_sanitized {
                        continue;
                    }

                    // Check for dangerous operators
                    let has_dangerous = DANGEROUS_OPS.is_match(line);
                    let (risk_type, risk_desc) = categorize_risk(line);

                    // Get function context: enclosing function name and how
                    // many callers the graph records for it.
                    let containing_func =
                        graph.find_function_at(&path_str, (i + 1) as u32).map(|f| {
                            let callers = graph
                                .get_callers(f.qn(crate::graph::interner::global_interner()))
                                .len();
                            (
                                f.node_name(crate::graph::interner::global_interner())
                                    .to_string(),
                                callers,
                            )
                        });
                    let is_handler = containing_func
                        .as_ref()
                        .map(|(name, _)| Self::is_route_handler(name, &path_str))
                        .unwrap_or(false);

                    // Calculate severity
                    let severity = if has_dangerous || (is_handler && has_input) {
                        Severity::Critical // dangerous operator or direct user input in route handler
                    } else if has_input {
                        Severity::High
                    } else {
                        // User input only appears in the preceding lines.
                        Severity::Medium
                    };

                    // Build notes
                    let mut notes = Vec::new();
                    notes.push(format!("🔍 Risk type: {}", risk_desc));
                    if has_dangerous {
                        notes.push("⚠️ Uses dangerous operator".to_string());
                    }
                    if is_handler {
                        notes.push("🌐 In route handler (direct user input)".to_string());
                    }
                    if let Some((func_name, callers)) = &containing_func {
                        notes.push(format!(
                            "📦 In function: `{}` ({} callers)",
                            func_name, callers
                        ));
                    }

                    let context_notes = format!("\n\n**Analysis:**\n{}", notes.join("\n"));

                    let suggestion = match risk_type {
                        "where" =>
                            "**Never use $where with user input** - it executes JavaScript.\n\n\
                             ```javascript\n\
                             // Instead of:\n\
                             db.users.find({ $where: `this.name == '${userInput}'` });\n\
                             \n\
                             // Use:\n\
                             db.users.find({ name: userInput });  // Still sanitize!\n\
                             ```".to_string(),
                        "regex" =>
                            "Escape regex special characters or use literal match:\n\n\
                             ```javascript\n\
                             // Escape regex\n\
                             const escaped = userInput.replace(/[.*+?^${}()|[\\]\\\\]/g, '\\\\$&');\n\
                             db.users.find({ name: { $regex: escaped } });\n\
                             \n\
                             // Or use literal string match when possible\n\
                             db.users.find({ name: userInput });\n\
                             ```".to_string(),
                        "operator" =>
                            "Prevent operator injection by validating input types:\n\n\
                             ```javascript\n\
                             // User could send: { \"$ne\": \"\" } to bypass auth\n\
                             // Always validate/convert to expected type:\n\
                             const username = String(req.body.username);\n\
                             const password = String(req.body.password);\n\
                             \n\
                             // Or use mongo-sanitize\n\
                             const sanitize = require('mongo-sanitize');\n\
                             db.users.find({ username: sanitize(req.body.username) });\n\
                             ```".to_string(),
                        _ =>
                            "Sanitize all user input before using in queries:\n\n\
                             ```javascript\n\
                             const sanitize = require('mongo-sanitize');\n\
                             const cleanInput = sanitize(req.body);\n\
                             db.collection.find(cleanInput);\n\
                             ```".to_string(),
                    };

                    findings.push(Finding {
                        id: String::new(),
                        detector: "NosqlInjectionDetector".to_string(),
                        severity,
                        title: format!("NoSQL injection: {}", risk_desc),
                        description: format!(
                            "MongoDB query with user-controlled input can be exploited.{}",
                            context_notes
                        ),
                        affected_files: vec![path.to_path_buf()],
                        line_start: Some((i + 1) as u32),
                        line_end: Some((i + 1) as u32),
                        suggested_fix: Some(suggestion),
                        estimated_effort: Some("30 minutes".to_string()),
                        category: Some("security".to_string()),
                        cwe_id: Some("CWE-943".to_string()),
                        why_it_matters: Some(
                            "NoSQL injection can allow attackers to:\n\
                             • Bypass authentication ({ password: { $ne: '' } })\n\
                             • Extract data through $regex probing\n\
                             • Execute arbitrary JavaScript ($where)\n\
                             • Denial of service through ReDoS"
                                .to_string(),
                        ),
                        ..Default::default()
                    });
                }
            }
        }

        // Supplement with intra-function taint analysis (precomputed or fallback).
        // The precomputed paths are borrowed rather than cloned; the fallback
        // result is parked in `computed_paths` so both arms yield a slice.
        let computed_paths;
        let intra_paths: &[crate::detectors::taint::TaintPath] =
            match self.precomputed_intra.get() {
                Some(precomputed) => precomputed.as_slice(),
                None => {
                    let taint_analyzer = crate::detectors::taint::TaintAnalyzer::new();
                    computed_paths = crate::detectors::taint::run_intra_function_taint(
                        &taint_analyzer,
                        graph,
                        crate::detectors::taint::TaintCategory::SqlInjection,
                        &self.repository_path,
                    );
                    computed_paths.as_slice()
                }
            };

        // Locations already reported, so taint results do not duplicate them.
        let mut seen: std::collections::HashSet<(String, u32)> = findings
            .iter()
            .filter_map(|f| {
                f.affected_files
                    .first()
                    .map(|p| (p.to_string_lossy().to_string(), f.line_start.unwrap_or(0)))
            })
            .collect();
        for path in intra_paths.iter().filter(|p| !p.is_sanitized) {
            // Check the cap before pushing: the list may already be full from
            // the passes above, and pushing first would overshoot by one.
            if findings.len() >= self.max_findings {
                break;
            }
            let loc = (path.sink_file.clone(), path.sink_line);
            if !seen.insert(loc) {
                continue;
            }
            findings.push(crate::detectors::taint::taint_path_to_finding(
                path,
                "NosqlInjectionDetector",
                "NoSQL Injection",
            ));
        }

        info!(
            "NosqlInjectionDetector found {} findings (graph-aware + taint)",
            findings.len()
        );
        Ok(findings)
    }
}

/// Detector-registry hook for [`NosqlInjectionDetector`].
impl crate::detectors::RegisteredDetector for NosqlInjectionDetector {
    /// Builds the detector from `init.repo_path` and returns it behind a
    /// shared `Arc<dyn Detector>` handle.
    fn create(init: &crate::detectors::DetectorInit) -> std::sync::Arc<dyn Detector> {
        let detector = Self::new(init.repo_path);
        std::sync::Arc::new(detector)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::graph::builder::GraphBuilder;

    #[test]
    fn test_detects_where_with_user_input() {
        // Fixture: a Mongoose handler that reads `req.body.name` and splices
        // it into a `$where` template literal handed to `User.find`.
        let source = "const mongoose = require('mongoose');\nconst User = mongoose.model('User');\n\nasync function findUser(req, res) {\n    const name = req.body.name;\n    const result = await User.find({ $where: `this.name == '${name}'` });\n    res.json(result);\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("routes.js", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        assert!(
            !reported.is_empty(),
            "Should detect $where with user input from req.body"
        );

        // At least one title has to call out the `$where` operator by name.
        let names_where = reported
            .iter()
            .any(|finding| finding.title.contains("$where"));
        assert!(
            names_where,
            "Finding should mention $where. Titles: {:?}",
            reported
                .iter()
                .map(|finding| &finding.title)
                .collect::<Vec<_>>()
        );
    }

    #[test]
    fn test_no_finding_for_safe_query() {
        // Fixture: `User.find` with a hard-coded filter and no request object
        // anywhere in the function — nothing should be reported.
        let source = "const mongoose = require('mongoose');\nconst User = mongoose.model('User');\n\nasync function findUser() {\n    const result = await User.find({ active: true });\n    return result;\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("routes.js", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        // Titles are only gathered when the assertion needs to print them.
        let titles = || {
            reported
                .iter()
                .map(|finding| &finding.title)
                .collect::<Vec<_>>()
        };
        assert!(
            reported.is_empty(),
            "Safe MongoDB query without user input should produce no findings, but got: {:?}",
            titles()
        );
    }

    #[test]
    fn test_detects_find_with_req_body_in_js() {
        // Fixture: the whole `req.body` object is passed straight to
        // `User.findOne`. The report must be tagged CWE-943.
        let source = "const mongoose = require('mongoose');\nconst User = mongoose.model('User');\n\nasync function login(req, res) {\n    const user = await User.findOne(req.body);\n    if (user) res.json(user);\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("controller.js", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        assert!(
            !reported.is_empty(),
            "Should detect MongoDB findOne with unsanitized req.body"
        );

        let has_cwe_943 = reported
            .iter()
            .any(|finding| matches!(finding.cwe_id.as_deref(), Some("CWE-943")));
        assert!(has_cwe_943, "Finding should have CWE-943");
    }

    #[test]
    fn test_detects_aggregate_with_user_input_ts() {
        // Fixture: a TypeScript handler that forwards `req.body.pipeline`
        // to `Order.aggregate` unchanged.
        let source = "import mongoose from 'mongoose';\nconst Order = mongoose.model('Order');\n\nasync function getStats(req: Request, res: Response) {\n    const pipeline = req.body.pipeline;\n    const results = await Order.aggregate(req.body.pipeline);\n    res.json(results);\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("analytics.ts", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        assert!(
            !reported.is_empty(),
            "Should detect MongoDB aggregate with user-controlled pipeline from req.body"
        );
    }

    #[test]
    fn test_no_finding_for_sanitized_query() {
        // Fixture: `req.body` goes through `mongo-sanitize` before it reaches
        // `User.findOne`, so the detector is expected to stay silent.
        let source = "const mongoose = require('mongoose');\nconst sanitize = require('mongo-sanitize');\nconst User = mongoose.model('User');\n\nasync function login(req, res) {\n    const clean = sanitize(req.body);\n    const user = await User.findOne(clean);\n    res.json(user);\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("safe_controller.js", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        // Titles are only gathered when the assertion needs to print them.
        let titles = || {
            reported
                .iter()
                .map(|finding| &finding.title)
                .collect::<Vec<_>>()
        };
        assert!(
            reported.is_empty(),
            "Sanitized MongoDB query should not produce findings, but got: {:?}",
            titles()
        );
    }

    #[test]
    fn test_no_finding_for_array_find() {
        // Fixture: `.find(...)` is called on a plain array with an arrow
        // callback. The file imports mongoose, but this call is not a
        // database query and must not be reported.
        let source = "const mongoose = require('mongoose');\n\nfunction findItem(items, id) {\n    return items.find(item => item.id === id);\n}\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("utils.js", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        // Titles are only gathered when the assertion needs to print them.
        let titles = || {
            reported
                .iter()
                .map(|finding| &finding.title)
                .collect::<Vec<_>>()
        };
        assert!(
            reported.is_empty(),
            "Array.find() should not be flagged as NoSQL injection, but got: {:?}",
            titles()
        );
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2i dual-branch integration tests.
    //
    // Mirror insecure-deserialize's flag-off / flag-on pattern (Phase 2h):
    //
    //   1. flag_off_nosql_injection_emits_single_branch_unchanged
    //   2. flag_on_case_a_unstructured_json_in_handler_realbug_high
    //   3. flag_on_case_b_typed_query_str_cast_collapses_benign
    //   4. flag_on_case_c_where_with_user_input_collapses_realbug_critical
    //   5. flag_on_case_d_developer_written_ne_predicts_benign  (FP fix)
    //   6. flag_on_case_e_dict_expansion_collapses_realbug_critical
    //   7. flag_on_case_f_objectid_cast_typed_query_collapses_benign
    //   8. flag_on_nosql_safe_annotation_collapses_benign
    //   9. flag_on_nosql_vulnerable_annotation_collapses_realbug
    //  10. flag_on_non_python_unchanged_per_d5_scope
    // ─────────────────────────────────────────────────────────────────

    /// Runs the detector over a single mock file with dual-branch mode
    /// switched on for the `"nosql-injection"` detector key.
    ///
    /// `file` is the mock file name and `content` its source text. Returns
    /// everything `detect` reports; panics if detection itself fails.
    fn run_dual_branch(file: &str, content: &str) -> Vec<Finding> {
        use crate::config::DualBranchConfig;
        use std::collections::HashMap;

        // Global switch on, plus the per-detector opt-in entry.
        let dual_branch = DualBranchConfig {
            enabled: true,
            detectors: HashMap::from([("nosql-injection".to_string(), true)]),
        };

        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![(file, content)],
        )
        .with_dual_branch(dual_branch);

        scanner
            .detect(&context)
            .expect("detection should succeed")
    }

    #[test]
    fn flag_off_nosql_injection_emits_single_branch_unchanged() {
        // Opt-in guard. The context below is built without `with_dual_branch`,
        // so the flag stays at its default (off). Every finding reported for
        // this pymongo `$where` site must then have no `alternative_branch`
        // and only zero-weight prediction reasons, so output is unchanged for
        // users who never enabled the flag.
        //
        // NOTE: the checks run per finding; if the detector reports nothing
        // for this fixture the test passes without exercising them.
        let source = "import pymongo\n\
                      from flask import request\n\
                      def handler():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"$where\": f\"this.x == '{request.form['x']}'\"})\n";
        let graph = GraphBuilder::new().freeze();
        let scanner = NosqlInjectionDetector::new("/mock/repo");
        let context = crate::detectors::analysis_context::AnalysisContext::test_with_mock_files(
            &graph,
            vec![("vuln.py", source)],
        );
        let reported = scanner
            .detect(&context)
            .expect("detection should succeed");

        reported.iter().for_each(|finding| {
            assert!(
                finding.alternative_branch.is_none(),
                "no alternative_branch when flag off: {:?}",
                finding.title
            );

            let only_zero_weights = finding
                .prediction_reasons
                .iter()
                .all(|reason| reason.weight == 0.0);
            assert!(
                only_zero_weights,
                "no weight-bearing predictor reasons when flag off; \
                 graph-enrichment weight-0 reasons are allowed. reasons: {:?}",
                finding
                    .prediction_reasons
                    .iter()
                    .map(|reason| (&reason.kind, reason.weight))
                    .collect::<Vec<_>>()
            );
        });
    }

    #[test]
    fn flag_on_case_a_unstructured_json_in_handler_realbug_high() {
        // Decisions §6, Case A: `users.find_one` fed from the request's JSON
        // body (UnstructuredJson source) inside a route handler. An explicit
        // `$where` is not needed for the operator-injection vector, because
        // pymongo serializes any dict value as a query operator. Expected
        // verdict: RealBug High, from weights -0.30 (UnstructuredJson) and
        // -0.20 (handler), i.e. -0.50 in total.
        //
        // The fixture places `request.get_json()["user"]` directly in the
        // dict value instead of routing it through a local variable. The v0
        // evidence extractor's typed-value-query classifier only scans each
        // dict value's own text for user-input substrings (the D5.3
        // cross-statement-flow limitation); with the user-input expression as
        // the immediate value, the query is reclassified as Ambiguous and the
        // weighted scoring above applies.
        let source = "import pymongo\n\
                      from flask import request\n\
                      @app.route('/login', methods=['POST'])\n\
                      def login_handler():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"username\": request.get_json()[\"user\"], \"password\": request.get_json()[\"pw\"]})\n";
        let findings = run_dual_branch("case_a.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case A");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::High,
            "Case A: UnstructuredJson + handler should weight to High; got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );

        // The branch that was not chosen is the Benign one.
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::Benign
        );
    }

    #[test]
    fn flag_on_case_b_typed_query_str_cast_collapses_benign() {
        // Decisions §6, Case B: the `request.form` value is wrapped in
        // `str()`. Expected classification is TypedValueQuery, which takes
        // the D1.a Benign collapse and lands on Info.
        let source = "import pymongo\n\
                      from flask import request\n\
                      def login():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"username\": str(request.form[\"user\"])})\n";
        let findings = run_dual_branch("case_b.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case B");
        assert_eq!(
            dual.severity,
            Severity::Info,
            "Case B: TypedValueQuery must collapse to Benign Info; got {:?}",
            dual.severity
        );

        // The branch that was not chosen is the RealBug one.
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::RealBug
        );
    }

    #[test]
    fn flag_on_case_c_where_with_user_input_collapses_realbug_critical() {
        // Decisions §6, Case C: user input is interpolated into a `$where`
        // clause through an f-string. Expected classification is
        // OperatorInjection, which takes the D1.b RealBug collapse and lands
        // on Critical.
        let source = "import pymongo\n\
                      from flask import request\n\
                      def login_handler():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"$where\": f\"this.username == '{request.form['user']}'\"})\n";
        let findings = run_dual_branch("case_c.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case C");
        assert_eq!(
            dual.severity,
            Severity::Critical,
            "Case C: OperatorInjection ($where) must collapse to RealBug Critical; got {:?}",
            dual.severity
        );

        // The branch that was not chosen is the Benign one.
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::Benign
        );
    }

    #[test]
    fn flag_on_case_d_developer_written_ne_predicts_benign() {
        // Decisions §6, Case D — the headline false-positive reduction.
        // The query uses a `$ne` operator the developer wrote by hand, with a
        // literal value and no user input anywhere. Per the Phase 2i notes
        // the legacy line scanner reports `$ne`; the dual-branch predictor
        // must treat this as an ordinary MongoDB query and predict Benign
        // Info.
        let source = "import pymongo\n\
                      def list_non_admins():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return list(users.find({\"role\": {\"$ne\": \"admin\"}}))\n";
        let findings = run_dual_branch("case_d.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case D");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::Info,
            "Case D (FP fix): developer-written $ne with no user input \
             must predict Benign Info; got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );
    }

    #[test]
    fn flag_on_case_e_dict_expansion_collapses_realbug_critical() {
        // Decisions §6, Case E: the query dict is built by `**`-expanding
        // `request.get_json()`, so the attacker chooses every key — the
        // textbook NoSQL auth-bypass shape. Expected classification is
        // DictExpansion, which takes the D1.c RealBug collapse and lands on
        // Critical.
        let source = "import pymongo\n\
                      from flask import request\n\
                      @app.route('/find', methods=['POST'])\n\
                      def find_handler():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({**request.get_json()})\n";
        let findings = run_dual_branch("case_e.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case E");
        assert_eq!(
            dual.severity,
            Severity::Critical,
            "Case E: DictExpansion of request.get_json() must collapse to RealBug Critical; got {:?}",
            dual.severity
        );
    }

    #[test]
    fn flag_on_case_f_objectid_cast_typed_query_collapses_benign() {
        // Decisions §6, Case F: user input is wrapped in `ObjectId(...)`.
        // `ObjectId` raises unless the input is a valid 24-hex string, which
        // narrows the value's type structurally. Expected classification is
        // TypedValueQuery, which takes the Benign collapse and lands on Info.
        let source = "import pymongo\n\
                      from bson import ObjectId\n\
                      from flask import request\n\
                      def get_user():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"_id\": ObjectId(request.form['id'])})\n";
        let findings = run_dual_branch("case_f.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding for Case F");
        assert_eq!(
            dual.severity,
            Severity::Info,
            "Case F: ObjectId cast typed-value query must collapse to Benign Info; got {:?}",
            dual.severity
        );
    }

    #[test]
    fn flag_on_nosql_safe_annotation_collapses_benign() {
        // Annotations take precedence over Step-1.5: a `$where` query carrying
        // a trailing `# repotoire: nosql-safe[...]` comment must come out as
        // Benign Info, with RealBug kept as the alternative branch.
        let source = "import pymongo\n\
                      def f():\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"$where\": \"this.role == 'admin'\"})  # repotoire: nosql-safe[admin-literal]\n";
        let findings = run_dual_branch("annotated.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding (annotated)");
        assert_eq!(dual.severity, Severity::Info);
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::RealBug
        );
    }

    #[test]
    fn flag_on_nosql_vulnerable_annotation_collapses_realbug() {
        // Annotations take precedence over Step-1.5: a literal-valued query
        // (a TypedValueQuery) carrying a trailing
        // `# repotoire: nosql-vulnerable[...]` comment must come out as
        // RealBug Critical, with Benign kept as the alternative branch.
        let source = "import pymongo\n\
                      def f(q):\n\
                      \x20   users = pymongo.MongoClient().db.users\n\
                      \x20   return users.find_one({\"name\": \"alice\"})  # repotoire: nosql-vulnerable[helper-assembled]\n";
        let findings = run_dual_branch("annotated.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("must have a dual-branch finding (vuln annotated)");
        assert_eq!(dual.severity, Severity::Critical);
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::Benign
        );
    }

    #[test]
    fn flag_on_non_python_unchanged_per_d5_scope() {
        // D5.1 scope: files that are not Python stay on the legacy regex
        // scanner even with the flag on. This JavaScript (Mongoose) site must
        // still be reported, and every finding must be single-branch, i.e.
        // carry no `alternative_branch`.
        let source = "const mongoose = require('mongoose');\n\
                      const User = mongoose.model('User');\n\
                      async function findUser(req, res) {\n\
                      \x20   const name = req.body.name;\n\
                      \x20   const result = await User.find({ $where: `this.name == '${name}'` });\n\
                      \x20   res.json(result);\n\
                      }\n";
        let findings = run_dual_branch("routes.js", source);

        assert!(!findings.is_empty(), "JS still fires legacy scan");
        for finding in findings.iter() {
            assert!(
                finding.alternative_branch.is_none(),
                "no dual-branch shape for non-Python: {:?}",
                finding.title
            );
        }
    }

    // ─────────────────────────────────────────────────────────────────
    // Phase 2i real-world signature pins.
    //
    // Pin the predictor's verdicts against minimized-but-realistic
    // shapes from real Python codebases. Mirrors the 2e/2f/2g/2h real-
    // world tests: catch the day when an evidence-extractor refactor
    // accidentally breaks a known-correct verdict on a real-world
    // idiom.
    //
    // Four signatures pinned (per decisions doc §6 worked examples
    // and §4 architectural framing as the FP-reduction phase):
    //
    //   1. `real_typed_pymongo_login` — the canonical Flask login
    //      shape where the dev cast `request.form` via `str(...)`.
    //      Must Benign Info via D1.a TypedValueQuery collapse.
    //   2. `real_naked_request_json_query` — `**request.get_json()`
    //      expansion. The textbook auth-bypass shape from every
    //      OWASP MongoDB cheat-sheet writeup. Must RealBug Critical
    //      via D1.c DictExpansion collapse.
    //   3. `real_where_clause_with_user_input` — f-string of user
    //      input interpolated into `$where`. The classic CWE-943
    //      RCE shape (mongo-express, Rocket.Chat historical CVEs).
    //      Must RealBug Critical via D1.b OperatorInjection collapse.
    //   4. `real_developer_written_ne_operator` — developer-written
    //      `$ne` query with literal value, no user input. The
    //      headline FP-reduction case for Phase 2i. The legacy line
    //      scanner flags this as a Medium operator-injection finding;
    //      the dual-branch predictor correctly classifies it as
    //      Benign Info.
    //
    // The shapes are simplified to fit a single-file mock context.
    // The minimization is documented inline so a future contributor
    // can re-validate against upstream when the API drifts.
    // Citations point to upstream tutorials / OWASP guidance the
    // shape was distilled from.
    // ─────────────────────────────────────────────────────────────────

    #[test]
    fn real_typed_pymongo_login() {
        // Shape distilled from Flask-PyMongo tutorials and the pymongo
        // "Authentication Tutorial" page:
        //
        //   from flask import Flask, request
        //   from pymongo import MongoClient
        //   app = Flask(__name__)
        //   users = MongoClient().db.users
        //   @app.route('/login', methods=['POST'])
        //   def login():
        //       user = users.find_one({"username": str(request.form["user"])})
        //       return jsonify({"ok": user is not None})
        //
        // Minimization: the fixture returns a plain dict where the excerpt
        // calls `jsonify(...)`.
        //
        // `request.form` is Flask's werkzeug MultiDict and yields Python `str`
        // (a TypedString source); the extra `str(...)` wrapper is defensive
        // but harmless. pymongo serializes a Python `str` as a BSON String,
        // so there is no operator-interpretation path and the query is safe
        // by construction.
        //
        // Expected: D1.a TypedValueQuery collapse → Benign Info, even though
        // the `@app.route` handler signal is present.
        let source = "from flask import Flask, request\n\
                      from pymongo import MongoClient\n\
                      app = Flask(__name__)\n\
                      users = MongoClient().db.users\n\
                      @app.route('/login', methods=['POST'])\n\
                      def login():\n\
                      \x20   user = users.find_one({\"username\": str(request.form[\"user\"])})\n\
                      \x20   return {\"ok\": user is not None}\n";
        let findings = run_dual_branch("real_flask_login.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected for typed pymongo login");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::Info,
            "str(request.form[...]) is a TypedValueQuery — must collapse \
             to Benign Info via D1.a even with @app.route handler signal; \
             got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );

        // The branch that was not chosen is the RealBug one.
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::RealBug
        );
    }

    #[test]
    fn real_naked_request_json_query() {
        // Shape distilled from the OWASP MongoDB Cheat Sheet's "NoSQL
        // Injection — Operator Injection" worked example; the original note
        // also ties it to what mongo-express historically shipped (the
        // CVE-2019-10758 family):
        //
        //   from flask import Flask, request
        //   from pymongo import MongoClient
        //   app = Flask(__name__)
        //   users = MongoClient().db.users
        //   @app.route('/find', methods=['POST'])
        //   def find():
        //       return users.find_one({**request.get_json()})
        //
        // `**`-expanding `request.get_json()` into the query dict hands the
        // attacker every key and value. The textbook payload
        // `{"username": {"$ne": null}}` bypasses a login that expects
        // `{"username": "alice"}`.
        //
        // Expected: D1.c DictExpansion collapse → RealBug Critical, with a
        // Benign / Info alternative branch.
        let source = "from flask import Flask, request\n\
                      from pymongo import MongoClient\n\
                      app = Flask(__name__)\n\
                      users = MongoClient().db.users\n\
                      @app.route('/find', methods=['POST'])\n\
                      def find():\n\
                      \x20   return users.find_one({**request.get_json()})\n";
        let findings = run_dual_branch("real_naked_json_query.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected for **request.get_json()");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::Critical,
            "{{**request.get_json()}} is a DictExpansion — must collapse \
             to RealBug Critical via D1.c; got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );

        let other_branch = dual.alternative_branch.as_ref().unwrap();
        assert_eq!(other_branch.label, crate::dual_branch::BranchLabel::Benign);
        assert_eq!(other_branch.severity, Severity::Info);
    }

    #[test]
    fn real_where_clause_with_user_input() {
        // Shape distilled from the OWASP "NoSQL Injection" attack-pattern
        // page; the original note also cites the Rocket.Chat CVE-2020-25988
        // family (NOTE(review): CVE identifier carried over unverified —
        // confirm). It is a variant of what mongo-shell tutorials use to show
        // why `$where` is dangerous:
        //
        //   from flask import Flask, request
        //   from pymongo import MongoClient
        //   users = MongoClient().db.users
        //   @app.route('/search')
        //   def search():
        //       q = request.form['q']
        //       return users.find_one({"$where": f"this.name == '{q}'"})
        //
        // Minimization: the fixture keeps the `q = ...` assignment but
        // interpolates `request.form['q']` directly in the f-string rather
        // than `q`. NOTE(review): presumably so the user-input expression is
        // visible inside the dict value itself — confirm against the evidence
        // extractor.
        //
        // `$where` runs JavaScript on the MongoDB server. Although the
        // interpolated value comes from a TypedString source, every source is
        // unsafe for `$where` because the value is spliced into JS code,
        // giving the attacker arbitrary JS execution on the DB server
        // (`'; while(1){}; '` for DoS, `'; return db.collection.drop()' ` for
        // data loss).
        //
        // Expected: D1.b OperatorInjection collapse → RealBug Critical, with
        // a Benign / Info alternative branch.
        let source = "from flask import Flask, request\n\
                      from pymongo import MongoClient\n\
                      users = MongoClient().db.users\n\
                      @app.route('/search')\n\
                      def search():\n\
                      \x20   q = request.form['q']\n\
                      \x20   return users.find_one({\"$where\": f\"this.name == '{request.form['q']}'\"})\n";
        let findings = run_dual_branch("real_where_clause.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected for $where with user input");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::Critical,
            "$where with user-input f-string is OperatorInjection — must \
             collapse to RealBug Critical via D1.b; got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );

        let other_branch = dual.alternative_branch.as_ref().unwrap();
        assert_eq!(other_branch.label, crate::dual_branch::BranchLabel::Benign);
        assert_eq!(other_branch.severity, Severity::Info);
    }

    #[test]
    fn real_developer_written_ne_operator() {
        // The headline false-positive-reduction case for Phase 2i.
        //
        // Shape distilled from the common "list everything except admins"
        // endpoint found in Flask/FastAPI/Django tutorials:
        //
        //   from pymongo import MongoClient
        //   users = MongoClient().db.users
        //   def list_non_admin_users():
        //       # List every user whose role is not "admin".
        //       return list(users.find({"role": {"$ne": "admin"}}))
        //
        // `$ne` is a legitimate MongoDB operator and here its semantics are
        // exactly what the developer intended: there is no user input and no
        // operator-injection vector. Per the Phase 2i notes the legacy
        // line-regex scanner reports such a `$ne` as a Medium
        // operator-injection finding, while the dual-branch predictor scores
        // "developer-written $ne with literal value, no user input" at +0.10
        // and predicts Benign Info.
        //
        // NOTE(review): the earlier wording attributed the legacy finding to
        // a `DANGEROUS_OPS` match, but the `DANGEROUS_OPS` pattern defined at
        // the top of this file does not list `$ne`; `categorize_risk` is the
        // visible code that recognizes it. Confirm which path produces the
        // legacy finding.
        //
        // This case is the architectural justification for Phase 2i:
        // dual-branch wiring primarily as FP reduction rather than new-bug
        // discovery.
        let source = "from pymongo import MongoClient\n\
                      users = MongoClient().db.users\n\
                      def list_non_admin_users():\n\
                      \x20   # List every user whose role is not 'admin'.\n\
                      \x20   return list(users.find({\"role\": {\"$ne\": \"admin\"}}))\n";
        let findings = run_dual_branch("real_admin_list.py", source);

        let dual = findings
            .iter()
            .find(|candidate| candidate.is_dual_branch())
            .expect("dual-branch finding expected for $ne literal query");
        let weighted_reasons = || {
            dual.prediction_reasons
                .iter()
                .map(|reason| (&reason.kind, reason.weight))
                .collect::<Vec<_>>()
        };
        assert_eq!(
            dual.severity,
            Severity::Info,
            "Developer-written $ne with literal value and no user input \
             must predict Benign Info — this is the headline FP-reduction \
             signal for Phase 2i; got {:?}, reasons={:?}",
            dual.severity,
            weighted_reasons()
        );

        // The branch that was not chosen is the RealBug one.
        assert_eq!(
            dual.alternative_branch.as_ref().unwrap().label,
            crate::dual_branch::BranchLabel::RealBug
        );
    }
}