Skip to main content

tldr_core/security/
taint.rs

//! Taint Analysis Types
//!
//! This module provides the core types for CFG-based taint analysis.
//! Taint analysis tracks how untrusted data flows through a program
//! to detect potential security vulnerabilities like SQL injection,
//! command injection, and code injection.
//!
//! # Types
//!
//! - `TaintSourceType` - Categorizes sources of untrusted input
//! - `TaintSinkType` - Categorizes dangerous operations (sinks)
//! - `SanitizerType` - Categorizes sanitization operations
//! - `TaintSource` - A detected source of tainted data
//! - `TaintSink` - A detected dangerous sink
//! - `TaintFlow` - A flow from source to sink (potential vulnerability)
//! - `TaintInfo` - Complete taint analysis result for a function
//!
//! # References
//! - session11-taint-spec.md
20
21use lazy_static::lazy_static;
22use regex::Regex;
23use serde::{Deserialize, Serialize};
24use std::collections::{HashMap, HashSet, VecDeque};
25
26use crate::types::{CfgInfo, RefType, VarRef};
27use crate::Language;
28use crate::TldrError;
29
/// Upper bound on the number of worklist iterations in taint analysis.
///
/// The computed bound (`blocks * vars`) can be enormous for real-world files.
/// If the taint set oscillates (e.g. because of substring matching in
/// `stmt.contains()`), the worklist would never reach a fixed point; this cap
/// guarantees that the analysis still terminates in bounded time.
const MAX_TAINT_ITERATIONS: usize = 1_000;
37
38// =============================================================================
39// Enums - Taint Categories
40// =============================================================================
41
42/// Source of tainted (untrusted) data.
43///
44/// These represent entry points where external data enters the program.
45#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
46#[serde(rename_all = "snake_case")]
47pub enum TaintSourceType {
48    /// User input: `input()`, `raw_input()`
49    UserInput,
50    /// Standard input: `sys.stdin.read()`, `sys.stdin.readline()`
51    Stdin,
52    /// HTTP query/form parameters: `request.args`, `request.form`, `request.values`
53    HttpParam,
54    /// HTTP body data: `request.json`, `request.data`, `request.body`
55    HttpBody,
56    /// Environment variables: `os.environ`, `os.getenv()`
57    EnvVar,
58    /// File reads: `open().read()`, `pathlib.read_text()`
59    FileRead,
60}
61
62/// Dangerous sink types where tainted data should not flow unsanitized.
63///
64/// These represent operations that can be exploited if fed untrusted data.
65#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
66#[serde(rename_all = "snake_case")]
67pub enum TaintSinkType {
68    /// SQL queries: `cursor.execute()`, raw SQL execution
69    SqlQuery,
70    /// Code evaluation: `eval()`
71    CodeEval,
72    /// Code execution: `exec()`
73    CodeExec,
74    /// Code compilation: `compile()`
75    CodeCompile,
76    /// Shell command execution: `os.system()`, `subprocess.run()`
77    ShellExec,
78    /// File writes: `open(..., 'w')`, `.write_text()`
79    FileWrite,
80}
81
82/// Sanitizer types that neutralize taint.
83///
84/// These represent operations that make tainted data safe for specific sinks.
85#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
86#[serde(rename_all = "snake_case")]
87pub enum SanitizerType {
88    /// Numeric conversion: `int()`, `float()`, `bool()` - safe for SQL
89    Numeric,
90    /// Shell escaping: `shlex.quote()` - safe for shell commands
91    Shell,
92    /// HTML escaping: `html.escape()`, `markupsafe.escape()` - safe for HTML output
93    Html,
94}
95
96// =============================================================================
97// Structs - Taint Data
98// =============================================================================
99
100/// A detected taint source - where untrusted data enters.
101#[derive(Debug, Clone, Serialize, Deserialize)]
102pub struct TaintSource {
103    /// Variable name that receives tainted data
104    pub var: String,
105    /// Line number of the source
106    pub line: u32,
107    /// Type of source
108    pub source_type: TaintSourceType,
109    /// Optional statement text for context
110    #[serde(skip_serializing_if = "Option::is_none")]
111    pub statement: Option<String>,
112}
113
114/// A detected taint sink - dangerous operation.
115#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct TaintSink {
117    /// Variable used in the sink
118    pub var: String,
119    /// Line number of the sink
120    pub line: u32,
121    /// Type of sink
122    pub sink_type: TaintSinkType,
123    /// Whether the variable is tainted at this sink (true = vulnerability)
124    pub tainted: bool,
125    /// Optional statement text for context
126    #[serde(skip_serializing_if = "Option::is_none")]
127    pub statement: Option<String>,
128}
129
130/// A taint flow from source to sink (represents a potential vulnerability).
131#[derive(Debug, Clone, Serialize, Deserialize)]
132pub struct TaintFlow {
133    /// The source of tainted data
134    pub source: TaintSource,
135    /// The sink where tainted data flows
136    pub sink: TaintSink,
137    /// Block IDs along the flow path from source to sink
138    pub path: Vec<usize>,
139}
140
141/// Complete taint analysis result for a function.
142///
143/// Contains all detected sources, sinks, and flows, plus the taint state
144/// at each CFG block.
145#[derive(Debug, Clone, Default, Serialize, Deserialize)]
146pub struct TaintInfo {
147    /// Function name
148    pub function_name: String,
149    /// Tainted variables at each block: block_id -> set of tainted variable names
150    pub tainted_vars: HashMap<usize, HashSet<String>>,
151    /// All detected taint sources
152    pub sources: Vec<TaintSource>,
153    /// All detected sinks (both tainted and untainted)
154    pub sinks: Vec<TaintSink>,
155    /// Flows from source to sink (vulnerabilities)
156    pub flows: Vec<TaintFlow>,
157    /// Variables that have been sanitized
158    pub sanitized_vars: HashSet<String>,
159    /// Convergence status: "converged" if the worklist reached a fixed point,
160    /// "iteration_limit_reached" if analysis was capped at MAX_TAINT_ITERATIONS.
161    #[serde(default = "default_convergence")]
162    #[serde(skip_serializing_if = "Option::is_none")]
163    pub convergence: Option<String>,
164}
165
/// Serde default for the `convergence` field: no convergence status recorded.
fn default_convergence() -> Option<String> {
    Option::None
}
169
170// =============================================================================
171// Implementations
172// =============================================================================
173
174impl TaintInfo {
175    /// Create a new TaintInfo for a function with empty collections.
176    pub fn new(function_name: impl Into<String>) -> Self {
177        Self {
178            function_name: function_name.into(),
179            tainted_vars: HashMap::new(),
180            sources: Vec::new(),
181            sinks: Vec::new(),
182            flows: Vec::new(),
183            sanitized_vars: HashSet::new(),
184            convergence: None,
185        }
186    }
187
188    /// Check if a variable is tainted at a given block.
189    ///
190    /// Returns `false` if the block doesn't exist or the variable isn't tainted.
191    pub fn is_tainted(&self, block_id: usize, var: &str) -> bool {
192        self.tainted_vars
193            .get(&block_id)
194            .map(|vars| vars.contains(var))
195            .unwrap_or(false)
196    }
197
198    /// Get all sinks where the variable is tainted (vulnerabilities).
199    pub fn get_vulnerabilities(&self) -> Vec<&TaintSink> {
200        self.sinks.iter().filter(|s| s.tainted).collect()
201    }
202}
203
204// =============================================================================
205// Helper Functions - Phase 2
206// =============================================================================
207
208/// Build predecessor map from CFG edges.
209///
210/// Returns a mapping from each block ID to its list of predecessor block IDs.
211/// Every block is guaranteed to have an entry (even if empty).
212pub fn build_predecessors(cfg: &CfgInfo) -> HashMap<usize, Vec<usize>> {
213    let mut preds: HashMap<usize, Vec<usize>> = HashMap::new();
214
215    // Initialize all blocks with empty predecessor lists
216    for block in &cfg.blocks {
217        preds.entry(block.id).or_default();
218    }
219
220    // Add predecessors from edges
221    for edge in &cfg.edges {
222        preds.entry(edge.to).or_default().push(edge.from);
223    }
224
225    preds
226}
227
228/// Build successor map from CFG edges.
229///
230/// Returns a mapping from each block ID to its list of successor block IDs.
231/// Every block is guaranteed to have an entry (even if empty).
232pub fn build_successors(cfg: &CfgInfo) -> HashMap<usize, Vec<usize>> {
233    let mut succs: HashMap<usize, Vec<usize>> = HashMap::new();
234
235    // Initialize all blocks with empty successor lists
236    for block in &cfg.blocks {
237        succs.entry(block.id).or_default();
238    }
239
240    // Add successors from edges
241    for edge in &cfg.edges {
242        succs.entry(edge.from).or_default().push(edge.to);
243    }
244
245    succs
246}
247
248/// Build line-to-block mapping from CFG.
249///
250/// Maps each line number to the block that contains it.
251/// When blocks overlap (e.g., merge points within code blocks),
252/// prefers LARGER blocks (actual code blocks over merge points).
253/// For same-size blocks, prefers HIGHER block ID (branch bodies come after merge points).
254///
255/// This pattern is copied from reaching.rs:102-125 to handle overlapping blocks correctly.
256pub fn build_line_to_block(cfg: &CfgInfo) -> HashMap<u32, usize> {
257    let mut mapping: HashMap<u32, usize> = HashMap::new();
258
259    // For each line, find the best block that contains it
260    // We need to collect all lines first, then find the best block for each
261    let mut all_lines: HashSet<u32> = HashSet::new();
262    for block in &cfg.blocks {
263        for line in block.lines.0..=block.lines.1 {
264            all_lines.insert(line);
265        }
266    }
267
268    for line in all_lines {
269        let mut best_block: Option<(usize, u32)> = None; // (block_id, size)
270
271        for block in &cfg.blocks {
272            let (start, end) = block.lines;
273            if line >= start && line <= end {
274                let size = end - start + 1;
275                // Prefer LARGER blocks (more likely to be actual code blocks)
276                // For same size, prefer HIGHER block ID (branch bodies come after merge points)
277                if best_block.is_none()
278                    || size > best_block.unwrap().1
279                    || (size == best_block.unwrap().1 && block.id > best_block.unwrap().0)
280                {
281                    best_block = Some((block.id, size));
282                }
283            }
284        }
285
286        if let Some((block_id, _)) = best_block {
287            mapping.insert(line, block_id);
288        }
289    }
290
291    mapping
292}
293
294/// Group VarRefs by their containing block.
295///
296/// Uses the line_to_block mapping to assign each VarRef to its block.
297/// Refs within each block are sorted by line number.
298/// VarRefs that don't map to any block are excluded.
299pub fn build_refs_by_block<'a>(
300    refs: &'a [VarRef],
301    line_to_block: &HashMap<u32, usize>,
302) -> HashMap<usize, Vec<&'a VarRef>> {
303    let mut by_block: HashMap<usize, Vec<&VarRef>> = HashMap::new();
304
305    for var_ref in refs {
306        if let Some(&block_id) = line_to_block.get(&var_ref.line) {
307            by_block.entry(block_id).or_default().push(var_ref);
308        }
309    }
310
311    // Sort refs within each block by line number
312    for refs in by_block.values_mut() {
313        refs.sort_by_key(|r| r.line);
314    }
315
316    by_block
317}
318
319/// Validate CFG structure for taint analysis.
320///
321/// Checks:
322/// - CFG has at least one block
323/// - Entry block exists in the block list
324/// - All edge endpoints reference valid block IDs
325///
326/// # Errors
327///
328/// Returns `TldrError::InvalidArgs` if validation fails.
329pub fn validate_cfg(cfg: &CfgInfo) -> Result<(), TldrError> {
330    // Check for empty CFG
331    if cfg.blocks.is_empty() {
332        return Err(TldrError::InvalidArgs {
333            arg: "cfg".to_string(),
334            message: "Empty CFG".to_string(),
335            suggestion: None,
336        });
337    }
338
339    // Collect all valid block IDs
340    let block_ids: HashSet<usize> = cfg.blocks.iter().map(|b| b.id).collect();
341
342    // Check entry block exists
343    if !block_ids.contains(&cfg.entry_block) {
344        return Err(TldrError::InvalidArgs {
345            arg: "cfg".to_string(),
346            message: format!("Entry block {} not in blocks", cfg.entry_block),
347            suggestion: Some(format!(
348                "Valid block IDs are: {:?}",
349                block_ids.iter().collect::<Vec<_>>()
350            )),
351        });
352    }
353
354    // Check all edges reference valid blocks
355    for edge in &cfg.edges {
356        if !block_ids.contains(&edge.from) {
357            return Err(TldrError::InvalidArgs {
358                arg: "cfg".to_string(),
359                message: format!(
360                    "Edge references invalid source block: {} -> {}",
361                    edge.from, edge.to
362                ),
363                suggestion: Some(format!(
364                    "Valid block IDs are: {:?}",
365                    block_ids.iter().collect::<Vec<_>>()
366                )),
367            });
368        }
369        if !block_ids.contains(&edge.to) {
370            return Err(TldrError::InvalidArgs {
371                arg: "cfg".to_string(),
372                message: format!(
373                    "Edge references invalid target block: {} -> {}",
374                    edge.from, edge.to
375                ),
376                suggestion: Some(format!(
377                    "Valid block IDs are: {:?}",
378                    block_ids.iter().collect::<Vec<_>>()
379                )),
380            });
381        }
382    }
383
384    Ok(())
385}
386
387// =============================================================================
388// Pattern Matching - Phase 3
389// =============================================================================
390
391/// Language-specific taint analysis patterns.
392///
393/// Each language has its own set of source, sink, and sanitizer patterns.
394/// Currently only Python patterns are defined; other languages fall back to Python.
395pub struct LanguagePatterns {
396    /// Regex patterns that identify taint sources and their source type.
397    pub sources: Vec<(Regex, TaintSourceType)>,
398    /// Regex patterns that identify taint sinks and their sink type.
399    pub sinks: Vec<(Regex, TaintSinkType)>,
400    /// Regex patterns that identify sanitizer calls and their sanitizer type.
401    pub sanitizers: Vec<(Regex, SanitizerType)>,
402}
403
404lazy_static! {
405    /// Python-specific taint patterns.
406    static ref PYTHON_PATTERNS: LanguagePatterns = LanguagePatterns {
407        sources: vec![
408            // UserInput: input() function
409            (Regex::new(r"\binput\s*\(").unwrap(), TaintSourceType::UserInput),
410            // HttpParam: request.args, request.form, request.values, request.cookies, request.headers
411            (Regex::new(r"request\.(args|form|json|data|values|cookies|headers)").unwrap(), TaintSourceType::HttpParam),
412            // HttpBody: request.get_json()
413            (Regex::new(r"request\.get_json\s*\(").unwrap(), TaintSourceType::HttpBody),
414            // Stdin: sys.stdin
415            (Regex::new(r"sys\.stdin").unwrap(), TaintSourceType::Stdin),
416            // EnvVar: os.environ, os.getenv
417            (Regex::new(r"os\.(environ|getenv)").unwrap(), TaintSourceType::EnvVar),
418            // FileRead: .read(), .readlines(), .readline()
419            (Regex::new(r"\.(read|readlines|readline)\s*\(").unwrap(), TaintSourceType::FileRead),
420        ],
421        sinks: vec![
422            // SqlQuery: .execute(), .executemany()
423            (Regex::new(r"\.(execute|executemany)\s*\(").unwrap(), TaintSinkType::SqlQuery),
424            // CodeEval: eval()
425            (Regex::new(r"\beval\s*\(").unwrap(), TaintSinkType::CodeEval),
426            // CodeExec: exec()
427            (Regex::new(r"\bexec\s*\(").unwrap(), TaintSinkType::CodeExec),
428            // CodeCompile: compile()
429            (Regex::new(r"\bcompile\s*\(").unwrap(), TaintSinkType::CodeCompile),
430            // ShellExec: subprocess.run, subprocess.call, subprocess.Popen, subprocess.check_output
431            (Regex::new(r"subprocess\.(run|call|Popen|check_output)\s*\(").unwrap(), TaintSinkType::ShellExec),
432            // ShellExec: os.system, os.popen, os.spawn*
433            (Regex::new(r"os\.(system|popen|spawn\w*)\s*\(").unwrap(), TaintSinkType::ShellExec),
434            // FileWrite: .write()
435            (Regex::new(r"\.write\s*\(").unwrap(), TaintSinkType::FileWrite),
436        ],
437        sanitizers: vec![
438            // Numeric: int(), float(), bool()
439            (Regex::new(r"\b(int|float|bool)\s*\(").unwrap(), SanitizerType::Numeric),
440            // Shell: shlex.quote, pipes.quote
441            (Regex::new(r"(shlex|pipes)\.quote\s*\(").unwrap(), SanitizerType::Shell),
442            // Html: html.escape, markupsafe.escape, cgi.escape
443            (Regex::new(r"(html|markupsafe|cgi)\.escape\s*\(").unwrap(), SanitizerType::Html),
444        ],
445    };
446}
447
448lazy_static! {
449    /// TypeScript/JavaScript taint patterns.
450    static ref TYPESCRIPT_PATTERNS: LanguagePatterns = LanguagePatterns {
451        sources: vec![
452            // HttpBody: req.body
453            (Regex::new(r"req\.body").unwrap(), TaintSourceType::HttpBody),
454            // HttpParam: req.params, req.query, req.cookies, req.headers
455            (Regex::new(r"req\.(params|query|cookies|headers)").unwrap(), TaintSourceType::HttpParam),
456            // EnvVar: process.env
457            (Regex::new(r"process\.env").unwrap(), TaintSourceType::EnvVar),
458            // Stdin: process.stdin
459            (Regex::new(r"process\.stdin").unwrap(), TaintSourceType::Stdin),
460            // UserInput: readline()
461            (Regex::new(r"readline\s*\(").unwrap(), TaintSourceType::UserInput),
462            // FileRead: .read(), .readFile
463            (Regex::new(r"\.(read|readFile)\s*\(").unwrap(), TaintSourceType::FileRead),
464        ],
465        sinks: vec![
466            // CodeEval: eval()
467            (Regex::new(r"\beval\s*\(").unwrap(), TaintSinkType::CodeEval),
468            // CodeEval: new Function()
469            (Regex::new(r"new\s+Function\s*\(").unwrap(), TaintSinkType::CodeEval),
470            // ShellExec: child_process.exec/spawn/execSync/execFile
471            (Regex::new(r"child_process\.(exec|spawn|execSync|execFile)\s*\(").unwrap(), TaintSinkType::ShellExec),
472            // ShellExec: execSync()
473            (Regex::new(r"\bexecSync\s*\(").unwrap(), TaintSinkType::ShellExec),
474            // FileWrite: innerHTML = (XSS)
475            (Regex::new(r"\.innerHTML\s*=").unwrap(), TaintSinkType::FileWrite),
476            // FileWrite: document.write (XSS)
477            (Regex::new(r"document\.write\s*\(").unwrap(), TaintSinkType::FileWrite),
478            // SqlQuery: .query(), .execute()
479            (Regex::new(r"\.(query|execute)\s*\(").unwrap(), TaintSinkType::SqlQuery),
480        ],
481        sanitizers: vec![
482            // Numeric: parseInt, Number, parseFloat
483            (Regex::new(r"\b(parseInt|Number|parseFloat)\s*\(").unwrap(), SanitizerType::Numeric),
484            // Html: encodeURIComponent, DOMPurify.sanitize
485            (Regex::new(r"(encodeURIComponent|DOMPurify\.sanitize)\s*\(").unwrap(), SanitizerType::Html),
486        ],
487    };
488}
489
490lazy_static! {
491    /// Go taint patterns.
492    static ref GO_PATTERNS: LanguagePatterns = LanguagePatterns {
493        sources: vec![
494            // UserInput: fmt.Scan*, bufio.NewReader, bufio.NewScanner
495            (Regex::new(r"(fmt\.Scan|bufio\.NewReader|bufio\.NewScanner)").unwrap(), TaintSourceType::UserInput),
496            // HttpParam: r.FormValue, r.PostFormValue, r.URL.Query(), .Query()
497            (Regex::new(r"(r\.(FormValue|PostFormValue|URL\.Query)\s*\(|\.Query\(\))").unwrap(), TaintSourceType::HttpParam),
498            // HttpBody: r.Body, .ReadAll(r.Body)
499            (Regex::new(r"(r\.Body|\.ReadAll\(r\.Body\))").unwrap(), TaintSourceType::HttpBody),
500            // EnvVar: os.Getenv
501            (Regex::new(r"os\.Getenv\s*\(").unwrap(), TaintSourceType::EnvVar),
502            // Stdin: os.Stdin
503            (Regex::new(r"os\.Stdin").unwrap(), TaintSourceType::Stdin),
504            // FileRead: os.Open, ioutil.ReadFile
505            (Regex::new(r"(os\.Open|ioutil\.ReadFile)\s*\(").unwrap(), TaintSourceType::FileRead),
506        ],
507        sinks: vec![
508            // ShellExec: exec.Command
509            (Regex::new(r"exec\.Command\s*\(").unwrap(), TaintSinkType::ShellExec),
510            // SqlQuery: db.Exec, db.Query, db.QueryRow
511            (Regex::new(r"db\.(Exec|Query|QueryRow)\s*\(").unwrap(), TaintSinkType::SqlQuery),
512            // FileWrite: template.HTML (XSS), fmt.Fprintf(w, ...)
513            (Regex::new(r"(template\.HTML\s*\(|fmt\.Fprintf\s*\()").unwrap(), TaintSinkType::FileWrite),
514        ],
515        sanitizers: vec![
516            // Numeric: strconv.Atoi, strconv.ParseInt, strconv.ParseFloat
517            (Regex::new(r"strconv\.(Atoi|ParseInt|ParseFloat)\s*\(").unwrap(), SanitizerType::Numeric),
518            // Html: html.EscapeString, url.QueryEscape
519            (Regex::new(r"(html\.EscapeString|url\.QueryEscape)\s*\(").unwrap(), SanitizerType::Html),
520        ],
521    };
522}
523
524lazy_static! {
525    /// Java taint patterns.
526    static ref JAVA_PATTERNS: LanguagePatterns = LanguagePatterns {
527        sources: vec![
528            // Stdin: new Scanner(System.in)
529            (Regex::new(r"new\s+Scanner\s*\(System\.in\)").unwrap(), TaintSourceType::Stdin),
530            // UserInput: readLine(), new BufferedReader
531            (Regex::new(r"(readLine\s*\(|new\s+BufferedReader\s*\()").unwrap(), TaintSourceType::UserInput),
532            // HttpParam: request.getParameter, getQueryString
533            (Regex::new(r"(request\.getParameter\s*\(|getQueryString\s*\()").unwrap(), TaintSourceType::HttpParam),
534            // EnvVar: System.getenv
535            (Regex::new(r"System\.getenv\s*\(").unwrap(), TaintSourceType::EnvVar),
536            // FileRead: new FileReader, Files.readAllLines
537            (Regex::new(r"(new\s+FileReader|Files\.readAllLines)").unwrap(), TaintSourceType::FileRead),
538        ],
539        sinks: vec![
540            // ShellExec: Runtime.getRuntime().exec, ProcessBuilder
541            (Regex::new(r"(Runtime\.getRuntime\(\)\.exec\s*\(|ProcessBuilder\s*\()").unwrap(), TaintSinkType::ShellExec),
542            // SqlQuery: Statement/Connection.execute/executeQuery/executeUpdate
543            (Regex::new(r"\.(execute|executeQuery|executeUpdate)\s*\(").unwrap(), TaintSinkType::SqlQuery),
544            // CodeEval: Class.forName
545            (Regex::new(r"Class\.forName\s*\(").unwrap(), TaintSinkType::CodeEval),
546        ],
547        sanitizers: vec![
548            // Numeric: Integer.parseInt, Long.parseLong, Double.parseDouble
549            (Regex::new(r"(Integer\.parseInt|Long\.parseLong|Double\.parseDouble)\s*\(").unwrap(), SanitizerType::Numeric),
550            // Html: ESAPI.encoder, StringEscapeUtils.escapeHtml
551            (Regex::new(r"(ESAPI\.encoder\s*\(|StringEscapeUtils\.escapeHtml)").unwrap(), SanitizerType::Html),
552        ],
553    };
554}
555
556lazy_static! {
557    /// Rust taint patterns.
558    static ref RUST_PATTERNS: LanguagePatterns = LanguagePatterns {
559        sources: vec![
560            // Stdin: std::io::stdin(), io::stdin()
561            (Regex::new(r"(std::)?io::stdin\s*\(").unwrap(), TaintSourceType::Stdin),
562            // EnvVar: std::env::var, std::env::args, env::var, env::args
563            (Regex::new(r"(std::)?env::var\s*\(").unwrap(), TaintSourceType::EnvVar),
564            // UserInput: std::env::args (command line args are user input)
565            (Regex::new(r"(std::)?env::args\s*\(").unwrap(), TaintSourceType::UserInput),
566            // FileRead: std::fs::read_to_string, fs::read_to_string, File::open
567            (Regex::new(r"((std::)?fs::read_to_string\s*\(|File::open)").unwrap(), TaintSourceType::FileRead),
568        ],
569        sinks: vec![
570            // ShellExec: Command::new, std::process::Command
571            (Regex::new(r"(Command::new\s*\(|std::process::Command)").unwrap(), TaintSinkType::ShellExec),
572            // CodeEval: unsafe block
573            (Regex::new(r"\bunsafe\s*\{").unwrap(), TaintSinkType::CodeEval),
574            // FileWrite: std::ptr::write, std::ptr::read
575            (Regex::new(r"std::ptr::(write|read)\s*\(").unwrap(), TaintSinkType::FileWrite),
576        ],
577        sanitizers: vec![
578            // Numeric: .parse::<i32>(), etc.
579            (Regex::new(r"\.parse::<(i32|i64|u32|u64|f32|f64|usize|isize)>\s*\(").unwrap(), SanitizerType::Numeric),
580        ],
581    };
582}
583
584lazy_static! {
585    /// C taint patterns.
586    static ref C_PATTERNS: LanguagePatterns = LanguagePatterns {
587        sources: vec![
588            // UserInput: scanf, fscanf, sscanf
589            (Regex::new(r"\b(scanf|fscanf|sscanf)\s*\(").unwrap(), TaintSourceType::UserInput),
590            // UserInput: fgets, gets, getchar
591            (Regex::new(r"\b(fgets|gets|getchar)\s*\(").unwrap(), TaintSourceType::UserInput),
592            // EnvVar: getenv
593            (Regex::new(r"\bgetenv\s*\(").unwrap(), TaintSourceType::EnvVar),
594            // FileRead: fread, fopen
595            (Regex::new(r"\b(fread|fopen)\s*\(").unwrap(), TaintSourceType::FileRead),
596            // UserInput: recv, recvfrom (network)
597            (Regex::new(r"\b(recv|recvfrom)\s*\(").unwrap(), TaintSourceType::UserInput),
598        ],
599        sinks: vec![
600            // ShellExec: system, popen, execl, execv, execvp
601            (Regex::new(r"\b(system|popen|execl|execv|execvp)\s*\(").unwrap(), TaintSinkType::ShellExec),
602            // ShellExec: sprintf, vsprintf (format string vuln)
603            (Regex::new(r"\b(sprintf|vsprintf)\s*\(").unwrap(), TaintSinkType::ShellExec),
604            // FileWrite: strcpy, strcat, strncpy (buffer overflow)
605            (Regex::new(r"\b(strcpy|strcat|strncpy)\s*\(").unwrap(), TaintSinkType::FileWrite),
606        ],
607        sanitizers: vec![
608            // Numeric: atoi, atol, atof, strtol, strtoul, strtod
609            (Regex::new(r"\b(atoi|atol|atof|strtol|strtoul|strtod)\s*\(").unwrap(), SanitizerType::Numeric),
610            // Shell: snprintf (bounded write)
611            (Regex::new(r"\bsnprintf\s*\(").unwrap(), SanitizerType::Shell),
612        ],
613    };
614}
615
616lazy_static! {
617    /// C++ taint patterns.
618    static ref CPP_PATTERNS: LanguagePatterns = LanguagePatterns {
619        sources: vec![
620            // UserInput: std::cin >>
621            (Regex::new(r"std::cin\s*>>").unwrap(), TaintSourceType::UserInput),
622            // UserInput: std::getline, getline
623            (Regex::new(r"(std::)?getline\s*\(").unwrap(), TaintSourceType::UserInput),
624            // EnvVar: getenv
625            (Regex::new(r"\bgetenv\s*\(").unwrap(), TaintSourceType::EnvVar),
626            // FileRead: std::ifstream, std::fstream
627            (Regex::new(r"std::(ifstream|fstream)").unwrap(), TaintSourceType::FileRead),
628        ],
629        sinks: vec![
630            // ShellExec: system, popen, std::system
631            (Regex::new(r"(\bsystem\s*\(|\bpopen\s*\(|std::system\s*\()").unwrap(), TaintSinkType::ShellExec),
632            // ShellExec: sprintf
633            (Regex::new(r"\bsprintf\s*\(").unwrap(), TaintSinkType::ShellExec),
634        ],
635        sanitizers: vec![
636            // Numeric: std::stoi, std::stol, std::stoul, std::stoll, std::stof, std::stod
637            (Regex::new(r"std::sto(i|l|ul|ll|f|d)\s*\(").unwrap(), SanitizerType::Numeric),
638            // Numeric: static_cast<int/long/float/double>
639            (Regex::new(r"static_cast<(int|long|float|double)>\s*\(").unwrap(), SanitizerType::Numeric),
640        ],
641    };
642}
643
644lazy_static! {
645    /// Ruby taint patterns.
646    static ref RUBY_PATTERNS: LanguagePatterns = LanguagePatterns {
647        sources: vec![
648            // UserInput: gets
649            (Regex::new(r"\bgets\b").unwrap(), TaintSourceType::UserInput),
650            // Stdin: STDIN.read, STDIN.gets, STDIN.readline
651            (Regex::new(r"STDIN\.(read|gets|readline)").unwrap(), TaintSourceType::Stdin),
652            // HttpParam: params[
653            (Regex::new(r"\bparams\[").unwrap(), TaintSourceType::HttpParam),
654            // EnvVar: ENV[
655            (Regex::new(r"ENV\[").unwrap(), TaintSourceType::EnvVar),
656            // FileRead: File.read, File.open
657            (Regex::new(r"File\.(read|open)\s*\(").unwrap(), TaintSourceType::FileRead),
658        ],
659        sinks: vec![
660            // CodeEval: eval
661            (Regex::new(r"\beval\s*\(").unwrap(), TaintSinkType::CodeEval),
662            // ShellExec: system, exec
663            (Regex::new(r"\b(system|exec)\s*\(").unwrap(), TaintSinkType::ShellExec),
664            // ShellExec: IO.popen
665            (Regex::new(r"IO\.popen\s*\(").unwrap(), TaintSinkType::ShellExec),
666            // CodeEval: send (dynamic dispatch)
667            (Regex::new(r"\.send\s*\(").unwrap(), TaintSinkType::CodeEval),
668        ],
669        sanitizers: vec![
670            // Numeric: .to_i, .to_f
671            (Regex::new(r"\.(to_i|to_f)\b").unwrap(), SanitizerType::Numeric),
672            // Html: CGI.escapeHTML, Rack::Utils.escape_html
673            (Regex::new(r"(CGI\.escapeHTML|Rack::Utils\.escape_html)\s*\(").unwrap(), SanitizerType::Html),
674        ],
675    };
676}
677
678lazy_static! {
679    /// Kotlin taint patterns.
680    static ref KOTLIN_PATTERNS: LanguagePatterns = LanguagePatterns {
681        sources: vec![
682            // UserInput: readLine(), readln()
683            (Regex::new(r"\b(readLine|readln)\s*\(\)").unwrap(), TaintSourceType::UserInput),
684            // EnvVar: System.getenv
685            (Regex::new(r"System\.getenv\s*\(").unwrap(), TaintSourceType::EnvVar),
686            // UserInput: BufferedReader
687            (Regex::new(r"BufferedReader\s*\(").unwrap(), TaintSourceType::UserInput),
688            // HttpParam: request.getParameter
689            (Regex::new(r"request\.getParameter\s*\(").unwrap(), TaintSourceType::HttpParam),
690        ],
691        sinks: vec![
692            // ShellExec: Runtime.getRuntime().exec, ProcessBuilder
693            (Regex::new(r"(Runtime\.getRuntime\(\)\.exec\s*\(|ProcessBuilder\s*\()").unwrap(), TaintSinkType::ShellExec),
694            // SqlQuery: .execute, .executeQuery, prepareStatement
695            (Regex::new(r"\.(execute|executeQuery)\s*\(|prepareStatement\s*\(").unwrap(), TaintSinkType::SqlQuery),
696        ],
697        sanitizers: vec![
698            // Numeric: .toInt(), .toLong(), .toDouble(), .toFloat()
699            (Regex::new(r"\.(toInt|toLong|toDouble|toFloat)\s*\(\)").unwrap(), SanitizerType::Numeric),
700        ],
701    };
702}
703
704lazy_static! {
705    /// Swift taint patterns.
706    static ref SWIFT_PATTERNS: LanguagePatterns = LanguagePatterns {
707        sources: vec![
708            // UserInput: readLine()
709            (Regex::new(r"\breadLine\s*\(\)").unwrap(), TaintSourceType::UserInput),
710            // EnvVar: ProcessInfo.processInfo.environment[
711            (Regex::new(r"ProcessInfo\.processInfo\.environment\[").unwrap(), TaintSourceType::EnvVar),
712            // FileRead: FileManager.default, URLSession
713            (Regex::new(r"(FileManager\.default|URLSession)").unwrap(), TaintSourceType::FileRead),
714        ],
715        sinks: vec![
716            // ShellExec: Process(), NSTask
717            (Regex::new(r"(Process\s*\(\)|NSTask)").unwrap(), TaintSinkType::ShellExec),
718            // SqlQuery: sqlite3_exec
719            (Regex::new(r"sqlite3_exec\s*\(").unwrap(), TaintSinkType::SqlQuery),
720        ],
721        sanitizers: vec![
722            // Numeric: Int(), Double(), Float()
723            (Regex::new(r"\b(Int|Double|Float)\s*\(").unwrap(), SanitizerType::Numeric),
724            // Html: addingPercentEncoding
725            (Regex::new(r"addingPercentEncoding\s*\(").unwrap(), SanitizerType::Html),
726        ],
727    };
728}
729
730lazy_static! {
731    /// C# taint patterns.
732    static ref CSHARP_PATTERNS: LanguagePatterns = LanguagePatterns {
733        sources: vec![
734            // UserInput: Console.ReadLine()
735            (Regex::new(r"Console\.ReadLine\s*\(").unwrap(), TaintSourceType::UserInput),
736            // HttpParam: Request.QueryString[, Request.Form[
737            (Regex::new(r"Request\.(QueryString|Form)\[").unwrap(), TaintSourceType::HttpParam),
738            // EnvVar: Environment.GetEnvironmentVariable
739            (Regex::new(r"Environment\.GetEnvironmentVariable\s*\(").unwrap(), TaintSourceType::EnvVar),
740            // FileRead: File.ReadAllText, File.ReadAllLines, File.OpenRead, StreamReader
741            (Regex::new(r"(File\.(ReadAllText|ReadAllLines|OpenRead)\s*\(|StreamReader\s*\()").unwrap(), TaintSourceType::FileRead),
742        ],
743        sinks: vec![
744            // ShellExec: Process.Start
745            (Regex::new(r"Process\.Start\s*\(").unwrap(), TaintSinkType::ShellExec),
746            // SqlQuery: SqlCommand, .ExecuteNonQuery, .ExecuteReader
747            (Regex::new(r"(SqlCommand\s*\(|\.ExecuteNonQuery\s*\(|\.ExecuteReader\s*\()").unwrap(), TaintSinkType::SqlQuery),
748            // CodeEval: Activator.CreateInstance
749            (Regex::new(r"Activator\.CreateInstance\s*\(").unwrap(), TaintSinkType::CodeEval),
750        ],
751        sanitizers: vec![
752            // Numeric: int.Parse, Convert.ToInt32, double.Parse
753            (Regex::new(r"(int\.Parse|Convert\.ToInt32|double\.Parse)\s*\(").unwrap(), SanitizerType::Numeric),
754            // Html: HttpUtility.HtmlEncode
755            (Regex::new(r"HttpUtility\.HtmlEncode\s*\(").unwrap(), SanitizerType::Html),
756        ],
757    };
758}
759
760lazy_static! {
761    /// Scala taint patterns.
762    static ref SCALA_PATTERNS: LanguagePatterns = LanguagePatterns {
763        sources: vec![
764            // UserInput: StdIn.readLine, scala.io.StdIn
765            (Regex::new(r"(StdIn\.readLine\s*\(|scala\.io\.StdIn)").unwrap(), TaintSourceType::UserInput),
766            // EnvVar: System.getenv
767            (Regex::new(r"System\.getenv\s*\(").unwrap(), TaintSourceType::EnvVar),
768            // FileRead: Source.fromFile
769            (Regex::new(r"Source\.fromFile\s*\(").unwrap(), TaintSourceType::FileRead),
770        ],
771        sinks: vec![
772            // ShellExec: Runtime.getRuntime.exec, sys.process, Process()
773            (Regex::new(r"(Runtime\.getRuntime\.exec\s*\(|sys\.process|Process\s*\()").unwrap(), TaintSinkType::ShellExec),
774            // SqlQuery: stmt.execute, statement.execute, .executeQuery
775            (Regex::new(r"\.(execute|executeQuery)\s*\(").unwrap(), TaintSinkType::SqlQuery),
776        ],
777        sanitizers: vec![
778            // Numeric: .toInt, .toLong, .toDouble
779            (Regex::new(r"\.(toInt|toLong|toDouble)\b").unwrap(), SanitizerType::Numeric),
780            // Html: StringEscapeUtils.escapeHtml
781            (Regex::new(r"StringEscapeUtils\.escapeHtml").unwrap(), SanitizerType::Html),
782        ],
783    };
784}
785
786lazy_static! {
787    /// PHP taint patterns.
788    static ref PHP_PATTERNS: LanguagePatterns = LanguagePatterns {
789        sources: vec![
790            // HttpParam: $_GET, $_REQUEST, $_COOKIE, $_SERVER
791            (Regex::new(r"\$_(GET|REQUEST|COOKIE|SERVER)\[").unwrap(), TaintSourceType::HttpParam),
792            // HttpBody: $_POST
793            (Regex::new(r"\$_POST\[").unwrap(), TaintSourceType::HttpBody),
794            // UserInput: fgets
795            (Regex::new(r"\bfgets\s*\(").unwrap(), TaintSourceType::UserInput),
796            // FileRead: file_get_contents
797            (Regex::new(r"file_get_contents\s*\(").unwrap(), TaintSourceType::FileRead),
798            // EnvVar: getenv, $_ENV
799            (Regex::new(r"(getenv\s*\(|\$_ENV\[)").unwrap(), TaintSourceType::EnvVar),
800        ],
801        sinks: vec![
802            // CodeEval: eval
803            (Regex::new(r"\beval\s*\(").unwrap(), TaintSinkType::CodeEval),
804            // ShellExec: exec, system, passthru, shell_exec, popen, proc_open
805            (Regex::new(r"\b(exec|system|passthru|shell_exec|popen|proc_open)\s*\(").unwrap(), TaintSinkType::ShellExec),
806            // SqlQuery: mysqli_query, ->query
807            (Regex::new(r"(mysqli_query\s*\(|->query\s*\()").unwrap(), TaintSinkType::SqlQuery),
808        ],
809        sanitizers: vec![
810            // Numeric: intval, floatval, (int), (float)
811            (Regex::new(r"(\b(intval|floatval)\s*\(|\(int\)|\(float\))").unwrap(), SanitizerType::Numeric),
812            // Html: htmlspecialchars, htmlentities
813            (Regex::new(r"(htmlspecialchars|htmlentities)\s*\(").unwrap(), SanitizerType::Html),
814            // Shell: mysqli_real_escape_string
815            (Regex::new(r"mysqli_real_escape_string\s*\(").unwrap(), SanitizerType::Shell),
816        ],
817    };
818}
819
820lazy_static! {
821    /// Lua/Luau taint patterns.
822    static ref LUA_PATTERNS: LanguagePatterns = LanguagePatterns {
823        sources: vec![
824            // UserInput: io.read
825            (Regex::new(r"io\.read\s*\(").unwrap(), TaintSourceType::UserInput),
826            // EnvVar: os.getenv
827            (Regex::new(r"os\.getenv\s*\(").unwrap(), TaintSourceType::EnvVar),
828            // FileRead: io.open
829            (Regex::new(r"io\.open\s*\(").unwrap(), TaintSourceType::FileRead),
830        ],
831        sinks: vec![
832            // ShellExec: os.execute
833            (Regex::new(r"os\.execute\s*\(").unwrap(), TaintSinkType::ShellExec),
834            // ShellExec: io.popen
835            (Regex::new(r"io\.popen\s*\(").unwrap(), TaintSinkType::ShellExec),
836            // CodeEval: loadstring, load, dofile, loadfile
837            (Regex::new(r"\b(loadstring|load|dofile|loadfile)\s*\(").unwrap(), TaintSinkType::CodeEval),
838        ],
839        sanitizers: vec![
840            // Numeric: tonumber
841            (Regex::new(r"\btonumber\s*\(").unwrap(), SanitizerType::Numeric),
842        ],
843    };
844}
845
846lazy_static! {
847    /// Elixir taint patterns.
848    static ref ELIXIR_PATTERNS: LanguagePatterns = LanguagePatterns {
849        sources: vec![
850            // UserInput: IO.gets
851            (Regex::new(r"IO\.gets\s*\(").unwrap(), TaintSourceType::UserInput),
852            // EnvVar: System.get_env
853            (Regex::new(r"System\.get_env\s*\(").unwrap(), TaintSourceType::EnvVar),
854            // FileRead: File.read, File.read!
855            (Regex::new(r"File\.(read|read!)\s*\(").unwrap(), TaintSourceType::FileRead),
856        ],
857        sinks: vec![
858            // ShellExec: System.cmd
859            (Regex::new(r"System\.cmd\s*\(").unwrap(), TaintSinkType::ShellExec),
860            // CodeEval: Code.eval_string
861            (Regex::new(r"Code\.eval_string\s*\(").unwrap(), TaintSinkType::CodeEval),
862            // SqlQuery: Ecto.Adapters.SQL.query
863            (Regex::new(r"Ecto\.Adapters\.SQL\.query\s*\(").unwrap(), TaintSinkType::SqlQuery),
864        ],
865        sanitizers: vec![
866            // Numeric: String.to_integer, String.to_float
867            (Regex::new(r"String\.(to_integer|to_float)\s*\(").unwrap(), SanitizerType::Numeric),
868            // Html: Phoenix.HTML.html_escape
869            (Regex::new(r"Phoenix\.HTML\.html_escape\s*\(").unwrap(), SanitizerType::Html),
870        ],
871    };
872}
873
874lazy_static! {
875    /// OCaml taint patterns.
876    static ref OCAML_PATTERNS: LanguagePatterns = LanguagePatterns {
877        sources: vec![
878            // UserInput: read_line
879            (Regex::new(r"\bread_line\s").unwrap(), TaintSourceType::UserInput),
880            // EnvVar: Sys.getenv
881            (Regex::new(r"Sys\.getenv\s").unwrap(), TaintSourceType::EnvVar),
882            // UserInput: input_line (reading from a channel)
883            (Regex::new(r"\binput_line\s").unwrap(), TaintSourceType::UserInput),
884            // FileRead: In_channel.read_all, In_channel.input_all
885            (Regex::new(r"In_channel\.(read_all|input_all)\s").unwrap(), TaintSourceType::FileRead),
886        ],
887        sinks: vec![
888            // ShellExec: Sys.command
889            (Regex::new(r"Sys\.command\s").unwrap(), TaintSinkType::ShellExec),
890            // ShellExec: Unix.execvp
891            (Regex::new(r"Unix\.execvp\s").unwrap(), TaintSinkType::ShellExec),
892            // SqlQuery: Sqlite3.exec
893            (Regex::new(r"Sqlite3\.exec\s").unwrap(), TaintSinkType::SqlQuery),
894        ],
895        sanitizers: vec![
896            // Numeric: int_of_string, float_of_string
897            (Regex::new(r"\b(int_of_string|float_of_string)\s").unwrap(), SanitizerType::Numeric),
898        ],
899    };
900}
901
902/// Get taint analysis patterns for a given language.
903///
904/// Each language has its own set of source, sink, and sanitizer patterns.
905/// TypeScript/JavaScript share patterns, as do Lua/Luau.
906pub fn get_patterns(language: Language) -> &'static LanguagePatterns {
907    match language {
908        Language::Python => &PYTHON_PATTERNS,
909        Language::TypeScript | Language::JavaScript => &TYPESCRIPT_PATTERNS,
910        Language::Go => &GO_PATTERNS,
911        Language::Java => &JAVA_PATTERNS,
912        Language::Rust => &RUST_PATTERNS,
913        Language::C => &C_PATTERNS,
914        Language::Cpp => &CPP_PATTERNS,
915        Language::Ruby => &RUBY_PATTERNS,
916        Language::Kotlin => &KOTLIN_PATTERNS,
917        Language::Swift => &SWIFT_PATTERNS,
918        Language::CSharp => &CSHARP_PATTERNS,
919        Language::Scala => &SCALA_PATTERNS,
920        Language::Php => &PHP_PATTERNS,
921        Language::Lua | Language::Luau => &LUA_PATTERNS,
922        Language::Elixir => &ELIXIR_PATTERNS,
923        Language::Ocaml => &OCAML_PATTERNS,
924    }
925}
926
927/// Detect taint sources in a statement.
928///
929/// Scans the statement for patterns matching known taint sources (e.g., `input()`,
930/// `request.args`, `os.environ`). If a source is found and the statement is an
931/// assignment, returns a `TaintSource` with the assigned variable name.
932///
933/// # Arguments
934///
935/// * `statement` - The source code statement to analyze
936/// * `line` - The line number of the statement
937/// * `language` - The programming language (determines which patterns to use)
938///
939/// # Returns
940///
941/// A vector of detected `TaintSource`s. Usually 0 or 1, but could be more if
942/// multiple sources appear in the same statement.
943pub fn detect_sources(statement: &str, line: u32, language: Language) -> Vec<TaintSource> {
944    let mut sources = Vec::new();
945    let patterns = get_patterns(language);
946
947    for (pattern, source_type) in patterns.sources.iter() {
948        if pattern.is_match(statement) {
949            // Try to extract variable name from assignment (left side of =)
950            if let Some(var) = extract_assigned_var(statement) {
951                sources.push(TaintSource {
952                    var,
953                    line,
954                    source_type: *source_type,
955                    statement: Some(statement.to_string()),
956                });
957            } else {
958                // For non-assignment sources (e.g., C's scanf(buf), fgets(buf, ...)),
959                // extract the first variable argument from the call
960                if let Some(var) = extract_call_arg(statement, pattern) {
961                    sources.push(TaintSource {
962                        var,
963                        line,
964                        source_type: *source_type,
965                        statement: Some(statement.to_string()),
966                    });
967                } else {
968                    // Last resort: use a synthetic variable name from the source type
969                    // This handles patterns like "std::cin >> input" or "STDIN.read"
970                    // where neither assignment nor call extraction works
971                    let var = extract_source_var_from_statement(statement);
972                    if let Some(var) = var {
973                        sources.push(TaintSource {
974                            var,
975                            line,
976                            source_type: *source_type,
977                            statement: Some(statement.to_string()),
978                        });
979                    }
980                }
981            }
982        }
983    }
984
985    sources
986}
987
988/// Extract a variable name from a source statement when there's no assignment or call arg.
989///
990/// Handles patterns like:
991/// - "std::cin >> input" -> "input"
992/// - "fmt.Scan(&input)" -> "input"
993/// - "std::ifstream file(path)" -> "file"
994/// - "scanf(\"%s\", buf)" -> "buf" (already handled by extract_call_arg)
995fn extract_source_var_from_statement(statement: &str) -> Option<String> {
996    // Handle C++ "cin >> var" pattern
997    if let Some(pos) = statement.find(">>") {
998        let after = statement[pos + 2..].trim();
999        let var = after.split_whitespace().next().unwrap_or("");
1000        let var = var.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_');
1001        if is_valid_identifier(var) {
1002            return Some(var.to_string());
1003        }
1004    }
1005
1006    // Handle "&var" references (Go's fmt.Scan(&input))
1007    if let Some(pos) = statement.find('&') {
1008        let after = &statement[pos + 1..];
1009        let var = after
1010            .split(|c: char| !c.is_alphanumeric() && c != '_')
1011            .next()
1012            .unwrap_or("");
1013        if is_valid_identifier(var) {
1014            return Some(var.to_string());
1015        }
1016    }
1017
1018    // Handle C++ constructor-style declarations: "Type var(args)" or "Type var"
1019    // e.g., "std::ifstream file(path)" -> "file"
1020    let tokens: Vec<&str> = statement.split_whitespace().collect();
1021    if tokens.len() >= 2 {
1022        // Find a token that looks like a variable (followed by '(' or end)
1023        for tok in tokens.iter().skip(1) {
1024            // Strip trailing '(' and everything after for constructor calls
1025            let var = tok.split('(').next().unwrap_or(tok);
1026            let var = var.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_');
1027            if is_valid_identifier(var) && var.len() > 1 {
1028                return Some(var.to_string());
1029            }
1030        }
1031    }
1032
1033    None
1034}
1035
1036/// Detect taint sinks in a statement.
1037///
1038/// Scans the statement for patterns matching known taint sinks (e.g., `execute()`,
1039/// `eval()`, `os.system()`). If a sink is found, extracts the variable being
1040/// passed as an argument.
1041///
1042/// # Arguments
1043///
1044/// * `statement` - The source code statement to analyze
1045/// * `line` - The line number of the statement
1046/// * `language` - The programming language (determines which patterns to use)
1047///
1048/// # Returns
1049///
1050/// A vector of detected `TaintSink`s. The `tainted` field is set to `false`
1051/// initially; it will be updated by the taint propagation analysis.
1052pub fn detect_sinks(statement: &str, line: u32, language: Language) -> Vec<TaintSink> {
1053    let mut sinks = Vec::new();
1054    let patterns = get_patterns(language);
1055    for (pattern, sink_type) in patterns.sinks.iter() {
1056        if pattern.is_match(statement) {
1057            // Extract variable name from call argument
1058            if let Some(var) = extract_call_arg(statement, pattern) {
1059                sinks.push(TaintSink {
1060                    var,
1061                    line,
1062                    sink_type: *sink_type,
1063                    tainted: false,
1064                    statement: Some(statement.to_string()),
1065                });
1066            } else {
1067                // Handle assignment-style sinks (e.g., "element.innerHTML = userContent")
1068                // and patterns where the dangerous argument is on the RHS of an assignment
1069                if let Some(var) = extract_sink_var_from_statement(statement, pattern) {
1070                    sinks.push(TaintSink {
1071                        var,
1072                        line,
1073                        sink_type: *sink_type,
1074                        tainted: false,
1075                        statement: Some(statement.to_string()),
1076                    });
1077                } else {
1078                    // Fallback: extract interpolated variables from format strings.
1079                    // This catches f"SELECT {query}", `SELECT ${query}`, etc.
1080                    // where the sink argument is a string literal with embedded variables.
1081                    let interp_vars = extract_interpolated_vars(statement);
1082                    for var in interp_vars {
1083                        sinks.push(TaintSink {
1084                            var,
1085                            line,
1086                            sink_type: *sink_type,
1087                            tainted: false,
1088                            statement: Some(statement.to_string()),
1089                        });
1090                }
1091            }
1092        }
1093    }
1094}
1095sinks
1096}
1097
1098/// Extract a variable from a sink statement when extract_call_arg fails.
1099///
1100/// Handles:
1101/// - Assignment-style sinks: "element.innerHTML = userContent" -> "userContent"
1102/// - Non-call sinks: "unsafe { std::ptr::write(ptr, val) }" -> "ptr"
1103/// - Process constructors: "new ProcessBuilder(cmd).start()" -> "cmd"
1104/// - Space-separated args (OCaml): "Unix.execvp cmd args" -> "cmd"
1105/// - Scala: "import sys.process._; cmd.!" -> "cmd"
1106fn extract_sink_var_from_statement(statement: &str, pattern: &Regex) -> Option<String> {
1107    if let Some(m) = pattern.find(statement) {
1108        let after = &statement[m.end()..];
1109        let after = after.trim();
1110
1111        // If the pattern matched an assignment (innerHTML =), RHS is the var
1112        if after.is_empty() || !after.starts_with('(') {
1113            // Get what's after the "=" in the full statement
1114            if let Some(eq_pos) = statement.rfind('=') {
1115                // Make sure it's not == or other compound operators
1116                let before_eq = if eq_pos > 0 {
1117                    statement.as_bytes()[eq_pos - 1]
1118                } else {
1119                    b' '
1120                };
1121                let after_eq = if eq_pos + 1 < statement.len() {
1122                    statement.as_bytes()[eq_pos + 1]
1123                } else {
1124                    b' '
1125                };
1126                if before_eq != b'='
1127                    && before_eq != b'!'
1128                    && before_eq != b'<'
1129                    && before_eq != b'>'
1130                    && after_eq != b'='
1131                {
1132                    let rhs = statement[eq_pos + 1..].trim();
1133                    let var = rhs
1134                        .split(|c: char| !c.is_alphanumeric() && c != '_')
1135                        .next()
1136                        .unwrap_or("");
1137                    if is_valid_identifier(var) {
1138                        return Some(var.to_string());
1139                    }
1140                }
1141            }
1142        }
1143
1144        // Try to find a parenthesized argument after the pattern
1145        // Handle "new ProcessBuilder(cmd).start()" or "ProcessBuilder(cmd)"
1146        let search_area = &statement[m.start()..];
1147        if let Some(open) = search_area.find('(') {
1148            let rest = &search_area[open + 1..];
1149            let end = rest.find([',', ')']).unwrap_or(rest.len());
1150            let arg = rest[..end].trim();
1151            if !arg.starts_with('"') && !arg.starts_with('\'') && !arg.is_empty() {
1152                let var_name = arg.split('.').next().unwrap_or(arg);
1153                let var_name = var_name.trim_start_matches('$');
1154                if is_valid_identifier(var_name) {
1155                    return Some(var_name.to_string());
1156                }
1157            }
1158        }
1159
1160        // Handle space-separated arguments (OCaml, Haskell, etc.)
1161        // e.g., "Unix.execvp cmd args" -> "cmd"
1162        // e.g., "Sys.command cmd" -> "cmd"
1163        if !after.is_empty() && !after.starts_with('(') {
1164            // Take the first space-separated token after the pattern
1165            let token = after
1166                .split(|c: char| c.is_whitespace() || c == ';')
1167                .next()
1168                .unwrap_or("");
1169            let token = token.trim_end_matches(|c: char| !c.is_alphanumeric() && c != '_');
1170            if is_valid_identifier(token) {
1171                return Some(token.to_string());
1172            }
1173        }
1174
1175        // Handle semicolon-separated statements
1176        // e.g., "import sys.process._; cmd.!" -> look before semicolon for var
1177        if statement.contains(';') {
1178            // Look for identifiers in the other parts of the statement
1179            for part in statement.split(';') {
1180                let part = part.trim();
1181                // Skip the part that contains the pattern match
1182                if pattern.is_match(part) {
1183                    continue;
1184                }
1185                // Find first identifier in this part
1186                let var = part
1187                    .split(|c: char| !c.is_alphanumeric() && c != '_')
1188                    .find(|t| is_valid_identifier(t));
1189                if let Some(var) = var {
1190                    return Some(var.to_string());
1191                }
1192            }
1193        }
1194    }
1195
1196    None
1197}
1198
1199/// Check if a statement contains a sanitizer and return its type.
1200///
1201/// Scans for patterns like `int()`, `shlex.quote()`, `html.escape()` that
1202/// neutralize taint for specific sink types.
1203///
1204/// # Arguments
1205///
1206/// * `statement` - The source code statement to analyze
1207/// * `language` - The programming language (determines which patterns to use)
1208///
1209/// # Returns
1210///
1211/// `Some(SanitizerType)` if a sanitizer is detected, `None` otherwise.
1212pub fn detect_sanitizer(statement: &str, language: Language) -> Option<SanitizerType> {
1213    let patterns = get_patterns(language);
1214    for (pattern, sanitizer_type) in patterns.sanitizers.iter() {
1215        if pattern.is_match(statement) {
1216            return Some(*sanitizer_type);
1217        }
1218    }
1219    None
1220}
1221
1222/// Convenience wrapper to check if a statement contains any sanitizer.
1223///
1224/// # Arguments
1225///
1226/// * `statement` - The source code statement to analyze
1227/// * `language` - The programming language (determines which patterns to use)
1228///
1229/// # Returns
1230///
1231/// `true` if the statement contains a sanitizer, `false` otherwise.
1232pub fn is_sanitizer(statement: &str, language: Language) -> bool {
1233    detect_sanitizer(statement, language).is_some()
1234}
1235
1236/// Find sanitizers in a statement and return the sanitized variable with its type.
1237///
1238/// # Arguments
1239///
1240/// * `statement` - The source code statement to analyze
1241/// * `line` - The line number of the statement
1242/// * `language` - The programming language (determines which patterns to use)
1243///
1244/// # Returns
1245///
1246/// A vector of (variable_name, SanitizerType) pairs for each sanitizer found.
1247pub fn find_sanitizers_in_statement(
1248    statement: &str,
1249    _line: u32,
1250    language: Language,
1251) -> Vec<(String, SanitizerType)> {
1252    let mut result = Vec::new();
1253    let patterns = get_patterns(language);
1254
1255    for (pattern, sanitizer_type) in patterns.sanitizers.iter() {
1256        if pattern.is_match(statement) {
1257            // The sanitized variable is the one being assigned to
1258            if let Some(var) = extract_assigned_var(statement) {
1259                result.push((var, *sanitizer_type));
1260            }
1261        }
1262    }
1263
1264    result
1265}
1266
1267/// Extract variable name from assignment (LHS of =).
1268///
1269/// Handles various Python assignment patterns:
1270/// - Simple assignment: `var = ...`
1271/// - Type-annotated assignment: `var: Type = ...`
1272/// - Walrus operator: `(var := ...)`
1273///
1274/// # Arguments
1275///
1276/// * `statement` - The source code statement to analyze
1277///
1278/// # Returns
1279///
1280/// `Some(variable_name)` if an assignment is detected, `None` otherwise.
1281fn extract_assigned_var(statement: &str) -> Option<String> {
1282    let trimmed = statement.trim();
1283
1284    // Walrus operator / Go short declaration: (var := ...) or var := ...
1285    if let Some(pos) = trimmed.find(":=") {
1286        let before = &trimmed[..pos];
1287        let var = before.trim().trim_start_matches('(').trim();
1288        if is_valid_identifier(var) {
1289            return Some(var.to_string());
1290        }
1291        // Handle "rows, err := ..." -> take first variable
1292        if let Some(first) = var.split(',').next() {
1293            let first = first.trim();
1294            if is_valid_identifier(first) {
1295                return Some(first.to_string());
1296            }
1297        }
1298    }
1299
1300    // Standard assignment: var = ...
1301    if let Some(pos) = trimmed.find('=') {
1302        // Skip == comparison
1303        if pos > 0 && trimmed.chars().nth(pos.saturating_sub(1)) == Some('=') {
1304            return None;
1305        }
1306        if pos + 1 < trimmed.len() && trimmed.chars().nth(pos + 1) == Some('=') {
1307            return None;
1308        }
1309        // Skip !=, <=, >=
1310        if pos > 0 {
1311            let prev_char = trimmed.chars().nth(pos.saturating_sub(1));
1312            if prev_char == Some('!') || prev_char == Some('<') || prev_char == Some('>') {
1313                return None;
1314            }
1315        }
1316
1317        let before = &trimmed[..pos];
1318        // Handle type annotation: var: Type = ...
1319        let var_part = if let Some(colon_pos) = before.find(':') {
1320            &before[..colon_pos]
1321        } else {
1322            before
1323        };
1324        let var = var_part.trim();
1325        if is_valid_identifier(var) {
1326            return Some(var.to_string());
1327        }
1328
1329        // Handle multi-language patterns where there are keywords/types before the var:
1330        // JavaScript/TypeScript: const/let/var name = ...
1331        // Rust: let/let mut name = ...
1332        // Java/C#: TypeName name = ...
1333        // C/C++: type *name = ..., type name = ...
1334        // Kotlin: val/var name = ...
1335        // Swift: let/var name = ...
1336        // Lua: local name = ...
1337        // Scala: val/var name = ...
1338        // OCaml: let name = ...
1339        // PHP: $name = ...
1340        let tokens: Vec<&str> = var.split_whitespace().collect();
1341        if tokens.len() >= 2 {
1342            // Take the last token as the variable name
1343            let last = tokens[tokens.len() - 1];
1344            // Strip pointer/reference markers for C/C++
1345            let clean = last.trim_start_matches('*').trim_start_matches('&');
1346            // Strip PHP $ prefix for validation but keep it
1347            let check = clean.trim_start_matches('$');
1348            if !check.is_empty() && is_valid_identifier(check) {
1349                return Some(clean.to_string());
1350            }
1351        }
1352
1353        // Handle Elixir pattern match: {:ok, content} = ...
1354        // Extract last identifier from destructuring
1355        if var.contains('{') || var.contains('(') || var.contains('[') {
1356            // Find identifiers in the pattern
1357            let cleaned = var.replace(['{', '}', '(', ')', '[', ']', ':'], " ");
1358            let idents: Vec<&str> = cleaned
1359                .split_whitespace()
1360                .filter(|t| is_valid_identifier(t) && *t != "ok" && *t != "err")
1361                .collect();
1362            if let Some(last_ident) = idents.last() {
1363                return Some(last_ident.to_string());
1364            }
1365        }
1366
1367        // Handle PHP $var = ... (single token starting with $)
1368        if let Some(name) = var.strip_prefix('$') {
1369            if is_valid_identifier(name) {
1370                return Some(var.to_string());
1371            }
1372        }
1373    }
1374
1375    // No assignment found - check for function-call-as-source patterns
1376    // like scanf("%s", buf) or fgets(buf, ...) where the target variable
1377    // is an argument rather than the LHS of an assignment.
1378    // These are handled by detect_sources_from_call_args (separate path).
1379    None
1380}
1381
1382/// Extract the first argument from a function call that matches the pattern.
1383///
1384/// # Arguments
1385///
1386/// * `statement` - The source code statement to analyze
1387/// * `pattern` - The regex pattern that matched (to find the right call)
1388///
1389/// # Returns
1390///
1391/// `Some(argument_name)` if a variable argument is found, `None` if the argument
1392/// is a string literal or not a valid identifier.
1393fn extract_call_arg(statement: &str, pattern: &Regex) -> Option<String> {
1394    // Find where the pattern matches, then find the opening paren
1395    if let Some(m) = pattern.find(statement) {
1396        let after_match = &statement[m.end()..];
1397        // The pattern includes the `(`, so we're already past it
1398        // But some patterns end with `\(`, so we need to handle that
1399        let rest = after_match.strip_prefix('(').unwrap_or(after_match);
1400        // Try each argument until we find a valid variable
1401        let mut remaining = rest;
1402        loop {
1403            // Find end of current argument (comma or close paren)
1404            let end = remaining
1405                .find([',', ')'])
1406                .unwrap_or(remaining.len());
1407            let arg = remaining[..end].trim();
1408            // Check if it's a variable (not a string literal)
1409            if !arg.is_empty()
1410                && !arg.starts_with('"')
1411                && !arg.starts_with('\'')
1412                && !arg.starts_with("f\"")
1413                && !arg.starts_with("f'")
1414                && !arg.starts_with("r\"")
1415                && !arg.starts_with("r'")
1416            {
1417                // Handle attribute access like obj.attr - just get the first part
1418                let var_name = arg.split('.').next().unwrap_or(arg);
1419                // Strip PHP $ prefix for validation
1420                let check_name = var_name.trim_start_matches('$');
1421                if is_valid_identifier(check_name) {
1422                    return Some(var_name.to_string());
1423                }
1424            }
1425            // String concatenation: "..." + var — extract var from RHS of +
1426            if arg.contains('+') {
1427                for part in arg.split('+') {
1428                    let part = part.trim();
1429                    if !part.is_empty()
1430                        && !part.starts_with('"')
1431                        && !part.starts_with('\'')
1432                        && !part.starts_with("f\"")
1433                        && !part.starts_with("f'")
1434                    {
1435                        let var_name = part.split('.').next().unwrap_or(part);
1436                        let check_name = var_name.trim_start_matches('$');
1437                        if is_valid_identifier(check_name) {
1438                            return Some(var_name.to_string());
1439            }
1440            }
1441        }
1442    }
1443    // Move to next argument
1444    if end >= remaining.len() {
1445        break;
1446}
1447let next_char = remaining.as_bytes()[end];
1448if next_char == b')' {
1449    break;
1450}
1451// Skip comma and move to next arg
1452remaining = &remaining[end + 1..];
1453}
1454}
1455None
1456}
1457
1458/// Extract interpolated variables from format strings (f-strings, template literals).
1459///
1460/// Handles:
1461/// - Python f-strings: `f"SELECT {query}"` -> ["query"]
1462/// - Python .format(): `"SELECT {}".format(query)` -> ["query"]
1463/// - Python % formatting: `"SELECT %s" % query` -> ["query"]
1464/// - JS/TS template literals: `` `SELECT ${query}` `` -> ["query"]
1465/// - Ruby interpolation: `"SELECT #{query}"` -> ["query"]
1466/// - Rust format!: `format!("SELECT {}", query)` -> ["query"]
1467///
1468/// Returns all valid identifier names found inside interpolation braces.
1469fn extract_interpolated_vars(statement: &str) -> Vec<String> {
1470    let mut vars = Vec::new();
1471
1472    // Python f-string / JS template literal / Ruby: {var} or ${var} or #{var}
1473    // Match {identifier}, ${identifier}, #{identifier} patterns
1474    let _chars = statement.chars().peekable();
1475    let mut i = 0;
1476    let bytes = statement.as_bytes();
1477
1478    while i < bytes.len() {
1479        // Detect interpolation start: { or ${ or #{
1480        let is_interp = match bytes[i] {
1481            b'{' => {
1482                // Could be f-string {var} or standalone — check it's not {{
1483                i + 1 < bytes.len() && bytes[i + 1] != b'{'
1484            }
1485            b'$' | b'#' => {
1486                // ${var} or #{var}
1487                i + 1 < bytes.len() && bytes[i + 1] == b'{'
1488            }
1489            _ => false,
1490        };
1491
1492        if is_interp {
1493            // Skip to the opening brace
1494            let brace_start = if bytes[i] == b'{' { i } else { i + 1 };
1495            if brace_start + 1 < bytes.len() {
1496                // Find closing brace
1497                if let Some(close) = statement[brace_start + 1..].find('}') {
1498                    let inner = &statement[brace_start + 1..brace_start + 1 + close];
1499                    let inner = inner.trim();
1500                    // Could be an expression like `query` or `user.name` — take first identifier
1501                    let var_name = inner
1502                        .split(|c: char| !c.is_alphanumeric() && c != '_')
1503                        .next()
1504                        .unwrap_or("");
1505                    if is_valid_identifier(var_name) {
1506                        vars.push(var_name.to_string());
1507                    }
1508                    i = brace_start + 1 + close + 1;
1509                    continue;
1510                }
1511            }
1512        }
1513
1514        // Python .format() args: "...".format(var1, var2)
1515        if i + 8 < bytes.len() && &statement[i..i + 8] == ".format(" {
1516            let args_start = i + 8;
1517            if let Some(close) = statement[args_start..].find(')') {
1518                let args_str = &statement[args_start..args_start + close];
1519                for arg in args_str.split(',') {
1520                    let arg = arg.trim();
1521                    // Skip keyword args like key=val, take val
1522                    let val = if let Some(eq_pos) = arg.find('=') {
1523                        arg[eq_pos + 1..].trim()
1524                    } else {
1525                        arg
1526                    };
1527                    let var_name = val
1528                        .split(|c: char| !c.is_alphanumeric() && c != '_')
1529                        .next()
1530                        .unwrap_or("");
1531                    if is_valid_identifier(var_name) {
1532                        vars.push(var_name.to_string());
1533                    }
1534                }
1535                i = args_start + close + 1;
1536                continue;
1537            }
1538        }
1539
1540        // Python % formatting: "..." % (var,) or "..." % var
1541        if bytes[i] == b'%' && i > 0 {
1542            let before = statement[..i].trim_end();
1543            let after = statement[i + 1..].trim_start();
1544            if (before.ends_with('"') || before.ends_with('\'')) && !after.starts_with('%') {
1545                // Single var: "..." % var
1546                // Tuple: "..." % (var1, var2)
1547                let args_str = if after.starts_with('(') {
1548                    if let Some(close) = after.find(')') {
1549                        &after[1..close]
1550                    } else {
1551                        ""
1552                    }
1553                } else {
1554                    // Single variable
1555                    after.split(|c: char| c.is_whitespace() || c == ')' || c == ',')
1556                        .next()
1557                        .unwrap_or("")
1558                };
1559                for arg in args_str.split(',') {
1560                    let arg = arg.trim();
1561                    let var_name = arg
1562                        .split(|c: char| !c.is_alphanumeric() && c != '_')
1563                        .next()
1564                        .unwrap_or("");
1565                    if is_valid_identifier(var_name) {
1566                        vars.push(var_name.to_string());
1567                    }
1568                }
1569            }
1570        }
1571
1572        i += 1;
1573    }
1574
1575    // Deduplicate
1576    vars.sort();
1577    vars.dedup();
1578    vars
1579}
1580
/// Report whether `s` has the shape of an identifier.
///
/// The first character must be a letter or `_`; every following character must be
/// a letter, a digit, or `_`. The empty string is rejected. Letters and digits are
/// judged with the Unicode-aware `char` predicates.
fn is_valid_identifier(s: &str) -> bool {
    let mut chars = s.chars();
    match chars.next() {
        Some(first) if first.is_alphabetic() || first == '_' => {
            chars.all(|c| c.is_alphanumeric() || c == '_')
        }
        _ => false,
    }
}
1593
/// Check if an identifier appears as a standalone word in text.
///
/// An occurrence counts only when the characters immediately before and after it
/// (if any) are neither alphanumeric nor `_`, so "user" does not match inside
/// "user_name". Boundaries are judged on whole `char`s with the same Unicode-aware
/// rules as `is_valid_identifier`, so non-ASCII letters next to a match also block it.
///
/// # Arguments
///
/// * `text` - The text to search
/// * `ident` - The identifier to look for; an empty identifier never matches
///
/// # Returns
///
/// `true` if at least one word-bounded occurrence of `ident` exists in `text`.
fn identifier_in_text(text: &str, ident: &str) -> bool {
    // Width of the identifier's first character: the smallest step that keeps the
    // search position on a char boundary while still allowing overlapping matches.
    let step = match ident.chars().next() {
        Some(c) => c.len_utf8(),
        None => return false,
    };
    let is_word = |c: char| c.is_alphanumeric() || c == '_';

    let mut pos = 0;
    while let Some(offset) = text[pos..].find(ident) {
        let start = pos + offset;
        let end = start + ident.len();
        let before_ok = !text[..start].chars().next_back().map_or(false, is_word);
        let after_ok = !text[end..].chars().next().map_or(false, is_word);
        if before_ok && after_ok {
            return true;
        }
        pos = start + step;
    }
    false
}
1628
1629/// Check if a statement contains only a constant string (no taint).
1630///
1631/// Used to reduce false positives - string literals are not tainted.
1632///
1633/// # Arguments
1634///
1635/// * `statement` - The source code statement to analyze
1636///
1637/// # Returns
1638///
1639/// `true` if the statement is a constant string assignment, `false` otherwise.
1640#[allow(dead_code)]
1641pub fn is_constant_string(statement: &str) -> bool {
1642    // Match patterns like: var = "string" or var = 'string'
1643    lazy_static! {
1644        static ref CONST_STRING: Regex = Regex::new(r#"^\s*\w+\s*=\s*["'][^"']*["']\s*$"#).unwrap();
1645    }
1646    CONST_STRING.is_match(statement)
1647}
1648
1649/// Check if a statement uses ORM-safe patterns (parameterized queries).
1650///
1651/// SQLAlchemy and similar ORMs use operator overloading for safe queries.
1652/// These should not be flagged as SQL injection sinks.
1653///
1654/// # Arguments
1655///
1656/// * `statement` - The source code statement to analyze
1657///
1658/// # Returns
1659///
1660/// `true` if the statement uses ORM-safe patterns, `false` otherwise.
1661#[allow(dead_code)]
1662pub fn is_orm_safe_pattern(statement: &str) -> bool {
1663    lazy_static! {
1664        // SQLAlchemy patterns: session.query(...).filter(...), select(...).where(...)
1665        static ref ORM_SAFE: Regex =
1666            Regex::new(r"(\.filter\s*\(|\.where\s*\(|\.filter_by\s*\()").unwrap();
1667    }
1668    ORM_SAFE.is_match(statement)
1669}
1670
1671// Aliases for test compatibility (tests use different naming)
1672pub use detect_sinks as find_sinks_in_statement;
1673pub use detect_sources as find_sources_in_statement;
1674
1675// =============================================================================
1676// AST-Based Detection - Phase 9
1677// =============================================================================
1678//
1679// These functions use tree-sitter AST nodes to detect sources, sinks, and
1680// sanitizers. They complement the regex-based detection by:
1681// 1. Filtering out false positives from comments and string literals
1682// 2. Using structural matching instead of text patterns
1683// 3. Working with the full parsed tree for context
1684//
1685// The AST-based functions are used by `compute_taint_with_tree` and fall back
1686// to regex-based detection when the AST yields no results.
1687
1688use super::ast_utils::{
1689    call_node_kinds, extract_call_name, find_parent_assignment_var, is_in_comment, is_in_string,
1690    node_text, walk_descendants,
1691};
1692
1693/// AST-based source pattern: matches call names and member access patterns.
1694struct AstSourcePattern {
1695    /// Simple function names that indicate a source (e.g., "input", "readLine")
1696    call_names: &'static [&'static str],
1697    /// Dotted member access patterns (e.g., "request.args", "os.environ")
1698    /// Matched as substrings of the full call name text.
1699    member_patterns: &'static [&'static str],
1700    /// The source type to assign when matched
1701    source_type: TaintSourceType,
1702}
1703
1704/// AST-based sink pattern.
1705struct AstSinkPattern {
1706    call_names: &'static [&'static str],
1707    member_patterns: &'static [&'static str],
1708    sink_type: TaintSinkType,
1709}
1710
1711/// AST-based sanitizer pattern.
1712struct AstSanitizerPattern {
1713    call_names: &'static [&'static str],
1714    member_patterns: &'static [&'static str],
1715    sanitizer_type: SanitizerType,
1716}
1717
1718/// Complete AST pattern set for a language.
1719struct AstLanguagePatterns {
1720    sources: &'static [AstSourcePattern],
1721    sinks: &'static [AstSinkPattern],
1722    sanitizers: &'static [AstSanitizerPattern],
1723}
1724
1725// ---------------------------------------------------------------------------
1726// AST Pattern Definitions for All 18 Languages
1727// ---------------------------------------------------------------------------
1728
1729static PYTHON_AST_SOURCES: &[AstSourcePattern] = &[
1730    AstSourcePattern {
1731        call_names: &["input"],
1732        member_patterns: &[],
1733        source_type: TaintSourceType::UserInput,
1734    },
1735    AstSourcePattern {
1736        call_names: &[],
1737        member_patterns: &[
1738            "request.args",
1739            "request.form",
1740            "request.values",
1741            "request.cookies",
1742            "request.headers",
1743        ],
1744        source_type: TaintSourceType::HttpParam,
1745    },
1746    AstSourcePattern {
1747        call_names: &[],
1748        member_patterns: &["request.json", "request.data"],
1749        source_type: TaintSourceType::HttpParam,
1750    },
1751    AstSourcePattern {
1752        call_names: &[],
1753        member_patterns: &["request.get_json"],
1754        source_type: TaintSourceType::HttpBody,
1755    },
1756    AstSourcePattern {
1757        call_names: &[],
1758        member_patterns: &["sys.stdin"],
1759        source_type: TaintSourceType::Stdin,
1760    },
1761    AstSourcePattern {
1762        call_names: &[],
1763        member_patterns: &["os.environ", "os.getenv"],
1764        source_type: TaintSourceType::EnvVar,
1765    },
1766    AstSourcePattern {
1767        call_names: &[],
1768        member_patterns: &[".read(", ".readlines(", ".readline("],
1769        source_type: TaintSourceType::FileRead,
1770    },
1771];
1772
1773static PYTHON_AST_SINKS: &[AstSinkPattern] = &[
1774    AstSinkPattern {
1775        call_names: &[],
1776        member_patterns: &[".execute(", ".executemany("],
1777        sink_type: TaintSinkType::SqlQuery,
1778    },
1779    AstSinkPattern {
1780        call_names: &["eval"],
1781        member_patterns: &[],
1782        sink_type: TaintSinkType::CodeEval,
1783    },
1784    AstSinkPattern {
1785        call_names: &["exec"],
1786        member_patterns: &[],
1787        sink_type: TaintSinkType::CodeExec,
1788    },
1789    AstSinkPattern {
1790        call_names: &["compile"],
1791        member_patterns: &[],
1792        sink_type: TaintSinkType::CodeCompile,
1793    },
1794    AstSinkPattern {
1795        call_names: &[],
1796        member_patterns: &[
1797            "subprocess.run",
1798            "subprocess.call",
1799            "subprocess.Popen",
1800            "subprocess.check_output",
1801        ],
1802        sink_type: TaintSinkType::ShellExec,
1803    },
1804    AstSinkPattern {
1805        call_names: &[],
1806        member_patterns: &["os.system", "os.popen"],
1807        sink_type: TaintSinkType::ShellExec,
1808    },
1809    AstSinkPattern {
1810        call_names: &[],
1811        member_patterns: &[".write("],
1812        sink_type: TaintSinkType::FileWrite,
1813    },
1814];
1815
1816static PYTHON_AST_SANITIZERS: &[AstSanitizerPattern] = &[
1817    AstSanitizerPattern {
1818        call_names: &["int", "float", "bool"],
1819        member_patterns: &[],
1820        sanitizer_type: SanitizerType::Numeric,
1821    },
1822    AstSanitizerPattern {
1823        call_names: &[],
1824        member_patterns: &["shlex.quote", "pipes.quote"],
1825        sanitizer_type: SanitizerType::Shell,
1826    },
1827    AstSanitizerPattern {
1828        call_names: &[],
1829        member_patterns: &["html.escape", "markupsafe.escape", "cgi.escape"],
1830        sanitizer_type: SanitizerType::Html,
1831    },
1832];
1833
1834static TYPESCRIPT_AST_SOURCES: &[AstSourcePattern] = &[
1835    AstSourcePattern {
1836        call_names: &[],
1837        member_patterns: &["req.body"],
1838        source_type: TaintSourceType::HttpBody,
1839    },
1840    AstSourcePattern {
1841        call_names: &[],
1842        member_patterns: &["req.params", "req.query", "req.cookies", "req.headers"],
1843        source_type: TaintSourceType::HttpParam,
1844    },
1845    AstSourcePattern {
1846        call_names: &[],
1847        member_patterns: &["process.env"],
1848        source_type: TaintSourceType::EnvVar,
1849    },
1850    AstSourcePattern {
1851        call_names: &[],
1852        member_patterns: &["process.stdin"],
1853        source_type: TaintSourceType::Stdin,
1854    },
1855    AstSourcePattern {
1856        call_names: &["readline"],
1857        member_patterns: &[],
1858        source_type: TaintSourceType::UserInput,
1859    },
1860    AstSourcePattern {
1861        call_names: &[],
1862        member_patterns: &[".read(", ".readFile("],
1863        source_type: TaintSourceType::FileRead,
1864    },
1865];
1866
1867static TYPESCRIPT_AST_SINKS: &[AstSinkPattern] = &[
1868    AstSinkPattern {
1869        call_names: &["eval"],
1870        member_patterns: &[],
1871        sink_type: TaintSinkType::CodeEval,
1872    },
1873    AstSinkPattern {
1874        call_names: &[],
1875        member_patterns: &["new Function"],
1876        sink_type: TaintSinkType::CodeEval,
1877    },
1878    AstSinkPattern {
1879        call_names: &[],
1880        member_patterns: &[
1881            "child_process.exec",
1882            "child_process.spawn",
1883            "child_process.execSync",
1884            "child_process.execFile",
1885        ],
1886        sink_type: TaintSinkType::ShellExec,
1887    },
1888    AstSinkPattern {
1889        call_names: &["execSync"],
1890        member_patterns: &[],
1891        sink_type: TaintSinkType::ShellExec,
1892    },
1893    AstSinkPattern {
1894        call_names: &[],
1895        member_patterns: &[".innerHTML"],
1896        sink_type: TaintSinkType::FileWrite,
1897    },
1898    AstSinkPattern {
1899        call_names: &[],
1900        member_patterns: &["document.write"],
1901        sink_type: TaintSinkType::FileWrite,
1902    },
1903    AstSinkPattern {
1904        call_names: &[],
1905        member_patterns: &[".query(", ".execute("],
1906        sink_type: TaintSinkType::SqlQuery,
1907    },
1908];
1909
1910static TYPESCRIPT_AST_SANITIZERS: &[AstSanitizerPattern] = &[
1911    AstSanitizerPattern {
1912        call_names: &["parseInt", "Number", "parseFloat"],
1913        member_patterns: &[],
1914        sanitizer_type: SanitizerType::Numeric,
1915    },
1916    AstSanitizerPattern {
1917        call_names: &["encodeURIComponent"],
1918        member_patterns: &["DOMPurify.sanitize"],
1919        sanitizer_type: SanitizerType::Html,
1920    },
1921];
1922
1923static GO_AST_SOURCES: &[AstSourcePattern] = &[
1924    AstSourcePattern {
1925        call_names: &[],
1926        member_patterns: &["fmt.Scan", "bufio.NewReader", "bufio.NewScanner"],
1927        source_type: TaintSourceType::UserInput,
1928    },
1929    AstSourcePattern {
1930        call_names: &[],
1931        member_patterns: &["r.FormValue", "r.PostFormValue", "r.URL.Query", ".Query()"],
1932        source_type: TaintSourceType::HttpParam,
1933    },
1934    AstSourcePattern {
1935        call_names: &[],
1936        member_patterns: &["r.Body", ".ReadAll(r.Body)"],
1937        source_type: TaintSourceType::HttpBody,
1938    },
1939    AstSourcePattern {
1940        call_names: &[],
1941        member_patterns: &["os.Getenv"],
1942        source_type: TaintSourceType::EnvVar,
1943    },
1944    AstSourcePattern {
1945        call_names: &[],
1946        member_patterns: &["os.Stdin"],
1947        source_type: TaintSourceType::Stdin,
1948    },
1949    AstSourcePattern {
1950        call_names: &[],
1951        member_patterns: &["os.Open", "ioutil.ReadFile"],
1952        source_type: TaintSourceType::FileRead,
1953    },
1954];
1955
1956static GO_AST_SINKS: &[AstSinkPattern] = &[
1957    AstSinkPattern {
1958        call_names: &[],
1959        member_patterns: &["exec.Command"],
1960        sink_type: TaintSinkType::ShellExec,
1961    },
1962    AstSinkPattern {
1963        call_names: &[],
1964        member_patterns: &["db.Exec", "db.Query", "db.QueryRow"],
1965        sink_type: TaintSinkType::SqlQuery,
1966    },
1967    AstSinkPattern {
1968        call_names: &[],
1969        member_patterns: &["template.HTML", "fmt.Fprintf"],
1970        sink_type: TaintSinkType::FileWrite,
1971    },
1972];
1973
1974static GO_AST_SANITIZERS: &[AstSanitizerPattern] = &[
1975    AstSanitizerPattern {
1976        call_names: &[],
1977        member_patterns: &["strconv.Atoi", "strconv.ParseInt", "strconv.ParseFloat"],
1978        sanitizer_type: SanitizerType::Numeric,
1979    },
1980    AstSanitizerPattern {
1981        call_names: &[],
1982        member_patterns: &["html.EscapeString", "url.QueryEscape"],
1983        sanitizer_type: SanitizerType::Html,
1984    },
1985];
1986
1987static JAVA_AST_SOURCES: &[AstSourcePattern] = &[
1988    AstSourcePattern {
1989        call_names: &[],
1990        member_patterns: &["new Scanner(System.in)"],
1991        source_type: TaintSourceType::Stdin,
1992    },
1993    AstSourcePattern {
1994        call_names: &["readLine"],
1995        member_patterns: &["new BufferedReader"],
1996        source_type: TaintSourceType::UserInput,
1997    },
1998    AstSourcePattern {
1999        call_names: &[],
2000        member_patterns: &["request.getParameter", "getQueryString"],
2001        source_type: TaintSourceType::HttpParam,
2002    },
2003    AstSourcePattern {
2004        call_names: &[],
2005        member_patterns: &["System.getenv"],
2006        source_type: TaintSourceType::EnvVar,
2007    },
2008    AstSourcePattern {
2009        call_names: &[],
2010        member_patterns: &["new FileReader", "Files.readAllLines"],
2011        source_type: TaintSourceType::FileRead,
2012    },
2013];
2014
2015static JAVA_AST_SINKS: &[AstSinkPattern] = &[
2016    AstSinkPattern {
2017        call_names: &[],
2018        member_patterns: &["Runtime.getRuntime().exec", "ProcessBuilder"],
2019        sink_type: TaintSinkType::ShellExec,
2020    },
2021    AstSinkPattern {
2022        call_names: &[],
2023        member_patterns: &[".execute(", ".executeQuery(", ".executeUpdate("],
2024        sink_type: TaintSinkType::SqlQuery,
2025    },
2026    AstSinkPattern {
2027        call_names: &[],
2028        member_patterns: &["Class.forName"],
2029        sink_type: TaintSinkType::CodeEval,
2030    },
2031];
2032
2033static JAVA_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2034    AstSanitizerPattern {
2035        call_names: &[],
2036        member_patterns: &["Integer.parseInt", "Long.parseLong", "Double.parseDouble"],
2037        sanitizer_type: SanitizerType::Numeric,
2038    },
2039    AstSanitizerPattern {
2040        call_names: &[],
2041        member_patterns: &["ESAPI.encoder", "StringEscapeUtils.escapeHtml"],
2042        sanitizer_type: SanitizerType::Html,
2043    },
2044];
2045
2046static RUST_AST_SOURCES: &[AstSourcePattern] = &[
2047    AstSourcePattern {
2048        call_names: &[],
2049        member_patterns: &["io::stdin", "std::io::stdin"],
2050        source_type: TaintSourceType::Stdin,
2051    },
2052    AstSourcePattern {
2053        call_names: &[],
2054        member_patterns: &["env::var", "std::env::var"],
2055        source_type: TaintSourceType::EnvVar,
2056    },
2057    AstSourcePattern {
2058        call_names: &[],
2059        member_patterns: &["env::args", "std::env::args"],
2060        source_type: TaintSourceType::UserInput,
2061    },
2062    AstSourcePattern {
2063        call_names: &[],
2064        member_patterns: &[
2065            "fs::read_to_string",
2066            "std::fs::read_to_string",
2067            "File::open",
2068        ],
2069        source_type: TaintSourceType::FileRead,
2070    },
2071];
2072
2073static RUST_AST_SINKS: &[AstSinkPattern] = &[
2074    AstSinkPattern {
2075        call_names: &[],
2076        member_patterns: &["Command::new", "std::process::Command"],
2077        sink_type: TaintSinkType::ShellExec,
2078    },
2079    AstSinkPattern {
2080        call_names: &[],
2081        member_patterns: &["unsafe"],
2082        sink_type: TaintSinkType::CodeEval,
2083    },
2084    AstSinkPattern {
2085        call_names: &[],
2086        member_patterns: &["std::ptr::write", "std::ptr::read"],
2087        sink_type: TaintSinkType::FileWrite,
2088    },
2089];
2090
2091static RUST_AST_SANITIZERS: &[AstSanitizerPattern] = &[AstSanitizerPattern {
2092    call_names: &[],
2093    member_patterns: &[
2094        ".parse::<i32>",
2095        ".parse::<i64>",
2096        ".parse::<u32>",
2097        ".parse::<u64>",
2098        ".parse::<f32>",
2099        ".parse::<f64>",
2100        ".parse::<usize>",
2101        ".parse::<isize>",
2102    ],
2103    sanitizer_type: SanitizerType::Numeric,
2104}];
2105
2106static C_AST_SOURCES: &[AstSourcePattern] = &[
2107    AstSourcePattern {
2108        call_names: &["scanf", "fscanf", "sscanf", "fgets", "gets", "getchar"],
2109        member_patterns: &[],
2110        source_type: TaintSourceType::UserInput,
2111    },
2112    AstSourcePattern {
2113        call_names: &["getenv"],
2114        member_patterns: &[],
2115        source_type: TaintSourceType::EnvVar,
2116    },
2117    AstSourcePattern {
2118        call_names: &["fread", "fopen"],
2119        member_patterns: &[],
2120        source_type: TaintSourceType::FileRead,
2121    },
2122    AstSourcePattern {
2123        call_names: &["recv", "recvfrom"],
2124        member_patterns: &[],
2125        source_type: TaintSourceType::UserInput,
2126    },
2127];
2128
2129static C_AST_SINKS: &[AstSinkPattern] = &[
2130    AstSinkPattern {
2131        call_names: &["system", "popen", "execl", "execv", "execvp"],
2132        member_patterns: &[],
2133        sink_type: TaintSinkType::ShellExec,
2134    },
2135    AstSinkPattern {
2136        call_names: &["sprintf", "vsprintf"],
2137        member_patterns: &[],
2138        sink_type: TaintSinkType::ShellExec,
2139    },
2140    AstSinkPattern {
2141        call_names: &["strcpy", "strcat", "strncpy"],
2142        member_patterns: &[],
2143        sink_type: TaintSinkType::FileWrite,
2144    },
2145];
2146
2147static C_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2148    AstSanitizerPattern {
2149        call_names: &["atoi", "atol", "atof", "strtol", "strtoul", "strtod"],
2150        member_patterns: &[],
2151        sanitizer_type: SanitizerType::Numeric,
2152    },
2153    AstSanitizerPattern {
2154        call_names: &["snprintf"],
2155        member_patterns: &[],
2156        sanitizer_type: SanitizerType::Shell,
2157    },
2158];
2159
2160static CPP_AST_SOURCES: &[AstSourcePattern] = &[
2161    AstSourcePattern {
2162        call_names: &["getline"],
2163        member_patterns: &["std::cin", "std::getline"],
2164        source_type: TaintSourceType::UserInput,
2165    },
2166    AstSourcePattern {
2167        call_names: &["getenv"],
2168        member_patterns: &[],
2169        source_type: TaintSourceType::EnvVar,
2170    },
2171    AstSourcePattern {
2172        call_names: &[],
2173        member_patterns: &["std::ifstream", "std::fstream"],
2174        source_type: TaintSourceType::FileRead,
2175    },
2176];
2177
2178static CPP_AST_SINKS: &[AstSinkPattern] = &[
2179    AstSinkPattern {
2180        call_names: &["system", "popen"],
2181        member_patterns: &["std::system"],
2182        sink_type: TaintSinkType::ShellExec,
2183    },
2184    AstSinkPattern {
2185        call_names: &["sprintf"],
2186        member_patterns: &[],
2187        sink_type: TaintSinkType::ShellExec,
2188    },
2189];
2190
2191static CPP_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2192    AstSanitizerPattern {
2193        call_names: &[],
2194        member_patterns: &[
2195            "std::stoi",
2196            "std::stol",
2197            "std::stoul",
2198            "std::stoll",
2199            "std::stof",
2200            "std::stod",
2201        ],
2202        sanitizer_type: SanitizerType::Numeric,
2203    },
2204    AstSanitizerPattern {
2205        call_names: &[],
2206        member_patterns: &[
2207            "static_cast<int>",
2208            "static_cast<long>",
2209            "static_cast<float>",
2210            "static_cast<double>",
2211        ],
2212        sanitizer_type: SanitizerType::Numeric,
2213    },
2214];
2215
2216static RUBY_AST_SOURCES: &[AstSourcePattern] = &[
2217    AstSourcePattern {
2218        call_names: &["gets"],
2219        member_patterns: &[],
2220        source_type: TaintSourceType::UserInput,
2221    },
2222    AstSourcePattern {
2223        call_names: &[],
2224        member_patterns: &["STDIN.read", "STDIN.gets", "STDIN.readline"],
2225        source_type: TaintSourceType::Stdin,
2226    },
2227    AstSourcePattern {
2228        call_names: &[],
2229        member_patterns: &["params["],
2230        source_type: TaintSourceType::HttpParam,
2231    },
2232    AstSourcePattern {
2233        call_names: &[],
2234        member_patterns: &["ENV["],
2235        source_type: TaintSourceType::EnvVar,
2236    },
2237    AstSourcePattern {
2238        call_names: &[],
2239        member_patterns: &["File.read", "File.open"],
2240        source_type: TaintSourceType::FileRead,
2241    },
2242];
2243
2244static RUBY_AST_SINKS: &[AstSinkPattern] = &[
2245    AstSinkPattern {
2246        call_names: &["eval"],
2247        member_patterns: &[],
2248        sink_type: TaintSinkType::CodeEval,
2249    },
2250    AstSinkPattern {
2251        call_names: &["system", "exec"],
2252        member_patterns: &[],
2253        sink_type: TaintSinkType::ShellExec,
2254    },
2255    AstSinkPattern {
2256        call_names: &[],
2257        member_patterns: &["IO.popen"],
2258        sink_type: TaintSinkType::ShellExec,
2259    },
2260    AstSinkPattern {
2261        call_names: &[],
2262        member_patterns: &[".send("],
2263        sink_type: TaintSinkType::CodeEval,
2264    },
2265];
2266
2267static RUBY_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2268    AstSanitizerPattern {
2269        call_names: &[],
2270        member_patterns: &[".to_i", ".to_f"],
2271        sanitizer_type: SanitizerType::Numeric,
2272    },
2273    AstSanitizerPattern {
2274        call_names: &[],
2275        member_patterns: &["CGI.escapeHTML", "Rack::Utils.escape_html"],
2276        sanitizer_type: SanitizerType::Html,
2277    },
2278];
2279
2280static KOTLIN_AST_SOURCES: &[AstSourcePattern] = &[
2281    AstSourcePattern {
2282        call_names: &["readLine", "readln"],
2283        member_patterns: &[],
2284        source_type: TaintSourceType::UserInput,
2285    },
2286    AstSourcePattern {
2287        call_names: &[],
2288        member_patterns: &["System.getenv"],
2289        source_type: TaintSourceType::EnvVar,
2290    },
2291    AstSourcePattern {
2292        call_names: &[],
2293        member_patterns: &["BufferedReader"],
2294        source_type: TaintSourceType::UserInput,
2295    },
2296    AstSourcePattern {
2297        call_names: &[],
2298        member_patterns: &["request.getParameter"],
2299        source_type: TaintSourceType::HttpParam,
2300    },
2301];
2302
2303static KOTLIN_AST_SINKS: &[AstSinkPattern] = &[
2304    AstSinkPattern {
2305        call_names: &[],
2306        member_patterns: &["Runtime.getRuntime().exec", "ProcessBuilder"],
2307        sink_type: TaintSinkType::ShellExec,
2308    },
2309    AstSinkPattern {
2310        call_names: &[],
2311        member_patterns: &[".execute(", ".executeQuery(", "prepareStatement"],
2312        sink_type: TaintSinkType::SqlQuery,
2313    },
2314];
2315
2316static KOTLIN_AST_SANITIZERS: &[AstSanitizerPattern] = &[AstSanitizerPattern {
2317    call_names: &[],
2318    member_patterns: &[".toInt()", ".toLong()", ".toDouble()", ".toFloat()"],
2319    sanitizer_type: SanitizerType::Numeric,
2320}];
2321
2322static SWIFT_AST_SOURCES: &[AstSourcePattern] = &[
2323    AstSourcePattern {
2324        call_names: &["readLine"],
2325        member_patterns: &[],
2326        source_type: TaintSourceType::UserInput,
2327    },
2328    AstSourcePattern {
2329        call_names: &[],
2330        member_patterns: &["ProcessInfo.processInfo.environment"],
2331        source_type: TaintSourceType::EnvVar,
2332    },
2333    AstSourcePattern {
2334        call_names: &[],
2335        member_patterns: &["FileManager.default", "URLSession"],
2336        source_type: TaintSourceType::FileRead,
2337    },
2338];
2339
2340static SWIFT_AST_SINKS: &[AstSinkPattern] = &[
2341    AstSinkPattern {
2342        call_names: &[],
2343        member_patterns: &["Process()", "NSTask"],
2344        sink_type: TaintSinkType::ShellExec,
2345    },
2346    AstSinkPattern {
2347        call_names: &["sqlite3_exec"],
2348        member_patterns: &[],
2349        sink_type: TaintSinkType::SqlQuery,
2350    },
2351];
2352
2353static SWIFT_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2354    AstSanitizerPattern {
2355        call_names: &["Int", "Double", "Float"],
2356        member_patterns: &[],
2357        sanitizer_type: SanitizerType::Numeric,
2358    },
2359    AstSanitizerPattern {
2360        call_names: &[],
2361        member_patterns: &["addingPercentEncoding"],
2362        sanitizer_type: SanitizerType::Html,
2363    },
2364];
2365
2366static CSHARP_AST_SOURCES: &[AstSourcePattern] = &[
2367    AstSourcePattern {
2368        call_names: &[],
2369        member_patterns: &["Console.ReadLine"],
2370        source_type: TaintSourceType::UserInput,
2371    },
2372    AstSourcePattern {
2373        call_names: &[],
2374        member_patterns: &["Request.QueryString", "Request.Form"],
2375        source_type: TaintSourceType::HttpParam,
2376    },
2377    AstSourcePattern {
2378        call_names: &[],
2379        member_patterns: &["Environment.GetEnvironmentVariable"],
2380        source_type: TaintSourceType::EnvVar,
2381    },
2382    AstSourcePattern {
2383        call_names: &[],
2384        member_patterns: &[
2385            "File.ReadAllText",
2386            "File.ReadAllLines",
2387            "File.OpenRead",
2388            "StreamReader",
2389        ],
2390        source_type: TaintSourceType::FileRead,
2391    },
2392];
2393
2394static CSHARP_AST_SINKS: &[AstSinkPattern] = &[
2395    AstSinkPattern {
2396        call_names: &[],
2397        member_patterns: &["Process.Start"],
2398        sink_type: TaintSinkType::ShellExec,
2399    },
2400    AstSinkPattern {
2401        call_names: &[],
2402        member_patterns: &["SqlCommand", ".ExecuteNonQuery", ".ExecuteReader"],
2403        sink_type: TaintSinkType::SqlQuery,
2404    },
2405    AstSinkPattern {
2406        call_names: &[],
2407        member_patterns: &["Activator.CreateInstance"],
2408        sink_type: TaintSinkType::CodeEval,
2409    },
2410];
2411
2412static CSHARP_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2413    AstSanitizerPattern {
2414        call_names: &[],
2415        member_patterns: &["int.Parse", "Convert.ToInt32", "double.Parse"],
2416        sanitizer_type: SanitizerType::Numeric,
2417    },
2418    AstSanitizerPattern {
2419        call_names: &[],
2420        member_patterns: &["HttpUtility.HtmlEncode"],
2421        sanitizer_type: SanitizerType::Html,
2422    },
2423];
2424
2425static SCALA_AST_SOURCES: &[AstSourcePattern] = &[
2426    AstSourcePattern {
2427        call_names: &[],
2428        member_patterns: &["StdIn.readLine", "scala.io.StdIn"],
2429        source_type: TaintSourceType::UserInput,
2430    },
2431    AstSourcePattern {
2432        call_names: &[],
2433        member_patterns: &["System.getenv"],
2434        source_type: TaintSourceType::EnvVar,
2435    },
2436    AstSourcePattern {
2437        call_names: &[],
2438        member_patterns: &["Source.fromFile"],
2439        source_type: TaintSourceType::FileRead,
2440    },
2441];
2442
2443static SCALA_AST_SINKS: &[AstSinkPattern] = &[
2444    AstSinkPattern {
2445        call_names: &[],
2446        member_patterns: &["Runtime.getRuntime.exec", "sys.process", "Process("],
2447        sink_type: TaintSinkType::ShellExec,
2448    },
2449    AstSinkPattern {
2450        call_names: &[],
2451        member_patterns: &[".execute(", ".executeQuery("],
2452        sink_type: TaintSinkType::SqlQuery,
2453    },
2454];
2455
2456static SCALA_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2457    AstSanitizerPattern {
2458        call_names: &[],
2459        member_patterns: &[".toInt", ".toLong", ".toDouble"],
2460        sanitizer_type: SanitizerType::Numeric,
2461    },
2462    AstSanitizerPattern {
2463        call_names: &[],
2464        member_patterns: &["StringEscapeUtils.escapeHtml"],
2465        sanitizer_type: SanitizerType::Html,
2466    },
2467];
2468
2469static PHP_AST_SOURCES: &[AstSourcePattern] = &[
2470    AstSourcePattern {
2471        call_names: &[],
2472        member_patterns: &["$_GET[", "$_REQUEST[", "$_COOKIE[", "$_SERVER["],
2473        source_type: TaintSourceType::HttpParam,
2474    },
2475    AstSourcePattern {
2476        call_names: &[],
2477        member_patterns: &["$_POST["],
2478        source_type: TaintSourceType::HttpBody,
2479    },
2480    AstSourcePattern {
2481        call_names: &["fgets"],
2482        member_patterns: &[],
2483        source_type: TaintSourceType::UserInput,
2484    },
2485    AstSourcePattern {
2486        call_names: &["file_get_contents"],
2487        member_patterns: &[],
2488        source_type: TaintSourceType::FileRead,
2489    },
2490    AstSourcePattern {
2491        call_names: &["getenv"],
2492        member_patterns: &["$_ENV["],
2493        source_type: TaintSourceType::EnvVar,
2494    },
2495];
2496
2497static PHP_AST_SINKS: &[AstSinkPattern] = &[
2498    AstSinkPattern {
2499        call_names: &["eval"],
2500        member_patterns: &[],
2501        sink_type: TaintSinkType::CodeEval,
2502    },
2503    AstSinkPattern {
2504        call_names: &[
2505            "exec",
2506            "system",
2507            "passthru",
2508            "shell_exec",
2509            "popen",
2510            "proc_open",
2511        ],
2512        member_patterns: &[],
2513        sink_type: TaintSinkType::ShellExec,
2514    },
2515    AstSinkPattern {
2516        call_names: &["mysqli_query"],
2517        member_patterns: &["->query("],
2518        sink_type: TaintSinkType::SqlQuery,
2519    },
2520];
2521
2522static PHP_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2523    AstSanitizerPattern {
2524        call_names: &["intval", "floatval"],
2525        member_patterns: &["(int)", "(float)"],
2526        sanitizer_type: SanitizerType::Numeric,
2527    },
2528    AstSanitizerPattern {
2529        call_names: &["htmlspecialchars", "htmlentities"],
2530        member_patterns: &[],
2531        sanitizer_type: SanitizerType::Html,
2532    },
2533    AstSanitizerPattern {
2534        call_names: &["mysqli_real_escape_string"],
2535        member_patterns: &[],
2536        sanitizer_type: SanitizerType::Shell,
2537    },
2538];
2539
2540static LUA_AST_SOURCES: &[AstSourcePattern] = &[
2541    AstSourcePattern {
2542        call_names: &[],
2543        member_patterns: &["io.read"],
2544        source_type: TaintSourceType::UserInput,
2545    },
2546    AstSourcePattern {
2547        call_names: &[],
2548        member_patterns: &["os.getenv"],
2549        source_type: TaintSourceType::EnvVar,
2550    },
2551    AstSourcePattern {
2552        call_names: &[],
2553        member_patterns: &["io.open"],
2554        source_type: TaintSourceType::FileRead,
2555    },
2556];
2557
2558static LUA_AST_SINKS: &[AstSinkPattern] = &[
2559    AstSinkPattern {
2560        call_names: &[],
2561        member_patterns: &["os.execute"],
2562        sink_type: TaintSinkType::ShellExec,
2563    },
2564    AstSinkPattern {
2565        call_names: &[],
2566        member_patterns: &["io.popen"],
2567        sink_type: TaintSinkType::ShellExec,
2568    },
2569    AstSinkPattern {
2570        call_names: &["loadstring", "load", "dofile", "loadfile"],
2571        member_patterns: &[],
2572        sink_type: TaintSinkType::CodeEval,
2573    },
2574];
2575
2576static LUA_AST_SANITIZERS: &[AstSanitizerPattern] = &[AstSanitizerPattern {
2577    call_names: &["tonumber"],
2578    member_patterns: &[],
2579    sanitizer_type: SanitizerType::Numeric,
2580}];
2581
2582static ELIXIR_AST_SOURCES: &[AstSourcePattern] = &[
2583    AstSourcePattern {
2584        call_names: &[],
2585        member_patterns: &["IO.gets"],
2586        source_type: TaintSourceType::UserInput,
2587    },
2588    AstSourcePattern {
2589        call_names: &[],
2590        member_patterns: &["System.get_env"],
2591        source_type: TaintSourceType::EnvVar,
2592    },
2593    AstSourcePattern {
2594        call_names: &[],
2595        member_patterns: &["File.read", "File.read!"],
2596        source_type: TaintSourceType::FileRead,
2597    },
2598];
2599
2600static ELIXIR_AST_SINKS: &[AstSinkPattern] = &[
2601    AstSinkPattern {
2602        call_names: &[],
2603        member_patterns: &["System.cmd"],
2604        sink_type: TaintSinkType::ShellExec,
2605    },
2606    AstSinkPattern {
2607        call_names: &[],
2608        member_patterns: &["Code.eval_string"],
2609        sink_type: TaintSinkType::CodeEval,
2610    },
2611    AstSinkPattern {
2612        call_names: &[],
2613        member_patterns: &["Ecto.Adapters.SQL.query"],
2614        sink_type: TaintSinkType::SqlQuery,
2615    },
2616];
2617
2618static ELIXIR_AST_SANITIZERS: &[AstSanitizerPattern] = &[
2619    AstSanitizerPattern {
2620        call_names: &[],
2621        member_patterns: &["String.to_integer", "String.to_float"],
2622        sanitizer_type: SanitizerType::Numeric,
2623    },
2624    AstSanitizerPattern {
2625        call_names: &[],
2626        member_patterns: &["Phoenix.HTML.html_escape"],
2627        sanitizer_type: SanitizerType::Html,
2628    },
2629];
2630
2631static OCAML_AST_SOURCES: &[AstSourcePattern] = &[
2632    AstSourcePattern {
2633        call_names: &["read_line"],
2634        member_patterns: &[],
2635        source_type: TaintSourceType::UserInput,
2636    },
2637    AstSourcePattern {
2638        call_names: &["input_line"],
2639        member_patterns: &[],
2640        source_type: TaintSourceType::UserInput,
2641    },
2642    AstSourcePattern {
2643        call_names: &[],
2644        member_patterns: &["Sys.getenv"],
2645        source_type: TaintSourceType::EnvVar,
2646    },
2647    AstSourcePattern {
2648        call_names: &[],
2649        member_patterns: &["In_channel.read_all", "In_channel.input_all"],
2650        source_type: TaintSourceType::FileRead,
2651    },
2652];
2653
2654static OCAML_AST_SINKS: &[AstSinkPattern] = &[
2655    AstSinkPattern {
2656        call_names: &[],
2657        member_patterns: &["Sys.command"],
2658        sink_type: TaintSinkType::ShellExec,
2659    },
2660    AstSinkPattern {
2661        call_names: &[],
2662        member_patterns: &["Unix.execvp"],
2663        sink_type: TaintSinkType::ShellExec,
2664    },
2665    AstSinkPattern {
2666        call_names: &[],
2667        member_patterns: &["Sqlite3.exec"],
2668        sink_type: TaintSinkType::SqlQuery,
2669    },
2670];
2671
2672static OCAML_AST_SANITIZERS: &[AstSanitizerPattern] = &[AstSanitizerPattern {
2673    call_names: &["int_of_string", "float_of_string"],
2674    member_patterns: &[],
2675    sanitizer_type: SanitizerType::Numeric,
2676}];
2677
2678/// Get AST-based taint patterns for a given language.
2679fn get_ast_patterns(language: Language) -> AstLanguagePatterns {
2680    match language {
2681        Language::Python => AstLanguagePatterns {
2682            sources: PYTHON_AST_SOURCES,
2683            sinks: PYTHON_AST_SINKS,
2684            sanitizers: PYTHON_AST_SANITIZERS,
2685        },
2686        Language::TypeScript | Language::JavaScript => AstLanguagePatterns {
2687            sources: TYPESCRIPT_AST_SOURCES,
2688            sinks: TYPESCRIPT_AST_SINKS,
2689            sanitizers: TYPESCRIPT_AST_SANITIZERS,
2690        },
2691        Language::Go => AstLanguagePatterns {
2692            sources: GO_AST_SOURCES,
2693            sinks: GO_AST_SINKS,
2694            sanitizers: GO_AST_SANITIZERS,
2695        },
2696        Language::Java => AstLanguagePatterns {
2697            sources: JAVA_AST_SOURCES,
2698            sinks: JAVA_AST_SINKS,
2699            sanitizers: JAVA_AST_SANITIZERS,
2700        },
2701        Language::Rust => AstLanguagePatterns {
2702            sources: RUST_AST_SOURCES,
2703            sinks: RUST_AST_SINKS,
2704            sanitizers: RUST_AST_SANITIZERS,
2705        },
2706        Language::C => AstLanguagePatterns {
2707            sources: C_AST_SOURCES,
2708            sinks: C_AST_SINKS,
2709            sanitizers: C_AST_SANITIZERS,
2710        },
2711        Language::Cpp => AstLanguagePatterns {
2712            sources: CPP_AST_SOURCES,
2713            sinks: CPP_AST_SINKS,
2714            sanitizers: CPP_AST_SANITIZERS,
2715        },
2716        Language::Ruby => AstLanguagePatterns {
2717            sources: RUBY_AST_SOURCES,
2718            sinks: RUBY_AST_SINKS,
2719            sanitizers: RUBY_AST_SANITIZERS,
2720        },
2721        Language::Kotlin => AstLanguagePatterns {
2722            sources: KOTLIN_AST_SOURCES,
2723            sinks: KOTLIN_AST_SINKS,
2724            sanitizers: KOTLIN_AST_SANITIZERS,
2725        },
2726        Language::Swift => AstLanguagePatterns {
2727            sources: SWIFT_AST_SOURCES,
2728            sinks: SWIFT_AST_SINKS,
2729            sanitizers: SWIFT_AST_SANITIZERS,
2730        },
2731        Language::CSharp => AstLanguagePatterns {
2732            sources: CSHARP_AST_SOURCES,
2733            sinks: CSHARP_AST_SINKS,
2734            sanitizers: CSHARP_AST_SANITIZERS,
2735        },
2736        Language::Scala => AstLanguagePatterns {
2737            sources: SCALA_AST_SOURCES,
2738            sinks: SCALA_AST_SINKS,
2739            sanitizers: SCALA_AST_SANITIZERS,
2740        },
2741        Language::Php => AstLanguagePatterns {
2742            sources: PHP_AST_SOURCES,
2743            sinks: PHP_AST_SINKS,
2744            sanitizers: PHP_AST_SANITIZERS,
2745        },
2746        Language::Lua | Language::Luau => AstLanguagePatterns {
2747            sources: LUA_AST_SOURCES,
2748            sinks: LUA_AST_SINKS,
2749            sanitizers: LUA_AST_SANITIZERS,
2750        },
2751        Language::Elixir => AstLanguagePatterns {
2752            sources: ELIXIR_AST_SOURCES,
2753            sinks: ELIXIR_AST_SINKS,
2754            sanitizers: ELIXIR_AST_SANITIZERS,
2755        },
2756        Language::Ocaml => AstLanguagePatterns {
2757            sources: OCAML_AST_SOURCES,
2758            sinks: OCAML_AST_SINKS,
2759            sanitizers: OCAML_AST_SANITIZERS,
2760        },
2761    }
2762}
2763
2764// ---------------------------------------------------------------------------
2765// AST-Based Detection Functions
2766// ---------------------------------------------------------------------------
2767
2768/// Detect taint sources using AST nodes from a parsed tree.
2769///
2770/// Walks the tree looking for call nodes that match known source patterns.
2771/// Unlike regex-based detection, this correctly skips matches inside
2772/// comments and string literals.
2773///
2774/// # Arguments
2775/// * `root` - Root node of the function/file to analyze
2776/// * `source` - Source code bytes
2777/// * `language` - Programming language
2778/// * `line_filter` - If Some, only detect sources on this specific line
2779///
2780/// # Returns
2781/// Vector of detected taint sources
2782pub fn detect_sources_ast(
2783    root: &tree_sitter::Node,
2784    source: &[u8],
2785    language: Language,
2786    line_filter: Option<u32>,
2787) -> Vec<TaintSource> {
2788    let patterns = get_ast_patterns(language);
2789    let mut sources = Vec::new();
2790    let descendants = walk_descendants(*root);
2791
2792    for descendant in &descendants {
2793        // Skip comments and strings
2794        if is_in_comment(descendant, language) || is_in_string(descendant, language) {
2795            continue;
2796        }
2797
2798        let line = descendant.start_position().row as u32 + 1;
2799        if let Some(filter) = line_filter {
2800            if line != filter {
2801                continue;
2802            }
2803        }
2804
2805        let text = node_text(descendant, source);
2806
2807        for pattern in patterns.sources {
2808            let matched = pattern.call_names.iter().any(|name| {
2809                // Check if this is a call node with matching name
2810                let call_kinds = call_node_kinds(language);
2811                if call_kinds.contains(&descendant.kind()) {
2812                    if let Some(call_name) = extract_call_name(descendant, source, language) {
2813                        return call_name == *name || call_name.ends_with(&format!(".{}", name));
2814                    }
2815                }
2816                false
2817            }) || pattern.member_patterns.iter().any(|mp| text.contains(mp));
2818
2819            if matched {
2820                // Try to get variable from parent assignment
2821                let var = find_parent_assignment_var(descendant, source, language).or_else(|| {
2822                    extract_assigned_var(
2823                        std::str::from_utf8(source)
2824                            .unwrap_or("")
2825                            .lines()
2826                            .nth((line - 1) as usize)
2827                            .unwrap_or(""),
2828                    )
2829                });
2830
2831                if let Some(var) = var {
2832                    sources.push(TaintSource {
2833                        var,
2834                        line,
2835                        source_type: pattern.source_type,
2836                        statement: Some(
2837                            std::str::from_utf8(source)
2838                                .unwrap_or("")
2839                                .lines()
2840                                .nth((line - 1) as usize)
2841                                .unwrap_or("")
2842                                .to_string(),
2843                        ),
2844                    });
2845                    break; // Only one source per node
2846                }
2847            }
2848        }
2849    }
2850
2851    sources
2852}
2853
2854/// Detect taint sinks using AST nodes from a parsed tree.
2855///
2856/// Similar to `detect_sources_ast` but for dangerous operations (sinks).
2857pub fn detect_sinks_ast(
2858    root: &tree_sitter::Node,
2859    source: &[u8],
2860    language: Language,
2861    line_filter: Option<u32>,
2862) -> Vec<TaintSink> {
2863    let patterns = get_ast_patterns(language);
2864    let mut sinks = Vec::new();
2865    let descendants = walk_descendants(*root);
2866
2867    for descendant in &descendants {
2868        if is_in_comment(descendant, language) || is_in_string(descendant, language) {
2869            continue;
2870        }
2871
2872        let line = descendant.start_position().row as u32 + 1;
2873        if let Some(filter) = line_filter {
2874            if line != filter {
2875                continue;
2876            }
2877        }
2878
2879        let text = node_text(descendant, source);
2880
2881        for pattern in patterns.sinks {
2882            let matched = pattern.call_names.iter().any(|name| {
2883                let call_kinds = call_node_kinds(language);
2884                if call_kinds.contains(&descendant.kind()) {
2885                    if let Some(call_name) = extract_call_name(descendant, source, language) {
2886                        return call_name == *name || call_name.ends_with(&format!(".{}", name));
2887                    }
2888                }
2889                false
2890            }) || pattern.member_patterns.iter().any(|mp| text.contains(mp));
2891
2892            if matched {
2893                let stmt_text = std::str::from_utf8(source)
2894                    .unwrap_or("")
2895                    .lines()
2896                    .nth((line - 1) as usize)
2897                    .unwrap_or("");
2898
2899                // Extract variable argument
2900                let regex_patterns = get_patterns(language);
2901                let var = regex_patterns
2902                    .sinks
2903                    .iter()
2904                    .find(|(p, _)| p.is_match(stmt_text))
2905                    .and_then(|(p, _)| extract_call_arg(stmt_text, p))
2906                    .or_else(|| {
2907                        regex_patterns
2908                            .sinks
2909                            .iter()
2910                            .find(|(p, _)| p.is_match(stmt_text))
2911                            .and_then(|(p, _)| extract_sink_var_from_statement(stmt_text, p))
2912                    });
2913
2914                if let Some(var) = var {
2915                    sinks.push(TaintSink {
2916                        var,
2917                        line,
2918                        sink_type: pattern.sink_type,
2919                        tainted: false,
2920                        statement: Some(stmt_text.to_string()),
2921                    });
2922                    break;
2923                }
2924            }
2925        }
2926    }
2927
2928    sinks
2929}
2930
2931/// Detect sanitizers using AST nodes.
2932///
2933/// Returns the sanitizer type if found, checking that the match
2934/// is in actual code (not in a comment or string).
2935pub fn detect_sanitizer_ast(
2936    root: &tree_sitter::Node,
2937    source: &[u8],
2938    language: Language,
2939    line: u32,
2940) -> Option<SanitizerType> {
2941    let patterns = get_ast_patterns(language);
2942    let descendants = walk_descendants(*root);
2943
2944    for descendant in &descendants {
2945        if is_in_comment(descendant, language) || is_in_string(descendant, language) {
2946            continue;
2947        }
2948
2949        let node_line = descendant.start_position().row as u32 + 1;
2950        if node_line != line {
2951            continue;
2952        }
2953
2954        let text = node_text(descendant, source);
2955
2956        for pattern in patterns.sanitizers {
2957            let matched = pattern.call_names.iter().any(|name| {
2958                let call_kinds = call_node_kinds(language);
2959                if call_kinds.contains(&descendant.kind()) {
2960                    if let Some(call_name) = extract_call_name(descendant, source, language) {
2961                        return call_name == *name;
2962                    }
2963                }
2964                false
2965            }) || pattern.member_patterns.iter().any(|mp| text.contains(mp));
2966
2967            if matched {
2968                return Some(pattern.sanitizer_type);
2969            }
2970        }
2971    }
2972
2973    None
2974}
2975
2976/// Compute taint analysis with optional AST tree for improved detection.
2977///
2978/// When a parsed tree is provided, uses AST-based detection to filter out
2979/// false positives from comments and string literals. Falls back to regex
2980/// when AST detection yields no results.
2981///
2982/// This is the preferred entry point for CLI commands that have access to
2983/// the full parsed tree.
2984pub fn compute_taint_with_tree(
2985    cfg: &CfgInfo,
2986    refs: &[VarRef],
2987    statements: &HashMap<u32, String>,
2988    tree: Option<&tree_sitter::Tree>,
2989    source: Option<&[u8]>,
2990    language: Language,
2991) -> Result<TaintInfo, TldrError> {
2992    // If we have tree + source, use AST-enhanced detection within compute_taint
2993    // For now, delegate to the existing compute_taint which uses regex patterns.
2994    // The AST detection functions are available for direct use, and we integrate
2995    // them here as an enhancement layer.
2996
2997    // Validate CFG
2998    validate_cfg(cfg)?;
2999
3000    let mut result = TaintInfo::new(&cfg.function);
3001
3002    // Build helper maps
3003    let predecessors = build_predecessors(cfg);
3004    let successors = build_successors(cfg);
3005    let line_to_block = build_line_to_block(cfg);
3006    let refs_by_block = build_refs_by_block(refs, &line_to_block);
3007
3008    // Detect sources and sinks
3009    if let (Some(tree), Some(src)) = (tree, source) {
3010        // AST-based detection: walk the tree ONCE (no line filter) to avoid
3011        // O(lines * nodes) quadratic slowdown that caused infinite-loop-like hangs
3012        // on large files.
3013        let root = tree.root_node();
3014
3015        let all_ast_sources = detect_sources_ast(&root, src, language, None);
3016        let all_ast_sinks = detect_sinks_ast(&root, src, language, None);
3017
3018        // Index AST results by line for fast lookup
3019        let mut ast_sources_by_line: HashMap<u32, Vec<TaintSource>> = HashMap::new();
3020        for s in all_ast_sources {
3021            ast_sources_by_line.entry(s.line).or_default().push(s);
3022        }
3023        let mut ast_sinks_by_line: HashMap<u32, Vec<TaintSink>> = HashMap::new();
3024        for s in all_ast_sinks {
3025            ast_sinks_by_line.entry(s.line).or_default().push(s);
3026        }
3027
3028        for (&line, stmt) in statements {
3029            // Sources: prefer AST results, fall back to regex
3030            if let Some(sources) = ast_sources_by_line.remove(&line) {
3031                result.sources.extend(sources);
3032            } else {
3033                result.sources.extend(detect_sources(stmt, line, language));
3034            }
3035
3036            // Sinks: merge AST and regex results to avoid missing detections
3037            // when AST finds something on a line but misses certain sink patterns.
3038            // Dedup below handles any duplicates from the merge.
3039            if let Some(sinks) = ast_sinks_by_line.remove(&line) {
3040                result.sinks.extend(sinks);
3041            }
3042            result.sinks.extend(detect_sinks(stmt, line, language));
3043        }
3044    } else {
3045        // No tree available - use regex only (backward compatible)
3046        for (&line, stmt) in statements {
3047            result.sources.extend(detect_sources(stmt, line, language));
3048            result.sinks.extend(detect_sinks(stmt, line, language));
3049        }
3050    }
3051
3052    // Deduplicate sources by (line, source_type, var)
3053    result.sources.sort_by(|a, b| {
3054        a.line
3055            .cmp(&b.line)
3056            .then_with(|| format!("{:?}", a.source_type).cmp(&format!("{:?}", b.source_type)))
3057            .then_with(|| a.var.cmp(&b.var))
3058    });
3059    result.sources.dedup_by(|a, b| {
3060        a.line == b.line
3061            && a.var == b.var
3062            && std::mem::discriminant(&a.source_type) == std::mem::discriminant(&b.source_type)
3063    });
3064
3065    // Deduplicate sinks by (line, sink_type, var)
3066    result.sinks.sort_by(|a, b| {
3067        a.line
3068            .cmp(&b.line)
3069            .then_with(|| format!("{:?}", a.sink_type).cmp(&format!("{:?}", b.sink_type)))
3070            .then_with(|| a.var.cmp(&b.var))
3071    });
3072    result.sinks.dedup_by(|a, b| {
3073        a.line == b.line
3074            && a.var == b.var
3075            && std::mem::discriminant(&a.sink_type) == std::mem::discriminant(&b.sink_type)
3076    });
3077
3078    // The rest of the algorithm is the same as compute_taint
3079
3080    // Initialize taint sets per block
3081    let block_ids: Vec<usize> = cfg.blocks.iter().map(|b| b.id).collect();
3082    let mut tainted: HashMap<usize, HashSet<String>> = HashMap::new();
3083    for &bid in &block_ids {
3084        tainted.insert(bid, HashSet::new());
3085    }
3086
3087    for source in &result.sources {
3088        if let Some(&block_id) = line_to_block.get(&source.line) {
3089            tainted
3090                .entry(block_id)
3091                .or_default()
3092                .insert(source.var.clone());
3093        }
3094    }
3095
3096    // Worklist iteration
3097    // Cap iterations to prevent infinite loops on large real-world files
3098    let unique_vars: HashSet<&str> = refs.iter().map(|r| r.name.as_str()).collect();
3099    let computed_max = block_ids.len() * unique_vars.len().max(1) + 10;
3100    let max_iterations = computed_max.min(MAX_TAINT_ITERATIONS);
3101    let mut worklist: VecDeque<usize> = block_ids.iter().cloned().collect();
3102    let mut iterations = 0;
3103    let mut iteration_limit_reached = false;
3104
3105    let mut source_vars_by_block: HashMap<usize, HashSet<String>> = HashMap::new();
3106    for source in &result.sources {
3107        if let Some(&block_id) = line_to_block.get(&source.line) {
3108            source_vars_by_block
3109                .entry(block_id)
3110                .or_default()
3111                .insert(source.var.clone());
3112        }
3113    }
3114
3115    while let Some(block_id) = worklist.pop_front() {
3116        if iterations >= max_iterations {
3117            iteration_limit_reached = true;
3118            break;
3119        }
3120        iterations += 1;
3121
3122        let mut taint_in: HashSet<String> = predecessors
3123            .get(&block_id)
3124            .map(|preds| {
3125                preds
3126                    .iter()
3127                    .flat_map(|p| tainted.get(p).cloned().unwrap_or_default())
3128                    .collect()
3129            })
3130            .unwrap_or_default();
3131
3132        if let Some(source_vars) = source_vars_by_block.get(&block_id) {
3133            taint_in.extend(source_vars.clone());
3134        }
3135
3136        let taint_out = process_block(
3137            block_id,
3138            taint_in,
3139            &refs_by_block,
3140            statements,
3141            &line_to_block,
3142            &mut result.sanitized_vars,
3143            language,
3144        );
3145
3146        let old_taint = tainted.get(&block_id).cloned().unwrap_or_default();
3147        if taint_out != old_taint {
3148            tainted.insert(block_id, taint_out);
3149            if let Some(succs) = successors.get(&block_id) {
3150                for &s in succs {
3151                    if !worklist.contains(&s) {
3152                        worklist.push_back(s);
3153                    }
3154                }
3155            }
3156        }
3157    }
3158
3159    if iteration_limit_reached {
3160        result.convergence = Some("iteration_limit_reached".to_string());
3161    }
3162
3163    result.tainted_vars = tainted.clone();
3164
3165    // Phase 5: Detect vulnerabilities
3166    for sink in &mut result.sinks {
3167        if let Some(&sink_block) = line_to_block.get(&sink.line) {
3168            if let Some(tainted_at_block) = tainted.get(&sink_block) {
3169                // Direct match: sink variable itself is tainted
3170                if tainted_at_block.contains(&sink.var) {
3171                    sink.tainted = true;
3172                } else if !tainted_at_block.is_empty() {
3173                    // Indirect match: check if any tainted variable appears
3174                    // in the block's statements. Handles multi-line calls where
3175                    // the tainted argument is on a different line than the sink
3176                    // function name (e.g., conn.execute(\n "..." + username))
3177                    if let Some(block) = cfg.blocks.iter().find(|b| b.id == sink_block) {
3178                        let block_text: String = (block.lines.0..=block.lines.1)
3179                            .filter_map(|l| statements.get(&l))
3180                            .map(|s| s.as_str())
3181                            .collect::<Vec<_>>()
3182                            .join(" ");
3183                        for tvar in tainted_at_block {
3184                            if identifier_in_text(&block_text, tvar) {
3185                                sink.tainted = true;
3186                                break;
3187                            }
3188                        }
3189                    }
3190                }
3191            }
3192        }
3193    }
3194
3195    let sources_clone = result.sources.clone();
3196    let sinks_snapshot: Vec<(String, u32, TaintSinkType, bool, Option<String>)> = result
3197        .sinks
3198        .iter()
3199        .map(|s| {
3200            (
3201                s.var.clone(),
3202                s.line,
3203                s.sink_type,
3204                s.tainted,
3205                s.statement.clone(),
3206            )
3207        })
3208        .collect();
3209
3210    for (sink_var, sink_line, sink_type, sink_tainted, sink_statement) in sinks_snapshot {
3211        if !sink_tainted {
3212            continue;
3213        }
3214
3215        if let Some(&sink_block) = line_to_block.get(&sink_line) {
3216            for source in &sources_clone {
3217                if let Some(&source_block) = line_to_block.get(&source.line) {
3218                    if flows_to(&source.var, &sink_var, &tainted, &predecessors, sink_block) {
3219                        let is_sanitized = result.sanitized_vars.contains(&sink_var);
3220                        if !is_sanitized {
3221                            let path = compute_flow_path(source_block, sink_block, &successors);
3222                            let flow = TaintFlow {
3223                                source: source.clone(),
3224                                sink: TaintSink {
3225                                    var: sink_var.clone(),
3226                                    line: sink_line,
3227                                    sink_type,
3228                                    tainted: true,
3229                                    statement: sink_statement.clone(),
3230                                },
3231                                path,
3232                            };
3233                            result.flows.push(flow);
3234                        }
3235                    }
3236                }
3237            }
3238        }
3239    }
3240
3241    Ok(result)
3242}
3243
3244// =============================================================================
3245// Vulnerability Detection Helpers - Phase 5
3246// =============================================================================
3247
/// Decide whether taint from a source may reach a variable at a sink.
///
/// The check is intentionally conservative: it does not trace which source
/// produced the taint. Any source is assumed to be a possible origin as long
/// as the target variable is tainted in the block that contains the sink.
/// Attributing taint to a specific source would require per-variable
/// provenance tracking, which this function does not perform.
///
/// # Arguments
///
/// * `_source_var` - The source variable (ignored by the conservative check)
/// * `target_var` - The variable inspected at the sink
/// * `tainted_vars` - Tainted variable names recorded for each block
/// * `_predecessors` - Block predecessor map (ignored by the conservative check)
/// * `target_block` - ID of the block containing the sink
///
/// # Returns
///
/// `true` when `target_var` is in the taint set of `target_block`; `false`
/// when it is not, or when no taint set is recorded for that block.
fn flows_to(
    _source_var: &str,
    target_var: &str,
    tainted_vars: &HashMap<usize, HashSet<String>>,
    _predecessors: &HashMap<usize, Vec<usize>>,
    target_block: usize,
) -> bool {
    match tainted_vars.get(&target_block) {
        Some(block_taint) => block_taint.contains(target_var),
        // A block with no recorded taint state cannot hold a tainted variable.
        None => false,
    }
}
3280
/// Compute block IDs along the flow path from source to sink.
///
/// Runs a breadth-first search over the CFG successor map, so the returned
/// path is a shortest one (fewest edges). Among equally short paths, the one
/// discovered first in successor-list order is returned.
///
/// The search records a single parent link per discovered block and rebuilds
/// the path once the sink is found, instead of carrying a cloned path vector
/// for every queued block.
///
/// # Arguments
///
/// * `source_block` - Block ID containing the taint source
/// * `sink_block` - Block ID containing the taint sink
/// * `successors` - Block successor map
///
/// # Returns
///
/// Vector of block IDs from source to sink (inclusive). When both IDs are the
/// same, the result is that single block. When the sink is unreachable, the
/// fallback `[source_block, sink_block]` is returned even though no such edge
/// exists in the CFG.
fn compute_flow_path(
    source_block: usize,
    sink_block: usize,
    successors: &HashMap<usize, Vec<usize>>,
) -> Vec<usize> {
    if source_block == sink_block {
        return vec![source_block];
    }

    // parent[b] is the block from which `b` was first discovered. The source
    // never receives an entry, which is what terminates path reconstruction.
    let mut parent: HashMap<usize, usize> = HashMap::new();
    let mut visited: HashSet<usize> = HashSet::new();
    let mut queue: VecDeque<usize> = VecDeque::new();

    visited.insert(source_block);
    queue.push_back(source_block);

    while let Some(current) = queue.pop_front() {
        let succs = match successors.get(&current) {
            Some(succs) => succs,
            None => continue,
        };

        for &next in succs {
            if next == sink_block {
                // Walk the parent links back to the source, then flip the
                // collected blocks into source -> sink order.
                let mut path = vec![sink_block, current];
                let mut cursor = current;
                while let Some(&prev) = parent.get(&cursor) {
                    path.push(prev);
                    cursor = prev;
                }
                path.reverse();
                return path;
            }

            // `insert` returns true only the first time a block is seen.
            if visited.insert(next) {
                parent.insert(next, current);
                queue.push_back(next);
            }
        }
    }

    // No path found - return just source and sink
    vec![source_block, sink_block]
}
3335
3336// =============================================================================
3337// Worklist Algorithm - Phase 4
3338// =============================================================================
3339
3340/// Compute taint analysis for a function using worklist-based forward dataflow.
3341///
3342/// # Algorithm
3343///
3344/// Forward worklist-based dataflow analysis:
3345/// 1. Initialize: entry block tainted_vars = sources
3346/// 2. Worklist iteration until fixed point:
3347///    - taint_in[B] = union(taint_out[P] for P in predecessors[B])
3348///    - Process block: propagate taint through assignments
3349///    - taint_out[B] = process_block(taint_in[B])
3350///    - If changed, add successors to worklist
3351///
3352/// # Arguments
3353///
3354/// * `cfg` - Control flow graph for the function
3355/// * `refs` - Variable references (definitions and uses)
3356/// * `statements` - Map of line number to statement text (for pattern matching)
3357/// * `language` - The programming language (determines which taint patterns to use)
3358///
3359/// # Returns
3360///
3361/// `TaintInfo` containing all taint analysis results.
3362///
3363/// # Errors
3364///
3365/// Returns `TldrError::InvalidArgs` if the CFG is invalid.
3366pub fn compute_taint(
3367    cfg: &CfgInfo,
3368    refs: &[VarRef],
3369    statements: &HashMap<u32, String>,
3370    language: Language,
3371) -> Result<TaintInfo, TldrError> {
3372    // Validate CFG
3373    validate_cfg(cfg)?;
3374
3375    let mut result = TaintInfo::new(&cfg.function);
3376
3377    // Build helper maps
3378    let predecessors = build_predecessors(cfg);
3379    let successors = build_successors(cfg);
3380    let line_to_block = build_line_to_block(cfg);
3381    let refs_by_block = build_refs_by_block(refs, &line_to_block);
3382
3383    // Detect sources and sinks from statements
3384    for (&line, stmt) in statements {
3385        for source in detect_sources(stmt, line, language) {
3386            result.sources.push(source);
3387        }
3388        for sink in detect_sinks(stmt, line, language) {
3389            result.sinks.push(sink);
3390        }
3391    }
3392
3393    // Initialize taint sets per block
3394    let block_ids: Vec<usize> = cfg.blocks.iter().map(|b| b.id).collect();
3395    let mut tainted: HashMap<usize, HashSet<String>> = HashMap::new();
3396    for &bid in &block_ids {
3397        tainted.insert(bid, HashSet::new());
3398    }
3399
3400    // Initialize all blocks with their respective source variables
3401    for source in &result.sources {
3402        if let Some(&block_id) = line_to_block.get(&source.line) {
3403            tainted
3404                .entry(block_id)
3405                .or_default()
3406                .insert(source.var.clone());
3407        }
3408    }
3409
3410    // Worklist iteration
3411    // Max iterations bounded by O(blocks * vars) with hard cap to guarantee termination
3412    let unique_vars: HashSet<&str> = refs.iter().map(|r| r.name.as_str()).collect();
3413    let computed_max = block_ids.len() * unique_vars.len().max(1) + 10;
3414    let max_iterations = computed_max.min(MAX_TAINT_ITERATIONS);
3415    let mut worklist: VecDeque<usize> = block_ids.iter().cloned().collect();
3416    let mut iterations = 0;
3417    let mut iteration_limit_reached = false;
3418
3419    // Build a map of source variables by block for initialization
3420    let mut source_vars_by_block: HashMap<usize, HashSet<String>> = HashMap::new();
3421    for source in &result.sources {
3422        if let Some(&block_id) = line_to_block.get(&source.line) {
3423            source_vars_by_block
3424                .entry(block_id)
3425                .or_default()
3426                .insert(source.var.clone());
3427        }
3428    }
3429
3430    while let Some(block_id) = worklist.pop_front() {
3431        if iterations >= max_iterations {
3432            iteration_limit_reached = true;
3433            break; // Safety bound to prevent infinite loops
3434        }
3435        iterations += 1;
3436
3437        // Compute taint_in = union of predecessors' taint_out
3438        let mut taint_in: HashSet<String> = predecessors
3439            .get(&block_id)
3440            .map(|preds| {
3441                preds
3442                    .iter()
3443                    .flat_map(|p| tainted.get(p).cloned().unwrap_or_default())
3444                    .collect()
3445            })
3446            .unwrap_or_default();
3447
3448        // Add source variables that originate in this block
3449        if let Some(source_vars) = source_vars_by_block.get(&block_id) {
3450            taint_in.extend(source_vars.clone());
3451        }
3452
3453        // Process block: propagate taint through assignments
3454        let taint_out = process_block(
3455            block_id,
3456            taint_in,
3457            &refs_by_block,
3458            statements,
3459            &line_to_block,
3460            &mut result.sanitized_vars,
3461            language,
3462        );
3463
3464        // If changed, add successors to worklist
3465        let old_taint = tainted.get(&block_id).cloned().unwrap_or_default();
3466        if taint_out != old_taint {
3467            tainted.insert(block_id, taint_out);
3468            if let Some(succs) = successors.get(&block_id) {
3469                for &s in succs {
3470                    if !worklist.contains(&s) {
3471                        worklist.push_back(s);
3472                    }
3473                }
3474            }
3475        }
3476    }
3477
3478    if iteration_limit_reached {
3479        result.convergence = Some("iteration_limit_reached".to_string());
3480    }
3481
3482    result.tainted_vars = tainted.clone();
3483
3484    // =========================================================================
3485    // Phase 5: Detect vulnerabilities (source -> sink flows)
3486    // =========================================================================
3487    // For each sink, check if its variable is tainted at that block.
3488    // If tainted, find the source(s) that caused it and record the flow.
3489
3490    for sink in &mut result.sinks {
3491        // Get block containing this sink
3492        if let Some(&sink_block) = line_to_block.get(&sink.line) {
3493            // Check if sink variable is tainted at this point
3494            if let Some(tainted_at_block) = tainted.get(&sink_block) {
3495                if tainted_at_block.contains(&sink.var) {
3496                    sink.tainted = true;
3497                }
3498            }
3499        }
3500    }
3501
3502    // Now create flows for tainted sinks
3503    // We need to iterate over sinks again with immutable access to sources
3504    let sources_clone = result.sources.clone();
3505    let sinks_snapshot: Vec<(String, u32, TaintSinkType, bool, Option<String>)> = result
3506        .sinks
3507        .iter()
3508        .map(|s| {
3509            (
3510                s.var.clone(),
3511                s.line,
3512                s.sink_type,
3513                s.tainted,
3514                s.statement.clone(),
3515            )
3516        })
3517        .collect();
3518
3519    for (sink_var, sink_line, sink_type, sink_tainted, sink_statement) in sinks_snapshot {
3520        if !sink_tainted {
3521            continue;
3522        }
3523
3524        // Get block containing this sink
3525        if let Some(&sink_block) = line_to_block.get(&sink_line) {
3526            // Find which source(s) caused this taint
3527            for source in &sources_clone {
3528                // Get block containing this source
3529                if let Some(&source_block) = line_to_block.get(&source.line) {
3530                    // Check if source variable flows to sink variable
3531                    if flows_to(&source.var, &sink_var, &tainted, &predecessors, sink_block) {
3532                        // Check if the sink variable was sanitized
3533                        let is_sanitized = result.sanitized_vars.contains(&sink_var);
3534
3535                        // Only record if NOT sanitized
3536                        if !is_sanitized {
3537                            let path = compute_flow_path(source_block, sink_block, &successors);
3538
3539                            let flow = TaintFlow {
3540                                source: source.clone(),
3541                                sink: TaintSink {
3542                                    var: sink_var.clone(),
3543                                    line: sink_line,
3544                                    sink_type,
3545                                    tainted: true,
3546                                    statement: sink_statement.clone(),
3547                                },
3548                                path,
3549                            };
3550
3551                            result.flows.push(flow);
3552                        }
3553                    }
3554                }
3555            }
3556        }
3557    }
3558
3559    Ok(result)
3560}
3561
3562/// Process a single block for taint propagation.
3563///
3564/// Propagates taint through assignments in the block:
3565/// - If RHS uses a tainted variable, LHS becomes tainted
3566/// - If a sanitizer is applied, the result is NOT tainted
3567/// - Definitions without taint remove taint from the variable
3568///
3569/// # Arguments
3570///
3571/// * `block_id` - The block being processed
3572/// * `current_taint` - Set of tainted variables at block entry
3573/// * `refs_by_block` - VarRefs grouped by block
3574/// * `statements` - Statement text by line number
3575/// * `line_to_block` - Mapping from line to block ID
3576/// * `sanitized_vars` - Set of variables that have been sanitized (mutated)
3577/// * `language` - The programming language (determines which patterns to use)
3578///
3579/// # Returns
3580///
3581/// Set of tainted variables at block exit.
3582fn process_block(
3583    block_id: usize,
3584    mut current_taint: HashSet<String>,
3585    refs_by_block: &HashMap<usize, Vec<&VarRef>>,
3586    statements: &HashMap<u32, String>,
3587    _line_to_block: &HashMap<u32, usize>,
3588    sanitized_vars: &mut HashSet<String>,
3589    language: Language,
3590) -> HashSet<String> {
3591    let empty_refs = vec![];
3592    let block_refs = refs_by_block.get(&block_id).unwrap_or(&empty_refs);
3593
3594    for var_ref in block_refs {
3595        let stmt = statements
3596            .get(&var_ref.line)
3597            .map(|s| s.as_str())
3598            .unwrap_or("");
3599
3600        match var_ref.ref_type {
3601            RefType::Definition => {
3602                // Check if RHS uses a tainted variable
3603                let rhs_tainted = current_taint.iter().any(|tv| stmt.contains(tv.as_str()));
3604
3605                // Check if sanitized
3606                if detect_sanitizer(stmt, language).is_some() {
3607                    sanitized_vars.insert(var_ref.name.clone());
3608                    current_taint.remove(&var_ref.name);
3609                } else if rhs_tainted {
3610                    current_taint.insert(var_ref.name.clone());
3611                } else {
3612                    // Definition without taint removes taint
3613                    current_taint.remove(&var_ref.name);
3614                }
3615            }
3616            RefType::Use => {
3617                // Uses don't change taint state directly
3618            }
3619            RefType::Update => {
3620                // Update is use-then-def (e.g., x += y)
3621                // If RHS is tainted, result is tainted
3622                let rhs_tainted = current_taint.iter().any(|tv| stmt.contains(tv.as_str()));
3623                if rhs_tainted {
3624                    current_taint.insert(var_ref.name.clone());
3625                }
3626            }
3627        }
3628    }
3629
3630    current_taint
3631}
3632
3633#[cfg(test)]
3634mod tests {
3635    use super::*;
3636
3637    #[test]
3638    fn test_taint_source_type_serde() {
3639        let source = TaintSourceType::UserInput;
3640        let json = serde_json::to_string(&source).unwrap();
3641        assert_eq!(json, "\"user_input\"");
3642
3643        let parsed: TaintSourceType = serde_json::from_str(&json).unwrap();
3644        assert_eq!(parsed, source);
3645    }
3646
3647    #[test]
3648    fn test_taint_sink_type_serde() {
3649        let sink = TaintSinkType::SqlQuery;
3650        let json = serde_json::to_string(&sink).unwrap();
3651        assert_eq!(json, "\"sql_query\"");
3652
3653        let parsed: TaintSinkType = serde_json::from_str(&json).unwrap();
3654        assert_eq!(parsed, sink);
3655    }
3656
3657    #[test]
3658    fn test_sanitizer_type_serde() {
3659        let sanitizer = SanitizerType::Numeric;
3660        let json = serde_json::to_string(&sanitizer).unwrap();
3661        assert_eq!(json, "\"numeric\"");
3662
3663        let parsed: SanitizerType = serde_json::from_str(&json).unwrap();
3664        assert_eq!(parsed, sanitizer);
3665    }
3666
3667    #[test]
3668    fn test_taint_info_new() {
3669        let info = TaintInfo::new("my_function");
3670        assert_eq!(info.function_name, "my_function");
3671        assert!(info.tainted_vars.is_empty());
3672        assert!(info.sources.is_empty());
3673        assert!(info.sinks.is_empty());
3674        assert!(info.flows.is_empty());
3675        assert!(info.sanitized_vars.is_empty());
3676    }
3677
3678    #[test]
3679    fn test_taint_info_default() {
3680        let info = TaintInfo::default();
3681        assert!(info.function_name.is_empty());
3682        assert!(info.tainted_vars.is_empty());
3683    }
3684
3685    #[test]
3686    fn test_taint_info_is_tainted() {
3687        let mut info = TaintInfo::new("test");
3688        let mut block_taint = HashSet::new();
3689        block_taint.insert("user_input".to_string());
3690        info.tainted_vars.insert(0, block_taint);
3691
3692        assert!(info.is_tainted(0, "user_input"));
3693        assert!(!info.is_tainted(0, "other_var"));
3694        assert!(!info.is_tainted(1, "user_input")); // block 1 doesn't exist
3695    }
3696
3697    #[test]
3698    fn test_taint_info_get_vulnerabilities() {
3699        let mut info = TaintInfo::new("test");
3700
3701        // Add a tainted sink (vulnerability)
3702        info.sinks.push(TaintSink {
3703            var: "query".to_string(),
3704            line: 5,
3705            sink_type: TaintSinkType::SqlQuery,
3706            tainted: true,
3707            statement: Some("cursor.execute(query)".to_string()),
3708        });
3709
3710        // Add a non-tainted sink (safe)
3711        info.sinks.push(TaintSink {
3712            var: "safe_query".to_string(),
3713            line: 10,
3714            sink_type: TaintSinkType::SqlQuery,
3715            tainted: false,
3716            statement: Some("cursor.execute(safe_query)".to_string()),
3717        });
3718
3719        let vulns = info.get_vulnerabilities();
3720        assert_eq!(vulns.len(), 1);
3721        assert_eq!(vulns[0].var, "query");
3722    }
3723
3724    /// Test that compute_taint terminates on a large CFG with many variables
3725    /// and back-edges that could cause oscillation in the worklist algorithm.
3726    /// This test would hang forever without the MAX_TAINT_ITERATIONS cap.
3727    #[test]
3728    fn test_taint_terminates_on_large_cfg_with_backedges() {
3729        use crate::types::{BlockType, CfgBlock, CfgEdge, CfgInfo, EdgeType, RefType, VarRef};
3730
3731        // Create a CFG with 50 blocks in a chain, plus back-edges
3732        let num_blocks = 50;
3733        let mut blocks = Vec::new();
3734        let mut edges = Vec::new();
3735
3736        for i in 0..num_blocks {
3737            let start_line = (i * 10 + 1) as u32;
3738            let end_line = (i * 10 + 10) as u32;
3739            blocks.push(CfgBlock {
3740                id: i,
3741                block_type: BlockType::Body,
3742                lines: (start_line, end_line),
3743                calls: Vec::new(),
3744            });
3745        }
3746
3747        // Linear chain edges
3748        for i in 0..num_blocks - 1 {
3749            edges.push(CfgEdge {
3750                from: i,
3751                to: i + 1,
3752                edge_type: EdgeType::Unconditional,
3753                condition: None,
3754            });
3755        }
3756
3757        // Add back-edges to create loops that could cause oscillation
3758        for i in (5..num_blocks).step_by(5) {
3759            edges.push(CfgEdge {
3760                from: i,
3761                to: i - 3,
3762                edge_type: EdgeType::BackEdge,
3763                condition: None,
3764            });
3765        }
3766
3767        let cfg = CfgInfo {
3768            function: "large_func".to_string(),
3769            blocks,
3770            edges,
3771            entry_block: 0,
3772            exit_blocks: vec![num_blocks - 1],
3773            cyclomatic_complexity: 10,
3774            nested_functions: HashMap::new(),
3775        };
3776
3777        // Create many variable refs across blocks
3778        let mut refs = Vec::new();
3779        let mut statements = HashMap::new();
3780
3781        for i in 0..num_blocks {
3782            let line = (i * 10 + 1) as u32;
3783            let var_name = format!("var_{}", i);
3784            refs.push(VarRef {
3785                name: var_name.clone(),
3786                ref_type: RefType::Definition,
3787                line,
3788                column: 0,
3789                context: None,
3790                group_id: None,
3791            });
3792            // Create statements that reference previous variables to create taint chains
3793            if i > 0 {
3794                statements.insert(line, format!("var_{} = var_{}", i, i - 1));
3795            } else {
3796                statements.insert(line, "var_0 = input()".to_string());
3797            }
3798        }
3799
3800        // This MUST terminate within a reasonable time (< 1 second)
3801        let start = std::time::Instant::now();
3802        let result = compute_taint(&cfg, &refs, &statements, Language::Python);
3803        let elapsed = start.elapsed();
3804
3805        assert!(result.is_ok(), "compute_taint should succeed");
3806        assert!(
3807            elapsed.as_secs() < 5,
3808            "compute_taint took too long: {:?} (possible infinite loop)",
3809            elapsed
3810        );
3811
3812        // Should have found the input() source
3813        let info = result.unwrap();
3814        assert!(!info.sources.is_empty(), "Should detect input() source");
3815    }
3816
3817    /// Test that the hard iteration cap MAX_TAINT_ITERATIONS is respected
3818    /// even when the computed max_iterations would be very large.
3819    #[test]
3820    fn test_taint_iteration_cap_prevents_runaway() {
3821        use crate::types::{BlockType, CfgBlock, CfgEdge, CfgInfo, EdgeType, RefType, VarRef};
3822
3823        // Create a small CFG but with MANY variable references to inflate max_iterations
3824        let blocks = vec![
3825            CfgBlock {
3826                id: 0,
3827                block_type: BlockType::Body,
3828                lines: (1, 100),
3829                calls: Vec::new(),
3830            },
3831            CfgBlock {
3832                id: 1,
3833                block_type: BlockType::Body,
3834                lines: (101, 200),
3835                calls: Vec::new(),
3836            },
3837        ];
3838        let edges = vec![
3839            CfgEdge {
3840                from: 0,
3841                to: 1,
3842                edge_type: EdgeType::Unconditional,
3843                condition: None,
3844            },
3845            CfgEdge {
3846                from: 1,
3847                to: 0,
3848                edge_type: EdgeType::BackEdge,
3849                condition: None,
3850            },
3851        ];
3852
3853        let cfg = CfgInfo {
3854            function: "runaway".to_string(),
3855            blocks,
3856            edges,
3857            entry_block: 0,
3858            exit_blocks: vec![1],
3859            cyclomatic_complexity: 2,
3860            nested_functions: HashMap::new(),
3861        };
3862
3863        // Create 500 unique variable refs - this would make max_iterations = 2 * 500 + 10 = 1010
3864        // which is above our MAX_TAINT_ITERATIONS cap of 1000
3865        let mut refs = Vec::new();
3866        let mut statements = HashMap::new();
3867
3868        for i in 0..500 {
3869            let line = (i + 1) as u32;
3870            refs.push(VarRef {
3871                name: format!("v{}", i),
3872                ref_type: RefType::Definition,
3873                line,
3874                column: 0,
3875                context: None,
3876                group_id: None,
3877            });
3878            statements.insert(line, format!("v{} = input()", i));
3879        }
3880
3881        let start = std::time::Instant::now();
3882        let result = compute_taint(&cfg, &refs, &statements, Language::Python);
3883        let elapsed = start.elapsed();
3884
3885        assert!(result.is_ok());
3886        assert!(
3887            elapsed.as_secs() < 5,
3888            "Should terminate quickly with iteration cap, took {:?}",
3889            elapsed
3890        );
3891    }
3892
3893    /// Test that compute_taint_with_tree deduplicates sources that are detected
3894    /// by both AST-based and regex-based detection on the same line.
3895    #[test]
3896    fn test_sources_are_deduplicated() {
3897        use crate::ast::ParserPool;
3898        use crate::types::{BlockType, CfgBlock, CfgEdge, CfgInfo, EdgeType, RefType, VarRef};
3899
3900        let python_code = r#"import os
3901
3902def vulnerable_func(user_input):
3903    data = input("Enter: ")
3904    query = "SELECT * FROM users WHERE id = " + data
3905    os.system(user_input)
3906    eval(data)
3907"#;
3908
3909        let cfg = CfgInfo {
3910            function: "vulnerable_func".to_string(),
3911            blocks: vec![
3912                CfgBlock {
3913                    id: 0,
3914                    block_type: BlockType::Entry,
3915                    lines: (3, 3),
3916                    calls: Vec::new(),
3917                },
3918                CfgBlock {
3919                    id: 1,
3920                    block_type: BlockType::Body,
3921                    lines: (4, 7),
3922                    calls: vec![
3923                        "input".to_string(),
3924                        "os.system".to_string(),
3925                        "eval".to_string(),
3926                    ],
3927                },
3928            ],
3929            edges: vec![CfgEdge {
3930                from: 0,
3931                to: 1,
3932                edge_type: EdgeType::Unconditional,
3933                condition: None,
3934            }],
3935            entry_block: 0,
3936            exit_blocks: vec![1],
3937            cyclomatic_complexity: 1,
3938            nested_functions: HashMap::new(),
3939        };
3940
3941        let refs = vec![
3942            VarRef {
3943                name: "user_input".to_string(),
3944                ref_type: RefType::Definition,
3945                line: 3,
3946                column: 0,
3947                context: None,
3948                group_id: None,
3949            },
3950            VarRef {
3951                name: "data".to_string(),
3952                ref_type: RefType::Definition,
3953                line: 4,
3954                column: 0,
3955                context: None,
3956                group_id: None,
3957            },
3958            VarRef {
3959                name: "query".to_string(),
3960                ref_type: RefType::Definition,
3961                line: 5,
3962                column: 0,
3963                context: None,
3964                group_id: None,
3965            },
3966        ];
3967
3968        let mut statements: HashMap<u32, String> = HashMap::new();
3969        for (i, line) in python_code.lines().enumerate() {
3970            statements.insert((i + 1) as u32, line.to_string());
3971        }
3972
3973        let pool = ParserPool::new();
3974        let tree = pool.parse(python_code, Language::Python).ok();
3975
3976        let result = compute_taint_with_tree(
3977            &cfg,
3978            &refs,
3979            &statements,
3980            tree.as_ref(),
3981            Some(python_code.as_bytes()),
3982            Language::Python,
3983        )
3984        .unwrap();
3985
3986        // Each unique (line, source_type, var) should appear exactly once
3987        let mut seen = std::collections::HashSet::new();
3988        for source in &result.sources {
3989            let key = (
3990                source.line,
3991                std::mem::discriminant(&source.source_type),
3992                source.var.clone(),
3993            );
3994            assert!(
3995                seen.insert(key.clone()),
3996                "Duplicate source found: line={}, var={}, type={:?}",
3997                source.line,
3998                source.var,
3999                source.source_type
4000            );
4001        }
4002
4003        // Same for sinks
4004        let mut seen_sinks = std::collections::HashSet::new();
4005        for sink in &result.sinks {
4006            let key = (
4007                sink.line,
4008                std::mem::discriminant(&sink.sink_type),
4009                sink.var.clone(),
4010            );
4011            assert!(
4012                seen_sinks.insert(key.clone()),
4013                "Duplicate sink found: line={}, var={}, type={:?}",
4014                sink.line,
4015                sink.var,
4016                sink.sink_type
4017            );
4018        }
4019    }
4020
4021    /// Test that sinks are detected even when AST detection misses them
4022    /// but regex detection would catch them. Both sources should be merged.
4023    #[test]
4024    fn test_sinks_detected_via_merge() {
4025        use crate::ast::ParserPool;
4026        use crate::types::{BlockType, CfgBlock, CfgEdge, CfgInfo, EdgeType, RefType, VarRef};
4027
4028        let python_code = r#"import os
4029
4030def vuln(user_input):
4031    os.system(user_input)
4032    eval(user_input)
4033"#;
4034
4035        let cfg = CfgInfo {
4036            function: "vuln".to_string(),
4037            blocks: vec![
4038                CfgBlock {
4039                    id: 0,
4040                    block_type: BlockType::Entry,
4041                    lines: (3, 3),
4042                    calls: Vec::new(),
4043                },
4044                CfgBlock {
4045                    id: 1,
4046                    block_type: BlockType::Body,
4047                    lines: (4, 5),
4048                    calls: vec!["os.system".to_string(), "eval".to_string()],
4049                },
4050            ],
4051            edges: vec![CfgEdge {
4052                from: 0,
4053                to: 1,
4054                edge_type: EdgeType::Unconditional,
4055                condition: None,
4056            }],
4057            entry_block: 0,
4058            exit_blocks: vec![1],
4059            cyclomatic_complexity: 1,
4060            nested_functions: HashMap::new(),
4061        };
4062
4063        let refs = vec![VarRef {
4064            name: "user_input".to_string(),
4065            ref_type: RefType::Definition,
4066            line: 3,
4067            column: 0,
4068            context: None,
4069            group_id: None,
4070        }];
4071
4072        let mut statements: HashMap<u32, String> = HashMap::new();
4073        for (i, line) in python_code.lines().enumerate() {
4074            statements.insert((i + 1) as u32, line.to_string());
4075        }
4076
4077        let pool = ParserPool::new();
4078        let tree = pool.parse(python_code, Language::Python).ok();
4079
4080        let result = compute_taint_with_tree(
4081            &cfg,
4082            &refs,
4083            &statements,
4084            tree.as_ref(),
4085            Some(python_code.as_bytes()),
4086            Language::Python,
4087        )
4088        .unwrap();
4089
4090        // Should detect at least 2 sinks: os.system and eval
4091        let sink_types: Vec<_> = result.sinks.iter().map(|s| s.sink_type).collect();
4092        assert!(
4093            sink_types.contains(&TaintSinkType::ShellExec),
4094            "Should detect os.system as ShellExec sink, got: {:?}",
4095            sink_types
4096        );
4097        assert!(
4098            sink_types.contains(&TaintSinkType::CodeEval),
4099            "Should detect eval as CodeEval sink, got: {:?}",
4100            sink_types
4101        );
4102    }
4103}