mir_extractor/
interprocedural.rs

1//! Inter-procedural taint analysis (Phase 3)
2//!
3//! This module implements inter-procedural dataflow analysis to track taint
4//! across function boundaries. It builds on Phase 2's intra-procedural analysis
5//! by constructing a call graph and propagating taint through function calls.
6//!
7//! ## Architecture
8//!
9//! 1. **Call Graph Construction**: Extract function calls from MIR to build
10//!    a directed graph of function dependencies.
11//!
12//! 2. **Function Summarization**: Analyze each function to create summaries
13//!    describing how taint flows through parameters and return values.
14//!
15//! 3. **Inter-Procedural Propagation**: Use summaries to track taint across
16//!    function boundaries, following chains of calls from sources to sinks.
17//!
18//! 4. **Context-Sensitive Analysis**: Distinguish different call sites to
19//!    maintain precision and avoid false positives.
20
21use anyhow::Result;
22use serde::{Deserialize, Serialize};
23use std::cell::RefCell;
24use std::collections::{HashMap, HashSet, VecDeque};
25
26use crate::dataflow::cfg::ControlFlowGraph;
27use crate::dataflow::closure::{ClosureRegistry, ClosureRegistryBuilder};
28use crate::dataflow::{DataflowSummary, TaintPropagation};
29use crate::{MirFunction, MirPackage};
30
/// Configuration for inter-procedural analysis (IPA) limits.
///
/// These limits control memory usage and analysis depth for taint tracking.
/// Increase limits for more thorough analysis on high-memory machines.
/// Decrease limits to analyze extremely large codebases with limited memory.
///
/// All fields have sensible defaults that work well for most codebases.
/// Because the struct is marked `#[serde(default)]`, any field missing from a
/// deserialized configuration takes its value from the `Default` impl.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
pub struct IpaConfig {
    /// Maximum call chain depth from source to sink (default: 8).
    /// Most real vulnerabilities have depth < 5.
    pub max_path_depth: usize,

    /// Maximum taint flows tracked per source function (default: 200).
    /// Increase if you suspect missed flows from a single source.
    pub max_flows_per_source: usize,

    /// Maximum functions visited per source exploration (default: 1000).
    /// Prevents memory explosion in extremely dense call graphs.
    pub max_visited: usize,

    /// Maximum total inter-procedural flows reported (default: 5000).
    /// Most real analyses produce < 500 flows.
    pub max_total_flows: usize,

    /// Maximum functions before skipping interprocedural analysis (default: 10000).
    /// For crates exceeding this threshold, IPA is skipped but intra-procedural
    /// analysis still runs for all rules.
    pub max_functions_for_ipa: usize,
}
62
/// Default limits; these are the values quoted in the field docs of `IpaConfig`.
impl Default for IpaConfig {
    /// Returns the stock limits: depth 8, 200 flows per source, 1000 visited
    /// functions, 5000 total flows, and a 10000-function IPA cut-off.
    fn default() -> Self {
        Self {
            max_path_depth: 8,
            max_flows_per_source: 200,
            max_visited: 1000,
            max_total_flows: 5000,
            max_functions_for_ipa: 10000,
        }
    }
}
74
/// Call graph representing function call relationships.
///
/// Built by `CallGraph::from_mir_package`, which creates one node per
/// function in the package and then computes `analysis_order`.
#[derive(Debug, Clone)]
pub struct CallGraph {
    /// Function name → CallGraphNode
    pub nodes: HashMap<String, CallGraphNode>,

    /// Analysis order (bottom-up: callees before callers).
    /// Functions whose position cannot be resolved because of call cycles
    /// are appended at the end, so every node name appears in this list.
    pub analysis_order: Vec<String>,
}
84
/// Node in the call graph representing a function
#[derive(Debug, Clone)]
pub struct CallGraphNode {
    /// Function name (fully qualified)
    pub function_name: String,

    /// Functions that call this function.
    /// `CallGraph::from_mir_package` pushes one entry per resolved call, so
    /// the same caller can appear more than once.
    pub callers: Vec<String>,

    /// Call sites found in this function's MIR body (resolved or not)
    pub callees: Vec<CallSite>,
    // Note: Function summaries are stored in InterProceduralAnalysis::summaries
    // to avoid memory duplication
}
99
/// A specific call site within a function
#[derive(Debug, Clone)]
pub struct CallSite {
    /// Name of the called function as it was parsed from the MIR line
    pub callee: String,

    /// Resolved target functions.
    /// Filled in by `CallGraph::from_mir_package`: the callee itself when it
    /// names a known function, otherwise every known function sharing the
    /// callee's short name; left empty when nothing matches.
    pub resolved_targets: Vec<String>,

    /// Location in the caller's MIR (for error reporting).
    /// The parsers in this file format it as `"line N"`, where N is the
    /// zero-based index of the line in the function body.
    pub location: String,

    /// Number of arguments passed (a heuristic estimate, not an exact count)
    pub arg_count: usize,
}
115
/// Summary of a function's taint behavior.
///
/// A fresh summary (see `FunctionSummary::new`) has empty sets and rules, a
/// `Clean` return value and no internal vulnerability.
#[derive(Debug, Clone)]
pub struct FunctionSummary {
    /// Function name
    pub function_name: String,

    /// Which parameters can introduce taint (parameter index)
    pub source_parameters: HashSet<usize>,

    /// Which parameters flow to sinks within this function
    pub sink_parameters: HashSet<usize>,

    /// Taint propagation rules (e.g. parameter → sink, parameter → return)
    pub propagation_rules: Vec<TaintPropagation>,

    /// Does the return value carry taint?
    pub return_taint: ReturnTaint,

    /// Does the function contain an internal vulnerability (source -> sink)?
    /// That is, a vulnerable path that exists without any tainted input.
    pub has_internal_vulnerability: bool,
}
137
/// Describes the taint state of a function's return value
#[derive(Debug, Clone)]
pub enum ReturnTaint {
    /// Return value is clean (not tainted)
    Clean,

    /// Return value is tainted from parameter N (the payload is the parameter index)
    FromParameter(usize),

    /// Return value is tainted from a source within the function;
    /// `source_type` is a free-form label for the kind of source
    FromSource { source_type: String },

    /// Return value depends on multiple taint sources
    Merged(Vec<ReturnTaint>),
}
153
154impl CallGraph {
155    /// Construct a call graph from a MIR package
156    pub fn from_mir_package(package: &MirPackage) -> Result<Self> {
157        let mut nodes = HashMap::new();
158
159        // Phase 1: Create nodes for all functions
160        for function in &package.functions {
161            let node = CallGraphNode {
162                function_name: function.name.clone(),
163                callers: Vec::new(),
164                callees: Vec::new(),
165            };
166            nodes.insert(function.name.clone(), node);
167        }
168
169        // Phase 2: Extract callees from each function's MIR
170        for function in &package.functions {
171            let callees = Self::extract_callees(function)?;
172            if let Some(node) = nodes.get_mut(&function.name) {
173                node.callees = callees;
174            }
175        }
176
177        // Build a map of short names to full names for resolution
178        let mut short_name_map: HashMap<String, Vec<String>> = HashMap::new();
179        for function in &package.functions {
180            let short_name = Self::extract_function_name(&function.name);
181            short_name_map
182                .entry(short_name)
183                .or_default()
184                .push(function.name.clone());
185        }
186
187        // Phase 3: Resolve calls and build caller relationships
188        let mut caller_map: HashMap<String, Vec<String>> = HashMap::new();
189        let mut resolved_callees_map: HashMap<String, Vec<CallSite>> = HashMap::new();
190
191        for (caller_name, node) in &nodes {
192            let mut resolved_callees = Vec::new();
193
194            for call_site in &node.callees {
195                // Try direct match first
196                if nodes.contains_key(&call_site.callee) {
197                    let mut new_site = call_site.clone();
198                    new_site.resolved_targets.push(call_site.callee.clone());
199                    resolved_callees.push(new_site);
200
201                    caller_map
202                        .entry(call_site.callee.clone())
203                        .or_default()
204                        .push(caller_name.clone());
205                } else {
206                    // Try to resolve via short name (e.g. trait methods)
207                    let short_name = Self::extract_function_name(&call_site.callee);
208
209                    if let Some(candidates) = short_name_map.get(&short_name) {
210                        // Resolved match (trait method, etc.)
211                        let mut new_site = call_site.clone();
212                        for candidate in candidates {
213                            new_site.resolved_targets.push(candidate.clone());
214
215                            caller_map
216                                .entry(candidate.clone())
217                                .or_default()
218                                .push(caller_name.clone());
219                        }
220                        resolved_callees.push(new_site);
221                    } else {
222                        // Unresolved - keep as is (maybe external function)
223                        resolved_callees.push(call_site.clone());
224                    }
225                }
226            }
227            resolved_callees_map.insert(caller_name.clone(), resolved_callees);
228        }
229
230        // Apply resolved callees
231        for (caller_name, callees) in resolved_callees_map {
232            if let Some(node) = nodes.get_mut(&caller_name) {
233                node.callees = callees;
234            }
235        }
236
237        for (callee_name, callers) in caller_map {
238            if let Some(node) = nodes.get_mut(&callee_name) {
239                node.callers = callers;
240            }
241        }
242
243        // Phase 4: Compute analysis order (bottom-up)
244        let analysis_order = Self::compute_analysis_order(&nodes)?;
245
246        Ok(CallGraph {
247            nodes,
248            analysis_order,
249        })
250    }
251
252    /// Extract callee information from a function's MIR
253    fn extract_callees(function: &MirFunction) -> Result<Vec<CallSite>> {
254        let mut callees = Vec::new();
255
256        // Parse MIR to find function calls
257        // MIR calls look like: "_N = function_name(args...)" or
258        // "TerminatorKind::Call { func: ... }"
259
260        for (line_idx, line) in function.body.iter().enumerate() {
261            // Look for call patterns in MIR
262            // Common patterns:
263            // 1. "= Fn(DefId(...), Substs(...))(" - direct function call
264            // 2. "= <Type as Trait>::method(" - trait method call
265            // 3. "Call { func: Operand::Constant..." - terminator call
266
267            if let Some(call_site) = Self::parse_call_from_mir_line(line, line_idx) {
268                callees.push(call_site);
269            }
270
271            // Check for closure/coroutine creation
272            if let Some(closure_callee) =
273                Self::parse_closure_creation(line, &function.name, line_idx)
274            {
275                callees.push(closure_callee);
276            }
277        }
278
279        Ok(callees)
280    }
281
282    /// Parse closure or coroutine creation as a "call" (dependency)
283    fn parse_closure_creation(line: &str, parent_name: &str, line_idx: usize) -> Option<CallSite> {
284        // _0 = {coroutine@... (#0)} ...
285        if let Some(eq_pos) = line.find('=') {
286            let rhs = line[eq_pos + 1..].trim();
287            if rhs.starts_with("{closure@") || rhs.starts_with("{coroutine@") {
288                // Extract index (#N)
289                if let Some(hash_pos) = rhs.find("(#") {
290                    if let Some(close_paren) = rhs[hash_pos..].find(')') {
291                        let index_str = &rhs[hash_pos + 2..hash_pos + close_paren];
292                        if let Ok(index) = index_str.parse::<usize>() {
293                            let callee = format!("{}::{{closure#{}}}", parent_name, index);
294                            return Some(CallSite {
295                                callee,
296                                resolved_targets: Vec::new(),
297                                location: format!("line {}", line_idx),
298                                arg_count: 0,
299                            });
300                        }
301                    }
302                }
303            }
304        }
305        None
306    }
307
308    /// Parse a single MIR line to detect function calls
309    fn parse_call_from_mir_line(line: &str, line_idx: usize) -> Option<CallSite> {
310        let line = line.trim();
311
312        // Pattern 1: Direct function call in statement
313        // Example: "_5 = my_function(move _6) -> [return: bb3, unwind: bb4];"
314        if line.contains("(") && line.contains(") -> [return:") {
315            // Pattern 1a: Closure invocation
316            // Example: "<{closure@examples/interprocedural/src/lib.rs:278:19: 278:21} as Fn<()>>::call(..."
317            if let Some(closure_name) = Self::extract_closure_call(line) {
318                return Some(CallSite {
319                    callee: closure_name,
320                    resolved_targets: Vec::new(),
321                    location: format!("line {}", line_idx),
322                    arg_count: 1, // Closure takes the closure env as arg
323                });
324            }
325
326            // Extract function name between '=' and '('
327            if let Some(eq_pos) = line.find('=') {
328                if let Some(paren_pos) = line[eq_pos..].find('(') {
329                    let func_part = &line[eq_pos + 1..eq_pos + paren_pos].trim();
330
331                    // Clean up the function name
332                    // We want the full name here, but cleaned of MIR artifacts
333                    let func_name = func_part
334                        .replace("const ", "")
335                        .replace("move ", "")
336                        .replace("copy ", "")
337                        .trim()
338                        .to_string();
339
340                    if !func_name.is_empty() && !Self::is_builtin_operation(&func_name) {
341                        // Count arguments (rough estimate)
342                        let args_section = &line[eq_pos + paren_pos + 1..];
343                        let arg_count = Self::estimate_arg_count(args_section);
344
345                        return Some(CallSite {
346                            callee: func_name,
347                            resolved_targets: Vec::new(),
348                            location: format!("line {}", line_idx),
349                            arg_count,
350                        });
351                    }
352                }
353            }
354        }
355
356        None
357    }
358
359    /// Extract clean function name from MIR representation
360    fn extract_function_name(mir_repr: &str) -> String {
361        // MIR function names can be complex, e.g.:
362        // "std::process::Command::new"
363        // "<std::process::Command as std::ops::Drop>::drop"
364        // "my_crate::module::function"
365
366        let cleaned = mir_repr
367            .trim()
368            .replace("const ", "")
369            .replace("move ", "")
370            .replace("copy ", "");
371
372        // Extract the last meaningful part
373        if let Some(last_colon) = cleaned.rfind("::") {
374            cleaned[last_colon + 2..].trim().to_string()
375        } else {
376            cleaned.trim().to_string()
377        }
378    }
379
380    /// Check if this is a built-in operation (not a real function call)
381    fn is_builtin_operation(name: &str) -> bool {
382        matches!(
383            name,
384            "assert_eq!" | "assert!" | "println!" | "dbg!" | "format!"
385        ) || name.starts_with("_")
386            || name.is_empty()
387    }
388
389    /// Extract closure function name from MIR closure call pattern
390    /// Pattern: "<{closure@path/to/file.rs:line:col: line:col} as Fn<()>>::call(..."
391    /// Returns: "parent_function::{closure#N}"
392    fn extract_closure_call(line: &str) -> Option<String> {
393        // Look for closure call pattern
394        if !line.contains("{closure@") || !line.contains("as Fn") {
395            return None;
396        }
397
398        // Extract the closure location from "{closure@path:line:col: line:col}"
399        let start = line.find("{closure@")?;
400        let end = line[start..].find("}")?;
401        let closure_loc = &line[start..start + end + 1];
402
403        // The closure location looks like: {closure@examples/interprocedural/src/lib.rs:278:19: 278:21}
404        // We need to find the corresponding closure function name by matching the file:line
405        // The closure function is named like: parent_function::{closure#0}
406
407        // For now, return the raw closure identifier - it will be matched in the function list
408        // The function name for this closure is: test_closure_capture::{closure#0}
409
410        // Extract just the location part for matching
411        if let Some(at_pos) = closure_loc.find('@') {
412            let location = &closure_loc[at_pos + 1..closure_loc.len() - 1]; // Remove "{closure@" and "}"
413                                                                            // location is like "examples/interprocedural/src/lib.rs:278:19: 278:21"
414                                                                            // Take the file and first line number
415            let parts: Vec<&str> = location.split(':').collect();
416            if parts.len() >= 2 {
417                let file = parts[0];
418                let line_num = parts[1];
419                // Create a unique identifier for matching
420                return Some(format!("{{closure@{}:{}}}", file, line_num));
421            }
422        }
423
424        None
425    }
426
427    /// Estimate argument count from MIR call syntax
428    fn estimate_arg_count(args_section: &str) -> usize {
429        // Count commas outside of nested structures
430        // This is a rough heuristic
431        args_section.matches(',').count() + 1
432    }
433
434    /// Compute bottom-up analysis order (callees before callers)
435    fn compute_analysis_order(nodes: &HashMap<String, CallGraphNode>) -> Result<Vec<String>> {
436        // Use Kahn's algorithm for topological sort
437        let mut in_degree: HashMap<String, usize> = HashMap::new();
438        let mut order = Vec::new();
439
440        // Calculate in-degrees (number of internal callees)
441        for (name, node) in nodes {
442            let internal_callees = node
443                .callees
444                .iter()
445                .filter(|c| nodes.contains_key(&c.callee))
446                .count();
447            in_degree.insert(name.clone(), internal_callees);
448            // println!("[DEBUG] In-degree for {}: {}", name, internal_callees);
449        }
450
451        // Start with leaf functions (no internal callees)
452        let mut queue: VecDeque<String> = nodes
453            .iter()
454            .filter(|(name, _)| in_degree.get(*name).copied().unwrap_or(0) == 0)
455            .map(|(name, _)| name.clone())
456            .collect();
457
458        // println!("[DEBUG] Initial queue size: {}", queue.len());
459
460        // Process nodes in bottom-up order
461        while let Some(current) = queue.pop_front() {
462            order.push(current.clone());
463
464            // For each function that calls this one
465            if let Some(node) = nodes.get(&current) {
466                // println!("[DEBUG] Processed {}, notifying callers: {:?}", current, node.callers);
467                for caller in &node.callers {
468                    if let Some(degree) = in_degree.get_mut(caller) {
469                        if *degree > 0 {
470                            *degree -= 1;
471                            if *degree == 0 {
472                                queue.push_back(caller.clone());
473                            }
474                        }
475                    }
476                }
477            }
478        }
479
480        // Check for cycles (recursion)
481        if order.len() < nodes.len() {
482            // There are cycles; include remaining nodes anyway
483            // (Phase 3.4 will handle recursion with depth limits)
484            for (name, _) in nodes {
485                if !order.contains(name) {
486                    order.push(name.clone());
487                }
488            }
489        }
490
491        Ok(order)
492    }
493
    /// Get the analysis order (bottom-up: callees before callers).
    ///
    /// Returns a borrowed view of `analysis_order` as computed when the graph
    /// was built.
    pub fn get_analysis_order(&self) -> &[String] {
        &self.analysis_order
    }
498
    /// Get a node by function name.
    ///
    /// The lookup is an exact match on the key used in `nodes`; returns `None`
    /// when no node is stored under `function_name`.
    pub fn get_node(&self, function_name: &str) -> Option<&CallGraphNode> {
        self.nodes.get(function_name)
    }
503
    /// Get a mutable node by function name.
    ///
    /// Returns `None` when no node is stored under `function_name`. Changes
    /// made through the returned reference do not update `analysis_order`.
    pub fn get_node_mut(&mut self, function_name: &str) -> Option<&mut CallGraphNode> {
        self.nodes.get_mut(function_name)
    }
508}
509
510impl FunctionSummary {
511    /// Create a new empty function summary
512    pub fn new(function_name: String) -> Self {
513        FunctionSummary {
514            function_name,
515            source_parameters: HashSet::new(),
516            sink_parameters: HashSet::new(),
517            propagation_rules: Vec::new(),
518            return_taint: ReturnTaint::Clean,
519            has_internal_vulnerability: false,
520        }
521    }
522
523    /// Generate a summary for a function using intra-procedural analysis
524    pub fn from_mir_function(
525        function: &MirFunction,
526        callee_summaries: &HashMap<String, FunctionSummary>,
527        closure_registry: Option<&ClosureRegistry>,
528    ) -> Result<Self> {
529        let mut summary = FunctionSummary::new(function.name.clone());
530
531        // Phase 3.5.1: Use CFG-based path-sensitive analysis for branching functions
532        // Phase 3.5.2: Use closure context if available
533        let cfg = ControlFlowGraph::from_mir_function(function);
534
535        // Skip path-sensitive analysis for very large functions to prevent memory spikes
536        const MAX_BLOCKS_FOR_PATH_ANALYSIS: usize = 500;
537        let use_path_sensitive = cfg.block_count() <= MAX_BLOCKS_FOR_PATH_ANALYSIS;
538
539        if use_path_sensitive {
540            use crate::dataflow::path_sensitive::PathSensitiveTaintAnalysis;
541            use crate::dataflow::DataflowSummary;
542
543            // Convert FunctionSummary to DataflowSummary for path-sensitive analysis
544            let dataflow_summaries: HashMap<String, DataflowSummary> = callee_summaries
545                .iter()
546                .map(|(k, v)| (k.clone(), v.to_dataflow_summary()))
547                .collect();
548
549            let mut path_analysis = PathSensitiveTaintAnalysis::new(cfg);
550
551            // Check if this is a closure function
552            let closure_info = closure_registry.and_then(|r| r.get_closure(&function.name));
553
554            if let Some(info) = closure_info {
555                // This is a closure - analyze with captured variable context
556                // Run 1: Analyze with actual capture states (from registry)
557                let result =
558                    path_analysis.analyze_closure(function, info, Some(&dataflow_summaries));
559
560                if result.has_any_vulnerable_path {
561                    if info.has_tainted_captures() {
562                        summary
563                            .propagation_rules
564                            .push(TaintPropagation::ParamToSink {
565                                param: 0,
566                                sink_type: "command_execution".to_string(),
567                            });
568                    } else {
569                        // No tainted captures, but found a vulnerability -> must be internal
570                        summary.has_internal_vulnerability = true;
571                    }
572                }
573
574                // Run 2: Analyze assuming captures are tainted (to detect propagation)
575                // This is crucial for async functions where captures are initially clean but become tainted at runtime
576                use crate::dataflow::path_sensitive::TaintState;
577                let mut initial_taint = HashMap::new();
578
579                for capture in &info.captured_vars {
580                    let env_var = format!("((*_1).{})", capture.field_index);
581                    initial_taint.insert(
582                        env_var.clone(),
583                        TaintState::Tainted {
584                            source_type: "captured_variable".to_string(),
585                            source_location: format!("capture_{}", capture.field_index),
586                        },
587                    );
588
589                    // For async/coroutines (Pin<&mut T>), the path is deeper: ((*((*_1).0)).N)
590                    // _1 is Pin<&mut Coroutine>, _1.0 is &mut Coroutine, *(_1.0) is Coroutine
591                    let async_env_var = format!("((*((*_1).0)).{})", capture.field_index);
592                    initial_taint.insert(
593                        async_env_var,
594                        TaintState::Tainted {
595                            source_type: "captured_variable".to_string(),
596                            source_location: format!("capture_{}", capture.field_index),
597                        },
598                    );
599                }
600
601                if !initial_taint.is_empty() {
602                    let result_propagated = path_analysis.analyze_with_initial_taint(
603                        function,
604                        initial_taint,
605                        Some(&dataflow_summaries),
606                    );
607                    if result_propagated.has_any_vulnerable_path {
608                        summary
609                            .propagation_rules
610                            .push(TaintPropagation::ParamToSink {
611                                param: 0,
612                                sink_type: "command_execution".to_string(),
613                            });
614                    }
615                }
616            } else {
617                // Not a closure - analyze parameters
618
619                // Run 1: Check for internal sources (no initial taint)
620                let result_internal = path_analysis.analyze(function, Some(&dataflow_summaries));
621                if result_internal.has_any_vulnerable_path {
622                    summary.has_internal_vulnerability = true;
623                }
624
625                // Check if return value is tainted
626                if result_internal
627                    .path_results
628                    .iter()
629                    .any(|p| p.return_tainted)
630                {
631                    summary.return_taint = ReturnTaint::FromSource {
632                        source_type: "propagated".to_string(),
633                    };
634                }
635
636                // Run 2: Check all parameters together (memory optimization)
637                // Instead of running N separate analyses for N parameters,
638                // we run one analysis with all parameters tainted and track which ones reach sinks/returns
639                use crate::dataflow::path_sensitive::TaintState;
640
641                // Determine argument count from signature
642                let arg_count = if let Some(start) = function.signature.find('(') {
643                    if let Some(end) = function.signature.rfind(')') {
644                        let args = &function.signature[start + 1..end];
645                        if args.trim().is_empty() {
646                            0
647                        } else {
648                            args.split(',').count()
649                        }
650                    } else {
651                        0
652                    }
653                } else {
654                    0
655                };
656
657                // Limit analysis to actual arguments (max 10)
658                let max_arg = std::cmp::min(arg_count, 10);
659
660                if max_arg > 0 {
661                    // Taint all parameters at once with unique source locations
662                    let mut initial_taint = HashMap::new();
663                    for i in 1..=max_arg {
664                        let param_name = format!("_{}", i);
665                        initial_taint.insert(
666                            param_name,
667                            TaintState::Tainted {
668                                source_type: "parameter".to_string(),
669                                source_location: format!("param_{}", i),
670                            },
671                        );
672                    }
673
674                    let result = path_analysis.analyze_with_initial_taint(
675                        function,
676                        initial_taint,
677                        Some(&dataflow_summaries),
678                    );
679
680                    // Check which parameters reached sinks
681                    if result.has_any_vulnerable_path {
682                        // For now, mark all parameters as potentially reaching sinks
683                        // A more precise analysis would track which specific parameter reached which sink
684                        for i in 1..=max_arg {
685                            summary
686                                .propagation_rules
687                                .push(TaintPropagation::ParamToSink {
688                                    param: i - 1,
689                                    sink_type: "command_execution".to_string(),
690                                });
691                        }
692                    }
693
694                    // Check if return value is tainted (any parameter flows to return)
695                    if result.path_results.iter().any(|p| p.return_tainted) {
696                        // Mark all parameters as potentially flowing to return
697                        for i in 1..=max_arg {
698                            summary
699                                .propagation_rules
700                                .push(TaintPropagation::ParamToReturn(i - 1));
701                        }
702                    }
703
704                    // Check for sanitization
705                    if result
706                        .path_results
707                        .iter()
708                        .any(|p| !p.sanitizer_calls.is_empty())
709                    {
710                        for i in 1..=max_arg {
711                            summary
712                                .propagation_rules
713                                .push(TaintPropagation::ParamSanitized(i - 1));
714                        }
715                    }
716                }
717            }
718
719            if !summary.propagation_rules.is_empty() || summary.has_internal_vulnerability {
720                return Ok(summary);
721            }
722        } // end use_path_sensitive
723
724        // Use Phase 2's taint analysis to understand intra-procedural flows
725        // For now, we'll do a simple analysis based on MIR patterns
726
727        // Step 1: Identify if this function contains sources
728        let has_source = Self::contains_source(function);
729        if has_source {
730            summary.return_taint = ReturnTaint::FromSource {
731                source_type: "environment".to_string(),
732            };
733        }
734
735        // Step 2: Identify if this function contains sinks and determine sink type
736        let has_command_sink = Self::contains_command_sink(function);
737        let has_filesystem_sink = Self::contains_filesystem_sink(function);
738        let has_http_sink = Self::contains_http_sink(function);
739        let has_yaml_sink = Self::contains_yaml_sink(function);
740
741        // Step 3: Analyze parameter flows
742        // Check if function propagates parameters to return value
743        let propagates_param_to_return = Self::propagates_param_to_return(function);
744        if propagates_param_to_return && !has_source {
745            // Function takes parameter and returns it (or derivative)
746            // This enables N-level taint propagation
747            summary.return_taint = ReturnTaint::FromParameter(0);
748            summary
749                .propagation_rules
750                .push(TaintPropagation::ParamToReturn(0));
751        }
752
753        // Step 4: Check for sanitization patterns
754        let has_sanitization = Self::contains_sanitization(function);
755
756        // Build propagation rules based on patterns
757        if has_command_sink {
758            // If function has a command sink, parameters likely flow to it
759            summary
760                .propagation_rules
761                .push(TaintPropagation::ParamToSink {
762                    param: 0,
763                    sink_type: "command_execution".to_string(),
764                });
765        }
766
767        if has_filesystem_sink {
768            // If function has a filesystem sink, parameters likely flow to it
769            summary
770                .propagation_rules
771                .push(TaintPropagation::ParamToSink {
772                    param: 0,
773                    sink_type: "filesystem".to_string(),
774                });
775        }
776
777        if has_http_sink {
778            // If function has an HTTP sink, parameters likely flow to it (SSRF)
779            summary
780                .propagation_rules
781                .push(TaintPropagation::ParamToSink {
782                    param: 0,
783                    sink_type: "http".to_string(),
784                });
785        }
786
787        if has_yaml_sink {
788            // If function has a YAML deserialization sink
789            summary
790                .propagation_rules
791                .push(TaintPropagation::ParamToSink {
792                    param: 0,
793                    sink_type: "yaml".to_string(),
794                });
795        }
796
797        if has_sanitization {
798            // Function performs sanitization
799            summary
800                .propagation_rules
801                .push(TaintPropagation::ParamSanitized(0));
802        }
803
804        // Analyze calls to other functions
805        for line in &function.body {
806            // Check if this line calls a function we have a summary for
807            if let Some((callee_name, _)) = Self::extract_call_from_line(line) {
808                if let Some(callee_summary) = callee_summaries.get(&callee_name) {
809                    // Propagate taint rules from callee
810                    summary.merge_callee_summary(callee_summary);
811                }
812            }
813        }
814
815        Ok(summary)
816    }
817
818    /// Check if function propagates parameter to return value
819    fn propagates_param_to_return(function: &MirFunction) -> bool {
820        // First, check if function even takes parameters
821        // Check signature for parameter list - look for pattern like "(_1:" or "(mut _1:"
822        let sig_lower = function.signature.to_lowercase();
823        let has_params = sig_lower.contains("(_1:")
824            || sig_lower.contains("(mut _1:")
825            || sig_lower.contains("( _1:");
826
827        if !has_params {
828            return false; // No parameters, can't propagate
829        }
830
831        // Exclude functions that only use constants for _1
832        let assigns_constant_to_param = function
833            .body
834            .iter()
835            .any(|line| line.trim().starts_with("_1 = const"));
836
837        if assigns_constant_to_param {
838            return false; // Assigns constant to what would be param slot
839        }
840
841        // Heuristics for parameter propagation:
842        // Look for operations on _1 (first parameter after self if present)
843        let has_param_usage = function.body.iter().any(|line| {
844            // Direct parameter operations
845            line.contains("(*_1)")        // Deref of first param
846                || line.contains("Deref::deref(_1")  // Explicit deref  
847                || line.contains("Deref::deref(move _1")
848                // Taking references to parameters (assignment target contains &_1)
849                || (line.contains(" = &_1;") || line.contains(" = &mut _1;"))
850                // Format operations with parameter
851                || (line.contains("format!") || line.contains("format_args!"))
852                // String operations on parameters  
853                || line.contains("to_string(move _1")
854                || line.contains("String::from(_1")
855                // Move or copy of parameter (common in closures/async)
856                || line.contains("move _1")
857                || line.contains("copy _1")
858        });
859
860        // Check if function returns a value (not unit type)
861        let returns_value =
862            function.signature.contains("->") && !function.signature.contains("-> ()");
863
864        has_param_usage && returns_value
865    }
866
867    /// Check if function contains a taint source
868    fn contains_source(function: &MirFunction) -> bool {
869        function.body.iter().any(|line| {
870            line.contains("std::env::args")
871                || line.contains("std::env::var")
872                || line.contains("std::fs::read")
873                || line.contains("env::args")
874                || line.contains("env::var")
875                || line.contains(" = args() -> ")  // MIR format: args()
876                || line.contains(" = var")          // MIR format: var() or var::<T>()
877                || line.contains(" = read") // MIR format: read() or fs::read()
878        })
879    }
880
881    /// Check if function contains a taint sink
882    #[allow(dead_code)]
883    fn contains_sink(function: &MirFunction) -> bool {
884        Self::contains_command_sink(function)
885            || Self::contains_filesystem_sink(function)
886            || Self::contains_http_sink(function)
887            || Self::contains_yaml_sink(function)
888    }
889
890    /// Check if function contains a command execution sink
891    fn contains_command_sink(function: &MirFunction) -> bool {
892        function.body.iter().any(|line| {
893            // Only match DIRECT calls to sinks, not indirect via helper functions
894            // Look for Command::new or Command::spawn, not just "spawn"
895            (line.contains("Command::new") && line.contains("->"))
896                || line.contains("std::process::Command")
897                || (line.contains("Command::spawn") && line.contains("->"))
898                || (line.contains("Command::exec") && line.contains("->"))
899        })
900    }
901
902    /// Check if function contains a filesystem sink (for path traversal detection)
903    fn contains_filesystem_sink(function: &MirFunction) -> bool {
904        function.body.iter().any(|line| {
905            // File read operations
906            line.contains("fs::read_to_string") 
907                || line.contains("std::fs::read_to_string")
908                || line.contains("fs::read(")
909                || line.contains("std::fs::read(")
910                // File write operations
911                || line.contains("fs::write(")
912                || line.contains("std::fs::write(")
913                // File open operations
914                || line.contains("File::open(")
915                || line.contains("File::create(")
916                || line.contains("std::fs::File::open")
917                || line.contains("std::fs::File::create")
918                || line.contains("OpenOptions")
919                // File removal operations
920                || line.contains("fs::remove_file")
921                || line.contains("fs::remove_dir")
922                || line.contains("std::fs::remove_file")
923                || line.contains("std::fs::remove_dir")
924                // Copy/rename operations
925                || line.contains("fs::copy(")
926                || line.contains("fs::rename(")
927                || line.contains("std::fs::copy")
928                || line.contains("std::fs::rename")
929                // Directory operations
930                || line.contains("fs::create_dir")
931                || line.contains("std::fs::create_dir")
932        })
933    }
934
935    /// Check if function contains an HTTP client sink (for SSRF detection)
936    fn contains_http_sink(function: &MirFunction) -> bool {
937        function.body.iter().any(|line| {
938            // reqwest patterns
939            line.contains("reqwest::blocking::get")
940                || line.contains("reqwest::get")
941                || line.contains("blocking::get")
942                || line.contains("Client>::get")
943                || line.contains("Client>::post")
944                || line.contains("Client>::put")
945                || line.contains("Client>::delete")
946                || line.contains("Client>::patch")
947                || line.contains("Client>::head")
948                || line.contains("RequestBuilder>::send")
949                // ureq patterns
950                || line.contains("ureq::get")
951                || line.contains("ureq::post")
952                || line.contains("ureq::put")
953                || line.contains("ureq::delete")
954                || line.contains("Agent>::get")
955                || line.contains("Agent>::post")
956                || line.contains("Request>::call")
957                // hyper patterns
958                || line.contains("hyper::Client")
959                || line.contains("hyper::Request")
960                // Generic HTTP patterns
961                || line.contains("get::<&String>")
962                || line.contains("get::<&str>")
963                || line.contains("post::<&String>")
964                || line.contains("post::<&str>")
965        })
966    }
967
968    /// Check if function contains a YAML deserialization sink (for YAML injection detection)
969    fn contains_yaml_sink(function: &MirFunction) -> bool {
970        function.body.iter().any(|line| {
971            // serde_yaml patterns
972            line.contains("serde_yaml::from_str")
973                || line.contains("serde_yaml::from_slice")
974                || line.contains("serde_yaml::from_reader")
975                // MIR patterns for generic instantiation
976                || line.contains("from_str::<") && line.contains("serde_yaml")
977                || line.contains("from_slice::<") && line.contains("serde_yaml")
978                || line.contains("from_reader::<") && line.contains("serde_yaml")
979                // Generic yaml patterns - function names with yaml
980                || (line.contains("from_str") && function.name.to_lowercase().contains("yaml"))
981        })
982    }
983
984    /// Check if function performs sanitization
985    fn contains_sanitization(function: &MirFunction) -> bool {
986        function.body.iter().any(|line| {
987            line.contains("parse::<")
988                || line.contains("chars().all")
989                || line.contains("is_alphanumeric")
990        })
991    }
992
993    /// Check if function has validation guard that protects a sink
994    /// Returns true if there's an if-condition checking safety before calling a sink
995    #[allow(dead_code)]
996    fn has_validation_guard(function: &MirFunction) -> bool {
997        let has_sink = Self::contains_sink(function);
998        if !has_sink {
999            return false;
1000        }
1001
1002        // Look for validation function calls like is_safe_input, is_valid, validate, etc.
1003        let has_validation_call = function.body.iter().any(|line| {
1004            (line.contains("is_safe") || line.contains("is_valid") || line.contains("validate"))
1005                && line.contains("(")
1006                && line.contains(")")
1007        });
1008
1009        // Look for switchInt (if/match statements) that could be guards
1010        let has_conditional = function.body.iter().any(|line| line.contains("switchInt("));
1011
1012        has_validation_call && has_conditional
1013    }
1014
1015    /// Check if function calls a sanitization helper on tainted data before using it
1016    /// This handles patterns like: let safe = validate_input(&tainted); use(safe);
1017    #[allow(dead_code)]
1018    fn has_sanitization_helper_call(function: &MirFunction) -> bool {
1019        // Look for calls to functions with sanitization-related names
1020        let sanitization_patterns = ["validate", "sanitize", "clean", "escape", "filter"];
1021
1022        function.body.iter().any(|line| {
1023            sanitization_patterns
1024                .iter()
1025                .any(|pattern| line.to_lowercase().contains(pattern) && line.contains("("))
1026        })
1027    }
1028
1029    /// Extract function call from MIR line
1030    fn extract_call_from_line(line: &str) -> Option<(String, usize)> {
1031        let line = line.trim();
1032
1033        if line.contains("(") && line.contains(") -> [return:") {
1034            if let Some(eq_pos) = line.find('=') {
1035                if let Some(paren_pos) = line[eq_pos..].find('(') {
1036                    let func_part = &line[eq_pos + 1..eq_pos + paren_pos].trim();
1037                    let func_name = CallGraph::extract_function_name(func_part);
1038
1039                    if !func_name.is_empty() && !CallGraph::is_builtin_operation(&func_name) {
1040                        // Estimate arg count
1041                        let args_section = &line[eq_pos + paren_pos + 1..];
1042                        let arg_count = CallGraph::estimate_arg_count(args_section);
1043                        return Some((func_name, arg_count));
1044                    }
1045                }
1046            }
1047        }
1048
1049        None
1050    }
1051
1052    /// Merge rules from a callee's summary
1053    fn merge_callee_summary(&mut self, callee: &FunctionSummary) {
1054        // If callee has sources, this function might propagate them
1055        if !callee.source_parameters.is_empty() {
1056            // For now, mark that we call a function with sources
1057            // Phase 3.3 will track parameter mappings more precisely
1058        }
1059
1060        // DISABLED: Don't propagate sinks from callees
1061        // Inter-procedural flow detection handles this by exploring call chains
1062        // If we mark callers as having sinks, we get false positives
1063        /*
1064        // If callee has sinks, parameters to this function might reach them
1065        if !callee.sink_parameters.is_empty() {
1066            // Mark that we propagate to a sink
1067            for &param in &callee.sink_parameters {
1068                if param < 3 {  // Only track first few parameters for now
1069                    self.propagation_rules.push(TaintPropagation::ParamToSink {
1070                        param,
1071                        sink_type: "indirect_command_execution".to_string(),
1072                    });
1073                }
1074            }
1075        }
1076        */
1077
1078        // Handle return taint
1079        match &callee.return_taint {
1080            ReturnTaint::FromSource { .. } => {
1081                // If callee returns tainted data, this function might too
1082                if matches!(self.return_taint, ReturnTaint::Clean) {
1083                    self.return_taint = ReturnTaint::FromSource {
1084                        source_type: "propagated".to_string(),
1085                    };
1086                }
1087            }
1088            ReturnTaint::FromParameter(param) => {
1089                // Callee propagates parameter to return
1090                self.propagation_rules
1091                    .push(TaintPropagation::ParamToReturn(*param));
1092            }
1093            _ => {}
1094        }
1095    }
1096
1097    pub fn to_dataflow_summary(&self) -> DataflowSummary {
1098        let mut propagation = self.propagation_rules.clone();
1099        let mut returns_tainted = false;
1100
1101        match &self.return_taint {
1102            ReturnTaint::Clean => {}
1103            ReturnTaint::FromParameter(idx) => {
1104                propagation.push(TaintPropagation::ParamToReturn(*idx));
1105            }
1106            ReturnTaint::FromSource { .. } => {
1107                returns_tainted = true;
1108            }
1109            ReturnTaint::Merged(taints) => {
1110                for taint in taints {
1111                    match taint {
1112                        ReturnTaint::FromParameter(idx) => {
1113                            propagation.push(TaintPropagation::ParamToReturn(*idx));
1114                        }
1115                        ReturnTaint::FromSource { .. } => {
1116                            returns_tainted = true;
1117                        }
1118                        _ => {}
1119                    }
1120                }
1121            }
1122        }
1123
1124        DataflowSummary {
1125            name: self.function_name.clone(),
1126            propagation,
1127            returns_tainted,
1128        }
1129    }
1130}
1131
/// Main inter-procedural analysis engine.
///
/// Holds the call graph and closure registry built from a `MirPackage`,
/// the per-function summaries produced by `analyze`, and a lazily filled
/// cache of the taint flows found by `detect_inter_procedural_flows`.
pub struct InterProceduralAnalysis {
    /// Call graph built by `CallGraph::from_mir_package`; its
    /// `analysis_order` drives the order in which `analyze` visits functions.
    pub call_graph: CallGraph,

    /// Computed function summaries, keyed by function name.
    /// Empty until `analyze` has run.
    pub summaries: HashMap<String, FunctionSummary>,

    /// Closure registry for tracking closures and captures
    pub closure_registry: ClosureRegistry,

    /// Cached inter-procedural flows (computed once on first call).
    /// `None` until `detect_inter_procedural_flows` stores its first result;
    /// wrapped in a `RefCell` so the cache can be filled through `&self`.
    cached_flows: RefCell<Option<Vec<TaintPath>>>,

    /// Configuration for analysis limits
    config: IpaConfig,
}
1149
1150impl InterProceduralAnalysis {
1151    /// Create a new inter-procedural analysis with default configuration
1152    pub fn new(package: &MirPackage) -> Result<Self> {
1153        Self::with_config(package, IpaConfig::default())
1154    }
1155
1156    /// Create a new inter-procedural analysis with custom configuration
1157    pub fn with_config(package: &MirPackage, config: IpaConfig) -> Result<Self> {
1158        let call_graph = CallGraph::from_mir_package(package)?;
1159        let closure_registry = ClosureRegistryBuilder::build_from_package(package);
1160
1161        Ok(InterProceduralAnalysis {
1162            call_graph,
1163            summaries: HashMap::new(),
1164            closure_registry,
1165            cached_flows: RefCell::new(None),
1166            config,
1167        })
1168    }
1169
1170    /// Analyze all functions and generate summaries
1171    pub fn analyze(&mut self, package: &MirPackage) -> Result<()> {
1172        use crate::memory_profiler;
1173
1174        // Get function map for quick lookup
1175        let function_map: HashMap<String, &MirFunction> = package
1176            .functions
1177            .iter()
1178            .map(|f| (f.name.clone(), f))
1179            .collect();
1180
1181        let total_functions = self.call_graph.analysis_order.len();
1182        let checkpoint_interval = std::cmp::max(1, total_functions / 10); // Log every 10%
1183
1184        memory_profiler::checkpoint_with_context(
1185            "IPA analyze start",
1186            &format!("{} functions", total_functions),
1187        );
1188
1189        // Analyze functions in bottom-up order (callees before callers)
1190        for (idx, function_name) in self
1191            .call_graph
1192            .analysis_order
1193            .clone()
1194            .into_iter()
1195            .enumerate()
1196        {
1197            // Log progress every 10%
1198            if idx % checkpoint_interval == 0 {
1199                let pct = (idx * 100) / total_functions;
1200                memory_profiler::checkpoint_with_context(
1201                    "IPA progress",
1202                    &format!("{}% ({}/{})", pct, idx, total_functions),
1203                );
1204            }
1205
1206            if let Some(function) = function_map.get(&function_name) {
1207                // Memory spike detection: log functions that cause >100MB increase
1208                let before_mb = memory_profiler::current_memory_mb();
1209
1210                // Build a minimal callee summaries map - only include summaries for
1211                // functions that this function actually calls. This avoids O(N²) memory
1212                // usage from cloning all summaries for each function.
1213                let mut callee_summaries = HashMap::new();
1214
1215                if let Some(node) = self.call_graph.nodes.get(&function_name) {
1216                    for call_site in &node.callees {
1217                        // Map raw callee name to summary
1218                        // If resolved_targets is not empty, merge their summaries
1219                        if !call_site.resolved_targets.is_empty() {
1220                            let mut merged_summary: Option<FunctionSummary> = None;
1221
1222                            for target in &call_site.resolved_targets {
1223                                if let Some(target_summary) = self.summaries.get(target) {
1224                                    if let Some(current) = &mut merged_summary {
1225                                        current.merge_callee_summary(target_summary);
1226                                    } else {
1227                                        // Create a new summary with the raw callee name
1228                                        let mut new_summary = target_summary.clone();
1229                                        new_summary.function_name = call_site.callee.clone();
1230                                        merged_summary = Some(new_summary);
1231                                    }
1232                                }
1233                            }
1234
1235                            if let Some(summary) = merged_summary {
1236                                callee_summaries.insert(call_site.callee.clone(), summary);
1237                            }
1238                        } else {
1239                            // Try direct lookup (for unresolved or direct calls)
1240                            if let Some(summary) = self.summaries.get(&call_site.callee) {
1241                                callee_summaries.insert(call_site.callee.clone(), summary.clone());
1242                            }
1243                        }
1244                    }
1245                }
1246
1247                // Generate summary using summaries of callees and closure registry
1248                let summary = FunctionSummary::from_mir_function(
1249                    function,
1250                    &callee_summaries,
1251                    Some(&self.closure_registry),
1252                )?;
1253
1254                // Store summary in summaries HashMap (single source of truth)
1255                // Note: We do NOT store in node.summary to avoid memory duplication
1256                self.summaries.insert(function_name.clone(), summary);
1257
1258                // Memory spike detection: log functions that cause significant memory increase
1259                let after_mb = memory_profiler::current_memory_mb();
1260                let delta_mb = after_mb - before_mb;
1261                if delta_mb > 50.0 && memory_profiler::is_enabled() {
1262                    eprintln!(
1263                        "[MEMORY] SPIKE: {} caused +{:.0} MB (now {:.0} MB) - body={} lines",
1264                        function_name,
1265                        delta_mb,
1266                        after_mb,
1267                        function.body.len()
1268                    );
1269                }
1270            }
1271        }
1272
1273        Ok(())
1274    }
1275
1276    /// Get summary for a function
1277    pub fn get_summary(&self, function_name: &str) -> Option<&FunctionSummary> {
1278        self.summaries.get(function_name)
1279    }
1280
1281    /// Print summary statistics
1282    pub fn print_statistics(&self) {
1283        println!("Inter-Procedural Analysis Statistics:");
1284        println!("  Total functions: {}", self.summaries.len());
1285
1286        let functions_with_sources = self
1287            .summaries
1288            .values()
1289            .filter(|s| !matches!(s.return_taint, ReturnTaint::Clean))
1290            .count();
1291        println!("  Functions with sources: {}", functions_with_sources);
1292
1293        let functions_with_sinks = self
1294            .summaries
1295            .values()
1296            .filter(|s| {
1297                s.sink_parameters.len() > 0
1298                    || s.propagation_rules
1299                        .iter()
1300                        .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1301            })
1302            .count();
1303        println!("  Functions with sinks: {}", functions_with_sinks);
1304
1305        let functions_with_sanitization = self
1306            .summaries
1307            .values()
1308            .filter(|s| {
1309                s.propagation_rules
1310                    .iter()
1311                    .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
1312            })
1313            .count();
1314        println!(
1315            "  Functions with sanitization: {}",
1316            functions_with_sanitization
1317        );
1318    }
1319
1320    /// Detect inter-procedural taint flows (cached after first call)
1321    pub fn detect_inter_procedural_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1322        // Check if we have cached flows
1323        {
1324            let cached = self.cached_flows.borrow();
1325            if let Some(flows) = cached.as_ref() {
1326                return flows.clone();
1327            }
1328        }
1329
1330        // Compute flows
1331        let flows = self.compute_inter_procedural_flows(package);
1332
1333        // Cache the result
1334        *self.cached_flows.borrow_mut() = Some(flows.clone());
1335
1336        flows
1337    }
1338
1339    /// Internal: compute inter-procedural taint flows (called once)
1340    fn compute_inter_procedural_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1341        use crate::memory_profiler;
1342        memory_profiler::checkpoint("IPA: Computing inter-procedural flows");
1343
1344        let mut flows = Vec::new();
1345
1346        let num_summaries = self.summaries.len();
1347        let mut processed = 0;
1348
1349        // For each function with REAL sources, try to find paths to sinks
1350        for (source_func, source_summary) in &self.summaries {
1351            processed += 1;
1352
1353            // Use configurable limit from IpaConfig
1354            if flows.len() >= self.config.max_total_flows {
1355                eprintln!("Note: IPA flow limit reached ({} flows). Some inter-procedural flows may not be reported.", self.config.max_total_flows);
1356                break;
1357            }
1358
1359            // Progress logging every 10%
1360            if memory_profiler::is_enabled() && processed % (num_summaries / 10).max(1) == 0 {
1361                memory_profiler::checkpoint_with_context(
1362                    "IPA flow computation",
1363                    &format!(
1364                        "{}% ({}/{}), {} flows",
1365                        processed * 100 / num_summaries,
1366                        processed,
1367                        num_summaries,
1368                        flows.len()
1369                    ),
1370                );
1371            }
1372
1373            // Case 1: Function has internal vulnerability (source -> sink within function)
1374            if source_summary.has_internal_vulnerability {
1375                // If this is a closure, report the flow for the parent function
1376                let reported_source =
1377                    if let Some(closure) = self.closure_registry.get_closure(source_func) {
1378                        closure.parent_function.clone()
1379                    } else {
1380                        source_func.clone()
1381                    };
1382
1383                flows.push(TaintPath {
1384                    source_function: reported_source,
1385                    sink_function: source_func.clone(),
1386                    sink_type: "internal_sink".to_string(),
1387                    call_chain: vec![source_func.clone()],
1388                    source_type: "environment".to_string(),
1389                    sanitized: false,
1390                });
1391            }
1392
1393            // Case 2: Function returns tainted data (source -> return)
1394            // Only start from functions that have actual sources (not just propagation)
1395            if matches!(source_summary.return_taint, ReturnTaint::FromSource { .. }) {
1396                // This function has a real taint source
1397                // Find all functions that call it
1398                if self.call_graph.nodes.get(source_func).is_some() {
1399                    // Explore paths from this source
1400                    let all_flows = self.find_paths_from_source(
1401                        source_func,
1402                        &source_summary.return_taint,
1403                        vec![source_func.clone()],
1404                        &mut HashSet::new(),
1405                    );
1406
1407                    // Filter out intra-procedural flows (same source and sink)
1408                    // Those should be caught by Phase 2's analysis
1409                    // UNLESS it's a complex flow that Phase 2 missed but Phase 3 caught via internal vulnerability check
1410                    for flow in all_flows {
1411                        if flow.source_function != flow.sink_function || flow.call_chain.len() > 1 {
1412                            flows.push(flow);
1413                        }
1414                    }
1415                }
1416            }
1417        }
1418
1419        // Phase 3.4: Filter false positives by checking for validation patterns
1420        flows = self.filter_false_positives(flows);
1421
1422        // Phase 3.5.2: Add flows from closures with tainted captures
1423        let closure_flows = self.detect_closure_taint_flows(package);
1424        flows.extend(closure_flows);
1425
1426        memory_profiler::checkpoint_with_context(
1427            "IPA: Computed flows",
1428            &format!("{} flows", flows.len()),
1429        );
1430
1431        flows
1432    }
1433
1434    /// Phase 3.5.2: Detect taint flows through closures
1435    /// Closures capture variables from parent functions - if captured var is tainted
1436    /// and closure has a sink, that's an interprocedural flow
1437    fn detect_closure_taint_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1438        let mut flows: Vec<TaintPath> = Vec::new();
1439
1440        // Build function map for looking up MIR bodies
1441        let function_map: HashMap<String, &MirFunction> = package
1442            .functions
1443            .iter()
1444            .map(|f| (f.name.clone(), f))
1445            .collect();
1446
1447        // Source patterns that indicate tainted input
1448        let source_patterns = [
1449            "env::args",
1450            "std::env::args",
1451            "::args()",
1452            "env::var",
1453            "std::env::var",
1454            "stdin",
1455            "read_line",
1456            "read_to_string",
1457            "HttpRequest",
1458            "request",
1459            "body()",
1460            "serde_json::from",
1461            "serde::Deserialize",
1462        ];
1463
1464        // Debug: print all closures being analyzed
1465        let all_closures = self.closure_registry.get_all_closures();
1466
1467        for closure_info in all_closures {
1468            // Skip if already found flow for this closure
1469            if flows.iter().any(|f| f.sink_function == closure_info.name) {
1470                continue;
1471            }
1472
1473            // Check if parent function has a taint source via return value
1474            let parent_has_source = self
1475                .summaries
1476                .get(&closure_info.parent_function)
1477                .map(|s| matches!(s.return_taint, ReturnTaint::FromSource { .. }))
1478                .unwrap_or(false);
1479
1480            // Check if parent function CALLS a source (not just returns it)
1481            // This is the key for closures - parent may use source locally without returning
1482            let parent_calls_source = self
1483                .call_graph
1484                .nodes
1485                .get(&closure_info.parent_function)
1486                .map(|node| {
1487                    let result = node.callees.iter().any(|callee| {
1488                        source_patterns
1489                            .iter()
1490                            .any(|pat| callee.callee.contains(pat))
1491                    });
1492                    if closure_info.name.contains("test_closure") {
1493                        // eprintln!("[DEBUG]   parent_calls_source: {}", result);
1494                        // eprintln!("[DEBUG]   parent callees: {:?}", node.callees.iter().map(|c| &c.callee).collect::<Vec<_>>());
1495                    }
1496                    result
1497                })
1498                .unwrap_or_else(|| {
1499                    if closure_info.name.contains("test_closure") {
1500                        // eprintln!("[DEBUG]   parent '{}' NOT found in call_graph", closure_info.parent_function);
1501                    }
1502                    false
1503                });
1504
1505            // Also check if parent function body contains source patterns
1506            let parent_has_source_in_body = self
1507                .summaries
1508                .get(&closure_info.parent_function)
1509                .map(|summary| {
1510                    // Check if the parent function's summary contains any source
1511                    matches!(summary.return_taint, ReturnTaint::FromSource { .. })
1512                })
1513                .unwrap_or(false);
1514
1515            // Method 1: Use closure registry's taint detection
1516            if closure_info.has_tainted_captures() {
1517                if let Some(summary) = self.summaries.get(&closure_info.name) {
1518                    let sink_type = summary.propagation_rules.iter().find_map(|r| {
1519                        if let TaintPropagation::ParamToSink { sink_type, .. } = r {
1520                            Some(sink_type.clone())
1521                        } else {
1522                            None
1523                        }
1524                    });
1525
1526                    if let Some(sink_type) = sink_type {
1527                        for capture in &closure_info.captured_vars {
1528                            if let crate::dataflow::closure::TaintState::Tainted {
1529                                source_type,
1530                                ..
1531                            } = &capture.taint_state
1532                            {
1533                                flows.push(TaintPath {
1534                                    source_function: closure_info.parent_function.clone(),
1535                                    sink_function: closure_info.name.clone(),
1536                                    call_chain: vec![
1537                                        closure_info.parent_function.clone(),
1538                                        closure_info.name.clone(),
1539                                    ],
1540                                    source_type: source_type.clone(),
1541                                    sink_type: sink_type.clone(),
1542                                    sanitized: false,
1543                                });
1544                                break;
1545                            }
1546                        }
1547                        continue;
1548                    }
1549                }
1550            }
1551
1552            // Method 2: Direct body pattern matching (fallback)
1553            // Check if parent has source (via return OR via calling a source) and closure has command sink
1554            if parent_has_source || parent_has_source_in_body || parent_calls_source {
1555                // Check closure body for command execution
1556                if let Some(node) = self.call_graph.nodes.get(&closure_info.name) {
1557                    let has_command_callee = node.callees.iter().any(|c| {
1558                        c.callee.contains("Command")
1559                            || c.callee.contains("spawn")
1560                            || c.callee.contains("output")
1561                            || c.callee.contains("process")
1562                    });
1563
1564                    if has_command_callee && !closure_info.captured_vars.is_empty() {
1565                        flows.push(TaintPath {
1566                            source_function: closure_info.parent_function.clone(),
1567                            sink_function: closure_info.name.clone(),
1568                            call_chain: vec![
1569                                closure_info.parent_function.clone(),
1570                                closure_info.name.clone(),
1571                            ],
1572                            source_type: "environment".to_string(),
1573                            sink_type: "command_execution".to_string(),
1574                            sanitized: false,
1575                        });
1576                        continue;
1577                    }
1578                }
1579            }
1580
1581            // Method 3: Check for closures with captured variables where parent calls source
1582            // This catches cases even without full taint tracking
1583            if !closure_info.captured_vars.is_empty() && parent_calls_source {
1584                // Check if the closure has command-related callees
1585                if let Some(closure_node) = self.call_graph.nodes.get(&closure_info.name) {
1586                    let closure_has_sink = closure_node.callees.iter().any(|c| {
1587                        let name_lower = c.callee.to_lowercase();
1588                        name_lower.contains("command")
1589                            || name_lower.contains("spawn")
1590                            || name_lower.contains("shell")
1591                            || name_lower.contains("exec")
1592                    });
1593
1594                    if closure_has_sink {
1595                        flows.push(TaintPath {
1596                            source_function: closure_info.parent_function.clone(),
1597                            sink_function: closure_info.name.clone(),
1598                            call_chain: vec![
1599                                closure_info.parent_function.clone(),
1600                                closure_info.name.clone(),
1601                            ],
1602                            source_type: "environment".to_string(),
1603                            sink_type: "command_execution".to_string(),
1604                            sanitized: false,
1605                        });
1606                    }
1607                }
1608            }
1609
1610            // Method 4: Analyze closure body directly for captured variable → command flow
1611            // This works even when parent function is inlined/optimized away
1612            // Check for patterns like:
1613            //   debug tainted => (*((*_1).0: ...  (captured variable with suggestive name)
1614            //   _X = Command::arg(... copy _Y...) where _Y is from captured data
1615            if let Some(closure_function) = function_map.get(&closure_info.name) {
1616                let body_str = closure_function.body.join("\n");
1617
1618                // Check if closure has command sink in its body
1619                let has_command_sink = body_str.contains("Command::")
1620                    || body_str.contains("::spawn(")
1621                    || body_str.contains("::output(");
1622
1623                if has_command_sink {
1624                    // Check for captured variables with suggestive names indicating user input
1625                    // Pattern: debug <name> => (*((*_1)... indicates captured variable
1626                    let has_tainted_capture = body_str.contains("debug tainted") ||
1627                        body_str.contains("debug user") ||
1628                        body_str.contains("debug input") ||
1629                        body_str.contains("debug cmd") ||
1630                        body_str.contains("debug command") ||
1631                        body_str.contains("debug arg") ||
1632                        // Also check if _1 (the closure capture) is used in Command::arg
1633                        (body_str.contains("(*_1)") && body_str.contains("Command::arg"));
1634
1635                    if has_tainted_capture {
1636                        flows.push(TaintPath {
1637                            source_function: closure_info.parent_function.clone(),
1638                            sink_function: closure_info.name.clone(),
1639                            call_chain: vec![
1640                                closure_info.parent_function.clone(),
1641                                closure_info.name.clone(),
1642                            ],
1643                            source_type: "captured_variable".to_string(),
1644                            sink_type: "command_execution".to_string(),
1645                            sanitized: false,
1646                        });
1647                    }
1648                }
1649            }
1650        }
1651
1652        flows
1653    }
1654
1655    /// Phase 3.4: Filter false positives from detected flows
1656    /// Identifies patterns that indicate sanitization even when not in the direct call chain
1657    fn filter_false_positives(&self, flows: Vec<TaintPath>) -> Vec<TaintPath> {
1658        flows
1659            .into_iter()
1660            .filter(|flow| {
1661                // Check each function in the call chain
1662                for func_name in &flow.call_chain {
1663                    if let Some(node) = self.call_graph.nodes.get(func_name) {
1664                        // Pattern 1: Function has BOTH source and (direct or indirect) sink
1665                        let is_source_func = func_name == &flow.source_function;
1666                        let returns_source = self
1667                            .summaries
1668                            .get(func_name)
1669                            .map(|summary| {
1670                                matches!(summary.return_taint, ReturnTaint::FromSource { .. })
1671                            })
1672                            .unwrap_or(false);
1673
1674                        let has_source = is_source_func || returns_source;
1675
1676                        // Check if this function has a direct sink
1677                        let has_direct_sink = self
1678                            .summaries
1679                            .get(func_name)
1680                            .map(|summary| {
1681                                summary
1682                                    .propagation_rules
1683                                    .iter()
1684                                    .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1685                            })
1686                            .unwrap_or(false);
1687
1688                        // Check if this function calls something that has a sink
1689                        let calls_sink_function = node.callees.iter().any(|callee_site| {
1690                            if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1691                                callee_summary
1692                                    .propagation_rules
1693                                    .iter()
1694                                    .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1695                            } else {
1696                                false
1697                            }
1698                        });
1699
1700                        let has_sink = has_direct_sink || calls_sink_function;
1701
1702                        if has_source && has_sink {
1703                            // This function gets tainted data and (directly or indirectly) executes it
1704                            // Check if it has validation guards protecting the sink
1705
1706                            // PHASE 3.4 CONSERVATIVE FILTER:
1707                            // Only filter if we detect BOTH:
1708                            // 1. A validator call (is_safe, validate, etc.)
1709                            // 2. Evidence that validator protects the sink (guard pattern)
1710                            //
1711                            // This avoids filtering cases like test_partial_sanitization where
1712                            // one branch calls the validator but another branch doesn't.
1713
1714                            let calls_validator = node.callees.iter().any(|callee| {
1715                                let callee_lower = callee.callee.to_lowercase();
1716                                callee_lower.contains("is_safe")
1717                                    || callee_lower.contains("is_valid")
1718                            });
1719
1720                            // More restrictive: only filter if validator is in guard pattern (is_safe_, is_valid_)
1721                            // These are typically used in if-conditions that protect the sink
1722                            // Avoid filtering validate_/sanitize_ which might be on only one branch
1723
1724                            if calls_validator {
1725                                // Function uses a validation guard - likely a false positive
1726                                return false; // Filter out this flow
1727                            }
1728                        }
1729                    }
1730                }
1731
1732                // Flow passed all filters - keep it
1733                true
1734            })
1735            .collect()
1736    }
1737
1738    /// Find taint paths starting from a source function.
1739    ///
1740    /// # Memory Safety Limits
1741    ///
1742    /// This function uses configurable limits from `IpaConfig` to prevent memory exhaustion.
1743    /// These limits may cause false negatives in extreme cases. See README.md for details.
1744    ///
1745    /// To increase limits, pass a custom `IpaConfig` via `InterProceduralAnalysis::with_config()`,
1746    /// or use a `cargo-cola.yaml` configuration file.
1747    ///
1748    /// Without these limits, analysis of dense call graphs (e.g., InfluxDB with 11K functions)
1749    /// would require 60GB+ RAM due to exponential path exploration.
1750    fn find_paths_from_source(
1751        &self,
1752        current_func: &str,
1753        taint: &ReturnTaint,
1754        path: Vec<String>,
1755        visited: &mut HashSet<String>,
1756    ) -> Vec<TaintPath> {
1757        let mut flows = Vec::new();
1758
1759        // Use configurable limit from IpaConfig
1760        if path.len() > self.config.max_path_depth {
1761            return flows;
1762        }
1763
1764        // Use configurable limit from IpaConfig
1765        if visited.len() >= self.config.max_visited {
1766            return flows;
1767        }
1768
1769        // Avoid infinite recursion - once visited, never revisit
1770        if visited.contains(current_func) {
1771            return flows;
1772        }
1773        visited.insert(current_func.to_string());
1774
1775        // Check if path is sanitized (any function in path has ParamSanitized rule)
1776        let is_sanitized = self.path_is_sanitized(&path);
1777
1778        // NEW: Also check if current function calls a sanitization helper
1779        // This catches patterns like: let safe = validate(&tainted); use(safe);
1780        let calls_sanitizer = if let Some(node) = self.call_graph.nodes.get(current_func) {
1781            node.callees.iter().any(|callee_site| {
1782                if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1783                    // Check if callee has sanitization
1784                    callee_summary
1785                        .propagation_rules
1786                        .iter()
1787                        .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
1788                } else {
1789                    false
1790                }
1791            })
1792        } else {
1793            false
1794        };
1795
1796        let effective_sanitized = is_sanitized || calls_sanitizer;
1797
1798        // Get the current function's node and summary
1799        if let Some(node) = self.call_graph.nodes.get(current_func) {
1800            // Check if current function has a sink
1801            if let Some(summary) = self.summaries.get(current_func) {
1802                // Does this function have a sink that the taint can reach?
1803                for rule in &summary.propagation_rules {
1804                    if let TaintPropagation::ParamToSink { sink_type, .. } = rule {
1805                        // Taint reaches a sink!
1806                        flows.push(TaintPath {
1807                            source_function: path[0].clone(),
1808                            sink_function: current_func.to_string(),
1809                            call_chain: path.clone(),
1810                            source_type: Self::extract_source_type(taint),
1811                            sink_type: sink_type.clone(),
1812                            sanitized: effective_sanitized,
1813                        });
1814                    } else if let TaintPropagation::ParamSanitized(_) = rule {
1815                        // Taint is sanitized - we already track this above
1816                        continue;
1817                    }
1818                }
1819            }
1820
1821            // NEW: If current function doesn't have a direct sink, check what it calls
1822            // This enables N-level detection: source() -> caller() -> sink_function()
1823            // Only check direct callees, not recursive (avoid explosion)
1824            if !flows.iter().any(|f| f.sink_function == current_func) {
1825                // Current function doesn't have a sink, check its callees
1826                for callee_site in &node.callees {
1827                    if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1828                        // Check if this callee sanitizes
1829                        let callee_sanitizes = callee_summary
1830                            .propagation_rules
1831                            .iter()
1832                            .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)));
1833
1834                        // Does this callee have a sink?
1835                        let has_sink = callee_summary
1836                            .propagation_rules
1837                            .iter()
1838                            .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
1839
1840                        if has_sink {
1841                            // Found a flow through callee
1842                            let mut extended_path = path.clone();
1843                            extended_path.push(callee_site.callee.clone());
1844
1845                            let sink_type = callee_summary
1846                                .propagation_rules
1847                                .iter()
1848                                .find_map(|r| match r {
1849                                    TaintPropagation::ParamToSink { sink_type, .. } => {
1850                                        Some(sink_type.clone())
1851                                    }
1852                                    _ => None,
1853                                })
1854                                .unwrap_or_else(|| "unknown_sink".to_string());
1855
1856                            flows.push(TaintPath {
1857                                source_function: path[0].clone(),
1858                                sink_function: callee_site.callee.clone(),
1859                                call_chain: extended_path.clone(),
1860                                source_type: Self::extract_source_type(taint),
1861                                sink_type,
1862                                // Sanitized if either path so far is sanitized OR this callee sanitizes OR calling function has sanitization
1863                                sanitized: effective_sanitized
1864                                    || callee_sanitizes
1865                                    || self.path_is_sanitized(&extended_path),
1866                            });
1867                        }
1868                    }
1869                }
1870            }
1871
1872            // Explore callers of this function (functions that call current_func)
1873            // Key insight: the caller receives tainted data by calling current_func
1874            // If the caller has a filesystem sink, the taint may reach it
1875            for caller in &node.callers {
1876                let mut new_path = path.clone();
1877                new_path.push(caller.clone());
1878
1879                // Check if the CALLER itself has a filesystem sink
1880                // This handles the pattern: caller() { let x = source_fn(); sink(x); }
1881                if self.call_graph.nodes.contains_key(caller) {
1882                    if let Some(caller_summary) = self.summaries.get(caller) {
1883                        // Check if caller has a filesystem sink in its propagation rules
1884                        // OR if it has any ParamToSink (which was set when analyzing the function)
1885                        let has_filesystem_sink = caller_summary.propagation_rules.iter()
1886                            .any(|r| matches!(r, TaintPropagation::ParamToSink { sink_type, .. } if sink_type == "filesystem"));
1887
1888                        let has_any_sink = caller_summary
1889                            .propagation_rules
1890                            .iter()
1891                            .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
1892
1893                        // Check if caller has internal vulnerability
1894                        // This handles cases where caller consumes the source and sinks it locally
1895                        // (possibly via ParamToParam propagation in a helper)
1896                        let has_internal = caller_summary.has_internal_vulnerability;
1897
1898                        if has_filesystem_sink || has_any_sink || has_internal {
1899                            // Caller has a sink and receives tainted data from current_func
1900                            let sink_type = if has_internal {
1901                                "internal_sink".to_string()
1902                            } else {
1903                                caller_summary
1904                                    .propagation_rules
1905                                    .iter()
1906                                    .find_map(|r| match r {
1907                                        TaintPropagation::ParamToSink { sink_type, .. } => {
1908                                            Some(sink_type.clone())
1909                                        }
1910                                        _ => None,
1911                                    })
1912                                    .unwrap_or_else(|| "unknown".to_string())
1913                            };
1914
1915                            flows.push(TaintPath {
1916                                source_function: path[0].clone(),
1917                                sink_function: caller.clone(),
1918                                call_chain: new_path.clone(),
1919                                source_type: Self::extract_source_type(taint),
1920                                sink_type,
1921                                sanitized: effective_sanitized,
1922                            });
1923
1924                            // Early exit if we have too many flows
1925                            if flows.len() >= self.config.max_flows_per_source {
1926                                return flows;
1927                            }
1928                        }
1929                    }
1930                }
1931
1932                // Early exit if we have too many flows
1933                if flows.len() >= self.config.max_flows_per_source {
1934                    return flows;
1935                }
1936
1937                // Recursively explore from the caller
1938                let sub_flows = self.find_paths_from_source(caller, taint, new_path, visited);
1939
1940                // Only add flows if we haven't hit the limit
1941                let remaining = self.config.max_flows_per_source.saturating_sub(flows.len());
1942                flows.extend(sub_flows.into_iter().take(remaining));
1943
1944                if flows.len() >= self.config.max_flows_per_source {
1945                    return flows;
1946                }
1947            }
1948        }
1949
1950        // NOTE: We do NOT remove from visited here - this prevents exponential
1951        // path exploration in complex call graphs. Each function is visited once
1952        // per source function exploration. Combined with max_path_depth, this
1953        // ensures O(n) complexity instead of O(branches^depth).
1954
1955        flows
1956    }
1957
1958    #[allow(dead_code)]
1959    /// Explore callees of a function that propagates taint to find eventual sinks.
1960    /// This enables N-level detection by following taint through intermediate propagators.
1961    ///
1962    /// Example: source() -> caller() -> propagator() -> sink()
1963    ///          We're at 'caller', which calls 'propagator' (which propagates).
1964    ///          We need to check if 'propagator' calls 'sink'.
1965    ///
1966    /// CURRENTLY DISABLED: Causes performance issues, needs better algorithm
1967    fn explore_callees_for_sinks(
1968        &self,
1969        current_func: &str,
1970        source_taint: &ReturnTaint,
1971        path: Vec<String>,
1972        visited: &mut HashSet<String>,
1973    ) -> Vec<TaintPath> {
1974        let mut flows = Vec::new();
1975
1976        // Debug: limit recursion depth
1977        if path.len() > 10 {
1978            eprintln!(
1979                "WARNING: Path too deep ({}), stopping exploration",
1980                path.len()
1981            );
1982            return flows;
1983        }
1984
1985        // Get the call graph node for the current function
1986        let Some(node) = self.call_graph.nodes.get(current_func) else {
1987            return flows;
1988        };
1989
1990        // Explore each function that current_func calls
1991        for callee_site in &node.callees {
1992            let callee_name = &callee_site.callee;
1993
1994            // Avoid infinite loops
1995            if visited.contains(callee_name) {
1996                continue;
1997            }
1998
1999            let Some(callee_summary) = self.summaries.get(callee_name) else {
2000                continue;
2001            };
2002
2003            // Check if this callee has a sink
2004            let callee_has_sink = callee_summary
2005                .propagation_rules
2006                .iter()
2007                .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
2008
2009            if callee_has_sink {
2010                // Found a direct sink - create flow
2011                let mut extended_path = path.clone();
2012                extended_path.push(callee_name.clone());
2013
2014                // Extract sink type from the sink rule
2015                let sink_type = callee_summary
2016                    .propagation_rules
2017                    .iter()
2018                    .find_map(|r| match r {
2019                        TaintPropagation::ParamToSink { sink_type, .. } => Some(sink_type.clone()),
2020                        _ => None,
2021                    })
2022                    .unwrap_or_else(|| "unknown_sink".to_string());
2023
2024                flows.push(TaintPath {
2025                    source_function: path[0].clone(),
2026                    sink_function: callee_name.clone(),
2027                    call_chain: extended_path,
2028                    source_type: Self::extract_source_type(source_taint),
2029                    sink_type,
2030                    sanitized: false,
2031                });
2032            } else if matches!(callee_summary.return_taint, ReturnTaint::FromParameter(_)) {
2033                // This callee also propagates - explore its callees recursively
2034                let mut extended_path = path.clone();
2035                extended_path.push(callee_name.clone());
2036
2037                visited.insert(callee_name.clone());
2038                flows.extend(self.explore_callees_for_sinks(
2039                    callee_name,
2040                    source_taint,
2041                    extended_path,
2042                    visited,
2043                ));
2044                visited.remove(callee_name);
2045            }
2046        }
2047
2048        flows
2049    }
2050
2051    /// Check if any function in the path sanitizes its input
2052    fn path_is_sanitized(&self, path: &[String]) -> bool {
2053        path.iter().any(|func_name| {
2054            if let Some(summary) = self.summaries.get(func_name) {
2055                summary
2056                    .propagation_rules
2057                    .iter()
2058                    .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
2059            } else {
2060                false
2061            }
2062        })
2063    }
2064
2065    /// Extract source type from ReturnTaint
2066    fn extract_source_type(taint: &ReturnTaint) -> String {
2067        match taint {
2068            ReturnTaint::FromSource { source_type } => source_type.clone(),
2069            ReturnTaint::FromParameter(_) => "parameter".to_string(),
2070            ReturnTaint::Merged(taints) => {
2071                // Take first source type from merged
2072                if let Some(first) = taints.first() {
2073                    Self::extract_source_type(first)
2074                } else {
2075                    "unknown".to_string()
2076                }
2077            }
2078            ReturnTaint::Clean => "clean".to_string(),
2079        }
2080    }
2081}
2082
/// A complete taint flow from a source function to a sink function.
///
/// Produced by the inter-procedural analysis; see [`TaintPath::describe`] for
/// a human-readable rendering.
#[derive(Clone, Debug)]
pub struct TaintPath {
    /// Name of the function where the taint originates.
    pub source_function: String,

    /// Name of the function in which the taint reaches a sink.
    pub sink_function: String,

    /// Full call chain, source first: `[source, caller1, caller2, ..., sink]`.
    pub call_chain: Vec<String>,

    /// Label of the taint source (e.g. `"environment"`).
    pub source_type: String,

    /// Label of the sink (e.g. `"command_execution"`).
    pub sink_type: String,

    /// `true` when sanitization was detected somewhere along the path.
    pub sanitized: bool,
}
2104
2105impl TaintPath {
2106    /// Create a human-readable description of this taint flow
2107    pub fn describe(&self) -> String {
2108        let chain = self.call_chain.join(" → ");
2109        let sanitized_note = if self.sanitized {
2110            " [SANITIZED - SAFE]"
2111        } else {
2112            ""
2113        };
2114        format!(
2115            "Tainted data from {} (source: {}) flows through {} to {} (sink: {}){}",
2116            self.source_function,
2117            self.source_type,
2118            chain,
2119            self.sink_function,
2120            self.sink_type,
2121            sanitized_note
2122        )
2123    }
2124
2125    /// Get the number of levels in the call chain
2126    pub fn depth(&self) -> usize {
2127        self.call_chain.len()
2128    }
2129}
2130
#[cfg(test)]
mod tests {
    use super::*;

    /// `extract_function_name` should return the final path segment, for
    /// plain names, qualified paths, and `const `-prefixed paths alike.
    #[test]
    fn test_extract_function_name() {
        let cases = [
            ("std::process::Command::new", "new"),
            ("my_function", "my_function"),
            ("const my_crate::util::helper", "helper"),
        ];

        for (raw, expected) in cases {
            assert_eq!(CallGraph::extract_function_name(raw), expected);
        }
    }

    /// Macro-style names and `_`-prefixed names are treated as builtin;
    /// an ordinary function name is not.
    #[test]
    fn test_is_builtin_operation() {
        for builtin in ["assert!", "println!", "_internal"] {
            assert!(CallGraph::is_builtin_operation(builtin));
        }

        assert!(!CallGraph::is_builtin_operation("my_function"));
    }

    /// A freshly created summary keeps its name and starts out empty and clean.
    #[test]
    fn test_function_summary_creation() {
        let fresh = FunctionSummary::new(String::from("test_function"));

        assert_eq!(fresh.function_name, "test_function");
        assert!(matches!(fresh.return_taint, ReturnTaint::Clean));
        assert!(fresh.propagation_rules.is_empty());
        assert!(fresh.sink_parameters.is_empty());
        assert!(fresh.source_parameters.is_empty());
    }

    /// A `CallSite` built from a literal exposes the values it was given.
    #[test]
    fn test_call_site_creation() {
        let call = CallSite {
            callee: String::from("execute_command"),
            resolved_targets: Vec::new(),
            location: String::from("test.rs:42"),
            arg_count: 1,
        };

        assert_eq!(call.arg_count, 1);
        assert_eq!(call.location, "test.rs:42");
        assert_eq!(call.callee, "execute_command");
    }

    /// Every `TaintPropagation` value must match its own variant pattern.
    #[test]
    fn test_taint_propagation_patterns() {
        assert!(matches!(
            TaintPropagation::ParamSanitized(0),
            TaintPropagation::ParamSanitized(_)
        ));
        assert!(matches!(
            TaintPropagation::ParamToSink {
                param: 0,
                sink_type: String::from("command"),
            },
            TaintPropagation::ParamToSink { .. }
        ));
        assert!(matches!(
            TaintPropagation::ParamToParam { from: 0, to: 1 },
            TaintPropagation::ParamToParam { .. }
        ));
        assert!(matches!(
            TaintPropagation::ParamToReturn(0),
            TaintPropagation::ParamToReturn(_)
        ));
    }

    /// Every `ReturnTaint` value must match its own variant pattern.
    #[test]
    fn test_return_taint_patterns() {
        // Small helper so the source-backed values read uniformly.
        let sourced = |kind: &str| ReturnTaint::FromSource {
            source_type: kind.to_string(),
        };

        assert!(matches!(
            ReturnTaint::Merged(vec![sourced("env"), sourced("file")]),
            ReturnTaint::Merged(_)
        ));
        assert!(matches!(sourced("env"), ReturnTaint::FromSource { .. }));
        assert!(matches!(
            ReturnTaint::FromParameter(0),
            ReturnTaint::FromParameter(_)
        ));
        assert!(matches!(ReturnTaint::Clean, ReturnTaint::Clean));
    }
}
mir_extractor/interprocedural.rs

mir_extractor/
interprocedural.rs