mir_extractor/interprocedural.rs
1//! Inter-procedural taint analysis (Phase 3)
2//!
3//! This module implements inter-procedural dataflow analysis to track taint
4//! across function boundaries. It builds on Phase 2's intra-procedural analysis
5//! by constructing a call graph and propagating taint through function calls.
6//!
7//! ## Architecture
8//!
9//! 1. **Call Graph Construction**: Extract function calls from MIR to build
10//! a directed graph of function dependencies.
11//!
12//! 2. **Function Summarization**: Analyze each function to create summaries
13//! describing how taint flows through parameters and return values.
14//!
15//! 3. **Inter-Procedural Propagation**: Use summaries to track taint across
16//! function boundaries, following chains of calls from sources to sinks.
17//!
18//! 4. **Context-Sensitive Analysis**: Distinguish different call sites to
19//! maintain precision and avoid false positives.
20
21use anyhow::Result;
22use serde::{Deserialize, Serialize};
23use std::cell::RefCell;
24use std::collections::{HashMap, HashSet, VecDeque};
25
26use crate::dataflow::cfg::ControlFlowGraph;
27use crate::dataflow::closure::{ClosureRegistry, ClosureRegistryBuilder};
28use crate::dataflow::{DataflowSummary, TaintPropagation};
29use crate::{MirFunction, MirPackage};
30
31/// Configuration for inter-procedural analysis (IPA) limits.
32///
33/// These limits control memory usage and analysis depth for taint tracking.
34/// Increase limits for more thorough analysis on high-memory machines.
35/// Decrease limits to analyze extremely large codebases with limited memory.
36///
37/// All fields have sensible defaults that work well for most codebases.
38#[derive(Debug, Clone, Serialize, Deserialize)]
39#[serde(default)]
40pub struct IpaConfig {
41 /// Maximum call chain depth from source to sink (default: 8)
42 /// Most real vulnerabilities have depth < 5.
43 pub max_path_depth: usize,
44
45 /// Maximum taint flows tracked per source function (default: 200)
46 /// Increase if you suspect missed flows from a single source.
47 pub max_flows_per_source: usize,
48
49 /// Maximum functions visited per source exploration (default: 1000)
50 /// Prevents memory explosion in extremely dense call graphs.
51 pub max_visited: usize,
52
53 /// Maximum total inter-procedural flows reported (default: 5000)
54 /// Most real analyses produce < 500 flows.
55 pub max_total_flows: usize,
56
57 /// Maximum functions before skipping interprocedural analysis (default: 10000)
58 /// For crates exceeding this threshold, IPA is skipped but intra-procedural
59 /// analysis still runs for all rules.
60 pub max_functions_for_ipa: usize,
61}
62
63impl Default for IpaConfig {
64 fn default() -> Self {
65 Self {
66 max_path_depth: 8,
67 max_flows_per_source: 200,
68 max_visited: 1000,
69 max_total_flows: 5000,
70 max_functions_for_ipa: 10000,
71 }
72 }
73}
74
75/// Call graph representing function call relationships
76#[derive(Debug, Clone)]
77pub struct CallGraph {
78 /// Function name → CallGraphNode
79 pub nodes: HashMap<String, CallGraphNode>,
80
81 /// Analysis order (bottom-up: callees before callers)
82 pub analysis_order: Vec<String>,
83}
84
85/// Node in the call graph representing a function
86#[derive(Debug, Clone)]
87pub struct CallGraphNode {
88 /// Function name (fully qualified)
89 pub function_name: String,
90
91 /// Functions that call this function
92 pub callers: Vec<String>,
93
94 /// Functions called by this function
95 pub callees: Vec<CallSite>,
96 // Note: Function summaries are stored in InterProceduralAnalysis::summaries
97 // to avoid memory duplication
98}
99
100/// A specific call site within a function
101#[derive(Debug, Clone)]
102pub struct CallSite {
103 /// Name of the called function
104 pub callee: String,
105
106 /// Resolved target functions
107 pub resolved_targets: Vec<String>,
108
109 /// Location in the caller's MIR (for error reporting)
110 pub location: String,
111
112 /// Number of arguments passed
113 pub arg_count: usize,
114}
115
116/// Summary of a function's taint behavior
117#[derive(Debug, Clone)]
118pub struct FunctionSummary {
119 /// Function name
120 pub function_name: String,
121
122 /// Which parameters can introduce taint (parameter index)
123 pub source_parameters: HashSet<usize>,
124
125 /// Which parameters flow to sinks within this function
126 pub sink_parameters: HashSet<usize>,
127
128 /// Taint propagation rules
129 pub propagation_rules: Vec<TaintPropagation>,
130
131 /// Does the return value carry taint?
132 pub return_taint: ReturnTaint,
133
134 /// Does the function contain an internal vulnerability (source -> sink)?
135 pub has_internal_vulnerability: bool,
136}
137
138/// Describes the taint state of a function's return value
139#[derive(Debug, Clone)]
140pub enum ReturnTaint {
141 /// Return value is clean (not tainted)
142 Clean,
143
144 /// Return value is tainted from parameter N
145 FromParameter(usize),
146
147 /// Return value is tainted from a source within the function
148 FromSource { source_type: String },
149
150 /// Return value depends on multiple taint sources
151 Merged(Vec<ReturnTaint>),
152}
153
154impl CallGraph {
155 /// Construct a call graph from a MIR package
156 pub fn from_mir_package(package: &MirPackage) -> Result<Self> {
157 let mut nodes = HashMap::new();
158
159 // Phase 1: Create nodes for all functions
160 for function in &package.functions {
161 let node = CallGraphNode {
162 function_name: function.name.clone(),
163 callers: Vec::new(),
164 callees: Vec::new(),
165 };
166 nodes.insert(function.name.clone(), node);
167 }
168
169 // Phase 2: Extract callees from each function's MIR
170 for function in &package.functions {
171 let callees = Self::extract_callees(function)?;
172 if let Some(node) = nodes.get_mut(&function.name) {
173 node.callees = callees;
174 }
175 }
176
177 // Build a map of short names to full names for resolution
178 let mut short_name_map: HashMap<String, Vec<String>> = HashMap::new();
179 for function in &package.functions {
180 let short_name = Self::extract_function_name(&function.name);
181 short_name_map
182 .entry(short_name)
183 .or_default()
184 .push(function.name.clone());
185 }
186
187 // Phase 3: Resolve calls and build caller relationships
188 let mut caller_map: HashMap<String, Vec<String>> = HashMap::new();
189 let mut resolved_callees_map: HashMap<String, Vec<CallSite>> = HashMap::new();
190
191 for (caller_name, node) in &nodes {
192 let mut resolved_callees = Vec::new();
193
194 for call_site in &node.callees {
195 // Try direct match first
196 if nodes.contains_key(&call_site.callee) {
197 let mut new_site = call_site.clone();
198 new_site.resolved_targets.push(call_site.callee.clone());
199 resolved_callees.push(new_site);
200
201 caller_map
202 .entry(call_site.callee.clone())
203 .or_default()
204 .push(caller_name.clone());
205 } else {
206 // Try to resolve via short name (e.g. trait methods)
207 let short_name = Self::extract_function_name(&call_site.callee);
208
209 if let Some(candidates) = short_name_map.get(&short_name) {
210 // Resolved match (trait method, etc.)
211 let mut new_site = call_site.clone();
212 for candidate in candidates {
213 new_site.resolved_targets.push(candidate.clone());
214
215 caller_map
216 .entry(candidate.clone())
217 .or_default()
218 .push(caller_name.clone());
219 }
220 resolved_callees.push(new_site);
221 } else {
222 // Unresolved - keep as is (maybe external function)
223 resolved_callees.push(call_site.clone());
224 }
225 }
226 }
227 resolved_callees_map.insert(caller_name.clone(), resolved_callees);
228 }
229
230 // Apply resolved callees
231 for (caller_name, callees) in resolved_callees_map {
232 if let Some(node) = nodes.get_mut(&caller_name) {
233 node.callees = callees;
234 }
235 }
236
237 for (callee_name, callers) in caller_map {
238 if let Some(node) = nodes.get_mut(&callee_name) {
239 node.callers = callers;
240 }
241 }
242
243 // Phase 4: Compute analysis order (bottom-up)
244 let analysis_order = Self::compute_analysis_order(&nodes)?;
245
246 Ok(CallGraph {
247 nodes,
248 analysis_order,
249 })
250 }
251
252 /// Extract callee information from a function's MIR
253 fn extract_callees(function: &MirFunction) -> Result<Vec<CallSite>> {
254 let mut callees = Vec::new();
255
256 // Parse MIR to find function calls
257 // MIR calls look like: "_N = function_name(args...)" or
258 // "TerminatorKind::Call { func: ... }"
259
260 for (line_idx, line) in function.body.iter().enumerate() {
261 // Look for call patterns in MIR
262 // Common patterns:
263 // 1. "= Fn(DefId(...), Substs(...))(" - direct function call
264 // 2. "= <Type as Trait>::method(" - trait method call
265 // 3. "Call { func: Operand::Constant..." - terminator call
266
267 if let Some(call_site) = Self::parse_call_from_mir_line(line, line_idx) {
268 callees.push(call_site);
269 }
270
271 // Check for closure/coroutine creation
272 if let Some(closure_callee) =
273 Self::parse_closure_creation(line, &function.name, line_idx)
274 {
275 callees.push(closure_callee);
276 }
277 }
278
279 Ok(callees)
280 }
281
282 /// Parse closure or coroutine creation as a "call" (dependency)
283 fn parse_closure_creation(line: &str, parent_name: &str, line_idx: usize) -> Option<CallSite> {
284 // _0 = {coroutine@... (#0)} ...
285 if let Some(eq_pos) = line.find('=') {
286 let rhs = line[eq_pos + 1..].trim();
287 if rhs.starts_with("{closure@") || rhs.starts_with("{coroutine@") {
288 // Extract index (#N)
289 if let Some(hash_pos) = rhs.find("(#") {
290 if let Some(close_paren) = rhs[hash_pos..].find(')') {
291 let index_str = &rhs[hash_pos + 2..hash_pos + close_paren];
292 if let Ok(index) = index_str.parse::<usize>() {
293 let callee = format!("{}::{{closure#{}}}", parent_name, index);
294 return Some(CallSite {
295 callee,
296 resolved_targets: Vec::new(),
297 location: format!("line {}", line_idx),
298 arg_count: 0,
299 });
300 }
301 }
302 }
303 }
304 }
305 None
306 }
307
308 /// Parse a single MIR line to detect function calls
309 fn parse_call_from_mir_line(line: &str, line_idx: usize) -> Option<CallSite> {
310 let line = line.trim();
311
312 // Pattern 1: Direct function call in statement
313 // Example: "_5 = my_function(move _6) -> [return: bb3, unwind: bb4];"
314 if line.contains("(") && line.contains(") -> [return:") {
315 // Pattern 1a: Closure invocation
316 // Example: "<{closure@examples/interprocedural/src/lib.rs:278:19: 278:21} as Fn<()>>::call(..."
317 if let Some(closure_name) = Self::extract_closure_call(line) {
318 return Some(CallSite {
319 callee: closure_name,
320 resolved_targets: Vec::new(),
321 location: format!("line {}", line_idx),
322 arg_count: 1, // Closure takes the closure env as arg
323 });
324 }
325
326 // Extract function name between '=' and '('
327 if let Some(eq_pos) = line.find('=') {
328 if let Some(paren_pos) = line[eq_pos..].find('(') {
329 let func_part = &line[eq_pos + 1..eq_pos + paren_pos].trim();
330
331 // Clean up the function name
332 // We want the full name here, but cleaned of MIR artifacts
333 let func_name = func_part
334 .replace("const ", "")
335 .replace("move ", "")
336 .replace("copy ", "")
337 .trim()
338 .to_string();
339
340 if !func_name.is_empty() && !Self::is_builtin_operation(&func_name) {
341 // Count arguments (rough estimate)
342 let args_section = &line[eq_pos + paren_pos + 1..];
343 let arg_count = Self::estimate_arg_count(args_section);
344
345 return Some(CallSite {
346 callee: func_name,
347 resolved_targets: Vec::new(),
348 location: format!("line {}", line_idx),
349 arg_count,
350 });
351 }
352 }
353 }
354 }
355
356 None
357 }
358
359 /// Extract clean function name from MIR representation
360 fn extract_function_name(mir_repr: &str) -> String {
361 // MIR function names can be complex, e.g.:
362 // "std::process::Command::new"
363 // "<std::process::Command as std::ops::Drop>::drop"
364 // "my_crate::module::function"
365
366 let cleaned = mir_repr
367 .trim()
368 .replace("const ", "")
369 .replace("move ", "")
370 .replace("copy ", "");
371
372 // Extract the last meaningful part
373 if let Some(last_colon) = cleaned.rfind("::") {
374 cleaned[last_colon + 2..].trim().to_string()
375 } else {
376 cleaned.trim().to_string()
377 }
378 }
379
380 /// Check if this is a built-in operation (not a real function call)
381 fn is_builtin_operation(name: &str) -> bool {
382 matches!(
383 name,
384 "assert_eq!" | "assert!" | "println!" | "dbg!" | "format!"
385 ) || name.starts_with("_")
386 || name.is_empty()
387 }
388
389 /// Extract closure function name from MIR closure call pattern
390 /// Pattern: "<{closure@path/to/file.rs:line:col: line:col} as Fn<()>>::call(..."
391 /// Returns: "parent_function::{closure#N}"
392 fn extract_closure_call(line: &str) -> Option<String> {
393 // Look for closure call pattern
394 if !line.contains("{closure@") || !line.contains("as Fn") {
395 return None;
396 }
397
398 // Extract the closure location from "{closure@path:line:col: line:col}"
399 let start = line.find("{closure@")?;
400 let end = line[start..].find("}")?;
401 let closure_loc = &line[start..start + end + 1];
402
403 // The closure location looks like: {closure@examples/interprocedural/src/lib.rs:278:19: 278:21}
404 // We need to find the corresponding closure function name by matching the file:line
405 // The closure function is named like: parent_function::{closure#0}
406
407 // For now, return the raw closure identifier - it will be matched in the function list
408 // The function name for this closure is: test_closure_capture::{closure#0}
409
410 // Extract just the location part for matching
411 if let Some(at_pos) = closure_loc.find('@') {
412 let location = &closure_loc[at_pos + 1..closure_loc.len() - 1]; // Remove "{closure@" and "}"
413 // location is like "examples/interprocedural/src/lib.rs:278:19: 278:21"
414 // Take the file and first line number
415 let parts: Vec<&str> = location.split(':').collect();
416 if parts.len() >= 2 {
417 let file = parts[0];
418 let line_num = parts[1];
419 // Create a unique identifier for matching
420 return Some(format!("{{closure@{}:{}}}", file, line_num));
421 }
422 }
423
424 None
425 }
426
427 /// Estimate argument count from MIR call syntax
428 fn estimate_arg_count(args_section: &str) -> usize {
429 // Count commas outside of nested structures
430 // This is a rough heuristic
431 args_section.matches(',').count() + 1
432 }
433
434 /// Compute bottom-up analysis order (callees before callers)
435 fn compute_analysis_order(nodes: &HashMap<String, CallGraphNode>) -> Result<Vec<String>> {
436 // Use Kahn's algorithm for topological sort
437 let mut in_degree: HashMap<String, usize> = HashMap::new();
438 let mut order = Vec::new();
439
440 // Calculate in-degrees (number of internal callees)
441 for (name, node) in nodes {
442 let internal_callees = node
443 .callees
444 .iter()
445 .filter(|c| nodes.contains_key(&c.callee))
446 .count();
447 in_degree.insert(name.clone(), internal_callees);
448 // println!("[DEBUG] In-degree for {}: {}", name, internal_callees);
449 }
450
451 // Start with leaf functions (no internal callees)
452 let mut queue: VecDeque<String> = nodes
453 .iter()
454 .filter(|(name, _)| in_degree.get(*name).copied().unwrap_or(0) == 0)
455 .map(|(name, _)| name.clone())
456 .collect();
457
458 // println!("[DEBUG] Initial queue size: {}", queue.len());
459
460 // Process nodes in bottom-up order
461 while let Some(current) = queue.pop_front() {
462 order.push(current.clone());
463
464 // For each function that calls this one
465 if let Some(node) = nodes.get(¤t) {
466 // println!("[DEBUG] Processed {}, notifying callers: {:?}", current, node.callers);
467 for caller in &node.callers {
468 if let Some(degree) = in_degree.get_mut(caller) {
469 if *degree > 0 {
470 *degree -= 1;
471 if *degree == 0 {
472 queue.push_back(caller.clone());
473 }
474 }
475 }
476 }
477 }
478 }
479
480 // Check for cycles (recursion)
481 if order.len() < nodes.len() {
482 // There are cycles; include remaining nodes anyway
483 // (Phase 3.4 will handle recursion with depth limits)
484 for (name, _) in nodes {
485 if !order.contains(name) {
486 order.push(name.clone());
487 }
488 }
489 }
490
491 Ok(order)
492 }
493
494 /// Get the analysis order (bottom-up: callees before callers)
495 pub fn get_analysis_order(&self) -> &[String] {
496 &self.analysis_order
497 }
498
499 /// Get a node by function name
500 pub fn get_node(&self, function_name: &str) -> Option<&CallGraphNode> {
501 self.nodes.get(function_name)
502 }
503
504 /// Get a mutable node by function name
505 pub fn get_node_mut(&mut self, function_name: &str) -> Option<&mut CallGraphNode> {
506 self.nodes.get_mut(function_name)
507 }
508}
509
510impl FunctionSummary {
511 /// Create a new empty function summary
512 pub fn new(function_name: String) -> Self {
513 FunctionSummary {
514 function_name,
515 source_parameters: HashSet::new(),
516 sink_parameters: HashSet::new(),
517 propagation_rules: Vec::new(),
518 return_taint: ReturnTaint::Clean,
519 has_internal_vulnerability: false,
520 }
521 }
522
523 /// Generate a summary for a function using intra-procedural analysis
524 pub fn from_mir_function(
525 function: &MirFunction,
526 callee_summaries: &HashMap<String, FunctionSummary>,
527 closure_registry: Option<&ClosureRegistry>,
528 ) -> Result<Self> {
529 let mut summary = FunctionSummary::new(function.name.clone());
530
531 // Phase 3.5.1: Use CFG-based path-sensitive analysis for branching functions
532 // Phase 3.5.2: Use closure context if available
533 let cfg = ControlFlowGraph::from_mir_function(function);
534
535 // Skip path-sensitive analysis for very large functions to prevent memory spikes
536 const MAX_BLOCKS_FOR_PATH_ANALYSIS: usize = 500;
537 let use_path_sensitive = cfg.block_count() <= MAX_BLOCKS_FOR_PATH_ANALYSIS;
538
539 if use_path_sensitive {
540 use crate::dataflow::path_sensitive::PathSensitiveTaintAnalysis;
541 use crate::dataflow::DataflowSummary;
542
543 // Convert FunctionSummary to DataflowSummary for path-sensitive analysis
544 let dataflow_summaries: HashMap<String, DataflowSummary> = callee_summaries
545 .iter()
546 .map(|(k, v)| (k.clone(), v.to_dataflow_summary()))
547 .collect();
548
549 let mut path_analysis = PathSensitiveTaintAnalysis::new(cfg);
550
551 // Check if this is a closure function
552 let closure_info = closure_registry.and_then(|r| r.get_closure(&function.name));
553
554 if let Some(info) = closure_info {
555 // This is a closure - analyze with captured variable context
556 // Run 1: Analyze with actual capture states (from registry)
557 let result =
558 path_analysis.analyze_closure(function, info, Some(&dataflow_summaries));
559
560 if result.has_any_vulnerable_path {
561 if info.has_tainted_captures() {
562 summary
563 .propagation_rules
564 .push(TaintPropagation::ParamToSink {
565 param: 0,
566 sink_type: "command_execution".to_string(),
567 });
568 } else {
569 // No tainted captures, but found a vulnerability -> must be internal
570 summary.has_internal_vulnerability = true;
571 }
572 }
573
574 // Run 2: Analyze assuming captures are tainted (to detect propagation)
575 // This is crucial for async functions where captures are initially clean but become tainted at runtime
576 use crate::dataflow::path_sensitive::TaintState;
577 let mut initial_taint = HashMap::new();
578
579 for capture in &info.captured_vars {
580 let env_var = format!("((*_1).{})", capture.field_index);
581 initial_taint.insert(
582 env_var.clone(),
583 TaintState::Tainted {
584 source_type: "captured_variable".to_string(),
585 source_location: format!("capture_{}", capture.field_index),
586 },
587 );
588
589 // For async/coroutines (Pin<&mut T>), the path is deeper: ((*((*_1).0)).N)
590 // _1 is Pin<&mut Coroutine>, _1.0 is &mut Coroutine, *(_1.0) is Coroutine
591 let async_env_var = format!("((*((*_1).0)).{})", capture.field_index);
592 initial_taint.insert(
593 async_env_var,
594 TaintState::Tainted {
595 source_type: "captured_variable".to_string(),
596 source_location: format!("capture_{}", capture.field_index),
597 },
598 );
599 }
600
601 if !initial_taint.is_empty() {
602 let result_propagated = path_analysis.analyze_with_initial_taint(
603 function,
604 initial_taint,
605 Some(&dataflow_summaries),
606 );
607 if result_propagated.has_any_vulnerable_path {
608 summary
609 .propagation_rules
610 .push(TaintPropagation::ParamToSink {
611 param: 0,
612 sink_type: "command_execution".to_string(),
613 });
614 }
615 }
616 } else {
617 // Not a closure - analyze parameters
618
619 // Run 1: Check for internal sources (no initial taint)
620 let result_internal = path_analysis.analyze(function, Some(&dataflow_summaries));
621 if result_internal.has_any_vulnerable_path {
622 summary.has_internal_vulnerability = true;
623 }
624
625 // Check if return value is tainted
626 if result_internal
627 .path_results
628 .iter()
629 .any(|p| p.return_tainted)
630 {
631 summary.return_taint = ReturnTaint::FromSource {
632 source_type: "propagated".to_string(),
633 };
634 }
635
636 // Run 2: Check all parameters together (memory optimization)
637 // Instead of running N separate analyses for N parameters,
638 // we run one analysis with all parameters tainted and track which ones reach sinks/returns
639 use crate::dataflow::path_sensitive::TaintState;
640
641 // Determine argument count from signature
642 let arg_count = if let Some(start) = function.signature.find('(') {
643 if let Some(end) = function.signature.rfind(')') {
644 let args = &function.signature[start + 1..end];
645 if args.trim().is_empty() {
646 0
647 } else {
648 args.split(',').count()
649 }
650 } else {
651 0
652 }
653 } else {
654 0
655 };
656
657 // Limit analysis to actual arguments (max 10)
658 let max_arg = std::cmp::min(arg_count, 10);
659
660 if max_arg > 0 {
661 // Taint all parameters at once with unique source locations
662 let mut initial_taint = HashMap::new();
663 for i in 1..=max_arg {
664 let param_name = format!("_{}", i);
665 initial_taint.insert(
666 param_name,
667 TaintState::Tainted {
668 source_type: "parameter".to_string(),
669 source_location: format!("param_{}", i),
670 },
671 );
672 }
673
674 let result = path_analysis.analyze_with_initial_taint(
675 function,
676 initial_taint,
677 Some(&dataflow_summaries),
678 );
679
680 // Check which parameters reached sinks
681 if result.has_any_vulnerable_path {
682 // For now, mark all parameters as potentially reaching sinks
683 // A more precise analysis would track which specific parameter reached which sink
684 for i in 1..=max_arg {
685 summary
686 .propagation_rules
687 .push(TaintPropagation::ParamToSink {
688 param: i - 1,
689 sink_type: "command_execution".to_string(),
690 });
691 }
692 }
693
694 // Check if return value is tainted (any parameter flows to return)
695 if result.path_results.iter().any(|p| p.return_tainted) {
696 // Mark all parameters as potentially flowing to return
697 for i in 1..=max_arg {
698 summary
699 .propagation_rules
700 .push(TaintPropagation::ParamToReturn(i - 1));
701 }
702 }
703
704 // Check for sanitization
705 if result
706 .path_results
707 .iter()
708 .any(|p| !p.sanitizer_calls.is_empty())
709 {
710 for i in 1..=max_arg {
711 summary
712 .propagation_rules
713 .push(TaintPropagation::ParamSanitized(i - 1));
714 }
715 }
716 }
717 }
718
719 if !summary.propagation_rules.is_empty() || summary.has_internal_vulnerability {
720 return Ok(summary);
721 }
722 } // end use_path_sensitive
723
724 // Use Phase 2's taint analysis to understand intra-procedural flows
725 // For now, we'll do a simple analysis based on MIR patterns
726
727 // Step 1: Identify if this function contains sources
728 let has_source = Self::contains_source(function);
729 if has_source {
730 summary.return_taint = ReturnTaint::FromSource {
731 source_type: "environment".to_string(),
732 };
733 }
734
735 // Step 2: Identify if this function contains sinks and determine sink type
736 let has_command_sink = Self::contains_command_sink(function);
737 let has_filesystem_sink = Self::contains_filesystem_sink(function);
738 let has_http_sink = Self::contains_http_sink(function);
739 let has_yaml_sink = Self::contains_yaml_sink(function);
740
741 // Step 3: Analyze parameter flows
742 // Check if function propagates parameters to return value
743 let propagates_param_to_return = Self::propagates_param_to_return(function);
744 if propagates_param_to_return && !has_source {
745 // Function takes parameter and returns it (or derivative)
746 // This enables N-level taint propagation
747 summary.return_taint = ReturnTaint::FromParameter(0);
748 summary
749 .propagation_rules
750 .push(TaintPropagation::ParamToReturn(0));
751 }
752
753 // Step 4: Check for sanitization patterns
754 let has_sanitization = Self::contains_sanitization(function);
755
756 // Build propagation rules based on patterns
757 if has_command_sink {
758 // If function has a command sink, parameters likely flow to it
759 summary
760 .propagation_rules
761 .push(TaintPropagation::ParamToSink {
762 param: 0,
763 sink_type: "command_execution".to_string(),
764 });
765 }
766
767 if has_filesystem_sink {
768 // If function has a filesystem sink, parameters likely flow to it
769 summary
770 .propagation_rules
771 .push(TaintPropagation::ParamToSink {
772 param: 0,
773 sink_type: "filesystem".to_string(),
774 });
775 }
776
777 if has_http_sink {
778 // If function has an HTTP sink, parameters likely flow to it (SSRF)
779 summary
780 .propagation_rules
781 .push(TaintPropagation::ParamToSink {
782 param: 0,
783 sink_type: "http".to_string(),
784 });
785 }
786
787 if has_yaml_sink {
788 // If function has a YAML deserialization sink
789 summary
790 .propagation_rules
791 .push(TaintPropagation::ParamToSink {
792 param: 0,
793 sink_type: "yaml".to_string(),
794 });
795 }
796
797 if has_sanitization {
798 // Function performs sanitization
799 summary
800 .propagation_rules
801 .push(TaintPropagation::ParamSanitized(0));
802 }
803
804 // Analyze calls to other functions
805 for line in &function.body {
806 // Check if this line calls a function we have a summary for
807 if let Some((callee_name, _)) = Self::extract_call_from_line(line) {
808 if let Some(callee_summary) = callee_summaries.get(&callee_name) {
809 // Propagate taint rules from callee
810 summary.merge_callee_summary(callee_summary);
811 }
812 }
813 }
814
815 Ok(summary)
816 }
817
818 /// Check if function propagates parameter to return value
819 fn propagates_param_to_return(function: &MirFunction) -> bool {
820 // First, check if function even takes parameters
821 // Check signature for parameter list - look for pattern like "(_1:" or "(mut _1:"
822 let sig_lower = function.signature.to_lowercase();
823 let has_params = sig_lower.contains("(_1:")
824 || sig_lower.contains("(mut _1:")
825 || sig_lower.contains("( _1:");
826
827 if !has_params {
828 return false; // No parameters, can't propagate
829 }
830
831 // Exclude functions that only use constants for _1
832 let assigns_constant_to_param = function
833 .body
834 .iter()
835 .any(|line| line.trim().starts_with("_1 = const"));
836
837 if assigns_constant_to_param {
838 return false; // Assigns constant to what would be param slot
839 }
840
841 // Heuristics for parameter propagation:
842 // Look for operations on _1 (first parameter after self if present)
843 let has_param_usage = function.body.iter().any(|line| {
844 // Direct parameter operations
845 line.contains("(*_1)") // Deref of first param
846 || line.contains("Deref::deref(_1") // Explicit deref
847 || line.contains("Deref::deref(move _1")
848 // Taking references to parameters (assignment target contains &_1)
849 || (line.contains(" = &_1;") || line.contains(" = &mut _1;"))
850 // Format operations with parameter
851 || (line.contains("format!") || line.contains("format_args!"))
852 // String operations on parameters
853 || line.contains("to_string(move _1")
854 || line.contains("String::from(_1")
855 // Move or copy of parameter (common in closures/async)
856 || line.contains("move _1")
857 || line.contains("copy _1")
858 });
859
860 // Check if function returns a value (not unit type)
861 let returns_value =
862 function.signature.contains("->") && !function.signature.contains("-> ()");
863
864 has_param_usage && returns_value
865 }
866
867 /// Check if function contains a taint source
868 fn contains_source(function: &MirFunction) -> bool {
869 function.body.iter().any(|line| {
870 line.contains("std::env::args")
871 || line.contains("std::env::var")
872 || line.contains("std::fs::read")
873 || line.contains("env::args")
874 || line.contains("env::var")
875 || line.contains(" = args() -> ") // MIR format: args()
876 || line.contains(" = var") // MIR format: var() or var::<T>()
877 || line.contains(" = read") // MIR format: read() or fs::read()
878 })
879 }
880
881 /// Check if function contains a taint sink
882 #[allow(dead_code)]
883 fn contains_sink(function: &MirFunction) -> bool {
884 Self::contains_command_sink(function)
885 || Self::contains_filesystem_sink(function)
886 || Self::contains_http_sink(function)
887 || Self::contains_yaml_sink(function)
888 }
889
890 /// Check if function contains a command execution sink
891 fn contains_command_sink(function: &MirFunction) -> bool {
892 function.body.iter().any(|line| {
893 // Only match DIRECT calls to sinks, not indirect via helper functions
894 // Look for Command::new or Command::spawn, not just "spawn"
895 (line.contains("Command::new") && line.contains("->"))
896 || line.contains("std::process::Command")
897 || (line.contains("Command::spawn") && line.contains("->"))
898 || (line.contains("Command::exec") && line.contains("->"))
899 })
900 }
901
902 /// Check if function contains a filesystem sink (for path traversal detection)
903 fn contains_filesystem_sink(function: &MirFunction) -> bool {
904 function.body.iter().any(|line| {
905 // File read operations
906 line.contains("fs::read_to_string")
907 || line.contains("std::fs::read_to_string")
908 || line.contains("fs::read(")
909 || line.contains("std::fs::read(")
910 // File write operations
911 || line.contains("fs::write(")
912 || line.contains("std::fs::write(")
913 // File open operations
914 || line.contains("File::open(")
915 || line.contains("File::create(")
916 || line.contains("std::fs::File::open")
917 || line.contains("std::fs::File::create")
918 || line.contains("OpenOptions")
919 // File removal operations
920 || line.contains("fs::remove_file")
921 || line.contains("fs::remove_dir")
922 || line.contains("std::fs::remove_file")
923 || line.contains("std::fs::remove_dir")
924 // Copy/rename operations
925 || line.contains("fs::copy(")
926 || line.contains("fs::rename(")
927 || line.contains("std::fs::copy")
928 || line.contains("std::fs::rename")
929 // Directory operations
930 || line.contains("fs::create_dir")
931 || line.contains("std::fs::create_dir")
932 })
933 }
934
935 /// Check if function contains an HTTP client sink (for SSRF detection)
936 fn contains_http_sink(function: &MirFunction) -> bool {
937 function.body.iter().any(|line| {
938 // reqwest patterns
939 line.contains("reqwest::blocking::get")
940 || line.contains("reqwest::get")
941 || line.contains("blocking::get")
942 || line.contains("Client>::get")
943 || line.contains("Client>::post")
944 || line.contains("Client>::put")
945 || line.contains("Client>::delete")
946 || line.contains("Client>::patch")
947 || line.contains("Client>::head")
948 || line.contains("RequestBuilder>::send")
949 // ureq patterns
950 || line.contains("ureq::get")
951 || line.contains("ureq::post")
952 || line.contains("ureq::put")
953 || line.contains("ureq::delete")
954 || line.contains("Agent>::get")
955 || line.contains("Agent>::post")
956 || line.contains("Request>::call")
957 // hyper patterns
958 || line.contains("hyper::Client")
959 || line.contains("hyper::Request")
960 // Generic HTTP patterns
961 || line.contains("get::<&String>")
962 || line.contains("get::<&str>")
963 || line.contains("post::<&String>")
964 || line.contains("post::<&str>")
965 })
966 }
967
968 /// Check if function contains a YAML deserialization sink (for YAML injection detection)
969 fn contains_yaml_sink(function: &MirFunction) -> bool {
970 function.body.iter().any(|line| {
971 // serde_yaml patterns
972 line.contains("serde_yaml::from_str")
973 || line.contains("serde_yaml::from_slice")
974 || line.contains("serde_yaml::from_reader")
975 // MIR patterns for generic instantiation
976 || line.contains("from_str::<") && line.contains("serde_yaml")
977 || line.contains("from_slice::<") && line.contains("serde_yaml")
978 || line.contains("from_reader::<") && line.contains("serde_yaml")
979 // Generic yaml patterns - function names with yaml
980 || (line.contains("from_str") && function.name.to_lowercase().contains("yaml"))
981 })
982 }
983
984 /// Check if function performs sanitization
985 fn contains_sanitization(function: &MirFunction) -> bool {
986 function.body.iter().any(|line| {
987 line.contains("parse::<")
988 || line.contains("chars().all")
989 || line.contains("is_alphanumeric")
990 })
991 }
992
993 /// Check if function has validation guard that protects a sink
994 /// Returns true if there's an if-condition checking safety before calling a sink
995 #[allow(dead_code)]
996 fn has_validation_guard(function: &MirFunction) -> bool {
997 let has_sink = Self::contains_sink(function);
998 if !has_sink {
999 return false;
1000 }
1001
1002 // Look for validation function calls like is_safe_input, is_valid, validate, etc.
1003 let has_validation_call = function.body.iter().any(|line| {
1004 (line.contains("is_safe") || line.contains("is_valid") || line.contains("validate"))
1005 && line.contains("(")
1006 && line.contains(")")
1007 });
1008
1009 // Look for switchInt (if/match statements) that could be guards
1010 let has_conditional = function.body.iter().any(|line| line.contains("switchInt("));
1011
1012 has_validation_call && has_conditional
1013 }
1014
1015 /// Check if function calls a sanitization helper on tainted data before using it
1016 /// This handles patterns like: let safe = validate_input(&tainted); use(safe);
1017 #[allow(dead_code)]
1018 fn has_sanitization_helper_call(function: &MirFunction) -> bool {
1019 // Look for calls to functions with sanitization-related names
1020 let sanitization_patterns = ["validate", "sanitize", "clean", "escape", "filter"];
1021
1022 function.body.iter().any(|line| {
1023 sanitization_patterns
1024 .iter()
1025 .any(|pattern| line.to_lowercase().contains(pattern) && line.contains("("))
1026 })
1027 }
1028
1029 /// Extract function call from MIR line
1030 fn extract_call_from_line(line: &str) -> Option<(String, usize)> {
1031 let line = line.trim();
1032
1033 if line.contains("(") && line.contains(") -> [return:") {
1034 if let Some(eq_pos) = line.find('=') {
1035 if let Some(paren_pos) = line[eq_pos..].find('(') {
1036 let func_part = &line[eq_pos + 1..eq_pos + paren_pos].trim();
1037 let func_name = CallGraph::extract_function_name(func_part);
1038
1039 if !func_name.is_empty() && !CallGraph::is_builtin_operation(&func_name) {
1040 // Estimate arg count
1041 let args_section = &line[eq_pos + paren_pos + 1..];
1042 let arg_count = CallGraph::estimate_arg_count(args_section);
1043 return Some((func_name, arg_count));
1044 }
1045 }
1046 }
1047 }
1048
1049 None
1050 }
1051
1052 /// Merge rules from a callee's summary
1053 fn merge_callee_summary(&mut self, callee: &FunctionSummary) {
1054 // If callee has sources, this function might propagate them
1055 if !callee.source_parameters.is_empty() {
1056 // For now, mark that we call a function with sources
1057 // Phase 3.3 will track parameter mappings more precisely
1058 }
1059
1060 // DISABLED: Don't propagate sinks from callees
1061 // Inter-procedural flow detection handles this by exploring call chains
1062 // If we mark callers as having sinks, we get false positives
1063 /*
1064 // If callee has sinks, parameters to this function might reach them
1065 if !callee.sink_parameters.is_empty() {
1066 // Mark that we propagate to a sink
1067 for ¶m in &callee.sink_parameters {
1068 if param < 3 { // Only track first few parameters for now
1069 self.propagation_rules.push(TaintPropagation::ParamToSink {
1070 param,
1071 sink_type: "indirect_command_execution".to_string(),
1072 });
1073 }
1074 }
1075 }
1076 */
1077
1078 // Handle return taint
1079 match &callee.return_taint {
1080 ReturnTaint::FromSource { .. } => {
1081 // If callee returns tainted data, this function might too
1082 if matches!(self.return_taint, ReturnTaint::Clean) {
1083 self.return_taint = ReturnTaint::FromSource {
1084 source_type: "propagated".to_string(),
1085 };
1086 }
1087 }
1088 ReturnTaint::FromParameter(param) => {
1089 // Callee propagates parameter to return
1090 self.propagation_rules
1091 .push(TaintPropagation::ParamToReturn(*param));
1092 }
1093 _ => {}
1094 }
1095 }
1096
1097 pub fn to_dataflow_summary(&self) -> DataflowSummary {
1098 let mut propagation = self.propagation_rules.clone();
1099 let mut returns_tainted = false;
1100
1101 match &self.return_taint {
1102 ReturnTaint::Clean => {}
1103 ReturnTaint::FromParameter(idx) => {
1104 propagation.push(TaintPropagation::ParamToReturn(*idx));
1105 }
1106 ReturnTaint::FromSource { .. } => {
1107 returns_tainted = true;
1108 }
1109 ReturnTaint::Merged(taints) => {
1110 for taint in taints {
1111 match taint {
1112 ReturnTaint::FromParameter(idx) => {
1113 propagation.push(TaintPropagation::ParamToReturn(*idx));
1114 }
1115 ReturnTaint::FromSource { .. } => {
1116 returns_tainted = true;
1117 }
1118 _ => {}
1119 }
1120 }
1121 }
1122 }
1123
1124 DataflowSummary {
1125 name: self.function_name.clone(),
1126 propagation,
1127 returns_tainted,
1128 }
1129 }
1130}
1131
1132/// Main inter-procedural analysis engine
1133pub struct InterProceduralAnalysis {
1134 /// Call graph
1135 pub call_graph: CallGraph,
1136
1137 /// Computed function summaries
1138 pub summaries: HashMap<String, FunctionSummary>,
1139
1140 /// Closure registry for tracking closures and captures
1141 pub closure_registry: ClosureRegistry,
1142
1143 /// Cached inter-procedural flows (computed once on first call)
1144 cached_flows: RefCell<Option<Vec<TaintPath>>>,
1145
1146 /// Configuration for analysis limits
1147 config: IpaConfig,
1148}
1149
1150impl InterProceduralAnalysis {
1151 /// Create a new inter-procedural analysis with default configuration
1152 pub fn new(package: &MirPackage) -> Result<Self> {
1153 Self::with_config(package, IpaConfig::default())
1154 }
1155
1156 /// Create a new inter-procedural analysis with custom configuration
1157 pub fn with_config(package: &MirPackage, config: IpaConfig) -> Result<Self> {
1158 let call_graph = CallGraph::from_mir_package(package)?;
1159 let closure_registry = ClosureRegistryBuilder::build_from_package(package);
1160
1161 Ok(InterProceduralAnalysis {
1162 call_graph,
1163 summaries: HashMap::new(),
1164 closure_registry,
1165 cached_flows: RefCell::new(None),
1166 config,
1167 })
1168 }
1169
1170 /// Analyze all functions and generate summaries
1171 pub fn analyze(&mut self, package: &MirPackage) -> Result<()> {
1172 use crate::memory_profiler;
1173
1174 // Get function map for quick lookup
1175 let function_map: HashMap<String, &MirFunction> = package
1176 .functions
1177 .iter()
1178 .map(|f| (f.name.clone(), f))
1179 .collect();
1180
1181 let total_functions = self.call_graph.analysis_order.len();
1182 let checkpoint_interval = std::cmp::max(1, total_functions / 10); // Log every 10%
1183
1184 memory_profiler::checkpoint_with_context(
1185 "IPA analyze start",
1186 &format!("{} functions", total_functions),
1187 );
1188
1189 // Analyze functions in bottom-up order (callees before callers)
1190 for (idx, function_name) in self
1191 .call_graph
1192 .analysis_order
1193 .clone()
1194 .into_iter()
1195 .enumerate()
1196 {
1197 // Log progress every 10%
1198 if idx % checkpoint_interval == 0 {
1199 let pct = (idx * 100) / total_functions;
1200 memory_profiler::checkpoint_with_context(
1201 "IPA progress",
1202 &format!("{}% ({}/{})", pct, idx, total_functions),
1203 );
1204 }
1205
1206 if let Some(function) = function_map.get(&function_name) {
1207 // Memory spike detection: log functions that cause >100MB increase
1208 let before_mb = memory_profiler::current_memory_mb();
1209
1210 // Build a minimal callee summaries map - only include summaries for
1211 // functions that this function actually calls. This avoids O(N²) memory
1212 // usage from cloning all summaries for each function.
1213 let mut callee_summaries = HashMap::new();
1214
1215 if let Some(node) = self.call_graph.nodes.get(&function_name) {
1216 for call_site in &node.callees {
1217 // Map raw callee name to summary
1218 // If resolved_targets is not empty, merge their summaries
1219 if !call_site.resolved_targets.is_empty() {
1220 let mut merged_summary: Option<FunctionSummary> = None;
1221
1222 for target in &call_site.resolved_targets {
1223 if let Some(target_summary) = self.summaries.get(target) {
1224 if let Some(current) = &mut merged_summary {
1225 current.merge_callee_summary(target_summary);
1226 } else {
1227 // Create a new summary with the raw callee name
1228 let mut new_summary = target_summary.clone();
1229 new_summary.function_name = call_site.callee.clone();
1230 merged_summary = Some(new_summary);
1231 }
1232 }
1233 }
1234
1235 if let Some(summary) = merged_summary {
1236 callee_summaries.insert(call_site.callee.clone(), summary);
1237 }
1238 } else {
1239 // Try direct lookup (for unresolved or direct calls)
1240 if let Some(summary) = self.summaries.get(&call_site.callee) {
1241 callee_summaries.insert(call_site.callee.clone(), summary.clone());
1242 }
1243 }
1244 }
1245 }
1246
1247 // Generate summary using summaries of callees and closure registry
1248 let summary = FunctionSummary::from_mir_function(
1249 function,
1250 &callee_summaries,
1251 Some(&self.closure_registry),
1252 )?;
1253
1254 // Store summary in summaries HashMap (single source of truth)
1255 // Note: We do NOT store in node.summary to avoid memory duplication
1256 self.summaries.insert(function_name.clone(), summary);
1257
1258 // Memory spike detection: log functions that cause significant memory increase
1259 let after_mb = memory_profiler::current_memory_mb();
1260 let delta_mb = after_mb - before_mb;
1261 if delta_mb > 50.0 && memory_profiler::is_enabled() {
1262 eprintln!(
1263 "[MEMORY] SPIKE: {} caused +{:.0} MB (now {:.0} MB) - body={} lines",
1264 function_name,
1265 delta_mb,
1266 after_mb,
1267 function.body.len()
1268 );
1269 }
1270 }
1271 }
1272
1273 Ok(())
1274 }
1275
1276 /// Get summary for a function
1277 pub fn get_summary(&self, function_name: &str) -> Option<&FunctionSummary> {
1278 self.summaries.get(function_name)
1279 }
1280
1281 /// Print summary statistics
1282 pub fn print_statistics(&self) {
1283 println!("Inter-Procedural Analysis Statistics:");
1284 println!(" Total functions: {}", self.summaries.len());
1285
1286 let functions_with_sources = self
1287 .summaries
1288 .values()
1289 .filter(|s| !matches!(s.return_taint, ReturnTaint::Clean))
1290 .count();
1291 println!(" Functions with sources: {}", functions_with_sources);
1292
1293 let functions_with_sinks = self
1294 .summaries
1295 .values()
1296 .filter(|s| {
1297 s.sink_parameters.len() > 0
1298 || s.propagation_rules
1299 .iter()
1300 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1301 })
1302 .count();
1303 println!(" Functions with sinks: {}", functions_with_sinks);
1304
1305 let functions_with_sanitization = self
1306 .summaries
1307 .values()
1308 .filter(|s| {
1309 s.propagation_rules
1310 .iter()
1311 .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
1312 })
1313 .count();
1314 println!(
1315 " Functions with sanitization: {}",
1316 functions_with_sanitization
1317 );
1318 }
1319
1320 /// Detect inter-procedural taint flows (cached after first call)
1321 pub fn detect_inter_procedural_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1322 // Check if we have cached flows
1323 {
1324 let cached = self.cached_flows.borrow();
1325 if let Some(flows) = cached.as_ref() {
1326 return flows.clone();
1327 }
1328 }
1329
1330 // Compute flows
1331 let flows = self.compute_inter_procedural_flows(package);
1332
1333 // Cache the result
1334 *self.cached_flows.borrow_mut() = Some(flows.clone());
1335
1336 flows
1337 }
1338
1339 /// Internal: compute inter-procedural taint flows (called once)
1340 fn compute_inter_procedural_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1341 use crate::memory_profiler;
1342 memory_profiler::checkpoint("IPA: Computing inter-procedural flows");
1343
1344 let mut flows = Vec::new();
1345
1346 let num_summaries = self.summaries.len();
1347 let mut processed = 0;
1348
1349 // For each function with REAL sources, try to find paths to sinks
1350 for (source_func, source_summary) in &self.summaries {
1351 processed += 1;
1352
1353 // Use configurable limit from IpaConfig
1354 if flows.len() >= self.config.max_total_flows {
1355 eprintln!("Note: IPA flow limit reached ({} flows). Some inter-procedural flows may not be reported.", self.config.max_total_flows);
1356 break;
1357 }
1358
1359 // Progress logging every 10%
1360 if memory_profiler::is_enabled() && processed % (num_summaries / 10).max(1) == 0 {
1361 memory_profiler::checkpoint_with_context(
1362 "IPA flow computation",
1363 &format!(
1364 "{}% ({}/{}), {} flows",
1365 processed * 100 / num_summaries,
1366 processed,
1367 num_summaries,
1368 flows.len()
1369 ),
1370 );
1371 }
1372
1373 // Case 1: Function has internal vulnerability (source -> sink within function)
1374 if source_summary.has_internal_vulnerability {
1375 // If this is a closure, report the flow for the parent function
1376 let reported_source =
1377 if let Some(closure) = self.closure_registry.get_closure(source_func) {
1378 closure.parent_function.clone()
1379 } else {
1380 source_func.clone()
1381 };
1382
1383 flows.push(TaintPath {
1384 source_function: reported_source,
1385 sink_function: source_func.clone(),
1386 sink_type: "internal_sink".to_string(),
1387 call_chain: vec![source_func.clone()],
1388 source_type: "environment".to_string(),
1389 sanitized: false,
1390 });
1391 }
1392
1393 // Case 2: Function returns tainted data (source -> return)
1394 // Only start from functions that have actual sources (not just propagation)
1395 if matches!(source_summary.return_taint, ReturnTaint::FromSource { .. }) {
1396 // This function has a real taint source
1397 // Find all functions that call it
1398 if self.call_graph.nodes.get(source_func).is_some() {
1399 // Explore paths from this source
1400 let all_flows = self.find_paths_from_source(
1401 source_func,
1402 &source_summary.return_taint,
1403 vec![source_func.clone()],
1404 &mut HashSet::new(),
1405 );
1406
1407 // Filter out intra-procedural flows (same source and sink)
1408 // Those should be caught by Phase 2's analysis
1409 // UNLESS it's a complex flow that Phase 2 missed but Phase 3 caught via internal vulnerability check
1410 for flow in all_flows {
1411 if flow.source_function != flow.sink_function || flow.call_chain.len() > 1 {
1412 flows.push(flow);
1413 }
1414 }
1415 }
1416 }
1417 }
1418
1419 // Phase 3.4: Filter false positives by checking for validation patterns
1420 flows = self.filter_false_positives(flows);
1421
1422 // Phase 3.5.2: Add flows from closures with tainted captures
1423 let closure_flows = self.detect_closure_taint_flows(package);
1424 flows.extend(closure_flows);
1425
1426 memory_profiler::checkpoint_with_context(
1427 "IPA: Computed flows",
1428 &format!("{} flows", flows.len()),
1429 );
1430
1431 flows
1432 }
1433
1434 /// Phase 3.5.2: Detect taint flows through closures
1435 /// Closures capture variables from parent functions - if captured var is tainted
1436 /// and closure has a sink, that's an interprocedural flow
1437 fn detect_closure_taint_flows(&self, package: &MirPackage) -> Vec<TaintPath> {
1438 let mut flows: Vec<TaintPath> = Vec::new();
1439
1440 // Build function map for looking up MIR bodies
1441 let function_map: HashMap<String, &MirFunction> = package
1442 .functions
1443 .iter()
1444 .map(|f| (f.name.clone(), f))
1445 .collect();
1446
1447 // Source patterns that indicate tainted input
1448 let source_patterns = [
1449 "env::args",
1450 "std::env::args",
1451 "::args()",
1452 "env::var",
1453 "std::env::var",
1454 "stdin",
1455 "read_line",
1456 "read_to_string",
1457 "HttpRequest",
1458 "request",
1459 "body()",
1460 "serde_json::from",
1461 "serde::Deserialize",
1462 ];
1463
1464 // Debug: print all closures being analyzed
1465 let all_closures = self.closure_registry.get_all_closures();
1466
1467 for closure_info in all_closures {
1468 // Skip if already found flow for this closure
1469 if flows.iter().any(|f| f.sink_function == closure_info.name) {
1470 continue;
1471 }
1472
1473 // Check if parent function has a taint source via return value
1474 let parent_has_source = self
1475 .summaries
1476 .get(&closure_info.parent_function)
1477 .map(|s| matches!(s.return_taint, ReturnTaint::FromSource { .. }))
1478 .unwrap_or(false);
1479
1480 // Check if parent function CALLS a source (not just returns it)
1481 // This is the key for closures - parent may use source locally without returning
1482 let parent_calls_source = self
1483 .call_graph
1484 .nodes
1485 .get(&closure_info.parent_function)
1486 .map(|node| {
1487 let result = node.callees.iter().any(|callee| {
1488 source_patterns
1489 .iter()
1490 .any(|pat| callee.callee.contains(pat))
1491 });
1492 if closure_info.name.contains("test_closure") {
1493 // eprintln!("[DEBUG] parent_calls_source: {}", result);
1494 // eprintln!("[DEBUG] parent callees: {:?}", node.callees.iter().map(|c| &c.callee).collect::<Vec<_>>());
1495 }
1496 result
1497 })
1498 .unwrap_or_else(|| {
1499 if closure_info.name.contains("test_closure") {
1500 // eprintln!("[DEBUG] parent '{}' NOT found in call_graph", closure_info.parent_function);
1501 }
1502 false
1503 });
1504
1505 // Also check if parent function body contains source patterns
1506 let parent_has_source_in_body = self
1507 .summaries
1508 .get(&closure_info.parent_function)
1509 .map(|summary| {
1510 // Check if the parent function's summary contains any source
1511 matches!(summary.return_taint, ReturnTaint::FromSource { .. })
1512 })
1513 .unwrap_or(false);
1514
1515 // Method 1: Use closure registry's taint detection
1516 if closure_info.has_tainted_captures() {
1517 if let Some(summary) = self.summaries.get(&closure_info.name) {
1518 let sink_type = summary.propagation_rules.iter().find_map(|r| {
1519 if let TaintPropagation::ParamToSink { sink_type, .. } = r {
1520 Some(sink_type.clone())
1521 } else {
1522 None
1523 }
1524 });
1525
1526 if let Some(sink_type) = sink_type {
1527 for capture in &closure_info.captured_vars {
1528 if let crate::dataflow::closure::TaintState::Tainted {
1529 source_type,
1530 ..
1531 } = &capture.taint_state
1532 {
1533 flows.push(TaintPath {
1534 source_function: closure_info.parent_function.clone(),
1535 sink_function: closure_info.name.clone(),
1536 call_chain: vec![
1537 closure_info.parent_function.clone(),
1538 closure_info.name.clone(),
1539 ],
1540 source_type: source_type.clone(),
1541 sink_type: sink_type.clone(),
1542 sanitized: false,
1543 });
1544 break;
1545 }
1546 }
1547 continue;
1548 }
1549 }
1550 }
1551
1552 // Method 2: Direct body pattern matching (fallback)
1553 // Check if parent has source (via return OR via calling a source) and closure has command sink
1554 if parent_has_source || parent_has_source_in_body || parent_calls_source {
1555 // Check closure body for command execution
1556 if let Some(node) = self.call_graph.nodes.get(&closure_info.name) {
1557 let has_command_callee = node.callees.iter().any(|c| {
1558 c.callee.contains("Command")
1559 || c.callee.contains("spawn")
1560 || c.callee.contains("output")
1561 || c.callee.contains("process")
1562 });
1563
1564 if has_command_callee && !closure_info.captured_vars.is_empty() {
1565 flows.push(TaintPath {
1566 source_function: closure_info.parent_function.clone(),
1567 sink_function: closure_info.name.clone(),
1568 call_chain: vec![
1569 closure_info.parent_function.clone(),
1570 closure_info.name.clone(),
1571 ],
1572 source_type: "environment".to_string(),
1573 sink_type: "command_execution".to_string(),
1574 sanitized: false,
1575 });
1576 continue;
1577 }
1578 }
1579 }
1580
1581 // Method 3: Check for closures with captured variables where parent calls source
1582 // This catches cases even without full taint tracking
1583 if !closure_info.captured_vars.is_empty() && parent_calls_source {
1584 // Check if the closure has command-related callees
1585 if let Some(closure_node) = self.call_graph.nodes.get(&closure_info.name) {
1586 let closure_has_sink = closure_node.callees.iter().any(|c| {
1587 let name_lower = c.callee.to_lowercase();
1588 name_lower.contains("command")
1589 || name_lower.contains("spawn")
1590 || name_lower.contains("shell")
1591 || name_lower.contains("exec")
1592 });
1593
1594 if closure_has_sink {
1595 flows.push(TaintPath {
1596 source_function: closure_info.parent_function.clone(),
1597 sink_function: closure_info.name.clone(),
1598 call_chain: vec![
1599 closure_info.parent_function.clone(),
1600 closure_info.name.clone(),
1601 ],
1602 source_type: "environment".to_string(),
1603 sink_type: "command_execution".to_string(),
1604 sanitized: false,
1605 });
1606 }
1607 }
1608 }
1609
1610 // Method 4: Analyze closure body directly for captured variable → command flow
1611 // This works even when parent function is inlined/optimized away
1612 // Check for patterns like:
1613 // debug tainted => (*((*_1).0: ... (captured variable with suggestive name)
1614 // _X = Command::arg(... copy _Y...) where _Y is from captured data
1615 if let Some(closure_function) = function_map.get(&closure_info.name) {
1616 let body_str = closure_function.body.join("\n");
1617
1618 // Check if closure has command sink in its body
1619 let has_command_sink = body_str.contains("Command::")
1620 || body_str.contains("::spawn(")
1621 || body_str.contains("::output(");
1622
1623 if has_command_sink {
1624 // Check for captured variables with suggestive names indicating user input
1625 // Pattern: debug <name> => (*((*_1)... indicates captured variable
1626 let has_tainted_capture = body_str.contains("debug tainted") ||
1627 body_str.contains("debug user") ||
1628 body_str.contains("debug input") ||
1629 body_str.contains("debug cmd") ||
1630 body_str.contains("debug command") ||
1631 body_str.contains("debug arg") ||
1632 // Also check if _1 (the closure capture) is used in Command::arg
1633 (body_str.contains("(*_1)") && body_str.contains("Command::arg"));
1634
1635 if has_tainted_capture {
1636 flows.push(TaintPath {
1637 source_function: closure_info.parent_function.clone(),
1638 sink_function: closure_info.name.clone(),
1639 call_chain: vec![
1640 closure_info.parent_function.clone(),
1641 closure_info.name.clone(),
1642 ],
1643 source_type: "captured_variable".to_string(),
1644 sink_type: "command_execution".to_string(),
1645 sanitized: false,
1646 });
1647 }
1648 }
1649 }
1650 }
1651
1652 flows
1653 }
1654
1655 /// Phase 3.4: Filter false positives from detected flows
1656 /// Identifies patterns that indicate sanitization even when not in the direct call chain
1657 fn filter_false_positives(&self, flows: Vec<TaintPath>) -> Vec<TaintPath> {
1658 flows
1659 .into_iter()
1660 .filter(|flow| {
1661 // Check each function in the call chain
1662 for func_name in &flow.call_chain {
1663 if let Some(node) = self.call_graph.nodes.get(func_name) {
1664 // Pattern 1: Function has BOTH source and (direct or indirect) sink
1665 let is_source_func = func_name == &flow.source_function;
1666 let returns_source = self
1667 .summaries
1668 .get(func_name)
1669 .map(|summary| {
1670 matches!(summary.return_taint, ReturnTaint::FromSource { .. })
1671 })
1672 .unwrap_or(false);
1673
1674 let has_source = is_source_func || returns_source;
1675
1676 // Check if this function has a direct sink
1677 let has_direct_sink = self
1678 .summaries
1679 .get(func_name)
1680 .map(|summary| {
1681 summary
1682 .propagation_rules
1683 .iter()
1684 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1685 })
1686 .unwrap_or(false);
1687
1688 // Check if this function calls something that has a sink
1689 let calls_sink_function = node.callees.iter().any(|callee_site| {
1690 if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1691 callee_summary
1692 .propagation_rules
1693 .iter()
1694 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }))
1695 } else {
1696 false
1697 }
1698 });
1699
1700 let has_sink = has_direct_sink || calls_sink_function;
1701
1702 if has_source && has_sink {
1703 // This function gets tainted data and (directly or indirectly) executes it
1704 // Check if it has validation guards protecting the sink
1705
1706 // PHASE 3.4 CONSERVATIVE FILTER:
1707 // Only filter if we detect BOTH:
1708 // 1. A validator call (is_safe, validate, etc.)
1709 // 2. Evidence that validator protects the sink (guard pattern)
1710 //
1711 // This avoids filtering cases like test_partial_sanitization where
1712 // one branch calls the validator but another branch doesn't.
1713
1714 let calls_validator = node.callees.iter().any(|callee| {
1715 let callee_lower = callee.callee.to_lowercase();
1716 callee_lower.contains("is_safe")
1717 || callee_lower.contains("is_valid")
1718 });
1719
1720 // More restrictive: only filter if validator is in guard pattern (is_safe_, is_valid_)
1721 // These are typically used in if-conditions that protect the sink
1722 // Avoid filtering validate_/sanitize_ which might be on only one branch
1723
1724 if calls_validator {
1725 // Function uses a validation guard - likely a false positive
1726 return false; // Filter out this flow
1727 }
1728 }
1729 }
1730 }
1731
1732 // Flow passed all filters - keep it
1733 true
1734 })
1735 .collect()
1736 }
1737
1738 /// Find taint paths starting from a source function.
1739 ///
1740 /// # Memory Safety Limits
1741 ///
1742 /// This function uses configurable limits from `IpaConfig` to prevent memory exhaustion.
1743 /// These limits may cause false negatives in extreme cases. See README.md for details.
1744 ///
1745 /// To increase limits, pass a custom `IpaConfig` via `InterProceduralAnalysis::with_config()`,
1746 /// or use a `cargo-cola.yaml` configuration file.
1747 ///
1748 /// Without these limits, analysis of dense call graphs (e.g., InfluxDB with 11K functions)
1749 /// would require 60GB+ RAM due to exponential path exploration.
1750 fn find_paths_from_source(
1751 &self,
1752 current_func: &str,
1753 taint: &ReturnTaint,
1754 path: Vec<String>,
1755 visited: &mut HashSet<String>,
1756 ) -> Vec<TaintPath> {
1757 let mut flows = Vec::new();
1758
1759 // Use configurable limit from IpaConfig
1760 if path.len() > self.config.max_path_depth {
1761 return flows;
1762 }
1763
1764 // Use configurable limit from IpaConfig
1765 if visited.len() >= self.config.max_visited {
1766 return flows;
1767 }
1768
1769 // Avoid infinite recursion - once visited, never revisit
1770 if visited.contains(current_func) {
1771 return flows;
1772 }
1773 visited.insert(current_func.to_string());
1774
1775 // Check if path is sanitized (any function in path has ParamSanitized rule)
1776 let is_sanitized = self.path_is_sanitized(&path);
1777
1778 // NEW: Also check if current function calls a sanitization helper
1779 // This catches patterns like: let safe = validate(&tainted); use(safe);
1780 let calls_sanitizer = if let Some(node) = self.call_graph.nodes.get(current_func) {
1781 node.callees.iter().any(|callee_site| {
1782 if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1783 // Check if callee has sanitization
1784 callee_summary
1785 .propagation_rules
1786 .iter()
1787 .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
1788 } else {
1789 false
1790 }
1791 })
1792 } else {
1793 false
1794 };
1795
1796 let effective_sanitized = is_sanitized || calls_sanitizer;
1797
1798 // Get the current function's node and summary
1799 if let Some(node) = self.call_graph.nodes.get(current_func) {
1800 // Check if current function has a sink
1801 if let Some(summary) = self.summaries.get(current_func) {
1802 // Does this function have a sink that the taint can reach?
1803 for rule in &summary.propagation_rules {
1804 if let TaintPropagation::ParamToSink { sink_type, .. } = rule {
1805 // Taint reaches a sink!
1806 flows.push(TaintPath {
1807 source_function: path[0].clone(),
1808 sink_function: current_func.to_string(),
1809 call_chain: path.clone(),
1810 source_type: Self::extract_source_type(taint),
1811 sink_type: sink_type.clone(),
1812 sanitized: effective_sanitized,
1813 });
1814 } else if let TaintPropagation::ParamSanitized(_) = rule {
1815 // Taint is sanitized - we already track this above
1816 continue;
1817 }
1818 }
1819 }
1820
1821 // NEW: If current function doesn't have a direct sink, check what it calls
1822 // This enables N-level detection: source() -> caller() -> sink_function()
1823 // Only check direct callees, not recursive (avoid explosion)
1824 if !flows.iter().any(|f| f.sink_function == current_func) {
1825 // Current function doesn't have a sink, check its callees
1826 for callee_site in &node.callees {
1827 if let Some(callee_summary) = self.summaries.get(&callee_site.callee) {
1828 // Check if this callee sanitizes
1829 let callee_sanitizes = callee_summary
1830 .propagation_rules
1831 .iter()
1832 .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)));
1833
1834 // Does this callee have a sink?
1835 let has_sink = callee_summary
1836 .propagation_rules
1837 .iter()
1838 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
1839
1840 if has_sink {
1841 // Found a flow through callee
1842 let mut extended_path = path.clone();
1843 extended_path.push(callee_site.callee.clone());
1844
1845 let sink_type = callee_summary
1846 .propagation_rules
1847 .iter()
1848 .find_map(|r| match r {
1849 TaintPropagation::ParamToSink { sink_type, .. } => {
1850 Some(sink_type.clone())
1851 }
1852 _ => None,
1853 })
1854 .unwrap_or_else(|| "unknown_sink".to_string());
1855
1856 flows.push(TaintPath {
1857 source_function: path[0].clone(),
1858 sink_function: callee_site.callee.clone(),
1859 call_chain: extended_path.clone(),
1860 source_type: Self::extract_source_type(taint),
1861 sink_type,
1862 // Sanitized if either path so far is sanitized OR this callee sanitizes OR calling function has sanitization
1863 sanitized: effective_sanitized
1864 || callee_sanitizes
1865 || self.path_is_sanitized(&extended_path),
1866 });
1867 }
1868 }
1869 }
1870 }
1871
1872 // Explore callers of this function (functions that call current_func)
1873 // Key insight: the caller receives tainted data by calling current_func
1874 // If the caller has a filesystem sink, the taint may reach it
1875 for caller in &node.callers {
1876 let mut new_path = path.clone();
1877 new_path.push(caller.clone());
1878
1879 // Check if the CALLER itself has a filesystem sink
1880 // This handles the pattern: caller() { let x = source_fn(); sink(x); }
1881 if self.call_graph.nodes.contains_key(caller) {
1882 if let Some(caller_summary) = self.summaries.get(caller) {
1883 // Check if caller has a filesystem sink in its propagation rules
1884 // OR if it has any ParamToSink (which was set when analyzing the function)
1885 let has_filesystem_sink = caller_summary.propagation_rules.iter()
1886 .any(|r| matches!(r, TaintPropagation::ParamToSink { sink_type, .. } if sink_type == "filesystem"));
1887
1888 let has_any_sink = caller_summary
1889 .propagation_rules
1890 .iter()
1891 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
1892
1893 // Check if caller has internal vulnerability
1894 // This handles cases where caller consumes the source and sinks it locally
1895 // (possibly via ParamToParam propagation in a helper)
1896 let has_internal = caller_summary.has_internal_vulnerability;
1897
1898 if has_filesystem_sink || has_any_sink || has_internal {
1899 // Caller has a sink and receives tainted data from current_func
1900 let sink_type = if has_internal {
1901 "internal_sink".to_string()
1902 } else {
1903 caller_summary
1904 .propagation_rules
1905 .iter()
1906 .find_map(|r| match r {
1907 TaintPropagation::ParamToSink { sink_type, .. } => {
1908 Some(sink_type.clone())
1909 }
1910 _ => None,
1911 })
1912 .unwrap_or_else(|| "unknown".to_string())
1913 };
1914
1915 flows.push(TaintPath {
1916 source_function: path[0].clone(),
1917 sink_function: caller.clone(),
1918 call_chain: new_path.clone(),
1919 source_type: Self::extract_source_type(taint),
1920 sink_type,
1921 sanitized: effective_sanitized,
1922 });
1923
1924 // Early exit if we have too many flows
1925 if flows.len() >= self.config.max_flows_per_source {
1926 return flows;
1927 }
1928 }
1929 }
1930 }
1931
1932 // Early exit if we have too many flows
1933 if flows.len() >= self.config.max_flows_per_source {
1934 return flows;
1935 }
1936
1937 // Recursively explore from the caller
1938 let sub_flows = self.find_paths_from_source(caller, taint, new_path, visited);
1939
1940 // Only add flows if we haven't hit the limit
1941 let remaining = self.config.max_flows_per_source.saturating_sub(flows.len());
1942 flows.extend(sub_flows.into_iter().take(remaining));
1943
1944 if flows.len() >= self.config.max_flows_per_source {
1945 return flows;
1946 }
1947 }
1948 }
1949
1950 // NOTE: We do NOT remove from visited here - this prevents exponential
1951 // path exploration in complex call graphs. Each function is visited once
1952 // per source function exploration. Combined with max_path_depth, this
1953 // ensures O(n) complexity instead of O(branches^depth).
1954
1955 flows
1956 }
1957
1958 #[allow(dead_code)]
1959 /// Explore callees of a function that propagates taint to find eventual sinks.
1960 /// This enables N-level detection by following taint through intermediate propagators.
1961 ///
1962 /// Example: source() -> caller() -> propagator() -> sink()
1963 /// We're at 'caller', which calls 'propagator' (which propagates).
1964 /// We need to check if 'propagator' calls 'sink'.
1965 ///
1966 /// CURRENTLY DISABLED: Causes performance issues, needs better algorithm
1967 fn explore_callees_for_sinks(
1968 &self,
1969 current_func: &str,
1970 source_taint: &ReturnTaint,
1971 path: Vec<String>,
1972 visited: &mut HashSet<String>,
1973 ) -> Vec<TaintPath> {
1974 let mut flows = Vec::new();
1975
1976 // Debug: limit recursion depth
1977 if path.len() > 10 {
1978 eprintln!(
1979 "WARNING: Path too deep ({}), stopping exploration",
1980 path.len()
1981 );
1982 return flows;
1983 }
1984
1985 // Get the call graph node for the current function
1986 let Some(node) = self.call_graph.nodes.get(current_func) else {
1987 return flows;
1988 };
1989
1990 // Explore each function that current_func calls
1991 for callee_site in &node.callees {
1992 let callee_name = &callee_site.callee;
1993
1994 // Avoid infinite loops
1995 if visited.contains(callee_name) {
1996 continue;
1997 }
1998
1999 let Some(callee_summary) = self.summaries.get(callee_name) else {
2000 continue;
2001 };
2002
2003 // Check if this callee has a sink
2004 let callee_has_sink = callee_summary
2005 .propagation_rules
2006 .iter()
2007 .any(|r| matches!(r, TaintPropagation::ParamToSink { .. }));
2008
2009 if callee_has_sink {
2010 // Found a direct sink - create flow
2011 let mut extended_path = path.clone();
2012 extended_path.push(callee_name.clone());
2013
2014 // Extract sink type from the sink rule
2015 let sink_type = callee_summary
2016 .propagation_rules
2017 .iter()
2018 .find_map(|r| match r {
2019 TaintPropagation::ParamToSink { sink_type, .. } => Some(sink_type.clone()),
2020 _ => None,
2021 })
2022 .unwrap_or_else(|| "unknown_sink".to_string());
2023
2024 flows.push(TaintPath {
2025 source_function: path[0].clone(),
2026 sink_function: callee_name.clone(),
2027 call_chain: extended_path,
2028 source_type: Self::extract_source_type(source_taint),
2029 sink_type,
2030 sanitized: false,
2031 });
2032 } else if matches!(callee_summary.return_taint, ReturnTaint::FromParameter(_)) {
2033 // This callee also propagates - explore its callees recursively
2034 let mut extended_path = path.clone();
2035 extended_path.push(callee_name.clone());
2036
2037 visited.insert(callee_name.clone());
2038 flows.extend(self.explore_callees_for_sinks(
2039 callee_name,
2040 source_taint,
2041 extended_path,
2042 visited,
2043 ));
2044 visited.remove(callee_name);
2045 }
2046 }
2047
2048 flows
2049 }
2050
2051 /// Check if any function in the path sanitizes its input
2052 fn path_is_sanitized(&self, path: &[String]) -> bool {
2053 path.iter().any(|func_name| {
2054 if let Some(summary) = self.summaries.get(func_name) {
2055 summary
2056 .propagation_rules
2057 .iter()
2058 .any(|r| matches!(r, TaintPropagation::ParamSanitized(_)))
2059 } else {
2060 false
2061 }
2062 })
2063 }
2064
2065 /// Extract source type from ReturnTaint
2066 fn extract_source_type(taint: &ReturnTaint) -> String {
2067 match taint {
2068 ReturnTaint::FromSource { source_type } => source_type.clone(),
2069 ReturnTaint::FromParameter(_) => "parameter".to_string(),
2070 ReturnTaint::Merged(taints) => {
2071 // Take first source type from merged
2072 if let Some(first) = taints.first() {
2073 Self::extract_source_type(first)
2074 } else {
2075 "unknown".to_string()
2076 }
2077 }
2078 ReturnTaint::Clean => "clean".to_string(),
2079 }
2080 }
2081}
2082
2083/// Represents a complete taint flow from source to sink
2084#[derive(Debug, Clone)]
2085pub struct TaintPath {
2086 /// Function where taint originates
2087 pub source_function: String,
2088
2089 /// Function where taint reaches a sink
2090 pub sink_function: String,
2091
2092 /// Complete call chain: [source, caller1, caller2, ..., sink]
2093 pub call_chain: Vec<String>,
2094
2095 /// Type of taint source
2096 pub source_type: String,
2097
2098 /// Type of sink
2099 pub sink_type: String,
2100
2101 /// Whether the taint was sanitized along the path
2102 pub sanitized: bool,
2103}
2104
2105impl TaintPath {
2106 /// Create a human-readable description of this taint flow
2107 pub fn describe(&self) -> String {
2108 let chain = self.call_chain.join(" → ");
2109 let sanitized_note = if self.sanitized {
2110 " [SANITIZED - SAFE]"
2111 } else {
2112 ""
2113 };
2114 format!(
2115 "Tainted data from {} (source: {}) flows through {} to {} (sink: {}){}",
2116 self.source_function,
2117 self.source_type,
2118 chain,
2119 self.sink_function,
2120 self.sink_type,
2121 sanitized_note
2122 )
2123 }
2124
2125 /// Get the number of levels in the call chain
2126 pub fn depth(&self) -> usize {
2127 self.call_chain.len()
2128 }
2129}
2130
2131#[cfg(test)]
2132mod tests {
2133 use super::*;
2134
2135 #[test]
2136 fn test_extract_function_name() {
2137 assert_eq!(
2138 CallGraph::extract_function_name("std::process::Command::new"),
2139 "new"
2140 );
2141
2142 assert_eq!(
2143 CallGraph::extract_function_name("my_function"),
2144 "my_function"
2145 );
2146
2147 assert_eq!(
2148 CallGraph::extract_function_name("const my_crate::util::helper"),
2149 "helper"
2150 );
2151 }
2152
2153 #[test]
2154 fn test_is_builtin_operation() {
2155 assert!(CallGraph::is_builtin_operation("assert!"));
2156 assert!(CallGraph::is_builtin_operation("println!"));
2157 assert!(CallGraph::is_builtin_operation("_internal"));
2158 assert!(!CallGraph::is_builtin_operation("my_function"));
2159 }
2160
2161 #[test]
2162 fn test_function_summary_creation() {
2163 let summary = FunctionSummary::new("test_function".to_string());
2164 assert_eq!(summary.function_name, "test_function");
2165 assert!(summary.source_parameters.is_empty());
2166 assert!(summary.sink_parameters.is_empty());
2167 assert!(summary.propagation_rules.is_empty());
2168 assert!(matches!(summary.return_taint, ReturnTaint::Clean));
2169 }
2170
2171 #[test]
2172 fn test_call_site_creation() {
2173 let site = CallSite {
2174 callee: "execute_command".to_string(),
2175 resolved_targets: Vec::new(),
2176 location: "test.rs:42".to_string(),
2177 arg_count: 1,
2178 };
2179 assert_eq!(site.callee, "execute_command");
2180 assert_eq!(site.location, "test.rs:42");
2181 assert_eq!(site.arg_count, 1);
2182 }
2183
2184 #[test]
2185 fn test_taint_propagation_patterns() {
2186 let param_to_return = TaintPropagation::ParamToReturn(0);
2187 let param_to_param = TaintPropagation::ParamToParam { from: 0, to: 1 };
2188 let param_to_sink = TaintPropagation::ParamToSink {
2189 param: 0,
2190 sink_type: "command".to_string(),
2191 };
2192 let param_sanitized = TaintPropagation::ParamSanitized(0);
2193
2194 // Test that patterns are distinct
2195 assert!(matches!(
2196 param_to_return,
2197 TaintPropagation::ParamToReturn(_)
2198 ));
2199 assert!(matches!(
2200 param_to_param,
2201 TaintPropagation::ParamToParam { .. }
2202 ));
2203 assert!(matches!(
2204 param_to_sink,
2205 TaintPropagation::ParamToSink { .. }
2206 ));
2207 assert!(matches!(
2208 param_sanitized,
2209 TaintPropagation::ParamSanitized(_)
2210 ));
2211 }
2212
2213 #[test]
2214 fn test_return_taint_patterns() {
2215 let clean = ReturnTaint::Clean;
2216 let from_param = ReturnTaint::FromParameter(0);
2217 let from_source = ReturnTaint::FromSource {
2218 source_type: "env".to_string(),
2219 };
2220 let merged = ReturnTaint::Merged(vec![
2221 ReturnTaint::FromSource {
2222 source_type: "env".to_string(),
2223 },
2224 ReturnTaint::FromSource {
2225 source_type: "file".to_string(),
2226 },
2227 ]);
2228
2229 // Test that patterns are distinct
2230 assert!(matches!(clean, ReturnTaint::Clean));
2231 assert!(matches!(from_param, ReturnTaint::FromParameter(_)));
2232 assert!(matches!(from_source, ReturnTaint::FromSource { .. }));
2233 assert!(matches!(merged, ReturnTaint::Merged(_)));
2234 }
2235}