Skip to main content

agentshield/ir/
taint_builder.rs

1//! Builds a populated `DataSurface` from parsed tool definitions and execution surfaces.
2//!
3//! Called by each adapter after merging `ParsedFile` results into `ExecutionSurface`
4//! and `ToolSurface`. Constructs taint sources, sinks, and 1-hop taint paths.
5
6use std::path::PathBuf;
7
8use super::data_surface::*;
9use super::execution_surface::*;
10use super::tool_surface::ToolSurface;
11use super::ArgumentSource;
12use super::SourceLocation;
13
14/// Build a `DataSurface` from tool definitions and execution surface.
15///
16/// Extracts taint sources (tool parameters, env vars), sinks (process exec,
17/// HTTP requests, file writes, dynamic eval), and connects them with 1-hop
18/// taint paths when an operation uses a tainted argument.
19pub fn build_data_surface(tools: &[ToolSurface], execution: &ExecutionSurface) -> DataSurface {
20    let sources = collect_sources(tools, execution);
21    let sinks = collect_sinks(execution);
22    let taint_paths = build_taint_paths(&sources, execution);
23
24    DataSurface {
25        sources,
26        sinks,
27        taint_paths,
28    }
29}
30
31/// Collect taint sources from tool input schemas and environment accesses.
32fn collect_sources(tools: &[ToolSurface], execution: &ExecutionSurface) -> Vec<TaintSource> {
33    let mut sources = Vec::new();
34
35    // Sources from tool input parameters
36    for tool in tools {
37        let location = tool.defined_at.clone().unwrap_or_else(|| SourceLocation {
38            file: PathBuf::from("<unknown>"),
39            line: 0,
40            column: 0,
41            end_line: None,
42            end_column: None,
43        });
44
45        if let Some(ref schema) = tool.input_schema {
46            if let Some(props) = schema.get("properties").and_then(|v| v.as_object()) {
47                for param_name in props.keys() {
48                    sources.push(TaintSource {
49                        source_type: TaintSourceType::ToolArgument,
50                        description: format!("Tool '{}' parameter '{}'", tool.name, param_name),
51                        location: location.clone(),
52                    });
53                }
54            }
55        }
56    }
57
58    // Sources from environment variable accesses
59    for env in &execution.env_accesses {
60        let var_desc = match &env.var_name {
61            ArgumentSource::Literal(name) => name.clone(),
62            ArgumentSource::EnvVar { name } => name.clone(),
63            ArgumentSource::Parameter { name } => format!("(dynamic: {})", name),
64            _ => "(dynamic)".to_string(),
65        };
66        sources.push(TaintSource {
67            source_type: TaintSourceType::EnvVariable,
68            description: format!("Environment variable '{}'", var_desc),
69            location: env.location.clone(),
70        });
71    }
72
73    sources
74}
75
76/// Collect taint sinks from execution surface operations.
77fn collect_sinks(execution: &ExecutionSurface) -> Vec<TaintSink> {
78    let mut sinks = Vec::new();
79
80    for cmd in &execution.commands {
81        sinks.push(TaintSink {
82            sink_type: TaintSinkType::ProcessExec,
83            description: format!("Process execution via {}", cmd.function),
84            location: cmd.location.clone(),
85        });
86    }
87
88    for net in &execution.network_operations {
89        sinks.push(TaintSink {
90            sink_type: TaintSinkType::HttpRequest,
91            description: format!("HTTP request via {}", net.function),
92            location: net.location.clone(),
93        });
94    }
95
96    for file_op in &execution.file_operations {
97        if matches!(file_op.operation, FileOpType::Write) {
98            sinks.push(TaintSink {
99                sink_type: TaintSinkType::FileWrite,
100                description: "File write operation".to_string(),
101                location: file_op.location.clone(),
102            });
103        }
104    }
105
106    for dyn_exec in &execution.dynamic_exec {
107        sinks.push(TaintSink {
108            sink_type: TaintSinkType::DynamicEval,
109            description: format!("Dynamic code execution via {}", dyn_exec.function),
110            location: dyn_exec.location.clone(),
111        });
112    }
113
114    sinks
115}
116
117/// Build 1-hop taint paths connecting sources to sinks via tainted arguments.
118///
119/// For each operation that uses a tainted `ArgumentSource`, finds or creates
120/// a matching `TaintSource` and connects it to the operation's sink.
121fn build_taint_paths(sources: &[TaintSource], execution: &ExecutionSurface) -> Vec<TaintPath> {
122    let mut paths = Vec::new();
123
124    // Commands with tainted args
125    for cmd in &execution.commands {
126        if cmd.command_arg.is_tainted() {
127            let source = resolve_source(sources, &cmd.command_arg, &cmd.location);
128            paths.push(TaintPath {
129                source,
130                sink: TaintSink {
131                    sink_type: TaintSinkType::ProcessExec,
132                    description: format!("Process execution via {}", cmd.function),
133                    location: cmd.location.clone(),
134                },
135                through: vec![],
136                confidence: confidence_for_arg(&cmd.command_arg),
137            });
138        }
139    }
140
141    // Network operations with tainted URL args
142    for net in &execution.network_operations {
143        if net.url_arg.is_tainted() {
144            let source = resolve_source(sources, &net.url_arg, &net.location);
145            paths.push(TaintPath {
146                source,
147                sink: TaintSink {
148                    sink_type: TaintSinkType::HttpRequest,
149                    description: format!("HTTP request via {}", net.function),
150                    location: net.location.clone(),
151                },
152                through: vec![],
153                confidence: confidence_for_arg(&net.url_arg),
154            });
155        }
156    }
157
158    // File write operations with tainted path args
159    for file_op in &execution.file_operations {
160        if matches!(file_op.operation, FileOpType::Write) && file_op.path_arg.is_tainted() {
161            let source = resolve_source(sources, &file_op.path_arg, &file_op.location);
162            paths.push(TaintPath {
163                source,
164                sink: TaintSink {
165                    sink_type: TaintSinkType::FileWrite,
166                    description: "File write operation".to_string(),
167                    location: file_op.location.clone(),
168                },
169                through: vec![],
170                confidence: confidence_for_arg(&file_op.path_arg),
171            });
172        }
173    }
174
175    // Dynamic exec with tainted code args
176    for dyn_exec in &execution.dynamic_exec {
177        if dyn_exec.code_arg.is_tainted() {
178            let source = resolve_source(sources, &dyn_exec.code_arg, &dyn_exec.location);
179            paths.push(TaintPath {
180                source,
181                sink: TaintSink {
182                    sink_type: TaintSinkType::DynamicEval,
183                    description: format!("Dynamic code execution via {}", dyn_exec.function),
184                    location: dyn_exec.location.clone(),
185                },
186                through: vec![],
187                confidence: confidence_for_arg(&dyn_exec.code_arg),
188            });
189        }
190    }
191
192    paths
193}
194
195/// Resolve an `ArgumentSource` to a matching `TaintSource` from the collected sources.
196///
197/// If the argument references a known parameter or env var that matches a source,
198/// returns that source. Otherwise, creates a synthetic source for the argument.
199fn resolve_source(
200    sources: &[TaintSource],
201    arg: &ArgumentSource,
202    fallback_location: &SourceLocation,
203) -> TaintSource {
204    match arg {
205        ArgumentSource::Parameter { name } => {
206            // Try to find a matching tool argument source
207            if let Some(found) = sources.iter().find(|s| {
208                s.source_type == TaintSourceType::ToolArgument && s.description.contains(name)
209            }) {
210                return found.clone();
211            }
212            TaintSource {
213                source_type: TaintSourceType::ToolArgument,
214                description: format!("Function parameter '{}'", name),
215                location: fallback_location.clone(),
216            }
217        }
218        ArgumentSource::EnvVar { name } => {
219            if let Some(found) = sources.iter().find(|s| {
220                s.source_type == TaintSourceType::EnvVariable && s.description.contains(name)
221            }) {
222                return found.clone();
223            }
224            TaintSource {
225                source_type: TaintSourceType::EnvVariable,
226                description: format!("Environment variable '{}'", name),
227                location: fallback_location.clone(),
228            }
229        }
230        ArgumentSource::Interpolated => TaintSource {
231            source_type: TaintSourceType::ToolArgument,
232            description: "Interpolated string (potentially user-controlled)".to_string(),
233            location: fallback_location.clone(),
234        },
235        ArgumentSource::Unknown => TaintSource {
236            source_type: TaintSourceType::ToolArgument,
237            description: "Unknown source (could not determine origin)".to_string(),
238            location: fallback_location.clone(),
239        },
240        // Literal and Sanitized are not tainted, so they shouldn't reach here
241        ArgumentSource::Literal(_) | ArgumentSource::Sanitized { .. } => TaintSource {
242            source_type: TaintSourceType::ToolArgument,
243            description: "Unexpected safe source".to_string(),
244            location: fallback_location.clone(),
245        },
246    }
247}
248
249/// Assign confidence based on the argument source type.
250fn confidence_for_arg(arg: &ArgumentSource) -> f32 {
251    match arg {
252        ArgumentSource::Parameter { .. } => 0.9,
253        ArgumentSource::Interpolated => 0.8,
254        ArgumentSource::EnvVar { .. } => 0.7,
255        ArgumentSource::Unknown => 0.5,
256        ArgumentSource::Literal(_) | ArgumentSource::Sanitized { .. } => 0.1,
257    }
258}
259
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::tool_surface::ToolSurface;
    use crate::ir::ArgumentSource;
    use serde_json::json;
    use std::path::PathBuf;

    /// Location at `line` of the synthetic `test.py` file.
    fn make_location(line: usize) -> SourceLocation {
        SourceLocation {
            line,
            file: PathBuf::from("test.py"),
            column: 0,
            end_line: None,
            end_column: None,
        }
    }

    /// Tool defined at line 1 whose input schema declares one string property per entry in `params`.
    fn make_tool(name: &str, params: &[&str]) -> ToolSurface {
        let properties: serde_json::Map<String, serde_json::Value> = params
            .iter()
            .map(|p| (p.to_string(), json!({"type": "string"})))
            .collect();

        ToolSurface {
            name: name.to_string(),
            description: Some("test tool".to_string()),
            input_schema: Some(json!({"properties": properties})),
            output_schema: None,
            declared_permissions: vec![],
            defined_at: Some(make_location(1)),
        }
    }

    /// Argument that originates from the named function parameter.
    fn param(name: &str) -> ArgumentSource {
        ArgumentSource::Parameter {
            name: name.to_string(),
        }
    }

    /// Command invocation of `function` at `line`, fed by `command_arg`.
    fn command(function: &str, command_arg: ArgumentSource, line: usize) -> CommandInvocation {
        CommandInvocation {
            function: function.to_string(),
            command_arg,
            location: make_location(line),
        }
    }

    /// `requests.get` call at `line` that sends no data.
    fn http_get(url_arg: ArgumentSource, line: usize) -> NetworkOperation {
        NetworkOperation {
            function: "requests.get".to_string(),
            url_arg,
            method: Some("GET".to_string()),
            sends_data: false,
            location: make_location(line),
        }
    }

    /// `eval` call at `line`, fed by `code_arg`.
    fn eval_call(code_arg: ArgumentSource, line: usize) -> DynamicExec {
        DynamicExec {
            function: "eval".to_string(),
            code_arg,
            location: make_location(line),
        }
    }

    /// Sensitive environment access with a literal variable name.
    fn secret_env(name: &str, line: usize) -> EnvAccess {
        EnvAccess {
            var_name: ArgumentSource::Literal(name.to_string()),
            is_sensitive: true,
            location: make_location(line),
        }
    }

    /// Assert that `surface` holds exactly one taint path with the expected confidence.
    fn assert_single_path_confidence(surface: &DataSurface, expected: f32) {
        assert_eq!(surface.taint_paths.len(), 1);
        assert!((surface.taint_paths[0].confidence - expected).abs() < f32::EPSILON);
    }

    #[test]
    fn test_sources_from_tool_parameters() {
        let tools = vec![make_tool("run_cmd", &["command", "cwd"])];

        let surface = build_data_surface(&tools, &ExecutionSurface::default());
        let sources = &surface.sources;

        assert_eq!(sources.len(), 2);
        assert!(sources
            .iter()
            .all(|s| s.source_type == TaintSourceType::ToolArgument));
        assert!(sources.iter().any(|s| s.description.contains("command")));
        assert!(sources.iter().any(|s| s.description.contains("cwd")));
    }

    #[test]
    fn test_sources_from_env_accesses() {
        let execution = ExecutionSurface {
            env_accesses: vec![secret_env("API_KEY", 10)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_eq!(surface.sources.len(), 1);
        let env_source = &surface.sources[0];
        assert_eq!(env_source.source_type, TaintSourceType::EnvVariable);
        assert!(env_source.description.contains("API_KEY"));
    }

    #[test]
    fn test_sinks_from_commands() {
        let execution = ExecutionSurface {
            commands: vec![command("subprocess.run", param("cmd"), 5)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_eq!(surface.sinks.len(), 1);
        let sink = &surface.sinks[0];
        assert_eq!(sink.sink_type, TaintSinkType::ProcessExec);
        assert!(sink.description.contains("subprocess.run"));
    }

    #[test]
    fn test_sinks_from_network_operations() {
        let execution = ExecutionSurface {
            network_operations: vec![http_get(ArgumentSource::Interpolated, 8)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_eq!(surface.sinks.len(), 1);
        assert_eq!(surface.sinks[0].sink_type, TaintSinkType::HttpRequest);
    }

    #[test]
    fn test_sinks_from_file_write_only() {
        let read_op = FileOperation {
            operation: FileOpType::Read,
            path_arg: param("path"),
            location: make_location(3),
        };
        let write_op = FileOperation {
            operation: FileOpType::Write,
            path_arg: param("out"),
            location: make_location(7),
        };
        let execution = ExecutionSurface {
            file_operations: vec![read_op, write_op],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        // The read is ignored; only the write becomes a sink.
        assert_eq!(surface.sinks.len(), 1);
        let sink = &surface.sinks[0];
        assert_eq!(sink.sink_type, TaintSinkType::FileWrite);
        assert_eq!(sink.location.line, 7);
    }

    #[test]
    fn test_sinks_from_dynamic_exec() {
        let execution = ExecutionSurface {
            dynamic_exec: vec![eval_call(ArgumentSource::Unknown, 12)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_eq!(surface.sinks.len(), 1);
        assert_eq!(surface.sinks[0].sink_type, TaintSinkType::DynamicEval);
    }

    #[test]
    fn test_taint_path_from_parameter_to_command() {
        let tools = vec![make_tool("exec_tool", &["command"])];
        let execution = ExecutionSurface {
            commands: vec![command("subprocess.run", param("command"), 10)],
            ..Default::default()
        };

        let surface = build_data_surface(&tools, &execution);

        assert_eq!(surface.taint_paths.len(), 1);
        let path = &surface.taint_paths[0];
        assert_eq!(path.source.source_type, TaintSourceType::ToolArgument);
        assert!(path.source.description.contains("command"));
        assert_eq!(path.sink.sink_type, TaintSinkType::ProcessExec);
        assert!((path.confidence - 0.9).abs() < f32::EPSILON);
        assert!(path.through.is_empty());
    }

    #[test]
    fn test_no_taint_path_for_literal() {
        let literal = ArgumentSource::Literal("ls -la".to_string());
        let execution = ExecutionSurface {
            commands: vec![command("subprocess.run", literal, 5)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        // The command is still a sink, but a literal argument carries no taint.
        assert_eq!(surface.sinks.len(), 1);
        assert!(
            surface.taint_paths.is_empty(),
            "literal args should not produce taint paths"
        );
    }

    #[test]
    fn test_no_taint_path_for_sanitized() {
        let sanitized = ArgumentSource::Sanitized {
            sanitizer: "validateCommand".to_string(),
        };
        let execution = ExecutionSurface {
            commands: vec![command("subprocess.run", sanitized, 5)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_eq!(surface.sinks.len(), 1);
        assert!(
            surface.taint_paths.is_empty(),
            "sanitized args should not produce taint paths"
        );
    }

    #[test]
    fn test_interpolated_confidence() {
        let execution = ExecutionSurface {
            network_operations: vec![http_get(ArgumentSource::Interpolated, 15)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_single_path_confidence(&surface, 0.8);
    }

    #[test]
    fn test_envvar_confidence() {
        let from_env = ArgumentSource::EnvVar {
            name: "CMD".to_string(),
        };
        let execution = ExecutionSurface {
            commands: vec![command("os.system", from_env, 3)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_single_path_confidence(&surface, 0.7);
    }

    #[test]
    fn test_unknown_confidence() {
        let execution = ExecutionSurface {
            dynamic_exec: vec![eval_call(ArgumentSource::Unknown, 20)],
            ..Default::default()
        };

        let surface = build_data_surface(&[], &execution);

        assert_single_path_confidence(&surface, 0.5);
    }

    #[test]
    fn test_tool_without_schema_produces_no_sources() {
        let schemaless = ToolSurface {
            name: "no_schema".to_string(),
            description: None,
            input_schema: None,
            output_schema: None,
            declared_permissions: vec![],
            defined_at: None,
        };

        let surface = build_data_surface(&[schemaless], &ExecutionSurface::default());

        assert!(surface.sources.is_empty());
        assert!(surface.sinks.is_empty());
        assert!(surface.taint_paths.is_empty());
    }

    #[test]
    fn test_combined_sources_sinks_paths() {
        let tools = vec![make_tool("fetch", &["url"])];
        let execution = ExecutionSurface {
            commands: vec![command(
                "subprocess.run",
                ArgumentSource::Literal("echo hi".to_string()),
                5,
            )],
            network_operations: vec![http_get(param("url"), 10)],
            env_accesses: vec![secret_env("TOKEN", 2)],
            ..Default::default()
        };

        let surface = build_data_surface(&tools, &execution);

        // One tool parameter plus one environment variable.
        assert_eq!(surface.sources.len(), 2);
        // One command sink plus one network sink.
        assert_eq!(surface.sinks.len(), 2);
        // The command argument is a literal, so only the network call is tainted.
        assert_eq!(surface.taint_paths.len(), 1);
        assert_eq!(
            surface.taint_paths[0].sink.sink_type,
            TaintSinkType::HttpRequest
        );
    }

    #[test]
    fn test_data_surface_from_vuln_fixture() {
        use crate::adapter::Adapter;

        let fixture_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
            .join("tests/fixtures/mcp_servers/vuln_cmd_inject");
        let adapter = crate::adapter::mcp::McpAdapter;
        let targets = adapter.load(&fixture_dir, false).unwrap();
        assert_eq!(targets.len(), 1);

        let data = &targets[0].data;

        // The fixture contains tainted commands, so the surface must be populated.
        assert!(
            !data.sinks.is_empty(),
            "vuln_cmd_inject should produce taint sinks"
        );

        // Subprocess calls must surface as ProcessExec sinks.
        let has_process_sink = data
            .sinks
            .iter()
            .any(|s| s.sink_type == TaintSinkType::ProcessExec);
        assert!(
            has_process_sink,
            "expected ProcessExec sink from subprocess usage"
        );

        // Tainted arguments must be connected to their sinks.
        assert!(
            !data.taint_paths.is_empty(),
            "vuln_cmd_inject should produce taint paths from parameter to subprocess"
        );

        // At least one path should have a confidence of 0.8 or above.
        assert!(
            data.taint_paths.iter().any(|p| p.confidence >= 0.8),
            "expected high-confidence taint path"
        );
    }
}