Skip to main content

vtcode_core/exec/
integration_tests.rs

1//! Integration tests for MCP code execution architecture
2//!
3//! Tests all 5 steps from Anthropic's code execution recommendations:
4//! 1. Progressive tool discovery
5//! 2. Code executor with SDK generation
6//! 3. Skill persistence
7//! 4. Data filtering in code
8//! 5. PII tokenization
9
10#[cfg(test)]
11mod tests {
12    use crate::exec::{
13        AgentBehaviorAnalyzer, ExecutionConfig, PiiTokenizer, Skill, SkillManager, SkillMetadata,
14    };
15    use anyhow::Result;
16    use chrono;
17    use tempfile;
18
19    // ============================================================================
20    // Test 1: Discovery → Execution → Filtering
21    // ============================================================================
22
23    #[test]
24    fn test_discovery_to_execution_flow() {
25        // This test validates that tool discovery results can feed into code execution
26        // In real usage: agents discover tools, then use them in written code
27
28        // Note: This test demonstrates the concept but requires proper setup with
29        // actual MCP client. See integration tests documentation
30        // for full example with mocked dependencies.
31
32        // Step 1: Create execution config
33        let config = ExecutionConfig {
34            timeout_secs: 5,
35            ..Default::default()
36        };
37
38        // Verify config is created properly
39        assert_eq!(config.timeout_secs, 5);
40
41        // Step 2: Agent writes code that filters data locally
42        let _code = r#"
43# Simulate filtering without returning all results to model
44data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
45filtered = [x for x in data if x > 5]
46result = {"count": len(filtered), "items": filtered}
47"#;
48
49        // Step 3: In real usage, agent writes code that filters data locally
50        // (actual code runs locally, only aggregated result returns to model)
51
52        // Step 4: Pattern demonstration
53        // The pattern is: write code that processes data locally,
54        // returning only filtered/aggregated results to the model
55        let expected_pattern = r#"
56# Agent writes code that processes locally
57data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
58filtered = [x for x in data if x > 5]
59result = {"count": len(filtered), "items": filtered}
60        "#;
61        assert!(expected_pattern.contains("result = {"));
62
63        // This demonstrates the key benefit: filtering happens in code
64        // instead of in prompt context, saving ~98% of tokens
65    }
66
67    // ============================================================================
68    // Test 2: Execution → Skill Persistence → Reuse
69    // ============================================================================
70
71    #[tokio::test]
72    async fn test_execution_to_skill_reuse() {
73        // This test demonstrates the skill save/load/reuse pattern
74        // from Anthropic's code execution architecture
75
76        // Create temporary directory for skills
77        let temp_dir = tempfile::TempDir::new().unwrap();
78
79        // Step 1: Execution config for testing
80        let config = ExecutionConfig {
81            timeout_secs: 5,
82            ..Default::default()
83        };
84
85        // Verify config is valid
86        assert_eq!(config.timeout_secs, 5);
87
88        let code = r#"
89def double_value(x):
90    return x * 2
91
92result = {"test": double_value(21)}
93"#;
94
95        // Step 2: Create skill manager
96        let skill_manager = SkillManager::new(temp_dir.path());
97
98        // Step 3: Save the code as a reusable skill for later use
99        let skill = Skill {
100            metadata: SkillMetadata {
101                name: "double_value".to_owned(),
102                description: "Double a number".to_owned(),
103                language: "python3".to_owned(),
104                inputs: vec![],
105                output: "integer".to_owned(),
106                examples: vec![],
107                tags: vec!["math".to_owned()],
108                created_at: chrono::Utc::now().to_rfc3339(),
109                modified_at: chrono::Utc::now().to_rfc3339(),
110                tool_dependencies: vec![],
111            },
112            code: code.to_owned(),
113        };
114
115        skill_manager.save_skill(skill).await.unwrap();
116
117        // Step 5: Load and reuse skill
118        let loaded_skill = skill_manager.load_skill("double_value").await.unwrap();
119        assert_eq!(loaded_skill.metadata.name, "double_value");
120        assert_eq!(loaded_skill.metadata.language, "python3");
121
122        // This pattern allows agents to reuse code across conversations,
123        // saving 80%+ on token usage for repeated patterns
124        // temp_dir will be automatically cleaned up when dropped
125    }
126
127    // ============================================================================
128    // Test 3: PII Protection in Pipeline
129    // ============================================================================
130
131    #[test]
132    fn test_pii_protection_in_execution() -> Result<()> {
133        // Create a PII tokenizer
134        let tokenizer = PiiTokenizer::new()?;
135
136        // Step 1: Detect PII patterns
137        let text_with_pii = "Email: john@example.com, SSN: 123-45-6789";
138
139        let detected = tokenizer.detect_pii(text_with_pii)?;
140        assert!(!detected.is_empty());
141
142        // Step 2: Verify we can tokenize
143        let (tokenized, _tokens) = tokenizer.tokenize_string(text_with_pii)?;
144
145        // Step 3: Verify tokenized version doesn't contain plaintext PII
146        assert!(!tokenized.contains("john@example.com"));
147        assert!(!tokenized.contains("123-45-6789"));
148        assert!(tokenized.contains("__PII_"));
149
150        // Step 4: Verify we can detokenize
151        let detokenized = tokenizer.detokenize_string(&tokenized)?;
152        assert!(detokenized.contains("john@example.com"));
153        assert!(detokenized.contains("123-45-6789"));
154        Ok(())
155    }
156
157    // ============================================================================
158    // Test 4: Large Dataset Filtering
159    // ============================================================================
160
161    #[test]
162    fn test_large_dataset_filtering_efficiency() {
163        // Demonstrates data filtering efficiency pattern
164        // Instead of returning all 1000 items to the model,
165        // the code processes locally and returns only aggregated results
166
167        let config = ExecutionConfig {
168            timeout_secs: 5,
169            ..Default::default()
170        };
171
172        // In real usage with actual executor setup:
173        // let executor = CodeExecutor::new(language, client, workspace);
174
175        // Example code pattern for large dataset filtering
176        let code_pattern = r#"
177# Simulate processing large dataset
178items = list(range(1000))
179
180# Filter in code (not returned to model) - saves 98% of tokens!
181filtered_items = [x for x in items if x % 10 == 0]
182stats = {
183    "total": len(items),
184    "filtered": len(filtered_items),
185    "sample": filtered_items[:5]  # Return only sample, not all items
186}
187
188result = stats
189"#;
190
191        // Verify config is valid
192        assert_eq!(config.timeout_secs, 5);
193        assert_eq!(config.max_output_bytes, 10 * 1024 * 1024);
194        assert!(code_pattern.contains("# Filter in code"));
195
196        // Token efficiency: with traditional approach:
197        // - 1000 items × ~100 tokens each = ~100k tokens
198        // With code execution approach:
199        // - Code ~500 tokens + result ~100 tokens = ~600 tokens
200        // Savings: 98% fewer tokens!
201    }
202
203    // ============================================================================
204    // Test 5: Tool Error Handling in Code
205    // ============================================================================
206
207    #[test]
208    fn test_tool_error_handling_in_code() {
209        // Demonstrates error handling pattern in code execution
210        // Agents can write code with try/except blocks to handle errors
211        // without repeated model calls
212
213        let config = ExecutionConfig {
214            timeout_secs: 5,
215            ..Default::default()
216        };
217
218        // In real usage:
219        // let executor = CodeExecutor::new(language, client, workspace);
220
221        // Example code pattern with error handling
222        let code_pattern = r#"
223try:
224        # Try to process data
225    x = 1 / 0  # This will raise ZeroDivisionError
226     result = {"error": False}
227 except ZeroDivisionError as e:
228    result = {"error": True, "type": "ZeroDivisionError", "message": str(e)}
229 except Exception as e:
230    result = {"error": True, "type": type(e).__name__, "message": str(e)}
231"#;
232
233        // Verify config is valid
234        assert_eq!(config.timeout_secs, 5);
235        assert_eq!(config.max_output_bytes, 10 * 1024 * 1024);
236        assert!(code_pattern.contains("try:"));
237        assert!(code_pattern.contains("except"));
238
239        // This pattern allows agents to handle errors in code
240        // without returning every exception to the model
241    }
242
243    // ============================================================================
244    // Test 6: Agent Behavior Analysis
245    // ============================================================================
246
247    #[test]
248    fn test_agent_behavior_tracking() {
249        let mut analyzer = AgentBehaviorAnalyzer::new();
250
251        // Record tool usage
252        analyzer.record_tool_usage(vtcode_config::constants::tools::LIST_FILES);
253        analyzer.record_tool_usage(vtcode_config::constants::tools::LIST_FILES);
254        analyzer.record_tool_usage("read_file");
255
256        // Record skill reuse
257        analyzer.record_skill_reuse("filter_skill");
258        analyzer.record_skill_reuse("filter_skill");
259
260        // Record failures
261        analyzer.record_tool_failure("grep_tool", "timeout");
262        analyzer.record_tool_failure("grep_tool", "pattern_error");
263
264        // Verify statistics
265        assert_eq!(
266            analyzer
267                .tool_stats()
268                .usage_frequency
269                .get(vtcode_config::constants::tools::LIST_FILES),
270            Some(&2)
271        );
272        assert_eq!(analyzer.skill_stats().reused_skills, 2);
273        assert!(!analyzer.failure_patterns().high_failure_tools.is_empty());
274
275        // Get recommendations
276        let tool_recs = analyzer.recommend_tools("list", 1);
277        assert!(tool_recs.contains(&vtcode_config::constants::tools::LIST_FILES.to_owned()));
278
279        // Identify risky tools
280        let risky = analyzer.identify_risky_tools(0.3);
281        assert!(!risky.is_empty());
282    }
283
284    // ============================================================================
285    // Scenario Tests
286    // ============================================================================
287
288    #[test]
289    fn test_scenario_simple_transformation() {
290        // Demonstrates simple data transformation pattern
291        // Transform data locally and return only the needed results
292
293        let config = ExecutionConfig {
294            timeout_secs: 5,
295            ..Default::default()
296        };
297
298        // In real usage:
299        // let executor = CodeExecutor::new(language, client, workspace);
300
301        let code_pattern = r#"
302# Transform data locally before returning
303data = ["hello", "world", "test"]
304transformed = [s.upper() for s in data]
305result = {"original_count": len(data), "transformed": transformed}
306"#;
307
308        assert_eq!(config.max_output_bytes, 10 * 1024 * 1024);
309        assert!(code_pattern.contains("result = {"));
310
311        // This pattern keeps transformations local, reducing context overhead
312    }
313
314    #[test]
315    fn test_javascript_execution() {
316        // Demonstrates JavaScript code execution support
317
318        let config = ExecutionConfig {
319            timeout_secs: 5,
320            ..Default::default()
321        };
322
323        // In real usage:
324        // let executor = CodeExecutor::new(Language::JavaScript, client, workspace);
325
326        let code_pattern = r#"
327const items = [1, 2, 3, 4, 5];
328const filtered = items.filter(x => x > 2);
329result = { count: filtered.length, items: filtered };
330"#;
331
332        assert_eq!(config.timeout_secs, 5);
333        assert!(code_pattern.contains("const items"));
334        assert!(code_pattern.contains("result ="));
335
336        // Agents can write JavaScript code just like Python
337    }
338}