Skip to main content

dasein_agentic_core/distributed/
sandbox_validator.rs

1//! SandboxValidator - Ground truth validation using real code execution.
2//!
3//! Unlike the rule-based [`Validator`], this validator actually compiles and
4//! runs the code to detect real errors. This eliminates the "LLM reviewing LLM"
5//! bias problem by using objective, executable feedback.
6//!
7//! # Why Ground Truth Validation?
8//!
9//! ```text
10//! ┌─────────────────────────────────────────────────────────────┐
11//! │  LLM Review (Validator)    vs    Ground Truth (Sandbox)    │
12//! ├─────────────────────────────────────────────────────────────┤
13//! │  "This looks correct"      vs    "error[E0308]: expected   │
14//! │  Score: 8/10                      i32, found &str"         │
15//! │  Subjective                       Objective                 │
16//! │  Can miss bugs                    Catches all compile errs  │
17//! │  Fast                             Slower but accurate       │
18//! └─────────────────────────────────────────────────────────────┘
19//! ```
20//!
21//! # Supported Languages
22//!
//! - **Rust**: `cargo check` + `cargo clippy` + `cargo test`
//! - **Python**: `flake8` + `python3 -m pytest`
//! - **Go**: `go build` + `go vet` + `go test`
//! - **TypeScript**: `tsc --noEmit` + `eslint` + `jest`
27//!
28//! # Example: Basic Validation
29//!
30//! ```rust,no_run
31//! use dasein_agentic_core::distributed::SandboxValidator;
32//! use dasein_agentic_sandbox::ProcessSandbox;
33//!
34//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
35//! let sandbox = ProcessSandbox::new();
36//! let validator = SandboxValidator::new(sandbox);
37//!
38//! let code = r#"
39//!     pub fn add(a: i32, b: i32) -> i32 { a + b }
40//!
41//!     #[test]
42//!     fn test_add() { assert_eq!(add(2, 3), 5); }
43//! "#;
44//!
45//! let result = validator.validate_rust_code(code).await?;
46//!
47//! if result.passed {
48//!     println!("Code compiles and all tests pass!");
49//! } else if !result.compiles {
50//!     println!("Compiler errors: {:?}", result.compiler_errors);
51//! } else {
52//!     println!("Test failures: {:?}", result.test_errors);
53//! }
54//! # Ok(())
55//! # }
56//! ```
57//!
58//! # Example: Grounded Feedback Loop
59//!
60//! ```rust,no_run
61//! # use dasein_agentic_core::distributed::{Executor, SandboxValidator};
62//! # use dasein_agentic_sandbox::ProcessSandbox;
63//! # async fn example() -> Result<(), Box<dyn std::error::Error>> {
64//! let executor = Executor::new("exe-001", "sup-001").build();
65//! let validator = SandboxValidator::new(ProcessSandbox::new());
66//!
67//! let mut code = executor.execute("system", "write fibonacci").await?.content;
68//!
69//! for attempt in 0..5 {
70//!     let result = validator.validate_rust_code(&code).await?;
71//!
72//!     if result.passed {
73//!         println!("Success after {} attempts!", attempt + 1);
74//!         break;
75//!     }
76//!
77//!     // Feed real errors back to LLM
78//!     let feedback = result.feedback.unwrap_or_default();
79//!     code = executor.execute("system", &format!(
80//!         "Fix this code:\n{}\n\nErrors:\n{}", code, feedback
81//!     )).await?.content;
82//! }
83//! # Ok(())
84//! # }
85//! ```
86//!
87//! See `examples/grounded_loop.rs` for a complete implementation.
88
89use dasein_agentic_sandbox::{ExecutionResult as SandboxExecutionResult, Sandbox, SandboxError};
90use regex::Regex;
91use serde::{Deserialize, Serialize};
92use std::path::PathBuf;
93use std::sync::OnceLock;
94use tracing::{debug, info, instrument};
95
96// Pre-compiled regex patterns for parsing test output (avoids regex compilation in loops)
97fn passed_failed_regex() -> &'static Regex {
98    static RE: OnceLock<Regex> = OnceLock::new();
99    RE.get_or_init(|| Regex::new(r"(\d+) passed; (\d+) failed").unwrap())
100}
101
102fn passed_regex() -> &'static Regex {
103    static RE: OnceLock<Regex> = OnceLock::new();
104    RE.get_or_init(|| Regex::new(r"(\d+) passed").unwrap())
105}
106
107fn failed_regex() -> &'static Regex {
108    static RE: OnceLock<Regex> = OnceLock::new();
109    RE.get_or_init(|| Regex::new(r"(\d+) failed").unwrap())
110}
111
112fn total_regex() -> &'static Regex {
113    static RE: OnceLock<Regex> = OnceLock::new();
114    RE.get_or_init(|| Regex::new(r"(\d+) total").unwrap())
115}
116
117/// Result of sandbox validation.
118#[derive(Debug, Clone, Serialize, Deserialize)]
119pub struct SandboxValidationResult {
120    /// Overall pass/fail
121    pub passed: bool,
122    /// Compilation passed
123    pub compiles: bool,
124    /// Tests passed (if any)
125    pub tests_passed: bool,
126    /// Number of tests run
127    pub test_count: u32,
128    /// Number of tests passed
129    pub tests_ok: u32,
130    /// Number of tests failed
131    pub tests_failed: u32,
132    /// Compiler errors (if any)
133    pub compiler_errors: Vec<String>,
134    /// Test failures (if any)
135    pub test_errors: Vec<String>,
136    /// Warnings (non-blocking)
137    pub warnings: Vec<String>,
138    /// Structured feedback for LLM
139    pub feedback: Option<String>,
140    /// Total execution time in ms
141    pub execution_time_ms: u64,
142}
143
144impl SandboxValidationResult {
145    /// Create a successful result.
146    pub fn success(test_count: u32, execution_time_ms: u64) -> Self {
147        Self {
148            passed: true,
149            compiles: true,
150            tests_passed: true,
151            test_count,
152            tests_ok: test_count,
153            tests_failed: 0,
154            compiler_errors: vec![],
155            test_errors: vec![],
156            warnings: vec![],
157            feedback: None,
158            execution_time_ms,
159        }
160    }
161
162    /// Create a compilation failure result.
163    pub fn compile_error(errors: Vec<String>, execution_time_ms: u64) -> Self {
164        // SAFEGUARD: Never have passed=false with empty errors
165        // This prevents confusing "0 errors but failed" scenarios
166        let errors = if errors.is_empty() {
167            vec![
168                "Compilation failed but no specific error was captured. Check command output."
169                    .to_string(),
170            ]
171        } else {
172            errors
173        };
174        let feedback = format!(
175            "COMPILATION FAILED:\n\n{}\n\nFix these errors and try again.",
176            errors.join("\n")
177        );
178        Self {
179            passed: false,
180            compiles: false,
181            tests_passed: false,
182            test_count: 0,
183            tests_ok: 0,
184            tests_failed: 0,
185            compiler_errors: errors,
186            test_errors: vec![],
187            warnings: vec![],
188            feedback: Some(feedback),
189            execution_time_ms,
190        }
191    }
192
193    /// Create a test failure result.
194    pub fn test_error(
195        test_count: u32,
196        tests_ok: u32,
197        errors: Vec<String>,
198        execution_time_ms: u64,
199    ) -> Self {
200        // SAFEGUARD: Never have passed=false with empty errors
201        let errors = if errors.is_empty() {
202            vec!["Tests failed but no specific error was captured. Check test output.".to_string()]
203        } else {
204            errors
205        };
206        let feedback = format!(
207            "TESTS FAILED: {}/{} passed\n\nFailures:\n{}\n\nFix these test failures.",
208            tests_ok,
209            test_count,
210            errors.join("\n")
211        );
212        Self {
213            passed: false,
214            compiles: true,
215            tests_passed: false,
216            test_count,
217            tests_ok,
218            tests_failed: test_count - tests_ok,
219            compiler_errors: vec![],
220            test_errors: errors,
221            warnings: vec![],
222            feedback: Some(feedback),
223            execution_time_ms,
224        }
225    }
226
227    /// Create a lint failure result (clippy, flake8, etc.).
228    /// Code compiles but has anti-patterns or code quality issues.
229    pub fn lint_error(errors: Vec<String>, execution_time_ms: u64) -> Self {
230        // SAFEGUARD: Never have passed=false with empty errors
231        let errors = if errors.is_empty() {
232            vec![
233                "Lint check failed but no specific error was captured. Check linter output."
234                    .to_string(),
235            ]
236        } else {
237            errors
238        };
239        let feedback = format!(
240            "LINT ERRORS (code compiles but has issues):\n\n{}\n\nFix these anti-patterns.",
241            errors.join("\n")
242        );
243        Self {
244            passed: false,
245            compiles: true, // It compiles, but has lint issues
246            tests_passed: false,
247            test_count: 0,
248            tests_ok: 0,
249            tests_failed: 0,
250            compiler_errors: vec![],
251            test_errors: errors, // Use test_errors to store lint errors
252            warnings: vec![],
253            feedback: Some(feedback),
254            execution_time_ms,
255        }
256    }
257}
258
/// Language for code validation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Language {
    Rust,
    Python,
    JavaScript,
    TypeScript,
    Go,
    Shell,
}

impl Language {
    /// Get file extension for this language (without the leading dot).
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Rust => "rs",
            Self::Python => "py",
            Self::JavaScript => "js",
            Self::TypeScript => "ts",
            Self::Go => "go",
            Self::Shell => "sh",
        }
    }

    /// Get compile command (if applicable).
    ///
    /// Returns a shell command that changes into `project_dir` and runs the
    /// compile/type-check step with stderr merged into stdout, or `None` for
    /// languages that have no such step here.
    pub fn compile_command(&self, project_dir: &str) -> Option<String> {
        let build = match self {
            Self::Rust => "cargo check",
            Self::Go => "go build ./...",
            // `tsc` is invoked directly (expected to be installed globally).
            Self::TypeScript => "tsc --noEmit",
            Self::Python | Self::JavaScript | Self::Shell => return None,
        };
        Some(format!("cd {} && {} 2>&1", project_dir, build))
    }

    /// Get lint command (optional static analysis).
    /// Returns None if no linter is available for this language.
    ///
    /// The flake8 and ESLint commands end in `|| true`, so they always exit
    /// with status 0; only the Rust and Go linters can signal failure through
    /// their exit code.
    pub fn lint_command(&self, project_dir: &str) -> Option<String> {
        let lint = match self {
            // Clippy, with every warning promoted to an error.
            Self::Rust => "cargo clippy --all-targets -- -D warnings 2>&1",
            // flake8 for Python
            Self::Python => "python3 -m flake8 --max-line-length=120 2>&1 || true",
            // ESLint for JS/TS
            Self::JavaScript | Self::TypeScript => "npx eslint . 2>&1 || true",
            // `go vet` for Go
            Self::Go => "go vet ./... 2>&1",
            Self::Shell => return None,
        };
        Some(format!("cd {} && {}", project_dir, lint))
    }

    /// Get test command.
    ///
    /// Returns a shell command that changes into `project_dir` and runs the
    /// language's test runner with stderr merged into stdout. For `Shell` this
    /// is a syntax check (`bash -n`) rather than a test run.
    pub fn test_command(&self, project_dir: &str) -> String {
        let run = match self {
            Self::Rust => "cargo test",
            Self::Python => "python3 -m pytest -v",
            Self::JavaScript => "npm test",
            // Use jest directly with NODE_PATH for global packages.
            // `--passWithNoTests` keeps code that ships without tests from
            // failing: jest otherwise exits 1 when it finds no test files.
            Self::TypeScript => {
                "NODE_PATH=$(npm root -g) jest --config jest.config.js --passWithNoTests"
            }
            Self::Go => "go test ./... -v",
            Self::Shell => "bash -n *.sh",
        };
        format!("cd {} && {} 2>&1", project_dir, run)
    }
}
334
335/// Sandbox-based code validator.
336///
337/// Uses real compilation and test execution for ground truth validation.
338pub struct SandboxValidator<S: Sandbox> {
339    sandbox: S,
340    workspace: PathBuf,
341    run_tests: bool,
342}
343
344impl<S: Sandbox> SandboxValidator<S> {
345    /// Create a new sandbox validator.
346    pub fn new(sandbox: S) -> Self {
347        Self {
348            sandbox,
349            workspace: PathBuf::from("/tmp/agentic-validation"),
350            run_tests: true,
351        }
352    }
353
354    /// Set the workspace directory for validation files.
355    pub fn workspace(mut self, path: impl Into<PathBuf>) -> Self {
356        self.workspace = path.into();
357        self
358    }
359
360    /// Enable/disable test execution.
361    pub fn run_tests(mut self, run: bool) -> Self {
362        self.run_tests = run;
363        self
364    }
365
366    /// Validate Rust code with real compilation and tests.
367    #[instrument(skip(self, code), fields(lang = "rust"))]
368    pub async fn validate_rust_code(
369        &self,
370        code: &str,
371    ) -> Result<SandboxValidationResult, SandboxError> {
372        self.validate_code(code, Language::Rust).await
373    }
374
375    /// Validate code in any supported language.
376    #[instrument(skip(self, code))]
377    pub async fn validate_code(
378        &self,
379        code: &str,
380        language: Language,
381    ) -> Result<SandboxValidationResult, SandboxError> {
382        let start = std::time::Instant::now();
383        let project_id = uuid::Uuid::new_v4().to_string();
384        let project_dir = self.workspace.join(&project_id);
385
386        info!(
387            "Validating {:?} code in {}",
388            language,
389            project_dir.display()
390        );
391
392        // Setup project structure based on language
393        let setup_result = match language {
394            Language::Rust => self.setup_rust_project(&project_dir, code).await,
395            Language::Python => self.setup_python_project(&project_dir, code).await,
396            Language::TypeScript => self.setup_typescript_project(&project_dir, code).await,
397            Language::Go => self.setup_go_project(&project_dir, code).await,
398            _ => {
399                self.setup_generic_project(&project_dir, code, language)
400                    .await
401            }
402        }?;
403
404        if !setup_result.is_success() {
405            return Ok(SandboxValidationResult::compile_error(
406                vec![format!("Project setup failed: {}", setup_result.stderr)],
407                start.elapsed().as_millis() as u64,
408            ));
409        }
410
411        // Compile if language requires it
412        if let Some(compile_cmd) = language.compile_command(project_dir.to_str().unwrap_or("")) {
413            debug!("Running compile: {}", compile_cmd);
414            let compile_result = self.sandbox.execute(&compile_cmd).await?;
415
416            if !compile_result.is_success() {
417                let errors =
418                    Self::parse_compiler_errors(&compile_result.stdout, &compile_result.stderr);
419                // Cleanup
420                let _ = self
421                    .sandbox
422                    .execute(&format!("rm -rf {}", project_dir.display()))
423                    .await;
424                return Ok(SandboxValidationResult::compile_error(
425                    errors,
426                    start.elapsed().as_millis() as u64,
427                ));
428            }
429
430            // Extract warnings
431            let warnings = Self::extract_warnings(&compile_result.stdout, &compile_result.stderr);
432            if !warnings.is_empty() {
433                debug!("Warnings: {:?}", warnings);
434            }
435        }
436
437        // Run linter (clippy for Rust) to catch anti-patterns
438        if let Some(lint_cmd) = language.lint_command(project_dir.to_str().unwrap_or("")) {
439            debug!("Running linter: {}", lint_cmd);
440            let lint_result = self.sandbox.execute(&lint_cmd).await?;
441
442            // Clippy errors are treated as failures (we use -D warnings)
443            if !lint_result.is_success() {
444                let errors = Self::parse_clippy_errors(&lint_result.stdout, &lint_result.stderr);
445                if !errors.is_empty() {
446                    // Cleanup
447                    let _ = self
448                        .sandbox
449                        .execute(&format!("rm -rf {}", project_dir.display()))
450                        .await;
451                    return Ok(SandboxValidationResult::lint_error(
452                        errors,
453                        start.elapsed().as_millis() as u64,
454                    ));
455                }
456            }
457        }
458
459        // Run tests if enabled
460        if self.run_tests {
461            let test_cmd = language.test_command(project_dir.to_str().unwrap_or(""));
462            debug!("Running tests: {}", test_cmd);
463            let test_result = self.sandbox.execute(&test_cmd).await?;
464
465            // Cleanup
466            let _ = self
467                .sandbox
468                .execute(&format!("rm -rf {}", project_dir.display()))
469                .await;
470
471            // Parse test results
472            let (total, passed, failed, errors) =
473                Self::parse_test_results(&test_result.stdout, &test_result.stderr, language);
474
475            if failed > 0 || !test_result.is_success() {
476                // If no specific errors captured but test failed, include raw output
477                let final_errors = if errors.is_empty() {
478                    let combined = format!("{}\n{}", test_result.stdout, test_result.stderr);
479
480                    // Always include raw output - don't filter too aggressively
481                    let output_lines: Vec<&str> = combined.lines().collect();
482                    let truncated = if output_lines.len() > 100 {
483                        // Too long, take last 80 lines
484                        output_lines[output_lines.len() - 80..].join("\n")
485                    } else {
486                        combined.clone()
487                    };
488
489                    // If still empty, note that
490                    let final_output = if truncated.trim().is_empty() {
491                        format!(
492                            "Test failed with exit code {} but no output captured. \
493                            Check for panics, timeouts, or infinite loops.",
494                            test_result.exit_code
495                        )
496                    } else {
497                        format!(
498                            "Test failed (exit code {}):\n{}",
499                            test_result.exit_code, truncated
500                        )
501                    };
502
503                    vec![final_output]
504                } else {
505                    errors
506                };
507
508                return Ok(SandboxValidationResult::test_error(
509                    total,
510                    passed,
511                    final_errors,
512                    start.elapsed().as_millis() as u64,
513                ));
514            }
515
516            Ok(SandboxValidationResult::success(
517                total,
518                start.elapsed().as_millis() as u64,
519            ))
520        } else {
521            // Cleanup
522            let _ = self
523                .sandbox
524                .execute(&format!("rm -rf {}", project_dir.display()))
525                .await;
526            Ok(SandboxValidationResult::success(
527                0,
528                start.elapsed().as_millis() as u64,
529            ))
530        }
531    }
532
533    /// Setup a Rust project for validation.
534    async fn setup_rust_project(
535        &self,
536        project_dir: &PathBuf,
537        code: &str,
538    ) -> Result<SandboxExecutionResult, SandboxError> {
539        // Use base64 encoding to avoid all shell escaping issues
540        let encoded = base64_encode(code);
541
542        let setup_script = format!(
543            r#"
544mkdir -p {dir}/src && \
545cat > {dir}/Cargo.toml << 'CARGO_EOF'
546[package]
547name = "validation_project"
548version = "0.1.0"
549edition = "2021"
550
551[dependencies]
552tokio = {{ version = "1", features = ["full"] }}
553serde = {{ version = "1", features = ["derive"] }}
554serde_json = "1"
555thiserror = "2"
556anyhow = "1"
557async-trait = "0.1"
558futures = "0.3"
559reqwest = {{ version = "0.12", features = ["json"] }}
560
561[lints.clippy]
562# Allow common false positives - focus on real errors, not style
563new_without_default = "allow"
564must_use_candidate = "allow"
565missing_errors_doc = "allow"
566missing_panics_doc = "allow"
567module_name_repetitions = "allow"
568CARGO_EOF
569echo '{encoded}' | base64 -d > {dir}/src/lib.rs
570"#,
571            dir = project_dir.display(),
572            encoded = encoded
573        );
574
575        self.sandbox.execute(&setup_script).await
576    }
577
578    /// Setup a Python project for validation.
579    async fn setup_python_project(
580        &self,
581        project_dir: &PathBuf,
582        code: &str,
583    ) -> Result<SandboxExecutionResult, SandboxError> {
584        let encoded = base64_encode(code);
585
586        // Check if code already contains tests
587        let has_tests = code.contains("def test_") || code.contains("import pytest");
588        // Check if code uses async tests (pytest.mark.asyncio or async def test_)
589        let has_async_tests =
590            code.contains("@pytest.mark.asyncio") || code.contains("async def test_");
591
592        let setup_script = if has_tests {
593            if has_async_tests {
594                // Async tests need pytest-asyncio + conftest.py
595                format!(
596                    r#"
597pip3 install pytest-asyncio -q 2>/dev/null || true && \
598mkdir -p {dir} && \
599echo '{encoded}' | base64 -d > {dir}/test_main.py && \
600cat > {dir}/conftest.py << 'CONFTEST_EOF'
601import pytest
602pytest_plugins = ('pytest_asyncio',)
603CONFTEST_EOF
604"#,
605                    dir = project_dir.display(),
606                    encoded = encoded
607                )
608            } else {
609                // Sync tests - just the test file
610                format!(
611                    r"
612mkdir -p {dir} && \
613echo '{encoded}' | base64 -d > {dir}/test_main.py
614",
615                    dir = project_dir.display(),
616                    encoded = encoded
617                )
618            }
619        } else {
620            // No tests - create main.py and a stub test file
621            format!(
622                r"
623mkdir -p {dir} && \
624echo '{encoded}' | base64 -d > {dir}/main.py && \
625cat > {dir}/test_main.py << 'TEST_EOF'
626import pytest
627from main import *
628
629# Auto-generated test stub
630def test_placeholder():
631    pass
632TEST_EOF
633",
634                dir = project_dir.display(),
635                encoded = encoded
636            )
637        };
638
639        self.sandbox.execute(&setup_script).await
640    }
641
642    /// Setup a TypeScript project for validation.
643    /// Uses globally installed packages (typescript, jest, ts-jest) to avoid slow npm install.
644    async fn setup_typescript_project(
645        &self,
646        project_dir: &PathBuf,
647        code: &str,
648    ) -> Result<SandboxExecutionResult, SandboxError> {
649        let encoded = base64_encode(code);
650
651        // Check if code contains tests (Jest)
652        let has_tests =
653            code.contains("describe(") || code.contains("it(") || code.contains("test(");
654        let filename = if has_tests { "main.test.ts" } else { "main.ts" };
655
656        // Generate appropriate tsconfig based on whether we have tests
657        // Without tests: don't require @types/jest (avoids TS2688 error)
658        // With tests: include jest types for describe/it/expect
659        let tsconfig_types = if has_tests {
660            r#""types": ["jest", "node"]"#
661        } else {
662            // No types field = TypeScript auto-detects, no jest required
663            r#""types": []"#
664        };
665
666        // Use globally installed packages - no npm install required
667        // Requires: npm install -g typescript jest ts-jest @types/jest @types/node
668        // NOTE: typeRoots points to global npm types location so tsc can find @types/jest etc.
669        let setup_script = format!(
670            r#"
671mkdir -p {dir} && \
672echo '{encoded}' | base64 -d > {dir}/{filename} && \
673cat > {dir}/package.json << 'PKG_EOF'
674{{
675  "name": "validation",
676  "version": "1.0.0",
677  "scripts": {{
678    "test": "jest --config jest.config.js"
679  }}
680}}
681PKG_EOF
682cat > {dir}/tsconfig.json << TS_EOF
683{{
684  "compilerOptions": {{
685    "target": "ES2020",
686    "module": "commonjs",
687    "strict": true,
688    "esModuleInterop": true,
689    "skipLibCheck": true,
690    "outDir": "./dist",
691    "typeRoots": ["/usr/lib/node_modules/@types", "./node_modules/@types"],
692    {tsconfig_types}
693  }},
694  "include": ["*.ts"]
695}}
696TS_EOF
697cat > {dir}/jest.config.js << 'JEST_EOF'
698module.exports = {{
699  transform: {{
700    '^.+\\.tsx?$': ['ts-jest', {{ tsconfig: 'tsconfig.json' }}]
701  }},
702  testEnvironment: 'node',
703  testMatch: ['**/*.test.ts'],
704  moduleFileExtensions: ['ts', 'tsx', 'js', 'jsx', 'json', 'node'],
705}};
706JEST_EOF
707echo "TypeScript project setup complete (using global packages)"
708"#,
709            dir = project_dir.display(),
710            encoded = encoded,
711            filename = filename
712        );
713
714        self.sandbox.execute(&setup_script).await
715    }
716
717    /// Setup a Go project for validation.
718    async fn setup_go_project(
719        &self,
720        project_dir: &PathBuf,
721        code: &str,
722    ) -> Result<SandboxExecutionResult, SandboxError> {
723        let encoded = base64_encode(code);
724
725        // Check if code contains tests
726        let has_tests = code.contains("func Test") || code.contains("testing.T");
727        let filename = if has_tests { "main_test.go" } else { "main.go" };
728
729        let setup_script = format!(
730            r"
731mkdir -p {dir} && \
732echo '{encoded}' | base64 -d > {dir}/{filename} && \
733cd {dir} && go mod init validation 2>&1
734",
735            dir = project_dir.display(),
736            encoded = encoded,
737            filename = filename
738        );
739
740        self.sandbox.execute(&setup_script).await
741    }
742
743    /// Setup a generic project.
744    async fn setup_generic_project(
745        &self,
746        project_dir: &PathBuf,
747        code: &str,
748        language: Language,
749    ) -> Result<SandboxExecutionResult, SandboxError> {
750        let filename = format!("main.{}", language.extension());
751        let encoded = base64_encode(code);
752        let setup_script = format!(
753            r"
754mkdir -p {dir} && \
755echo '{encoded}' | base64 -d > {dir}/{filename}
756",
757            dir = project_dir.display(),
758            filename = filename,
759            encoded = encoded
760        );
761
762        self.sandbox.execute(&setup_script).await
763    }
764
765    /// Parse compiler error output.
766    fn parse_compiler_errors(stdout: &str, stderr: &str) -> Vec<String> {
767        let combined = format!("{}\n{}", stdout, stderr);
768        let mut errors = Vec::new();
769
770        for line in combined.lines() {
771            // Check if line matches any error pattern
772            let is_rust_error = line.contains("error[E") || line.starts_with("error:");
773            let is_ts_error = line.contains("): error TS") || line.contains(": error TS");
774            let is_go_error = line.contains(".go:")
775                && (line.contains("undefined")
776                    || line.contains("cannot")
777                    || line.contains("expected")
778                    || line.contains("invalid"));
779
780            if is_rust_error || is_ts_error || is_go_error {
781                errors.push(line.to_string());
782            }
783
784            // Also capture the context lines (up to 5 after error)
785            if line.contains(" --> ") || line.contains(" | ") {
786                if let Some(last) = errors.last_mut() {
787                    last.push('\n');
788                    last.push_str(line);
789                }
790            }
791        }
792
793        // Fallback: if no structured errors, return combined output
794        // NOTE: Many commands use 2>&1 which redirects stderr to stdout,
795        // so we must check combined output, not just stderr
796        if errors.is_empty() {
797            let combined_trimmed = combined.trim();
798            if !combined_trimmed.is_empty() {
799                errors.push(combined_trimmed.to_string());
800            }
801        }
802
803        errors
804    }
805
806    /// Extract warnings from output.
807    fn extract_warnings(stdout: &str, stderr: &str) -> Vec<String> {
808        let combined = format!("{}\n{}", stdout, stderr);
809        combined
810            .lines()
811            .filter(|line| line.contains("warning:") || line.contains("warn["))
812            .map(String::from)
813            .collect()
814    }
815
816    /// Parse clippy/lint error output.
817    /// Clippy output looks like:
818    /// "warning: ... --> src/lib.rs:10:5 ... help: consider using ..."
819    /// With -D warnings, these become errors.
820    fn parse_clippy_errors(stdout: &str, stderr: &str) -> Vec<String> {
821        let combined = format!("{}\n{}", stdout, stderr);
822        let mut errors = Vec::new();
823        let mut current_error = String::new();
824
825        for line in combined.lines() {
826            // Clippy error/warning pattern
827            if line.starts_with("error:") || line.starts_with("warning:") {
828                // Save previous error if any
829                if !current_error.is_empty() {
830                    errors.push(current_error.clone());
831                }
832                current_error = line.to_string();
833            }
834            // Context lines (location, help suggestions)
835            // Context lines and help suggestions
836            else if line.contains(" --> ")
837                || line.contains(" | ")
838                || line.trim().starts_with("= help:")
839                || line.trim().starts_with("help:")
840            {
841                if !current_error.is_empty() {
842                    current_error.push('\n');
843                    current_error.push_str(line);
844                }
845            }
846        }
847
848        // Don't forget the last error
849        if !current_error.is_empty() {
850            errors.push(current_error);
851        }
852
853        // Filter out non-actionable items (aborting due to errors, etc.)
854        errors
855            .into_iter()
856            .filter(|e| !e.contains("aborting due to") && !e.contains("could not compile"))
857            .collect()
858    }
859
860    /// Parse test results.
861    fn parse_test_results(
862        stdout: &str,
863        stderr: &str,
864        language: Language,
865    ) -> (u32, u32, u32, Vec<String>) {
866        let combined = format!("{}\n{}", stdout, stderr);
867
868        match language {
869            Language::Rust => Self::parse_rust_test_output(&combined),
870            Language::Python => Self::parse_pytest_output(&combined),
871            Language::TypeScript | Language::JavaScript => Self::parse_jest_output(&combined),
872            Language::Go => Self::parse_go_test_output(&combined),
873            _ => (0, 0, 0, vec![combined]),
874        }
875    }
876
877    /// Parse Rust test output (cargo test).
878    fn parse_rust_test_output(output: &str) -> (u32, u32, u32, Vec<String>) {
879        let mut total = 0u32;
880        let mut passed = 0u32;
881        let mut failed = 0u32;
882        let mut errors = Vec::new();
883        let lines: Vec<&str> = output.lines().collect();
884
885        for (i, &line) in lines.iter().enumerate() {
886            // Parse summary line: "test result: ok. 5 passed; 0 failed; 0 ignored"
887            // or "test result: FAILED. X passed; Y failed"
888            if line.starts_with("test result:") {
889                if let Some(caps) = passed_failed_regex().captures(line) {
890                    passed = caps
891                        .get(1)
892                        .and_then(|m| m.as_str().parse().ok())
893                        .unwrap_or(0);
894                    failed = caps
895                        .get(2)
896                        .and_then(|m| m.as_str().parse().ok())
897                        .unwrap_or(0);
898                    total = passed + failed;
899                }
900            }
901
902            // Capture failed test names (e.g., "test tests::test_cycle ... FAILED")
903            if line.contains("FAILED") && !line.starts_with("test result:") {
904                errors.push(line.to_string());
905            }
906
907            // Capture assertion failures with context (next 5 lines)
908            if line.contains("assertion `left == right` failed")
909                || line.contains("assertion failed")
910                || line.contains("panicked at")
911                || line.contains("thread 'main' panicked")
912                || line.contains("thread '") && line.contains("' panicked")
913            {
914                let mut context = line.to_string();
915                // Add following lines for context (left/right values, etc.)
916                for j in 1..=5 {
917                    if i + j < lines.len() {
918                        let next_line = lines[i + j].trim();
919                        if !next_line.is_empty() && !next_line.starts_with("note:") {
920                            context.push('\n');
921                            context.push_str(next_line);
922                        }
923                    }
924                }
925                errors.push(context);
926            }
927
928            // Capture "failures:" section header and subsequent test names
929            if line.trim() == "failures:" {
930                // Collect the failure list
931                for j in 1..=20 {
932                    if i + j < lines.len() {
933                        let next_line = lines[i + j].trim();
934                        if next_line.is_empty() || next_line.starts_with("test result:") {
935                            break;
936                        }
937                        if !next_line.starts_with("----") {
938                            errors.push(format!("Failed: {}", next_line));
939                        }
940                    }
941                }
942            }
943        }
944
945        // IMPORTANT: If we detected failures but no specific errors, include raw output
946        if failed > 0 && errors.is_empty() {
947            // Include the most relevant parts of the output
948            let truncated: String = output
949                .lines()
950                .filter(|l| {
951                    l.contains("FAILED")
952                        || l.contains("error")
953                        || l.contains("panicked")
954                        || l.contains("assertion")
955                        || l.starts_with("test ")
956                })
957                .take(20)
958                .collect::<Vec<_>>()
959                .join("\n");
960
961            if !truncated.is_empty() {
962                errors.push(truncated);
963            } else {
964                // Last resort: include last 30 lines
965                let last_lines: Vec<&str> = output.lines().collect();
966                let start = last_lines.len().saturating_sub(30);
967                errors.push(format!(
968                    "Test output (last 30 lines):\n{}",
969                    last_lines[start..].join("\n")
970                ));
971            }
972        }
973
974        (total, passed, failed, errors)
975    }
976
977    /// Parse pytest output.
978    fn parse_pytest_output(output: &str) -> (u32, u32, u32, Vec<String>) {
979        let mut total = 0u32;
980        let mut passed = 0u32;
981        let mut failed = 0u32;
982        let mut errors = Vec::new();
983
984        for line in output.lines() {
985            // Parse summary: "5 passed, 2 failed"
986            if line.contains("passed") || line.contains("failed") {
987                if let Some(caps) = passed_regex().captures(line) {
988                    passed = caps
989                        .get(1)
990                        .and_then(|m| m.as_str().parse().ok())
991                        .unwrap_or(0);
992                }
993                if let Some(caps) = failed_regex().captures(line) {
994                    failed = caps
995                        .get(1)
996                        .and_then(|m| m.as_str().parse().ok())
997                        .unwrap_or(0);
998                }
999                total = passed + failed;
1000            }
1001
1002            // Capture FAILED tests
1003            if line.contains("FAILED") || line.contains("AssertionError") {
1004                errors.push(line.to_string());
1005            }
1006        }
1007
1008        (total, passed, failed, errors)
1009    }
1010
1011    /// Parse Jest output (TypeScript/JavaScript).
1012    fn parse_jest_output(output: &str) -> (u32, u32, u32, Vec<String>) {
1013        let mut total = 0u32;
1014        let mut passed = 0u32;
1015        let mut failed = 0u32;
1016        let mut errors = Vec::new();
1017
1018        for line in output.lines() {
1019            // Parse summary: "Tests:       1 passed, 1 total" or "Tests:       1 failed, 2 passed, 3 total"
1020            if line.contains("Tests:") && line.contains("total") {
1021                if let Some(caps) = passed_regex().captures(line) {
1022                    passed = caps
1023                        .get(1)
1024                        .and_then(|m| m.as_str().parse().ok())
1025                        .unwrap_or(0);
1026                }
1027                if let Some(caps) = failed_regex().captures(line) {
1028                    failed = caps
1029                        .get(1)
1030                        .and_then(|m| m.as_str().parse().ok())
1031                        .unwrap_or(0);
1032                }
1033                if let Some(caps) = total_regex().captures(line) {
1034                    total = caps
1035                        .get(1)
1036                        .and_then(|m| m.as_str().parse().ok())
1037                        .unwrap_or(0);
1038                }
1039            }
1040
1041            // Capture FAIL lines
1042            if line.contains("FAIL ") || line.contains("✕") || line.contains("● ") {
1043                errors.push(line.to_string());
1044            }
1045            // Capture error details
1046            if line.contains("Error:") || line.contains("expect(") || line.contains("toBe(") {
1047                errors.push(line.to_string());
1048            }
1049        }
1050
1051        (total, passed, failed, errors)
1052    }
1053
1054    /// Parse Go test output.
1055    fn parse_go_test_output(output: &str) -> (u32, u32, u32, Vec<String>) {
1056        let mut total = 0u32;
1057        let mut passed = 0u32;
1058        let mut failed = 0u32;
1059        let mut errors = Vec::new();
1060
1061        for line in output.lines() {
1062            // Count tests: "--- PASS:" or "--- FAIL:"
1063            if line.contains("--- PASS:") {
1064                passed += 1;
1065                total += 1;
1066            } else if line.contains("--- FAIL:") {
1067                failed += 1;
1068                total += 1;
1069                errors.push(line.to_string());
1070            }
1071            // Also capture error messages
1072            if line.contains("Error Trace:") || line.contains("Error:") || line.contains("FAIL\t") {
1073                errors.push(line.to_string());
1074            }
1075            // Go compile errors
1076            if line.contains(".go:")
1077                && (line.contains("undefined")
1078                    || line.contains("cannot")
1079                    || line.contains("expected"))
1080            {
1081                errors.push(line.to_string());
1082            }
1083        }
1084
1085        (total, passed, failed, errors)
1086    }
1087}
1088
/// Encode the UTF-8 bytes of `input` as padded Base64 text.
///
/// Uses the `A-Z a-z 0-9 + /` alphabet and `=` padding; hand-rolled so that no
/// external crate is needed.
fn base64_encode(input: &str) -> String {
    const ALPHABET: &[u8; 64] =
        b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

    // Look up the character for the 6-bit group that starts at bit `shift`.
    let sextet = |word: u32, shift: u32| ALPHABET[((word >> shift) & 0x3F) as usize] as char;

    let mut encoded = String::new();

    for group in input.as_bytes().chunks(3) {
        // Pack up to three bytes into the top of a 24-bit word; missing bytes stay zero.
        let word = group
            .iter()
            .enumerate()
            .fold(0u32, |acc, (pos, &byte)| acc | (u32::from(byte) << (16 - 8 * pos)));

        encoded.push(sextet(word, 18));
        encoded.push(sextet(word, 12));
        // The third and fourth characters are padding when the group is short.
        encoded.push(if group.len() > 1 { sextet(word, 6) } else { '=' });
        encoded.push(if group.len() > 2 { sextet(word, 0) } else { '=' });
    }

    encoded
}
1120
#[cfg(test)]
mod tests {
    use super::*;
    use dasein_agentic_sandbox::ProcessSandbox;

    /// Well-formed source must be reported as compiling (validator built with
    /// `run_tests(false)`).
    #[tokio::test]
    async fn test_valid_rust_code() {
        let source = r#"
pub fn add(a: i32, b: i32) -> i32 {
    a + b
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_add() {
        assert_eq!(add(2, 3), 5);
    }
}
"#;

        let validator =
            SandboxValidator::new(ProcessSandbox::new().with_timeout(60000)).run_tests(false);
        let outcome = validator.validate_rust_code(source).await.unwrap();

        assert!(
            outcome.compiles,
            "Code should compile: {:?}",
            outcome.compiler_errors
        );
    }

    /// A type error must be reported as a compile failure, with errors and feedback.
    #[tokio::test]
    async fn test_invalid_rust_code() {
        let source = r#"
pub fn broken() -> i32 {
    let x = "not an integer";
    x  // Type error: expected i32, found &str
}
"#;

        let validator = SandboxValidator::new(ProcessSandbox::new().with_timeout(60000));
        let outcome = validator.validate_rust_code(source).await.unwrap();

        assert!(!outcome.compiles, "Code should not compile");
        assert!(!outcome.compiler_errors.is_empty(), "Should have errors");
        assert!(outcome.feedback.is_some(), "Should have feedback");
    }

    /// The compiler-error parser must keep the error code of a rustc diagnostic.
    #[test]
    fn test_parse_rust_errors() {
        let compiler_stderr = r#"
error[E0308]: mismatched types
 --> src/lib.rs:4:5
  |
3 | pub fn broken() -> i32 {
  |                    --- expected `i32` because of return type
4 |     "hello"
  |     ^^^^^^^ expected `i32`, found `&str`
"#;

        let parsed =
            SandboxValidator::<ProcessSandbox>::parse_compiler_errors("", compiler_stderr);

        assert!(!parsed.is_empty());
        assert!(parsed[0].contains("E0308"));
    }

    /// Counts and failure details must be extracted from a failing `cargo test` run.
    #[test]
    fn test_parse_rust_test_results() {
        let cargo_output = r#"
running 3 tests
test tests::test_one ... ok
test tests::test_two ... FAILED
test tests::test_three ... ok

failures:

---- tests::test_two stdout ----
thread 'tests::test_two' panicked at src/lib.rs:15:9:
assertion `left == right` failed
  left: 1
 right: 2

test result: FAILED. 2 passed; 1 failed; 0 ignored
"#;

        let (total, passed, failed, failures) =
            SandboxValidator::<ProcessSandbox>::parse_rust_test_output(cargo_output);

        assert_eq!((total, passed, failed), (3, 2, 1));
        assert!(!failures.is_empty());
    }
}