Skip to main content

forge_sandbox/
validator.rs

1//! Pre-execution code validator for the Forge sandbox.
2//!
3//! This validator is **defense-in-depth** — the V8 isolate is the real security
4//! boundary. These checks catch common escape patterns early, provide better
5//! error messages, and prevent prompt injection from reaching the runtime.
6
7use crate::error::SandboxError;
8#[cfg(not(feature = "ast-validator"))]
9use regex::Regex;
10
11/// Maximum code size in bytes (64 KB).
12const DEFAULT_MAX_CODE_SIZE: usize = 64 * 1024;
13
/// Patterns that are banned from sandbox code (used by regex path only).
///
/// Each entry is matched as a plain substring (`str::contains`) against the
/// normalized source, so an entry also matches when it is the tail of a longer
/// identifier (e.g. `"eval("` matches `retrieval(`). Entries are tried in the
/// order listed and the first hit is the one reported to the caller.
#[cfg(not(feature = "ast-validator"))]
const BANNED_PATTERNS: &[&str] = &[
    "eval(",                   // Direct eval call
    "Function(",               // Function constructor call
    "import(",                 // Dynamic imports
    "require(",                // CommonJS
    "Deno.",                   // Runtime escape
    "__proto__",               // Prototype pollution
    "constructor[",            // Prototype chain access via bracket notation
    "constructor.constructor", // Function constructor bypass
    "Reflect.",                // Reflect API escape
    "globalThis[",             // Dynamic global access
    "String.fromCharCode",     // String-based code construction
    // Specific process.* patterns (not bare "process." to avoid false positives
    // on e.g. data.process.status)
    "process.env",
    "process.exit",
    "process.argv",
    "process.stdin",
    "process.stdout",
    "process.stderr",
    "process.kill",
    "process.binding",
    "String.raw",         // Tagged template code generation
    "WebAssembly",        // WASM execution
    "Symbol.toPrimitive", // Type confusion attacks
];
42
/// Removes JavaScript comments from `code`, replacing each one with a single
/// space so that tokens on either side of a comment stay separated.
///
/// The scan is a single left-to-right pass that understands string literals
/// and template literals:
///
/// * `//` and `/*` inside `'...'`, `"..."` or template text are NOT comments
///   and are copied through untouched. (Treating them as comments would delete
///   the real code that follows them on the same line and hide it from the
///   banned-pattern scan.)
/// * Code inside a template expression `${ ... }` is scanned like ordinary
///   code, so comments there are still removed.
/// * A `//` comment ends at any ECMAScript line terminator (`\n`, `\r`,
///   U+2028, U+2029); the terminator itself is kept.
/// * A `/*` with no closing `*/` is left in place verbatim so that whatever
///   follows it is still visible to later validation stages.
///
/// Known limitation: regular-expression literals are not recognised, so a
/// quote or `//` inside a regex literal is interpreted as a string delimiter
/// or a comment start.
#[cfg(not(feature = "ast-validator"))]
fn strip_js_comments(code: &str) -> String {
    /// ECMAScript LineTerminator code points; any of them ends a `//` comment.
    fn is_line_terminator(c: char) -> bool {
        matches!(c, '\n' | '\r' | '\u{2028}' | '\u{2029}')
    }

    /// Copies template-literal text verbatim, starting at `start` (the index
    /// just past an opening backtick or just past the `}` that closed a
    /// template expression). Stops after the closing backtick or after a `${`.
    /// Returns the index to resume at and whether a `${` expression was opened.
    fn copy_template_text(chars: &[char], start: usize, out: &mut String) -> (usize, bool) {
        let mut i = start;
        while i < chars.len() {
            match chars[i] {
                // Escape sequence: keep both characters so an escaped backtick
                // or `$` cannot end the template text early.
                '\\' if i + 1 < chars.len() => {
                    out.push('\\');
                    out.push(chars[i + 1]);
                    i += 2;
                }
                '`' => {
                    out.push('`');
                    return (i + 1, false);
                }
                '$' if chars.get(i + 1) == Some(&'{') => {
                    out.push_str("${");
                    return (i + 2, true);
                }
                other => {
                    out.push(other);
                    i += 1;
                }
            }
        }
        (i, false)
    }

    let chars: Vec<char> = code.chars().collect();
    let len = chars.len();
    let mut out = String::with_capacity(code.len());
    // One entry per currently open `${ ... }` expression, holding the number
    // of unmatched `{` seen inside it. A `}` at depth 0 closes the expression.
    let mut expr_depths: Vec<usize> = Vec::new();
    // Cleared once a search for `*/` fails: no later search can succeed
    // either, which keeps inputs made of many unclosed `/*` linear-time.
    let mut may_have_block_close = true;
    let mut i = 0;

    while i < len {
        let c = chars[i];
        let next = chars.get(i + 1).copied();
        match (c, next) {
            // Quoted string: copy through the closing quote, honouring escapes.
            ('\'' | '"', _) => {
                out.push(c);
                i += 1;
                while i < len {
                    let s = chars[i];
                    out.push(s);
                    i += 1;
                    if s == '\\' && i < len {
                        out.push(chars[i]);
                        i += 1;
                    } else if s == c {
                        break;
                    }
                }
            }
            // Template literal: copy its text, then fall back to code mode if
            // a `${` expression was opened.
            ('`', _) => {
                out.push('`');
                let (resume, opened) = copy_template_text(&chars, i + 1, &mut out);
                if opened {
                    expr_depths.push(0);
                }
                i = resume;
            }
            ('{', _) => {
                if let Some(depth) = expr_depths.last_mut() {
                    *depth += 1;
                }
                out.push('{');
                i += 1;
            }
            ('}', _) => {
                out.push('}');
                i += 1;
                match expr_depths.last().copied() {
                    // This brace closes the innermost template expression:
                    // what follows is template text again.
                    Some(0) => {
                        expr_depths.pop();
                        let (resume, opened) = copy_template_text(&chars, i, &mut out);
                        if opened {
                            expr_depths.push(0);
                        }
                        i = resume;
                    }
                    Some(_) => {
                        if let Some(depth) = expr_depths.last_mut() {
                            *depth -= 1;
                        }
                    }
                    None => {}
                }
            }
            // Line comment: drop everything up to (not including) the line
            // terminator.
            ('/', Some('/')) => {
                i += 2;
                while i < len && !is_line_terminator(chars[i]) {
                    i += 1;
                }
                out.push(' ');
            }
            // Block comment: drop through the closing `*/` when there is one.
            // The search starts after the opener so `/*/` is not self-closing.
            ('/', Some('*')) => {
                let close = if may_have_block_close {
                    (i + 2..len - 1).find(|&j| chars[j] == '*' && chars[j + 1] == '/')
                } else {
                    None
                };
                match close {
                    Some(j) => {
                        out.push(' ');
                        i = j + 2;
                    }
                    None => {
                        may_have_block_close = false;
                        out.push('/');
                        i += 1;
                    }
                }
            }
            _ => {
                out.push(c);
                i += 1;
            }
        }
    }

    out
}
52
/// Folds look-alike Unicode characters onto the ASCII characters they imitate.
///
/// Two families are handled: a fixed set of Cyrillic letters that are visually
/// indistinguishable from Latin ones, and the fullwidth ASCII block
/// (U+FF01..=U+FF5E). Every other character is passed through unchanged. The
/// result lets a homoglyph spelling such as `еval(` (Cyrillic е) be matched by
/// the plain `eval(` pattern.
fn normalize_unicode_confusables(code: &str) -> String {
    /// (homoglyph, ASCII replacement) pairs for the supported Cyrillic letters.
    const CYRILLIC_LOOKALIKES: &[(char, char)] = &[
        // Lowercase
        ('\u{0430}', 'a'), // Cyrillic а
        ('\u{0435}', 'e'), // Cyrillic е
        ('\u{043E}', 'o'), // Cyrillic о
        ('\u{0440}', 'p'), // Cyrillic р
        ('\u{0441}', 'c'), // Cyrillic с
        ('\u{0443}', 'y'), // Cyrillic у
        ('\u{0445}', 'x'), // Cyrillic х
        ('\u{0456}', 'i'), // Cyrillic і
        ('\u{0455}', 's'), // Cyrillic ѕ
        // Uppercase
        ('\u{0410}', 'A'), // Cyrillic А
        ('\u{0412}', 'B'), // Cyrillic В
        ('\u{0415}', 'E'), // Cyrillic Е
        ('\u{041A}', 'K'), // Cyrillic К
        ('\u{041C}', 'M'), // Cyrillic М
        ('\u{041D}', 'H'), // Cyrillic Н
        ('\u{041E}', 'O'), // Cyrillic О
        ('\u{0420}', 'P'), // Cyrillic Р
        ('\u{0421}', 'C'), // Cyrillic С
        ('\u{0422}', 'T'), // Cyrillic Т
        ('\u{0425}', 'X'), // Cyrillic Х
    ];
    /// First and last code points of the fullwidth ASCII block.
    const FULLWIDTH_FIRST: u32 = 0xFF01;
    const FULLWIDTH_LAST: u32 = 0xFF5E;
    /// Distance from a fullwidth code point to its ASCII twin (0xFF01 - 0x21).
    const FULLWIDTH_OFFSET: u32 = 0xFEE0;

    let mut folded = String::with_capacity(code.len());
    for ch in code.chars() {
        let point = u32::from(ch);
        let replacement = if ch.is_ascii() {
            // ASCII is never a homoglyph; skip the table lookup.
            ch
        } else if (FULLWIDTH_FIRST..=FULLWIDTH_LAST).contains(&point) {
            // Lands in U+0021..=U+007E, which is always a valid char.
            char::from_u32(point - FULLWIDTH_OFFSET).unwrap_or(ch)
        } else {
            CYRILLIC_LOOKALIKES
                .iter()
                .find(|(from, _)| *from == ch)
                .map_or(ch, |&(_, to)| to)
        };
        folded.push(replacement);
    }
    folded
}
88
/// Blanks out the contents of JavaScript string and template literals so that
/// text inside them cannot trigger a banned-pattern match.
///
/// Every character inside `'...'`, `"..."` or template text is replaced by a
/// space; the delimiters themselves are kept, and the output has the same
/// number of characters as the input. Code inside a template expression
/// `${ ... }` is real code and is preserved — but it is scanned with the same
/// rules as top-level code, so string literals and nested template literals
/// inside an expression are blanked too, and a `}` inside such a string does
/// not close the expression. (Counting braces without regard to strings would
/// let `` `${"}" + eval("1")}` `` hide the `eval(` call as template text.)
///
/// An unterminated literal is blanked through to the end of the input.
///
/// Known limitation: regular-expression literals are not recognised, so a
/// quote inside a regex literal is interpreted as a string delimiter.
#[cfg(any(not(feature = "ast-validator"), test))]
fn strip_string_contents(code: &str) -> String {
    /// Blanks template-literal text starting at `start` (the index just past
    /// an opening backtick or just past the `}` that closed a template
    /// expression). Stops after the closing backtick or after a `${`, both of
    /// which are emitted verbatim. Returns the index to resume at and whether
    /// a `${` expression was opened.
    fn blank_template_text(chars: &[char], start: usize, out: &mut String) -> (usize, bool) {
        let mut i = start;
        while i < chars.len() {
            match chars[i] {
                // Escaped character — replace both with spaces
                '\\' if i + 1 < chars.len() => {
                    out.push(' ');
                    out.push(' ');
                    i += 2;
                }
                '`' => {
                    out.push('`');
                    return (i + 1, false);
                }
                '$' if chars.get(i + 1) == Some(&'{') => {
                    out.push_str("${");
                    return (i + 2, true);
                }
                _ => {
                    out.push(' ');
                    i += 1;
                }
            }
        }
        (i, false)
    }

    let chars: Vec<char> = code.chars().collect();
    let len = chars.len();
    let mut result = String::with_capacity(code.len());
    // One entry per currently open `${ ... }` expression, holding the number
    // of unmatched `{` seen inside it. A `}` at depth 0 closes the expression.
    let mut expr_depths: Vec<usize> = Vec::new();
    let mut i = 0;

    while i < len {
        match chars[i] {
            // Single or double quoted strings
            q @ ('\'' | '"') => {
                result.push(q);
                i += 1;
                while i < len && chars[i] != q {
                    // An escape consumes two characters so that an escaped
                    // quote does not end the string.
                    let width = if chars[i] == '\\' && i + 1 < len { 2 } else { 1 };
                    for _ in 0..width {
                        result.push(' ');
                    }
                    i += width;
                }
                if i < len {
                    result.push(q); // closing quote
                    i += 1;
                }
            }
            // Template literals
            '`' => {
                result.push('`');
                let (resume, opened) = blank_template_text(&chars, i + 1, &mut result);
                if opened {
                    expr_depths.push(0);
                }
                i = resume;
            }
            '{' => {
                if let Some(depth) = expr_depths.last_mut() {
                    *depth += 1;
                }
                result.push('{');
                i += 1;
            }
            '}' => {
                result.push('}');
                i += 1;
                match expr_depths.last().copied() {
                    // This brace closes the innermost template expression:
                    // what follows is template text again.
                    Some(0) => {
                        expr_depths.pop();
                        let (resume, opened) = blank_template_text(&chars, i, &mut result);
                        if opened {
                            expr_depths.push(0);
                        }
                        i = resume;
                    }
                    Some(_) => {
                        if let Some(depth) = expr_depths.last_mut() {
                            *depth -= 1;
                        }
                    }
                    None => {}
                }
            }
            other => {
                result.push(other);
                i += 1;
            }
        }
    }

    result
}
162
163#[cfg(not(feature = "ast-validator"))]
164fn collapse_whitespace_before_parens(code: &str) -> String {
165    let re = Regex::new(r"(\w)\s+\(").expect("valid regex");
166    re.replace_all(code, "$1(").into_owned()
167}
168
/// Skip leading whitespace, `//` line comments and `/* ... */` block comments,
/// returning the remaining code. This allows files to have header comments
/// before the required `async () => { ... }` arrow function.
///
/// Returns `""` when the input ends inside a `//` comment (no newline
/// follows it). When a `/*` comment is never closed, the remainder starting
/// at that `/*` is returned unchanged so a later stage can report it.
///
/// The search for `*/` begins after the opening `/*`, so the `*` of the opener
/// is never reused as part of the closer: `/*/` is an unterminated comment,
/// not a complete one.
fn skip_leading_comments(s: &str) -> &str {
    let mut rest = s;
    loop {
        rest = rest.trim_start();
        if rest.starts_with("//") {
            // Skip to end of line
            match rest.find('\n') {
                Some(pos) => rest = &rest[pos + 1..],
                None => return "",
            }
        } else if let Some(after_open) = rest.strip_prefix("/*") {
            // Skip to closing */ (searched for only past the opener)
            match after_open.find("*/") {
                Some(pos) => rest = &after_open[pos + 2..],
                None => return rest, // unclosed comment — let parser report it
            }
        } else {
            return rest;
        }
    }
}
193
194/// Validates LLM-generated code before sandbox execution.
195pub fn validate_code(code: &str, max_size: Option<usize>) -> Result<(), SandboxError> {
196    let max = max_size.unwrap_or(DEFAULT_MAX_CODE_SIZE);
197
198    // 1. Size limit
199    if code.len() > max {
200        return Err(SandboxError::CodeTooLarge {
201            max,
202            actual: code.len(),
203        });
204    }
205
206    // 2. Empty code
207    if code.trim().is_empty() {
208        return Err(SandboxError::ValidationFailed {
209            reason: "code is empty".into(),
210        });
211    }
212
213    // 3. Must be an async arrow function (skip leading // and /* */ comments)
214    let trimmed = code.trim();
215    let code_start = skip_leading_comments(trimmed);
216    if !code_start.starts_with("async") {
217        return Err(SandboxError::ValidationFailed {
218            reason: "code must be an async arrow function, e.g. `async () => { ... }`. \
219                     Do not provide bare statements — wrap your code in `async () => { ... }`"
220                .into(),
221        });
222    }
223
224    // 4. Pattern-based validation.
225    //    With ast-validator: Unicode normalize → AST parse + walk
226    //    Without ast-validator: Unicode normalize → comment strip → string strip → regex scan
227    validate_patterns(code)
228}
229
/// Banned-pattern check used when the `ast-validator` feature is enabled.
///
/// The source is first passed through `normalize_unicode_confusables` and the
/// result is handed to `crate::ast_validator::validate_ast`. Any violation it
/// reports is translated into a `SandboxError`: parse failures and excessive
/// nesting become `ValidationFailed`, banned constructs become `BannedPattern`.
#[cfg(feature = "ast-validator")]
fn validate_patterns(code: &str) -> Result<(), SandboxError> {
    use crate::ast_validator::{validate_ast, AstViolation};

    /// Maps one AST violation onto the sandbox's public error type.
    fn to_sandbox_error(violation: AstViolation) -> SandboxError {
        match violation {
            AstViolation::ParseError(msg) => SandboxError::ValidationFailed {
                reason: format!("code could not be parsed: {msg}"),
            },
            AstViolation::NestingTooDeep { max, actual } => SandboxError::ValidationFailed {
                reason: format!("code nesting depth {actual} exceeds maximum {max}"),
            },
            AstViolation::BannedPattern { description } => SandboxError::BannedPattern {
                pattern: description,
            },
        }
    }

    // Fold homoglyphs before parsing so the AST walk sees the ASCII spelling.
    let ascii_folded = normalize_unicode_confusables(code);
    validate_ast(&ascii_folded).map_err(to_sandbox_error)
}
253
254/// Regex-based pattern validation (fallback when ast-validator feature is disabled).
255#[cfg(not(feature = "ast-validator"))]
256fn validate_patterns(code: &str) -> Result<(), SandboxError> {
257    let normalized = collapse_whitespace_before_parens(&strip_string_contents(&strip_js_comments(
258        &normalize_unicode_confusables(code),
259    )));
260    for pattern in BANNED_PATTERNS {
261        if normalized.contains(pattern) {
262            return Err(SandboxError::BannedPattern {
263                pattern: (*pattern).to_string(),
264            });
265        }
266    }
267    Ok(())
268}
269
// Unit tests for the validator. Most tests drive the public `validate_code`
// entry point and assert on the `SandboxError` variant returned; the final
// test calls `strip_string_contents` directly.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn accepts_valid_async_arrow() {
        let code = r#"async () => { return manifest.tools.filter(t => t.category === "ast"); }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn rejects_empty_code() {
        assert!(validate_code("", None).is_err());
        assert!(validate_code("   ", None).is_err());
    }

    #[test]
    fn rejects_oversized_code() {
        let big = "x".repeat(100_000);
        let err = validate_code(&big, None).unwrap_err();
        assert!(matches!(err, SandboxError::CodeTooLarge { .. }));
    }

    #[test]
    fn rejects_eval() {
        let code = r#"async () => { return eval("1+1"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_dynamic_import() {
        let code = r#"async () => { const m = await import("fs"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_deno_access() {
        let code = r#"async () => { return Deno.readFile("/etc/passwd"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_proto_pollution() {
        let code = r#"async () => { ({}).__proto__.polluted = true; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    // --- New tests for WU3 ---

    #[test]
    fn accepts_data_process_status() {
        // "process." as a substring should NOT be rejected — only specific
        // process.env/exit/argv/etc. patterns are banned.
        let code = r#"async () => { return data.process.status; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn rejects_process_env() {
        let code = r#"async () => { return process.env.SECRET; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_constructor_constructor() {
        let code = r#"async () => { return "".constructor.constructor("return this")(); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_reflect_construct() {
        let code = r#"async () => { return Reflect.construct(Array, []); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_globalthis_bracket_access() {
        let code = r#"async () => { return globalThis["eval"]("1+1"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_string_from_char_code() {
        let code = r#"async () => { return String.fromCharCode(101, 118, 97, 108); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn accepts_legitimate_constructor_property() {
        // Accessing .constructor (not .constructor[ or .constructor.constructor) is fine
        let code = r#"async () => { return obj.constructor.name; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn custom_max_size() {
        // The generated code is longer than 50 bytes and shorter than 200.
        let code = format!("async () => {{ {} }}", "x".repeat(100));
        assert!(validate_code(&code, Some(50)).is_err());
        assert!(validate_code(&code, Some(200)).is_ok());
    }

    #[test]
    fn rejects_bare_statements() {
        let code = r#"return manifest.servers.map(s => s.name);"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::ValidationFailed { .. }));
        let msg = err.to_string();
        assert!(
            msg.contains("async arrow function"),
            "error should guide user to use async arrow: {msg}"
        );
    }

    #[test]
    fn rejects_non_async_function() {
        let code = r#"() => { return 42; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::ValidationFailed { .. }));
    }

    #[test]
    fn accepts_leading_line_comments() {
        let code = "// header comment\n// another comment\nasync () => { return 42; }";
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn accepts_leading_block_comments() {
        let code = "/* block comment */\nasync () => { return 42; }";
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn accepts_mixed_leading_comments() {
        let code =
            "// @prompt test\n// @features none\n/* multi\nline */\nasync () => { return 1; }";
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn rejects_comments_without_async() {
        let code = "// just a comment\nreturn 42;";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::ValidationFailed { .. }));
    }

    // --- Evasion prevention tests ---

    #[test]
    fn rejects_eval_with_block_comment_bypass() {
        // eval/*trick*/( should still be caught after comment stripping
        let code = r#"async () => { return eval/*trick*/("1+1"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_eval_with_line_comment_evasion() {
        // Multi-line evasion with line comment
        let code = "async () => { return eval//comment\n(\"1+1\"); }";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_eval_with_whitespace_bypass() {
        // eval ( with space should be caught
        let code = r#"async () => { return eval ("1+1"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_eval_with_tab_bypass() {
        // Same as the space case, with a tab between `eval` and `(`
        let code = "async () => { return eval\t(\"1+1\"); }";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_cyrillic_eval_homoglyph() {
        // Cyrillic е (U+0435) instead of Latin e
        let code = "async () => { return \u{0435}val(\"1+1\"); }";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_cyrillic_deno_homoglyph() {
        // Cyrillic е (U+0435) in place of the Latin e in "Deno"
        let code = "async () => { return D\u{0435}no.readFile(\"/etc/passwd\"); }";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_fullwidth_eval() {
        // Fullwidth e (U+FF45), v (U+FF56), a (U+FF41), l (U+FF4C)
        let code = "async () => { return \u{FF45}\u{FF56}\u{FF41}\u{FF4C}(\"1+1\"); }";
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_function_constructor_with_comment() {
        // An empty block comment between `Function` and `(` must not hide the call
        let code = r#"async () => { return Function/**/("return this")(); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn rejects_import_with_whitespace() {
        let code = r#"async () => { const m = await import ("fs"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    // --- VP-01: rejects String.raw ---
    #[test]
    fn vp01_rejects_string_raw() {
        let code = r#"async () => { return String.raw`\x61\x62\x63`; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    // --- VP-02: rejects WebAssembly ---
    #[test]
    fn vp02_rejects_webassembly() {
        let code = r#"async () => { const m = new WebAssembly.Module(buf); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    // --- VP-03: rejects Symbol.toPrimitive ---
    #[test]
    fn vp03_rejects_symbol_toprimitive() {
        let code = r#"async () => { obj[Symbol.toPrimitive] = () => "exploit"; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    // --- VP-04: no false positives on similar patterns ---
    #[test]
    fn vp04_no_false_positives() {
        // "Symbol.iterator" should NOT be banned (legitimate JS usage)
        let code = r#"async () => { for (const x of obj[Symbol.iterator]()) {} }"#;
        assert!(validate_code(code, None).is_ok());

        // Test that normal strings containing "raw" don't trigger
        let code2 = r#"async () => { return "raw data"; }"#;
        assert!(validate_code(code2, None).is_ok());
    }

    #[test]
    fn legitimate_comments_dont_cause_false_positives() {
        // A normal comment that happens to mention eval should be fine
        // because after stripping, the code itself doesn't contain eval(
        let code = r#"async () => { /* this does not use eval */ return 42; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    // --- WI-5: String literal content should not trigger banned patterns ---

    #[test]
    fn wi5_accepts_deno_in_string_literal() {
        let code = r#"async () => { return { pattern: "Deno.readFile" }; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn wi5_accepts_eval_in_string_literal() {
        let code = r#"async () => { return "eval(is bad)"; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn wi5_still_rejects_eval_outside_string() {
        // eval() in code, even with "eval" also in a string, should be caught
        let code = r#"async () => { const x = "eval"; return eval("1"); }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn wi5_accepts_process_env_in_string_literal() {
        let code = r#"async () => { return { query: "process.env search" }; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn wi5_accepts_import_in_template_literal_text() {
        let code = r#"async () => { return `import("x") is banned`; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn wi5_still_catches_eval_in_template_expression() {
        // Code inside `${ ... }` is executed, so it must still be scanned
        let code = r#"async () => { return `${eval("1")}`; }"#;
        let err = validate_code(code, None).unwrap_err();
        assert!(matches!(err, SandboxError::BannedPattern { .. }));
    }

    #[test]
    fn wi5_handles_escaped_quotes_in_strings() {
        // The escaped quote must not end the string early and expose "Deno."
        let code = r#"async () => { return 'it\'s fine to mention Deno.'; }"#;
        assert!(validate_code(code, None).is_ok());
    }

    #[test]
    fn wi5_strip_string_contents_unit() {
        // Direct function test
        let input = r#"foo("Deno.readFile") + bar('eval(') + `import(`"#;
        let stripped = strip_string_contents(input);
        assert!(!stripped.contains("Deno"));
        assert!(!stripped.contains("eval"));
        // Template literal content stripped
        assert!(!stripped.contains("import"));
        // Delimiters preserved
        assert!(stripped.contains('"'));
        assert!(stripped.contains('\''));
        assert!(stripped.contains('`'));
    }
}