Skip to main content

kintsugi_core/
parse.rs

1//! Shell AST front-end for the classifier (pure-Rust, via `brush-parser`).
2//!
3//! The Tier-1 classifier historically tokenized the command line with a small
4//! hand-rolled splitter. That is fast and dependency-light, but it can't see the
5//! true shell structure — commands hidden inside command substitution `$(…)` /
6//! backticks, here-documents fed to a shell, or unusual quoting. This module
7//! parses the line into a real bash AST and flattens it to the list of *simple
8//! commands* it would run, descending into:
9//!   - pipelines, `&&`/`||`/`;` lists, and compound commands (subshells, groups,
10//!     `if`/`for`/`while`/`case`/functions),
11//!   - command substitutions `$(…)` and backticks found in any word,
12//!   - here-document bodies fed to a shell (`bash <<EOF … EOF`),
13//!   - the `-c` script of a shell, and `find -exec` / `xargs` payloads.
14//!
15//! The classifier composes this with the existing tokenizer pass **worst-wins**,
16//! so the AST can only ever *add* detections (deeper, obfuscated payloads) and
17//! never downgrade a tokenizer verdict. A parse failure yields `None` — the
18//! caller treats that as "the AST found nothing", and the tokenizer pass (and
19//! the fail-toward-caution default) still stands.
20
21use brush_parser::ast;
22
23/// One simple command extracted from the AST: program plus its argument words.
24/// Word text is raw (quotes/expansions preserved), exactly as the agent wrote
25/// it — the classifier trims quotes where it matters.
26#[derive(Debug, Clone, PartialEq, Eq)]
27pub struct SimpleCmd {
28    pub program: String,
29    pub args: Vec<String>,
30}
31
32/// What the AST pass found: every simple command the line would run (flattened,
33/// including those nested in substitutions / compounds), plus the raw text of
34/// every command substitution `$(…)` / backtick — so the classifier can also
35/// whole-line-scan substitution bodies (e.g. a `curl … | sh` hidden in `$(…)`).
36#[derive(Debug, Default, Clone, PartialEq, Eq)]
37pub struct Analysis {
38    pub commands: Vec<SimpleCmd>,
39    pub substitutions: Vec<String>,
40    /// Set when the walk stopped early at [`MAX_DEPTH`] — some commands may not
41    /// have been collected, so the caller must fail toward caution (never Safe)
42    /// rather than trust an incomplete command list.
43    pub truncated: bool,
44}
45
46/// Parse `raw` into an [`Analysis`]. Returns `None` if the line can't be parsed
47/// (caller falls back to the tokenizer pass + the cautious default).
48pub fn analyze(raw: &str) -> Option<Analysis> {
49    let program = parse_program(raw)?;
50    let mut a = Analysis::default();
51    collect_program(&program, &mut a, 0);
52    Some(a)
53}
54
55/// Just the flattened simple commands (used in tests).
56pub fn ast_commands(raw: &str) -> Option<Vec<SimpleCmd>> {
57    analyze(raw).map(|a| a.commands)
58}
59
60/// Program basename without directory or `.exe`.
61fn basename(arg0: &str) -> &str {
62    let base = arg0.rsplit(['/', '\\']).next().unwrap_or(arg0);
63    base.strip_suffix(".exe").unwrap_or(base)
64}
65
66fn parse_program(raw: &str) -> Option<ast::Program> {
67    // `brush_parser` recurses per nesting level, and pathologically deep input
68    // (hundreds of nested `$(…)`) can overflow its stack — an *uncatchable*
69    // abort, not a panic. Refuse to parse such input; the caller stays cautious.
70    if exceeds_nesting(raw) {
71        return None;
72    }
73    // `brush_parser`'s here-doc / here-string tokenizer can attempt a multi-
74    // gigabyte allocation (heap-exhaustion DoS) on short *malformed* here-operator
75    // input (`)x<< .env$(…`, `<< ''`, `<<<<<`, …). Neutralize the here-operators
76    // *before* tokenizing so the parser never enters that reader — see
77    // `neutralize_here_operators`. Substitutions and other structure are
78    // preserved, so nothing is hidden from classification.
79    let prepared = neutralize_here_operators(raw);
80    let tokens = brush_parser::tokenize_str(&prepared).ok()?;
81    let opts = brush_parser::ParserOptions::default();
82    brush_parser::parse_tokens(&tokens, &opts).ok()
83}
84
85/// Rewrite every here-operator run (`<<`, `<<<`, `<<<<…`) to a single space so
86/// `brush_parser` never enters its heredoc reader (which can heap-exhaust on
87/// malformed input — a DoS the fuzzer found). Substitutions and other structure
88/// are preserved, so a `$(…)`-hidden catastrophe is still parsed (no leak); a
89/// here-doc body sits on its own newline-separated line and is still parsed as
90/// command(s). Here-strings (`bash <<< 'rm -rf /'`) are caught by the tokenizer
91/// pass instead (see `wrapped_commands`). A lone `<` (normal redirect) is left
92/// untouched.
93fn neutralize_here_operators(raw: &str) -> std::borrow::Cow<'_, str> {
94    if !raw.contains("<<") {
95        return std::borrow::Cow::Borrowed(raw);
96    }
97    let mut out = String::with_capacity(raw.len());
98    let mut chars = raw.chars().peekable();
99    while let Some(c) = chars.next() {
100        if c == '<' && chars.peek() == Some(&'<') {
101            while chars.peek() == Some(&'<') {
102                chars.next();
103            }
104            out.push(' ');
105        } else {
106            out.push(c);
107        }
108    }
109    std::borrow::Cow::Owned(out)
110}
111
112/// Hard ceiling on parser-recursion-driving nesting, well under the depth that
113/// overflows `brush_parser`'s stack. Real commands never approach it.
114const MAX_NESTING: usize = 48;
115
116/// Whether `raw` nests parens / brace groups / backticks / compound keywords
117/// deeply enough to risk the parser's stack. Cheap single pass; conservative.
118fn exceeds_nesting(raw: &str) -> bool {
119    let mut depth: i32 = 0;
120    let mut max_depth: i32 = 0;
121    let mut backticks = 0usize;
122    for b in raw.bytes() {
123        match b {
124            b'(' | b'{' => {
125                depth += 1;
126                max_depth = max_depth.max(depth);
127            }
128            b')' | b'}' => depth = (depth - 1).max(0),
129            b'`' => backticks += 1,
130            _ => {}
131        }
132    }
133    // Nested compound statements (`if … then … fi`) recurse the parser too.
134    let keywords = raw
135        .split_whitespace()
136        .filter(|t| {
137            matches!(
138                *t,
139                "if" | "for" | "while" | "until" | "case" | "select" | "do" | "then"
140            )
141        })
142        .count();
143    max_depth as usize > MAX_NESTING || backticks > MAX_NESTING || keywords > MAX_NESTING
144}
145
146/// Guard against pathological nesting of substitutions / compounds while walking
147/// the parsed tree. Generous (the parser-stack ceiling is enforced separately by
148/// [`exceeds_nesting`]); reaching it sets `Analysis::truncated` so the caller
149/// fails toward caution.
150const MAX_DEPTH: u8 = 64;
151
152fn collect_program(program: &ast::Program, a: &mut Analysis, depth: u8) {
153    if depth > MAX_DEPTH {
154        a.truncated = true;
155        return;
156    }
157    for complete in &program.complete_commands {
158        collect_compound_list(complete, a, depth);
159    }
160}
161
162fn collect_compound_list(list: &ast::CompoundList, a: &mut Analysis, depth: u8) {
163    if depth > MAX_DEPTH {
164        a.truncated = true;
165        return;
166    }
167    for item in &list.0 {
168        collect_and_or(&item.0, a, depth);
169    }
170}
171
172fn collect_and_or(and_or: &ast::AndOrList, a: &mut Analysis, depth: u8) {
173    collect_pipeline(&and_or.first, a, depth);
174    for extra in &and_or.additional {
175        let pipeline = match extra {
176            ast::AndOr::And(p) | ast::AndOr::Or(p) => p,
177        };
178        collect_pipeline(pipeline, a, depth);
179    }
180}
181
182fn collect_pipeline(pipeline: &ast::Pipeline, a: &mut Analysis, depth: u8) {
183    for cmd in &pipeline.seq {
184        collect_command(cmd, a, depth);
185    }
186}
187
188fn collect_command(cmd: &ast::Command, a: &mut Analysis, depth: u8) {
189    match cmd {
190        ast::Command::Simple(sc) => collect_simple(sc, a, depth),
191        ast::Command::Compound(compound, _redirects) => collect_compound(compound, a, depth + 1),
192        // A function definition doesn't *run* its body, but the body is run when
193        // the function is later called (often on the same line: `f(){ rm -rf /; }; f`).
194        // Walk it now — over-flagging a defined-but-uncalled function is the
195        // correct, cautious bias.
196        ast::Command::Function(func) => collect_compound(&func.body.0, a, depth + 1),
197        // Extended-test expressions (`[[ … ]]`) don't run a command. Ignore.
198        ast::Command::ExtendedTest(_, _) => {}
199    }
200}
201
202fn collect_compound(compound: &ast::CompoundCommand, a: &mut Analysis, depth: u8) {
203    if depth > MAX_DEPTH {
204        a.truncated = true;
205        return;
206    }
207    use ast::CompoundCommand::*;
208    match compound {
209        BraceGroup(g) => collect_compound_list(&g.list, a, depth),
210        Subshell(s) => collect_compound_list(&s.list, a, depth),
211        ForClause(f) => collect_compound_list(&f.body.list, a, depth),
212        // `WhileOrUntilClauseCommand(condition, do-group, loc)`.
213        WhileClause(w) | UntilClause(w) => {
214            collect_compound_list(&w.0, a, depth);
215            collect_compound_list(&w.1.list, a, depth);
216        }
217        IfClause(i) => {
218            collect_compound_list(&i.condition, a, depth);
219            collect_compound_list(&i.then, a, depth);
220            if let Some(elses) = &i.elses {
221                for e in elses {
222                    if let Some(cond) = &e.condition {
223                        collect_compound_list(cond, a, depth);
224                    }
225                    collect_compound_list(&e.body, a, depth);
226                }
227            }
228        }
229        CaseClause(c) => {
230            for item in &c.cases {
231                if let Some(cmds) = &item.cmd {
232                    collect_compound_list(cmds, a, depth);
233                }
234            }
235        }
236        Arithmetic(_) | ArithmeticForClause(_) | Coprocess(_) => {}
237    }
238}
239
240fn collect_simple(sc: &ast::SimpleCommand, a: &mut Analysis, depth: u8) {
241    // Every word that could carry a command substitution: the prefix
242    // assignments (e.g. `x=$(…)`), the program name, and the argument words.
243    let mut scan_words: Vec<String> = Vec::new();
244
245    if let Some(prefix) = &sc.prefix {
246        for item in &prefix.0 {
247            match item {
248                ast::CommandPrefixOrSuffixItem::AssignmentWord(_, w) => {
249                    scan_words.push(w.value.clone())
250                }
251                ast::CommandPrefixOrSuffixItem::Word(w) => scan_words.push(w.value.clone()),
252                // Process substitution `<(…)` / `>(…)` runs its inner command.
253                ast::CommandPrefixOrSuffixItem::ProcessSubstitution(_, sub) => {
254                    collect_compound_list(&sub.list, a, depth + 1)
255                }
256                _ => {}
257            }
258        }
259    }
260
261    // If the program is a shell, a here-doc / here-string body IS a script.
262    let is_shell = sc
263        .word_or_name
264        .as_ref()
265        .map(|n| {
266            matches!(
267                basename(&n.value),
268                "sh" | "bash" | "zsh" | "dash" | "ash" | "ksh"
269            )
270        })
271        .unwrap_or(false);
272
273    let mut args = Vec::new();
274    if let Some(suffix) = &sc.suffix {
275        for item in &suffix.0 {
276            match item {
277                ast::CommandPrefixOrSuffixItem::Word(w) => args.push(w.value.clone()),
278                // Process substitution `<(…)` / `>(…)` as an argument runs its
279                // inner command (e.g. `diff <(a) <(b)`, `grep x <(rm -rf /)`).
280                ast::CommandPrefixOrSuffixItem::ProcessSubstitution(_, sub) => {
281                    collect_compound_list(&sub.list, a, depth + 1)
282                }
283                ast::CommandPrefixOrSuffixItem::IoRedirect(io) => {
284                    // Process substitution as a redirect *target* also runs its
285                    // inner command (`cmd > >(rm -rf /)`).
286                    if let ast::IoRedirect::File(
287                        _,
288                        _,
289                        ast::IoFileRedirectTarget::ProcessSubstitution(_, sub),
290                    ) = io
291                    {
292                        collect_compound_list(&sub.list, a, depth + 1);
293                    }
294                    // A here-doc / here-string fed to a shell carries a script body.
295                    if is_shell {
296                        let body = match io {
297                            ast::IoRedirect::HereDocument(_, hd) => Some(hd.doc.value.clone()),
298                            // A here-string keeps its surrounding quotes in the word;
299                            // the actual stdin is the unquoted content.
300                            ast::IoRedirect::HereString(_, w) => {
301                                Some(w.value.trim_matches(['"', '\'']).to_string())
302                            }
303                            _ => None,
304                        };
305                        if let Some(body) = body {
306                            if let Some(inner) = parse_program(&body) {
307                                collect_program(&inner, a, depth + 1);
308                            }
309                        }
310                    }
311                }
312                _ => {}
313            }
314            // Plain file redirects are handled by the snapshot predictor, not here.
315        }
316    }
317
318    if let Some(name) = &sc.word_or_name {
319        scan_words.push(name.value.clone());
320    }
321    for arg in &args {
322        scan_words.push(arg.clone());
323    }
324
325    // Record + recurse command substitutions / backticks found in any of those
326    // words (so substitution bodies are both classified per-program and
327    // available for the whole-line scans).
328    for word in &scan_words {
329        for sub in command_substitutions(word) {
330            if let Some(inner) = parse_program(&sub) {
331                collect_program(&inner, a, depth + 1);
332            }
333            a.substitutions.push(sub);
334        }
335    }
336
337    // A pure assignment (`x=…`) has no program to classify, but its substitution
338    // was already recursed above.
339    if let Some(name) = &sc.word_or_name {
340        a.commands.push(SimpleCmd {
341            program: name.value.clone(),
342            args,
343        });
344    }
345}
346
347/// Extract command-substitution payloads from a single (raw) word: `$(…)` with
348/// balanced parens, and `` `…` `` backticks. Substitutions inside single quotes
349/// are NOT expanded by the shell, so they're skipped.
350fn command_substitutions(word: &str) -> Vec<String> {
351    let mut subs = Vec::new();
352    let bytes = word.as_bytes();
353    let mut i = 0;
354    let mut in_single = false;
355    while i < bytes.len() {
356        let c = bytes[i] as char;
357        if c == '\'' {
358            in_single = !in_single;
359            i += 1;
360            continue;
361        }
362        if in_single {
363            i += 1;
364            continue;
365        }
366        // `$(` … `)` with paren-depth tracking (handles nested `$( $() )`).
367        if c == '$' && i + 1 < bytes.len() && bytes[i + 1] == b'(' {
368            let start = i + 2;
369            let mut depth = 1;
370            let mut j = start;
371            while j < bytes.len() && depth > 0 {
372                match bytes[j] {
373                    b'(' => depth += 1,
374                    b')' => depth -= 1,
375                    _ => {}
376                }
377                j += 1;
378            }
379            if depth == 0 {
380                subs.push(word[start..j - 1].to_string());
381                i = j;
382                continue;
383            }
384        }
385        // Backticks: `…` up to the next backtick.
386        if c == '`' {
387            if let Some(end) = word[i + 1..].find('`') {
388                subs.push(word[i + 1..i + 1 + end].to_string());
389                i = i + 1 + end + 1;
390                continue;
391            }
392        }
393        i += 1;
394    }
395    subs
396}
397
398#[cfg(test)]
399mod tests {
400    use super::*;
401
402    fn progs(raw: &str) -> Vec<String> {
403        ast_commands(raw)
404            .unwrap_or_default()
405            .into_iter()
406            .map(|c| c.program)
407            .collect()
408    }
409
410    #[test]
411    fn flattens_pipelines_lists_and_separators() {
412        let p = progs("cd build && rm -rf ../dist; echo a | sh");
413        assert!(p.contains(&"cd".to_string()));
414        assert!(p.contains(&"rm".to_string()));
415        assert!(p.contains(&"echo".to_string()));
416        assert!(p.contains(&"sh".to_string()));
417    }
418
419    #[test]
420    fn recurses_command_substitution_and_backticks() {
421        // The dangerous program lives only inside `$(…)` / backticks.
422        assert!(progs("echo \"$(rm -rf /)\"").contains(&"rm".to_string()));
423        assert!(progs("x=`git push --force`").contains(&"git".to_string()));
424        // Nested substitution.
425        assert!(progs("echo $( echo $(terraform destroy) )").contains(&"terraform".to_string()));
426    }
427
428    #[test]
429    fn single_quotes_are_not_substitutions() {
430        // `$(...)` inside single quotes is literal text, not a command.
431        let p = progs("echo '$(rm -rf /)'");
432        assert!(p.contains(&"echo".to_string()));
433        assert!(
434            !p.contains(&"rm".to_string()),
435            "single-quoted is literal: {p:?}"
436        );
437    }
438
439    #[test]
440    fn descends_into_compounds() {
441        assert!(progs("if true; then rm -rf /; fi").contains(&"rm".to_string()));
442        assert!(progs("( cd x && git push --force )").contains(&"git".to_string()));
443    }
444
445    #[test]
446    fn descends_into_process_substitution() {
447        // `<(cmd)` / `>(cmd)` run their inner command.
448        assert!(progs("grep x <(rm -rf /)").contains(&"rm".to_string()));
449        assert!(progs("diff <(git push --force) /dev/null").contains(&"git".to_string()));
450        // Process substitution as a redirect target.
451        assert!(progs("echo hi > >(rm -rf /)").contains(&"rm".to_string()));
452    }
453
454    #[test]
455    fn descends_into_function_bodies() {
456        // A function body is walked (it runs when the function is called).
457        assert!(progs("f(){ rm -rf /; }; f").contains(&"rm".to_string()));
458        assert!(progs("function g { git push --force; }; g").contains(&"git".to_string()));
459    }
460
461    #[test]
462    fn deep_nesting_is_refused_not_aborted() {
463        // Hundreds of nested `$(` would overflow brush-parser's stack (an
464        // uncatchable abort). We must refuse to parse it, returning None.
465        let bomb = format!("echo {}rm -rf /{}", "$(".repeat(300), ")".repeat(300));
466        assert!(analyze(&bomb).is_none(), "deep nesting must be refused");
467    }
468
469    #[test]
470    fn moderate_nesting_is_fully_walked() {
471        // Within the ceiling, the buried command is still found (no silent drop).
472        let nested = format!("echo {}rm -rf /{}", "$(".repeat(12), ")".repeat(12));
473        assert!(progs(&nested).contains(&"rm".to_string()));
474    }
475
476    #[test]
477    fn backtick_and_keyword_bombs_are_refused() {
478        // Excessive backtick nesting and compound-keyword nesting are refused
479        // before the parser can overflow its stack.
480        let backticks: String = "`".repeat(MAX_NESTING + 5);
481        assert!(analyze(&backticks).is_none());
482        let keywords = "if true; then ".repeat(MAX_NESTING + 5);
483        assert!(analyze(&keywords).is_none());
484    }
485
486    #[test]
487    fn heredoc_bodies_are_conservatively_surfaced() {
488        // To stay DoS-safe, here-operators are neutralized before parsing, so a
489        // here-doc body is conservatively parsed as command(s) rather than data.
490        // This can over-flag (a heredoc to `cat` whose body reads like a command),
491        // which is recoverable — the point is a dangerous body is never hidden.
492        let p = progs("cat <<EOF\nrm -rf /\nEOF\n");
493        assert!(
494            p.contains(&"rm".to_string()),
495            "body must be surfaced: {p:?}"
496        );
497    }
498
499    #[test]
500    fn unparseable_is_none() {
501        // An unterminated quote is a parse error → None (caller stays cautious).
502        assert!(ast_commands("echo 'unterminated").is_none());
503    }
504
505    #[test]
506    fn args_are_captured() {
507        let cmds = ast_commands("rm -rf build").unwrap();
508        let rm = cmds.iter().find(|c| c.program == "rm").unwrap();
509        assert!(rm.args.iter().any(|a| a == "-rf"));
510        assert!(rm.args.iter().any(|a| a == "build"));
511    }
512}