Skip to main content

agent_shell_parser/parse/
types.rs

1use std::fmt;
2
/// Shell operator separating consecutive pipeline segments.
///
/// Every variant is a fixed token without a payload, so the type is `Copy`
/// and `Hash`: operators can be passed by value and used as map or set keys
/// without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Operator {
    /// `&&` — run next only if previous succeeded
    And,
    /// `||` — run next only if previous failed
    Or,
    /// `;` — run next unconditionally
    Semi,
    /// `|` — pipe stdout
    Pipe,
    /// `|&` — pipe stdout+stderr
    PipeErr,
    /// `&` — previous command backgrounded, next runs immediately
    Background,
}

impl Operator {
    /// Returns the operator as it is spelled in shell source.
    pub fn as_str(&self) -> &'static str {
        match self {
            Operator::And => "&&",
            Operator::Or => "||",
            Operator::Semi => ";",
            Operator::Pipe => "|",
            Operator::PipeErr => "|&",
            Operator::Background => "&",
        }
    }
}

impl fmt::Display for Operator {
    /// Writes the same text that [`Operator::as_str`] returns.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(self.as_str())
    }
}
39
40/// A fully decomposed compound command.
41///
42/// This is a recursive structure: segments may contain substitutions, and
43/// each substitution contains a recursively-parsed [`ParsedPipeline`].
44/// Evaluation proceeds bottom-up (a catamorphism): inner substitutions are
45/// evaluated first, their output feeds the outer command.
46#[derive(Debug, Clone)]
47pub struct ParsedPipeline {
48    pub segments: Vec<ShellSegment>,
49    /// Operators between consecutive segments.
50    pub operators: Vec<Operator>,
51    /// Substitutions in non-command structural positions: `for`-loop
52    /// iteration values (`for i in $(cmd)`), `case` subjects
53    /// (`case $(cmd) in`).
54    ///
55    /// These execute before any segment in this pipeline. Each is
56    /// recursively parsed.
57    pub structural_substitutions: Vec<SubstitutionSpan>,
58    /// `true` when tree-sitter produced error-recovery nodes in the AST.
59    ///
60    /// The pipeline is still usable — tree-sitter always produces a tree —
61    /// but callers should treat the result as best-effort.
62    pub has_parse_errors: bool,
63}
64
65impl ParsedPipeline {
66    /// An empty pipeline representing a parse failure.
67    pub fn empty_with_error() -> Self {
68        Self {
69            segments: vec![],
70            operators: vec![],
71            structural_substitutions: vec![],
72            has_parse_errors: true,
73        }
74    }
75
76    /// Walk all pipelines in the tree (this one and all nested ones),
77    /// depth-first. Returns the first `Some(T)` produced by `f`.
78    ///
79    /// This is the lowest-level traversal primitive — it visits pipeline
80    /// nodes rather than segments, enabling checks on pipeline-level
81    /// properties (like `has_parse_errors`).
82    pub fn find_pipeline<T>(&self, f: &impl Fn(&ParsedPipeline) -> Option<T>) -> Option<T> {
83        if let Some(hit) = f(self) {
84            return Some(hit);
85        }
86        for sub in &self.structural_substitutions {
87            if let Some(hit) = sub.pipeline.find_pipeline(f) {
88                return Some(hit);
89            }
90        }
91        for seg in &self.segments {
92            for sub in &seg.substitutions {
93                if let Some(hit) = sub.pipeline.find_pipeline(f) {
94                    return Some(hit);
95                }
96            }
97        }
98        None
99    }
100
101    /// Returns `true` if any pipeline in the tree satisfies `f`.
102    pub fn any_pipeline(&self, f: &impl Fn(&ParsedPipeline) -> bool) -> bool {
103        self.find_pipeline(&|p| if f(p) { Some(()) } else { None })
104            .is_some()
105    }
106
107    /// Walk the pipeline tree depth-first in execution order, applying `f`
108    /// to each [`ShellSegment`]. Returns the first `Some(T)` produced by
109    /// `f`, or `None` if every segment returns `None`.
110    ///
111    /// Traversal order mirrors shell evaluation:
112    /// 1. Structural substitutions (for-loop values, case subjects)
113    /// 2. For each segment: its substitutions first, then the segment itself
114    ///
115    /// This is the canonical way to inspect every command in the tree.
116    /// Both "does any segment satisfy P?" and "find the first segment
117    /// matching P" reduce to this.
118    pub fn find_segment<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Option<T> {
119        for sub in &self.structural_substitutions {
120            if let Some(hit) = sub.pipeline.find_segment(f) {
121                return Some(hit);
122            }
123        }
124        for seg in &self.segments {
125            for sub in &seg.substitutions {
126                if let Some(hit) = sub.pipeline.find_segment(f) {
127                    return Some(hit);
128                }
129            }
130            if let Some(hit) = f(seg) {
131                return Some(hit);
132            }
133        }
134        None
135    }
136
137    /// Walk the pipeline tree depth-first, applying `f` to each
138    /// [`ShellSegment`] and collecting every non-`None` result.
139    ///
140    /// Same traversal order as [`find_segment`](Self::find_segment) but
141    /// does not short-circuit.
142    pub fn filter_segments<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Vec<T> {
143        let mut out = Vec::new();
144        self.filter_segments_into(f, &mut out);
145        out
146    }
147
148    fn filter_segments_into<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>, out: &mut Vec<T>) {
149        for sub in &self.structural_substitutions {
150            sub.pipeline.filter_segments_into(f, out);
151        }
152        for seg in &self.segments {
153            for sub in &seg.substitutions {
154                sub.pipeline.filter_segments_into(f, out);
155            }
156            if let Some(hit) = f(seg) {
157                out.push(hit);
158            }
159        }
160    }
161
162    /// Returns `true` if this pipeline or any nested substitution has
163    /// parse errors.
164    ///
165    /// When tree-sitter uses error recovery, some commands may not have
166    /// been extracted. Callers enforcing a security boundary should
167    /// treat a `true` return as "cannot safely analyze — fail closed."
168    pub fn has_parse_errors_recursive(&self) -> bool {
169        self.any_pipeline(&|p| p.has_parse_errors)
170    }
171}
172
173/// A single evaluable command within a compound pipeline.
174#[derive(Debug, Clone)]
175pub struct ShellSegment {
176    /// The command text, exactly as it appears in the source (trimmed).
177    ///
178    /// Substitution syntax (`$()`, backticks, `<()`, `>()`) is preserved
179    /// verbatim — the [`substitutions`](Self::substitutions) field carries
180    /// the recursively-parsed contents with byte positions into this text.
181    pub command: String,
182
183    /// Output redirection detected on a wrapping construct.
184    ///
185    /// When the parser extracts commands from inside a control-flow block
186    /// that has output redirection (e.g. `for ... done > file`), the
187    /// redirect is not present in the segment's `command` text. This field
188    /// carries the redirection so the eval layer can escalate the decision.
189    pub redirection: Option<Redirection>,
190
191    /// Substitutions within this segment's command text, in source order.
192    ///
193    /// Each substitution is evaluated before this segment's command.
194    /// `start`/`end` byte offsets index into [`command`](Self::command).
195    pub substitutions: Vec<SubstitutionSpan>,
196}
197
198/// A command substitution's position and recursively-parsed contents.
199#[derive(Debug, Clone)]
200pub struct SubstitutionSpan {
201    /// Byte offset of the substitution start within the parent's text.
202    ///
203    /// For substitutions on a [`ShellSegment`], this indexes into
204    /// `segment.command`. For structural substitutions on a
205    /// [`ParsedPipeline`], this is relative to the source text passed
206    /// to [`parse_with_substitutions`] at this recursion level (for
207    /// nested pipelines, that is the inner text of the parent
208    /// substitution, not the top-level command string).
209    pub start: usize,
210    /// Byte offset past the end of the substitution.
211    pub end: usize,
212    /// The recursively-parsed inner pipeline.
213    pub pipeline: ParsedPipeline,
214}
215
/// Describes an output redirection that may mutate filesystem state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Redirection {
    /// The redirection operator (e.g., `>`, `>>`, `>|`, `&>`, `&>>`, `<>`, `>&`).
    pub operator: &'static str,
    /// Source file descriptor, if explicitly specified (e.g., `2>` → `Some(2)`).
    pub fd: Option<u32>,
    /// Destination (file path, fd number for `>&N`, or empty for `<>`).
    pub target: String,
}

impl fmt::Display for Redirection {
    /// Renders as `output redirection (<fd><operator> <target>)`.
    ///
    /// The descriptor prefix is left out when `fd` is `None`. The separating
    /// space is left out when `target` is empty, so a redirection without a
    /// destination does not render with a dangling space before `)`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("output redirection (")?;
        if let Some(fd) = self.fd {
            write!(f, "{fd}")?;
        }
        f.write_str(self.operator)?;
        // `target` is documented as possibly empty; only emit the separator
        // when there is something to separate.
        if !self.target.is_empty() {
            write!(f, " {}", self.target)?;
        }
        f.write_str(")")
    }
}
239
240/// Tree-sitter failed to produce a syntax tree.
241///
242/// Extremely rare in practice — tree-sitter handles any input, including
243/// malformed shell. The only known causes are memory allocation failure
244/// or a cancelled parse.
245#[derive(Debug, thiserror::Error)]
246#[error("tree-sitter failed to produce a syntax tree")]
247pub struct ParseError;
248
/// Kinds of indirect execution that can hide commands from static analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum IndirectExecution {
    /// `eval "..."` — the argument string is run as shell code, which
    /// cannot be statically analyzed in the general case.
    Eval,
    /// `bash -c "..."` / `sh -c "..."` — a new shell is spawned with inline
    /// code, which cannot be statically analyzed.
    ShellSpawn,
    /// `env cmd` / `command cmd` / `sudo cmd` — a transparent wrapper around
    /// another command. Strip the wrapper and re-analyze.
    CommandWrapper,
    /// `source file` / `. file` — a script is executed in the current shell;
    /// its contents cannot be statically analyzed.
    SourceScript,
}
267
268/// Properties of a parsed command segment relevant to security analysis.
269#[derive(Debug, Clone, PartialEq, Eq)]
270pub struct CommandCharacteristics {
271    /// Base command name (path stripped, env vars skipped).
272    pub base_command: String,
273    /// If this is an indirect execution wrapper, what kind.
274    pub indirect_execution: Option<IndirectExecution>,
275    /// Whether the command position contains a variable expansion
276    /// (`$cmd`, `${cmd}`) that cannot be statically resolved.
277    pub has_dynamic_command: bool,
278}
279
/// One flag taken from a command's argument list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFlag {
    /// Flag name with any value removed (e.g., `--force`, `-f`).
    pub name: String,
    /// The value, when one was attached with `=`
    /// (e.g., `--color=always` → `Some("always")`).
    pub value: Option<String>,
}
288
289/// An argument in a parsed command line.
290#[derive(Debug, Clone, PartialEq, Eq)]
291#[non_exhaustive]
292pub enum CommandArg {
293    /// A flag token (e.g., `--force`, `-f`, `--color=always`).
294    Flag(ParsedFlag),
295    /// A non-flag token (subcommand, path, or other argument).
296    Positional(String),
297}
298
299/// Structurally decomposed command with arguments in source order.
300///
301/// Schema-free parse: flags are identified syntactically (tokens starting
302/// with `-`). Without a command's flag definitions, `--flag value` is
303/// ambiguous — the value appears as a separate positional. Schema-aware
304/// consumers walk `args` to associate values with flags they know about.
305#[derive(Debug, Clone, PartialEq, Eq)]
306pub struct ParsedCommand {
307    /// Base command name (path stripped, env vars skipped).
308    pub command: String,
309    /// Arguments in source order — flags and positionals interleaved.
310    pub args: Vec<CommandArg>,
311}
312
313impl ParsedCommand {
314    /// First positional argument (often a subcommand).
315    pub fn subcommand(&self) -> Option<&str> {
316        self.args.iter().find_map(|a| match a {
317            CommandArg::Positional(s) => Some(s.as_str()),
318            _ => None,
319        })
320    }
321
322    /// Iterate over all flags.
323    pub fn flags(&self) -> impl Iterator<Item = &ParsedFlag> {
324        self.args.iter().filter_map(|a| match a {
325            CommandArg::Flag(f) => Some(f),
326            _ => None,
327        })
328    }
329
330    /// Iterate over all positional arguments.
331    pub fn positional(&self) -> impl Iterator<Item = &str> {
332        self.args.iter().filter_map(|a| match a {
333            CommandArg::Positional(s) => Some(s.as_str()),
334            _ => None,
335        })
336    }
337
338    /// Check if a flag is present by name (e.g., `--force` or `-f`).
339    pub fn has_flag(&self, name: &str) -> bool {
340        self.flags().any(|f| f.name == name)
341    }
342
343    /// Reconstruct a flat word list.
344    pub fn to_words(&self) -> Vec<String> {
345        let mut words = vec![self.command.clone()];
346        for arg in &self.args {
347            match arg {
348                CommandArg::Flag(f) => match &f.value {
349                    Some(v) => words.push(format!("{}={}", f.name, v)),
350                    None => words.push(f.name.clone()),
351                },
352                CommandArg::Positional(s) => words.push(s.clone()),
353            }
354        }
355        words
356    }
357}
358
359/// Result of resolving a command through the indirection layer.
360#[derive(Debug, Clone)]
361#[non_exhaustive]
362pub enum ResolvedCommand {
363    /// Wrappers stripped, command structurally parsed.
364    Resolved(ParsedCommand),
365    /// The command is unanalyzable — eval, source, shell -c, dynamic `$cmd`.
366    Unanalyzable(UnanalyzableCommand),
367}
368
369/// A command that cannot be statically analyzed.
370#[derive(Debug, Clone)]
371pub struct UnanalyzableCommand {
372    /// The command that triggered the classification (e.g., `eval`, `bash`).
373    pub command: String,
374    /// Why it's unanalyzable.
375    pub kind: IndirectExecution,
376}
377
378/// Describes how to strip a transparent wrapper command to find the inner command.
379///
380/// Each wrapper has different flag semantics. This struct captures just enough
381/// to correctly skip past the wrapper and its flags to the real command.
382/// Designed for deserialization from config files — consumers load specs from
383/// JSON/TOML/YAML and pass them to [`resolve_command_with`].
384#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
385pub struct WrapperSpec {
386    /// Command name to match (basename, e.g., "sudo").
387    pub name: String,
388    /// Short flags that consume the next token as a value (e.g., `["-u", "-g"]`).
389    #[serde(default)]
390    pub short_value_flags: Vec<String>,
391    /// Long flags that consume the next token as a value (e.g., `["--user", "--group"]`).
392    #[serde(default)]
393    pub long_value_flags: Vec<String>,
394    /// Flags whose presence makes the entire invocation unanalyzable.
395    /// Example: `env -S` executes its value as a command string (eval-equivalent).
396    #[serde(default)]
397    pub unanalyzable_flags: Vec<String>,
398    /// Whether to skip leading `KEY=VALUE` tokens after the wrapper (env-style).
399    #[serde(default)]
400    pub skip_env_assignments: bool,
401    /// Whether `--` terminates flag processing for this wrapper.
402    #[serde(default)]
403    pub has_terminator: bool,
404    /// Number of leading positional arguments to skip before the inner command.
405    ///
406    /// Some wrappers require mandatory positional args before the command:
407    /// `timeout DURATION cmd`, `chrt PRIORITY cmd`, `taskset MASK cmd`.
408    /// Set this to the number of positionals to consume before treating
409    /// the next non-flag token as the inner command.
410    #[serde(default)]
411    pub skip_positionals: usize,
412}
413
414/// Complete command classification configuration.
415///
416/// Drives all indirect execution detection — no command knowledge is hardcoded
417/// in the parser source. Consumers load this from JSON/TOML/YAML and pass it
418/// to [`resolve_command_with`].
419#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
420pub struct CommandConfig {
421    /// Transparent wrappers that execute an inner command (env, sudo, etc.).
422    pub wrappers: Vec<WrapperSpec>,
423    /// Shells that can spawn inline code via `-c` (bash, sh, zsh, etc.).
424    /// When invoked without `-c`, classified as script execution.
425    pub shells: Vec<String>,
426    /// Commands that execute their argument as shell code (eval).
427    pub eval_commands: Vec<String>,
428    /// Commands that execute a file in the current shell (source, `.`).
429    pub source_commands: Vec<String>,
430}
431
#[cfg(test)]
mod tests {
    use super::super::parse_with_substitutions;
    use super::{ParsedPipeline, ShellSegment, SubstitutionSpan};

    /// Runs the real parser on `cmd`, failing the test if parsing fails.
    fn parse(cmd: &str) -> ParsedPipeline {
        parse_with_substitutions(cmd).expect("parse failed")
    }

    // Hand-built tree helpers. They let the traversal and error-propagation
    // tests run without depending on how the parser treats malformed input.

    /// Wraps `pipeline` in a substitution span with placeholder offsets.
    fn span(pipeline: ParsedPipeline) -> SubstitutionSpan {
        SubstitutionSpan {
            start: 0,
            end: 0,
            pipeline,
        }
    }

    /// Builds a segment with the given command text and substitutions.
    fn segment(command: &str, substitutions: Vec<SubstitutionSpan>) -> ShellSegment {
        ShellSegment {
            command: command.to_string(),
            redirection: None,
            substitutions,
        }
    }

    /// Builds a pipeline without parse errors.
    fn clean(segments: Vec<ShellSegment>, structural: Vec<SubstitutionSpan>) -> ParsedPipeline {
        ParsedPipeline {
            segments,
            operators: vec![],
            structural_substitutions: structural,
            has_parse_errors: false,
        }
    }

    // --- find_segment ---

    #[test]
    fn find_segment_returns_first_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("ls") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("ls -la"));
    }

    #[test]
    fn find_segment_returns_none_when_no_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("git") {
                Some(())
            } else {
                None
            }
        });
        assert!(found.is_none());
    }

    #[test]
    fn find_segment_recurses_into_substitutions() {
        let p = parse("echo $(git status)");
        let found = p.find_segment(&|seg| {
            if seg.command.contains("git status") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("git status"));
    }

    #[test]
    fn find_segment_visits_substitutions_before_parent() {
        // In "echo $(date)", the walker should visit "date" before "echo $(date)".
        // filter_segments with Some for all collects in traversal order.
        let p = parse("echo $(date)");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert_eq!(all, vec!["date", "echo $(date)"]);
    }

    #[test]
    fn find_segment_visits_structural_substitutions_first() {
        let p = parse("for i in $(seq 10); do echo $i; done");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        // `first()` instead of `all[0]`: an empty result fails the assertion
        // with a readable diff rather than an index panic.
        assert_eq!(all.first().map(String::as_str), Some("seq 10"));
    }

    // --- filter_segments ---

    #[test]
    fn filter_segments_collects_all_matches() {
        let p = parse("echo a && echo b && ls c");
        let echoes: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("echo") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(echoes, vec!["echo a", "echo b"]);
    }

    #[test]
    fn filter_segments_collects_from_nested() {
        let p = parse("echo $(git status && git diff)");
        let gits: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("git") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(gits, vec!["git status", "git diff"]);
    }

    // --- find_pipeline / any_pipeline ---

    #[test]
    fn find_pipeline_visits_outer_pipeline_first() {
        let inner = clean(vec![segment("inner", vec![])], vec![]);
        let tree = clean(vec![segment("outer", vec![span(inner)])], vec![]);
        let first = tree.find_pipeline(&|p| p.segments.first().map(|s| s.command.clone()));
        assert_eq!(first.as_deref(), Some("outer"));
    }

    #[test]
    fn any_pipeline_matches_nested_pipeline() {
        let inner = clean(vec![], vec![]);
        let tree = clean(vec![segment("outer", vec![span(inner)])], vec![]);
        // Only the nested pipeline has no segments.
        assert!(tree.any_pipeline(&|p| p.segments.is_empty()));
        assert!(!tree.any_pipeline(&|p| p.has_parse_errors));
    }

    // --- has_parse_errors_recursive ---

    #[test]
    fn no_errors_on_valid_input() {
        assert!(!parse("echo hello").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_compound() {
        assert!(!parse("echo a && echo b | cat").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_substitution() {
        assert!(!parse("echo $(date)").has_parse_errors_recursive());
    }

    #[test]
    fn empty_with_error_is_empty_and_flagged() {
        let p = ParsedPipeline::empty_with_error();
        assert!(p.segments.is_empty());
        assert!(p.operators.is_empty());
        assert!(p.structural_substitutions.is_empty());
        assert!(p.has_parse_errors);
        assert!(p.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_segment_substitution_are_detected() {
        let broken = ParsedPipeline::empty_with_error();
        let tree = clean(vec![segment("echo $(x)", vec![span(broken)])], vec![]);
        assert!(!tree.has_parse_errors);
        assert!(tree.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_structural_substitution_are_detected() {
        let broken = ParsedPipeline::empty_with_error();
        let tree = clean(vec![segment("echo $i", vec![])], vec![span(broken)]);
        assert!(!tree.has_parse_errors);
        assert!(tree.has_parse_errors_recursive());
    }
}
541}