Skip to main content

agent_shell_parser/parse/
types.rs

1use std::borrow::Borrow;
2use std::fmt;
3use std::ops::Deref;
4
5use super::tokenize::{is_env_assignment, is_valid_env_key};
6
7// ---------------------------------------------------------------------------
8// Word newtype
9// ---------------------------------------------------------------------------
10
/// A single shell word token.
///
/// Wraps a `String` with domain-specific helpers for shell analysis (flag
/// detection, env assignment parsing, basename extraction). Derefs to `str`
/// for seamless use wherever a string slice is expected.
///
/// Because of `#[serde(transparent)]`, a `Word` is serialized and
/// deserialized as its inner string rather than as a one-field wrapper.
///
/// Note: `Word` carries raw shell text extracted from the parse tree. It is
/// not sanitized or validated — consumers must not treat word equality as
/// proof of command identity without considering the full resolution pipeline.
#[derive(Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct Word(String);
23
24impl Word {
25    /// Returns `true` if this word starts with `-`.
26    pub fn is_flag(&self) -> bool {
27        self.0.starts_with('-')
28    }
29
30    /// Returns `true` if this word is a valid `KEY=VALUE` environment assignment.
31    pub fn is_assignment(&self) -> bool {
32        is_env_assignment(&self.0)
33    }
34
35    /// Split at the first `=` and return `(key, value)` if the key is a valid
36    /// environment variable name.
37    pub fn as_assignment(&self) -> Option<(&str, &str)> {
38        let eq_pos = self.0.find('=')?;
39        let key = &self.0[..eq_pos];
40        if is_valid_env_key(key) {
41            Some((key, &self.0[eq_pos + 1..]))
42        } else {
43            None
44        }
45    }
46
47    /// Strip the path prefix, e.g. `/usr/bin/ls` -> `ls`.
48    pub fn basename(&self) -> &str {
49        match self.0.rsplit_once('/') {
50            Some((_, name)) if !name.is_empty() => name,
51            _ => &self.0,
52        }
53    }
54
55    /// Explicit accessor for the inner string slice.
56    pub fn as_str(&self) -> &str {
57        &self.0
58    }
59
60    /// Consume and return the inner `String`.
61    pub fn into_inner(self) -> String {
62        self.0
63    }
64}
65
66// --- Deref / AsRef / Borrow ---
67
68impl Deref for Word {
69    type Target = str;
70
71    fn deref(&self) -> &str {
72        &self.0
73    }
74}
75
76impl AsRef<str> for Word {
77    fn as_ref(&self) -> &str {
78        &self.0
79    }
80}
81
82impl Borrow<str> for Word {
83    fn borrow(&self) -> &str {
84        &self.0
85    }
86}
87
88// --- Display / Debug ---
89
90impl fmt::Display for Word {
91    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
92        f.write_str(&self.0)
93    }
94}
95
96impl fmt::Debug for Word {
97    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98        fmt::Debug::fmt(&self.0, f)
99    }
100}
101
102// --- From conversions ---
103
104impl From<String> for Word {
105    fn from(s: String) -> Self {
106        Word(s)
107    }
108}
109
110impl From<&str> for Word {
111    fn from(s: &str) -> Self {
112        Word(s.to_string())
113    }
114}
115
116// --- PartialEq with str types ---
117
118impl PartialEq<str> for Word {
119    fn eq(&self, other: &str) -> bool {
120        self.0 == other
121    }
122}
123
124impl PartialEq<&str> for Word {
125    fn eq(&self, other: &&str) -> bool {
126        self.0 == *other
127    }
128}
129
130impl PartialEq<Word> for str {
131    fn eq(&self, other: &Word) -> bool {
132        self == other.0
133    }
134}
135
136impl PartialEq<Word> for &str {
137    fn eq(&self, other: &Word) -> bool {
138        *self == other.0
139    }
140}
141
142impl PartialEq<String> for Word {
143    fn eq(&self, other: &String) -> bool {
144        self.0 == *other
145    }
146}
147
148impl PartialEq<Word> for String {
149    fn eq(&self, other: &Word) -> bool {
150        *self == other.0
151    }
152}
153
/// The shell operator that joins one pipeline segment to the next.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum Operator {
    /// `&&` — the next segment runs only when the previous one succeeded
    And,
    /// `||` — the next segment runs only when the previous one failed
    Or,
    /// `;` — the next segment runs regardless of the previous result
    Semi,
    /// `|` — stdout is piped into the next segment
    Pipe,
    /// `|&` — stdout and stderr are piped into the next segment
    PipeErr,
    /// `&` — the previous segment is backgrounded and the next starts at once
    Background,
}

impl Operator {
    /// The operator exactly as it is spelled in shell source.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::And => "&&",
            Self::Or => "||",
            Self::Semi => ";",
            Self::Pipe => "|",
            Self::PipeErr => "|&",
            Self::Background => "&",
        }
    }
}

/// Prints the shell spelling returned by [`Operator::as_str`].
impl fmt::Display for Operator {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = self.as_str();
        f.write_str(symbol)
    }
}
190
/// A fully decomposed compound command.
///
/// This is a recursive structure: segments may contain substitutions, and
/// each substitution contains a recursively-parsed [`ParsedPipeline`].
/// Evaluation proceeds bottom-up (a catamorphism): inner substitutions are
/// evaluated first, their output feeds the outer command.
#[derive(Debug, Clone)]
pub struct ParsedPipeline {
    /// The individual commands that make up this pipeline.
    pub segments: Vec<ShellSegment>,
    /// Operators between consecutive segments.
    ///
    /// NOTE(review): presumably `operators[i]` sits between `segments[i]`
    /// and `segments[i + 1]` — confirm against the parser that fills this in.
    pub operators: Vec<Operator>,
    /// Substitutions in non-command structural positions: `for`-loop
    /// iteration values (`for i in $(cmd)`), `case` subjects
    /// (`case $(cmd) in`).
    ///
    /// These execute before any segment in this pipeline. Each is
    /// recursively parsed.
    pub structural_substitutions: Vec<SubstitutionSpan>,
    /// `true` when tree-sitter produced error-recovery nodes in the AST.
    ///
    /// The pipeline is still usable — tree-sitter always produces a tree —
    /// but callers should treat the result as best-effort.
    ///
    /// This flag describes this pipeline only; nested pipelines carry
    /// their own flag.
    pub has_parse_errors: bool,
}
215
216impl ParsedPipeline {
217    /// An empty pipeline representing a parse failure.
218    pub fn empty_with_error() -> Self {
219        Self {
220            segments: vec![],
221            operators: vec![],
222            structural_substitutions: vec![],
223            has_parse_errors: true,
224        }
225    }
226
227    /// Walk all pipelines in the tree (this one and all nested ones),
228    /// depth-first. Returns the first `Some(T)` produced by `f`.
229    ///
230    /// This is the lowest-level traversal primitive — it visits pipeline
231    /// nodes rather than segments, enabling checks on pipeline-level
232    /// properties (like `has_parse_errors`).
233    pub fn find_pipeline<T>(&self, f: &impl Fn(&ParsedPipeline) -> Option<T>) -> Option<T> {
234        if let Some(hit) = f(self) {
235            return Some(hit);
236        }
237        for sub in &self.structural_substitutions {
238            if let Some(hit) = sub.pipeline.find_pipeline(f) {
239                return Some(hit);
240            }
241        }
242        for seg in &self.segments {
243            for sub in &seg.substitutions {
244                if let Some(hit) = sub.pipeline.find_pipeline(f) {
245                    return Some(hit);
246                }
247            }
248        }
249        None
250    }
251
252    /// Returns `true` if any pipeline in the tree satisfies `f`.
253    pub fn any_pipeline(&self, f: &impl Fn(&ParsedPipeline) -> bool) -> bool {
254        self.find_pipeline(&|p| if f(p) { Some(()) } else { None })
255            .is_some()
256    }
257
258    /// Walk the pipeline tree depth-first in execution order, applying `f`
259    /// to each [`ShellSegment`]. Returns the first `Some(T)` produced by
260    /// `f`, or `None` if every segment returns `None`.
261    ///
262    /// Traversal order mirrors shell evaluation:
263    /// 1. Structural substitutions (for-loop values, case subjects)
264    /// 2. For each segment: its substitutions first, then the segment itself
265    ///
266    /// This is the canonical way to inspect every command in the tree.
267    /// Both "does any segment satisfy P?" and "find the first segment
268    /// matching P" reduce to this.
269    pub fn find_segment<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Option<T> {
270        for sub in &self.structural_substitutions {
271            if let Some(hit) = sub.pipeline.find_segment(f) {
272                return Some(hit);
273            }
274        }
275        for seg in &self.segments {
276            for sub in &seg.substitutions {
277                if let Some(hit) = sub.pipeline.find_segment(f) {
278                    return Some(hit);
279                }
280            }
281            if let Some(hit) = f(seg) {
282                return Some(hit);
283            }
284        }
285        None
286    }
287
288    /// Walk the pipeline tree depth-first, applying `f` to each
289    /// [`ShellSegment`] and collecting every non-`None` result.
290    ///
291    /// Same traversal order as [`find_segment`](Self::find_segment) but
292    /// does not short-circuit.
293    pub fn filter_segments<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Vec<T> {
294        let mut out = Vec::new();
295        self.filter_segments_into(f, &mut out);
296        out
297    }
298
299    fn filter_segments_into<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>, out: &mut Vec<T>) {
300        for sub in &self.structural_substitutions {
301            sub.pipeline.filter_segments_into(f, out);
302        }
303        for seg in &self.segments {
304            for sub in &seg.substitutions {
305                sub.pipeline.filter_segments_into(f, out);
306            }
307            if let Some(hit) = f(seg) {
308                out.push(hit);
309            }
310        }
311    }
312
313    /// Returns `true` if this pipeline or any nested substitution has
314    /// parse errors.
315    ///
316    /// When tree-sitter uses error recovery, some commands may not have
317    /// been extracted. Callers enforcing a security boundary should
318    /// treat a `true` return as "cannot safely analyze — fail closed."
319    pub fn has_parse_errors_recursive(&self) -> bool {
320        self.any_pipeline(&|p| p.has_parse_errors)
321    }
322}
323
/// A single evaluable command within a compound pipeline.
///
/// One segment bundles four views of the same command: its source text,
/// its word list, any redirection inherited from a wrapping construct, and
/// the substitutions found inside it.
#[derive(Debug, Clone)]
pub struct ShellSegment {
    /// The command text, exactly as it appears in the source (trimmed).
    ///
    /// Substitution syntax (`$()`, backticks, `<()`, `>()`) is preserved
    /// verbatim — the [`substitutions`](Self::substitutions) field carries
    /// the recursively-parsed contents with byte positions into this text.
    pub command: String,

    /// Pre-tokenized word list as tree-sitter understood word boundaries.
    ///
    /// Unlike shlex tokenization of [`command`](Self::command), this
    /// correctly preserves substitution syntax as single tokens. For
    /// example, `export FOO=$(echo test) BAR=baz` produces
    /// `["export", "FOO=$(echo test)", "BAR=baz"]` — shlex would
    /// incorrectly split inside the `$(...)`.
    ///
    /// Quotes are stripped: `"foo bar"` becomes `foo bar`. Both
    /// tree-sitter extraction and shlex fallback produce unquoted tokens.
    /// Substitution delimiters (`$(...)`, `` `...` ``, `<(...)`) are
    /// preserved as-is since they are semantic, not syntactic wrappers.
    ///
    /// Falls back to shlex/whitespace tokenization when tree-sitter does
    /// not provide word-level structure (e.g. unknown node types or
    /// heredoc loose words). The fallback is documented per node type in
    /// the parser source.
    pub words: Vec<Word>,

    /// Output redirection detected on a wrapping construct.
    ///
    /// When the parser extracts commands from inside a control-flow block
    /// that has output redirection (e.g. `for ... done > file`), the
    /// redirect is not present in the segment's `command` text. This field
    /// carries the redirection so the eval layer can escalate the decision.
    /// It is `None` when no such redirection was recorded.
    pub redirection: Option<Redirection>,

    /// Substitutions within this segment's command text, in source order.
    ///
    /// Each substitution is evaluated before this segment's command.
    /// `start`/`end` byte offsets index into [`command`](Self::command).
    pub substitutions: Vec<SubstitutionSpan>,
}
367
/// A command substitution's position and recursively-parsed contents.
///
/// `start` and `end` together describe the half-open byte range
/// `start..end` occupied by the substitution in its parent's text.
#[derive(Debug, Clone)]
pub struct SubstitutionSpan {
    /// Byte offset of the substitution start within the parent's text.
    ///
    /// For substitutions on a [`ShellSegment`], this indexes into
    /// `segment.command`. For structural substitutions on a
    /// [`ParsedPipeline`], this is relative to the source text passed
    /// to [`parse_with_substitutions`] at this recursion level (for
    /// nested pipelines, that is the inner text of the parent
    /// substitution, not the top-level command string).
    pub start: usize,
    /// Byte offset past the end of the substitution (exclusive), measured
    /// against the same text as `start`.
    pub end: usize,
    /// The recursively-parsed inner pipeline.
    pub pipeline: ParsedPipeline,
}
385
/// An output redirection, i.e. one that may mutate filesystem state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Redirection {
    /// The operator text (e.g., `>`, `>>`, `>|`, `&>`, `&>>`, `<>`, `>&`).
    pub operator: &'static str,
    /// The source file descriptor when one was written explicitly
    /// (e.g., `2>` → `Some(2)`); `None` otherwise.
    pub fd: Option<u32>,
    /// Where the output goes: a file path, an fd number for `>&N`, or an
    /// empty string for `<>`.
    pub target: String,
}

/// Renders as `output redirection (<fd><operator> <target>)`, leaving out
/// the descriptor when `fd` is `None`.
impl fmt::Display for Redirection {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            operator,
            fd,
            target,
        } = self;
        if let Some(fd) = fd {
            return write!(f, "output redirection ({fd}{} {})", operator, target);
        }
        write!(f, "output redirection ({} {})", operator, target)
    }
}
409
/// Tree-sitter failed to produce a syntax tree.
///
/// Extremely rare in practice — tree-sitter handles any input, including
/// malformed shell. The only known causes are memory allocation failure
/// or a cancelled parse.
///
/// This is a unit struct with no payload; its `Display` text is the fixed
/// message given in the `#[error(...)]` attribute below.
#[derive(Debug, thiserror::Error)]
#[error("tree-sitter failed to produce a syntax tree")]
pub struct ParseError;
418
/// Classification of indirect execution patterns that may hide commands
/// from static analysis.
///
/// Marked `#[non_exhaustive]`: code outside this crate that matches on it
/// needs a wildcard arm, which leaves room for new variants later.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum IndirectExecution {
    /// `eval "..."` — argument string is executed as shell code.
    /// Cannot be statically analyzed in the general case.
    Eval,
    /// `bash -c "..."` / `sh -c "..."` — spawns a new shell with
    /// inline code. Cannot be statically analyzed.
    ShellSpawn,
    /// `env cmd` / `command cmd` / `sudo cmd` — transparent wrapper
    /// around another command. Strip the wrapper and re-analyze.
    CommandWrapper,
    /// `source file` / `. file` — executes a script in the current
    /// shell. Contents cannot be statically analyzed.
    SourceScript,
}
437
/// Properties of a parsed command segment relevant to security analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommandCharacteristics {
    /// Base command name (path stripped, env vars skipped).
    pub base_command: String,
    /// If this is an indirect execution wrapper, what kind; `None` when the
    /// command was not classified as indirect execution.
    pub indirect_execution: Option<IndirectExecution>,
    /// Whether the command position contains a variable expansion
    /// (`$cmd`, `${cmd}`) that cannot be statically resolved.
    pub has_dynamic_command: bool,
}
449
/// A parsed flag from a command's argument list.
///
/// As built by [`ParsedCommand::from_words`], only `--long=value` tokens
/// are split into `name` and `value`. Any other flag token — including a
/// short flag written as `-o=x` — is stored whole in `name` with
/// `value: None`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFlag {
    /// The flag name without its value (e.g., `--force`, `-f`).
    pub name: Word,
    /// Value if specified with `=` (e.g., `--color=always` → `Some("always")`).
    pub value: Option<Word>,
}
458
/// An argument in a parsed command line.
///
/// Marked `#[non_exhaustive]`: code outside this crate that matches on it
/// needs a wildcard arm.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum CommandArg {
    /// A flag token (e.g., `--force`, `-f`, `--color=always`).
    Flag(ParsedFlag),
    /// A non-flag token (subcommand, path, or other argument).
    ///
    /// [`ParsedCommand::from_words`] also uses this variant for a lone `-`
    /// and for every token that follows a `--` terminator.
    Positional(Word),
}
468
/// Structurally decomposed command with arguments in source order.
///
/// Schema-free parse: flags are identified syntactically (tokens starting
/// with `-`). Without a command's flag definitions, `--flag value` is
/// ambiguous — the value appears as a separate positional. Schema-aware
/// consumers walk `args` to associate values with flags they know about.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedCommand {
    /// Base command name (path stripped, env vars skipped).
    ///
    /// Empty when [`ParsedCommand::from_words`] found no word that is not
    /// an env assignment.
    pub command: Word,
    /// Arguments in source order — flags and positionals interleaved.
    pub args: Vec<CommandArg>,
}
482
483impl ParsedCommand {
484    /// Construct a `ParsedCommand` directly from a word slice, avoiding a
485    /// string round-trip through shlex.
486    ///
487    /// - First word that is not an env assignment becomes the `command`
488    ///   (with path prefix stripped).
489    /// - Remaining words are classified as [`CommandArg::Flag`] or
490    ///   [`CommandArg::Positional`] using the same schema-free rules as
491    ///   [`parse_command`](super::tokenize::parse_command).
492    pub fn from_words(words: &[Word]) -> Self {
493        let cmd_idx = words.iter().position(|w| !w.is_assignment());
494        let Some(cmd_idx) = cmd_idx else {
495            return ParsedCommand {
496                command: Word::from(""),
497                args: vec![],
498            };
499        };
500
501        let base = Word::from(words[cmd_idx].basename());
502
503        let mut args = Vec::new();
504        let mut past_double_dash = false;
505
506        for token in &words[cmd_idx + 1..] {
507            if past_double_dash {
508                args.push(CommandArg::Positional(token.clone()));
509                continue;
510            }
511            if token == "--" {
512                past_double_dash = true;
513                continue;
514            }
515            if let Some(rest) = token.strip_prefix("--") {
516                if let Some((name, value)) = rest.split_once('=') {
517                    args.push(CommandArg::Flag(ParsedFlag {
518                        name: Word::from(format!("--{name}")),
519                        value: Some(Word::from(value)),
520                    }));
521                } else {
522                    args.push(CommandArg::Flag(ParsedFlag {
523                        name: token.clone(),
524                        value: None,
525                    }));
526                }
527            } else if token.starts_with('-') && token.len() > 1 {
528                args.push(CommandArg::Flag(ParsedFlag {
529                    name: token.clone(),
530                    value: None,
531                }));
532            } else {
533                args.push(CommandArg::Positional(token.clone()));
534            }
535        }
536
537        ParsedCommand {
538            command: base,
539            args,
540        }
541    }
542
543    /// First positional argument (often a subcommand).
544    pub fn subcommand(&self) -> Option<&str> {
545        self.args.iter().find_map(|a| match a {
546            CommandArg::Positional(s) => Some(s.as_str()),
547            _ => None,
548        })
549    }
550
551    /// Iterate over all flags.
552    pub fn flags(&self) -> impl Iterator<Item = &ParsedFlag> {
553        self.args.iter().filter_map(|a| match a {
554            CommandArg::Flag(f) => Some(f),
555            _ => None,
556        })
557    }
558
559    /// Iterate over all positional arguments.
560    pub fn positional(&self) -> impl Iterator<Item = &str> {
561        self.args.iter().filter_map(|a| match a {
562            CommandArg::Positional(s) => Some(s.as_str()),
563            _ => None,
564        })
565    }
566
567    /// Check if a flag is present by name (e.g., `--force` or `-f`).
568    pub fn has_flag(&self, name: &str) -> bool {
569        self.flags().any(|f| f.name == name)
570    }
571
572    /// Reconstruct a flat word list.
573    pub fn to_words(&self) -> Vec<Word> {
574        let mut words = vec![self.command.clone()];
575        for arg in &self.args {
576            match arg {
577                CommandArg::Flag(f) => match &f.value {
578                    Some(v) => words.push(Word::from(format!("{}={}", f.name, v))),
579                    None => words.push(f.name.clone()),
580                },
581                CommandArg::Positional(s) => words.push(s.clone()),
582            }
583        }
584        words
585    }
586}
587
/// Result of resolving a command through the indirection layer.
///
/// Marked `#[non_exhaustive]`: code outside this crate that matches on it
/// needs a wildcard arm.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum ResolvedCommand {
    /// Wrappers stripped, command structurally parsed.
    Resolved(ParsedCommand),
    /// The command is unanalyzable — eval, source, shell -c, dynamic `$cmd`.
    Unanalyzable(UnanalyzableCommand),
}
597
/// A command that cannot be statically analyzed.
///
/// Carried by [`ResolvedCommand::Unanalyzable`].
#[derive(Debug, Clone)]
pub struct UnanalyzableCommand {
    /// The command that triggered the classification (e.g., `eval`, `bash`).
    pub command: String,
    /// Why it's unanalyzable.
    pub kind: IndirectExecution,
}
606
/// Describes how to strip a transparent wrapper command to find the inner command.
///
/// Each wrapper has different flag semantics. This struct captures just enough
/// to correctly skip past the wrapper and its flags to the real command.
/// Designed for deserialization from config files — consumers load specs from
/// JSON/TOML/YAML and pass them to [`resolve_command_with`].
///
/// When deserializing, only `name` is required. Every other field is marked
/// `#[serde(default)]`, so a missing key becomes an empty list, `false`, or
/// `0` according to the field's type.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct WrapperSpec {
    /// Command name to match (basename, e.g., "sudo").
    pub name: String,
    /// Short flags that consume the next token as a value (e.g., `["-u", "-g"]`).
    #[serde(default)]
    pub short_value_flags: Vec<String>,
    /// Long flags that consume the next token as a value (e.g., `["--user", "--group"]`).
    #[serde(default)]
    pub long_value_flags: Vec<String>,
    /// Flags whose presence makes the entire invocation unanalyzable.
    /// Example: `env -S` executes its value as a command string (eval-equivalent).
    #[serde(default)]
    pub unanalyzable_flags: Vec<String>,
    /// Whether to skip leading `KEY=VALUE` tokens after the wrapper (env-style).
    #[serde(default)]
    pub skip_env_assignments: bool,
    /// Whether `--` terminates flag processing for this wrapper.
    #[serde(default)]
    pub has_terminator: bool,
    /// Number of leading positional arguments to skip before the inner command.
    ///
    /// Some wrappers require mandatory positional args before the command:
    /// `timeout DURATION cmd`, `chrt PRIORITY cmd`, `taskset MASK cmd`.
    /// Set this to the number of positionals to consume before treating
    /// the next non-flag token as the inner command.
    #[serde(default)]
    pub skip_positionals: usize,
}
642
/// Complete command classification configuration.
///
/// Drives all indirect execution detection — no command knowledge is hardcoded
/// in the parser source. Consumers load this from JSON/TOML/YAML and pass it
/// to [`resolve_command_with`].
///
/// None of the fields carries `#[serde(default)]`, so all four keys must be
/// present when deserializing; use an empty list to switch a category off.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CommandConfig {
    /// Transparent wrappers that execute an inner command (env, sudo, etc.).
    pub wrappers: Vec<WrapperSpec>,
    /// Shells that can spawn inline code via `-c` (bash, sh, zsh, etc.).
    /// When invoked without `-c`, classified as script execution.
    pub shells: Vec<String>,
    /// Commands that execute their argument as shell code (eval).
    pub eval_commands: Vec<String>,
    /// Commands that execute a file in the current shell (source, `.`).
    pub source_commands: Vec<String>,
}
660
// Unit tests for this module live in the sibling file `types_tests.rs`
// and are compiled only in test builds.
#[cfg(test)]
#[path = "types_tests.rs"]
mod types_tests;