agent_shell_parser/parse/types.rs
1use std::borrow::Borrow;
2use std::fmt;
3use std::ops::Deref;
4
5use super::tokenize::{is_env_assignment, is_valid_env_key};
6
7// ---------------------------------------------------------------------------
8// Word newtype
9// ---------------------------------------------------------------------------
10
/// A single shell word token.
///
/// Wraps a `String` with domain-specific helpers for shell analysis (flag
/// detection, env assignment parsing, basename extraction). Derefs to `str`
/// for seamless use wherever a string slice is expected.
///
/// Note: `Word` carries raw shell text extracted from the parse tree. It is
/// not sanitized or validated — consumers must not treat word equality as
/// proof of command identity without considering the full resolution pipeline.
///
/// Serialization is transparent: a `Word` is (de)serialized exactly like its
/// inner `String`, with no wrapper object. `Debug` is implemented by hand in
/// this module and delegates to the inner string.
#[derive(Clone, PartialEq, Eq, Hash, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct Word(String);
23
24impl Word {
25 /// Returns `true` if this word starts with `-`.
26 pub fn is_flag(&self) -> bool {
27 self.0.starts_with('-')
28 }
29
30 /// Returns `true` if this word is a valid `KEY=VALUE` environment assignment.
31 pub fn is_assignment(&self) -> bool {
32 is_env_assignment(&self.0)
33 }
34
35 /// Split at the first `=` and return `(key, value)` if the key is a valid
36 /// environment variable name.
37 pub fn as_assignment(&self) -> Option<(&str, &str)> {
38 let eq_pos = self.0.find('=')?;
39 let key = &self.0[..eq_pos];
40 if is_valid_env_key(key) {
41 Some((key, &self.0[eq_pos + 1..]))
42 } else {
43 None
44 }
45 }
46
47 /// Strip the path prefix, e.g. `/usr/bin/ls` -> `ls`.
48 pub fn basename(&self) -> &str {
49 match self.0.rsplit_once('/') {
50 Some((_, name)) if !name.is_empty() => name,
51 _ => &self.0,
52 }
53 }
54
55 /// Explicit accessor for the inner string slice.
56 pub fn as_str(&self) -> &str {
57 &self.0
58 }
59
60 /// Consume and return the inner `String`.
61 pub fn into_inner(self) -> String {
62 self.0
63 }
64}
65
66// --- Deref / AsRef / Borrow ---
67
68impl Deref for Word {
69 type Target = str;
70
71 fn deref(&self) -> &str {
72 &self.0
73 }
74}
75
76impl AsRef<str> for Word {
77 fn as_ref(&self) -> &str {
78 &self.0
79 }
80}
81
82impl Borrow<str> for Word {
83 fn borrow(&self) -> &str {
84 &self.0
85 }
86}
87
88// --- Display / Debug ---
89
90impl fmt::Display for Word {
91 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
92 f.write_str(&self.0)
93 }
94}
95
96impl fmt::Debug for Word {
97 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
98 fmt::Debug::fmt(&self.0, f)
99 }
100}
101
102// --- From conversions ---
103
104impl From<String> for Word {
105 fn from(s: String) -> Self {
106 Word(s)
107 }
108}
109
110impl From<&str> for Word {
111 fn from(s: &str) -> Self {
112 Word(s.to_string())
113 }
114}
115
116// --- PartialEq with str types ---
117
118impl PartialEq<str> for Word {
119 fn eq(&self, other: &str) -> bool {
120 self.0 == other
121 }
122}
123
124impl PartialEq<&str> for Word {
125 fn eq(&self, other: &&str) -> bool {
126 self.0 == *other
127 }
128}
129
130impl PartialEq<Word> for str {
131 fn eq(&self, other: &Word) -> bool {
132 self == other.0
133 }
134}
135
136impl PartialEq<Word> for &str {
137 fn eq(&self, other: &Word) -> bool {
138 *self == other.0
139 }
140}
141
142impl PartialEq<String> for Word {
143 fn eq(&self, other: &String) -> bool {
144 self.0 == *other
145 }
146}
147
148impl PartialEq<Word> for String {
149 fn eq(&self, other: &Word) -> bool {
150 *self == other.0
151 }
152}
153
/// Shell operator that joins two consecutive pipeline segments.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum Operator {
    /// `&&` — the next command runs only if the previous one succeeded
    And,
    /// `||` — the next command runs only if the previous one failed
    Or,
    /// `;` — the next command runs unconditionally
    Semi,
    /// `|` — stdout is piped into the next command
    Pipe,
    /// `|&` — stdout and stderr are piped into the next command
    PipeErr,
    /// `&` — the previous command is backgrounded and the next runs immediately
    Background,
}

impl Operator {
    /// Returns the operator exactly as it is spelled in shell source.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::And => "&&",
            Self::Or => "||",
            Self::Semi => ";",
            Self::Pipe => "|",
            Self::PipeErr => "|&",
            Self::Background => "&",
        }
    }
}

// Displays the shell spelling returned by `as_str`.
impl fmt::Display for Operator {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = self.as_str();
        f.write_str(symbol)
    }
}
190
/// A fully decomposed compound command.
///
/// This is a recursive structure: segments may contain substitutions, and
/// each substitution contains a recursively-parsed [`ParsedPipeline`].
/// Evaluation proceeds bottom-up (a catamorphism): inner substitutions are
/// evaluated first, their output feeds the outer command.
///
/// Prefer the traversal helpers ([`find_segment`](Self::find_segment),
/// [`filter_segments`](Self::filter_segments),
/// [`find_pipeline`](Self::find_pipeline)) over walking the fields by hand;
/// they visit nested substitutions as well as this level.
#[derive(Debug, Clone)]
pub struct ParsedPipeline {
    /// The commands at this level of the pipeline.
    pub segments: Vec<ShellSegment>,
    /// Operators between consecutive segments.
    pub operators: Vec<Operator>,
    /// Substitutions in non-command structural positions: `for`-loop
    /// iteration values (`for i in $(cmd)`), `case` subjects
    /// (`case $(cmd) in`).
    ///
    /// These execute before any segment in this pipeline. Each is
    /// recursively parsed.
    pub structural_substitutions: Vec<SubstitutionSpan>,
    /// `true` when tree-sitter produced error-recovery nodes in the AST.
    ///
    /// The pipeline is still usable — tree-sitter always produces a tree —
    /// but callers should treat the result as best-effort.
    ///
    /// This flag describes this pipeline level only; use
    /// [`has_parse_errors_recursive`](Self::has_parse_errors_recursive) to
    /// include nested substitutions.
    pub has_parse_errors: bool,
}
215
216impl ParsedPipeline {
217 /// An empty pipeline representing a parse failure.
218 pub fn empty_with_error() -> Self {
219 Self {
220 segments: vec![],
221 operators: vec![],
222 structural_substitutions: vec![],
223 has_parse_errors: true,
224 }
225 }
226
227 /// Walk all pipelines in the tree (this one and all nested ones),
228 /// depth-first. Returns the first `Some(T)` produced by `f`.
229 ///
230 /// This is the lowest-level traversal primitive — it visits pipeline
231 /// nodes rather than segments, enabling checks on pipeline-level
232 /// properties (like `has_parse_errors`).
233 pub fn find_pipeline<T>(&self, f: &impl Fn(&ParsedPipeline) -> Option<T>) -> Option<T> {
234 if let Some(hit) = f(self) {
235 return Some(hit);
236 }
237 for sub in &self.structural_substitutions {
238 if let Some(hit) = sub.pipeline.find_pipeline(f) {
239 return Some(hit);
240 }
241 }
242 for seg in &self.segments {
243 for sub in &seg.substitutions {
244 if let Some(hit) = sub.pipeline.find_pipeline(f) {
245 return Some(hit);
246 }
247 }
248 }
249 None
250 }
251
252 /// Returns `true` if any pipeline in the tree satisfies `f`.
253 pub fn any_pipeline(&self, f: &impl Fn(&ParsedPipeline) -> bool) -> bool {
254 self.find_pipeline(&|p| if f(p) { Some(()) } else { None })
255 .is_some()
256 }
257
258 /// Walk the pipeline tree depth-first in execution order, applying `f`
259 /// to each [`ShellSegment`]. Returns the first `Some(T)` produced by
260 /// `f`, or `None` if every segment returns `None`.
261 ///
262 /// Traversal order mirrors shell evaluation:
263 /// 1. Structural substitutions (for-loop values, case subjects)
264 /// 2. For each segment: its substitutions first, then the segment itself
265 ///
266 /// This is the canonical way to inspect every command in the tree.
267 /// Both "does any segment satisfy P?" and "find the first segment
268 /// matching P" reduce to this.
269 pub fn find_segment<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Option<T> {
270 for sub in &self.structural_substitutions {
271 if let Some(hit) = sub.pipeline.find_segment(f) {
272 return Some(hit);
273 }
274 }
275 for seg in &self.segments {
276 for sub in &seg.substitutions {
277 if let Some(hit) = sub.pipeline.find_segment(f) {
278 return Some(hit);
279 }
280 }
281 if let Some(hit) = f(seg) {
282 return Some(hit);
283 }
284 }
285 None
286 }
287
288 /// Walk the pipeline tree depth-first, applying `f` to each
289 /// [`ShellSegment`] and collecting every non-`None` result.
290 ///
291 /// Same traversal order as [`find_segment`](Self::find_segment) but
292 /// does not short-circuit.
293 pub fn filter_segments<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Vec<T> {
294 let mut out = Vec::new();
295 self.filter_segments_into(f, &mut out);
296 out
297 }
298
299 fn filter_segments_into<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>, out: &mut Vec<T>) {
300 for sub in &self.structural_substitutions {
301 sub.pipeline.filter_segments_into(f, out);
302 }
303 for seg in &self.segments {
304 for sub in &seg.substitutions {
305 sub.pipeline.filter_segments_into(f, out);
306 }
307 if let Some(hit) = f(seg) {
308 out.push(hit);
309 }
310 }
311 }
312
313 /// Returns `true` if this pipeline or any nested substitution has
314 /// parse errors.
315 ///
316 /// When tree-sitter uses error recovery, some commands may not have
317 /// been extracted. Callers enforcing a security boundary should
318 /// treat a `true` return as "cannot safely analyze — fail closed."
319 pub fn has_parse_errors_recursive(&self) -> bool {
320 self.any_pipeline(&|p| p.has_parse_errors)
321 }
322}
323
/// A single evaluable command within a compound pipeline.
///
/// Carries the command both as raw text ([`command`](Self::command)) and as
/// a pre-tokenized word list ([`words`](Self::words)), plus any nested
/// substitutions found inside it.
#[derive(Debug, Clone)]
pub struct ShellSegment {
    /// The command text, exactly as it appears in the source (trimmed).
    ///
    /// Substitution syntax (`$()`, backticks, `<()`, `>()`) is preserved
    /// verbatim — the [`substitutions`](Self::substitutions) field carries
    /// the recursively-parsed contents with byte positions into this text.
    pub command: String,

    /// Pre-tokenized word list as tree-sitter understood word boundaries.
    ///
    /// Unlike shlex tokenization of [`command`](Self::command), this
    /// correctly preserves substitution syntax as single tokens. For
    /// example, `export FOO=$(echo test) BAR=baz` produces
    /// `["export", "FOO=$(echo test)", "BAR=baz"]` — shlex would
    /// incorrectly split inside the `$(...)`.
    ///
    /// Quotes are stripped: `"foo bar"` becomes `foo bar`. Both
    /// tree-sitter extraction and shlex fallback produce unquoted tokens.
    /// Substitution delimiters (`$(...)`, `` `...` ``, `<(...)`) are
    /// preserved as-is since they are semantic, not syntactic wrappers.
    ///
    /// Falls back to shlex/whitespace tokenization when tree-sitter does
    /// not provide word-level structure (e.g. unknown node types or
    /// heredoc loose words). The fallback is documented per node type in
    /// the parser source.
    pub words: Vec<Word>,

    /// Output redirection detected on a wrapping construct, if any.
    ///
    /// When the parser extracts commands from inside a control-flow block
    /// that has output redirection (e.g. `for ... done > file`), the
    /// redirect is not present in the segment's `command` text. This field
    /// carries the redirection so the eval layer can escalate the decision.
    pub redirection: Option<Redirection>,

    /// Substitutions within this segment's command text, in source order.
    ///
    /// Each substitution is evaluated before this segment's command.
    /// `start`/`end` byte offsets index into [`command`](Self::command).
    pub substitutions: Vec<SubstitutionSpan>,
}
367
/// A command substitution's position and recursively-parsed contents.
///
/// `start` and `end` form a half-open byte range (`end` is one past the
/// last byte) into the parent text described on `start`.
#[derive(Debug, Clone)]
pub struct SubstitutionSpan {
    /// Byte offset of the substitution start within the parent's text.
    ///
    /// For substitutions on a [`ShellSegment`], this indexes into
    /// `segment.command`. For structural substitutions on a
    /// [`ParsedPipeline`], this is relative to the source text passed
    /// to [`parse_with_substitutions`] at this recursion level (for
    /// nested pipelines, that is the inner text of the parent
    /// substitution, not the top-level command string).
    pub start: usize,
    /// Byte offset past the end of the substitution (exclusive bound),
    /// measured against the same text as `start`.
    pub end: usize,
    /// The recursively-parsed inner pipeline.
    pub pipeline: ParsedPipeline,
}
385
/// Describes an output redirection that may mutate filesystem state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Redirection {
    /// The redirection operator (e.g., `>`, `>>`, `>|`, `&>`, `&>>`, `<>`, `>&`).
    pub operator: &'static str,
    /// Source file descriptor, if explicitly specified (e.g., `2>` → `Some(2)`).
    pub fd: Option<u32>,
    /// Destination (file path, fd number for `>&N`, or empty for `<>`).
    pub target: String,
}

/// Renders a human-readable description of the form
/// `output redirection (<fd><operator> <target>)`.
///
/// The file descriptor is printed only when present. When `target` is
/// empty the separating space is omitted as well, so the output never ends
/// in a dangling space before the closing parenthesis.
impl fmt::Display for Redirection {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("output redirection (")?;
        if let Some(fd) = self.fd {
            write!(f, "{fd}")?;
        }
        f.write_str(self.operator)?;
        // `target` may legitimately be empty; skip the separator in that case.
        if !self.target.is_empty() {
            write!(f, " {}", self.target)?;
        }
        f.write_str(")")
    }
}
409
/// Tree-sitter failed to produce a syntax tree.
///
/// Extremely rare in practice — tree-sitter handles any input, including
/// malformed shell. The only known causes are memory allocation failure
/// or a cancelled parse.
///
/// This is a unit struct: it carries no detail beyond the fixed message
/// given in the `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
#[error("tree-sitter failed to produce a syntax tree")]
pub struct ParseError;
418
/// Classification of indirect execution patterns that may hide commands
/// from static analysis.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching, since variants may be added.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum IndirectExecution {
    /// `eval "..."` — argument string is executed as shell code.
    /// Cannot be statically analyzed in the general case.
    Eval,
    /// `bash -c "..."` / `sh -c "..."` — spawns a new shell with
    /// inline code. Cannot be statically analyzed.
    ShellSpawn,
    /// `env cmd` / `command cmd` / `sudo cmd` — transparent wrapper
    /// around another command. Strip the wrapper and re-analyze.
    CommandWrapper,
    /// `source file` / `. file` — executes a script in the current
    /// shell. Contents cannot be statically analyzed.
    SourceScript,
}
437
/// Properties of a parsed command segment relevant to security analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommandCharacteristics {
    /// Base command name (path stripped, env vars skipped).
    pub base_command: String,
    /// If this is an indirect execution wrapper, what kind; `None` for a
    /// command that is not classified as indirect execution.
    pub indirect_execution: Option<IndirectExecution>,
    /// Whether the command position contains a variable expansion
    /// (`$cmd`, `${cmd}`) that cannot be statically resolved.
    pub has_dynamic_command: bool,
}
449
/// A parsed flag from a command's argument list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFlag {
    /// The flag name without its value (e.g., `--force`, `-f`).
    /// The leading dashes are part of the name.
    pub name: Word,
    /// Value if specified with `=` (e.g., `--color=always` → `Some("always")`).
    /// `None` when the flag was written without an inline `=value`.
    pub value: Option<Word>,
}
458
/// An argument in a parsed command line.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum CommandArg {
    /// A flag token (e.g., `--force`, `-f`, `--color=always`).
    Flag(ParsedFlag),
    /// A non-flag token (subcommand, path, or other argument).
    Positional(Word),
}
468
/// Structurally decomposed command with arguments in source order.
///
/// Schema-free parse: flags are identified syntactically (tokens starting
/// with `-`). Without a command's flag definitions, `--flag value` is
/// ambiguous — the value appears as a separate positional. Schema-aware
/// consumers walk `args` to associate values with flags they know about.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedCommand {
    /// Base command name (path stripped, env vars skipped).
    ///
    /// [`from_words`](Self::from_words) sets this to an empty word when the
    /// input contains no command word.
    pub command: Word,
    /// Arguments in source order — flags and positionals interleaved.
    pub args: Vec<CommandArg>,
}
482
483impl ParsedCommand {
484 /// Construct a `ParsedCommand` directly from a word slice, avoiding a
485 /// string round-trip through shlex.
486 ///
487 /// - First word that is not an env assignment becomes the `command`
488 /// (with path prefix stripped).
489 /// - Remaining words are classified as [`CommandArg::Flag`] or
490 /// [`CommandArg::Positional`] using the same schema-free rules as
491 /// [`parse_command`](super::tokenize::parse_command).
492 pub fn from_words(words: &[Word]) -> Self {
493 let cmd_idx = words.iter().position(|w| !w.is_assignment());
494 let Some(cmd_idx) = cmd_idx else {
495 return ParsedCommand {
496 command: Word::from(""),
497 args: vec![],
498 };
499 };
500
501 let base = Word::from(words[cmd_idx].basename());
502
503 let mut args = Vec::new();
504 let mut past_double_dash = false;
505
506 for token in &words[cmd_idx + 1..] {
507 if past_double_dash {
508 args.push(CommandArg::Positional(token.clone()));
509 continue;
510 }
511 if token == "--" {
512 past_double_dash = true;
513 continue;
514 }
515 if let Some(rest) = token.strip_prefix("--") {
516 if let Some((name, value)) = rest.split_once('=') {
517 args.push(CommandArg::Flag(ParsedFlag {
518 name: Word::from(format!("--{name}")),
519 value: Some(Word::from(value)),
520 }));
521 } else {
522 args.push(CommandArg::Flag(ParsedFlag {
523 name: token.clone(),
524 value: None,
525 }));
526 }
527 } else if token.starts_with('-') && token.len() > 1 {
528 args.push(CommandArg::Flag(ParsedFlag {
529 name: token.clone(),
530 value: None,
531 }));
532 } else {
533 args.push(CommandArg::Positional(token.clone()));
534 }
535 }
536
537 ParsedCommand {
538 command: base,
539 args,
540 }
541 }
542
543 /// First positional argument (often a subcommand).
544 pub fn subcommand(&self) -> Option<&str> {
545 self.args.iter().find_map(|a| match a {
546 CommandArg::Positional(s) => Some(s.as_str()),
547 _ => None,
548 })
549 }
550
551 /// Iterate over all flags.
552 pub fn flags(&self) -> impl Iterator<Item = &ParsedFlag> {
553 self.args.iter().filter_map(|a| match a {
554 CommandArg::Flag(f) => Some(f),
555 _ => None,
556 })
557 }
558
559 /// Iterate over all positional arguments.
560 pub fn positional(&self) -> impl Iterator<Item = &str> {
561 self.args.iter().filter_map(|a| match a {
562 CommandArg::Positional(s) => Some(s.as_str()),
563 _ => None,
564 })
565 }
566
567 /// Check if a flag is present by name (e.g., `--force` or `-f`).
568 pub fn has_flag(&self, name: &str) -> bool {
569 self.flags().any(|f| f.name == name)
570 }
571
572 /// Reconstruct a flat word list.
573 pub fn to_words(&self) -> Vec<Word> {
574 let mut words = vec![self.command.clone()];
575 for arg in &self.args {
576 match arg {
577 CommandArg::Flag(f) => match &f.value {
578 Some(v) => words.push(Word::from(format!("{}={}", f.name, v))),
579 None => words.push(f.name.clone()),
580 },
581 CommandArg::Positional(s) => words.push(s.clone()),
582 }
583 }
584 words
585 }
586}
587
/// Result of resolving a command through the indirection layer.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum ResolvedCommand {
    /// Wrappers stripped, command structurally parsed.
    Resolved(ParsedCommand),
    /// The command is unanalyzable — eval, source, shell -c, dynamic `$cmd`.
    Unanalyzable(UnanalyzableCommand),
}
597
/// A command that cannot be statically analyzed.
///
/// Carried by [`ResolvedCommand::Unanalyzable`].
#[derive(Debug, Clone)]
pub struct UnanalyzableCommand {
    /// The command that triggered the classification (e.g., `eval`, `bash`).
    pub command: String,
    /// Why it's unanalyzable.
    pub kind: IndirectExecution,
}
606
/// Describes how to strip a transparent wrapper command to find the inner command.
///
/// Each wrapper has different flag semantics. This struct captures just enough
/// to correctly skip past the wrapper and its flags to the real command.
/// Designed for deserialization from config files — consumers load specs from
/// JSON/TOML/YAML and pass them to [`resolve_command_with`].
///
/// Every field except `name` is marked `#[serde(default)]`, so a config entry
/// may omit it; an omitted list is empty, an omitted flag is `false`, and an
/// omitted count is `0`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct WrapperSpec {
    /// Command name to match (basename, e.g., "sudo").
    pub name: String,
    /// Short flags that consume the next token as a value (e.g., `["-u", "-g"]`).
    #[serde(default)]
    pub short_value_flags: Vec<String>,
    /// Long flags that consume the next token as a value (e.g., `["--user", "--group"]`).
    #[serde(default)]
    pub long_value_flags: Vec<String>,
    /// Flags whose presence makes the entire invocation unanalyzable.
    /// Example: `env -S` executes its value as a command string (eval-equivalent).
    #[serde(default)]
    pub unanalyzable_flags: Vec<String>,
    /// Whether to skip leading `KEY=VALUE` tokens after the wrapper (env-style).
    #[serde(default)]
    pub skip_env_assignments: bool,
    /// Whether `--` terminates flag processing for this wrapper.
    #[serde(default)]
    pub has_terminator: bool,
    /// Number of leading positional arguments to skip before the inner command.
    ///
    /// Some wrappers require mandatory positional args before the command:
    /// `timeout DURATION cmd`, `chrt PRIORITY cmd`, `taskset MASK cmd`.
    /// Set this to the number of positionals to consume before treating
    /// the next non-flag token as the inner command.
    #[serde(default)]
    pub skip_positionals: usize,
}
642
/// Complete command classification configuration.
///
/// Drives all indirect execution detection — no command knowledge is hardcoded
/// in the parser source. Consumers load this from JSON/TOML/YAML and pass it
/// to [`resolve_command_with`].
///
/// None of the fields carry `#[serde(default)]`, unlike the optional fields
/// of [`WrapperSpec`].
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CommandConfig {
    /// Transparent wrappers that execute an inner command (env, sudo, etc.).
    pub wrappers: Vec<WrapperSpec>,
    /// Shells that can spawn inline code via `-c` (bash, sh, zsh, etc.).
    /// When invoked without `-c`, classified as script execution.
    pub shells: Vec<String>,
    /// Commands that execute their argument as shell code (eval).
    pub eval_commands: Vec<String>,
    /// Commands that execute a file in the current shell (source, `.`).
    pub source_commands: Vec<String>,
}
660
// Unit tests for this module live in the sibling file named by `#[path]`
// and are compiled only in test builds.
#[cfg(test)]
#[path = "types_tests.rs"]
mod types_tests;