agent_shell_parser/parse/types.rs
1use std::fmt;
2
/// Shell operator separating consecutive pipeline segments.
///
/// The enum is `#[non_exhaustive]`, so code outside this crate must include
/// a wildcard arm when matching on it. It is a plain fieldless enum and is
/// therefore `Copy` and `Hash`, so it can be passed by value and used as a
/// map or set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum Operator {
    /// `&&` — run next only if previous succeeded
    And,
    /// `||` — run next only if previous failed
    Or,
    /// `;` — run next unconditionally
    Semi,
    /// `|` — pipe stdout
    Pipe,
    /// `|&` — pipe stdout+stderr
    PipeErr,
    /// `&` — previous command backgrounded, next runs immediately
    Background,
}

impl Operator {
    /// Returns the operator as the literal text listed on each variant
    /// (e.g. `"&&"` for [`Operator::And`]).
    pub fn as_str(&self) -> &'static str {
        match self {
            Operator::And => "&&",
            Operator::Or => "||",
            Operator::Semi => ";",
            Operator::Pipe => "|",
            Operator::PipeErr => "|&",
            Operator::Background => "&",
        }
    }
}

impl fmt::Display for Operator {
    /// Writes the same text as [`Operator::as_str`].
    ///
    /// Goes through [`fmt::Formatter::pad`] rather than `write_str` so that
    /// width, fill and alignment flags (e.g. `{:<4}`) are honored. With a
    /// plain `{}` the output is unchanged.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.pad(self.as_str())
    }
}
39
/// A fully decomposed compound command.
///
/// This is a recursive structure: segments may contain substitutions, and
/// each substitution contains a recursively-parsed [`ParsedPipeline`].
/// Evaluation proceeds bottom-up (a catamorphism): inner substitutions are
/// evaluated first, their output feeds the outer command.
///
/// Nested pipelines are reachable through two paths, and the traversal
/// helpers (`find_pipeline`, `find_segment`, `filter_segments`) follow both:
/// [`structural_substitutions`](Self::structural_substitutions) and the
/// `substitutions` of every segment.
#[derive(Debug, Clone)]
pub struct ParsedPipeline {
    /// The individual commands that make up this pipeline.
    pub segments: Vec<ShellSegment>,
    /// Operators between consecutive segments.
    ///
    /// NOTE(review): presumably `operators[i]` joins `segments[i]` and
    /// `segments[i + 1]` — confirm against the parser.
    pub operators: Vec<Operator>,
    /// Substitutions in non-command structural positions: `for`-loop
    /// iteration values (`for i in $(cmd)`), `case` subjects
    /// (`case $(cmd) in`).
    ///
    /// These execute before any segment in this pipeline. Each is
    /// recursively parsed.
    pub structural_substitutions: Vec<SubstitutionSpan>,
    /// `true` when tree-sitter produced error-recovery nodes in the AST.
    ///
    /// The pipeline is still usable — tree-sitter always produces a tree —
    /// but callers should treat the result as best-effort.
    ///
    /// This flag covers only this pipeline node; nested pipelines carry
    /// their own flag.
    pub has_parse_errors: bool,
}
64
65impl ParsedPipeline {
66 /// An empty pipeline representing a parse failure.
67 pub fn empty_with_error() -> Self {
68 Self {
69 segments: vec![],
70 operators: vec![],
71 structural_substitutions: vec![],
72 has_parse_errors: true,
73 }
74 }
75
76 /// Walk all pipelines in the tree (this one and all nested ones),
77 /// depth-first. Returns the first `Some(T)` produced by `f`.
78 ///
79 /// This is the lowest-level traversal primitive — it visits pipeline
80 /// nodes rather than segments, enabling checks on pipeline-level
81 /// properties (like `has_parse_errors`).
82 pub fn find_pipeline<T>(&self, f: &impl Fn(&ParsedPipeline) -> Option<T>) -> Option<T> {
83 if let Some(hit) = f(self) {
84 return Some(hit);
85 }
86 for sub in &self.structural_substitutions {
87 if let Some(hit) = sub.pipeline.find_pipeline(f) {
88 return Some(hit);
89 }
90 }
91 for seg in &self.segments {
92 for sub in &seg.substitutions {
93 if let Some(hit) = sub.pipeline.find_pipeline(f) {
94 return Some(hit);
95 }
96 }
97 }
98 None
99 }
100
101 /// Returns `true` if any pipeline in the tree satisfies `f`.
102 pub fn any_pipeline(&self, f: &impl Fn(&ParsedPipeline) -> bool) -> bool {
103 self.find_pipeline(&|p| if f(p) { Some(()) } else { None })
104 .is_some()
105 }
106
107 /// Walk the pipeline tree depth-first in execution order, applying `f`
108 /// to each [`ShellSegment`]. Returns the first `Some(T)` produced by
109 /// `f`, or `None` if every segment returns `None`.
110 ///
111 /// Traversal order mirrors shell evaluation:
112 /// 1. Structural substitutions (for-loop values, case subjects)
113 /// 2. For each segment: its substitutions first, then the segment itself
114 ///
115 /// This is the canonical way to inspect every command in the tree.
116 /// Both "does any segment satisfy P?" and "find the first segment
117 /// matching P" reduce to this.
118 pub fn find_segment<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Option<T> {
119 for sub in &self.structural_substitutions {
120 if let Some(hit) = sub.pipeline.find_segment(f) {
121 return Some(hit);
122 }
123 }
124 for seg in &self.segments {
125 for sub in &seg.substitutions {
126 if let Some(hit) = sub.pipeline.find_segment(f) {
127 return Some(hit);
128 }
129 }
130 if let Some(hit) = f(seg) {
131 return Some(hit);
132 }
133 }
134 None
135 }
136
137 /// Walk the pipeline tree depth-first, applying `f` to each
138 /// [`ShellSegment`] and collecting every non-`None` result.
139 ///
140 /// Same traversal order as [`find_segment`](Self::find_segment) but
141 /// does not short-circuit.
142 pub fn filter_segments<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Vec<T> {
143 let mut out = Vec::new();
144 self.filter_segments_into(f, &mut out);
145 out
146 }
147
148 fn filter_segments_into<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>, out: &mut Vec<T>) {
149 for sub in &self.structural_substitutions {
150 sub.pipeline.filter_segments_into(f, out);
151 }
152 for seg in &self.segments {
153 for sub in &seg.substitutions {
154 sub.pipeline.filter_segments_into(f, out);
155 }
156 if let Some(hit) = f(seg) {
157 out.push(hit);
158 }
159 }
160 }
161
162 /// Returns `true` if this pipeline or any nested substitution has
163 /// parse errors.
164 ///
165 /// When tree-sitter uses error recovery, some commands may not have
166 /// been extracted. Callers enforcing a security boundary should
167 /// treat a `true` return as "cannot safely analyze — fail closed."
168 pub fn has_parse_errors_recursive(&self) -> bool {
169 self.any_pipeline(&|p| p.has_parse_errors)
170 }
171}
172
/// A single evaluable command within a compound pipeline.
///
/// Segments are the leaves that `ParsedPipeline::find_segment` and
/// `ParsedPipeline::filter_segments` hand to their callback; the pipelines
/// nested under [`substitutions`](Self::substitutions) are visited before
/// the segment itself.
#[derive(Debug, Clone)]
pub struct ShellSegment {
    /// The command text, exactly as it appears in the source (trimmed).
    ///
    /// Substitution syntax (`$()`, backticks, `<()`, `>()`) is preserved
    /// verbatim — the [`substitutions`](Self::substitutions) field carries
    /// the recursively-parsed contents with byte positions into this text.
    pub command: String,

    /// Output redirection detected on a wrapping construct.
    ///
    /// When the parser extracts commands from inside a control-flow block
    /// that has output redirection (e.g. `for ... done > file`), the
    /// redirect is not present in the segment's `command` text. This field
    /// carries the redirection so the eval layer can escalate the decision.
    pub redirection: Option<Redirection>,

    /// Substitutions within this segment's command text, in source order.
    ///
    /// Each substitution is evaluated before this segment's command.
    /// `start`/`end` byte offsets index into [`command`](Self::command).
    pub substitutions: Vec<SubstitutionSpan>,
}
197
/// A command substitution's position and recursively-parsed contents.
///
/// `start` and `end` describe a half-open byte range: `end` is the offset
/// just past the substitution.
#[derive(Debug, Clone)]
pub struct SubstitutionSpan {
    /// Byte offset of the substitution start within the parent's text.
    ///
    /// For substitutions on a [`ShellSegment`], this indexes into
    /// `segment.command`. For structural substitutions on a
    /// [`ParsedPipeline`], this is relative to the source text passed
    /// to [`parse_with_substitutions`] at this recursion level (for
    /// nested pipelines, that is the inner text of the parent
    /// substitution, not the top-level command string).
    pub start: usize,
    /// Byte offset past the end of the substitution.
    ///
    /// NOTE(review): presumably measured against the same text as
    /// `start` — confirm against the parser.
    pub end: usize,
    /// The recursively-parsed inner pipeline.
    pub pipeline: ParsedPipeline,
}
215
/// An output redirection, i.e. one that may mutate filesystem state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Redirection {
    /// The redirection operator (e.g., `>`, `>>`, `>|`, `&>`, `&>>`, `<>`, `>&`).
    pub operator: &'static str,
    /// Source file descriptor when one is written explicitly
    /// (e.g., `2>` → `Some(2)`).
    pub fd: Option<u32>,
    /// Destination (file path, fd number for `>&N`, or empty for `<>`).
    pub target: String,
}

impl fmt::Display for Redirection {
    /// Renders `output redirection (<fd><operator> <target>)`, leaving out
    /// the descriptor when `fd` is `None`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str("output redirection (")?;
        if let Some(descriptor) = self.fd {
            write!(f, "{descriptor}")?;
        }
        write!(f, "{} {})", self.operator, self.target)
    }
}
239
/// Tree-sitter failed to produce a syntax tree.
///
/// Extremely rare in practice — tree-sitter handles any input, including
/// malformed shell. The only known causes are memory allocation failure
/// or a cancelled parse.
///
/// This is a unit struct: it carries no detail beyond the fixed message
/// given in the `#[error(...)]` attribute.
#[derive(Debug, thiserror::Error)]
#[error("tree-sitter failed to produce a syntax tree")]
pub struct ParseError;
248
/// Classification of indirect execution patterns that may hide commands
/// from static analysis.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum IndirectExecution {
    /// `eval "..."` — argument string is executed as shell code.
    /// Cannot be statically analyzed in the general case.
    Eval,
    /// `bash -c "..."` / `sh -c "..."` — spawns a new shell with
    /// inline code. Cannot be statically analyzed.
    ShellSpawn,
    /// `env cmd` / `command cmd` / `sudo cmd` — transparent wrapper
    /// around another command. Strip the wrapper and re-analyze.
    CommandWrapper,
    /// `source file` / `. file` — executes a script in the current
    /// shell. Contents cannot be statically analyzed.
    SourceScript,
}
267
/// Properties of a parsed command segment relevant to security analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommandCharacteristics {
    /// Base command name (path stripped, env vars skipped).
    pub base_command: String,
    /// If this is an indirect execution wrapper, what kind.
    ///
    /// NOTE(review): presumably `None` for ordinary, directly-executed
    /// commands — confirm against the classifier.
    pub indirect_execution: Option<IndirectExecution>,
    /// Whether the command position contains a variable expansion
    /// (`$cmd`, `${cmd}`) that cannot be statically resolved.
    pub has_dynamic_command: bool,
}
279
/// A parsed flag from a command's argument list.
///
/// Only the `--flag=value` form attaches a value here. Per the
/// `ParsedCommand` docs, a space-separated `--flag value` leaves `value`
/// as `None` and the value shows up as a separate positional argument.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFlag {
    /// The flag name without its value (e.g., `--force`, `-f`).
    pub name: String,
    /// Value if specified with `=` (e.g., `--color=always` → `Some("always")`).
    pub value: Option<String>,
}
288
/// An argument in a parsed command line.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum CommandArg {
    /// A flag token (e.g., `--force`, `-f`, `--color=always`).
    Flag(ParsedFlag),
    /// A non-flag token (subcommand, path, or other argument).
    Positional(String),
}
298
/// Structurally decomposed command with arguments in source order.
///
/// Schema-free parse: flags are identified syntactically (tokens starting
/// with `-`). Without a command's flag definitions, `--flag value` is
/// ambiguous — the value appears as a separate positional. Schema-aware
/// consumers walk `args` to associate values with flags they know about.
///
/// The accessor methods (`flags`, `positional`, `subcommand`, `has_flag`)
/// are filtered views over `args`; `to_words` flattens the command back
/// into a word list.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedCommand {
    /// Base command name (path stripped, env vars skipped).
    pub command: String,
    /// Arguments in source order — flags and positionals interleaved.
    pub args: Vec<CommandArg>,
}
312
313impl ParsedCommand {
314 /// First positional argument (often a subcommand).
315 pub fn subcommand(&self) -> Option<&str> {
316 self.args.iter().find_map(|a| match a {
317 CommandArg::Positional(s) => Some(s.as_str()),
318 _ => None,
319 })
320 }
321
322 /// Iterate over all flags.
323 pub fn flags(&self) -> impl Iterator<Item = &ParsedFlag> {
324 self.args.iter().filter_map(|a| match a {
325 CommandArg::Flag(f) => Some(f),
326 _ => None,
327 })
328 }
329
330 /// Iterate over all positional arguments.
331 pub fn positional(&self) -> impl Iterator<Item = &str> {
332 self.args.iter().filter_map(|a| match a {
333 CommandArg::Positional(s) => Some(s.as_str()),
334 _ => None,
335 })
336 }
337
338 /// Check if a flag is present by name (e.g., `--force` or `-f`).
339 pub fn has_flag(&self, name: &str) -> bool {
340 self.flags().any(|f| f.name == name)
341 }
342
343 /// Reconstruct a flat word list.
344 pub fn to_words(&self) -> Vec<String> {
345 let mut words = vec![self.command.clone()];
346 for arg in &self.args {
347 match arg {
348 CommandArg::Flag(f) => match &f.value {
349 Some(v) => words.push(format!("{}={}", f.name, v)),
350 None => words.push(f.name.clone()),
351 },
352 CommandArg::Positional(s) => words.push(s.clone()),
353 }
354 }
355 words
356 }
357}
358
/// Result of resolving a command through the indirection layer.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum ResolvedCommand {
    /// Wrappers stripped, command structurally parsed.
    Resolved(ParsedCommand),
    /// The command is unanalyzable — eval, source, shell -c, dynamic `$cmd`.
    Unanalyzable(UnanalyzableCommand),
}
368
/// A command that cannot be statically analyzed.
///
/// NOTE(review): `ResolvedCommand::Unanalyzable` lists dynamic `$cmd` as
/// one cause, but `IndirectExecution` as declared in this file has no
/// variant for it — confirm which `kind` the resolver reports in that case.
#[derive(Debug, Clone)]
pub struct UnanalyzableCommand {
    /// The command that triggered the classification (e.g., `eval`, `bash`).
    pub command: String,
    /// Why it's unanalyzable.
    pub kind: IndirectExecution,
}
377
/// Describes how to strip a transparent wrapper command to find the inner command.
///
/// Each wrapper has different flag semantics. This struct captures just enough
/// to correctly skip past the wrapper and its flags to the real command.
/// Designed for deserialization from config files — consumers load specs from
/// JSON/TOML/YAML and pass them to [`resolve_command_with`].
///
/// Every field except `name` is `#[serde(default)]`, so a config entry may
/// omit it; omitted fields deserialize to an empty list, `false`, or `0`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct WrapperSpec {
    /// Command name to match (basename, e.g., "sudo").
    pub name: String,
    /// Short flags that consume the next token as a value (e.g., `["-u", "-g"]`).
    #[serde(default)]
    pub short_value_flags: Vec<String>,
    /// Long flags that consume the next token as a value (e.g., `["--user", "--group"]`).
    #[serde(default)]
    pub long_value_flags: Vec<String>,
    /// Flags whose presence makes the entire invocation unanalyzable.
    /// Example: `env -S` executes its value as a command string (eval-equivalent).
    #[serde(default)]
    pub unanalyzable_flags: Vec<String>,
    /// Whether to skip leading `KEY=VALUE` tokens after the wrapper (env-style).
    #[serde(default)]
    pub skip_env_assignments: bool,
    /// Whether `--` terminates flag processing for this wrapper.
    #[serde(default)]
    pub has_terminator: bool,
    /// Number of leading positional arguments to skip before the inner command.
    ///
    /// Some wrappers require mandatory positional args before the command:
    /// `timeout DURATION cmd`, `chrt PRIORITY cmd`, `taskset MASK cmd`.
    /// Set this to the number of positionals to consume before treating
    /// the next non-flag token as the inner command.
    #[serde(default)]
    pub skip_positionals: usize,
}
413
/// Complete command classification configuration.
///
/// Drives all indirect execution detection — no command knowledge is hardcoded
/// in the parser source. Consumers load this from JSON/TOML/YAML and pass it
/// to [`resolve_command_with`].
///
/// NOTE(review): unlike `WrapperSpec`, none of these fields is marked
/// `#[serde(default)]`, so all four lists must be present when
/// deserializing — confirm that this strictness is intentional.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CommandConfig {
    /// Transparent wrappers that execute an inner command (env, sudo, etc.).
    pub wrappers: Vec<WrapperSpec>,
    /// Shells that can spawn inline code via `-c` (bash, sh, zsh, etc.).
    /// When invoked without `-c`, classified as script execution.
    pub shells: Vec<String>,
    /// Commands that execute their argument as shell code (eval).
    pub eval_commands: Vec<String>,
    /// Commands that execute a file in the current shell (source, `.`).
    pub source_commands: Vec<String>,
}
431
#[cfg(test)]
mod tests {
    use super::super::parse_with_substitutions;
    use super::{ParsedPipeline, ShellSegment, SubstitutionSpan};

    /// Parses `cmd` through the real parser, panicking on failure.
    fn parse(cmd: &str) -> ParsedPipeline {
        parse_with_substitutions(cmd).expect("parse failed")
    }

    /// Wraps a hand-built pipeline in a span. The offsets are irrelevant to
    /// the traversal helpers under test, so they are left at zero.
    fn span(pipeline: ParsedPipeline) -> SubstitutionSpan {
        SubstitutionSpan {
            start: 0,
            end: 0,
            pipeline,
        }
    }

    /// Hand-builds a segment whose substitutions wrap the given pipelines.
    fn segment(command: &str, nested: Vec<ParsedPipeline>) -> ShellSegment {
        ShellSegment {
            command: command.to_string(),
            redirection: None,
            substitutions: nested.into_iter().map(span).collect(),
        }
    }

    /// Hand-builds an error-free pipeline, bypassing the parser so the
    /// traversal logic can be tested on exactly the tree shape we want.
    fn clean(segments: Vec<ShellSegment>, structural: Vec<ParsedPipeline>) -> ParsedPipeline {
        ParsedPipeline {
            segments,
            operators: vec![],
            structural_substitutions: structural.into_iter().map(span).collect(),
            has_parse_errors: false,
        }
    }

    // --- find_segment ---

    #[test]
    fn find_segment_returns_first_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("ls") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("ls -la"));
    }

    #[test]
    fn find_segment_returns_none_when_no_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("git") {
                Some(())
            } else {
                None
            }
        });
        assert!(found.is_none());
    }

    #[test]
    fn find_segment_recurses_into_substitutions() {
        let p = parse("echo $(git status)");
        let found = p.find_segment(&|seg| {
            if seg.command.contains("git status") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("git status"));
    }

    #[test]
    fn find_segment_visits_substitutions_before_parent() {
        // In "echo $(date)", the walker should visit "date" before "echo $(date)".
        // filter_segments with Some for all collects in traversal order.
        let p = parse("echo $(date)");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert_eq!(all, vec!["date", "echo $(date)"]);

        // find_segment itself must stop on the first visited segment.
        let first = p.find_segment(&|seg| Some(seg.command.clone()));
        assert_eq!(first.as_deref(), Some("date"));
    }

    #[test]
    fn find_segment_visits_structural_substitutions_first() {
        let p = parse("for i in $(seq 10); do echo $i; done");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert_eq!(all[0], "seq 10");

        let first = p.find_segment(&|seg| Some(seg.command.clone()));
        assert_eq!(first.as_deref(), Some("seq 10"));
    }

    #[test]
    fn segment_walkers_handle_empty_pipeline() {
        let p = clean(vec![], vec![]);
        let found: Option<String> = p.find_segment(&|seg| Some(seg.command.clone()));
        assert!(found.is_none());
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert!(all.is_empty());
    }

    // --- filter_segments ---

    #[test]
    fn filter_segments_collects_all_matches() {
        let p = parse("echo a && echo b && ls c");
        let echoes: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("echo") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(echoes, vec!["echo a", "echo b"]);
    }

    #[test]
    fn filter_segments_collects_from_nested() {
        let p = parse("echo $(git status && git diff)");
        let gits: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("git") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(gits, vec!["git status", "git diff"]);
    }

    #[test]
    fn filter_segments_orders_hand_built_tree() {
        // Structural substitutions first, then each segment's substitutions,
        // then the segment itself.
        let p = clean(
            vec![
                segment("outer-a", vec![clean(vec![segment("inner-a", vec![])], vec![])]),
                segment("outer-b", vec![]),
            ],
            vec![clean(vec![segment("structural", vec![])], vec![])],
        );
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert_eq!(all, vec!["structural", "inner-a", "outer-a", "outer-b"]);
    }

    // --- find_pipeline / any_pipeline ---

    #[test]
    fn find_pipeline_checks_self_before_nested() {
        let p = clean(
            vec![segment("echo", vec![ParsedPipeline::empty_with_error()])],
            vec![],
        );
        // The root is clean, so the first pipeline visited reports `false`.
        assert_eq!(p.find_pipeline(&|q| Some(q.has_parse_errors)), Some(false));
    }

    #[test]
    fn any_pipeline_is_false_when_nothing_matches() {
        let p = clean(
            vec![segment("echo", vec![clean(vec![], vec![])])],
            vec![clean(vec![], vec![])],
        );
        assert!(!p.any_pipeline(&|_| false));
        assert!(p.any_pipeline(&|_| true));
    }

    // --- has_parse_errors_recursive ---

    #[test]
    fn no_errors_on_valid_input() {
        assert!(!parse("echo hello").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_compound() {
        assert!(!parse("echo a && echo b | cat").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_substitution() {
        assert!(!parse("echo $(date)").has_parse_errors_recursive());
    }

    #[test]
    fn empty_with_error_reports_errors() {
        let p = ParsedPipeline::empty_with_error();
        assert!(p.segments.is_empty());
        assert!(p.operators.is_empty());
        assert!(p.structural_substitutions.is_empty());
        assert!(p.has_parse_errors);
        assert!(p.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_segment_substitution_are_reported() {
        let p = clean(
            vec![segment("echo", vec![ParsedPipeline::empty_with_error()])],
            vec![],
        );
        assert!(!p.has_parse_errors);
        assert!(p.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_structural_substitution_are_reported() {
        let p = clean(
            vec![segment("echo", vec![])],
            vec![ParsedPipeline::empty_with_error()],
        );
        assert!(!p.has_parse_errors);
        assert!(p.has_parse_errors_recursive());
    }

    #[test]
    fn errors_two_levels_deep_are_reported() {
        let middle = clean(
            vec![segment("middle", vec![ParsedPipeline::empty_with_error()])],
            vec![],
        );
        let p = clean(vec![segment("outer", vec![middle])], vec![]);
        assert!(!p.has_parse_errors);
        assert!(p.has_parse_errors_recursive());
    }
}
541}