agent-shell-parser 0.4.2

Shared parsing substrate for agent hook binaries — JSON input, shell tokenization
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
use std::fmt;

/// Shell control operator that joins two consecutive pipeline segments.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum Operator {
    /// `&&` — the next segment runs only when the previous one succeeded.
    And,
    /// `||` — the next segment runs only when the previous one failed.
    Or,
    /// `;` — the next segment runs regardless of the previous outcome.
    Semi,
    /// `|` — stdout of the previous segment is piped into the next.
    Pipe,
    /// `|&` — stdout and stderr of the previous segment are piped into the next.
    PipeErr,
    /// `&` — the previous segment is backgrounded and the next starts immediately.
    Background,
}

impl Operator {
    /// Returns the operator exactly as it is spelled in shell source
    /// (for example `"&&"` for [`Operator::And`]).
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::And => "&&",
            Self::Or => "||",
            Self::Semi => ";",
            Self::Pipe => "|",
            Self::PipeErr => "|&",
            Self::Background => "&",
        }
    }
}

impl fmt::Display for Operator {
    /// Writes the same text as [`Operator::as_str`].
    ///
    /// The text is emitted with `write_str`, so width, fill and alignment
    /// requested by the caller's format spec are not applied.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let symbol = self.as_str();
        f.write_str(symbol)
    }
}

/// A fully decomposed compound command.
///
/// This is a recursive structure: segments may contain substitutions, and
/// each substitution contains a recursively-parsed [`ParsedPipeline`].
/// Evaluation proceeds bottom-up (a catamorphism): inner substitutions are
/// evaluated first, their output feeds the outer command.
///
/// Nested pipelines are reachable through two paths:
/// [`structural_substitutions`](Self::structural_substitutions) on the
/// pipeline itself and [`ShellSegment::substitutions`] on each segment.
#[derive(Debug, Clone)]
pub struct ParsedPipeline {
    /// The individual commands that make up this pipeline.
    pub segments: Vec<ShellSegment>,
    /// Operators between consecutive segments.
    ///
    /// NOTE(review): presumably `operators[i]` joins `segments[i]` and
    /// `segments[i + 1]` — confirm against the parser that fills this in.
    pub operators: Vec<Operator>,
    /// Substitutions in non-command structural positions: `for`-loop
    /// iteration values (`for i in $(cmd)`), `case` subjects
    /// (`case $(cmd) in`).
    ///
    /// These execute before any segment in this pipeline. Each is
    /// recursively parsed.
    pub structural_substitutions: Vec<SubstitutionSpan>,
    /// `true` when tree-sitter produced error-recovery nodes in the AST.
    ///
    /// The pipeline is still usable — tree-sitter always produces a tree —
    /// but callers should treat the result as best-effort. This flag covers
    /// this pipeline only; nested pipelines carry their own flag.
    pub has_parse_errors: bool,
}

impl ParsedPipeline {
    /// Builds the placeholder used when parsing failed: no segments, no
    /// operators, no substitutions, and `has_parse_errors` set.
    pub fn empty_with_error() -> Self {
        Self {
            has_parse_errors: true,
            segments: Vec::new(),
            operators: Vec::new(),
            structural_substitutions: Vec::new(),
        }
    }

    /// Depth-first search over pipeline nodes (this one plus every nested
    /// one), returning the first `Some(T)` that `f` yields.
    ///
    /// Visiting order: this pipeline, then the pipelines of its structural
    /// substitutions, then the pipelines of each segment's substitutions.
    /// Because `f` sees whole pipelines rather than segments, it can check
    /// pipeline-level properties such as `has_parse_errors`.
    pub fn find_pipeline<T>(&self, f: &impl Fn(&ParsedPipeline) -> Option<T>) -> Option<T> {
        f(self).or_else(|| {
            let per_segment = self
                .segments
                .iter()
                .flat_map(|seg| seg.substitutions.iter());
            self.structural_substitutions
                .iter()
                .chain(per_segment)
                .find_map(|nested| nested.pipeline.find_pipeline(f))
        })
    }

    /// Reports whether `f` holds for at least one pipeline in the tree.
    pub fn any_pipeline(&self, f: &impl Fn(&ParsedPipeline) -> bool) -> bool {
        let probe = |pipeline: &ParsedPipeline| if f(pipeline) { Some(()) } else { None };
        self.find_pipeline(&probe).is_some()
    }

    /// Depth-first search over every [`ShellSegment`] in the tree, in
    /// execution order, returning the first `Some(T)` that `f` yields
    /// (or `None` when `f` rejects every segment).
    ///
    /// The order follows shell evaluation:
    /// 1. Structural substitutions (for-loop values, case subjects)
    /// 2. For each segment: its substitutions first, then the segment itself
    ///
    /// Both "does any segment satisfy P?" and "find the first segment
    /// matching P" are expressed through this method.
    pub fn find_segment<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Option<T> {
        self.structural_substitutions
            .iter()
            .find_map(|nested| nested.pipeline.find_segment(f))
            .or_else(|| {
                self.segments.iter().find_map(|seg| {
                    seg.substitutions
                        .iter()
                        .find_map(|nested| nested.pipeline.find_segment(f))
                        .or_else(|| f(seg))
                })
            })
    }

    /// Applies `f` to every [`ShellSegment`] in the tree and returns all
    /// non-`None` results.
    ///
    /// Segments are visited in the same order as
    /// [`find_segment`](Self::find_segment), but the walk never stops early.
    pub fn filter_segments<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>) -> Vec<T> {
        let mut collected = Vec::new();
        self.filter_segments_into(f, &mut collected);
        collected
    }

    /// Recursive worker for [`filter_segments`](Self::filter_segments):
    /// appends hits to `out` so the whole walk shares one vector.
    fn filter_segments_into<T>(&self, f: &impl Fn(&ShellSegment) -> Option<T>, out: &mut Vec<T>) {
        for nested in &self.structural_substitutions {
            nested.pipeline.filter_segments_into(f, out);
        }
        for seg in &self.segments {
            for nested in &seg.substitutions {
                nested.pipeline.filter_segments_into(f, out);
            }
            // `Option` iterates over zero or one item, so this pushes only hits.
            out.extend(f(seg));
        }
    }

    /// Reports whether this pipeline or any nested substitution pipeline
    /// has `has_parse_errors` set.
    ///
    /// When tree-sitter uses error recovery, some commands may not have
    /// been extracted. Callers enforcing a security boundary should
    /// treat a `true` return as "cannot safely analyze — fail closed."
    pub fn has_parse_errors_recursive(&self) -> bool {
        self.any_pipeline(&|pipeline| pipeline.has_parse_errors)
    }
}

/// A single evaluable command within a compound pipeline.
///
/// Segments are the unit visited by [`ParsedPipeline::find_segment`] and
/// [`ParsedPipeline::filter_segments`].
#[derive(Debug, Clone)]
pub struct ShellSegment {
    /// The command text, exactly as it appears in the source (trimmed).
    ///
    /// Substitution syntax (`$()`, backticks, `<()`, `>()`) is preserved
    /// verbatim — the [`substitutions`](Self::substitutions) field carries
    /// the recursively-parsed contents with byte positions into this text.
    pub command: String,

    /// Output redirection detected on a wrapping construct.
    ///
    /// When the parser extracts commands from inside a control-flow block
    /// that has output redirection (e.g. `for ... done > file`), the
    /// redirect is not present in the segment's `command` text. This field
    /// carries the redirection so the eval layer can escalate the decision.
    /// `None` means no such wrapping redirection was recorded.
    pub redirection: Option<Redirection>,

    /// Substitutions within this segment's command text, in source order.
    ///
    /// Each substitution is evaluated before this segment's command.
    /// `start`/`end` byte offsets index into [`command`](Self::command).
    /// Empty when the command contains no substitutions.
    pub substitutions: Vec<SubstitutionSpan>,
}

/// A command substitution's position and recursively-parsed contents.
///
/// Used in two places: [`ShellSegment::substitutions`] and
/// [`ParsedPipeline::structural_substitutions`]. The text that `start` and
/// `end` index into differs between the two (see `start`).
#[derive(Debug, Clone)]
pub struct SubstitutionSpan {
    /// Byte offset of the substitution start within the parent's text.
    ///
    /// For substitutions on a [`ShellSegment`], this indexes into
    /// `segment.command`. For structural substitutions on a
    /// [`ParsedPipeline`], this is relative to the source text passed
    /// to [`parse_with_substitutions`] at this recursion level (for
    /// nested pipelines, that is the inner text of the parent
    /// substitution, not the top-level command string).
    pub start: usize,
    /// Byte offset past the end of the substitution (exclusive), measured
    /// against the same text as `start`.
    pub end: usize,
    /// The recursively-parsed inner pipeline.
    pub pipeline: ParsedPipeline,
}

/// An output redirection that may mutate filesystem state.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Redirection {
    /// The redirection operator as written (e.g., `>`, `>>`, `>|`, `&>`, `&>>`, `<>`, `>&`).
    pub operator: &'static str,
    /// Explicit source file descriptor, when one was given (e.g., `2>` → `Some(2)`).
    pub fd: Option<u32>,
    /// Where the output goes: a file path, an fd number for `>&N`, or empty for `<>`.
    pub target: String,
}

impl fmt::Display for Redirection {
    /// Renders a human-readable description of the form
    /// `output redirection (<fd><operator> <target>)`; the fd is omitted
    /// when [`Redirection::fd`] is `None`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            operator, target, ..
        } = self;
        if let Some(fd) = self.fd {
            write!(f, "output redirection ({fd}{} {})", operator, target)
        } else {
            write!(f, "output redirection ({} {})", operator, target)
        }
    }
}

/// Tree-sitter failed to produce a syntax tree.
///
/// Extremely rare in practice — tree-sitter handles any input, including
/// malformed shell. The only known causes are memory allocation failure
/// or a cancelled parse.
///
/// This is distinct from syntax errors in the input: those still yield a
/// tree and are reported through [`ParsedPipeline::has_parse_errors`].
/// The type is a unit struct and carries no further detail.
#[derive(Debug, thiserror::Error)]
#[error("tree-sitter failed to produce a syntax tree")]
pub struct ParseError;

/// Classification of indirect execution patterns that may hide commands
/// from static analysis.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching, since further kinds may be added.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum IndirectExecution {
    /// `eval "..."` — argument string is executed as shell code.
    /// Cannot be statically analyzed in the general case.
    Eval,
    /// `bash -c "..."` / `sh -c "..."` — spawns a new shell with
    /// inline code. Cannot be statically analyzed.
    ShellSpawn,
    /// `env cmd` / `command cmd` / `sudo cmd` — transparent wrapper
    /// around another command. Strip the wrapper and re-analyze.
    /// Unlike the other kinds, the wrapped command remains analyzable.
    CommandWrapper,
    /// `source file` / `. file` — executes a script in the current
    /// shell. Contents cannot be statically analyzed.
    SourceScript,
}

/// Properties of a parsed command segment relevant to security analysis.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CommandCharacteristics {
    /// Base command name (path stripped, env vars skipped).
    pub base_command: String,
    /// If this is an indirect execution wrapper, what kind.
    /// `None` when the command was not classified as indirect execution.
    pub indirect_execution: Option<IndirectExecution>,
    /// Whether the command position contains a variable expansion
    /// (`$cmd`, `${cmd}`) that cannot be statically resolved.
    pub has_dynamic_command: bool,
}

/// A parsed flag from a command's argument list.
///
/// Carried by [`CommandArg::Flag`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedFlag {
    /// The flag name without its value (e.g., `--force`, `-f`).
    /// The leading dashes are part of the name.
    pub name: String,
    /// Value if specified with `=` (e.g., `--color=always` → `Some("always")`).
    /// `None` when the token had no `=value` part.
    pub value: Option<String>,
}

/// An argument in a parsed command line.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum CommandArg {
    /// A flag token (e.g., `--force`, `-f`, `--color=always`).
    Flag(ParsedFlag),
    /// A non-flag token (subcommand, path, or other argument), stored as
    /// its text.
    Positional(String),
}

/// Structurally decomposed command with arguments in source order.
///
/// Schema-free parse: flags are identified syntactically (tokens starting
/// with `-`). Without a command's flag definitions, `--flag value` is
/// ambiguous — the value appears as a separate positional. Schema-aware
/// consumers walk `args` to associate values with flags they know about.
///
/// For example, under this scheme `--color always` yields a flag named
/// `--color` with no value followed by the positional `always`, whereas
/// `--color=always` yields a single flag with value `always`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParsedCommand {
    /// Base command name (path stripped, env vars skipped).
    pub command: String,
    /// Arguments in source order — flags and positionals interleaved.
    /// The command name itself is not part of this list.
    pub args: Vec<CommandArg>,
}

impl ParsedCommand {
    /// First positional argument (often a subcommand).
    pub fn subcommand(&self) -> Option<&str> {
        self.args.iter().find_map(|a| match a {
            CommandArg::Positional(s) => Some(s.as_str()),
            _ => None,
        })
    }

    /// Iterate over all flags.
    pub fn flags(&self) -> impl Iterator<Item = &ParsedFlag> {
        self.args.iter().filter_map(|a| match a {
            CommandArg::Flag(f) => Some(f),
            _ => None,
        })
    }

    /// Iterate over all positional arguments.
    pub fn positional(&self) -> impl Iterator<Item = &str> {
        self.args.iter().filter_map(|a| match a {
            CommandArg::Positional(s) => Some(s.as_str()),
            _ => None,
        })
    }

    /// Check if a flag is present by name (e.g., `--force` or `-f`).
    pub fn has_flag(&self, name: &str) -> bool {
        self.flags().any(|f| f.name == name)
    }

    /// Reconstruct a flat word list.
    pub fn to_words(&self) -> Vec<String> {
        let mut words = vec![self.command.clone()];
        for arg in &self.args {
            match arg {
                CommandArg::Flag(f) => match &f.value {
                    Some(v) => words.push(format!("{}={}", f.name, v)),
                    None => words.push(f.name.clone()),
                },
                CommandArg::Positional(s) => words.push(s.clone()),
            }
        }
        words
    }
}

/// Result of resolving a command through the indirection layer.
///
/// Marked `#[non_exhaustive]`: code outside this crate must include a
/// wildcard arm when matching.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub enum ResolvedCommand {
    /// Wrappers stripped, command structurally parsed.
    Resolved(ParsedCommand),
    /// The command is unanalyzable — eval, source, shell -c, dynamic `$cmd`.
    /// The payload records which command triggered this and why.
    Unanalyzable(UnanalyzableCommand),
}

/// A command that cannot be statically analyzed.
///
/// Carried by [`ResolvedCommand::Unanalyzable`].
#[derive(Debug, Clone)]
pub struct UnanalyzableCommand {
    /// The command that triggered the classification (e.g., `eval`, `bash`).
    pub command: String,
    /// Why it's unanalyzable.
    pub kind: IndirectExecution,
}

/// Describes how to strip a transparent wrapper command to find the inner command.
///
/// Each wrapper has different flag semantics. This struct captures just enough
/// to correctly skip past the wrapper and its flags to the real command.
/// Designed for deserialization from config files — consumers load specs from
/// JSON/TOML/YAML and pass them to [`resolve_command_with`].
///
/// Only `name` is required when deserializing; every other field is marked
/// `#[serde(default)]` and falls back to its type's default (empty list,
/// `false`, or `0`) when omitted.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct WrapperSpec {
    /// Command name to match (basename, e.g., "sudo").
    pub name: String,
    /// Short flags that consume the next token as a value (e.g., `["-u", "-g"]`).
    #[serde(default)]
    pub short_value_flags: Vec<String>,
    /// Long flags that consume the next token as a value (e.g., `["--user", "--group"]`).
    #[serde(default)]
    pub long_value_flags: Vec<String>,
    /// Flags whose presence makes the entire invocation unanalyzable.
    /// Example: `env -S` executes its value as a command string (eval-equivalent).
    #[serde(default)]
    pub unanalyzable_flags: Vec<String>,
    /// Whether to skip leading `KEY=VALUE` tokens after the wrapper (env-style).
    #[serde(default)]
    pub skip_env_assignments: bool,
    /// Whether `--` terminates flag processing for this wrapper.
    #[serde(default)]
    pub has_terminator: bool,
    /// Number of leading positional arguments to skip before the inner command.
    ///
    /// Some wrappers require mandatory positional args before the command:
    /// `timeout DURATION cmd`, `chrt PRIORITY cmd`, `taskset MASK cmd`.
    /// Set this to the number of positionals to consume before treating
    /// the next non-flag token as the inner command.
    /// For the three examples above the value would be `1`.
    #[serde(default)]
    pub skip_positionals: usize,
}

/// Complete command classification configuration.
///
/// Drives all indirect execution detection — no command knowledge is hardcoded
/// in the parser source. Consumers load this from JSON/TOML/YAML and pass it
/// to [`resolve_command_with`].
///
/// None of the fields carry `#[serde(default)]`, so a deserialized config
/// must spell out all four lists (an empty list is allowed).
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
pub struct CommandConfig {
    /// Transparent wrappers that execute an inner command (env, sudo, etc.).
    pub wrappers: Vec<WrapperSpec>,
    /// Shells that can spawn inline code via `-c` (bash, sh, zsh, etc.).
    /// When invoked without `-c`, classified as script execution.
    pub shells: Vec<String>,
    /// Commands that execute their argument as shell code (eval).
    pub eval_commands: Vec<String>,
    /// Commands that execute a file in the current shell (source, `.`).
    pub source_commands: Vec<String>,
}

#[cfg(test)]
mod tests {
    use super::super::parse_with_substitutions;
    use super::{ParsedPipeline, ShellSegment, SubstitutionSpan};

    /// Parses `cmd`, panicking if no syntax tree could be produced.
    fn parse(cmd: &str) -> ParsedPipeline {
        parse_with_substitutions(cmd).expect("parse failed")
    }

    /// Hand-built pipeline with no segments, no substitutions and no errors.
    ///
    /// Building trees by hand lets the traversal tests below run without
    /// depending on how the parser reacts to malformed input.
    fn clean_pipeline() -> ParsedPipeline {
        ParsedPipeline {
            segments: vec![],
            operators: vec![],
            structural_substitutions: vec![],
            has_parse_errors: false,
        }
    }

    /// Wraps `inner` as the `$(bad)` substitution of `echo $(bad)`.
    fn span(inner: ParsedPipeline) -> SubstitutionSpan {
        SubstitutionSpan {
            start: 5,
            end: 11,
            pipeline: inner,
        }
    }

    /// Segment `echo $(bad)` whose single substitution holds `inner`.
    fn segment_with(inner: ParsedPipeline) -> ShellSegment {
        ShellSegment {
            command: "echo $(bad)".to_string(),
            redirection: None,
            substitutions: vec![span(inner)],
        }
    }

    // --- find_segment ---

    #[test]
    fn find_segment_returns_first_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("ls") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("ls -la"));
    }

    #[test]
    fn find_segment_returns_none_when_no_match() {
        let p = parse("echo hello && ls -la");
        let found = p.find_segment(&|seg| {
            if seg.command.starts_with("git") {
                Some(())
            } else {
                None
            }
        });
        assert!(found.is_none());
    }

    #[test]
    fn find_segment_recurses_into_substitutions() {
        let p = parse("echo $(git status)");
        let found = p.find_segment(&|seg| {
            if seg.command.contains("git status") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(found.as_deref(), Some("git status"));
    }

    #[test]
    fn find_segment_visits_substitutions_before_parent() {
        // In "echo $(date)", the walker should visit "date" before "echo $(date)".
        // filter_segments with Some for all collects in traversal order.
        let p = parse("echo $(date)");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        assert_eq!(all, vec!["date", "echo $(date)"]);
    }

    #[test]
    fn find_segment_visits_structural_substitutions_first() {
        let p = parse("for i in $(seq 10); do echo $i; done");
        let all: Vec<String> = p.filter_segments(&|seg| Some(seg.command.clone()));
        // `first()` instead of `all[0]`: an empty result fails the assertion
        // with a readable message rather than an index-out-of-bounds panic.
        assert_eq!(all.first().map(String::as_str), Some("seq 10"));
    }

    // --- filter_segments ---

    #[test]
    fn filter_segments_collects_all_matches() {
        let p = parse("echo a && echo b && ls c");
        let echoes: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("echo") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(echoes, vec!["echo a", "echo b"]);
    }

    #[test]
    fn filter_segments_collects_from_nested() {
        let p = parse("echo $(git status && git diff)");
        let gits: Vec<String> = p.filter_segments(&|seg| {
            if seg.command.starts_with("git") {
                Some(seg.command.clone())
            } else {
                None
            }
        });
        assert_eq!(gits, vec!["git status", "git diff"]);
    }

    // --- find_pipeline / any_pipeline ---

    #[test]
    fn find_pipeline_checks_self_before_nested() {
        let mut outer = clean_pipeline();
        outer
            .segments
            .push(segment_with(ParsedPipeline::empty_with_error()));
        // The outer pipeline is clean and is visited first, so its flag wins.
        let first = outer.find_pipeline(&|p| Some(p.has_parse_errors));
        assert_eq!(first, Some(false));
    }

    #[test]
    fn find_pipeline_reaches_nested_pipelines() {
        let mut outer = clean_pipeline();
        outer
            .segments
            .push(segment_with(ParsedPipeline::empty_with_error()));
        let hit = outer.find_pipeline(&|p| if p.has_parse_errors { Some(()) } else { None });
        assert!(hit.is_some());
    }

    #[test]
    fn any_pipeline_is_false_when_nothing_matches() {
        assert!(!clean_pipeline().any_pipeline(&|p| !p.segments.is_empty()));
    }

    // --- has_parse_errors_recursive ---

    #[test]
    fn no_errors_on_valid_input() {
        assert!(!parse("echo hello").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_compound() {
        assert!(!parse("echo a && echo b | cat").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_substitution() {
        assert!(!parse("echo $(date)").has_parse_errors_recursive());
    }

    #[test]
    fn no_errors_on_hand_built_clean_tree() {
        let mut outer = clean_pipeline();
        outer.segments.push(segment_with(clean_pipeline()));
        assert!(!outer.has_parse_errors_recursive());
    }

    #[test]
    fn empty_with_error_reports_errors() {
        let failed = ParsedPipeline::empty_with_error();
        assert!(failed.segments.is_empty());
        assert!(failed.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_segment_substitution_are_reported() {
        let mut outer = clean_pipeline();
        outer
            .segments
            .push(segment_with(ParsedPipeline::empty_with_error()));
        // Only the nested pipeline is flagged; the recursive check must still fire.
        assert!(!outer.has_parse_errors);
        assert!(outer.has_parse_errors_recursive());
    }

    #[test]
    fn errors_in_structural_substitution_are_reported() {
        let mut outer = clean_pipeline();
        outer
            .structural_substitutions
            .push(span(ParsedPipeline::empty_with_error()));
        assert!(!outer.has_parse_errors);
        assert!(outer.has_parse_errors_recursive());
    }
}