Skip to main content

agent_shell_parser/parse/
tokenize.rs

1use super::resolve::{classify_surface, default_command_config};
2use super::types::{CommandCharacteristics, ParsedCommand, Word};
3
4/// Extract the base command name from a word list, skipping env assignments
5/// and stripping path prefixes.
6pub fn find_base_command(words: &[Word]) -> String {
7    let cmd = words
8        .iter()
9        .find(|t| !is_env_assignment(t))
10        .map(|w| w.as_str())
11        .unwrap_or("");
12
13    match cmd.rsplit_once('/') {
14        Some((_, name)) if !name.is_empty() => name.to_string(),
15        _ => cmd.to_string(),
16    }
17}
18
19/// Analyze a command segment for security-relevant properties.
20///
21/// Reports the surface-level command classification: what the outermost
22/// command is and whether it's an indirect execution pattern. This is
23/// O(1) in wrapper depth — it does not recurse.
24///
25/// For the fully-resolved inner command (after recursively stripping
26/// wrappers), use [`resolve_command`](super::resolve::resolve_command).
27pub fn command_characteristics(command: &str) -> CommandCharacteristics {
28    let tokens = shlex_or_whitespace_words(command);
29    let base = find_base_command(&tokens);
30    let has_dynamic_command = base.starts_with('$');
31    let indirect_execution = classify_surface(&base, &tokens, default_command_config());
32
33    CommandCharacteristics {
34        base_command: base,
35        indirect_execution,
36        has_dynamic_command,
37    }
38}
39
40/// Extract the first real command word, skipping leading `KEY=VALUE` assignments.
41///
42/// Uses shlex for correct handling of quoted values like `FOO="bar baz"`.
43/// Returns the basename of the command (e.g. `/usr/bin/ls` → `ls`).
44pub fn base_command(command: &str) -> String {
45    command_characteristics(command).base_command
46}
47
48/// Extract leading `KEY=VALUE` pairs from a command string.
49///
50/// Uses shlex for correct handling of quoted values like `FOO="bar baz"`.
51/// Stops at the first token that is not a valid assignment.
52pub fn env_vars(command: &str) -> Vec<(String, String)> {
53    let tokens = shlex_or_whitespace_words(command);
54    let mut result = Vec::new();
55    for token in &tokens {
56        if let Some(eq_pos) = token.find('=') {
57            let key = &token[..eq_pos];
58            if is_valid_env_key(key) {
59                let val = &token[eq_pos + 1..];
60                result.push((key.to_string(), val.to_string()));
61                continue;
62            }
63        }
64        break;
65    }
66    result
67}
68
69/// Tokenize a command segment into words using shlex (POSIX word splitting).
70///
71/// Falls back to whitespace splitting if shlex cannot parse the input
72/// (e.g. unmatched quotes). The fallback preserves quote characters in
73/// the resulting tokens.
74pub fn tokenize(command: &str) -> Vec<Word> {
75    shlex_or_whitespace_words(command)
76}
77
78pub(crate) fn is_env_assignment(token: &str) -> bool {
79    match token.find('=') {
80        Some(eq_pos) => is_valid_env_key(&token[..eq_pos]),
81        None => false,
82    }
83}
84
/// Return `true` if `key` is a valid environment variable name.
///
/// A valid name is non-empty, starts with an ASCII letter or `_`, and
/// continues with ASCII letters, ASCII digits, or `_` only.
pub(crate) fn is_valid_env_key(key: &str) -> bool {
    let mut chars = key.chars();
    match chars.next() {
        // The first character may not be a digit; the rest may be.
        Some(first) if first.is_ascii_alphabetic() || first == '_' => {
            chars.all(|c| c.is_ascii_alphanumeric() || c == '_')
        }
        // Empty key, or an invalid leading character.
        _ => false,
    }
}
93
94/// Parse a command string into structured components with arguments in source order.
95///
96/// This is a schema-free parse. Flags are identified syntactically
97/// (tokens starting with `-`). `--flag=value` splits into name and
98/// value; all other flags are treated as value-less. Without knowing
99/// a command's flag definitions, `--flag value` is ambiguous — the
100/// value appears as a separate positional argument.
101pub fn parse_command(command: &str) -> ParsedCommand {
102    let tokens = shlex_or_whitespace_words(command);
103    ParsedCommand::from_words(&tokens)
104}
105
106pub(crate) fn shlex_or_whitespace_words(command: &str) -> Vec<Word> {
107    shlex::split(command)
108        .unwrap_or_else(|| command.split_whitespace().map(String::from).collect())
109        .into_iter()
110        .map(Word::from)
111        .collect()
112}
113
// Unit tests live in a sibling file and are compiled only for test builds.
#[cfg(test)]
#[path = "tokenize_tests.rs"]
mod tokenize_tests;