Skip to main content

agent_shell_parser/parse/
tokenize.rs

1use super::resolve::{classify_surface, default_command_config};
2use super::types::{CommandArg, CommandCharacteristics, ParsedCommand, ParsedFlag};
3
4/// Extract the base command name from a word list, skipping env assignments
5/// and stripping path prefixes.
6pub fn find_base_command(words: &[String]) -> String {
7    let cmd = words
8        .iter()
9        .find(|t| !is_env_assignment(t))
10        .map(String::as_str)
11        .unwrap_or("");
12
13    match cmd.rsplit_once('/') {
14        Some((_, name)) if !name.is_empty() => name.to_string(),
15        _ => cmd.to_string(),
16    }
17}
18
19/// Analyze a command segment for security-relevant properties.
20///
21/// Reports the surface-level command classification: what the outermost
22/// command is and whether it's an indirect execution pattern. This is
23/// O(1) in wrapper depth — it does not recurse.
24///
25/// For the fully-resolved inner command (after recursively stripping
26/// wrappers), use [`resolve_command`](super::resolve::resolve_command).
27pub fn command_characteristics(command: &str) -> CommandCharacteristics {
28    let tokens = shlex_or_whitespace(command);
29    let base = find_base_command(&tokens);
30    let has_dynamic_command = base.starts_with('$');
31    let indirect_execution = classify_surface(&base, &tokens, default_command_config());
32
33    CommandCharacteristics {
34        base_command: base,
35        indirect_execution,
36        has_dynamic_command,
37    }
38}
39
40/// Extract the first real command word, skipping leading `KEY=VALUE` assignments.
41///
42/// Uses shlex for correct handling of quoted values like `FOO="bar baz"`.
43/// Returns the basename of the command (e.g. `/usr/bin/ls` → `ls`).
44pub fn base_command(command: &str) -> String {
45    command_characteristics(command).base_command
46}
47
48/// Extract leading `KEY=VALUE` pairs from a command string.
49///
50/// Uses shlex for correct handling of quoted values like `FOO="bar baz"`.
51/// Stops at the first token that is not a valid assignment.
52pub fn env_vars(command: &str) -> Vec<(String, String)> {
53    let tokens = shlex_or_whitespace(command);
54    let mut result = Vec::new();
55    for token in &tokens {
56        if let Some(eq_pos) = token.find('=') {
57            let key = &token[..eq_pos];
58            if is_valid_env_key(key) {
59                let val = &token[eq_pos + 1..];
60                result.push((key.to_string(), val.to_string()));
61                continue;
62            }
63        }
64        break;
65    }
66    result
67}
68
69/// Tokenize a command segment into words using shlex (POSIX word splitting).
70///
71/// Falls back to whitespace splitting if shlex cannot parse the input
72/// (e.g. unmatched quotes). The fallback preserves quote characters in
73/// the resulting tokens.
74pub fn tokenize(command: &str) -> Vec<String> {
75    shlex_or_whitespace(command)
76}
77
78pub(crate) fn is_env_assignment(token: &str) -> bool {
79    match token.find('=') {
80        Some(eq_pos) => is_valid_env_key(&token[..eq_pos]),
81        None => false,
82    }
83}
84
/// Report whether `key` is a valid environment variable name: non-empty,
/// starting with an ASCII letter or `_`, and containing only ASCII letters,
/// ASCII digits and `_`.
pub(crate) fn is_valid_env_key(key: &str) -> bool {
    let mut chars = key.chars();
    match chars.next() {
        // A valid first character; the remainder may also contain digits.
        Some(first) if first == '_' || first.is_ascii_alphabetic() => {
            chars.all(|c| c == '_' || c.is_ascii_alphanumeric())
        }
        // Empty key, or a first character that cannot start a name.
        _ => false,
    }
}
93
94/// Parse a command string into structured components with arguments in source order.
95///
96/// This is a schema-free parse. Flags are identified syntactically
97/// (tokens starting with `-`). `--flag=value` splits into name and
98/// value; all other flags are treated as value-less. Without knowing
99/// a command's flag definitions, `--flag value` is ambiguous — the
100/// value appears as a separate positional argument.
101pub fn parse_command(command: &str) -> ParsedCommand {
102    let tokens = shlex_or_whitespace(command);
103
104    let cmd_idx = tokens.iter().position(|t| !is_env_assignment(t));
105    let Some(cmd_idx) = cmd_idx else {
106        return ParsedCommand {
107            command: String::new(),
108            args: vec![],
109        };
110    };
111
112    let cmd_token = &tokens[cmd_idx];
113    let base = match cmd_token.rsplit_once('/') {
114        Some((_, name)) if !name.is_empty() => name.to_string(),
115        _ => cmd_token.to_string(),
116    };
117
118    let mut args = Vec::new();
119    let mut past_double_dash = false;
120
121    for token in &tokens[cmd_idx + 1..] {
122        if past_double_dash {
123            args.push(CommandArg::Positional(token.clone()));
124            continue;
125        }
126        if token == "--" {
127            past_double_dash = true;
128            continue;
129        }
130        if let Some(rest) = token.strip_prefix("--") {
131            if let Some((name, value)) = rest.split_once('=') {
132                args.push(CommandArg::Flag(ParsedFlag {
133                    name: format!("--{name}"),
134                    value: Some(value.to_string()),
135                }));
136            } else {
137                args.push(CommandArg::Flag(ParsedFlag {
138                    name: token.clone(),
139                    value: None,
140                }));
141            }
142        } else if token.starts_with('-') && token.len() > 1 {
143            args.push(CommandArg::Flag(ParsedFlag {
144                name: token.clone(),
145                value: None,
146            }));
147        } else {
148            args.push(CommandArg::Positional(token.clone()));
149        }
150    }
151
152    ParsedCommand {
153        command: base,
154        args,
155    }
156}
157
158fn shlex_or_whitespace(command: &str) -> Vec<String> {
159    shlex::split(command).unwrap_or_else(|| command.split_whitespace().map(String::from).collect())
160}
161
#[cfg(test)]
mod tests {
    use super::*;

    /// Build an owned `(key, value)` pair for comparing against `env_vars`.
    fn pair(key: &str, value: &str) -> (String, String) {
        (key.to_string(), value.to_string())
    }

    // --- base_command ---

    #[test]
    fn base_command_simple() {
        let name = base_command("ls -la");
        assert_eq!(name, "ls");
    }

    #[test]
    fn base_command_with_env() {
        let name = base_command("GIT_CONFIG_GLOBAL=~/.gitconfig.ai git push");
        assert_eq!(name, "git");
    }

    #[test]
    fn base_command_absolute_path() {
        let name = base_command("/usr/bin/ls -la");
        assert_eq!(name, "ls");
    }

    #[test]
    fn base_command_relative_path() {
        let name = base_command("./script.sh --flag");
        assert_eq!(name, "script.sh");
    }

    #[test]
    fn base_command_deep_path() {
        let name = base_command("/home/user/dev/tool/target/release/tool --dump-config");
        assert_eq!(name, "tool");
    }

    #[test]
    fn base_command_env_with_path() {
        let name = base_command("FOO=bar /usr/local/bin/git status");
        assert_eq!(name, "git");
    }

    #[test]
    fn base_command_empty() {
        let name = base_command("");
        assert_eq!(name, "");
    }

    #[test]
    fn base_command_quoted_env_value() {
        let name = base_command("GIT_AUTHOR_NAME=\"Jane Doe\" git commit");
        assert_eq!(name, "git");
    }

    #[test]
    fn base_command_single_quoted_env_value() {
        let name = base_command("FOO='bar baz' git push");
        assert_eq!(name, "git");
    }

    #[test]
    fn base_command_multiple_quoted_env() {
        let name = base_command("A=\"x y\" B='1 2' git status");
        assert_eq!(name, "git");
    }

    // --- env_vars ---

    #[test]
    fn env_vars_single() {
        let vars = env_vars("FOO=bar cmd");
        assert_eq!(vars, vec![pair("FOO", "bar")]);
    }

    #[test]
    fn env_vars_multiple() {
        let vars = env_vars("A=1 B=2 cmd");
        assert_eq!(vars, vec![pair("A", "1"), pair("B", "2")]);
    }

    #[test]
    fn env_vars_none() {
        let vars = env_vars("cmd --flag");
        assert!(vars.is_empty());
    }

    #[test]
    fn env_vars_quoted_value() {
        let vars = env_vars("FOO=\"bar baz\" cmd");
        assert_eq!(vars, vec![pair("FOO", "bar baz")]);
    }

    #[test]
    fn env_vars_single_quoted_value() {
        let vars = env_vars("FOO='bar baz' cmd");
        assert_eq!(vars, vec![pair("FOO", "bar baz")]);
    }

    #[test]
    fn env_vars_value_with_equals() {
        // Only the first `=` separates key from value.
        let vars = env_vars("OPTS=\"--foo=bar\" cmd");
        assert_eq!(vars, vec![pair("OPTS", "--foo=bar")]);
    }

    // --- tokenize ---

    #[test]
    fn tokenize_simple() {
        let words = tokenize("ls -la /tmp");
        assert_eq!(words, vec!["ls", "-la", "/tmp"]);
    }

    #[test]
    fn tokenize_quoted() {
        let words = tokenize("echo 'hello world'");
        assert_eq!(words, vec!["echo", "hello world"]);
    }

    #[test]
    fn tokenize_double_quoted() {
        let words = tokenize(r#"echo "hello world""#);
        assert_eq!(words, vec!["echo", "hello world"]);
    }

    // --- parse_command ---

    #[test]
    fn parse_simple_command() {
        let parsed = parse_command("ls -la /tmp");
        assert_eq!(parsed.command, "ls");
        assert_eq!(parsed.subcommand(), Some("/tmp"));
        assert_eq!(parsed.flags().count(), 1);
        assert_eq!(
            parsed.flags().next().map(|flag| flag.name.as_str()),
            Some("-la")
        );
        assert_eq!(parsed.positional().collect::<Vec<_>>(), vec!["/tmp"]);
    }

    #[test]
    fn parse_git_push() {
        let parsed = parse_command("git push --force origin main");
        assert_eq!(parsed.command, "git");
        assert_eq!(parsed.subcommand(), Some("push"));
        assert!(parsed.has_flag("--force"));
        assert_eq!(
            parsed.positional().collect::<Vec<_>>(),
            vec!["push", "origin", "main"]
        );
    }

    #[test]
    fn parse_flag_with_equals() {
        let parsed = parse_command("cargo build --color=always");
        assert_eq!(parsed.command, "cargo");
        let found: Vec<_> = parsed.flags().collect();
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].name, "--color");
        assert_eq!(found[0].value.as_deref(), Some("always"));
    }

    #[test]
    fn parse_double_dash_separator() {
        let parsed = parse_command("git log -- file.rs");
        assert_eq!(parsed.command, "git");
        assert!(parsed.positional().any(|arg| arg == "file.rs"));
    }

    #[test]
    fn parse_with_env_vars() {
        let parsed = parse_command("FOO=bar git status");
        assert_eq!(parsed.command, "git");
        assert_eq!(parsed.subcommand(), Some("status"));
    }

    #[test]
    fn parse_path_command() {
        let parsed = parse_command("/usr/bin/git commit -m test");
        assert_eq!(parsed.command, "git");
        assert_eq!(parsed.subcommand(), Some("commit"));
    }

    #[test]
    fn parse_empty() {
        let parsed = parse_command("");
        assert_eq!(parsed.command, "");
        assert!(parsed.subcommand().is_none());
    }
}