Skip to main content

agentzero_tools/
shell_parse.rs

//! Quote-aware shell command tokenizer.
//!
//! Splits a command string into tokens respecting single quotes, double quotes,
//! and backslash escapes. This prevents false-positive security rejections when
//! metacharacters appear inside quoted arguments (e.g., `echo 'hello;world'`).
6
/// Quoting context for a character in a shell token.
///
/// Records which quoting region of the input a character was read from, so
/// that callers can tell a quoted metacharacter apart from a bare one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QuoteContext {
    /// The character appeared outside any quotes. Characters made literal by a
    /// backslash outside quotes are also reported with this context.
    Unquoted,
    /// The character appeared between single quotes (`'...'`).
    SingleQuoted,
    /// The character appeared between double quotes (`"..."`).
    DoubleQuoted,
}
14
15/// A character annotated with its quoting context.
16#[derive(Debug, Clone, Copy)]
17pub struct AnnotatedChar {
18    pub ch: char,
19    pub context: QuoteContext,
20}
21
/// A parsed shell token.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ShellToken {
    /// Token text with the surrounding quote characters and escaping
    /// backslashes removed.
    pub text: String,
    /// `true` when the token contained at least one single- or double-quoted
    /// section (possibly empty, as in `''`). Backslash escapes outside quotes
    /// do not set this flag.
    pub was_quoted: bool,
}
28
/// Scanner state used by the tokenizers: the quoting region the character
/// currently being read belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Outside any quotes; whitespace separates tokens here.
    Unquoted,
    /// Between single quotes; only a closing `'` leaves this state.
    SingleQuoted,
    /// Between double quotes; a closing `"` leaves this state.
    DoubleQuoted,
}
35
36/// Tokenize a shell command string into tokens, respecting quoting rules.
37///
38/// Returns `Err` if quotes are unbalanced.
39pub fn tokenize(input: &str) -> anyhow::Result<Vec<ShellToken>> {
40    let mut tokens = Vec::new();
41    let mut current = String::new();
42    let mut was_quoted = false;
43    let mut state = State::Unquoted;
44    let mut chars = input.chars().peekable();
45
46    while let Some(ch) = chars.next() {
47        match state {
48            State::Unquoted => match ch {
49                '\'' => {
50                    state = State::SingleQuoted;
51                    was_quoted = true;
52                }
53                '"' => {
54                    state = State::DoubleQuoted;
55                    was_quoted = true;
56                }
57                '\\' => {
58                    if let Some(next) = chars.next() {
59                        current.push(next);
60                    }
61                }
62                c if c.is_ascii_whitespace() => {
63                    if !current.is_empty() || was_quoted {
64                        tokens.push(ShellToken {
65                            text: std::mem::take(&mut current),
66                            was_quoted,
67                        });
68                        was_quoted = false;
69                    }
70                }
71                c => current.push(c),
72            },
73            State::SingleQuoted => match ch {
74                '\'' => state = State::Unquoted,
75                c => current.push(c),
76            },
77            State::DoubleQuoted => match ch {
78                '"' => state = State::Unquoted,
79                '\\' => {
80                    if let Some(&next) = chars.peek() {
81                        if matches!(next, '$' | '`' | '"' | '\\' | '\n') {
82                            chars.next();
83                            current.push(next);
84                        } else {
85                            current.push('\\');
86                        }
87                    } else {
88                        current.push('\\');
89                    }
90                }
91                c => current.push(c),
92            },
93        }
94    }
95
96    if state != State::Unquoted {
97        anyhow::bail!("unbalanced quotes in shell command");
98    }
99
100    if !current.is_empty() || was_quoted {
101        tokens.push(ShellToken {
102            text: current,
103            was_quoted,
104        });
105    }
106
107    Ok(tokens)
108}
109
110/// Tokenize and return annotated characters per token for policy evaluation.
111///
112/// Each token is a `Vec<AnnotatedChar>` preserving the quoting context of every
113/// character. This allows the context-aware policy to distinguish between
114/// `echo "hello;world"` (semicolon in double quotes) and `echo hello;world`
115/// (bare semicolon).
116pub fn tokenize_annotated(input: &str) -> anyhow::Result<Vec<Vec<AnnotatedChar>>> {
117    let mut tokens: Vec<Vec<AnnotatedChar>> = Vec::new();
118    let mut current: Vec<AnnotatedChar> = Vec::new();
119    let mut in_token = false;
120    let mut state = State::Unquoted;
121    let mut chars = input.chars().peekable();
122
123    while let Some(ch) = chars.next() {
124        match state {
125            State::Unquoted => match ch {
126                '\'' => {
127                    state = State::SingleQuoted;
128                    in_token = true;
129                }
130                '"' => {
131                    state = State::DoubleQuoted;
132                    in_token = true;
133                }
134                '\\' => {
135                    if let Some(next) = chars.next() {
136                        current.push(AnnotatedChar {
137                            ch: next,
138                            context: QuoteContext::Unquoted,
139                        });
140                        in_token = true;
141                    }
142                }
143                c if c.is_ascii_whitespace() => {
144                    if !current.is_empty() || in_token {
145                        tokens.push(std::mem::take(&mut current));
146                        in_token = false;
147                    }
148                }
149                c => {
150                    current.push(AnnotatedChar {
151                        ch: c,
152                        context: QuoteContext::Unquoted,
153                    });
154                    in_token = true;
155                }
156            },
157            State::SingleQuoted => match ch {
158                '\'' => state = State::Unquoted,
159                c => {
160                    current.push(AnnotatedChar {
161                        ch: c,
162                        context: QuoteContext::SingleQuoted,
163                    });
164                }
165            },
166            State::DoubleQuoted => match ch {
167                '"' => state = State::Unquoted,
168                '\\' => {
169                    if let Some(&next) = chars.peek() {
170                        if matches!(next, '$' | '`' | '"' | '\\' | '\n') {
171                            chars.next();
172                            current.push(AnnotatedChar {
173                                ch: next,
174                                context: QuoteContext::DoubleQuoted,
175                            });
176                        } else {
177                            current.push(AnnotatedChar {
178                                ch: '\\',
179                                context: QuoteContext::DoubleQuoted,
180                            });
181                        }
182                    } else {
183                        current.push(AnnotatedChar {
184                            ch: '\\',
185                            context: QuoteContext::DoubleQuoted,
186                        });
187                    }
188                }
189                c => {
190                    current.push(AnnotatedChar {
191                        ch: c,
192                        context: QuoteContext::DoubleQuoted,
193                    });
194                }
195            },
196        }
197    }
198
199    if state != State::Unquoted {
200        anyhow::bail!("unbalanced quotes in shell command");
201    }
202
203    if !current.is_empty() || in_token {
204        tokens.push(current);
205    }
206
207    Ok(tokens)
208}
209
#[cfg(test)]
mod tests {
    use super::*;

    /// Reduce a token list to the text of each token, in order.
    fn texts(tokens: &[ShellToken]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    #[test]
    fn tokenize_simple_command() {
        let parsed = tokenize("echo hello").unwrap();
        assert_eq!(texts(&parsed), ["echo", "hello"]);
        // Neither bare word involved any quoting.
        assert!(!parsed[0].was_quoted);
        assert!(!parsed[1].was_quoted);
    }

    #[test]
    fn tokenize_single_quoted() {
        let parsed = tokenize("echo 'hello world'").unwrap();
        assert_eq!(texts(&parsed), ["echo", "hello world"]);
        assert!(parsed[1].was_quoted);
    }

    #[test]
    fn tokenize_double_quoted() {
        let parsed = tokenize(r#"echo "hello world""#).unwrap();
        assert_eq!(texts(&parsed), ["echo", "hello world"]);
        assert!(parsed[1].was_quoted);
    }

    #[test]
    fn tokenize_semicolon_in_single_quotes() {
        let parsed = tokenize("echo 'hello;world'").unwrap();
        assert_eq!(texts(&parsed), ["echo", "hello;world"]);
    }

    #[test]
    fn tokenize_pipe_in_double_quotes() {
        let parsed = tokenize(r#"echo "a|b""#).unwrap();
        assert_eq!(texts(&parsed), ["echo", "a|b"]);
    }

    #[test]
    fn tokenize_backslash_escape() {
        // An escaped space joins the two words into a single token.
        let parsed = tokenize(r"echo hello\ world").unwrap();
        assert_eq!(texts(&parsed), ["echo", "hello world"]);
    }

    #[test]
    fn tokenize_unbalanced_single_quote_errors() {
        let outcome = tokenize("echo 'hello");
        assert!(outcome.is_err());
    }

    #[test]
    fn tokenize_unbalanced_double_quote_errors() {
        let outcome = tokenize(r#"echo "hello"#);
        assert!(outcome.is_err());
    }

    #[test]
    fn tokenize_empty_input() {
        let parsed = tokenize("").unwrap();
        assert!(parsed.is_empty());
    }

    #[test]
    fn tokenize_backslash_in_double_quotes() {
        // `\"` inside double quotes yields a literal double quote.
        let parsed = tokenize(r#"echo "a\"b""#).unwrap();
        assert_eq!(texts(&parsed), ["echo", r#"a"b"#]);
    }

    #[test]
    fn tokenize_adjacent_quotes() {
        // Adjacent quoted sections with no whitespace between them form one token.
        let parsed = tokenize(r#"echo 'a'"b""#).unwrap();
        assert_eq!(texts(&parsed), ["echo", "ab"]);
    }

    #[test]
    fn annotated_preserves_context() {
        let parsed = tokenize_annotated("echo 'a;b'").unwrap();
        assert_eq!(parsed.len(), 2);

        let (command, argument) = (&parsed[0], &parsed[1]);
        // "echo" is all Unquoted
        assert!(command
            .iter()
            .all(|annotated| annotated.context == QuoteContext::Unquoted));
        // "a;b" is all SingleQuoted
        assert!(argument
            .iter()
            .all(|annotated| annotated.context == QuoteContext::SingleQuoted));
        assert_eq!(argument[1].ch, ';');
    }

    #[test]
    fn annotated_mixed_context() {
        let parsed = tokenize_annotated(r#"echo hello";"world"#).unwrap();
        assert_eq!(parsed.len(), 2);

        // The second token is: hello (Unquoted) + ; (DoubleQuoted) + world (Unquoted)
        let argument = &parsed[1];
        let expectations = [
            (0, 'h', QuoteContext::Unquoted),
            (5, ';', QuoteContext::DoubleQuoted),
            (6, 'w', QuoteContext::Unquoted),
        ];
        for &(index, ch, context) in expectations.iter() {
            assert_eq!(argument[index].ch, ch);
            assert_eq!(argument[index].context, context);
        }
    }
}