jeb/
shell_tokenizer.rs

1//! POSIX Shell Argument Tokenizer
2//!
3//! This module implements a tokenizer for splitting shell command lines into
4//! arguments, handling quoting and escape sequences according to POSIX shell
5//! rules.
6//!
7//! This tokenizer produces correct results for valid inputs that only use
8//! single-quoted strings, double-quoted strings, and backslash escapes. If
9//! unsupported shell syntax is encountered (such as variable expansion, command
10//! substitution, globs, or other shell features), the tokenizer produces a
11//! best-effort result but populates the `errors` list in the result, indicating
12//! that the output should not be trusted.
13
14use core::fmt;
15
/// The kind of error encountered during shell tokenization.
///
/// Variants fall into three groups: genuinely malformed input (unclosed
/// quotes, trailing backslash), and shell syntax this tokenizer recognizes but
/// does not interpret (expansion, operators, globs, tilde). The operator/glob
/// variants correspond to the bytes listed in `UNQUOTED_WARN_BYTES`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorKind {
    /// An unclosed single quote was encountered.
    UnclosedSingleQuote,
    /// An unclosed double quote was encountered.
    UnclosedDoubleQuote,
    /// A trailing backslash was encountered at end of input.
    TrailingBackslash,
    /// Dollar sign for variable expansion (not interpreted).
    DollarSign,
    /// Backtick for command substitution (not interpreted).
    Backtick,
    /// Pipe for piping (not interpreted).
    Pipe,
    /// Ampersand for background/AND (not interpreted).
    Ampersand,
    /// Semicolon as command separator (not interpreted).
    Semicolon,
    /// Open parenthesis for subshell (not interpreted).
    OpenParen,
    /// Close parenthesis for subshell (not interpreted).
    CloseParen,
    /// Less-than for input redirection (not interpreted).
    LessThan,
    /// Greater-than for output redirection (not interpreted).
    GreaterThan,
    /// Hash for comment (not interpreted).
    Hash,
    /// Asterisk glob wildcard (not interpreted).
    Asterisk,
    /// Question mark glob wildcard (not interpreted).
    QuestionMark,
    /// Open bracket for glob bracket expression (not interpreted).
    OpenBracket,
    /// Tilde at word start for tilde expansion (not interpreted).
    Tilde,
}
54
55impl fmt::Display for ErrorKind {
56    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
57        match self {
58            Self::UnclosedSingleQuote => write!(f, "unclosed single quote"),
59            Self::UnclosedDoubleQuote => write!(f, "unclosed double quote"),
60            Self::TrailingBackslash => write!(f, "trailing backslash"),
61            Self::DollarSign => write!(f, "dollar sign (variable expansion not interpreted)"),
62            Self::Backtick => write!(f, "backtick (command substitution not interpreted)"),
63            Self::Pipe => write!(f, "pipe (piping not interpreted)"),
64            Self::Ampersand => write!(f, "ampersand (background/AND not interpreted)"),
65            Self::Semicolon => write!(f, "semicolon (command separator not interpreted)"),
66            Self::OpenParen => write!(f, "open parenthesis (subshell not interpreted)"),
67            Self::CloseParen => write!(f, "close parenthesis (subshell not interpreted)"),
68            Self::LessThan => write!(f, "less-than (input redirection not interpreted)"),
69            Self::GreaterThan => write!(f, "greater-than (output redirection not interpreted)"),
70            Self::Hash => write!(f, "hash (comment not interpreted)"),
71            Self::Asterisk => write!(f, "asterisk (glob wildcard not interpreted)"),
72            Self::QuestionMark => write!(f, "question mark (glob wildcard not interpreted)"),
73            Self::OpenBracket => {
74                write!(f, "open bracket (glob bracket expression not interpreted)")
75            }
76            Self::Tilde => write!(f, "tilde (tilde expansion not interpreted)"),
77        }
78    }
79}
80
/// An error encountered during shell tokenization.
///
/// Carries enough context (`kind`, offending `byte`, byte `position`) for a
/// caller to produce a precise diagnostic; `Display` renders all three.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Error {
    /// The kind of error.
    pub kind: ErrorKind,
    /// The byte that triggered the error.
    pub byte: u8,
    /// The byte position in the input where the error occurred.
    pub position: usize,
}
91
92impl fmt::Display for Error {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94        write!(
95            f,
96            "error at position {}: {} (byte 0x{:02x})",
97            self.position, self.kind, self.byte
98        )
99    }
100}
101
/// The result of tokenizing a shell command line.
///
/// If `errors` is non-empty, the `args` should not be trusted as they may be
/// incorrect due to unsupported shell syntax being encountered.
///
/// Arguments are raw byte vectors, not strings, because the tokenizer accepts
/// arbitrary (possibly non-UTF-8) input bytes.
#[derive(Debug, Clone)]
pub struct TokenizeResult {
    /// The parsed arguments. If `errors` is non-empty, these may be incorrect.
    pub args: Vec<Vec<u8>>,
    /// Errors encountered during parsing. If non-empty, the args may be
    /// incorrect.
    pub errors: Vec<Error>,
}
114
/// The internal state of the tokenizer.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Outside any quotes: whitespace separates words and backslash escapes
    /// the next byte.
    Normal,
    /// Inside `'...'`: every byte up to the closing quote is literal.
    SingleQuoted,
    /// Inside `"..."`: backslash escapes only `$`, `` ` ``, `"`, `\` and a
    /// newline (line continuation); `$` and `` ` `` are flagged as errors.
    DoubleQuoted,
}
122
/// Bytes that trigger errors when unquoted: expansion introducers (`$`, `` ` ``),
/// control operators (`|`, `&`, `;`, `(`, `)`), redirections (`<`, `>`), and
/// glob characters (`*`, `?`, `[`).
const UNQUOTED_WARN_BYTES: &[u8] = b"$`|&;()<>*?[";
125
126/// Tokenize a shell command line (as bytes) into arguments according to POSIX
127/// shell rules.
128///
129/// This function produces correct results for valid inputs that only use
130/// single-quoted strings, double-quoted strings, and backslash escapes. If
131/// unsupported shell syntax is encountered, the function produces a best-effort
132/// result but populates the `errors` list, indicating that the output should
133/// not be trusted.
134///
135/// # Arguments
136///
137/// * `input` - The shell command line as bytes.
138///
139/// # Returns
140///
141/// Returns a `TokenizeResult` with the parsed arguments and any errors
142/// encountered. If `errors` is non-empty, the `args` may be incorrect.
143#[expect(clippy::too_many_lines)]
144#[must_use]
145pub fn tokenize(input: &[u8]) -> TokenizeResult {
146    let mut state = State::Normal;
147    let mut at_word_start = true;
148    let mut current_token = Vec::<u8>::new();
149    let mut token_started = false;
150    let mut args = Vec::new();
151    let mut errors = Vec::new();
152
153    // Track the position where a quote started, for error messages
154    let mut quote_start_position: Option<usize> = None;
155
156    let mut i = 0;
157
158    while i < input.len() {
159        let b = input[i];
160        let position = i;
161
162        match state {
163            State::Normal => {
164                if b == b'\\' {
165                    if let Some(&next) = input.get(i + 1) {
166                        if next == b'\n' {
167                            // Line continuation - skip both bytes
168                            i += 1;
169                        } else {
170                            current_token.push(next);
171                            i += 1;
172                            at_word_start = false;
173                        }
174                    } else {
175                        errors.push(Error {
176                            kind: ErrorKind::TrailingBackslash,
177                            byte: b,
178                            position,
179                        });
180                    }
181                } else if b == b'\'' {
182                    state = State::SingleQuoted;
183                    at_word_start = false;
184                    token_started = true;
185                    quote_start_position = Some(position);
186                } else if b == b'"' {
187                    state = State::DoubleQuoted;
188                    at_word_start = false;
189                    token_started = true;
190                    quote_start_position = Some(position);
191                } else if b == b' ' || b == b'\t' {
192                    if token_started || !current_token.is_empty() {
193                        args.push(core::mem::take(&mut current_token));
194                        token_started = false;
195                    }
196                    at_word_start = true;
197                } else if b == b'~' && at_word_start {
198                    errors.push(Error {
199                        kind: ErrorKind::Tilde,
200                        byte: b,
201                        position,
202                    });
203                    current_token.push(b);
204                    at_word_start = false;
205                } else if b == b'#' && at_word_start {
206                    errors.push(Error {
207                        kind: ErrorKind::Hash,
208                        byte: b,
209                        position,
210                    });
211                    current_token.push(b);
212                    at_word_start = false;
213                } else if UNQUOTED_WARN_BYTES.contains(&b) {
214                    let kind = match b {
215                        b'$' => ErrorKind::DollarSign,
216                        b'`' => ErrorKind::Backtick,
217                        b'|' => ErrorKind::Pipe,
218                        b'&' => ErrorKind::Ampersand,
219                        b';' => ErrorKind::Semicolon,
220                        b'(' => ErrorKind::OpenParen,
221                        b')' => ErrorKind::CloseParen,
222                        b'<' => ErrorKind::LessThan,
223                        b'>' => ErrorKind::GreaterThan,
224                        b'*' => ErrorKind::Asterisk,
225                        b'?' => ErrorKind::QuestionMark,
226                        b'[' => ErrorKind::OpenBracket,
227                        _ => unreachable!(),
228                    };
229                    errors.push(Error {
230                        kind,
231                        byte: b,
232                        position,
233                    });
234                    current_token.push(b);
235                    at_word_start = false;
236                } else {
237                    current_token.push(b);
238                    at_word_start = false;
239                }
240            }
241            State::SingleQuoted => {
242                if b == b'\'' {
243                    state = State::Normal;
244                    at_word_start = false;
245                    quote_start_position = None;
246                } else {
247                    current_token.push(b);
248                }
249            }
250            State::DoubleQuoted => {
251                if b == b'\\' {
252                    if let Some(&next) = input.get(i + 1) {
253                        if matches!(next, b'$' | b'`' | b'"' | b'\\') {
254                            current_token.push(next);
255                            i += 1;
256                        } else if next == b'\n' {
257                            // Line continuation - skip both bytes
258                            i += 1;
259                        } else {
260                            current_token.push(b'\\');
261                        }
262                    } else {
263                        current_token.push(b'\\');
264                    }
265                } else if b == b'"' {
266                    state = State::Normal;
267                    at_word_start = false;
268                    quote_start_position = None;
269                } else if b == b'`' {
270                    errors.push(Error {
271                        kind: ErrorKind::Backtick,
272                        byte: b,
273                        position,
274                    });
275                    current_token.push(b);
276                } else if b == b'$' {
277                    errors.push(Error {
278                        kind: ErrorKind::DollarSign,
279                        byte: b,
280                        position,
281                    });
282                    current_token.push(b);
283                } else {
284                    current_token.push(b);
285                }
286            }
287        }
288
289        i += 1;
290    }
291
292    if token_started || !current_token.is_empty() {
293        args.push(current_token);
294    }
295
296    match state {
297        State::Normal => {}
298        State::SingleQuoted => {
299            errors.push(Error {
300                kind: ErrorKind::UnclosedSingleQuote,
301                byte: b'\'',
302                position: quote_start_position.unwrap_or(0),
303            });
304        }
305        State::DoubleQuoted => {
306            errors.push(Error {
307                kind: ErrorKind::UnclosedDoubleQuote,
308                byte: b'"',
309                position: quote_start_position.unwrap_or(0),
310            });
311        }
312    }
313
314    TokenizeResult { args, errors }
315}
316
/// Tokenize a shell command line string into arguments according to POSIX shell
/// rules.
///
/// This is a convenience wrapper around [`tokenize`] that works with `&str`
/// input and produces `String` output. The tokenizer only assigns special
/// meaning to ASCII bytes (quotes, backslash, whitespace, and the warned
/// metacharacters); the bytes of a multi-byte UTF-8 sequence are never ASCII,
/// so splitting and unescaping preserve UTF-8 validity.
///
/// # Arguments
///
/// * `input` - The shell command line as a string.
///
/// # Returns
///
/// Returns a tuple of (args, errors) where args are the parsed arguments as
/// strings and errors are any errors encountered. If errors is non-empty, the
/// args may be incorrect.
///
/// # Panics
///
/// Panics only if the tokenizer breaks its invariant of preserving UTF-8
/// validity — a bug in this module, not a reachable input condition.
///
/// # Examples
///
/// ```
/// use jeb::shell_tokenizer::tokenize_str;
///
/// let (args, errors) = tokenize_str("hello world");
/// assert_eq!(args, vec!["hello", "world"]);
/// assert!(errors.is_empty());
///
/// let (args, errors) = tokenize_str("'hello world'");
/// assert_eq!(args, vec!["hello world"]);
///
/// let (args, errors) = tokenize_str("hello\\ world");
/// assert_eq!(args, vec!["hello world"]);
/// ```
#[must_use]
pub fn tokenize_str(input: &str) -> (Vec<String>, Vec<Error>) {
    let result = tokenize(input.as_bytes());
    let args = result
        .args
        .into_iter()
        .map(|bytes| String::from_utf8(bytes).expect("tokenizer should preserve UTF-8 validity"))
        .collect();
    (args, result.errors)
}
359
#[cfg(test)]
mod tests {
    // Unit tests exercising both the byte-level API (`tokenize`) and the
    // string convenience wrapper (`tokenize_str`).
    use super::*;

    // Helper to assert args without errors using tokenize_str
    fn assert_args_str(input: &str, expected: &[&str]) {
        let (args, errors) = tokenize_str(input);
        assert_eq!(
            args,
            expected
                .iter()
                .map(|s| (*s).to_string())
                .collect::<Vec<_>>()
        );
        assert!(errors.is_empty(), "expected no errors, got: {errors:?}");
    }

    // Helper to assert args without errors using tokenize (bytes)
    fn assert_args(input: &[u8], expected: &[&[u8]]) {
        let result = tokenize(input);
        assert_eq!(
            result.args,
            expected.iter().map(|s| s.to_vec()).collect::<Vec<_>>()
        );
        assert!(
            result.errors.is_empty(),
            "expected no errors, got: {:?}",
            result.errors
        );
    }

    // Helper to assert args with errors using tokenize_str. `error_bytes` is
    // the expected sequence of offending bytes, in order of occurrence.
    fn assert_args_with_errors_str(input: &str, expected: &[&str], error_bytes: &[u8]) {
        let (args, errors) = tokenize_str(input);
        assert_eq!(
            args,
            expected
                .iter()
                .map(|s| (*s).to_string())
                .collect::<Vec<_>>()
        );
        let actual_error_bytes: Vec<u8> = errors.iter().map(|e| e.byte).collect();
        assert_eq!(actual_error_bytes, error_bytes);
    }

    // Helper to assert args with errors using tokenize (bytes)
    fn assert_args_with_errors(input: &[u8], expected: &[&[u8]], error_bytes: &[u8]) {
        let result = tokenize(input);
        assert_eq!(
            result.args,
            expected.iter().map(|s| s.to_vec()).collect::<Vec<_>>()
        );
        let actual_error_bytes: Vec<u8> = result.errors.iter().map(|e| e.byte).collect();
        assert_eq!(actual_error_bytes, error_bytes);
    }

    // Helper to assert specific error kind
    fn assert_has_error(input: &str, expected_kind: ErrorKind) {
        let (_, errors) = tokenize_str(input);
        assert!(
            errors.iter().any(|e| e.kind == expected_kind),
            "expected error {expected_kind:?}, got: {errors:?}"
        );
    }

    // MARK: Basic Tokenization (using tokenize_str)

    #[test]
    fn test_basic_tokenization() {
        assert_args_str("hello world", &["hello", "world"]);
        assert_args_str("hello   world", &["hello", "world"]);
        assert_args_str("  hello world  ", &["hello", "world"]);
        assert_args_str("hello", &["hello"]);
        assert_args_str("", &[]);
        assert_args_str("   ", &[]);
    }

    #[test]
    fn test_tabs() {
        assert_args_str("hello\tworld", &["hello", "world"]);
        assert_args_str("hello \t world", &["hello", "world"]);
    }

    // MARK: Basic Tokenization (using tokenize with bytes)

    #[test]
    fn test_basic_tokenization_bytes() {
        assert_args(b"hello world", &[b"hello", b"world"]);
        assert_args(b"hello   world", &[b"hello", b"world"]);
        assert_args(b"  hello world  ", &[b"hello", b"world"]);
        assert_args(b"hello", &[b"hello"]);
        assert_args(b"", &[]);
        assert_args(b"   ", &[]);
    }

    // MARK: Single Quotes

    #[test]
    fn test_single_quotes() {
        assert_args_str("'hello world'", &["hello world"]);
        assert_args_str("'$HOME'", &["$HOME"]);
        assert_args_str("'\\n'", &["\\n"]);
        // The idiomatic way to embed a single quote: close, escape, reopen.
        assert_args_str("'it'\\''s'", &["it's"]);
    }

    #[test]
    fn test_unclosed_single_quote() {
        assert_has_error("'hello", ErrorKind::UnclosedSingleQuote);
    }

    // MARK: Double Quotes

    #[test]
    fn test_double_quotes() {
        assert_args_str("\"hello world\"", &["hello world"]);
        assert_args_str("\"say \\\"hi\\\"\"", &["say \"hi\""]);
        assert_args_str("\"back\\\\slash\"", &["back\\slash"]);
        assert_args_str("\"\\$HOME\"", &["$HOME"]);
        // Backslash before a non-special char stays literal in double quotes.
        assert_args_str("\"\\n\"", &["\\n"]);
        assert_args_str("\"\\z\"", &["\\z"]);
    }

    #[test]
    fn test_dollar_in_double_quotes_warns() {
        assert_args_with_errors_str("\"$HOME\"", &["$HOME"], b"$");
    }

    #[test]
    fn test_backtick_in_double_quotes_warns() {
        assert_args_with_errors_str("\"`cmd`\"", &["`cmd`"], b"``");
    }

    #[test]
    fn test_unclosed_double_quote() {
        assert_has_error("\"hello", ErrorKind::UnclosedDoubleQuote);
    }

    // MARK: Unquoted Escapes

    #[test]
    fn test_unquoted_escapes() {
        assert_args_str("hello\\ world", &["hello world"]);
        assert_args_str("\\$HOME", &["$HOME"]);
        assert_args_str("\\\\", &["\\"]);
        assert_args_str("\\*", &["*"]);
    }

    #[test]
    fn test_trailing_backslash_error() {
        assert_has_error("hello\\", ErrorKind::TrailingBackslash);
    }

    #[test]
    fn test_line_continuation() {
        assert_args_str("hello\\\nworld", &["helloworld"]);
        assert_args_str("hello \\\n world", &["hello", "world"]);
    }

    #[test]
    fn test_line_continuation_in_double_quotes() {
        assert_args_str("\"hello\\\nworld\"", &["helloworld"]);
    }

    // MARK: Errors

    #[test]
    fn test_unquoted_dollar_warns() {
        assert_args_with_errors_str("$HOME", &["$HOME"], b"$");
    }

    #[test]
    fn test_unquoted_glob_warns() {
        assert_args_with_errors_str("*.txt", &["*.txt"], b"*");
        assert_args_with_errors_str("file?", &["file?"], b"?");
        assert_args_with_errors_str("file[0]", &["file[0]"], b"[");
    }

    #[test]
    fn test_tilde_at_word_start_warns() {
        assert_args_with_errors_str("~user", &["~user"], b"~");
    }

    #[test]
    fn test_tilde_mid_word_no_warn() {
        assert_args_str("a~b", &["a~b"]);
    }

    #[test]
    fn test_hash_at_word_start_warns() {
        assert_args_with_errors_str("#comment", &["#comment"], b"#");
        assert_args_with_errors_str("echo #test", &["echo", "#test"], b"#");
    }

    #[test]
    fn test_hash_mid_word_no_warn() {
        assert_args_str("foo#bar", &["foo#bar"]);
        assert_args_str("C#", &["C#"]);
    }

    #[test]
    fn test_pipe_and_semicolon_warn() {
        // Operators are warned about but kept literally within the word.
        assert_args_with_errors_str("echo hello|cat", &["echo", "hello|cat"], b"|");
        assert_args_with_errors_str("echo; ls", &["echo;", "ls"], b";");
    }

    #[test]
    fn test_redirections_warn() {
        assert_args_with_errors_str("echo > file", &["echo", ">", "file"], b">");
        assert_args_with_errors_str("cat < file", &["cat", "<", "file"], b"<");
    }

    // MARK: Token Concatenation

    #[test]
    fn test_concatenation() {
        assert_args_str("a'b'c", &["abc"]);
        assert_args_str("a\"b\"c", &["abc"]);
        assert_args_str("'a'\"b\"c", &["abc"]);
        assert_args_str("x=\"foo\"", &["x=foo"]);
    }

    // MARK: Empty Quotes

    #[test]
    fn test_empty_quotes() {
        assert_args_str("''", &[""]);
        assert_args_str("\"\"", &[""]);
        assert_args_str("'' ''", &["", ""]);
    }

    #[test]
    fn test_adjacent_empty_quotes() {
        assert_args_str("a''b", &["ab"]);
        assert_args_str("a\"\"b", &["ab"]);
        assert_args_str("''\"\"", &[""]);
    }

    // MARK: Newlines in Quotes

    #[test]
    fn test_newlines_in_single_quotes() {
        assert_args_str("'hello\nworld'", &["hello\nworld"]);
    }

    #[test]
    fn test_newlines_in_double_quotes() {
        assert_args_str("\"hello\nworld\"", &["hello\nworld"]);
    }

    // MARK: Escaped Quote Characters

    #[test]
    fn test_escaped_single_quote() {
        assert_args_str("\\'", &["'"]);
    }

    #[test]
    fn test_escaped_double_quote() {
        assert_args_str("\\\"", &["\""]);
    }

    #[test]
    fn test_escaped_quote_in_double_quotes() {
        assert_args_str("\"he said \\\"hi\\\"\"", &["he said \"hi\""]);
    }

    // MARK: Complex Cases

    #[test]
    fn test_complex_concatenation() {
        assert_args_str("a\"b\"c'd'e", &["abcde"]);
    }

    #[test]
    fn test_multiple_warnings() {
        let (args, errors) = tokenize_str("$HOME/*.txt");
        assert_eq!(args, vec!["$HOME/*.txt"]);
        assert_eq!(errors.len(), 2);
        assert_eq!(errors[0].byte, b'$');
        assert_eq!(errors[1].byte, b'*');
    }

    #[test]
    fn test_backslash_in_double_quotes_before_regular_char() {
        assert_args_str("\"\\a\"", &["\\a"]);
        assert_args_str("\"\\x\"", &["\\x"]);
    }

    // MARK: Bytes API tests

    #[test]
    fn test_bytes_with_errors() {
        assert_args_with_errors(b"$HOME", &[b"$HOME"], b"$");
    }

    #[test]
    fn test_bytes_unclosed_quote() {
        let result = tokenize(b"'hello");
        assert!(
            result
                .errors
                .iter()
                .any(|e| e.kind == ErrorKind::UnclosedSingleQuote)
        );
    }
}
665}