1use core::fmt;
15
/// The kind of problem, or uninterpreted shell syntax, found while tokenizing.
///
/// The first three variants describe malformed input. The remaining variants flag
/// bytes that a shell would treat specially but that the tokenizer keeps as
/// literal text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ErrorKind {
    /// A `'` was opened but never closed.
    UnclosedSingleQuote,
    /// A `"` was opened but never closed.
    UnclosedDoubleQuote,
    /// The input ended with an unquoted `\` that has nothing to escape.
    TrailingBackslash,
    /// A `$` outside single quotes; variable expansion is not performed.
    DollarSign,
    /// A `` ` `` outside single quotes; command substitution is not performed.
    Backtick,
    /// An unquoted `|`; piping is not performed.
    Pipe,
    /// An unquoted `&`; background execution / AND lists are not performed.
    Ampersand,
    /// An unquoted `;`; commands are not separated.
    Semicolon,
    /// An unquoted `(`; subshells are not created.
    OpenParen,
    /// An unquoted `)`; subshells are not created.
    CloseParen,
    /// An unquoted `<`; input redirection is not performed.
    LessThan,
    /// An unquoted `>`; output redirection is not performed.
    GreaterThan,
    /// An unquoted `#` at the start of a word; it does not start a comment.
    Hash,
    /// An unquoted `*`; glob expansion is not performed.
    Asterisk,
    /// An unquoted `?`; glob expansion is not performed.
    QuestionMark,
    /// An unquoted `[`; glob bracket expressions are not expanded.
    OpenBracket,
    /// An unquoted `~` at the start of a word; tilde expansion is not performed.
    Tilde,
}

impl fmt::Display for ErrorKind {
    /// Writes a short, fixed, human-readable description of the kind.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Every description is a plain literal, so pick it first and emit it once.
        let description = match *self {
            Self::UnclosedSingleQuote => "unclosed single quote",
            Self::UnclosedDoubleQuote => "unclosed double quote",
            Self::TrailingBackslash => "trailing backslash",
            Self::DollarSign => "dollar sign (variable expansion not interpreted)",
            Self::Backtick => "backtick (command substitution not interpreted)",
            Self::Pipe => "pipe (piping not interpreted)",
            Self::Ampersand => "ampersand (background/AND not interpreted)",
            Self::Semicolon => "semicolon (command separator not interpreted)",
            Self::OpenParen => "open parenthesis (subshell not interpreted)",
            Self::CloseParen => "close parenthesis (subshell not interpreted)",
            Self::LessThan => "less-than (input redirection not interpreted)",
            Self::GreaterThan => "greater-than (output redirection not interpreted)",
            Self::Hash => "hash (comment not interpreted)",
            Self::Asterisk => "asterisk (glob wildcard not interpreted)",
            Self::QuestionMark => "question mark (glob wildcard not interpreted)",
            Self::OpenBracket => "open bracket (glob bracket expression not interpreted)",
            Self::Tilde => "tilde (tilde expansion not interpreted)",
        };
        f.write_str(description)
    }
}
80
81#[derive(Debug, Clone, PartialEq, Eq)]
83pub struct Error {
84 pub kind: ErrorKind,
86 pub byte: u8,
88 pub position: usize,
90}
91
92impl fmt::Display for Error {
93 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94 write!(
95 f,
96 "error at position {}: {} (byte 0x{:02x})",
97 self.position, self.kind, self.byte
98 )
99 }
100}
101
/// Everything [`tokenize`] produced for one input.
///
/// Tokenizing never aborts: `args` is populated even when `errors` is
/// non-empty, so callers decide how strictly to treat the diagnostics.
#[derive(Debug, Clone)]
pub struct TokenizeResult {
    /// The arguments, in input order, as raw bytes with quoting and escaping removed.
    pub args: Vec<Vec<u8>>,
    /// Diagnostics in the order they were detected; an unclosed-quote error,
    /// if any, is always last.
    pub errors: Vec<Error>,
}
114
/// Quoting context the tokenizer is in when it looks at a byte.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum State {
    /// Outside any quotes: whitespace splits tokens and `\` escapes the next byte.
    Normal,
    /// Inside `'...'`: every byte up to the closing `'` is taken literally.
    SingleQuoted,
    /// Inside `"..."`: `\` only escapes `$`, `` ` ``, `"`, `\` and newline.
    DoubleQuoted,
}
122
/// Bytes that are reported whenever they appear unquoted and unescaped,
/// wherever they occur in a word.
///
/// `~` and `#` are not listed here: the tokenizer checks them separately and
/// only reports them at the start of a word.
const UNQUOTED_WARN_BYTES: &[u8] = b"$`|&;()<>*?[";
125
126#[expect(clippy::too_many_lines)]
144#[must_use]
145pub fn tokenize(input: &[u8]) -> TokenizeResult {
146 let mut state = State::Normal;
147 let mut at_word_start = true;
148 let mut current_token = Vec::<u8>::new();
149 let mut token_started = false;
150 let mut args = Vec::new();
151 let mut errors = Vec::new();
152
153 let mut quote_start_position: Option<usize> = None;
155
156 let mut i = 0;
157
158 while i < input.len() {
159 let b = input[i];
160 let position = i;
161
162 match state {
163 State::Normal => {
164 if b == b'\\' {
165 if let Some(&next) = input.get(i + 1) {
166 if next == b'\n' {
167 i += 1;
169 } else {
170 current_token.push(next);
171 i += 1;
172 at_word_start = false;
173 }
174 } else {
175 errors.push(Error {
176 kind: ErrorKind::TrailingBackslash,
177 byte: b,
178 position,
179 });
180 }
181 } else if b == b'\'' {
182 state = State::SingleQuoted;
183 at_word_start = false;
184 token_started = true;
185 quote_start_position = Some(position);
186 } else if b == b'"' {
187 state = State::DoubleQuoted;
188 at_word_start = false;
189 token_started = true;
190 quote_start_position = Some(position);
191 } else if b == b' ' || b == b'\t' {
192 if token_started || !current_token.is_empty() {
193 args.push(core::mem::take(&mut current_token));
194 token_started = false;
195 }
196 at_word_start = true;
197 } else if b == b'~' && at_word_start {
198 errors.push(Error {
199 kind: ErrorKind::Tilde,
200 byte: b,
201 position,
202 });
203 current_token.push(b);
204 at_word_start = false;
205 } else if b == b'#' && at_word_start {
206 errors.push(Error {
207 kind: ErrorKind::Hash,
208 byte: b,
209 position,
210 });
211 current_token.push(b);
212 at_word_start = false;
213 } else if UNQUOTED_WARN_BYTES.contains(&b) {
214 let kind = match b {
215 b'$' => ErrorKind::DollarSign,
216 b'`' => ErrorKind::Backtick,
217 b'|' => ErrorKind::Pipe,
218 b'&' => ErrorKind::Ampersand,
219 b';' => ErrorKind::Semicolon,
220 b'(' => ErrorKind::OpenParen,
221 b')' => ErrorKind::CloseParen,
222 b'<' => ErrorKind::LessThan,
223 b'>' => ErrorKind::GreaterThan,
224 b'*' => ErrorKind::Asterisk,
225 b'?' => ErrorKind::QuestionMark,
226 b'[' => ErrorKind::OpenBracket,
227 _ => unreachable!(),
228 };
229 errors.push(Error {
230 kind,
231 byte: b,
232 position,
233 });
234 current_token.push(b);
235 at_word_start = false;
236 } else {
237 current_token.push(b);
238 at_word_start = false;
239 }
240 }
241 State::SingleQuoted => {
242 if b == b'\'' {
243 state = State::Normal;
244 at_word_start = false;
245 quote_start_position = None;
246 } else {
247 current_token.push(b);
248 }
249 }
250 State::DoubleQuoted => {
251 if b == b'\\' {
252 if let Some(&next) = input.get(i + 1) {
253 if matches!(next, b'$' | b'`' | b'"' | b'\\') {
254 current_token.push(next);
255 i += 1;
256 } else if next == b'\n' {
257 i += 1;
259 } else {
260 current_token.push(b'\\');
261 }
262 } else {
263 current_token.push(b'\\');
264 }
265 } else if b == b'"' {
266 state = State::Normal;
267 at_word_start = false;
268 quote_start_position = None;
269 } else if b == b'`' {
270 errors.push(Error {
271 kind: ErrorKind::Backtick,
272 byte: b,
273 position,
274 });
275 current_token.push(b);
276 } else if b == b'$' {
277 errors.push(Error {
278 kind: ErrorKind::DollarSign,
279 byte: b,
280 position,
281 });
282 current_token.push(b);
283 } else {
284 current_token.push(b);
285 }
286 }
287 }
288
289 i += 1;
290 }
291
292 if token_started || !current_token.is_empty() {
293 args.push(current_token);
294 }
295
296 match state {
297 State::Normal => {}
298 State::SingleQuoted => {
299 errors.push(Error {
300 kind: ErrorKind::UnclosedSingleQuote,
301 byte: b'\'',
302 position: quote_start_position.unwrap_or(0),
303 });
304 }
305 State::DoubleQuoted => {
306 errors.push(Error {
307 kind: ErrorKind::UnclosedDoubleQuote,
308 byte: b'"',
309 position: quote_start_position.unwrap_or(0),
310 });
311 }
312 }
313
314 TokenizeResult { args, errors }
315}
316
317#[must_use]
350pub fn tokenize_str(input: &str) -> (Vec<String>, Vec<Error>) {
351 let result = tokenize(input.as_bytes());
352 let args = result
353 .args
354 .into_iter()
355 .map(|bytes| String::from_utf8(bytes).expect("tokenizer should preserve UTF-8 validity"))
356 .collect();
357 (args, result.errors)
358}
359
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `input` tokenizes to `expected` with no diagnostics (string API).
    fn assert_args_str(input: &str, expected: &[&str]) {
        let (args, errors) = tokenize_str(input);
        assert_eq!(
            args,
            expected
                .iter()
                .map(|s| (*s).to_string())
                .collect::<Vec<_>>()
        );
        assert!(errors.is_empty(), "expected no errors, got: {errors:?}");
    }

    /// Asserts that `input` tokenizes to `expected` with no diagnostics (byte API).
    fn assert_args(input: &[u8], expected: &[&[u8]]) {
        let result = tokenize(input);
        assert_eq!(
            result.args,
            expected.iter().map(|s| s.to_vec()).collect::<Vec<_>>()
        );
        assert!(
            result.errors.is_empty(),
            "expected no errors, got: {:?}",
            result.errors
        );
    }

    /// Asserts both the arguments and the sequence of offending bytes (string API).
    fn assert_args_with_errors_str(input: &str, expected: &[&str], error_bytes: &[u8]) {
        let (args, errors) = tokenize_str(input);
        assert_eq!(
            args,
            expected
                .iter()
                .map(|s| (*s).to_string())
                .collect::<Vec<_>>()
        );
        let actual_error_bytes: Vec<u8> = errors.iter().map(|e| e.byte).collect();
        assert_eq!(actual_error_bytes, error_bytes);
    }

    /// Asserts both the arguments and the sequence of offending bytes (byte API).
    fn assert_args_with_errors(input: &[u8], expected: &[&[u8]], error_bytes: &[u8]) {
        let result = tokenize(input);
        assert_eq!(
            result.args,
            expected.iter().map(|s| s.to_vec()).collect::<Vec<_>>()
        );
        let actual_error_bytes: Vec<u8> = result.errors.iter().map(|e| e.byte).collect();
        assert_eq!(actual_error_bytes, error_bytes);
    }

    /// Asserts that tokenizing `input` reports at least one error of `expected_kind`.
    fn assert_has_error(input: &str, expected_kind: ErrorKind) {
        let (_, errors) = tokenize_str(input);
        assert!(
            errors.iter().any(|e| e.kind == expected_kind),
            "expected error {expected_kind:?}, got: {errors:?}"
        );
    }

    // --- Whitespace splitting ---

    #[test]
    fn test_basic_tokenization() {
        assert_args_str("hello world", &["hello", "world"]);
        // Runs of several separators must not produce empty arguments.
        assert_args_str("hello   world", &["hello", "world"]);
        assert_args_str(" hello world ", &["hello", "world"]);
        assert_args_str("hello", &["hello"]);
        assert_args_str("", &[]);
        assert_args_str(" ", &[]);
    }

    #[test]
    fn test_tabs() {
        assert_args_str("hello\tworld", &["hello", "world"]);
        assert_args_str("hello \t world", &["hello", "world"]);
    }

    #[test]
    fn test_basic_tokenization_bytes() {
        assert_args(b"hello world", &[b"hello", b"world"]);
        // Runs of several separators must not produce empty arguments.
        assert_args(b"hello   world", &[b"hello", b"world"]);
        assert_args(b" hello world ", &[b"hello", b"world"]);
        assert_args(b"hello", &[b"hello"]);
        assert_args(b"", &[]);
        assert_args(b" ", &[]);
    }

    // --- Single quotes ---

    #[test]
    fn test_single_quotes() {
        assert_args_str("'hello world'", &["hello world"]);
        assert_args_str("'$HOME'", &["$HOME"]);
        assert_args_str("'\\n'", &["\\n"]);
        assert_args_str("'it'\\''s'", &["it's"]);
    }

    #[test]
    fn test_unclosed_single_quote() {
        assert_has_error("'hello", ErrorKind::UnclosedSingleQuote);
    }

    // --- Double quotes ---

    #[test]
    fn test_double_quotes() {
        assert_args_str("\"hello world\"", &["hello world"]);
        assert_args_str("\"say \\\"hi\\\"\"", &["say \"hi\""]);
        assert_args_str("\"back\\\\slash\"", &["back\\slash"]);
        assert_args_str("\"\\$HOME\"", &["$HOME"]);
        assert_args_str("\"\\n\"", &["\\n"]);
        assert_args_str("\"\\z\"", &["\\z"]);
    }

    #[test]
    fn test_dollar_in_double_quotes_warns() {
        assert_args_with_errors_str("\"$HOME\"", &["$HOME"], b"$");
    }

    #[test]
    fn test_backtick_in_double_quotes_warns() {
        assert_args_with_errors_str("\"`cmd`\"", &["`cmd`"], b"``");
    }

    #[test]
    fn test_unclosed_double_quote() {
        assert_has_error("\"hello", ErrorKind::UnclosedDoubleQuote);
    }

    // --- Backslash escapes outside quotes ---

    #[test]
    fn test_unquoted_escapes() {
        assert_args_str("hello\\ world", &["hello world"]);
        assert_args_str("\\$HOME", &["$HOME"]);
        assert_args_str("\\\\", &["\\"]);
        assert_args_str("\\*", &["*"]);
    }

    #[test]
    fn test_trailing_backslash_error() {
        assert_has_error("hello\\", ErrorKind::TrailingBackslash);
    }

    #[test]
    fn test_line_continuation() {
        assert_args_str("hello\\\nworld", &["helloworld"]);
        assert_args_str("hello \\\n world", &["hello", "world"]);
    }

    #[test]
    fn test_line_continuation_in_double_quotes() {
        assert_args_str("\"hello\\\nworld\"", &["helloworld"]);
    }

    // --- Diagnostics for uninterpreted shell syntax ---

    #[test]
    fn test_unquoted_dollar_warns() {
        assert_args_with_errors_str("$HOME", &["$HOME"], b"$");
    }

    #[test]
    fn test_unquoted_glob_warns() {
        assert_args_with_errors_str("*.txt", &["*.txt"], b"*");
        assert_args_with_errors_str("file?", &["file?"], b"?");
        assert_args_with_errors_str("file[0]", &["file[0]"], b"[");
    }

    #[test]
    fn test_tilde_at_word_start_warns() {
        assert_args_with_errors_str("~user", &["~user"], b"~");
    }

    #[test]
    fn test_tilde_mid_word_no_warn() {
        assert_args_str("a~b", &["a~b"]);
    }

    #[test]
    fn test_hash_at_word_start_warns() {
        assert_args_with_errors_str("#comment", &["#comment"], b"#");
        assert_args_with_errors_str("echo #test", &["echo", "#test"], b"#");
    }

    #[test]
    fn test_hash_mid_word_no_warn() {
        assert_args_str("foo#bar", &["foo#bar"]);
        assert_args_str("C#", &["C#"]);
    }

    #[test]
    fn test_pipe_and_semicolon_warn() {
        assert_args_with_errors_str("echo hello|cat", &["echo", "hello|cat"], b"|");
        assert_args_with_errors_str("echo; ls", &["echo;", "ls"], b";");
    }

    #[test]
    fn test_redirections_warn() {
        assert_args_with_errors_str("echo > file", &["echo", ">", "file"], b">");
        assert_args_with_errors_str("cat < file", &["cat", "<", "file"], b"<");
    }

    // --- Concatenation of quoted and unquoted pieces ---

    #[test]
    fn test_concatenation() {
        assert_args_str("a'b'c", &["abc"]);
        assert_args_str("a\"b\"c", &["abc"]);
        assert_args_str("'a'\"b\"c", &["abc"]);
        assert_args_str("x=\"foo\"", &["x=foo"]);
    }

    // --- Empty quotes ---

    #[test]
    fn test_empty_quotes() {
        assert_args_str("''", &[""]);
        assert_args_str("\"\"", &[""]);
        assert_args_str("'' ''", &["", ""]);
    }

    #[test]
    fn test_adjacent_empty_quotes() {
        assert_args_str("a''b", &["ab"]);
        assert_args_str("a\"\"b", &["ab"]);
        assert_args_str("''\"\"", &[""]);
    }

    // --- Newlines inside quotes ---

    #[test]
    fn test_newlines_in_single_quotes() {
        assert_args_str("'hello\nworld'", &["hello\nworld"]);
    }

    #[test]
    fn test_newlines_in_double_quotes() {
        assert_args_str("\"hello\nworld\"", &["hello\nworld"]);
    }

    // --- Escaped quote characters ---

    #[test]
    fn test_escaped_single_quote() {
        assert_args_str("\\'", &["'"]);
    }

    #[test]
    fn test_escaped_double_quote() {
        assert_args_str("\\\"", &["\""]);
    }

    #[test]
    fn test_escaped_quote_in_double_quotes() {
        assert_args_str("\"he said \\\"hi\\\"\"", &["he said \"hi\""]);
    }

    // --- Combined cases ---

    #[test]
    fn test_complex_concatenation() {
        assert_args_str("a\"b\"c'd'e", &["abcde"]);
    }

    #[test]
    fn test_multiple_warnings() {
        let (args, errors) = tokenize_str("$HOME/*.txt");
        assert_eq!(args, vec!["$HOME/*.txt"]);
        assert_eq!(errors.len(), 2);
        assert_eq!(errors[0].byte, b'$');
        assert_eq!(errors[1].byte, b'*');
    }

    #[test]
    fn test_backslash_in_double_quotes_before_regular_char() {
        assert_args_str("\"\\a\"", &["\\a"]);
        assert_args_str("\"\\x\"", &["\\x"]);
    }

    // --- Byte API diagnostics ---

    #[test]
    fn test_bytes_with_errors() {
        assert_args_with_errors(b"$HOME", &[b"$HOME"], b"$");
    }

    #[test]
    fn test_bytes_unclosed_quote() {
        let result = tokenize(b"'hello");
        assert!(
            result
                .errors
                .iter()
                .any(|e| e.kind == ErrorKind::UnclosedSingleQuote)
        );
    }

    // --- Diagnostic details ---

    #[test]
    fn test_error_positions() {
        let (_, errors) = tokenize_str("echo hello|cat");
        assert_eq!(
            errors,
            vec![Error {
                kind: ErrorKind::Pipe,
                byte: b'|',
                position: 10,
            }]
        );

        // An unclosed quote is reported at the offset of its opening quote, and
        // the partial argument is still returned.
        let (args, errors) = tokenize_str("echo 'hi");
        assert_eq!(args, vec!["echo", "hi"]);
        assert_eq!(
            errors,
            vec![Error {
                kind: ErrorKind::UnclosedSingleQuote,
                byte: b'\'',
                position: 5,
            }]
        );
    }

    #[test]
    fn test_error_display() {
        let error = Error {
            kind: ErrorKind::Pipe,
            byte: b'|',
            position: 10,
        };
        assert_eq!(
            error.to_string(),
            "error at position 10: pipe (piping not interpreted) (byte 0x7c)"
        );
    }
}