Skip to main content

tirith_core/
tokenize.rs

1use serde::{Deserialize, Serialize};
2
3/// Shell type for tokenization rules.
4#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
5#[serde(rename_all = "lowercase")]
6pub enum ShellType {
7    Posix,
8    Fish,
9    PowerShell,
10    Cmd,
11}
12
13impl std::str::FromStr for ShellType {
14    type Err = String;
15    fn from_str(s: &str) -> Result<Self, Self::Err> {
16        match s.to_lowercase().as_str() {
17            "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
18            "fish" => Ok(ShellType::Fish),
19            "powershell" | "pwsh" => Ok(ShellType::PowerShell),
20            "cmd" | "cmd.exe" => Ok(ShellType::Cmd),
21            _ => Err(format!("unknown shell type: {s}")),
22        }
23    }
24}
25
/// One command of a tokenized command line: the text between two separators.
#[derive(Clone, Debug)]
pub struct Segment {
    /// Raw text of the segment.
    pub raw: String,
    /// The command word of the segment, when one can be identified.
    pub command: Option<String>,
    /// The words that follow the command.
    pub args: Vec<String>,
    /// The separator that came before this segment (e.g., `|`, `&&`);
    /// `None` when no separator preceded it.
    pub preceding_separator: Option<String>,
}
38
39/// Tokenize a command string according to shell type.
40pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
41    match shell {
42        ShellType::Posix => tokenize_posix(input),
43        ShellType::Fish => tokenize_fish(input),
44        ShellType::PowerShell => tokenize_powershell(input),
45        ShellType::Cmd => tokenize_cmd(input),
46    }
47}
48
49fn tokenize_posix(input: &str) -> Vec<Segment> {
50    let mut segments = Vec::new();
51    let mut current = String::new();
52    let mut preceding_sep = None;
53    let chars: Vec<char> = input.chars().collect();
54    let len = chars.len();
55    let mut i = 0;
56
57    while i < len {
58        let ch = chars[i];
59
60        match ch {
61            // Backslash escaping
62            '\\' if i + 1 < len => {
63                current.push(chars[i]);
64                current.push(chars[i + 1]);
65                i += 2;
66                continue;
67            }
68            // Single quotes: everything literal until closing quote
69            '\'' => {
70                current.push(ch);
71                i += 1;
72                while i < len && chars[i] != '\'' {
73                    current.push(chars[i]);
74                    i += 1;
75                }
76                if i < len {
77                    current.push(chars[i]); // closing quote
78                    i += 1;
79                }
80                continue;
81            }
82            // Double quotes: allow backslash escaping inside
83            '"' => {
84                current.push(ch);
85                i += 1;
86                while i < len && chars[i] != '"' {
87                    if chars[i] == '\\' && i + 1 < len {
88                        current.push(chars[i]);
89                        current.push(chars[i + 1]);
90                        i += 2;
91                    } else {
92                        current.push(chars[i]);
93                        i += 1;
94                    }
95                }
96                if i < len {
97                    current.push(chars[i]); // closing quote
98                    i += 1;
99                }
100                continue;
101            }
102            // Pipe operators
103            '|' => {
104                if i + 1 < len && chars[i + 1] == '|' {
105                    // ||
106                    push_segment(&mut segments, &current, preceding_sep.take());
107                    current.clear();
108                    preceding_sep = Some("||".to_string());
109                    i += 2;
110                    continue;
111                } else if i + 1 < len && chars[i + 1] == '&' {
112                    // |& (bash: pipe stderr too)
113                    push_segment(&mut segments, &current, preceding_sep.take());
114                    current.clear();
115                    preceding_sep = Some("|&".to_string());
116                    i += 2;
117                    continue;
118                } else {
119                    // |
120                    push_segment(&mut segments, &current, preceding_sep.take());
121                    current.clear();
122                    preceding_sep = Some("|".to_string());
123                    i += 1;
124                    continue;
125                }
126            }
127            // && operator
128            '&' if i + 1 < len && chars[i + 1] == '&' => {
129                push_segment(&mut segments, &current, preceding_sep.take());
130                current.clear();
131                preceding_sep = Some("&&".to_string());
132                i += 2;
133                continue;
134            }
135            // Semicolon
136            ';' => {
137                push_segment(&mut segments, &current, preceding_sep.take());
138                current.clear();
139                preceding_sep = Some(";".to_string());
140                i += 1;
141                continue;
142            }
143            // Newline
144            '\n' => {
145                push_segment(&mut segments, &current, preceding_sep.take());
146                current.clear();
147                preceding_sep = Some("\n".to_string());
148                i += 1;
149                continue;
150            }
151            _ => {
152                current.push(ch);
153                i += 1;
154            }
155        }
156    }
157
158    push_segment(&mut segments, &current, preceding_sep.take());
159    segments
160}
161
162fn tokenize_fish(input: &str) -> Vec<Segment> {
163    // Fish is similar to POSIX but with some differences:
164    // - No backslash-newline continuation
165    // - Different quoting rules (but close enough for our purposes)
166    // For URL extraction, POSIX tokenization works well enough
167    tokenize_posix(input)
168}
169
170fn tokenize_powershell(input: &str) -> Vec<Segment> {
171    let mut segments = Vec::new();
172    let mut current = String::new();
173    let mut preceding_sep = None;
174    // Collect (byte_offset, char) pairs so byte slicing stays valid for multi-byte UTF-8.
175    let indexed: Vec<(usize, char)> = input.char_indices().collect();
176    let len = indexed.len();
177    let mut i = 0;
178
179    while i < len {
180        let (byte_off, ch) = indexed[i];
181
182        match ch {
183            // Backtick escaping in PowerShell
184            '`' if i + 1 < len => {
185                current.push(indexed[i].1);
186                current.push(indexed[i + 1].1);
187                i += 2;
188                continue;
189            }
190            // Single quotes: literal
191            '\'' => {
192                current.push(ch);
193                i += 1;
194                while i < len && indexed[i].1 != '\'' {
195                    current.push(indexed[i].1);
196                    i += 1;
197                }
198                if i < len {
199                    current.push(indexed[i].1);
200                    i += 1;
201                }
202                continue;
203            }
204            // Double quotes
205            '"' => {
206                current.push(ch);
207                i += 1;
208                while i < len && indexed[i].1 != '"' {
209                    if indexed[i].1 == '`' && i + 1 < len {
210                        current.push(indexed[i].1);
211                        current.push(indexed[i + 1].1);
212                        i += 2;
213                    } else {
214                        current.push(indexed[i].1);
215                        i += 1;
216                    }
217                }
218                if i < len {
219                    current.push(indexed[i].1);
220                    i += 1;
221                }
222                continue;
223            }
224            // Pipe
225            '|' => {
226                push_segment(&mut segments, &current, preceding_sep.take());
227                current.clear();
228                preceding_sep = Some("|".to_string());
229                i += 1;
230                continue;
231            }
232            // Semicolon
233            ';' => {
234                push_segment(&mut segments, &current, preceding_sep.take());
235                current.clear();
236                preceding_sep = Some(";".to_string());
237                i += 1;
238                continue;
239            }
240            // Check for -and / -or operators (PowerShell logical)
241            '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
242                let remaining = &input[byte_off..];
243                if remaining.starts_with("-and")
244                    && remaining[4..]
245                        .chars()
246                        .next()
247                        .is_none_or(|c| c.is_whitespace())
248                {
249                    push_segment(&mut segments, &current, preceding_sep.take());
250                    current.clear();
251                    preceding_sep = Some("-and".to_string());
252                    i += 4;
253                    continue;
254                } else if remaining.starts_with("-or")
255                    && remaining[3..]
256                        .chars()
257                        .next()
258                        .is_none_or(|c| c.is_whitespace())
259                {
260                    push_segment(&mut segments, &current, preceding_sep.take());
261                    current.clear();
262                    preceding_sep = Some("-or".to_string());
263                    i += 3;
264                    continue;
265                }
266                current.push(ch);
267                i += 1;
268            }
269            '\n' => {
270                push_segment(&mut segments, &current, preceding_sep.take());
271                current.clear();
272                preceding_sep = Some("\n".to_string());
273                i += 1;
274                continue;
275            }
276            _ => {
277                current.push(ch);
278                i += 1;
279            }
280        }
281    }
282
283    push_segment(&mut segments, &current, preceding_sep.take());
284    segments
285}
286
287fn tokenize_cmd(input: &str) -> Vec<Segment> {
288    let mut segments = Vec::new();
289    let mut current = String::new();
290    let mut preceding_sep = None;
291    let chars: Vec<char> = input.chars().collect();
292    let len = chars.len();
293    let mut i = 0;
294
295    while i < len {
296        let ch = chars[i];
297        match ch {
298            // Caret escaping (cmd.exe escape character)
299            '^' if i + 1 < len => {
300                current.push(chars[i]);
301                current.push(chars[i + 1]);
302                i += 2;
303                continue;
304            }
305            // Double quotes (only quoting mechanism in cmd)
306            '"' => {
307                current.push(ch);
308                i += 1;
309                while i < len && chars[i] != '"' {
310                    current.push(chars[i]);
311                    i += 1;
312                }
313                if i < len {
314                    current.push(chars[i]);
315                    i += 1;
316                }
317                continue;
318            }
319            // Pipe
320            '|' => {
321                if i + 1 < len && chars[i + 1] == '|' {
322                    push_segment(&mut segments, &current, preceding_sep.take());
323                    current.clear();
324                    preceding_sep = Some("||".to_string());
325                    i += 2;
326                } else {
327                    push_segment(&mut segments, &current, preceding_sep.take());
328                    current.clear();
329                    preceding_sep = Some("|".to_string());
330                    i += 1;
331                }
332                continue;
333            }
334            // & and &&
335            '&' => {
336                if i + 1 < len && chars[i + 1] == '&' {
337                    push_segment(&mut segments, &current, preceding_sep.take());
338                    current.clear();
339                    preceding_sep = Some("&&".to_string());
340                    i += 2;
341                } else {
342                    push_segment(&mut segments, &current, preceding_sep.take());
343                    current.clear();
344                    preceding_sep = Some("&".to_string());
345                    i += 1;
346                }
347                continue;
348            }
349            '\n' => {
350                push_segment(&mut segments, &current, preceding_sep.take());
351                current.clear();
352                preceding_sep = Some("\n".to_string());
353                i += 1;
354                continue;
355            }
356            _ => {
357                current.push(ch);
358                i += 1;
359            }
360        }
361    }
362    push_segment(&mut segments, &current, preceding_sep.take());
363    segments
364}
365
366fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
367    let trimmed = raw.trim();
368    if trimmed.is_empty() {
369        return;
370    }
371
372    let words = split_words(trimmed);
373    // Skip leading environment variable assignments (VAR=VALUE)
374    let first_non_assign = words.iter().position(|w| !is_env_assignment(w));
375    let (command, args) = match first_non_assign {
376        Some(idx) => {
377            let cmd = Some(words[idx].clone());
378            let args = if idx + 1 < words.len() {
379                words[idx + 1..].to_vec()
380            } else {
381                Vec::new()
382            };
383            (cmd, args)
384        }
385        None => {
386            // All words are assignments, no command
387            (None, Vec::new())
388        }
389    };
390
391    segments.push(Segment {
392        raw: trimmed.to_string(),
393        command,
394        args,
395        preceding_separator: preceding_sep,
396    });
397}
398
/// Report whether `word` looks like a shell variable assignment (`NAME=VALUE`).
///
/// Surrounding whitespace is ignored. The part before the first `=` must be
/// non-empty, must not start with `-` or an ASCII digit, and may contain only
/// ASCII letters, ASCII digits and `_`. The value may be empty.
pub fn is_env_assignment(word: &str) -> bool {
    let candidate = word.trim();

    // Options (`--flag=value`) and a bare `=value` are never assignments.
    if candidate.starts_with(['-', '=']) {
        return false;
    }

    let Some((name, _value)) = candidate.split_once('=') else {
        return false;
    };

    // `name` cannot be empty here: `candidate` does not start with `=`.
    if name.starts_with(|c: char| c.is_ascii_digit()) {
        return false;
    }
    name.chars().all(|c| c == '_' || c.is_ascii_alphanumeric())
}
420
421/// Return the values from leading `NAME=VALUE` tokens in a raw segment.
422/// Stops at the first non-assignment word, matching the shell prefix-assignment model.
423pub fn leading_env_assignments(segment_raw: &str) -> Vec<(String, String)> {
424    let mut assignments = Vec::new();
425    for word in split_words(segment_raw.trim()) {
426        if !is_env_assignment(&word) {
427            break;
428        }
429        if let Some((name, value)) = word.split_once('=') {
430            assignments.push((name.to_string(), value.to_string()));
431        }
432    }
433    assignments
434}
435
436/// Return the values from leading `NAME=VALUE` tokens in a raw segment.
437/// Stops at the first non-assignment word, matching the shell prefix-assignment model.
438pub fn leading_env_assignment_values(segment_raw: &str) -> Vec<String> {
439    leading_env_assignments(segment_raw)
440        .into_iter()
441        .map(|(_, value)| value)
442        .collect()
443}
444
/// Break a segment into words at unquoted spaces and tabs.
///
/// Single-quoted text is literal; double-quoted text honours backslash
/// escapes; outside quotes a backslash keeps the following character in the
/// current word. Quotes and backslashes are kept in the returned words, and
/// an unterminated quote runs to the end of the input.
fn split_words(input: &str) -> Vec<String> {
    let chars: Vec<char> = input.chars().collect();
    let total = chars.len();
    let mut out = Vec::new();
    let mut word = String::new();
    let mut pos = 0;

    while pos < total {
        let c = chars[pos];
        match c {
            // A blank ends the word in progress; runs of blanks add nothing.
            ' ' | '\t' => {
                if !word.is_empty() {
                    out.push(std::mem::take(&mut word));
                }
                pos += 1;
            }
            // Single quotes: copy through the closing quote verbatim.
            '\'' => {
                word.push(c);
                pos += 1;
                let end = chars[pos..]
                    .iter()
                    .position(|&q| q == '\'')
                    .map_or(total, |offset| pos + offset + 1);
                word.extend(&chars[pos..end]);
                pos = end;
            }
            // Double quotes: copy through the closing quote, keeping
            // backslash pairs together so `\"` does not end the quote.
            '"' => {
                word.push(c);
                pos += 1;
                while pos < total {
                    let inner = chars[pos];
                    word.push(inner);
                    pos += 1;
                    if inner == '"' {
                        break;
                    }
                    if inner == '\\' && pos < total {
                        word.push(chars[pos]);
                        pos += 1;
                    }
                }
            }
            // Backslash escape outside quotes (e.g. an escaped space).
            '\\' if pos + 1 < total => {
                word.push(c);
                word.push(chars[pos + 1]);
                pos += 2;
            }
            _ => {
                word.push(c);
                pos += 1;
            }
        }
    }

    if !word.is_empty() {
        out.push(word);
    }
    out
}
516
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` with POSIX rules.
    fn posix(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::Posix)
    }

    /// Tokenize `input` with PowerShell rules.
    fn powershell(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::PowerShell)
    }

    /// Tokenize `input` with cmd.exe rules.
    fn cmd(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::Cmd)
    }

    /// Command word of a segment, if any.
    fn command_of(segment: &Segment) -> Option<&str> {
        segment.command.as_deref()
    }

    /// Separator that preceded a segment, if any.
    fn separator_before(segment: &Segment) -> Option<&str> {
        segment.preceding_separator.as_deref()
    }

    #[test]
    fn test_simple_pipe() {
        let segments = posix("echo hello | grep world");
        assert_eq!(segments.len(), 2);
        assert_eq!(command_of(&segments[0]), Some("echo"));
        assert_eq!(command_of(&segments[1]), Some("grep"));
        assert_eq!(separator_before(&segments[1]), Some("|"));
    }

    #[test]
    fn test_quoted_pipe() {
        let segments = posix(r#"echo "hello | world" | bash"#);
        assert_eq!(segments.len(), 2);
        assert_eq!(segments[0].raw, r#"echo "hello | world""#);
        assert_eq!(command_of(&segments[1]), Some("bash"));
    }

    #[test]
    fn test_and_or() {
        let segments = posix("cmd1 && cmd2 || cmd3");
        assert_eq!(segments.len(), 3);
        assert_eq!(separator_before(&segments[1]), Some("&&"));
        assert_eq!(separator_before(&segments[2]), Some("||"));
    }

    #[test]
    fn test_semicolon() {
        let segments = posix("cmd1; cmd2");
        assert_eq!(segments.len(), 2);
        assert_eq!(separator_before(&segments[1]), Some(";"));
    }

    #[test]
    fn test_pipe_ampersand() {
        let segments = posix("cmd1 |& cmd2");
        assert_eq!(segments.len(), 2);
        assert_eq!(separator_before(&segments[1]), Some("|&"));
    }

    #[test]
    fn test_powershell_pipe() {
        let segments = powershell("iwr url | iex");
        assert_eq!(segments.len(), 2);
        assert_eq!(command_of(&segments[0]), Some("iwr"));
        assert_eq!(command_of(&segments[1]), Some("iex"));
    }

    #[test]
    fn test_powershell_backtick() {
        // The backtick escapes the pipe, so nothing is split.
        let segments = powershell("echo `| not a pipe");
        assert_eq!(segments.len(), 1);
    }

    #[test]
    fn test_single_quotes() {
        let segments = posix("echo 'hello | world' | bash");
        assert_eq!(segments.len(), 2);
    }

    #[test]
    fn test_backslash_escape() {
        // The backslash-escaped pipe stays inside the first segment.
        let segments = posix("echo hello\\|world | bash");
        assert_eq!(segments.len(), 2);
    }

    #[test]
    fn test_empty_input() {
        assert!(posix("").is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(posix("   ").is_empty());
    }

    #[test]
    fn test_args_extraction() {
        let segments = posix("curl -sSL https://example.com");
        assert_eq!(segments.len(), 1);
        assert_eq!(command_of(&segments[0]), Some("curl"));
        assert_eq!(segments[0].args.len(), 2);
    }

    #[test]
    fn test_env_prefix_skipped() {
        let segments = posix("TIRITH=0 curl evil.com");
        assert_eq!(segments.len(), 1);
        assert_eq!(command_of(&segments[0]), Some("curl"));
        assert_eq!(segments[0].args, vec!["evil.com"]);
    }

    #[test]
    fn test_multiple_env_prefixes() {
        let segments = posix("FOO=bar BAZ=1 python script.py");
        assert_eq!(segments.len(), 1);
        assert_eq!(command_of(&segments[0]), Some("python"));
        assert_eq!(segments[0].args, vec!["script.py"]);
    }

    #[test]
    fn test_env_only_no_command() {
        let segments = posix("TIRITH=0");
        assert_eq!(segments.len(), 1);
        assert_eq!(segments[0].command, None);
        assert!(segments[0].args.is_empty());
    }

    #[test]
    fn test_is_env_assignment() {
        for accepted in ["FOO=bar", "TIRITH=0", "PATH=/usr/bin", "A="] {
            assert!(is_env_assignment(accepted));
        }
        for rejected in ["-o", "curl", "=value", "--flag=value", "1FOO=bar"] {
            assert!(!is_env_assignment(rejected));
        }
    }

    #[test]
    fn test_leading_env_assignment_values() {
        let quoted = "URL='https://example.com/a' FOO=bar curl ok";

        assert_eq!(
            leading_env_assignment_values("URL=https://example.com curl ok"),
            vec!["https://example.com"]
        );
        assert_eq!(
            leading_env_assignments(quoted),
            vec![
                ("URL".to_string(), "'https://example.com/a'".to_string()),
                ("FOO".to_string(), "bar".to_string())
            ]
        );
        assert_eq!(
            leading_env_assignment_values(quoted),
            vec!["'https://example.com/a'", "bar"]
        );
        assert!(leading_env_assignment_values("env URL=https://example.com curl ok").is_empty());
    }

    #[test]
    fn test_cmd_pipe() {
        let segments = cmd("dir | findstr foo");
        assert_eq!(segments.len(), 2);
        assert_eq!(command_of(&segments[0]), Some("dir"));
        assert_eq!(command_of(&segments[1]), Some("findstr"));
    }

    #[test]
    fn test_cmd_ampersand_separator() {
        let segments = cmd("dir & echo done");
        assert_eq!(segments.len(), 2);
        assert_eq!(separator_before(&segments[1]), Some("&"));
    }

    #[test]
    fn test_cmd_double_ampersand() {
        let segments = cmd("cmd1 && cmd2");
        assert_eq!(segments.len(), 2);
        assert_eq!(separator_before(&segments[1]), Some("&&"));
    }

    #[test]
    fn test_cmd_caret_escape() {
        // `^|` is escaped and therefore not a pipe.
        let segments = cmd("echo hello^|world | findstr x");
        assert_eq!(segments.len(), 2);
    }

    #[test]
    fn test_cmd_double_quotes() {
        let segments = cmd(r#"echo "hello | world" | findstr x"#);
        assert_eq!(segments.len(), 2);
    }

    #[test]
    fn test_powershell_multibyte_and_operator_no_panic() {
        // Regression test for a fuzz crash: multi-byte UTF-8 before `-and`
        // caused a byte/char index mismatch panic when slicing the input.
        let input = " ?]BB\u{07E7}\u{07E7} -\n-\r-and-~\0\u{c}-and-~\u{1d}";
        let _ = powershell(input);
    }
}
707}