Skip to main content

tirith_core/
tokenize.rs

1use serde::{Deserialize, Serialize};
2
/// Shell type for tokenization rules.
///
/// Selects which tokenizer [`tokenize`] dispatches to. Because of
/// `#[serde(rename_all = "lowercase")]`, the variants (de)serialize as
/// `"posix"`, `"fish"`, `"powershell"` and `"cmd"`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ShellType {
    /// POSIX-style shells; `FromStr` also maps `bash`, `zsh` and `sh` here.
    Posix,
    /// The fish shell.
    Fish,
    /// PowerShell; `FromStr` also accepts `pwsh`.
    PowerShell,
    /// Windows `cmd.exe`; `FromStr` accepts both `cmd` and `cmd.exe`.
    Cmd,
}
12
13impl std::str::FromStr for ShellType {
14    type Err = String;
15    fn from_str(s: &str) -> Result<Self, Self::Err> {
16        match s.to_lowercase().as_str() {
17            "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
18            "fish" => Ok(ShellType::Fish),
19            "powershell" | "pwsh" => Ok(ShellType::PowerShell),
20            "cmd" | "cmd.exe" => Ok(ShellType::Cmd),
21            _ => Err(format!("unknown shell type: {s}")),
22        }
23    }
24}
25
/// A segment of a tokenized command: the text between two separators.
///
/// `PartialEq`/`Eq` are derived so that whole segments can be compared
/// directly (for example in tests) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Segment {
    /// The raw text of this segment, with surrounding whitespace trimmed.
    pub raw: String,
    /// The first word of this segment that is not a leading `NAME=VALUE`
    /// assignment; `None` when every word is such an assignment.
    pub command: Option<String>,
    /// Words following the command, with quotes and escapes kept verbatim.
    pub args: Vec<String>,
    /// The separator that preceded this segment (e.g., `|`, `&&`);
    /// `None` when no separator came before it.
    pub preceding_separator: Option<String>,
}
38
39/// Tokenize a command string according to shell type.
40pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
41    match shell {
42        ShellType::Posix => tokenize_posix(input),
43        ShellType::Fish => tokenize_fish(input),
44        ShellType::PowerShell => tokenize_powershell(input),
45        ShellType::Cmd => tokenize_cmd(input),
46    }
47}
48
49fn tokenize_posix(input: &str) -> Vec<Segment> {
50    let mut segments = Vec::new();
51    let mut current = String::new();
52    let mut preceding_sep = None;
53    let chars: Vec<char> = input.chars().collect();
54    let len = chars.len();
55    let mut i = 0;
56
57    while i < len {
58        let ch = chars[i];
59
60        match ch {
61            // Backslash escaping
62            '\\' if i + 1 < len => {
63                current.push(chars[i]);
64                current.push(chars[i + 1]);
65                i += 2;
66                continue;
67            }
68            // Single quotes: everything literal until closing quote
69            '\'' => {
70                current.push(ch);
71                i += 1;
72                while i < len && chars[i] != '\'' {
73                    current.push(chars[i]);
74                    i += 1;
75                }
76                if i < len {
77                    current.push(chars[i]); // closing quote
78                    i += 1;
79                }
80                continue;
81            }
82            // Double quotes: allow backslash escaping inside
83            '"' => {
84                current.push(ch);
85                i += 1;
86                while i < len && chars[i] != '"' {
87                    if chars[i] == '\\' && i + 1 < len {
88                        current.push(chars[i]);
89                        current.push(chars[i + 1]);
90                        i += 2;
91                    } else {
92                        current.push(chars[i]);
93                        i += 1;
94                    }
95                }
96                if i < len {
97                    current.push(chars[i]); // closing quote
98                    i += 1;
99                }
100                continue;
101            }
102            // Pipe operators
103            '|' => {
104                if i + 1 < len && chars[i + 1] == '|' {
105                    // ||
106                    push_segment(&mut segments, &current, preceding_sep.take());
107                    current.clear();
108                    preceding_sep = Some("||".to_string());
109                    i += 2;
110                    continue;
111                } else if i + 1 < len && chars[i + 1] == '&' {
112                    // |& (bash: pipe stderr too)
113                    push_segment(&mut segments, &current, preceding_sep.take());
114                    current.clear();
115                    preceding_sep = Some("|&".to_string());
116                    i += 2;
117                    continue;
118                } else {
119                    // |
120                    push_segment(&mut segments, &current, preceding_sep.take());
121                    current.clear();
122                    preceding_sep = Some("|".to_string());
123                    i += 1;
124                    continue;
125                }
126            }
127            // && operator
128            '&' if i + 1 < len && chars[i + 1] == '&' => {
129                push_segment(&mut segments, &current, preceding_sep.take());
130                current.clear();
131                preceding_sep = Some("&&".to_string());
132                i += 2;
133                continue;
134            }
135            // Semicolon
136            ';' => {
137                push_segment(&mut segments, &current, preceding_sep.take());
138                current.clear();
139                preceding_sep = Some(";".to_string());
140                i += 1;
141                continue;
142            }
143            // Newline
144            '\n' => {
145                push_segment(&mut segments, &current, preceding_sep.take());
146                current.clear();
147                preceding_sep = Some("\n".to_string());
148                i += 1;
149                continue;
150            }
151            _ => {
152                current.push(ch);
153                i += 1;
154            }
155        }
156    }
157
158    push_segment(&mut segments, &current, preceding_sep.take());
159    segments
160}
161
/// Tokenize a fish command line.
///
/// Currently a thin wrapper: `input` is passed unchanged to
/// [`tokenize_posix`], so the result follows POSIX splitting rules.
fn tokenize_fish(input: &str) -> Vec<Segment> {
    // Fish syntax is close to POSIX, and no fish-specific rule is modeled
    // here: quoting and escaping are handled exactly as for POSIX shells.
    // For URL extraction, POSIX tokenization works well enough.
    // NOTE(review): whether fish differs from POSIX on backslash-newline
    // continuation is neither handled nor checked here; confirm if it matters.
    tokenize_posix(input)
}
169
170fn tokenize_powershell(input: &str) -> Vec<Segment> {
171    let mut segments = Vec::new();
172    let mut current = String::new();
173    let mut preceding_sep = None;
174    // Collect (byte_offset, char) pairs so byte slicing stays valid for multi-byte UTF-8.
175    let indexed: Vec<(usize, char)> = input.char_indices().collect();
176    let len = indexed.len();
177    let mut i = 0;
178
179    while i < len {
180        let (byte_off, ch) = indexed[i];
181
182        match ch {
183            // Backtick escaping in PowerShell
184            '`' if i + 1 < len => {
185                current.push(indexed[i].1);
186                current.push(indexed[i + 1].1);
187                i += 2;
188                continue;
189            }
190            // Single quotes: literal
191            '\'' => {
192                current.push(ch);
193                i += 1;
194                while i < len && indexed[i].1 != '\'' {
195                    current.push(indexed[i].1);
196                    i += 1;
197                }
198                if i < len {
199                    current.push(indexed[i].1);
200                    i += 1;
201                }
202                continue;
203            }
204            // Double quotes
205            '"' => {
206                current.push(ch);
207                i += 1;
208                while i < len && indexed[i].1 != '"' {
209                    if indexed[i].1 == '`' && i + 1 < len {
210                        current.push(indexed[i].1);
211                        current.push(indexed[i + 1].1);
212                        i += 2;
213                    } else {
214                        current.push(indexed[i].1);
215                        i += 1;
216                    }
217                }
218                if i < len {
219                    current.push(indexed[i].1);
220                    i += 1;
221                }
222                continue;
223            }
224            // Pipe
225            '|' => {
226                push_segment(&mut segments, &current, preceding_sep.take());
227                current.clear();
228                preceding_sep = Some("|".to_string());
229                i += 1;
230                continue;
231            }
232            // Semicolon
233            ';' => {
234                push_segment(&mut segments, &current, preceding_sep.take());
235                current.clear();
236                preceding_sep = Some(";".to_string());
237                i += 1;
238                continue;
239            }
240            // Check for -and / -or operators (PowerShell logical)
241            '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
242                let remaining = &input[byte_off..];
243                if remaining.starts_with("-and")
244                    && remaining[4..]
245                        .chars()
246                        .next()
247                        .is_none_or(|c| c.is_whitespace())
248                {
249                    push_segment(&mut segments, &current, preceding_sep.take());
250                    current.clear();
251                    preceding_sep = Some("-and".to_string());
252                    i += 4;
253                    continue;
254                } else if remaining.starts_with("-or")
255                    && remaining[3..]
256                        .chars()
257                        .next()
258                        .is_none_or(|c| c.is_whitespace())
259                {
260                    push_segment(&mut segments, &current, preceding_sep.take());
261                    current.clear();
262                    preceding_sep = Some("-or".to_string());
263                    i += 3;
264                    continue;
265                }
266                current.push(ch);
267                i += 1;
268            }
269            '\n' => {
270                push_segment(&mut segments, &current, preceding_sep.take());
271                current.clear();
272                preceding_sep = Some("\n".to_string());
273                i += 1;
274                continue;
275            }
276            _ => {
277                current.push(ch);
278                i += 1;
279            }
280        }
281    }
282
283    push_segment(&mut segments, &current, preceding_sep.take());
284    segments
285}
286
287fn tokenize_cmd(input: &str) -> Vec<Segment> {
288    let mut segments = Vec::new();
289    let mut current = String::new();
290    let mut preceding_sep = None;
291    let chars: Vec<char> = input.chars().collect();
292    let len = chars.len();
293    let mut i = 0;
294
295    while i < len {
296        let ch = chars[i];
297        match ch {
298            // Caret escaping (cmd.exe escape character)
299            '^' if i + 1 < len => {
300                current.push(chars[i]);
301                current.push(chars[i + 1]);
302                i += 2;
303                continue;
304            }
305            // Double quotes (only quoting mechanism in cmd)
306            '"' => {
307                current.push(ch);
308                i += 1;
309                while i < len && chars[i] != '"' {
310                    current.push(chars[i]);
311                    i += 1;
312                }
313                if i < len {
314                    current.push(chars[i]);
315                    i += 1;
316                }
317                continue;
318            }
319            // Pipe
320            '|' => {
321                if i + 1 < len && chars[i + 1] == '|' {
322                    push_segment(&mut segments, &current, preceding_sep.take());
323                    current.clear();
324                    preceding_sep = Some("||".to_string());
325                    i += 2;
326                } else {
327                    push_segment(&mut segments, &current, preceding_sep.take());
328                    current.clear();
329                    preceding_sep = Some("|".to_string());
330                    i += 1;
331                }
332                continue;
333            }
334            // & and &&
335            '&' => {
336                if i + 1 < len && chars[i + 1] == '&' {
337                    push_segment(&mut segments, &current, preceding_sep.take());
338                    current.clear();
339                    preceding_sep = Some("&&".to_string());
340                    i += 2;
341                } else {
342                    push_segment(&mut segments, &current, preceding_sep.take());
343                    current.clear();
344                    preceding_sep = Some("&".to_string());
345                    i += 1;
346                }
347                continue;
348            }
349            '\n' => {
350                push_segment(&mut segments, &current, preceding_sep.take());
351                current.clear();
352                preceding_sep = Some("\n".to_string());
353                i += 1;
354                continue;
355            }
356            _ => {
357                current.push(ch);
358                i += 1;
359            }
360        }
361    }
362    push_segment(&mut segments, &current, preceding_sep.take());
363    segments
364}
365
366fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
367    let trimmed = raw.trim();
368    if trimmed.is_empty() {
369        return;
370    }
371
372    let words = split_words(trimmed);
373    // Skip leading environment variable assignments (VAR=VALUE)
374    let first_non_assign = words.iter().position(|w| !is_env_assignment(w));
375    let (command, args) = match first_non_assign {
376        Some(idx) => {
377            let cmd = Some(words[idx].clone());
378            let args = if idx + 1 < words.len() {
379                words[idx + 1..].to_vec()
380            } else {
381                Vec::new()
382            };
383            (cmd, args)
384        }
385        None => {
386            // All words are assignments, no command
387            (None, Vec::new())
388        }
389    };
390
391    segments.push(Segment {
392        raw: trimmed.to_string(),
393        command,
394        args,
395        preceding_separator: preceding_sep,
396    });
397}
398
/// Check if a word looks like a shell environment variable assignment (NAME=VALUE).
///
/// `NAME` is the text before the first `=`. It must be non-empty, start with
/// an ASCII letter or underscore, and contain only ASCII alphanumerics and
/// underscores. The value may be empty (`A=`). Whitespace around `word` is
/// ignored.
///
/// Words without `=`, and words whose name part begins with `-`, `=` or a
/// digit, are not assignments.
pub fn is_env_assignment(word: &str) -> bool {
    word.trim().split_once('=').is_some_and(|(name, _value)| {
        let mut name_chars = name.chars();
        match name_chars.next() {
            // A valid first character; the rest may also include digits.
            Some(first) if first.is_ascii_alphabetic() || first == '_' => {
                name_chars.all(|c| c.is_ascii_alphanumeric() || c == '_')
            }
            // Empty name, or a name starting with a digit, `-`, quote, etc.
            _ => false,
        }
    })
}
420
/// Split a segment into words, respecting quotes.
///
/// Words are separated by runs of spaces and tabs. Single-quoted text,
/// double-quoted text (where a backslash protects the next character) and
/// backslash-escaped characters never end a word. Quotes and backslashes are
/// kept in the returned words exactly as written. An unterminated quote runs
/// to the end of the input.
fn split_words(input: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut chars = input.chars();

    while let Some(ch) = chars.next() {
        match ch {
            // A blank ends the word in progress; repeated blanks add nothing.
            ' ' | '\t' => {
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            // Single quotes: copy through the closing quote, no escapes.
            '\'' => {
                word.push(ch);
                for quoted in chars.by_ref() {
                    word.push(quoted);
                    if quoted == '\'' {
                        break;
                    }
                }
            }
            // Double quotes: copy through the closing quote, honouring `\x`.
            '"' => {
                word.push(ch);
                while let Some(quoted) = chars.next() {
                    word.push(quoted);
                    match quoted {
                        '"' => break,
                        '\\' => {
                            if let Some(escaped) = chars.next() {
                                word.push(escaped);
                            }
                        }
                        _ => {}
                    }
                }
            }
            // Backslash: keep it and whatever it escapes (if anything follows).
            '\\' => {
                word.push(ch);
                if let Some(escaped) = chars.next() {
                    word.push(escaped);
                }
            }
            _ => word.push(ch),
        }
    }

    if !word.is_empty() {
        words.push(word);
    }

    words
}
492
#[cfg(test)]
mod tests {
    //! Unit tests for the per-shell tokenizers and their helpers.
    //!
    //! Tests that exercise quoting or escaping assert the raw text of the
    //! first segment as well as the segment count, so that a separator
    //! wrongly swallowed or wrongly split is caught either way.

    use super::*;

    #[test]
    fn test_simple_pipe() {
        let segs = tokenize("echo hello | grep world", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("echo"));
        assert_eq!(segs[0].args, vec!["hello"]);
        assert_eq!(segs[0].preceding_separator, None);
        assert_eq!(segs[1].command.as_deref(), Some("grep"));
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|"));
    }

    #[test]
    fn test_quoted_pipe() {
        let segs = tokenize(r#"echo "hello | world" | bash"#, ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, r#"echo "hello | world""#);
        assert_eq!(segs[1].command.as_deref(), Some("bash"));
    }

    #[test]
    fn test_and_or() {
        let segs = tokenize("cmd1 && cmd2 || cmd3", ShellType::Posix);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
        assert_eq!(segs[2].preceding_separator.as_deref(), Some("||"));
    }

    #[test]
    fn test_semicolon() {
        let segs = tokenize("cmd1; cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some(";"));
    }

    #[test]
    fn test_pipe_ampersand() {
        let segs = tokenize("cmd1 |& cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|&"));
    }

    #[test]
    fn test_powershell_pipe() {
        let segs = tokenize("iwr url | iex", ShellType::PowerShell);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("iwr"));
        assert_eq!(segs[1].command.as_deref(), Some("iex"));
    }

    #[test]
    fn test_powershell_backtick() {
        let segs = tokenize("echo `| not a pipe", ShellType::PowerShell);
        // backtick escapes the pipe
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].raw, "echo `| not a pipe");
    }

    #[test]
    fn test_powershell_logical_operators() {
        let segs = tokenize("cmd1 -and cmd2 -or cmd3", ShellType::PowerShell);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[1].command.as_deref(), Some("cmd2"));
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("-and"));
        assert_eq!(segs[2].command.as_deref(), Some("cmd3"));
        assert_eq!(segs[2].preceding_separator.as_deref(), Some("-or"));

        // A parameter that merely starts with `-and` is not an operator.
        let segs = tokenize("Get-Thing -andMore", ShellType::PowerShell);
        assert_eq!(segs.len(), 1);
    }

    #[test]
    fn test_single_quotes() {
        let segs = tokenize("echo 'hello | world' | bash", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, "echo 'hello | world'");
        assert_eq!(segs[1].command.as_deref(), Some("bash"));
    }

    #[test]
    fn test_backslash_escape() {
        let segs = tokenize("echo hello\\|world | bash", ShellType::Posix);
        // The backslash-pipe is inside the first segment
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, "echo hello\\|world");
    }

    #[test]
    fn test_empty_input() {
        let segs = tokenize("", ShellType::Posix);
        assert!(segs.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let segs = tokenize("   ", ShellType::Posix);
        assert!(segs.is_empty());
    }

    #[test]
    fn test_args_extraction() {
        let segs = tokenize("curl -sSL https://example.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args, vec!["-sSL", "https://example.com"]);
    }

    #[test]
    fn test_env_prefix_skipped() {
        let segs = tokenize("TIRITH=0 curl evil.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args, vec!["evil.com"]);
    }

    #[test]
    fn test_multiple_env_prefixes() {
        let segs = tokenize("FOO=bar BAZ=1 python script.py", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("python"));
        assert_eq!(segs[0].args, vec!["script.py"]);
    }

    #[test]
    fn test_env_only_no_command() {
        let segs = tokenize("TIRITH=0", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command, None);
        assert!(segs[0].args.is_empty());
    }

    #[test]
    fn test_is_env_assignment() {
        assert!(is_env_assignment("FOO=bar"));
        assert!(is_env_assignment("TIRITH=0"));
        assert!(is_env_assignment("PATH=/usr/bin"));
        assert!(is_env_assignment("A="));
        assert!(!is_env_assignment("-o"));
        assert!(!is_env_assignment("curl"));
        assert!(!is_env_assignment("=value"));
        assert!(!is_env_assignment("--flag=value"));
        assert!(!is_env_assignment("1FOO=bar"));
    }

    #[test]
    fn test_cmd_pipe() {
        let segs = tokenize("dir | findstr foo", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("dir"));
        assert_eq!(segs[1].command.as_deref(), Some("findstr"));
    }

    #[test]
    fn test_cmd_ampersand_separator() {
        let segs = tokenize("dir & echo done", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&"));
    }

    #[test]
    fn test_cmd_double_ampersand() {
        let segs = tokenize("cmd1 && cmd2", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
    }

    #[test]
    fn test_cmd_caret_escape() {
        let segs = tokenize("echo hello^|world | findstr x", ShellType::Cmd);
        // ^| is escaped, not a pipe
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, "echo hello^|world");
    }

    #[test]
    fn test_cmd_double_quotes() {
        let segs = tokenize(r#"echo "hello | world" | findstr x"#, ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, r#"echo "hello | world""#);
    }

    #[test]
    fn test_powershell_multibyte_and_operator_no_panic() {
        // Regression test for fuzz crash: multi-byte UTF-8 before -and caused
        // byte/char index mismatch panic in &input[i..] slicing.
        let input = " ?]BB\u{07E7}\u{07E7} -\n-\r-and-~\0\u{c}-and-~\u{1d}";
        let _ = tokenize(input, ShellType::PowerShell);
    }
}