tirith_core/
tokenize.rs

1use serde::{Deserialize, Serialize};
2
3/// Shell type for tokenization rules.
4#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
5#[serde(rename_all = "lowercase")]
6pub enum ShellType {
7    Posix,
8    Fish,
9    PowerShell,
10    Cmd,
11}
12
13impl std::str::FromStr for ShellType {
14    type Err = String;
15    fn from_str(s: &str) -> Result<Self, Self::Err> {
16        match s.to_lowercase().as_str() {
17            "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
18            "fish" => Ok(ShellType::Fish),
19            "powershell" | "pwsh" => Ok(ShellType::PowerShell),
20            "cmd" | "cmd.exe" => Ok(ShellType::Cmd),
21            _ => Err(format!("unknown shell type: {s}")),
22        }
23    }
24}
25
/// One piece of a tokenized command line: the text between two separators.
#[derive(Debug, Clone)]
pub struct Segment {
    /// Text of the segment with surrounding whitespace trimmed off.
    pub raw: String,
    /// First word of the segment that is not a leading `NAME=VALUE`
    /// assignment; `None` when no such word exists.
    pub command: Option<String>,
    /// Words that follow `command`.
    pub args: Vec<String>,
    /// Separator found in front of this segment (e.g. `|`, `&&`), or `None`
    /// when nothing preceded it.
    pub preceding_separator: Option<String>,
    /// Byte span of the *trimmed* segment text inside the original input, so
    /// that `input[segment.byte_range.clone()] == segment.raw`.
    ///
    /// Downstream rules use it to carve specific segments out of the input
    /// (e.g. args to `tirith diff/score/why`). Production code obtains the
    /// range from `push_segment`, which derives it from `input`; test helpers
    /// that build a `Segment` by hand may store any range.
    pub byte_range: std::ops::Range<usize>,
}
46
47/// Tokenize a command string according to shell type.
48pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
49    match shell {
50        ShellType::Posix => tokenize_posix(input),
51        ShellType::Fish => tokenize_fish(input),
52        ShellType::PowerShell => tokenize_powershell(input),
53        ShellType::Cmd => tokenize_cmd(input),
54    }
55}
56
57fn tokenize_posix(input: &str) -> Vec<Segment> {
58    let mut segments = Vec::new();
59    let mut current = String::new();
60    let mut preceding_sep = None;
61    let mut search_cursor: usize = 0;
62    let chars: Vec<char> = input.chars().collect();
63    let len = chars.len();
64    let mut i = 0;
65
66    while i < len {
67        let ch = chars[i];
68
69        match ch {
70            // Backslash escaping
71            '\\' if i + 1 < len => {
72                current.push(chars[i]);
73                current.push(chars[i + 1]);
74                i += 2;
75                continue;
76            }
77            // Single quotes: everything literal until closing quote
78            '\'' => {
79                current.push(ch);
80                i += 1;
81                while i < len && chars[i] != '\'' {
82                    current.push(chars[i]);
83                    i += 1;
84                }
85                if i < len {
86                    current.push(chars[i]); // closing quote
87                    i += 1;
88                }
89                continue;
90            }
91            // Double quotes: allow backslash escaping inside
92            '"' => {
93                current.push(ch);
94                i += 1;
95                while i < len && chars[i] != '"' {
96                    if chars[i] == '\\' && i + 1 < len {
97                        current.push(chars[i]);
98                        current.push(chars[i + 1]);
99                        i += 2;
100                    } else {
101                        current.push(chars[i]);
102                        i += 1;
103                    }
104                }
105                if i < len {
106                    current.push(chars[i]); // closing quote
107                    i += 1;
108                }
109                continue;
110            }
111            // Pipe operators
112            '|' => {
113                if i + 1 < len && chars[i + 1] == '|' {
114                    // ||
115                    push_segment(
116                        &mut segments,
117                        &current,
118                        preceding_sep.take(),
119                        input,
120                        &mut search_cursor,
121                    );
122                    current.clear();
123                    preceding_sep = Some("||".to_string());
124                    i += 2;
125                    continue;
126                } else if i + 1 < len && chars[i + 1] == '&' {
127                    // |& (bash: pipe stderr too)
128                    push_segment(
129                        &mut segments,
130                        &current,
131                        preceding_sep.take(),
132                        input,
133                        &mut search_cursor,
134                    );
135                    current.clear();
136                    preceding_sep = Some("|&".to_string());
137                    i += 2;
138                    continue;
139                } else {
140                    // |
141                    push_segment(
142                        &mut segments,
143                        &current,
144                        preceding_sep.take(),
145                        input,
146                        &mut search_cursor,
147                    );
148                    current.clear();
149                    preceding_sep = Some("|".to_string());
150                    i += 1;
151                    continue;
152                }
153            }
154            // && operator
155            '&' if i + 1 < len && chars[i + 1] == '&' => {
156                push_segment(
157                    &mut segments,
158                    &current,
159                    preceding_sep.take(),
160                    input,
161                    &mut search_cursor,
162                );
163                current.clear();
164                preceding_sep = Some("&&".to_string());
165                i += 2;
166                continue;
167            }
168            // Semicolon
169            ';' => {
170                push_segment(
171                    &mut segments,
172                    &current,
173                    preceding_sep.take(),
174                    input,
175                    &mut search_cursor,
176                );
177                current.clear();
178                preceding_sep = Some(";".to_string());
179                i += 1;
180                continue;
181            }
182            // Newline
183            '\n' => {
184                push_segment(
185                    &mut segments,
186                    &current,
187                    preceding_sep.take(),
188                    input,
189                    &mut search_cursor,
190                );
191                current.clear();
192                preceding_sep = Some("\n".to_string());
193                i += 1;
194                continue;
195            }
196            _ => {
197                current.push(ch);
198                i += 1;
199            }
200        }
201    }
202
203    push_segment(
204        &mut segments,
205        &current,
206        preceding_sep.take(),
207        input,
208        &mut search_cursor,
209    );
210    segments
211}
212
213fn tokenize_fish(input: &str) -> Vec<Segment> {
214    // Fish is similar to POSIX but with some differences:
215    // - No backslash-newline continuation
216    // - Different quoting rules (but close enough for our purposes)
217    // For URL extraction, POSIX tokenization works well enough
218    tokenize_posix(input)
219}
220
221fn tokenize_powershell(input: &str) -> Vec<Segment> {
222    let mut segments = Vec::new();
223    let mut current = String::new();
224    let mut preceding_sep = None;
225    let mut search_cursor: usize = 0;
226    // Collect (byte_offset, char) pairs so byte slicing stays valid for multi-byte UTF-8.
227    let indexed: Vec<(usize, char)> = input.char_indices().collect();
228    let len = indexed.len();
229    let mut i = 0;
230
231    while i < len {
232        let (byte_off, ch) = indexed[i];
233
234        match ch {
235            // Backtick escaping in PowerShell
236            '`' if i + 1 < len => {
237                current.push(indexed[i].1);
238                current.push(indexed[i + 1].1);
239                i += 2;
240                continue;
241            }
242            // Single quotes: literal
243            '\'' => {
244                current.push(ch);
245                i += 1;
246                while i < len && indexed[i].1 != '\'' {
247                    current.push(indexed[i].1);
248                    i += 1;
249                }
250                if i < len {
251                    current.push(indexed[i].1);
252                    i += 1;
253                }
254                continue;
255            }
256            // Double quotes
257            '"' => {
258                current.push(ch);
259                i += 1;
260                while i < len && indexed[i].1 != '"' {
261                    if indexed[i].1 == '`' && i + 1 < len {
262                        current.push(indexed[i].1);
263                        current.push(indexed[i + 1].1);
264                        i += 2;
265                    } else {
266                        current.push(indexed[i].1);
267                        i += 1;
268                    }
269                }
270                if i < len {
271                    current.push(indexed[i].1);
272                    i += 1;
273                }
274                continue;
275            }
276            // Pipe
277            '|' => {
278                push_segment(
279                    &mut segments,
280                    &current,
281                    preceding_sep.take(),
282                    input,
283                    &mut search_cursor,
284                );
285                current.clear();
286                preceding_sep = Some("|".to_string());
287                i += 1;
288                continue;
289            }
290            // Semicolon
291            ';' => {
292                push_segment(
293                    &mut segments,
294                    &current,
295                    preceding_sep.take(),
296                    input,
297                    &mut search_cursor,
298                );
299                current.clear();
300                preceding_sep = Some(";".to_string());
301                i += 1;
302                continue;
303            }
304            // Check for -and / -or operators (PowerShell logical)
305            '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
306                let remaining = &input[byte_off..];
307                if remaining.starts_with("-and")
308                    && remaining[4..]
309                        .chars()
310                        .next()
311                        .is_none_or(|c| c.is_whitespace())
312                {
313                    push_segment(
314                        &mut segments,
315                        &current,
316                        preceding_sep.take(),
317                        input,
318                        &mut search_cursor,
319                    );
320                    current.clear();
321                    preceding_sep = Some("-and".to_string());
322                    i += 4;
323                    continue;
324                } else if remaining.starts_with("-or")
325                    && remaining[3..]
326                        .chars()
327                        .next()
328                        .is_none_or(|c| c.is_whitespace())
329                {
330                    push_segment(
331                        &mut segments,
332                        &current,
333                        preceding_sep.take(),
334                        input,
335                        &mut search_cursor,
336                    );
337                    current.clear();
338                    preceding_sep = Some("-or".to_string());
339                    i += 3;
340                    continue;
341                }
342                current.push(ch);
343                i += 1;
344            }
345            '\n' => {
346                push_segment(
347                    &mut segments,
348                    &current,
349                    preceding_sep.take(),
350                    input,
351                    &mut search_cursor,
352                );
353                current.clear();
354                preceding_sep = Some("\n".to_string());
355                i += 1;
356                continue;
357            }
358            _ => {
359                current.push(ch);
360                i += 1;
361            }
362        }
363    }
364
365    push_segment(
366        &mut segments,
367        &current,
368        preceding_sep.take(),
369        input,
370        &mut search_cursor,
371    );
372    segments
373}
374
375fn tokenize_cmd(input: &str) -> Vec<Segment> {
376    let mut segments = Vec::new();
377    let mut current = String::new();
378    let mut preceding_sep = None;
379    let mut search_cursor: usize = 0;
380    let chars: Vec<char> = input.chars().collect();
381    let len = chars.len();
382    let mut i = 0;
383
384    while i < len {
385        let ch = chars[i];
386        match ch {
387            // Caret escaping (cmd.exe escape character)
388            '^' if i + 1 < len => {
389                current.push(chars[i]);
390                current.push(chars[i + 1]);
391                i += 2;
392                continue;
393            }
394            // Double quotes (only quoting mechanism in cmd)
395            '"' => {
396                current.push(ch);
397                i += 1;
398                while i < len && chars[i] != '"' {
399                    current.push(chars[i]);
400                    i += 1;
401                }
402                if i < len {
403                    current.push(chars[i]);
404                    i += 1;
405                }
406                continue;
407            }
408            // Pipe
409            '|' => {
410                if i + 1 < len && chars[i + 1] == '|' {
411                    push_segment(
412                        &mut segments,
413                        &current,
414                        preceding_sep.take(),
415                        input,
416                        &mut search_cursor,
417                    );
418                    current.clear();
419                    preceding_sep = Some("||".to_string());
420                    i += 2;
421                } else {
422                    push_segment(
423                        &mut segments,
424                        &current,
425                        preceding_sep.take(),
426                        input,
427                        &mut search_cursor,
428                    );
429                    current.clear();
430                    preceding_sep = Some("|".to_string());
431                    i += 1;
432                }
433                continue;
434            }
435            // & and &&
436            '&' => {
437                if i + 1 < len && chars[i + 1] == '&' {
438                    push_segment(
439                        &mut segments,
440                        &current,
441                        preceding_sep.take(),
442                        input,
443                        &mut search_cursor,
444                    );
445                    current.clear();
446                    preceding_sep = Some("&&".to_string());
447                    i += 2;
448                } else {
449                    push_segment(
450                        &mut segments,
451                        &current,
452                        preceding_sep.take(),
453                        input,
454                        &mut search_cursor,
455                    );
456                    current.clear();
457                    preceding_sep = Some("&".to_string());
458                    i += 1;
459                }
460                continue;
461            }
462            '\n' => {
463                push_segment(
464                    &mut segments,
465                    &current,
466                    preceding_sep.take(),
467                    input,
468                    &mut search_cursor,
469                );
470                current.clear();
471                preceding_sep = Some("\n".to_string());
472                i += 1;
473                continue;
474            }
475            _ => {
476                current.push(ch);
477                i += 1;
478            }
479        }
480    }
481    push_segment(
482        &mut segments,
483        &current,
484        preceding_sep.take(),
485        input,
486        &mut search_cursor,
487    );
488    segments
489}
490
491/// Push a tokenized segment into `segments`, trimming leading/trailing
492/// whitespace and locating the trimmed content in `input` to populate
493/// `byte_range`.
494///
495/// `search_cursor` is advanced past the pushed segment so subsequent
496/// searches skip already-consumed bytes (handles duplicate segments like
497/// `foo | foo` correctly).
498fn push_segment(
499    segments: &mut Vec<Segment>,
500    raw: &str,
501    preceding_sep: Option<String>,
502    input: &str,
503    search_cursor: &mut usize,
504) {
505    let trimmed = raw.trim();
506    if trimmed.is_empty() {
507        return;
508    }
509
510    // The tokenizer copies input bytes verbatim into the accumulator, so
511    // `trimmed` must appear as a substring of `input` at or after
512    // `*search_cursor`. If it doesn't, callers constructed `raw` in an
513    // unexpected way — fall back to a range that preserves invariants.
514    let byte_range = match input.get(*search_cursor..).and_then(|s| s.find(trimmed)) {
515        Some(rel_pos) => {
516            let start = *search_cursor + rel_pos;
517            let end = start + trimmed.len();
518            *search_cursor = end;
519            start..end
520        }
521        None => {
522            // Shouldn't happen in normal flow; emit a zero-width placeholder
523            // rooted at the cursor so downstream code still gets a valid
524            // Range<usize> and doesn't panic on slicing.
525            let cursor = (*search_cursor).min(input.len());
526            cursor..cursor
527        }
528    };
529
530    let words = split_words(trimmed);
531    // Skip leading environment variable assignments (VAR=VALUE)
532    let first_non_assign = words.iter().position(|w| !is_env_assignment(w));
533    let (command, args) = match first_non_assign {
534        Some(idx) => {
535            let cmd = Some(words[idx].clone());
536            let args = if idx + 1 < words.len() {
537                words[idx + 1..].to_vec()
538            } else {
539                Vec::new()
540            };
541            (cmd, args)
542        }
543        None => {
544            // All words are assignments, no command
545            (None, Vec::new())
546        }
547    };
548
549    segments.push(Segment {
550        raw: trimmed.to_string(),
551        command,
552        args,
553        preceding_separator: preceding_sep,
554        byte_range,
555    });
556}
557
/// Reports whether `word` has the shape of a shell environment variable
/// assignment (`NAME=VALUE`).
///
/// The word is trimmed first. The text before the first `=` must be
/// non-empty, must not begin with `-` or an ASCII digit, and may contain only
/// ASCII letters, ASCII digits and `_`. The value part may be empty.
pub fn is_env_assignment(word: &str) -> bool {
    // Without an `=` there is nothing being assigned.
    let Some((name, _value)) = word.trim().split_once('=') else {
        return false;
    };

    let mut name_chars = name.chars();
    match name_chars.next() {
        // Empty name: the word started with `=`.
        None => false,
        // Flags like `--flag=value` and names with a leading digit are rejected.
        Some(first) if first == '-' || first.is_ascii_digit() => false,
        Some(first) => {
            let is_name_char = |c: char| c.is_ascii_alphanumeric() || c == '_';
            is_name_char(first) && name_chars.all(is_name_char)
        }
    }
}
579
580/// Return the values from leading `NAME=VALUE` tokens in a raw segment.
581/// Stops at the first non-assignment word, matching the shell prefix-assignment model.
582pub fn leading_env_assignments(segment_raw: &str) -> Vec<(String, String)> {
583    let mut assignments = Vec::new();
584    for word in split_words(segment_raw.trim()) {
585        if !is_env_assignment(&word) {
586            break;
587        }
588        if let Some((name, value)) = word.split_once('=') {
589            assignments.push((name.to_string(), value.to_string()));
590        }
591    }
592    assignments
593}
594
595/// Return the values from leading `NAME=VALUE` tokens in a raw segment.
596/// Stops at the first non-assignment word, matching the shell prefix-assignment model.
597pub fn leading_env_assignment_values(segment_raw: &str) -> Vec<String> {
598    leading_env_assignments(segment_raw)
599        .into_iter()
600        .map(|(_, value)| value)
601        .collect()
602}
603
/// Breaks a segment into words at unquoted spaces and tabs.
///
/// Quote characters and backslashes are kept in the words exactly as written:
/// - single quotes are copied through the closing quote;
/// - double quotes are copied through the closing quote, with a backslash
///   keeping the character after it (so `\"` does not end the quote);
/// - outside quotes a backslash keeps the character after it, which lets an
///   escaped space stay inside a word.
///
/// An unterminated quote runs to the end of the input. Only ` ` and `\t`
/// separate words; other whitespace characters are treated as word content.
fn split_words(input: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut rest = input.chars();

    while let Some(ch) = rest.next() {
        match ch {
            // A separator closes the word in progress; runs of separators add nothing.
            ' ' | '\t' => {
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            '\'' => {
                word.push(ch);
                for inner in rest.by_ref() {
                    word.push(inner);
                    if inner == '\'' {
                        break;
                    }
                }
            }
            '"' => {
                word.push(ch);
                while let Some(inner) = rest.next() {
                    word.push(inner);
                    match inner {
                        '"' => break,
                        '\\' => {
                            if let Some(escaped) = rest.next() {
                                word.push(escaped);
                            }
                        }
                        _ => {}
                    }
                }
            }
            '\\' => {
                word.push(ch);
                if let Some(escaped) = rest.next() {
                    word.push(escaped);
                }
            }
            _ => word.push(ch),
        }
    }

    if !word.is_empty() {
        words.push(word);
    }

    words
}
675
#[cfg(test)]
mod tests {
    use super::*;

    // ---- shared helpers -------------------------------------------------

    /// Tokenizes `input` with the POSIX rules.
    fn posix(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::Posix)
    }

    /// Tokenizes `input` with the PowerShell rules.
    fn powershell(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::PowerShell)
    }

    /// Tokenizes `input` with the cmd.exe rules.
    fn cmd_exe(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::Cmd)
    }

    /// Command word of a segment as a `&str`.
    fn command_of(seg: &Segment) -> Option<&str> {
        seg.command.as_deref()
    }

    /// Separator recorded in front of a segment as a `&str`.
    fn separator_of(seg: &Segment) -> Option<&str> {
        seg.preceding_separator.as_deref()
    }

    /// Slice of `input` covered by the segment's `byte_range`.
    fn slice_of<'a>(input: &'a str, seg: &Segment) -> &'a str {
        &input[seg.byte_range.clone()]
    }

    // ---- POSIX splitting ------------------------------------------------

    #[test]
    fn test_simple_pipe() {
        let parsed = posix("echo hello | grep world");
        assert_eq!(parsed.len(), 2);
        assert_eq!(command_of(&parsed[0]), Some("echo"));
        assert_eq!(command_of(&parsed[1]), Some("grep"));
        assert_eq!(separator_of(&parsed[1]), Some("|"));
    }

    #[test]
    fn test_quoted_pipe() {
        let parsed = posix(r#"echo "hello | world" | bash"#);
        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].raw, r#"echo "hello | world""#);
        assert_eq!(command_of(&parsed[1]), Some("bash"));
    }

    #[test]
    fn test_and_or() {
        let parsed = posix("cmd1 && cmd2 || cmd3");
        assert_eq!(parsed.len(), 3);
        assert_eq!(separator_of(&parsed[1]), Some("&&"));
        assert_eq!(separator_of(&parsed[2]), Some("||"));
    }

    #[test]
    fn test_semicolon() {
        let parsed = posix("cmd1; cmd2");
        assert_eq!(parsed.len(), 2);
        assert_eq!(separator_of(&parsed[1]), Some(";"));
    }

    #[test]
    fn test_pipe_ampersand() {
        let parsed = posix("cmd1 |& cmd2");
        assert_eq!(parsed.len(), 2);
        assert_eq!(separator_of(&parsed[1]), Some("|&"));
    }

    // ---- PowerShell splitting -------------------------------------------

    #[test]
    fn test_powershell_pipe() {
        let parsed = powershell("iwr url | iex");
        assert_eq!(parsed.len(), 2);
        assert_eq!(command_of(&parsed[0]), Some("iwr"));
        assert_eq!(command_of(&parsed[1]), Some("iex"));
    }

    #[test]
    fn test_powershell_backtick() {
        // The backtick escapes the pipe, so nothing is split.
        let parsed = powershell("echo `| not a pipe");
        assert_eq!(parsed.len(), 1);
    }

    // ---- POSIX quoting and edge cases -----------------------------------

    #[test]
    fn test_single_quotes() {
        let parsed = posix("echo 'hello | world' | bash");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn test_backslash_escape() {
        // The backslash-escaped pipe stays inside the first segment.
        let parsed = posix("echo hello\\|world | bash");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn test_empty_input() {
        assert!(posix("").is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(posix("   ").is_empty());
    }

    // ---- command / argument extraction ----------------------------------

    #[test]
    fn test_args_extraction() {
        let parsed = posix("curl -sSL https://example.com");
        assert_eq!(parsed.len(), 1);
        assert_eq!(command_of(&parsed[0]), Some("curl"));
        assert_eq!(parsed[0].args.len(), 2);
    }

    #[test]
    fn test_env_prefix_skipped() {
        let parsed = posix("TIRITH=0 curl evil.com");
        assert_eq!(parsed.len(), 1);
        assert_eq!(command_of(&parsed[0]), Some("curl"));
        assert_eq!(parsed[0].args, vec!["evil.com"]);
    }

    #[test]
    fn test_multiple_env_prefixes() {
        let parsed = posix("FOO=bar BAZ=1 python script.py");
        assert_eq!(parsed.len(), 1);
        assert_eq!(command_of(&parsed[0]), Some("python"));
        assert_eq!(parsed[0].args, vec!["script.py"]);
    }

    #[test]
    fn test_env_only_no_command() {
        let parsed = posix("TIRITH=0");
        assert_eq!(parsed.len(), 1);
        assert_eq!(parsed[0].command, None);
        assert!(parsed[0].args.is_empty());
    }

    #[test]
    fn test_is_env_assignment() {
        // Accepted shapes.
        assert!(is_env_assignment("FOO=bar"));
        assert!(is_env_assignment("TIRITH=0"));
        assert!(is_env_assignment("PATH=/usr/bin"));
        assert!(is_env_assignment("A="));

        // Rejected shapes.
        assert!(!is_env_assignment("-o"));
        assert!(!is_env_assignment("curl"));
        assert!(!is_env_assignment("=value"));
        assert!(!is_env_assignment("--flag=value"));
        assert!(!is_env_assignment("1FOO=bar"));
    }

    #[test]
    fn test_leading_env_assignment_values() {
        let single = "URL=https://example.com curl ok";
        assert_eq!(
            leading_env_assignment_values(single),
            vec!["https://example.com"]
        );

        let quoted_pair = "URL='https://example.com/a' FOO=bar curl ok";
        assert_eq!(
            leading_env_assignments(quoted_pair),
            vec![
                ("URL".to_string(), "'https://example.com/a'".to_string()),
                ("FOO".to_string(), "bar".to_string())
            ]
        );
        assert_eq!(
            leading_env_assignment_values(quoted_pair),
            vec!["'https://example.com/a'", "bar"]
        );

        let not_leading = "env URL=https://example.com curl ok";
        assert!(leading_env_assignment_values(not_leading).is_empty());
    }

    // ---- cmd.exe splitting ----------------------------------------------

    #[test]
    fn test_cmd_pipe() {
        let parsed = cmd_exe("dir | findstr foo");
        assert_eq!(parsed.len(), 2);
        assert_eq!(command_of(&parsed[0]), Some("dir"));
        assert_eq!(command_of(&parsed[1]), Some("findstr"));
    }

    #[test]
    fn test_cmd_ampersand_separator() {
        let parsed = cmd_exe("dir & echo done");
        assert_eq!(parsed.len(), 2);
        assert_eq!(separator_of(&parsed[1]), Some("&"));
    }

    #[test]
    fn test_cmd_double_ampersand() {
        let parsed = cmd_exe("cmd1 && cmd2");
        assert_eq!(parsed.len(), 2);
        assert_eq!(separator_of(&parsed[1]), Some("&&"));
    }

    #[test]
    fn test_cmd_caret_escape() {
        // `^|` is an escaped pipe, only the second pipe splits.
        let parsed = cmd_exe("echo hello^|world | findstr x");
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn test_cmd_double_quotes() {
        let parsed = cmd_exe(r#"echo "hello | world" | findstr x"#);
        assert_eq!(parsed.len(), 2);
    }

    #[test]
    fn test_powershell_multibyte_and_operator_no_panic() {
        // Regression test for a fuzz crash: multi-byte UTF-8 in front of -and
        // caused a byte/char index mismatch panic when slicing the input.
        let input = " ?]BB\u{07E7}\u{07E7} -\n-\r-and-~\0\u{c}-and-~\u{1d}";
        let _ = powershell(input);
    }

    // ---- byte_range invariant -------------------------------------------
    //
    // For every segment produced by every shell tokenizer,
    // `input[segment.byte_range.clone()] == segment.raw`. The range covers the
    // TRIMMED content, not the raw accumulator (see push_segment for why).

    fn assert_byte_ranges_match_raw(input: &str, segs: &[Segment]) {
        for (i, seg) in segs.iter().enumerate() {
            let covered = slice_of(input, seg);
            assert_eq!(
                covered,
                seg.raw,
                "segment {i} byte_range {:?} does not match raw {:?} in input {:?}",
                seg.byte_range,
                seg.raw,
                input
            );
        }
    }

    #[test]
    fn test_byte_range_posix_simple_pipe() {
        let input = "foo bar | baz";
        let parsed = posix(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(slice_of(input, &parsed[0]), "foo bar");
        assert_eq!(slice_of(input, &parsed[1]), "baz");
    }

    #[test]
    fn test_byte_range_posix_leading_trailing_whitespace() {
        // push_segment trims, so byte_range must describe the trimmed content.
        let input = "  foo bar  | baz  ";
        let parsed = posix(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(parsed[0].byte_range, 2..9); // "foo bar"
        assert_eq!(parsed[1].byte_range, 13..16); // "baz"
    }

    #[test]
    fn test_byte_range_posix_duplicate_segments() {
        // search_cursor has to advance, otherwise every duplicate would match
        // at the first position.
        let input = "foo | foo | foo";
        let parsed = posix(input);
        assert_eq!(parsed.len(), 3);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(parsed[0].byte_range, 0..3);
        assert_eq!(parsed[1].byte_range, 6..9);
        assert_eq!(parsed[2].byte_range, 12..15);
    }

    #[test]
    fn test_byte_range_posix_with_quoted_pipe() {
        // The quoted pipe stays in its segment; byte_range spans both quotes.
        let input = r#"echo "a | b" | grep x"#;
        let parsed = posix(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(parsed[0].raw, r#"echo "a | b""#);
    }

    #[test]
    fn test_byte_range_posix_multibyte_content() {
        // Multi-byte UTF-8 inside a segment: raw must remain a byte-exact
        // substring of the input rather than a char-index slice.
        let input = "echo 日本語 | grep x";
        let parsed = posix(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(parsed[0].raw, "echo 日本語");
    }

    #[test]
    fn test_byte_range_powershell_simple_pipe() {
        let input = "Get-Process | Where-Object { $_.Name -eq 'x' }";
        let parsed = powershell(input);
        assert!(parsed.len() >= 2);
        assert_byte_ranges_match_raw(input, &parsed);
    }

    #[test]
    fn test_byte_range_cmd_pipe() {
        let input = "dir | findstr foo";
        let parsed = cmd_exe(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
    }

    #[test]
    fn test_byte_range_fish_delegates_to_posix() {
        // Fish goes through tokenize_posix, so byte_range behaves identically.
        let input = "echo hi | cat";
        let parsed = tokenize(input, ShellType::Fish);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
    }

    #[test]
    fn test_byte_range_empty_input() {
        assert!(posix("").is_empty());
    }

    #[test]
    fn test_byte_range_whitespace_only() {
        assert!(posix("   \t  ").is_empty());
    }

    #[test]
    fn test_byte_range_sequence_operators() {
        let input = "ls && echo done";
        let parsed = posix(input);
        assert_eq!(parsed.len(), 2);
        assert_byte_ranges_match_raw(input, &parsed);
        assert_eq!(parsed[0].byte_range, 0..2); // "ls"
        assert_eq!(parsed[1].byte_range, 6..15); // "echo done"
    }
}
tirith_core/tokenize.rs

tirith_core/
tokenize.rs