Skip to main content

tirith_core/
tokenize.rs

1use serde::{Deserialize, Serialize};
2
3/// Shell type for tokenization rules.
4#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
5#[serde(rename_all = "lowercase")]
6pub enum ShellType {
7    Posix,
8    Fish,
9    PowerShell,
10}
11
12impl std::str::FromStr for ShellType {
13    type Err = String;
14    fn from_str(s: &str) -> Result<Self, Self::Err> {
15        match s.to_lowercase().as_str() {
16            "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
17            "fish" => Ok(ShellType::Fish),
18            "powershell" | "pwsh" => Ok(ShellType::PowerShell),
19            _ => Err(format!("unknown shell type: {s}")),
20        }
21    }
22}
23
/// One piece of a command line, as delimited by shell separators.
#[derive(Debug, Clone)]
pub struct Segment {
    /// Segment text exactly as written, with surrounding whitespace trimmed.
    pub raw: String,
    /// First word of the segment (the command name), when there is one.
    pub command: Option<String>,
    /// Every word after the command, in order.
    pub args: Vec<String>,
    /// Separator found immediately before this segment (for example `|` or
    /// `&&`); `None` when no separator preceded it.
    pub preceding_separator: Option<String>,
}
36
37/// Tokenize a command string according to shell type.
38pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
39    match shell {
40        ShellType::Posix => tokenize_posix(input),
41        ShellType::Fish => tokenize_fish(input),
42        ShellType::PowerShell => tokenize_powershell(input),
43    }
44}
45
46fn tokenize_posix(input: &str) -> Vec<Segment> {
47    let mut segments = Vec::new();
48    let mut current = String::new();
49    let mut preceding_sep = None;
50    let chars: Vec<char> = input.chars().collect();
51    let len = chars.len();
52    let mut i = 0;
53
54    while i < len {
55        let ch = chars[i];
56
57        match ch {
58            // Backslash escaping
59            '\\' if i + 1 < len => {
60                current.push(chars[i]);
61                current.push(chars[i + 1]);
62                i += 2;
63                continue;
64            }
65            // Single quotes: everything literal until closing quote
66            '\'' => {
67                current.push(ch);
68                i += 1;
69                while i < len && chars[i] != '\'' {
70                    current.push(chars[i]);
71                    i += 1;
72                }
73                if i < len {
74                    current.push(chars[i]); // closing quote
75                    i += 1;
76                }
77                continue;
78            }
79            // Double quotes: allow backslash escaping inside
80            '"' => {
81                current.push(ch);
82                i += 1;
83                while i < len && chars[i] != '"' {
84                    if chars[i] == '\\' && i + 1 < len {
85                        current.push(chars[i]);
86                        current.push(chars[i + 1]);
87                        i += 2;
88                    } else {
89                        current.push(chars[i]);
90                        i += 1;
91                    }
92                }
93                if i < len {
94                    current.push(chars[i]); // closing quote
95                    i += 1;
96                }
97                continue;
98            }
99            // Pipe operators
100            '|' => {
101                if i + 1 < len && chars[i + 1] == '|' {
102                    // ||
103                    push_segment(&mut segments, &current, preceding_sep.take());
104                    current.clear();
105                    preceding_sep = Some("||".to_string());
106                    i += 2;
107                    continue;
108                } else if i + 1 < len && chars[i + 1] == '&' {
109                    // |& (bash: pipe stderr too)
110                    push_segment(&mut segments, &current, preceding_sep.take());
111                    current.clear();
112                    preceding_sep = Some("|&".to_string());
113                    i += 2;
114                    continue;
115                } else {
116                    // |
117                    push_segment(&mut segments, &current, preceding_sep.take());
118                    current.clear();
119                    preceding_sep = Some("|".to_string());
120                    i += 1;
121                    continue;
122                }
123            }
124            // && operator
125            '&' if i + 1 < len && chars[i + 1] == '&' => {
126                push_segment(&mut segments, &current, preceding_sep.take());
127                current.clear();
128                preceding_sep = Some("&&".to_string());
129                i += 2;
130                continue;
131            }
132            // Semicolon
133            ';' => {
134                push_segment(&mut segments, &current, preceding_sep.take());
135                current.clear();
136                preceding_sep = Some(";".to_string());
137                i += 1;
138                continue;
139            }
140            // Newline
141            '\n' => {
142                push_segment(&mut segments, &current, preceding_sep.take());
143                current.clear();
144                preceding_sep = Some("\n".to_string());
145                i += 1;
146                continue;
147            }
148            _ => {
149                current.push(ch);
150                i += 1;
151            }
152        }
153    }
154
155    push_segment(&mut segments, &current, preceding_sep.take());
156    segments
157}
158
159fn tokenize_fish(input: &str) -> Vec<Segment> {
160    // Fish is similar to POSIX but with some differences:
161    // - No backslash-newline continuation
162    // - Different quoting rules (but close enough for our purposes)
163    // For URL extraction, POSIX tokenization works well enough
164    tokenize_posix(input)
165}
166
167fn tokenize_powershell(input: &str) -> Vec<Segment> {
168    let mut segments = Vec::new();
169    let mut current = String::new();
170    let mut preceding_sep = None;
171    let chars: Vec<char> = input.chars().collect();
172    let len = chars.len();
173    let mut i = 0;
174
175    while i < len {
176        let ch = chars[i];
177
178        match ch {
179            // Backtick escaping in PowerShell
180            '`' if i + 1 < len => {
181                current.push(chars[i]);
182                current.push(chars[i + 1]);
183                i += 2;
184                continue;
185            }
186            // Single quotes: literal
187            '\'' => {
188                current.push(ch);
189                i += 1;
190                while i < len && chars[i] != '\'' {
191                    current.push(chars[i]);
192                    i += 1;
193                }
194                if i < len {
195                    current.push(chars[i]);
196                    i += 1;
197                }
198                continue;
199            }
200            // Double quotes
201            '"' => {
202                current.push(ch);
203                i += 1;
204                while i < len && chars[i] != '"' {
205                    if chars[i] == '`' && i + 1 < len {
206                        current.push(chars[i]);
207                        current.push(chars[i + 1]);
208                        i += 2;
209                    } else {
210                        current.push(chars[i]);
211                        i += 1;
212                    }
213                }
214                if i < len {
215                    current.push(chars[i]);
216                    i += 1;
217                }
218                continue;
219            }
220            // Pipe
221            '|' => {
222                push_segment(&mut segments, &current, preceding_sep.take());
223                current.clear();
224                preceding_sep = Some("|".to_string());
225                i += 1;
226                continue;
227            }
228            // Semicolon
229            ';' => {
230                push_segment(&mut segments, &current, preceding_sep.take());
231                current.clear();
232                preceding_sep = Some(";".to_string());
233                i += 1;
234                continue;
235            }
236            // Check for -and / -or operators (PowerShell logical)
237            '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
238                let remaining = &input[i..];
239                if remaining.starts_with("-and")
240                    && remaining[4..]
241                        .chars()
242                        .next()
243                        .is_none_or(|c| c.is_whitespace())
244                {
245                    push_segment(&mut segments, &current, preceding_sep.take());
246                    current.clear();
247                    preceding_sep = Some("-and".to_string());
248                    i += 4;
249                    continue;
250                } else if remaining.starts_with("-or")
251                    && remaining[3..]
252                        .chars()
253                        .next()
254                        .is_none_or(|c| c.is_whitespace())
255                {
256                    push_segment(&mut segments, &current, preceding_sep.take());
257                    current.clear();
258                    preceding_sep = Some("-or".to_string());
259                    i += 3;
260                    continue;
261                }
262                current.push(ch);
263                i += 1;
264            }
265            '\n' => {
266                push_segment(&mut segments, &current, preceding_sep.take());
267                current.clear();
268                preceding_sep = Some("\n".to_string());
269                i += 1;
270                continue;
271            }
272            _ => {
273                current.push(ch);
274                i += 1;
275            }
276        }
277    }
278
279    push_segment(&mut segments, &current, preceding_sep.take());
280    segments
281}
282
283fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
284    let trimmed = raw.trim();
285    if trimmed.is_empty() {
286        return;
287    }
288
289    let words = split_words(trimmed);
290    let command = words.first().cloned();
291    let args = if words.len() > 1 {
292        words[1..].to_vec()
293    } else {
294        Vec::new()
295    };
296
297    segments.push(Segment {
298        raw: trimmed.to_string(),
299        command,
300        args,
301        preceding_separator: preceding_sep,
302    });
303}
304
/// Break a segment into words at unquoted spaces and tabs.
///
/// Quotes and backslashes stay in the returned words, not stripped:
///
/// * single-quoted text is copied verbatim up to the closing quote;
/// * double-quoted text is copied up to the closing quote, with
///   backslash pairs honoured inside;
/// * outside quotes, a backslash and the character after it are copied
///   as a pair, so an escaped space does not end the word.
///
/// Runs of spaces/tabs never produce empty words, and an unterminated
/// quote extends to the end of the input.
fn split_words(input: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut rest = input.chars();

    while let Some(c) = rest.next() {
        match c {
            // A blank ends the word in progress; extra blanks are ignored.
            ' ' | '\t' => {
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            '\'' => {
                word.push(c);
                for quoted in rest.by_ref() {
                    word.push(quoted);
                    if quoted == '\'' {
                        break;
                    }
                }
            }
            '"' => {
                word.push(c);
                while let Some(quoted) = rest.next() {
                    word.push(quoted);
                    match quoted {
                        '"' => break,
                        // Skip over the escaped character so `\"` does not
                        // close the string.
                        '\\' => {
                            if let Some(escaped) = rest.next() {
                                word.push(escaped);
                            }
                        }
                        _ => {}
                    }
                }
            }
            '\\' => {
                word.push(c);
                if let Some(escaped) = rest.next() {
                    word.push(escaped);
                }
            }
            _ => word.push(c),
        }
    }

    if !word.is_empty() {
        words.push(word);
    }

    words
}
376
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `input` with the POSIX rules.
    fn posix(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::Posix)
    }

    /// Tokenize `input` with the PowerShell rules.
    fn powershell(input: &str) -> Vec<Segment> {
        tokenize(input, ShellType::PowerShell)
    }

    /// The command word of every segment, in order.
    fn commands(segments: &[Segment]) -> Vec<Option<&str>> {
        segments.iter().map(|seg| seg.command.as_deref()).collect()
    }

    /// The separator recorded in front of every segment, in order.
    fn separators(segments: &[Segment]) -> Vec<Option<&str>> {
        segments
            .iter()
            .map(|seg| seg.preceding_separator.as_deref())
            .collect()
    }

    #[test]
    fn test_simple_pipe() {
        let segs = posix("echo hello | grep world");
        assert_eq!(commands(&segs), [Some("echo"), Some("grep")]);
        assert_eq!(separators(&segs)[1], Some("|"));
    }

    #[test]
    fn test_quoted_pipe() {
        let segs = posix(r#"echo "hello | world" | bash"#);
        assert_eq!(segs.len(), 2);
        let (first, second) = (&segs[0], &segs[1]);
        assert_eq!(first.raw, r#"echo "hello | world""#);
        assert_eq!(second.command.as_deref(), Some("bash"));
    }

    #[test]
    fn test_and_or() {
        let segs = posix("cmd1 && cmd2 || cmd3");
        assert_eq!(segs.len(), 3);
        // The separator in front of the first segment is not checked here.
        assert_eq!(separators(&segs)[1..], [Some("&&"), Some("||")]);
    }

    #[test]
    fn test_semicolon() {
        let segs = posix("cmd1; cmd2");
        assert_eq!(segs.len(), 2);
        assert_eq!(separators(&segs)[1], Some(";"));
    }

    #[test]
    fn test_pipe_ampersand() {
        let segs = posix("cmd1 |& cmd2");
        assert_eq!(segs.len(), 2);
        assert_eq!(separators(&segs)[1], Some("|&"));
    }

    #[test]
    fn test_powershell_pipe() {
        let segs = powershell("iwr url | iex");
        assert_eq!(commands(&segs), [Some("iwr"), Some("iex")]);
    }

    #[test]
    fn test_powershell_backtick() {
        // The backtick escapes the pipe, so no split happens.
        let segs = powershell("echo `| not a pipe");
        assert_eq!(segs.len(), 1);
    }

    #[test]
    fn test_single_quotes() {
        let segs = posix("echo 'hello | world' | bash");
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_backslash_escape() {
        // The backslash-pipe stays inside the first segment.
        let segs = posix("echo hello\\|world | bash");
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_empty_input() {
        assert!(posix("").is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        assert!(posix("   ").is_empty());
    }

    #[test]
    fn test_args_extraction() {
        let segs = posix("curl -sSL https://example.com");
        assert_eq!(commands(&segs), [Some("curl")]);
        assert_eq!(segs[0].args.len(), 2);
    }
}