1use serde::{Deserialize, Serialize};
2
/// Shell dialect whose quoting and separator rules are applied when a
/// command line is tokenized.
///
/// With serde the variant names are written in all-lowercase form
/// (`rename_all = "lowercase"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ShellType {
    /// POSIX-style shells; `FromStr` maps `posix`, `bash`, `zsh` and `sh` here.
    Posix,
    /// The fish shell (`fish`).
    Fish,
    /// PowerShell (`powershell` or `pwsh`).
    PowerShell,
    /// The Windows command interpreter (`cmd` or `cmd.exe`).
    Cmd,
}
12
13impl std::str::FromStr for ShellType {
14 type Err = String;
15 fn from_str(s: &str) -> Result<Self, Self::Err> {
16 match s.to_lowercase().as_str() {
17 "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
18 "fish" => Ok(ShellType::Fish),
19 "powershell" | "pwsh" => Ok(ShellType::PowerShell),
20 "cmd" | "cmd.exe" => Ok(ShellType::Cmd),
21 _ => Err(format!("unknown shell type: {s}")),
22 }
23 }
24}
25
/// One command of a tokenized command line, i.e. the text found between two
/// separators (or between a separator and either end of the input).
#[derive(Debug, Clone)]
pub struct Segment {
    /// Text of the segment with surrounding whitespace trimmed; quotes and
    /// escape characters are kept as written.
    pub raw: String,
    /// First word that is not a `NAME=value` environment assignment, or
    /// `None` when every word of the segment is such an assignment.
    pub command: Option<String>,
    /// The words that follow `command`, in order.
    pub args: Vec<String>,
    /// Separator text found before this segment (for example `|` or `&&`);
    /// `None` when no separator preceded it.
    pub preceding_separator: Option<String>,
}
38
39pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
41 match shell {
42 ShellType::Posix => tokenize_posix(input),
43 ShellType::Fish => tokenize_fish(input),
44 ShellType::PowerShell => tokenize_powershell(input),
45 ShellType::Cmd => tokenize_cmd(input),
46 }
47}
48
49fn tokenize_posix(input: &str) -> Vec<Segment> {
50 let mut segments = Vec::new();
51 let mut current = String::new();
52 let mut preceding_sep = None;
53 let chars: Vec<char> = input.chars().collect();
54 let len = chars.len();
55 let mut i = 0;
56
57 while i < len {
58 let ch = chars[i];
59
60 match ch {
61 '\\' if i + 1 < len => {
63 current.push(chars[i]);
64 current.push(chars[i + 1]);
65 i += 2;
66 continue;
67 }
68 '\'' => {
70 current.push(ch);
71 i += 1;
72 while i < len && chars[i] != '\'' {
73 current.push(chars[i]);
74 i += 1;
75 }
76 if i < len {
77 current.push(chars[i]); i += 1;
79 }
80 continue;
81 }
82 '"' => {
84 current.push(ch);
85 i += 1;
86 while i < len && chars[i] != '"' {
87 if chars[i] == '\\' && i + 1 < len {
88 current.push(chars[i]);
89 current.push(chars[i + 1]);
90 i += 2;
91 } else {
92 current.push(chars[i]);
93 i += 1;
94 }
95 }
96 if i < len {
97 current.push(chars[i]); i += 1;
99 }
100 continue;
101 }
102 '|' => {
104 if i + 1 < len && chars[i + 1] == '|' {
105 push_segment(&mut segments, ¤t, preceding_sep.take());
107 current.clear();
108 preceding_sep = Some("||".to_string());
109 i += 2;
110 continue;
111 } else if i + 1 < len && chars[i + 1] == '&' {
112 push_segment(&mut segments, ¤t, preceding_sep.take());
114 current.clear();
115 preceding_sep = Some("|&".to_string());
116 i += 2;
117 continue;
118 } else {
119 push_segment(&mut segments, ¤t, preceding_sep.take());
121 current.clear();
122 preceding_sep = Some("|".to_string());
123 i += 1;
124 continue;
125 }
126 }
127 '&' if i + 1 < len && chars[i + 1] == '&' => {
129 push_segment(&mut segments, ¤t, preceding_sep.take());
130 current.clear();
131 preceding_sep = Some("&&".to_string());
132 i += 2;
133 continue;
134 }
135 ';' => {
137 push_segment(&mut segments, ¤t, preceding_sep.take());
138 current.clear();
139 preceding_sep = Some(";".to_string());
140 i += 1;
141 continue;
142 }
143 '\n' => {
145 push_segment(&mut segments, ¤t, preceding_sep.take());
146 current.clear();
147 preceding_sep = Some("\n".to_string());
148 i += 1;
149 continue;
150 }
151 _ => {
152 current.push(ch);
153 i += 1;
154 }
155 }
156 }
157
158 push_segment(&mut segments, ¤t, preceding_sep.take());
159 segments
160}
161
/// Tokenizes a fish command line.
///
/// Currently a plain delegation to [`tokenize_posix`]: no fish-specific
/// syntax is treated differently from the POSIX rules.
fn tokenize_fish(input: &str) -> Vec<Segment> {
    tokenize_posix(input)
}
169
170fn tokenize_powershell(input: &str) -> Vec<Segment> {
171 let mut segments = Vec::new();
172 let mut current = String::new();
173 let mut preceding_sep = None;
174 let indexed: Vec<(usize, char)> = input.char_indices().collect();
176 let len = indexed.len();
177 let mut i = 0;
178
179 while i < len {
180 let (byte_off, ch) = indexed[i];
181
182 match ch {
183 '`' if i + 1 < len => {
185 current.push(indexed[i].1);
186 current.push(indexed[i + 1].1);
187 i += 2;
188 continue;
189 }
190 '\'' => {
192 current.push(ch);
193 i += 1;
194 while i < len && indexed[i].1 != '\'' {
195 current.push(indexed[i].1);
196 i += 1;
197 }
198 if i < len {
199 current.push(indexed[i].1);
200 i += 1;
201 }
202 continue;
203 }
204 '"' => {
206 current.push(ch);
207 i += 1;
208 while i < len && indexed[i].1 != '"' {
209 if indexed[i].1 == '`' && i + 1 < len {
210 current.push(indexed[i].1);
211 current.push(indexed[i + 1].1);
212 i += 2;
213 } else {
214 current.push(indexed[i].1);
215 i += 1;
216 }
217 }
218 if i < len {
219 current.push(indexed[i].1);
220 i += 1;
221 }
222 continue;
223 }
224 '|' => {
226 push_segment(&mut segments, ¤t, preceding_sep.take());
227 current.clear();
228 preceding_sep = Some("|".to_string());
229 i += 1;
230 continue;
231 }
232 ';' => {
234 push_segment(&mut segments, ¤t, preceding_sep.take());
235 current.clear();
236 preceding_sep = Some(";".to_string());
237 i += 1;
238 continue;
239 }
240 '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
242 let remaining = &input[byte_off..];
243 if remaining.starts_with("-and")
244 && remaining[4..]
245 .chars()
246 .next()
247 .is_none_or(|c| c.is_whitespace())
248 {
249 push_segment(&mut segments, ¤t, preceding_sep.take());
250 current.clear();
251 preceding_sep = Some("-and".to_string());
252 i += 4;
253 continue;
254 } else if remaining.starts_with("-or")
255 && remaining[3..]
256 .chars()
257 .next()
258 .is_none_or(|c| c.is_whitespace())
259 {
260 push_segment(&mut segments, ¤t, preceding_sep.take());
261 current.clear();
262 preceding_sep = Some("-or".to_string());
263 i += 3;
264 continue;
265 }
266 current.push(ch);
267 i += 1;
268 }
269 '\n' => {
270 push_segment(&mut segments, ¤t, preceding_sep.take());
271 current.clear();
272 preceding_sep = Some("\n".to_string());
273 i += 1;
274 continue;
275 }
276 _ => {
277 current.push(ch);
278 i += 1;
279 }
280 }
281 }
282
283 push_segment(&mut segments, ¤t, preceding_sep.take());
284 segments
285}
286
287fn tokenize_cmd(input: &str) -> Vec<Segment> {
288 let mut segments = Vec::new();
289 let mut current = String::new();
290 let mut preceding_sep = None;
291 let chars: Vec<char> = input.chars().collect();
292 let len = chars.len();
293 let mut i = 0;
294
295 while i < len {
296 let ch = chars[i];
297 match ch {
298 '^' if i + 1 < len => {
300 current.push(chars[i]);
301 current.push(chars[i + 1]);
302 i += 2;
303 continue;
304 }
305 '"' => {
307 current.push(ch);
308 i += 1;
309 while i < len && chars[i] != '"' {
310 current.push(chars[i]);
311 i += 1;
312 }
313 if i < len {
314 current.push(chars[i]);
315 i += 1;
316 }
317 continue;
318 }
319 '|' => {
321 if i + 1 < len && chars[i + 1] == '|' {
322 push_segment(&mut segments, ¤t, preceding_sep.take());
323 current.clear();
324 preceding_sep = Some("||".to_string());
325 i += 2;
326 } else {
327 push_segment(&mut segments, ¤t, preceding_sep.take());
328 current.clear();
329 preceding_sep = Some("|".to_string());
330 i += 1;
331 }
332 continue;
333 }
334 '&' => {
336 if i + 1 < len && chars[i + 1] == '&' {
337 push_segment(&mut segments, ¤t, preceding_sep.take());
338 current.clear();
339 preceding_sep = Some("&&".to_string());
340 i += 2;
341 } else {
342 push_segment(&mut segments, ¤t, preceding_sep.take());
343 current.clear();
344 preceding_sep = Some("&".to_string());
345 i += 1;
346 }
347 continue;
348 }
349 '\n' => {
350 push_segment(&mut segments, ¤t, preceding_sep.take());
351 current.clear();
352 preceding_sep = Some("\n".to_string());
353 i += 1;
354 continue;
355 }
356 _ => {
357 current.push(ch);
358 i += 1;
359 }
360 }
361 }
362 push_segment(&mut segments, ¤t, preceding_sep.take());
363 segments
364}
365
366fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
367 let trimmed = raw.trim();
368 if trimmed.is_empty() {
369 return;
370 }
371
372 let words = split_words(trimmed);
373 let first_non_assign = words.iter().position(|w| !is_env_assignment(w));
375 let (command, args) = match first_non_assign {
376 Some(idx) => {
377 let cmd = Some(words[idx].clone());
378 let args = if idx + 1 < words.len() {
379 words[idx + 1..].to_vec()
380 } else {
381 Vec::new()
382 };
383 (cmd, args)
384 }
385 None => {
386 (None, Vec::new())
388 }
389 };
390
391 segments.push(Segment {
392 raw: trimmed.to_string(),
393 command,
394 args,
395 preceding_separator: preceding_sep,
396 });
397}
398
/// Reports whether `word` has the shape of a shell environment assignment,
/// `NAME=value`.
///
/// After trimming surrounding whitespace, the text before the first `=`
/// must be a non-empty name made of ASCII letters, digits and `_` that does
/// not start with a digit. The value part may be empty. Words without `=`,
/// and option-like words such as `--flag=value`, are not assignments.
pub fn is_env_assignment(word: &str) -> bool {
    let Some((name, _value)) = word.trim().split_once('=') else {
        return false;
    };

    let mut letters = name.chars();
    match letters.next() {
        // First character: a letter or underscore (never a digit, `-`, ...).
        Some(first) if first.is_ascii_alphabetic() || first == '_' => {
            letters.all(|c| c.is_ascii_alphanumeric() || c == '_')
        }
        // Empty name (word starts with `=`) or an invalid first character.
        _ => false,
    }
}
420
/// Splits `input` into words on spaces and tabs.
///
/// Quoting is honoured but not removed: single-quoted text, double-quoted
/// text (where a backslash protects the next character) and
/// backslash-escaped characters stay inside the current word exactly as
/// written, quotes and backslashes included. Runs of blanks never produce
/// empty words. An unterminated quote extends to the end of the input.
fn split_words(input: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut rest = input.chars();

    while let Some(ch) = rest.next() {
        match ch {
            ' ' | '\t' => {
                // A blank ends the word in progress, if there is one.
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            '\'' => {
                word.push(ch);
                // Copy up to and including the closing quote.
                for inner in rest.by_ref() {
                    word.push(inner);
                    if inner == '\'' {
                        break;
                    }
                }
            }
            '"' => {
                word.push(ch);
                while let Some(inner) = rest.next() {
                    word.push(inner);
                    match inner {
                        '"' => break,
                        '\\' => {
                            // Escaped character: copy it without letting it
                            // close the string.
                            if let Some(escaped) = rest.next() {
                                word.push(escaped);
                            }
                        }
                        _ => {}
                    }
                }
            }
            '\\' => {
                word.push(ch);
                if let Some(escaped) = rest.next() {
                    word.push(escaped);
                }
            }
            _ => word.push(ch),
        }
    }

    if !word.is_empty() {
        words.push(word);
    }

    words
}
492
/// Unit tests for the tokenizers and for `is_env_assignment`.
#[cfg(test)]
mod tests {
    use super::*;

    /// A single unquoted `|` yields two segments; the second records `|`.
    #[test]
    fn test_simple_pipe() {
        let segs = tokenize("echo hello | grep world", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("echo"));
        assert_eq!(segs[1].command.as_deref(), Some("grep"));
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|"));
    }

    /// A `|` inside double quotes is text, not a separator.
    #[test]
    fn test_quoted_pipe() {
        let segs = tokenize(r#"echo "hello | world" | bash"#, ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, r#"echo "hello | world""#);
        assert_eq!(segs[1].command.as_deref(), Some("bash"));
    }

    /// `&&` and `||` are each reported as their own separator.
    #[test]
    fn test_and_or() {
        let segs = tokenize("cmd1 && cmd2 || cmd3", ShellType::Posix);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
        assert_eq!(segs[2].preceding_separator.as_deref(), Some("||"));
    }

    /// `;` separates commands.
    #[test]
    fn test_semicolon() {
        let segs = tokenize("cmd1; cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some(";"));
    }

    /// `|&` is recognised as one two-character separator.
    #[test]
    fn test_pipe_ampersand() {
        let segs = tokenize("cmd1 |& cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|&"));
    }

    /// PowerShell input is split on `|` as well.
    #[test]
    fn test_powershell_pipe() {
        let segs = tokenize("iwr url | iex", ShellType::PowerShell);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("iwr"));
        assert_eq!(segs[1].command.as_deref(), Some("iex"));
    }

    /// A backtick-escaped `|` does not split a PowerShell command.
    #[test]
    fn test_powershell_backtick() {
        let segs = tokenize("echo `| not a pipe", ShellType::PowerShell);
        assert_eq!(segs.len(), 1);
    }

    /// A `|` inside single quotes is text, not a separator.
    #[test]
    fn test_single_quotes() {
        let segs = tokenize("echo 'hello | world' | bash", ShellType::Posix);
        assert_eq!(segs.len(), 2);
    }

    /// A backslash-escaped `|` does not split; only the unescaped one does.
    #[test]
    fn test_backslash_escape() {
        let segs = tokenize("echo hello\\|world | bash", ShellType::Posix);
        assert_eq!(segs.len(), 2);
    }

    /// Empty input produces no segments.
    #[test]
    fn test_empty_input() {
        let segs = tokenize("", ShellType::Posix);
        assert!(segs.is_empty());
    }

    /// Whitespace-only input produces no segments.
    #[test]
    fn test_whitespace_only() {
        let segs = tokenize(" ", ShellType::Posix);
        assert!(segs.is_empty());
    }

    /// The first word is the command; the remaining words are its arguments.
    #[test]
    fn test_args_extraction() {
        let segs = tokenize("curl -sSL https://example.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args.len(), 2);
    }

    /// A leading `NAME=value` assignment is not mistaken for the command.
    #[test]
    fn test_env_prefix_skipped() {
        let segs = tokenize("TIRITH=0 curl evil.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args, vec!["evil.com"]);
    }

    /// Several leading assignments are all skipped.
    #[test]
    fn test_multiple_env_prefixes() {
        let segs = tokenize("FOO=bar BAZ=1 python script.py", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("python"));
        assert_eq!(segs[0].args, vec!["script.py"]);
    }

    /// A segment made only of assignments has no command and no arguments.
    #[test]
    fn test_env_only_no_command() {
        let segs = tokenize("TIRITH=0", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command, None);
        assert!(segs[0].args.is_empty());
    }

    /// Accepted and rejected shapes for `is_env_assignment`.
    #[test]
    fn test_is_env_assignment() {
        assert!(is_env_assignment("FOO=bar"));
        assert!(is_env_assignment("TIRITH=0"));
        assert!(is_env_assignment("PATH=/usr/bin"));
        assert!(is_env_assignment("A="));
        assert!(!is_env_assignment("-o"));
        assert!(!is_env_assignment("curl"));
        assert!(!is_env_assignment("=value"));
        assert!(!is_env_assignment("--flag=value"));
        assert!(!is_env_assignment("1FOO=bar"));
    }

    /// cmd input is split on `|`.
    #[test]
    fn test_cmd_pipe() {
        let segs = tokenize("dir | findstr foo", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("dir"));
        assert_eq!(segs[1].command.as_deref(), Some("findstr"));
    }

    /// In cmd a single `&` separates commands.
    #[test]
    fn test_cmd_ampersand_separator() {
        let segs = tokenize("dir & echo done", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&"));
    }

    /// In cmd `&&` is reported as one separator, not two `&`.
    #[test]
    fn test_cmd_double_ampersand() {
        let segs = tokenize("cmd1 && cmd2", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
    }

    /// A caret-escaped `|` does not split a cmd command.
    #[test]
    fn test_cmd_caret_escape() {
        let segs = tokenize("echo hello^|world | findstr x", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
    }

    /// A `|` inside double quotes does not split a cmd command.
    #[test]
    fn test_cmd_double_quotes() {
        let segs = tokenize(r#"echo "hello | world" | findstr x"#, ShellType::Cmd);
        assert_eq!(segs.len(), 2);
    }

    /// No-panic check: input mixing multi-byte characters, control characters
    /// and `-and` must tokenize without panicking. The result is ignored.
    #[test]
    fn test_powershell_multibyte_and_operator_no_panic() {
        let input = " ?]BB\u{07E7}\u{07E7} -\n-\r-and-~\0\u{c}-and-~\u{1d}";
        let _ = tokenize(input, ShellType::PowerShell);
    }
}