1use serde::{Deserialize, Serialize};
2
/// Shell dialects that [`tokenize`] can split a command line for.
///
/// Because of `rename_all = "lowercase"`, serde reads and writes each variant
/// under its lowercased name (`posix`, `fish`, `powershell`, `cmd`).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum ShellType {
    /// POSIX-style shells; the `FromStr` impl also maps `bash`, `zsh` and `sh` here.
    Posix,
    /// The fish shell.
    Fish,
    /// PowerShell; the `FromStr` impl also accepts `pwsh`.
    PowerShell,
    /// Windows `cmd`; the `FromStr` impl also accepts `cmd.exe`.
    Cmd,
}
12
13impl std::str::FromStr for ShellType {
14 type Err = String;
15 fn from_str(s: &str) -> Result<Self, Self::Err> {
16 match s.to_lowercase().as_str() {
17 "posix" | "bash" | "zsh" | "sh" => Ok(ShellType::Posix),
18 "fish" => Ok(ShellType::Fish),
19 "powershell" | "pwsh" => Ok(ShellType::PowerShell),
20 "cmd" | "cmd.exe" => Ok(ShellType::Cmd),
21 _ => Err(format!("unknown shell type: {s}")),
22 }
23 }
24}
25
/// One command of a tokenized command line, as produced by [`tokenize`].
#[derive(Debug, Clone)]
pub struct Segment {
    /// The segment's text with surrounding whitespace trimmed; quotes and
    /// escape characters are kept as written.
    pub raw: String,
    /// The first word that is not a `NAME=value` assignment, or `None` when
    /// every word of the segment is such an assignment.
    pub command: Option<String>,
    /// The words that follow `command`.
    pub args: Vec<String>,
    /// The separator found before this segment (for example `|`, `&&` or `;`),
    /// or `None` when no separator came before it.
    pub preceding_separator: Option<String>,
}
38
39pub fn tokenize(input: &str, shell: ShellType) -> Vec<Segment> {
41 match shell {
42 ShellType::Posix => tokenize_posix(input),
43 ShellType::Fish => tokenize_fish(input),
44 ShellType::PowerShell => tokenize_powershell(input),
45 ShellType::Cmd => tokenize_cmd(input),
46 }
47}
48
49fn tokenize_posix(input: &str) -> Vec<Segment> {
50 let mut segments = Vec::new();
51 let mut current = String::new();
52 let mut preceding_sep = None;
53 let chars: Vec<char> = input.chars().collect();
54 let len = chars.len();
55 let mut i = 0;
56
57 while i < len {
58 let ch = chars[i];
59
60 match ch {
61 '\\' if i + 1 < len => {
63 current.push(chars[i]);
64 current.push(chars[i + 1]);
65 i += 2;
66 continue;
67 }
68 '\'' => {
70 current.push(ch);
71 i += 1;
72 while i < len && chars[i] != '\'' {
73 current.push(chars[i]);
74 i += 1;
75 }
76 if i < len {
77 current.push(chars[i]); i += 1;
79 }
80 continue;
81 }
82 '"' => {
84 current.push(ch);
85 i += 1;
86 while i < len && chars[i] != '"' {
87 if chars[i] == '\\' && i + 1 < len {
88 current.push(chars[i]);
89 current.push(chars[i + 1]);
90 i += 2;
91 } else {
92 current.push(chars[i]);
93 i += 1;
94 }
95 }
96 if i < len {
97 current.push(chars[i]); i += 1;
99 }
100 continue;
101 }
102 '|' => {
104 if i + 1 < len && chars[i + 1] == '|' {
105 push_segment(&mut segments, ¤t, preceding_sep.take());
107 current.clear();
108 preceding_sep = Some("||".to_string());
109 i += 2;
110 continue;
111 } else if i + 1 < len && chars[i + 1] == '&' {
112 push_segment(&mut segments, ¤t, preceding_sep.take());
114 current.clear();
115 preceding_sep = Some("|&".to_string());
116 i += 2;
117 continue;
118 } else {
119 push_segment(&mut segments, ¤t, preceding_sep.take());
121 current.clear();
122 preceding_sep = Some("|".to_string());
123 i += 1;
124 continue;
125 }
126 }
127 '&' if i + 1 < len && chars[i + 1] == '&' => {
129 push_segment(&mut segments, ¤t, preceding_sep.take());
130 current.clear();
131 preceding_sep = Some("&&".to_string());
132 i += 2;
133 continue;
134 }
135 ';' => {
137 push_segment(&mut segments, ¤t, preceding_sep.take());
138 current.clear();
139 preceding_sep = Some(";".to_string());
140 i += 1;
141 continue;
142 }
143 '\n' => {
145 push_segment(&mut segments, ¤t, preceding_sep.take());
146 current.clear();
147 preceding_sep = Some("\n".to_string());
148 i += 1;
149 continue;
150 }
151 _ => {
152 current.push(ch);
153 i += 1;
154 }
155 }
156 }
157
158 push_segment(&mut segments, ¤t, preceding_sep.take());
159 segments
160}
161
/// Tokenizes fish input by delegating to [`tokenize_posix`]; no fish-specific
/// rules are applied here.
// NOTE(review): presumably fish syntax is close enough to POSIX for the
// callers' purposes — confirm that no fish-only constructs need handling.
fn tokenize_fish(input: &str) -> Vec<Segment> {
    tokenize_posix(input)
}
169
170fn tokenize_powershell(input: &str) -> Vec<Segment> {
171 let mut segments = Vec::new();
172 let mut current = String::new();
173 let mut preceding_sep = None;
174 let indexed: Vec<(usize, char)> = input.char_indices().collect();
176 let len = indexed.len();
177 let mut i = 0;
178
179 while i < len {
180 let (byte_off, ch) = indexed[i];
181
182 match ch {
183 '`' if i + 1 < len => {
185 current.push(indexed[i].1);
186 current.push(indexed[i + 1].1);
187 i += 2;
188 continue;
189 }
190 '\'' => {
192 current.push(ch);
193 i += 1;
194 while i < len && indexed[i].1 != '\'' {
195 current.push(indexed[i].1);
196 i += 1;
197 }
198 if i < len {
199 current.push(indexed[i].1);
200 i += 1;
201 }
202 continue;
203 }
204 '"' => {
206 current.push(ch);
207 i += 1;
208 while i < len && indexed[i].1 != '"' {
209 if indexed[i].1 == '`' && i + 1 < len {
210 current.push(indexed[i].1);
211 current.push(indexed[i + 1].1);
212 i += 2;
213 } else {
214 current.push(indexed[i].1);
215 i += 1;
216 }
217 }
218 if i < len {
219 current.push(indexed[i].1);
220 i += 1;
221 }
222 continue;
223 }
224 '|' => {
226 push_segment(&mut segments, ¤t, preceding_sep.take());
227 current.clear();
228 preceding_sep = Some("|".to_string());
229 i += 1;
230 continue;
231 }
232 ';' => {
234 push_segment(&mut segments, ¤t, preceding_sep.take());
235 current.clear();
236 preceding_sep = Some(";".to_string());
237 i += 1;
238 continue;
239 }
240 '-' if current.ends_with(char::is_whitespace) || current.is_empty() => {
242 let remaining = &input[byte_off..];
243 if remaining.starts_with("-and")
244 && remaining[4..]
245 .chars()
246 .next()
247 .is_none_or(|c| c.is_whitespace())
248 {
249 push_segment(&mut segments, ¤t, preceding_sep.take());
250 current.clear();
251 preceding_sep = Some("-and".to_string());
252 i += 4;
253 continue;
254 } else if remaining.starts_with("-or")
255 && remaining[3..]
256 .chars()
257 .next()
258 .is_none_or(|c| c.is_whitespace())
259 {
260 push_segment(&mut segments, ¤t, preceding_sep.take());
261 current.clear();
262 preceding_sep = Some("-or".to_string());
263 i += 3;
264 continue;
265 }
266 current.push(ch);
267 i += 1;
268 }
269 '\n' => {
270 push_segment(&mut segments, ¤t, preceding_sep.take());
271 current.clear();
272 preceding_sep = Some("\n".to_string());
273 i += 1;
274 continue;
275 }
276 _ => {
277 current.push(ch);
278 i += 1;
279 }
280 }
281 }
282
283 push_segment(&mut segments, ¤t, preceding_sep.take());
284 segments
285}
286
287fn tokenize_cmd(input: &str) -> Vec<Segment> {
288 let mut segments = Vec::new();
289 let mut current = String::new();
290 let mut preceding_sep = None;
291 let chars: Vec<char> = input.chars().collect();
292 let len = chars.len();
293 let mut i = 0;
294
295 while i < len {
296 let ch = chars[i];
297 match ch {
298 '^' if i + 1 < len => {
300 current.push(chars[i]);
301 current.push(chars[i + 1]);
302 i += 2;
303 continue;
304 }
305 '"' => {
307 current.push(ch);
308 i += 1;
309 while i < len && chars[i] != '"' {
310 current.push(chars[i]);
311 i += 1;
312 }
313 if i < len {
314 current.push(chars[i]);
315 i += 1;
316 }
317 continue;
318 }
319 '|' => {
321 if i + 1 < len && chars[i + 1] == '|' {
322 push_segment(&mut segments, ¤t, preceding_sep.take());
323 current.clear();
324 preceding_sep = Some("||".to_string());
325 i += 2;
326 } else {
327 push_segment(&mut segments, ¤t, preceding_sep.take());
328 current.clear();
329 preceding_sep = Some("|".to_string());
330 i += 1;
331 }
332 continue;
333 }
334 '&' => {
336 if i + 1 < len && chars[i + 1] == '&' {
337 push_segment(&mut segments, ¤t, preceding_sep.take());
338 current.clear();
339 preceding_sep = Some("&&".to_string());
340 i += 2;
341 } else {
342 push_segment(&mut segments, ¤t, preceding_sep.take());
343 current.clear();
344 preceding_sep = Some("&".to_string());
345 i += 1;
346 }
347 continue;
348 }
349 '\n' => {
350 push_segment(&mut segments, ¤t, preceding_sep.take());
351 current.clear();
352 preceding_sep = Some("\n".to_string());
353 i += 1;
354 continue;
355 }
356 _ => {
357 current.push(ch);
358 i += 1;
359 }
360 }
361 }
362 push_segment(&mut segments, ¤t, preceding_sep.take());
363 segments
364}
365
366fn push_segment(segments: &mut Vec<Segment>, raw: &str, preceding_sep: Option<String>) {
367 let trimmed = raw.trim();
368 if trimmed.is_empty() {
369 return;
370 }
371
372 let words = split_words(trimmed);
373 let first_non_assign = words.iter().position(|w| !is_env_assignment(w));
375 let (command, args) = match first_non_assign {
376 Some(idx) => {
377 let cmd = Some(words[idx].clone());
378 let args = if idx + 1 < words.len() {
379 words[idx + 1..].to_vec()
380 } else {
381 Vec::new()
382 };
383 (cmd, args)
384 }
385 None => {
386 (None, Vec::new())
388 }
389 };
390
391 segments.push(Segment {
392 raw: trimmed.to_string(),
393 command,
394 args,
395 preceding_separator: preceding_sep,
396 });
397}
398
/// Reports whether `word` has the form `NAME=value`.
///
/// Surrounding whitespace is ignored. `NAME` is the text before the first
/// `=`; it must be non-empty, start with an ASCII letter or `_`, and contain
/// only ASCII letters, digits and `_`. The value may be empty. Words such as
/// `--flag=value`, `=value` and `1FOO=bar` are therefore rejected.
pub fn is_env_assignment(word: &str) -> bool {
    let Some((name, _value)) = word.trim().split_once('=') else {
        return false;
    };

    let mut name_chars = name.chars();
    // An empty name (the word began with `=`) fails here as well.
    let starts_ok = name_chars
        .next()
        .is_some_and(|first| first == '_' || first.is_ascii_alphabetic());
    starts_ok && name_chars.all(|c| c == '_' || c.is_ascii_alphanumeric())
}
420
421pub fn leading_env_assignments(segment_raw: &str) -> Vec<(String, String)> {
424 let mut assignments = Vec::new();
425 for word in split_words(segment_raw.trim()) {
426 if !is_env_assignment(&word) {
427 break;
428 }
429 if let Some((name, value)) = word.split_once('=') {
430 assignments.push((name.to_string(), value.to_string()));
431 }
432 }
433 assignments
434}
435
436pub fn leading_env_assignment_values(segment_raw: &str) -> Vec<String> {
439 leading_env_assignments(segment_raw)
440 .into_iter()
441 .map(|(_, value)| value)
442 .collect()
443}
444
/// Splits `input` into words at unquoted, unescaped spaces and tabs.
///
/// Quotes and backslashes are kept in the returned words. Single-quoted text
/// runs to the next `'`; double-quoted text runs to the next `"` that is not
/// preceded by a backslash. Outside single quotes a backslash glues the next
/// character onto the current word. Unterminated quotes extend to the end of
/// the input.
fn split_words(input: &str) -> Vec<String> {
    let mut words = Vec::new();
    let mut word = String::new();
    let mut rest = input.chars();

    while let Some(ch) = rest.next() {
        match ch {
            ' ' | '\t' => {
                // Runs of blanks produce no empty words.
                if !word.is_empty() {
                    words.push(std::mem::take(&mut word));
                }
            }
            '\'' => {
                word.push(ch);
                for quoted in rest.by_ref() {
                    word.push(quoted);
                    if quoted == '\'' {
                        break;
                    }
                }
            }
            '"' => {
                word.push(ch);
                while let Some(quoted) = rest.next() {
                    word.push(quoted);
                    if quoted == '"' {
                        break;
                    }
                    if quoted == '\\' {
                        // Keep the escaped character so an escaped quote
                        // does not close the string.
                        if let Some(escaped) = rest.next() {
                            word.push(escaped);
                        }
                    }
                }
            }
            '\\' => {
                word.push(ch);
                if let Some(escaped) = rest.next() {
                    word.push(escaped);
                }
            }
            _ => word.push(ch),
        }
    }

    if !word.is_empty() {
        words.push(word);
    }
    words
}
516
/// Unit tests for the tokenizers and the env-assignment helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // --- POSIX separators and quoting -------------------------------------

    #[test]
    fn test_simple_pipe() {
        let segs = tokenize("echo hello | grep world", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("echo"));
        assert_eq!(segs[1].command.as_deref(), Some("grep"));
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|"));
    }

    #[test]
    fn test_quoted_pipe() {
        // A pipe inside double quotes must not split the segment.
        let segs = tokenize(r#"echo "hello | world" | bash"#, ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].raw, r#"echo "hello | world""#);
        assert_eq!(segs[1].command.as_deref(), Some("bash"));
    }

    #[test]
    fn test_and_or() {
        let segs = tokenize("cmd1 && cmd2 || cmd3", ShellType::Posix);
        assert_eq!(segs.len(), 3);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
        assert_eq!(segs[2].preceding_separator.as_deref(), Some("||"));
    }

    #[test]
    fn test_semicolon() {
        let segs = tokenize("cmd1; cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some(";"));
    }

    #[test]
    fn test_pipe_ampersand() {
        // `|&` is reported as its own separator, distinct from `|`.
        let segs = tokenize("cmd1 |& cmd2", ShellType::Posix);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("|&"));
    }

    // --- PowerShell ---------------------------------------------------------

    #[test]
    fn test_powershell_pipe() {
        let segs = tokenize("iwr url | iex", ShellType::PowerShell);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("iwr"));
        assert_eq!(segs[1].command.as_deref(), Some("iex"));
    }

    #[test]
    fn test_powershell_backtick() {
        // The backtick escapes the `|`, so the input stays one segment.
        let segs = tokenize("echo `| not a pipe", ShellType::PowerShell);
        assert_eq!(segs.len(), 1);
    }

    // --- POSIX quoting, escapes and empty input ---------------------------

    #[test]
    fn test_single_quotes() {
        // Only the pipe outside the single quotes splits.
        let segs = tokenize("echo 'hello | world' | bash", ShellType::Posix);
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_backslash_escape() {
        // The backslash-escaped pipe is kept as text; only the second pipe splits.
        let segs = tokenize("echo hello\\|world | bash", ShellType::Posix);
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_empty_input() {
        let segs = tokenize("", ShellType::Posix);
        assert!(segs.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let segs = tokenize(" ", ShellType::Posix);
        assert!(segs.is_empty());
    }

    // --- Command / argument extraction and env-assignment prefixes -------

    #[test]
    fn test_args_extraction() {
        let segs = tokenize("curl -sSL https://example.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args.len(), 2);
    }

    #[test]
    fn test_env_prefix_skipped() {
        // A leading `NAME=value` word is not the command.
        let segs = tokenize("TIRITH=0 curl evil.com", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("curl"));
        assert_eq!(segs[0].args, vec!["evil.com"]);
    }

    #[test]
    fn test_multiple_env_prefixes() {
        let segs = tokenize("FOO=bar BAZ=1 python script.py", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command.as_deref(), Some("python"));
        assert_eq!(segs[0].args, vec!["script.py"]);
    }

    #[test]
    fn test_env_only_no_command() {
        // A segment made only of assignments still exists, but has no command.
        let segs = tokenize("TIRITH=0", ShellType::Posix);
        assert_eq!(segs.len(), 1);
        assert_eq!(segs[0].command, None);
        assert!(segs[0].args.is_empty());
    }

    #[test]
    fn test_is_env_assignment() {
        assert!(is_env_assignment("FOO=bar"));
        assert!(is_env_assignment("TIRITH=0"));
        assert!(is_env_assignment("PATH=/usr/bin"));
        assert!(is_env_assignment("A="));
        assert!(!is_env_assignment("-o"));
        assert!(!is_env_assignment("curl"));
        assert!(!is_env_assignment("=value"));
        assert!(!is_env_assignment("--flag=value"));
        assert!(!is_env_assignment("1FOO=bar"));
    }

    #[test]
    fn test_leading_env_assignment_values() {
        assert_eq!(
            leading_env_assignment_values("URL=https://example.com curl ok"),
            vec!["https://example.com"]
        );
        // Quotes around a value are preserved in the result.
        assert_eq!(
            leading_env_assignments("URL='https://example.com/a' FOO=bar curl ok"),
            vec![
                ("URL".to_string(), "'https://example.com/a'".to_string()),
                ("FOO".to_string(), "bar".to_string())
            ]
        );
        assert_eq!(
            leading_env_assignment_values("URL='https://example.com/a' FOO=bar curl ok"),
            vec!["'https://example.com/a'", "bar"]
        );
        // An assignment that follows a command word is not a leading assignment.
        assert!(leading_env_assignment_values("env URL=https://example.com curl ok").is_empty());
    }

    // --- cmd ----------------------------------------------------------------

    #[test]
    fn test_cmd_pipe() {
        let segs = tokenize("dir | findstr foo", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[0].command.as_deref(), Some("dir"));
        assert_eq!(segs[1].command.as_deref(), Some("findstr"));
    }

    #[test]
    fn test_cmd_ampersand_separator() {
        // In cmd mode a single `&` separates commands.
        let segs = tokenize("dir & echo done", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&"));
    }

    #[test]
    fn test_cmd_double_ampersand() {
        let segs = tokenize("cmd1 && cmd2", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
        assert_eq!(segs[1].preceding_separator.as_deref(), Some("&&"));
    }

    #[test]
    fn test_cmd_caret_escape() {
        // The caret-escaped pipe is kept as text; only the second pipe splits.
        let segs = tokenize("echo hello^|world | findstr x", ShellType::Cmd);
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_cmd_double_quotes() {
        let segs = tokenize(r#"echo "hello | world" | findstr x"#, ShellType::Cmd);
        assert_eq!(segs.len(), 2);
    }

    #[test]
    fn test_powershell_multibyte_and_operator_no_panic() {
        // The input mixes multi-byte characters with `-and`; the test only
        // requires that tokenizing it does not panic.
        let input = " ?]BB\u{07E7}\u{07E7} -\n-\r-and-~\0\u{c}-and-~\u{1d}";
        let _ = tokenize(input, ShellType::PowerShell);
    }
}