1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
// Process-wide counter mixed into marker ids so two markers generated in the
// same nanosecond (or after a clock rollback) still differ within this process.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

// Maximum `(` nesting accepted inside `$(( ... ))` before lexing aborts with
// `LexerError::NestingTooDeep` (guards against runaway/hostile input).
const MAX_PAREN_DEPTH: usize = 256;
27
/// Records one marker substitution made during preprocessing so token spans
/// reported on the preprocessed text can be mapped back to the original source.
#[derive(Debug, Clone)]
struct SpanReplacement {
    // Byte offset in the *preprocessed* text where the marker was inserted.
    preprocessed_pos: usize,
    // Length in bytes of the inserted marker.
    marker_len: usize,
    // Length in bytes of the original source text the marker replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 let pid = std::process::id();
82 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token (or error) paired with the byte range it occupied in the source.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    pub token: T,
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Bundle `token` with its source `span`.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Errors the lexer can report; messages are rendered via `Display` below.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    // The default is what logos emits when no rule matches at all.
    #[default]
    UnexpectedCharacter,
    UnterminatedString,
    UnterminatedVarRef,
    InvalidEscape,
    InvalidNumber,
    // Carries the offending text, e.g. `True` (capitalized boolean).
    AmbiguousBoolean(String),
    // Carries the offending text, e.g. `yes`/`no`.
    AmbiguousBooleanLike(String),
    // Carries the offending text, e.g. `123abc`.
    InvalidNumberIdent(String),
    InvalidFloatNoLeading,
    InvalidFloatNoTrailing,
    // Raised when `$(( ... ))` nesting exceeds MAX_PAREN_DEPTH.
    NestingTooDeep,
}
115
/// Human-readable diagnostics for each lexer error, including a suggested
/// correction for the ambiguous-boolean cases.
impl fmt::Display for LexerError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
            LexerError::UnterminatedString => write!(f, "unterminated string"),
            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
            LexerError::InvalidNumber => write!(f, "invalid number"),
            LexerError::AmbiguousBoolean(s) => {
                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
            }
            LexerError::AmbiguousBooleanLike(s) => {
                // `yes` maps to `true`; anything else (i.e. `no`) to `false`.
                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
            }
            LexerError::InvalidNumberIdent(s) => {
                write!(f, "identifier cannot start with digit: {}", s)
            }
            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
        }
    }
}
140
/// Payload of a `Token::HereDoc`: the collected body plus whether the
/// delimiter was quoted (`literal == true` means no expansion later).
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    pub content: String,
    // True when the heredoc delimiter was quoted (`<<'EOF'` / `<<"EOF"`).
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162 #[token("set")]
166 Set,
167
168 #[token("local")]
169 Local,
170
171 #[token("if")]
172 If,
173
174 #[token("then")]
175 Then,
176
177 #[token("else")]
178 Else,
179
180 #[token("elif")]
181 Elif,
182
183 #[token("fi")]
184 Fi,
185
186 #[token("for")]
187 For,
188
189 #[token("while")]
190 While,
191
192 #[token("in")]
193 In,
194
195 #[token("do")]
196 Do,
197
198 #[token("done")]
199 Done,
200
201 #[token("case")]
202 Case,
203
204 #[token("esac")]
205 Esac,
206
207 #[token("function")]
208 Function,
209
210 #[token("break")]
211 Break,
212
213 #[token("continue")]
214 Continue,
215
216 #[token("return")]
217 Return,
218
219 #[token("exit")]
220 Exit,
221
222 #[token("true")]
223 True,
224
225 #[token("false")]
226 False,
227
228 #[token("string")]
232 TypeString,
233
234 #[token("int")]
235 TypeInt,
236
237 #[token("float")]
238 TypeFloat,
239
240 #[token("bool")]
241 TypeBool,
242
243 #[token("&&")]
247 And,
248
249 #[token("||")]
250 Or,
251
252 #[token("==")]
253 EqEq,
254
255 #[token("!=")]
256 NotEq,
257
258 #[token("=~")]
259 Match,
260
261 #[token("!~")]
262 NotMatch,
263
264 #[token(">=")]
265 GtEq,
266
267 #[token("<=")]
268 LtEq,
269
270 #[token(">>")]
271 GtGt,
272
273 #[token("2>&1")]
274 StderrToStdout,
275
276 #[token("1>&2")]
277 StdoutToStderr,
278
279 #[token(">&2")]
280 StdoutToStderr2,
281
282 #[token("2>")]
283 Stderr,
284
285 #[token("&>")]
286 Both,
287
288 #[token("<<")]
289 HereDocStart,
290
291 #[token(";;")]
292 DoubleSemi,
293
294 #[token("=")]
298 Eq,
299
300 #[token("|")]
301 Pipe,
302
303 #[token("&")]
304 Amp,
305
306 #[token(">")]
307 Gt,
308
309 #[token("<")]
310 Lt,
311
312 #[token(";")]
313 Semi,
314
315 #[token(":")]
316 Colon,
317
318 #[token(",")]
319 Comma,
320
321 #[token("..")]
322 DotDot,
323
324 #[token(".")]
325 Dot,
326
327 #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
329 TildePath(String),
330
331 #[token("~")]
333 Tilde,
334
335 #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
337 RelativePath(String),
338
339 #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
341 DotSlashPath(String),
342
343 #[token("{")]
344 LBrace,
345
346 #[token("}")]
347 RBrace,
348
349 #[token("[")]
350 LBracket,
351
352 #[token("]")]
353 RBracket,
354
355 #[token("(")]
356 LParen,
357
358 #[token(")")]
359 RParen,
360
361 #[token("*")]
362 Star,
363
364 #[token("!")]
365 Bang,
366
367 #[token("?")]
368 Question,
369
370 GlobWord(String),
373
374 Arithmetic(String),
381
382 #[token("$(")]
384 CmdSubstStart,
385
386 #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
392 LongFlag(String),
393
394 #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
396 ShortFlag(String),
397
398 #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
400 PlusFlag(String),
401
402 #[token("--")]
404 DoubleDash,
405
406 #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
409 PlusBare(String),
410
411 #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
415 MinusBare(String),
416
417 #[token("-")]
421 MinusAlone,
422
423 #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
429 String(String),
430
431 #[regex(r"'[^']*'", lex_single_string)]
433 SingleString(String),
434
435 #[regex(r"\$\{[^}]+\}", lex_varref)]
437 VarRef(String),
438
439 #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
441 SimpleVarRef(String),
442
443 #[regex(r"\$[0-9]", lex_positional)]
445 Positional(usize),
446
447 #[token("$@")]
449 AllArgs,
450
451 #[token("$#")]
453 ArgCount,
454
455 #[token("$?")]
457 LastExitCode,
458
459 #[token("$$")]
461 CurrentPid,
462
463 #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
465 VarLength(String),
466
467 HereDoc(HereDocData),
470
471 #[regex(r"-?[0-9]+", lex_int, priority = 2)]
473 Int(i64),
474
475 #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
477 Float(f64),
478
479 #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
485 InvalidNumberIdent,
486
487 #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
489 InvalidFloatNoLeading,
490
491 #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
494 InvalidFloatNoTrailing,
495
496 #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
502 Path(String),
503
504 #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
511 Ident(String),
512
513 #[regex(r"#[^\n\r]*", allow_greedy = true)]
519 Comment,
520
521 #[regex(r"\n|\r\n")]
523 Newline,
524
525 #[regex(r"\\[ \t]*(\n|\r\n)")]
527 LineContinuation,
528}
529
/// Coarse token classification returned by `Token::category`.
/// NOTE(review): the consumer is not visible in this file — presumably syntax
/// highlighting or diagnostics; confirm against callers.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    Keyword,
    Operator,
    String,
    Number,
    Variable,
    Comment,
    Punctuation,
    Command,
    Path,
    Flag,
    Error,
}
559
impl Token {
    /// Map every token variant onto its coarse `TokenCategory`.
    ///
    /// The match is exhaustive on purpose: adding a `Token` variant forces a
    /// decision here at compile time.
    pub fn category(&self) -> TokenCategory {
        match self {
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            // Merged glob words read like paths for highlighting purposes.
            Token::GlobWord(_) => TokenCategory::Path,

            Token::Comment => TokenCategory::Comment,

            Token::Path(_)
            | Token::TildePath(_)
            | Token::RelativePath(_)
            | Token::Tilde
            | Token::DotDot
            | Token::DotSlashPath(_) => TokenCategory::Path,

            // Bare words in command position (plus +/- bare arguments).
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
681
/// Callback for double-quoted string rules: delegates unescaping to
/// `parse_string_literal` (defined elsewhere in this file).
fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
    parse_string_literal(lex.slice())
}
686
687fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
689 let s = lex.slice();
690 s[1..s.len() - 1].to_string()
692}
693
694fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
696 lex.slice().to_string()
698}
699
700fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
702 lex.slice()[1..].to_string()
704}
705
706fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
708 lex.slice()[1..].parse().unwrap_or(0)
710}
711
712fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
714 let s = lex.slice();
716 s[3..s.len() - 1].to_string()
717}
718
719fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
721 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
722}
723
724fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
726 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
727}
728
/// Callback for digit-led identifiers like `123abc`: always fails, carrying
/// the offending text so the diagnostic can echo it back.
fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
}
734
/// Callback for `.5`-style floats: always fails — floats must have a leading digit.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}
740
/// Callback for `5.`-style floats: always fails — floats must have a trailing digit.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
746
747fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
749 let s = lex.slice();
750
751 match s.to_lowercase().as_str() {
754 "true" | "false" if s != "true" && s != "false" => {
755 return Err(LexerError::AmbiguousBoolean(s.to_string()));
756 }
757 _ => {}
758 }
759
760 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
762 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
763 }
764
765 Ok(s.to_string())
766}
767
768fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
770 lex.slice()[2..].to_string()
772}
773
774fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
776 lex.slice()[1..].to_string()
778}
779
780fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
782 lex.slice()[1..].to_string()
784}
785
786fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
788 lex.slice().to_string()
789}
790
791fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
793 lex.slice().to_string()
794}
795
796fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
798 lex.slice().to_string()
799}
800
801fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
803 lex.slice().to_string()
804}
805
806fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
808 lex.slice().to_string()
809}
810
811fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
813 lex.slice().to_string()
814}
815
/// Render tokens for diagnostics/debugging. Fixed tokens print as their
/// source text; payload-carrying tokens print in an `UPPERCASE(payload)`
/// form that distinguishes them from literal input.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Tilde => write!(f, "~"),
            Token::TildePath(s) => write!(f, "{}", s),
            Token::RelativePath(s) => write!(f, "{}", s),
            Token::DotSlashPath(s) => write!(f, "{}", s),
            // Braces must be doubled to escape them in format strings.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::GlobWord(s) => write!(f, "GLOB({})", s),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
918
919impl Token {
920 pub fn is_keyword(&self) -> bool {
922 matches!(
923 self,
924 Token::Set
925 | Token::Local
926 | Token::If
927 | Token::Then
928 | Token::Else
929 | Token::Elif
930 | Token::Fi
931 | Token::For
932 | Token::In
933 | Token::Do
934 | Token::Done
935 | Token::Case
936 | Token::Esac
937 | Token::Function
938 | Token::True
939 | Token::False
940 )
941 }
942
943 pub fn is_type(&self) -> bool {
945 matches!(
946 self,
947 Token::TypeString
948 | Token::TypeInt
949 | Token::TypeFloat
950 | Token::TypeBool
951 )
952 }
953
954 pub fn starts_statement(&self) -> bool {
956 matches!(
957 self,
958 Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
959 )
960 }
961
962 pub fn is_value(&self) -> bool {
964 matches!(
965 self,
966 Token::String(_)
967 | Token::SingleString(_)
968 | Token::HereDoc(_)
969 | Token::Arithmetic(_)
970 | Token::Int(_)
971 | Token::Float(_)
972 | Token::True
973 | Token::False
974 | Token::VarRef(_)
975 | Token::SimpleVarRef(_)
976 | Token::CmdSubstStart
977 | Token::Path(_)
978 | Token::GlobWord(_)
979 | Token::LastExitCode
980 | Token::CurrentPid
981 )
982 }
983}
984
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    // Source text with every `$(( ... ))` replaced by a unique marker ident.
    text: String,
    // (marker, expression) pairs, resolved back to tokens in `tokenize`.
    arithmetics: Vec<(String, String)>,
    // One entry per marker, for mapping spans back to original positions.
    replacements: Vec<SpanReplacement>,
}
994
/// Copy a `$( ... )` command substitution verbatim into `result`, advancing
/// `i` (char index) and `source_pos` (byte offset) past it.
///
/// Called with `chars[*i..]` starting at the `$` of `$(`. Tracks quote state
/// and paren depth so parentheses inside quotes don't end the substitution
/// and nested `$( ... )` stay balanced. Used by `preprocess_arithmetic` so
/// the parens inside a command substitution can't be mistaken for `$((`.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Emit the opening `$(` and step past it (two ASCII bytes).
    result.push('$');
    result.push('(');
    *i += 2;
    *source_pos += 2;

    let mut depth: usize = 1;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    while *i < chars.len() && depth > 0 {
        let c = chars[*i];

        // Single quotes: everything is literal until the closing quote.
        if in_single_quote {
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            if c == '\'' {
                in_single_quote = false;
            }
            continue;
        }

        if in_double_quote {
            // Only these four escapes are meaningful inside double quotes.
            if c == '\\' && *i + 1 < chars.len() {
                let next = chars[*i + 1];
                if next == '"' || next == '\\' || next == '$' || next == '`' {
                    result.push(c);
                    result.push(next);
                    *source_pos += c.len_utf8() + next.len_utf8();
                    *i += 2;
                    continue;
                }
            }
            if c == '"' {
                in_double_quote = false;
            }
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            continue;
        }

        match c {
            '\'' => {
                in_single_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            '"' => {
                in_double_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            // Unquoted escape: copy both characters so an escaped paren
            // doesn't affect the depth count.
            '\\' if *i + 1 < chars.len() => {
                result.push(c);
                result.push(chars[*i + 1]);
                *source_pos += c.len_utf8() + chars[*i + 1].len_utf8();
                *i += 2;
            }
            '(' => {
                depth += 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            ')' => {
                // depth reaching 0 here terminates the loop after this char.
                depth -= 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            _ => {
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
        }
    }
}
1092
/// Replace every `$(( expr ))` in `source` with a unique marker identifier so
/// the logos lexer (which has no arithmetic grammar) can tokenize the result;
/// `tokenize` later swaps the marker tokens back for `Token::Arithmetic`.
///
/// Quote handling mirrors the lexer: single-quoted text is copied literally,
/// the four double-quote escapes are honored, and plain `$( ... )` command
/// substitutions are skipped wholesale so their parens can't be mistaken for
/// arithmetic. Returns `NestingTooDeep` if `(` nesting inside an arithmetic
/// block exceeds `MAX_PAREN_DEPTH`.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into the original source (chars may be multi-byte).
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Outside double quotes, an escaped character is copied verbatim.
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single-quoted text is literal: copy through the closing quote.
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1;
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                result.push(chars_vec[i]); source_pos += 1; // closing quote (ASCII)
                i += 1;
            }
            continue;
        }

        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Inside double quotes only these four escapes are meaningful.
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // `$(` not followed by a second `(` is a command substitution: copy
        // it wholesale so its parens don't trigger arithmetic detection.
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // `$((` opens an arithmetic block.
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Where the marker will land in the preprocessed text.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            i += 3; // skip `$((` (three ASCII bytes)
            source_pos += 3;

            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner paren; still inside the expression.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // `))` at depth zero terminates the block.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Lone `)` at depth zero: treat as expression text.
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Bytes of original source consumed by the whole `$(( ... ))`.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            // Record the substitution so spans can be mapped back later.
            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1257
/// Replace every heredoc (`<<DELIM` / `<<-DELIM` … body … `DELIM`) with
/// `<<` followed by a unique marker identifier, returning the rewritten text
/// and `(marker, body, literal)` triples that `tokenize` resolves back into
/// `Token::HereDoc` values.
///
/// `<<-` strips leading tabs from body lines and the terminating delimiter.
/// A quoted delimiter (`<<'EOF'`) marks the body as literal (no expansion).
/// An unterminated heredoc consumes the rest of the input as its body.
///
/// NOTE(review): unlike `preprocess_arithmetic`, no `SpanReplacement`s are
/// produced here, so spans of tokens after a heredoc are reported on the
/// marker-rewritten text — confirm whether downstream diagnostics need
/// correction for that drift.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '<' && chars.peek() == Some(&'<') {
            // Consume the second `<`; `<<-` enables tab stripping.
            chars.next(); let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip spaces/tabs between `<<` and the delimiter.
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Read the delimiter; a quoted delimiter means a literal body.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); break; // consume closing quote
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            // No delimiter: re-emit the `<<` (and `-`) and lex it normally.
            if delimiter.is_empty() {
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Preserve trailing text on the `<<DELIM` line (e.g. redirects);
            // it is re-emitted after the marker.
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Collect body lines until a line equals the delimiter
            // (after tab stripping when `<<-` was used).
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Normalize CRLF (and lone CR) line endings to `\n`.
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF: accept a final unterminated line as either the
                        // delimiter or the last body line.
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            let content = content.trim_end_matches('\n').to_string();

            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Emit `<<MARKER`, the preserved trailing text, and a newline
            // standing in for the whole heredoc body.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1422
1423fn mergeable_text(token: &Token) -> Option<String> {
1428 match token {
1429 Token::Ident(s) => Some(s.clone()),
1430 Token::Colon => Some(":".to_string()),
1431 Token::Int(n) => Some(n.to_string()),
1432 Token::Path(p) => Some(p.clone()),
1433 Token::Float(f) => Some(f.to_string()),
1434 _ => None,
1435 }
1436}
1437
1438fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1447 if tokens.is_empty() {
1448 return tokens;
1449 }
1450
1451 let mut result = Vec::with_capacity(tokens.len());
1452 let mut run: Vec<&Spanned<Token>> = Vec::new();
1453
1454 for token in &tokens {
1455 if run.is_empty() {
1456 if mergeable_text(&token.token).is_some() {
1457 run.push(token);
1458 } else {
1459 result.push(token.clone());
1460 }
1461 continue;
1462 }
1463
1464 let Some(last) = run.last() else { unreachable!() };
1467 let adjacent = last.span.end == token.span.start;
1468
1469 if adjacent && mergeable_text(&token.token).is_some() {
1470 run.push(token);
1471 } else {
1472 flush_colon_run(&mut run, &mut result);
1473 if mergeable_text(&token.token).is_some() {
1474 run.push(token);
1475 } else {
1476 result.push(token.clone());
1477 }
1478 }
1479 }
1480
1481 flush_colon_run(&mut run, &mut result);
1482
1483 result
1484}
1485
1486fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1488 if run.is_empty() {
1489 return;
1490 }
1491
1492 let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1493
1494 if run.len() >= 2 && has_colon {
1495 let text: String = run
1496 .iter()
1497 .filter_map(|t| mergeable_text(&t.token))
1498 .collect();
1499 let start = run.first().map(|t| t.span.start).unwrap_or(0);
1501 let end = run.last().map(|t| t.span.end).unwrap_or(0);
1502 result.push(Spanned::new(Token::Ident(text), start..end));
1503 } else {
1504 for t in run.iter() {
1505 result.push((*t).clone());
1506 }
1507 }
1508
1509 run.clear();
1510}
1511
1512fn glob_mergeable_text(token: &Token) -> Option<String> {
1517 match token {
1518 Token::Star => Some("*".to_string()),
1519 Token::Question => Some("?".to_string()),
1520 Token::Dot => Some(".".to_string()),
1521 Token::DotDot => Some("..".to_string()),
1522 Token::Ident(s) => Some(s.clone()),
1523 Token::Path(s) => Some(s.clone()),
1524 Token::Int(n) => Some(n.to_string()),
1525 Token::LBracket => Some("[".to_string()),
1526 Token::RBracket => Some("]".to_string()),
1527 Token::Bang => Some("!".to_string()),
1528 Token::DotSlashPath(s) => Some(s.clone()),
1529 Token::RelativePath(s) => Some(s.clone()),
1530 Token::TildePath(s) => Some(s.clone()),
1531 Token::Tilde => Some("~".to_string()),
1532 Token::LBrace => Some("{".to_string()),
1533 Token::RBrace => Some("}".to_string()),
1534 Token::Comma => Some(",".to_string()),
1535 _ => None,
1536 }
1537}
1538
1539fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1547 if tokens.is_empty() {
1548 return tokens;
1549 }
1550
1551 let mut result = Vec::with_capacity(tokens.len());
1552 let mut run: Vec<&Spanned<Token>> = Vec::new();
1553
1554 for token in &tokens {
1555 if run.is_empty() {
1556 if glob_mergeable_text(&token.token).is_some() {
1557 run.push(token);
1558 } else {
1559 result.push(token.clone());
1560 }
1561 continue;
1562 }
1563
1564 let Some(last) = run.last() else { unreachable!() };
1566 let adjacent = last.span.end == token.span.start;
1567
1568 if adjacent && glob_mergeable_text(&token.token).is_some() {
1569 run.push(token);
1570 } else {
1571 flush_glob_run(&mut run, &mut result);
1572 if glob_mergeable_text(&token.token).is_some() {
1573 run.push(token);
1574 } else {
1575 result.push(token.clone());
1576 }
1577 }
1578 }
1579
1580 flush_glob_run(&mut run, &mut result);
1581
1582 result
1583}
1584
1585fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1587 if run.is_empty() {
1588 return;
1589 }
1590
1591 let has_glob = run.iter().any(|t| {
1592 matches!(t.token, Token::Star | Token::Question)
1593 }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1594 && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1595
1596 if run.len() >= 2 && has_glob {
1597 let text: String = run
1598 .iter()
1599 .filter_map(|t| glob_mergeable_text(&t.token))
1600 .collect();
1601 let start = run.first().map(|t| t.span.start).unwrap_or(0);
1602 let end = run.last().map(|t| t.span.end).unwrap_or(0);
1603 result.push(Spanned::new(Token::GlobWord(text), start..end));
1604 } else {
1605 for t in run.iter() {
1606 result.push((*t).clone());
1607 }
1608 }
1609
1610 run.clear();
1611}
1612
/// Tokenize a full kaish source string.
///
/// Pipeline:
/// 1. `preprocess_arithmetic` rewrites arithmetic forms into
///    `__KAISH_ARITH_*__` marker identifiers; a preprocessing error
///    aborts tokenization with a whole-source span.
/// 2. `preprocess_heredocs` rewrites heredoc bodies into
///    `__KAISH_HEREDOC_*__` markers.
/// 3. The logos lexer runs over the preprocessed text; spans are mapped
///    back to original-source positions via `correct_span`, and
///    `Comment` / `LineContinuation` tokens are dropped.
/// 4. Marker tokens are substituted back: arithmetic markers become
///    `Token::Arithmetic`, `HereDocStart` + heredoc markers become the
///    start token plus `Token::HereDoc`, and arithmetic markers embedded
///    inside string literals are rewritten to the `${__ARITH:expr__}`
///    form.
/// 5. Adjacent tokens are merged (`merge_colon_adjacent`, then
///    `merge_glob_adjacent`).
///
/// Returns every lexer error (with corrected spans) if any occurred.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Spans are relative to the preprocessed text; map them back to
        // positions in the caller's original source.
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Second pass: swap preprocessing markers back for their real tokens.
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // An identifier of the form __KAISH_ARITH_*__ stands in for an
        // arithmetic expression captured during preprocessing.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
            && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
            final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
            i += 1;
            continue;
        }

        // A HereDocStart followed by a __KAISH_HEREDOC_*__ identifier is
        // rebuilt into the start token plus the captured heredoc body.
        // If the marker is unknown, fall through and emit tokens as-is.
        if matches!(tokens[i].token, Token::HereDocStart) {
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                    final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                    i += 2;
                    continue;
                }
            }
        }

        // Arithmetic markers can also end up inside double-quoted string
        // content; rewrite them to the ${__ARITH:expr__} form there.
        let token = if let Token::String(ref s) = tokens[i].token {
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                // Untouched string: reuse the original token as-is.
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
}
1712
1713pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1717 let lexer = Token::lexer(source);
1718 let mut tokens = Vec::new();
1719 let mut errors = Vec::new();
1720
1721 for (result, span) in lexer.spanned() {
1722 match result {
1723 Ok(token) => {
1724 tokens.push(Spanned::new(token, span));
1725 }
1726 Err(err) => {
1727 errors.push(Spanned::new(err, span));
1728 }
1729 }
1730 }
1731
1732 if errors.is_empty() {
1733 Ok(tokens)
1734 } else {
1735 Err(errors)
1736 }
1737}
1738
1739pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1741 if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1743 return Err(LexerError::UnterminatedString);
1744 }
1745
1746 let inner = &source[1..source.len() - 1];
1747 let mut result = String::with_capacity(inner.len());
1748 let mut chars = inner.chars().peekable();
1749
1750 while let Some(ch) = chars.next() {
1751 if ch == '\\' {
1752 match chars.next() {
1753 Some('n') => result.push('\n'),
1754 Some('t') => result.push('\t'),
1755 Some('r') => result.push('\r'),
1756 Some('\\') => result.push('\\'),
1757 Some('"') => result.push('"'),
1758 Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1761 Some('u') => {
1762 let mut hex = String::with_capacity(4);
1764 for _ in 0..4 {
1765 match chars.next() {
1766 Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1767 _ => return Err(LexerError::InvalidEscape),
1768 }
1769 }
1770 let codepoint = u32::from_str_radix(&hex, 16)
1771 .map_err(|_| LexerError::InvalidEscape)?;
1772 let ch = char::from_u32(codepoint)
1773 .ok_or(LexerError::InvalidEscape)?;
1774 result.push(ch);
1775 }
1776 Some(next) => {
1778 result.push('\\');
1779 result.push(next);
1780 }
1781 None => return Err(LexerError::InvalidEscape),
1782 }
1783 } else {
1784 result.push(ch);
1785 }
1786 }
1787
1788 Ok(result)
1789}
1790
1791pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1794 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1796 return Err(LexerError::UnterminatedVarRef);
1797 }
1798
1799 let inner = &source[2..source.len() - 1];
1800
1801 if inner == "?" {
1803 return Ok(vec!["?".to_string()]);
1804 }
1805
1806 let mut segments = Vec::new();
1807 let mut current = String::new();
1808 let mut chars = inner.chars().peekable();
1809
1810 while let Some(ch) = chars.next() {
1811 match ch {
1812 '.' => {
1813 if !current.is_empty() {
1814 segments.push(current.clone());
1815 current.clear();
1816 }
1817 }
1818 '[' => {
1819 if !current.is_empty() {
1820 segments.push(current.clone());
1821 current.clear();
1822 }
1823 let mut index = String::from("[");
1825 while let Some(&c) = chars.peek() {
1826 if let Some(c) = chars.next() {
1827 index.push(c);
1828 }
1829 if c == ']' {
1830 break;
1831 }
1832 }
1833 segments.push(index);
1834 }
1835 _ => {
1836 current.push(ch);
1837 }
1838 }
1839 }
1840
1841 if !current.is_empty() {
1842 segments.push(current);
1843 }
1844
1845 Ok(segments)
1846}
1847
1848pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1850 source.parse().map_err(|_| LexerError::InvalidNumber)
1851}
1852
1853pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1855 source.parse().map_err(|_| LexerError::InvalidNumber)
1856}
1857
1858#[cfg(test)]
1859mod tests {
1860 use super::*;
1861
1862 fn lex(source: &str) -> Vec<Token> {
1863 tokenize(source)
1864 .expect("lexer should succeed")
1865 .into_iter()
1866 .map(|s| s.token)
1867 .collect()
1868 }
1869
1870 #[test]
1875 fn keywords() {
1876 assert_eq!(lex("set"), vec![Token::Set]);
1877 assert_eq!(lex("if"), vec![Token::If]);
1878 assert_eq!(lex("then"), vec![Token::Then]);
1879 assert_eq!(lex("else"), vec![Token::Else]);
1880 assert_eq!(lex("elif"), vec![Token::Elif]);
1881 assert_eq!(lex("fi"), vec![Token::Fi]);
1882 assert_eq!(lex("for"), vec![Token::For]);
1883 assert_eq!(lex("in"), vec![Token::In]);
1884 assert_eq!(lex("do"), vec![Token::Do]);
1885 assert_eq!(lex("done"), vec![Token::Done]);
1886 assert_eq!(lex("case"), vec![Token::Case]);
1887 assert_eq!(lex("esac"), vec![Token::Esac]);
1888 assert_eq!(lex("function"), vec![Token::Function]);
1889 assert_eq!(lex("true"), vec![Token::True]);
1890 assert_eq!(lex("false"), vec![Token::False]);
1891 }
1892
1893 #[test]
1894 fn double_semicolon() {
1895 assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
1896 assert_eq!(lex("echo \"hi\";;"), vec![
1898 Token::Ident("echo".to_string()),
1899 Token::String("hi".to_string()),
1900 Token::DoubleSemi,
1901 ]);
1902 }
1903
1904 #[test]
1905 fn type_keywords() {
1906 assert_eq!(lex("string"), vec![Token::TypeString]);
1907 assert_eq!(lex("int"), vec![Token::TypeInt]);
1908 assert_eq!(lex("float"), vec![Token::TypeFloat]);
1909 assert_eq!(lex("bool"), vec![Token::TypeBool]);
1910 }
1911
1912 #[test]
1917 fn single_char_operators() {
1918 assert_eq!(lex("="), vec![Token::Eq]);
1919 assert_eq!(lex("|"), vec![Token::Pipe]);
1920 assert_eq!(lex("&"), vec![Token::Amp]);
1921 assert_eq!(lex(">"), vec![Token::Gt]);
1922 assert_eq!(lex("<"), vec![Token::Lt]);
1923 assert_eq!(lex(";"), vec![Token::Semi]);
1924 assert_eq!(lex(":"), vec![Token::Colon]);
1925 assert_eq!(lex(","), vec![Token::Comma]);
1926 assert_eq!(lex("."), vec![Token::Dot]);
1927 }
1928
1929 #[test]
1930 fn multi_char_operators() {
1931 assert_eq!(lex("&&"), vec![Token::And]);
1932 assert_eq!(lex("||"), vec![Token::Or]);
1933 assert_eq!(lex("=="), vec![Token::EqEq]);
1934 assert_eq!(lex("!="), vec![Token::NotEq]);
1935 assert_eq!(lex("=~"), vec![Token::Match]);
1936 assert_eq!(lex("!~"), vec![Token::NotMatch]);
1937 assert_eq!(lex(">="), vec![Token::GtEq]);
1938 assert_eq!(lex("<="), vec![Token::LtEq]);
1939 assert_eq!(lex(">>"), vec![Token::GtGt]);
1940 assert_eq!(lex("2>"), vec![Token::Stderr]);
1941 assert_eq!(lex("&>"), vec![Token::Both]);
1942 }
1943
1944 #[test]
1945 fn brackets() {
1946 assert_eq!(lex("{"), vec![Token::LBrace]);
1947 assert_eq!(lex("}"), vec![Token::RBrace]);
1948 assert_eq!(lex("["), vec![Token::LBracket]);
1949 assert_eq!(lex("]"), vec![Token::RBracket]);
1950 assert_eq!(lex("("), vec![Token::LParen]);
1951 assert_eq!(lex(")"), vec![Token::RParen]);
1952 }
1953
1954 #[test]
1959 fn integers() {
1960 assert_eq!(lex("0"), vec![Token::Int(0)]);
1961 assert_eq!(lex("42"), vec![Token::Int(42)]);
1962 assert_eq!(lex("-1"), vec![Token::Int(-1)]);
1963 assert_eq!(lex("999999"), vec![Token::Int(999999)]);
1964 }
1965
1966 #[test]
1967 fn floats() {
1968 assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
1969 assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
1970 assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
1971 }
1972
1973 #[test]
1974 fn strings() {
1975 assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1976 assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1977 assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1979 assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1980 }
1981
1982 #[test]
1983 fn var_refs() {
1984 assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
1985 assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
1986 assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
1987 assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
1988 assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
1989 }
1990
1991 #[test]
1996 fn identifiers() {
1997 assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
1998 assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
1999 assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2000 assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2001 assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2002 }
2003
2004 #[test]
2005 fn keyword_prefix_identifiers() {
2006 assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2008 assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2009 assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2010 assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2011 assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2012 }
2013
2014 #[test]
2019 fn assignment() {
2020 assert_eq!(
2021 lex("set X = 5"),
2022 vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2023 );
2024 }
2025
2026 #[test]
2027 fn command_simple() {
2028 assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2029 assert_eq!(
2030 lex(r#"echo "hello""#),
2031 vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2032 );
2033 }
2034
2035 #[test]
2036 fn command_with_args() {
2037 assert_eq!(
2038 lex("cmd arg1 arg2"),
2039 vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2040 );
2041 }
2042
2043 #[test]
2044 fn command_with_named_args() {
2045 assert_eq!(
2046 lex("cmd key=value"),
2047 vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2048 );
2049 }
2050
2051 #[test]
2052 fn pipeline() {
2053 assert_eq!(
2054 lex("a | b | c"),
2055 vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2056 );
2057 }
2058
2059 #[test]
2060 fn if_statement() {
2061 assert_eq!(
2062 lex("if true; then echo; fi"),
2063 vec![
2064 Token::If,
2065 Token::True,
2066 Token::Semi,
2067 Token::Then,
2068 Token::Ident("echo".to_string()),
2069 Token::Semi,
2070 Token::Fi
2071 ]
2072 );
2073 }
2074
2075 #[test]
2076 fn for_loop() {
2077 assert_eq!(
2078 lex("for X in items; do echo; done"),
2079 vec![
2080 Token::For,
2081 Token::Ident("X".to_string()),
2082 Token::In,
2083 Token::Ident("items".to_string()),
2084 Token::Semi,
2085 Token::Do,
2086 Token::Ident("echo".to_string()),
2087 Token::Semi,
2088 Token::Done
2089 ]
2090 );
2091 }
2092
2093 #[test]
2098 fn whitespace_ignored() {
2099 assert_eq!(lex(" set X = 5 "), lex("set X = 5"));
2100 }
2101
2102 #[test]
2103 fn newlines_preserved() {
2104 let tokens = lex("a\nb");
2105 assert_eq!(
2106 tokens,
2107 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2108 );
2109 }
2110
2111 #[test]
2112 fn multiple_newlines() {
2113 let tokens = lex("a\n\n\nb");
2114 assert_eq!(
2115 tokens,
2116 vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2117 );
2118 }
2119
2120 #[test]
2125 fn comments_skipped() {
2126 assert_eq!(lex("# comment"), vec![]);
2127 assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2128 assert_eq!(
2129 lex("a # comment\nb"),
2130 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2131 );
2132 }
2133
2134 #[test]
2135 fn comments_preserved_when_requested() {
2136 let tokens = tokenize_with_comments("a # comment")
2137 .expect("should succeed")
2138 .into_iter()
2139 .map(|s| s.token)
2140 .collect::<Vec<_>>();
2141 assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2142 }
2143
2144 #[test]
2149 fn parse_simple_string() {
2150 assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2151 }
2152
2153 #[test]
2154 fn parse_string_with_escapes() {
2155 assert_eq!(
2156 parse_string_literal(r#""hello\nworld""#).expect("ok"),
2157 "hello\nworld"
2158 );
2159 assert_eq!(
2160 parse_string_literal(r#""tab\there""#).expect("ok"),
2161 "tab\there"
2162 );
2163 assert_eq!(
2164 parse_string_literal(r#""quote\"here""#).expect("ok"),
2165 "quote\"here"
2166 );
2167 }
2168
2169 #[test]
2170 fn parse_string_with_unicode() {
2171 assert_eq!(
2172 parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2173 "emoji ❤"
2174 );
2175 }
2176
2177 #[test]
2178 fn parse_string_with_escaped_dollar() {
2179 assert_eq!(
2182 parse_string_literal(r#""\$VAR""#).expect("ok"),
2183 "__KAISH_ESCAPED_DOLLAR__VAR"
2184 );
2185 assert_eq!(
2186 parse_string_literal(r#""cost: \$100""#).expect("ok"),
2187 "cost: __KAISH_ESCAPED_DOLLAR__100"
2188 );
2189 }
2190
2191 #[test]
2196 fn parse_simple_var() {
2197 assert_eq!(
2198 parse_var_ref("${X}").expect("ok"),
2199 vec!["X"]
2200 );
2201 }
2202
2203 #[test]
2204 fn parse_var_with_field() {
2205 assert_eq!(
2206 parse_var_ref("${VAR.field}").expect("ok"),
2207 vec!["VAR", "field"]
2208 );
2209 }
2210
2211 #[test]
2212 fn parse_var_with_index() {
2213 assert_eq!(
2214 parse_var_ref("${VAR[0]}").expect("ok"),
2215 vec!["VAR", "[0]"]
2216 );
2217 }
2218
2219 #[test]
2220 fn parse_var_nested() {
2221 assert_eq!(
2222 parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2223 vec!["VAR", "field", "[0]", "nested"]
2224 );
2225 }
2226
2227 #[test]
2228 fn parse_last_result() {
2229 assert_eq!(
2230 parse_var_ref("${?}").expect("ok"),
2231 vec!["?"]
2232 );
2233 assert_eq!(
2234 parse_var_ref("${?.ok}").expect("ok"),
2235 vec!["?", "ok"]
2236 );
2237 }
2238
2239 #[test]
2244 fn parse_integers() {
2245 assert_eq!(parse_int("0").expect("ok"), 0);
2246 assert_eq!(parse_int("42").expect("ok"), 42);
2247 assert_eq!(parse_int("-1").expect("ok"), -1);
2248 }
2249
2250 #[test]
2251 fn parse_floats() {
2252 assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2253 assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2254 }
2255
2256 #[test]
2261 fn empty_input() {
2262 assert_eq!(lex(""), vec![]);
2263 }
2264
2265 #[test]
2266 fn only_whitespace() {
2267 assert_eq!(lex(" \t\t "), vec![]);
2268 }
2269
2270 #[test]
2271 fn json_array() {
2272 assert_eq!(
2273 lex(r#"[1, 2, 3]"#),
2274 vec![
2275 Token::LBracket,
2276 Token::Int(1),
2277 Token::Comma,
2278 Token::Int(2),
2279 Token::Comma,
2280 Token::Int(3),
2281 Token::RBracket
2282 ]
2283 );
2284 }
2285
2286 #[test]
2287 fn json_object() {
2288 assert_eq!(
2289 lex(r#"{"key": "value"}"#),
2290 vec![
2291 Token::LBrace,
2292 Token::String("key".to_string()),
2293 Token::Colon,
2294 Token::String("value".to_string()),
2295 Token::RBrace
2296 ]
2297 );
2298 }
2299
2300 #[test]
2301 fn redirect_operators() {
2302 assert_eq!(
2303 lex("cmd > file"),
2304 vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2305 );
2306 assert_eq!(
2307 lex("cmd >> file"),
2308 vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2309 );
2310 assert_eq!(
2311 lex("cmd 2> err"),
2312 vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2313 );
2314 assert_eq!(
2315 lex("cmd &> all"),
2316 vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2317 );
2318 }
2319
2320 #[test]
2321 fn background_job() {
2322 assert_eq!(
2323 lex("cmd &"),
2324 vec![Token::Ident("cmd".to_string()), Token::Amp]
2325 );
2326 }
2327
2328 #[test]
2329 fn command_substitution() {
2330 assert_eq!(
2331 lex("$(cmd)"),
2332 vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2333 );
2334 assert_eq!(
2335 lex("$(cmd arg)"),
2336 vec![
2337 Token::CmdSubstStart,
2338 Token::Ident("cmd".to_string()),
2339 Token::Ident("arg".to_string()),
2340 Token::RParen
2341 ]
2342 );
2343 assert_eq!(
2344 lex("$(a | b)"),
2345 vec![
2346 Token::CmdSubstStart,
2347 Token::Ident("a".to_string()),
2348 Token::Pipe,
2349 Token::Ident("b".to_string()),
2350 Token::RParen
2351 ]
2352 );
2353 }
2354
2355 #[test]
2356 fn complex_pipeline() {
2357 assert_eq!(
2358 lex(r#"cat file | grep pattern="foo" | head count=10"#),
2359 vec![
2360 Token::Ident("cat".to_string()),
2361 Token::Ident("file".to_string()),
2362 Token::Pipe,
2363 Token::Ident("grep".to_string()),
2364 Token::Ident("pattern".to_string()),
2365 Token::Eq,
2366 Token::String("foo".to_string()),
2367 Token::Pipe,
2368 Token::Ident("head".to_string()),
2369 Token::Ident("count".to_string()),
2370 Token::Eq,
2371 Token::Int(10),
2372 ]
2373 );
2374 }
2375
2376 #[test]
2381 fn short_flag() {
2382 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2383 assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2384 assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2385 }
2386
2387 #[test]
2388 fn short_flag_combined() {
2389 assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2391 assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2392 }
2393
2394 #[test]
2395 fn long_flag() {
2396 assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2397 assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2398 assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2399 }
2400
2401 #[test]
2402 fn double_dash() {
2403 assert_eq!(lex("--"), vec![Token::DoubleDash]);
2405 }
2406
2407 #[test]
2408 fn flags_vs_negative_numbers() {
2409 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2411 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2413 assert_eq!(
2416 lex("-1 a"),
2417 vec![Token::Int(-1), Token::Ident("a".to_string())]
2418 );
2419 }
2420
2421 #[test]
2422 fn command_with_flags() {
2423 assert_eq!(
2424 lex("ls -l"),
2425 vec![
2426 Token::Ident("ls".to_string()),
2427 Token::ShortFlag("l".to_string()),
2428 ]
2429 );
2430 assert_eq!(
2431 lex("git commit -m"),
2432 vec![
2433 Token::Ident("git".to_string()),
2434 Token::Ident("commit".to_string()),
2435 Token::ShortFlag("m".to_string()),
2436 ]
2437 );
2438 assert_eq!(
2439 lex("git push --force"),
2440 vec![
2441 Token::Ident("git".to_string()),
2442 Token::Ident("push".to_string()),
2443 Token::LongFlag("force".to_string()),
2444 ]
2445 );
2446 }
2447
2448 #[test]
2449 fn flag_with_value() {
2450 assert_eq!(
2451 lex(r#"git commit -m "message""#),
2452 vec![
2453 Token::Ident("git".to_string()),
2454 Token::Ident("commit".to_string()),
2455 Token::ShortFlag("m".to_string()),
2456 Token::String("message".to_string()),
2457 ]
2458 );
2459 assert_eq!(
2460 lex(r#"--message="hello""#),
2461 vec![
2462 Token::LongFlag("message".to_string()),
2463 Token::Eq,
2464 Token::String("hello".to_string()),
2465 ]
2466 );
2467 }
2468
2469 #[test]
2470 fn end_of_flags_marker() {
2471 assert_eq!(
2472 lex("git checkout -- file"),
2473 vec![
2474 Token::Ident("git".to_string()),
2475 Token::Ident("checkout".to_string()),
2476 Token::DoubleDash,
2477 Token::Ident("file".to_string()),
2478 ]
2479 );
2480 }
2481
2482 #[test]
2487 fn local_keyword() {
2488 assert_eq!(lex("local"), vec![Token::Local]);
2489 assert_eq!(
2490 lex("local X = 5"),
2491 vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2492 );
2493 }
2494
2495 #[test]
2496 fn simple_var_ref() {
2497 assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2498 assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2499 assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2500 assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2501 }
2502
2503 #[test]
2504 fn simple_var_ref_in_command() {
2505 assert_eq!(
2506 lex("echo $NAME"),
2507 vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2508 );
2509 }
2510
2511 #[test]
2512 fn single_quoted_strings() {
2513 assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2514 assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2515 assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2516 assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2518 assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2519 }
2520
2521 #[test]
2522 fn test_brackets() {
2523 assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2525 assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2526 assert_eq!(
2527 lex("[[ -f file ]]"),
2528 vec![
2529 Token::LBracket,
2530 Token::LBracket,
2531 Token::ShortFlag("f".to_string()),
2532 Token::Ident("file".to_string()),
2533 Token::RBracket,
2534 Token::RBracket
2535 ]
2536 );
2537 }
2538
2539 #[test]
2540 fn test_expression_syntax() {
2541 assert_eq!(
2542 lex(r#"[[ $X == "value" ]]"#),
2543 vec![
2544 Token::LBracket,
2545 Token::LBracket,
2546 Token::SimpleVarRef("X".to_string()),
2547 Token::EqEq,
2548 Token::String("value".to_string()),
2549 Token::RBracket,
2550 Token::RBracket
2551 ]
2552 );
2553 }
2554
2555 #[test]
2556 fn bash_style_assignment() {
2557 assert_eq!(
2559 lex(r#"NAME="value""#),
2560 vec![
2561 Token::Ident("NAME".to_string()),
2562 Token::Eq,
2563 Token::String("value".to_string())
2564 ]
2565 );
2566 }
2567
2568 #[test]
2569 fn positional_params() {
2570 assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2571 assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2572 assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2573 assert_eq!(lex("$@"), vec![Token::AllArgs]);
2574 assert_eq!(lex("$#"), vec![Token::ArgCount]);
2575 }
2576
2577 #[test]
2578 fn positional_in_context() {
2579 assert_eq!(
2580 lex("echo $1 $2"),
2581 vec![
2582 Token::Ident("echo".to_string()),
2583 Token::Positional(1),
2584 Token::Positional(2),
2585 ]
2586 );
2587 }
2588
2589 #[test]
2590 fn var_length() {
2591 assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2592 assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2593 assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2594 }
2595
2596 #[test]
2597 fn var_length_in_context() {
2598 assert_eq!(
2599 lex("echo ${#NAME}"),
2600 vec![
2601 Token::Ident("echo".to_string()),
2602 Token::VarLength("NAME".to_string()),
2603 ]
2604 );
2605 }
2606
2607 #[test]
2612 fn plus_flag() {
2613 assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2615 assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2616 assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2617 }
2618
2619 #[test]
2620 fn set_with_plus_flag() {
2621 assert_eq!(
2622 lex("set +e"),
2623 vec![
2624 Token::Set,
2625 Token::PlusFlag("e".to_string()),
2626 ]
2627 );
2628 }
2629
2630 #[test]
2631 fn set_with_multiple_flags() {
2632 assert_eq!(
2633 lex("set -e -u"),
2634 vec![
2635 Token::Set,
2636 Token::ShortFlag("e".to_string()),
2637 Token::ShortFlag("u".to_string()),
2638 ]
2639 );
2640 }
2641
2642 #[test]
2643 fn flags_vs_negative_numbers_edge_cases() {
2644 assert_eq!(
2646 lex("-1 a"),
2647 vec![Token::Int(-1), Token::Ident("a".to_string())]
2648 );
2649 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2651 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2653 }
2654
2655 #[test]
2656 fn single_dash_is_minus_alone() {
2657 let result = tokenize("-").expect("should lex");
2659 assert_eq!(result.len(), 1);
2660 assert!(matches!(result[0].token, Token::MinusAlone));
2661 }
2662
2663 #[test]
2664 fn plus_bare_for_date_format() {
2665 let result = tokenize("+%s").expect("should lex");
2667 assert_eq!(result.len(), 1);
2668 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2669
2670 let result = tokenize("+%Y-%m-%d").expect("should lex");
2672 assert_eq!(result.len(), 1);
2673 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2674 }
2675
2676 #[test]
2677 fn plus_flag_still_works() {
2678 let result = tokenize("+e").expect("should lex");
2680 assert_eq!(result.len(), 1);
2681 assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2682 }
2683
2684 #[test]
2685 fn while_keyword_vs_while_loop() {
2686 assert_eq!(lex("while"), vec![Token::While]);
2688 assert_eq!(
2690 lex("while true"),
2691 vec![Token::While, Token::True]
2692 );
2693 }
2694
2695 #[test]
2696 fn control_flow_keywords() {
2697 assert_eq!(lex("break"), vec![Token::Break]);
2698 assert_eq!(lex("continue"), vec![Token::Continue]);
2699 assert_eq!(lex("return"), vec![Token::Return]);
2700 assert_eq!(lex("exit"), vec![Token::Exit]);
2701 }
2702
2703 #[test]
2704 fn control_flow_with_numbers() {
2705 assert_eq!(
2706 lex("break 2"),
2707 vec![Token::Break, Token::Int(2)]
2708 );
2709 assert_eq!(
2710 lex("continue 3"),
2711 vec![Token::Continue, Token::Int(3)]
2712 );
2713 assert_eq!(
2714 lex("exit 1"),
2715 vec![Token::Exit, Token::Int(1)]
2716 );
2717 }
2718
2719 #[test]
2724 fn heredoc_simple() {
2725 let source = "cat <<EOF\nhello\nworld\nEOF";
2726 let tokens = lex(source);
2727 assert_eq!(tokens, vec![
2728 Token::Ident("cat".to_string()),
2729 Token::HereDocStart,
2730 Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2731 Token::Newline,
2732 ]);
2733 }
2734
2735 #[test]
2736 fn heredoc_empty() {
2737 let source = "cat <<EOF\nEOF";
2738 let tokens = lex(source);
2739 assert_eq!(tokens, vec![
2740 Token::Ident("cat".to_string()),
2741 Token::HereDocStart,
2742 Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2743 Token::Newline,
2744 ]);
2745 }
2746
2747 #[test]
2748 fn heredoc_with_special_chars() {
2749 let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2750 let tokens = lex(source);
2751 assert_eq!(tokens, vec![
2752 Token::Ident("cat".to_string()),
2753 Token::HereDocStart,
2754 Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2755 Token::Newline,
2756 ]);
2757 }
2758
2759 #[test]
2760 fn heredoc_multiline() {
2761 let source = "cat <<END\nline1\nline2\nline3\nEND";
2762 let tokens = lex(source);
2763 assert_eq!(tokens, vec![
2764 Token::Ident("cat".to_string()),
2765 Token::HereDocStart,
2766 Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2767 Token::Newline,
2768 ]);
2769 }
2770
2771 #[test]
2772 fn heredoc_in_command() {
2773 let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2774 let tokens = lex(source);
2775 assert_eq!(tokens, vec![
2776 Token::Ident("cat".to_string()),
2777 Token::HereDocStart,
2778 Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2779 Token::Newline,
2780 Token::Ident("echo".to_string()),
2781 Token::Ident("goodbye".to_string()),
2782 ]);
2783 }
2784
2785 #[test]
2786 fn heredoc_strip_tabs() {
2787 let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2788 let tokens = lex(source);
2789 assert_eq!(tokens, vec![
2791 Token::Ident("cat".to_string()),
2792 Token::HereDocStart,
2793 Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2794 Token::Newline,
2795 ]);
2796 }
2797
2798 #[test]
2803 fn arithmetic_simple() {
2804 let source = "$((1 + 2))";
2805 let tokens = lex(source);
2806 assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2807 }
2808
2809 #[test]
2810 fn arithmetic_in_assignment() {
2811 let source = "X=$((5 * 3))";
2812 let tokens = lex(source);
2813 assert_eq!(tokens, vec![
2814 Token::Ident("X".to_string()),
2815 Token::Eq,
2816 Token::Arithmetic("5 * 3".to_string()),
2817 ]);
2818 }
2819
2820 #[test]
2821 fn arithmetic_with_nested_parens() {
2822 let source = "$((2 * (3 + 4)))";
2823 let tokens = lex(source);
2824 assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2825 }
2826
2827 #[test]
2828 fn arithmetic_with_variable() {
2829 let source = "$((X + 1))";
2830 let tokens = lex(source);
2831 assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2832 }
2833
2834 #[test]
2835 fn arithmetic_command_subst_not_confused() {
2836 let source = "$(echo hello)";
2838 let tokens = lex(source);
2839 assert_eq!(tokens, vec![
2840 Token::CmdSubstStart,
2841 Token::Ident("echo".to_string()),
2842 Token::Ident("hello".to_string()),
2843 Token::RParen,
2844 ]);
2845 }
2846
2847 #[test]
2848 fn arithmetic_nesting_limit() {
2849 let open_parens = "(".repeat(300);
2851 let close_parens = ")".repeat(300);
2852 let source = format!("$(({}1{}))", open_parens, close_parens);
2853 let result = tokenize(&source);
2854 assert!(result.is_err());
2855 let errors = result.unwrap_err();
2856 assert_eq!(errors.len(), 1);
2857 assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2858 }
2859
2860 #[test]
2861 fn arithmetic_nesting_within_limit() {
2862 let source = "$((((1 + 2) * 3)))";
2864 let tokens = lex(source);
2865 assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2866 }
2867
2868 #[test]
2873 fn token_categories() {
2874 assert_eq!(Token::If.category(), TokenCategory::Keyword);
2876 assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2877 assert_eq!(Token::For.category(), TokenCategory::Keyword);
2878 assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2879 assert_eq!(Token::True.category(), TokenCategory::Keyword);
2880 assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2881
2882 assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2884 assert_eq!(Token::And.category(), TokenCategory::Operator);
2885 assert_eq!(Token::Or.category(), TokenCategory::Operator);
2886 assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2887 assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2888
2889 assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2891 assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2892 assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2893
2894 assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2896 assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2897 assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2898
2899 assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2901 assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2902 assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2903 assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2904 assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2905 assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2906 assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2907
2908 assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2910 assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2911 assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2912 assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2913
2914 assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2916 assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2917 assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2918 assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2919
2920 assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2922
2923 assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2925
2926 assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2928
2929 assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2931 assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2932 assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2933 }
2934
2935 #[test]
2936 fn test_heredoc_piped_to_command() {
2937 let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2940 let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2941 let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2942 assert!(heredoc_pos.is_some(), "should have a heredoc token");
2943 assert!(pipe_pos.is_some(), "should have a pipe token");
2944 assert!(
2945 pipe_pos.unwrap() > heredoc_pos.unwrap(),
2946 "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2947 heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2948 );
2949 }
2950
2951 #[test]
2952 fn test_heredoc_standalone_still_works() {
2953 let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2955 assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2956 assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2957 }
2958
2959 #[test]
2960 fn test_heredoc_preserves_leading_empty_lines() {
2961 let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2963 let heredoc = tokens.iter().find_map(|t| {
2964 if let Token::HereDoc(data) = &t.token {
2965 Some(data.clone())
2966 } else {
2967 None
2968 }
2969 });
2970 assert!(heredoc.is_some(), "should have a heredoc token");
2971 let data = heredoc.unwrap();
2972 assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2973 assert_eq!(data.content, "\nhello");
2974 }
2975
2976 #[test]
2977 fn test_heredoc_quoted_delimiter_sets_literal() {
2978 let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2980 let heredoc = tokens.iter().find_map(|t| {
2981 if let Token::HereDoc(data) = &t.token {
2982 Some(data.clone())
2983 } else {
2984 None
2985 }
2986 });
2987 assert!(heredoc.is_some(), "should have a heredoc token");
2988 let data = heredoc.unwrap();
2989 assert!(data.literal, "quoted delimiter should set literal=true");
2990 assert_eq!(data.content, "hello $HOME");
2991 }
2992
2993 #[test]
2994 fn test_heredoc_unquoted_delimiter_not_literal() {
2995 let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2997 let heredoc = tokens.iter().find_map(|t| {
2998 if let Token::HereDoc(data) = &t.token {
2999 Some(data.clone())
3000 } else {
3001 None
3002 }
3003 });
3004 assert!(heredoc.is_some(), "should have a heredoc token");
3005 let data = heredoc.unwrap();
3006 assert!(!data.literal, "unquoted delimiter should have literal=false");
3007 }
3008
3009 #[test]
3014 fn colon_double_in_word() {
3015 assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
3016 }
3017
3018 #[test]
3019 fn colon_single_in_word() {
3020 assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
3021 }
3022
3023 #[test]
3024 fn colon_with_port() {
3025 assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
3026 }
3027
3028 #[test]
3029 fn colon_standalone() {
3030 assert_eq!(lex(":"), vec![Token::Colon]);
3031 }
3032
3033 #[test]
3034 fn colon_spaced_no_merge() {
3035 assert_eq!(
3036 lex("foo : bar"),
3037 vec![
3038 Token::Ident("foo".into()),
3039 Token::Colon,
3040 Token::Ident("bar".into()),
3041 ]
3042 );
3043 }
3044
3045 #[test]
3046 fn colon_in_command_arg() {
3047 assert_eq!(
3048 lex("echo foo::bar"),
3049 vec![
3050 Token::Ident("echo".into()),
3051 Token::Ident("foo::bar".into()),
3052 ]
3053 );
3054 }
3055
3056 #[test]
3057 fn colon_trailing() {
3058 assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
3060 }
3061
3062 #[test]
3063 fn colon_leading() {
3064 assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
3066 }
3067
3068 #[test]
3069 fn colon_with_path() {
3070 assert_eq!(
3072 lex("/usr/bin:8080"),
3073 vec![Token::Ident("/usr/bin:8080".into())]
3074 );
3075 }
3076}