1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
// Process-wide monotonically increasing counter folded into preprocessing
// marker ids (see `unique_marker_id`) so concurrent tokenize calls never
// collide on marker names.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

// Upper bound on `(`-nesting inside `$(( ... ))`; exceeding it aborts
// preprocessing with `LexerError::NestingTooDeep` instead of scanning
// pathological input forever.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Records one marker substitution performed by `preprocess_arithmetic`,
/// so token spans measured against the preprocessed text can be mapped
/// back to original-source coordinates by `correct_span`.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Byte offset of the marker within the preprocessed text.
    preprocessed_pos: usize,
    /// Byte length of the marker string that was inserted.
    marker_len: usize,
    /// Byte length of the original `$(( ... ))` region it replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 let pid = std::process::id();
82 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token (or error) paired with the byte range it occupies in the source.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    /// The wrapped value.
    pub token: T,
    /// Byte range in the (span-corrected) source text.
    pub span: Span,
}
91
92impl<T> Spanned<T> {
93 pub fn new(token: T, span: Span) -> Self {
94 Self { token, span }
95 }
96}
97
/// Errors reported during lexing; used as the `logos` error type, so the
/// `#[default]` variant is what logos produces when no pattern matches.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// No token pattern matched at this position (logos fallback).
    #[default]
    UnexpectedCharacter,
    /// A double-quoted string literal was not closed.
    UnterminatedString,
    /// A `${...}` variable reference was not closed.
    UnterminatedVarRef,
    /// Unrecognized or truncated escape sequence in a string literal.
    InvalidEscape,
    /// Numeric literal failed to parse.
    InvalidNumber,
    /// Non-lowercase spelling of `true`/`false` (carries the spelling seen).
    AmbiguousBoolean(String),
    /// `yes`/`no` in any casing (carries the spelling seen).
    AmbiguousBooleanLike(String),
    /// Identifier beginning with a digit (carries the offending text).
    InvalidNumberIdent(String),
    /// Float written without a leading digit, e.g. `.5`.
    InvalidFloatNoLeading,
    /// Float written without a trailing digit, e.g. `5.`.
    InvalidFloatNoTrailing,
    /// `(` nesting inside `$(( ... ))` exceeded `MAX_PAREN_DEPTH`.
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118 match self {
119 LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120 LexerError::UnterminatedString => write!(f, "unterminated string"),
121 LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122 LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123 LexerError::InvalidNumber => write!(f, "invalid number"),
124 LexerError::AmbiguousBoolean(s) => {
125 write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126 }
127 LexerError::AmbiguousBooleanLike(s) => {
128 let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129 write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130 }
131 LexerError::InvalidNumberIdent(s) => {
132 write!(f, "identifier cannot start with digit: {}", s)
133 }
134 LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135 LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136 LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137 }
138 }
139}
140
141#[derive(Logos, Debug, Clone, PartialEq)]
150#[logos(error = LexerError)]
151#[logos(skip r"[ \t]+")]
152pub enum Token {
153 #[token("set")]
157 Set,
158
159 #[token("local")]
160 Local,
161
162 #[token("if")]
163 If,
164
165 #[token("then")]
166 Then,
167
168 #[token("else")]
169 Else,
170
171 #[token("elif")]
172 Elif,
173
174 #[token("fi")]
175 Fi,
176
177 #[token("for")]
178 For,
179
180 #[token("while")]
181 While,
182
183 #[token("in")]
184 In,
185
186 #[token("do")]
187 Do,
188
189 #[token("done")]
190 Done,
191
192 #[token("case")]
193 Case,
194
195 #[token("esac")]
196 Esac,
197
198 #[token("function")]
199 Function,
200
201 #[token("break")]
202 Break,
203
204 #[token("continue")]
205 Continue,
206
207 #[token("return")]
208 Return,
209
210 #[token("exit")]
211 Exit,
212
213 #[token("true")]
214 True,
215
216 #[token("false")]
217 False,
218
219 #[token("string")]
223 TypeString,
224
225 #[token("int")]
226 TypeInt,
227
228 #[token("float")]
229 TypeFloat,
230
231 #[token("bool")]
232 TypeBool,
233
234 #[token("&&")]
238 And,
239
240 #[token("||")]
241 Or,
242
243 #[token("==")]
244 EqEq,
245
246 #[token("!=")]
247 NotEq,
248
249 #[token("=~")]
250 Match,
251
252 #[token("!~")]
253 NotMatch,
254
255 #[token(">=")]
256 GtEq,
257
258 #[token("<=")]
259 LtEq,
260
261 #[token(">>")]
262 GtGt,
263
264 #[token("2>&1")]
265 StderrToStdout,
266
267 #[token("1>&2")]
268 StdoutToStderr,
269
270 #[token(">&2")]
271 StdoutToStderr2,
272
273 #[token("2>")]
274 Stderr,
275
276 #[token("&>")]
277 Both,
278
279 #[token("<<")]
280 HereDocStart,
281
282 #[token(";;")]
283 DoubleSemi,
284
285 #[token("=")]
289 Eq,
290
291 #[token("|")]
292 Pipe,
293
294 #[token("&")]
295 Amp,
296
297 #[token(">")]
298 Gt,
299
300 #[token("<")]
301 Lt,
302
303 #[token(";")]
304 Semi,
305
306 #[token(":")]
307 Colon,
308
309 #[token(",")]
310 Comma,
311
312 #[token(".")]
313 Dot,
314
315 #[token("{")]
316 LBrace,
317
318 #[token("}")]
319 RBrace,
320
321 #[token("[")]
322 LBracket,
323
324 #[token("]")]
325 RBracket,
326
327 #[token("(")]
328 LParen,
329
330 #[token(")")]
331 RParen,
332
333 #[token("*")]
334 Star,
335
336 #[token("!")]
337 Bang,
338
339 #[token("?")]
340 Question,
341
342 Arithmetic(String),
349
350 #[token("$(")]
352 CmdSubstStart,
353
354 #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
360 LongFlag(String),
361
362 #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
364 ShortFlag(String),
365
366 #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
368 PlusFlag(String),
369
370 #[token("--")]
372 DoubleDash,
373
374 #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
377 PlusBare(String),
378
379 #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
383 MinusBare(String),
384
385 #[token("-")]
389 MinusAlone,
390
391 #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
397 String(String),
398
399 #[regex(r"'[^']*'", lex_single_string)]
401 SingleString(String),
402
403 #[regex(r"\$\{[^}]+\}", lex_varref)]
405 VarRef(String),
406
407 #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
409 SimpleVarRef(String),
410
411 #[regex(r"\$[0-9]", lex_positional)]
413 Positional(usize),
414
415 #[token("$@")]
417 AllArgs,
418
419 #[token("$#")]
421 ArgCount,
422
423 #[token("$?")]
425 LastExitCode,
426
427 #[token("$$")]
429 CurrentPid,
430
431 #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
433 VarLength(String),
434
435 HereDoc(String),
438
439 #[regex(r"-?[0-9]+", lex_int, priority = 2)]
441 Int(i64),
442
443 #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
445 Float(f64),
446
447 #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
453 InvalidNumberIdent,
454
455 #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
457 InvalidFloatNoLeading,
458
459 #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
462 InvalidFloatNoTrailing,
463
464 #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
470 Path(String),
471
472 #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
479 Ident(String),
480
481 #[regex(r"#[^\n\r]*", allow_greedy = true)]
487 Comment,
488
489 #[regex(r"\n|\r\n")]
491 Newline,
492
493 #[regex(r"\\[ \t]*(\n|\r\n)")]
495 LineContinuation,
496}
497
/// Coarse token classification (e.g. for syntax highlighting); produced by
/// `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Reserved words, boolean literals, and type names.
    Keyword,
    /// Logical, comparison, and redirection operators.
    Operator,
    /// String literals and heredoc bodies.
    String,
    /// Integer, float, and arithmetic-expression tokens.
    Number,
    /// Variable references and special parameters (`$@`, `$#`, ...).
    Variable,
    /// `#` comments.
    Comment,
    /// Delimiters, brackets, separators, newlines.
    Punctuation,
    /// Bare words treated as command names/arguments.
    Command,
    /// Absolute-path-like tokens.
    Path,
    /// `-x`/`--long`/`+x` flags and `--`.
    Flag,
    /// Malformed-number trap tokens.
    Error,
}
527
impl Token {
    /// Classifies this token into a coarse `TokenCategory`.
    ///
    /// The match is exhaustive on purpose: adding a `Token` variant forces
    /// a category decision here at compile time.
    pub fn category(&self) -> TokenCategory {
        match self {
            // Reserved words, boolean literals, and type names.
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Logical, comparison, assignment, and redirection operators.
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Arithmetic expressions count as numeric content.
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            // Variable references and special parameters.
            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            // Structural punctuation, including newlines and `$(`.
            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            Token::Comment => TokenCategory::Comment,

            Token::Path(_) => TokenCategory::Path,

            // Bare words (including `+...`/`-...` non-flag text) read as
            // command names or arguments.
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
641
/// Logos callback for double-quoted strings: decodes escapes via
/// `parse_string_literal` (the slice includes the surrounding quotes).
fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
    parse_string_literal(lex.slice())
}
646
647fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
649 let s = lex.slice();
650 s[1..s.len() - 1].to_string()
652}
653
/// Logos callback for `${...}` references; keeps the full text including
/// the `${` and `}` delimiters (parsed later by `parse_var_ref`).
fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}
659
/// Logos callback for `$name` references; strips the leading `$` and keeps
/// just the variable name.
fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}
665
666fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
668 lex.slice()[1..].parse().unwrap_or(0)
670}
671
/// Logos callback for `${#name}` length expressions; strips the 3-byte
/// `${#` prefix and the closing `}`, leaving just the variable name.
fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
    let s = lex.slice();
    s[3..s.len() - 1].to_string()
}
678
/// Logos callback for integer literals. Parse can still fail on overflow
/// (e.g. a literal beyond i64::MAX), mapped to `InvalidNumber`.
fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
}
683
/// Logos callback for float literals (`digits.digits`, optional sign).
fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
}
688
/// Trap callback: a digit-leading identifier (e.g. `1abc`) always errors,
/// carrying the offending text for the diagnostic.
fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
}
694
/// Trap callback: a float written without a leading digit (e.g. `.5`)
/// always errors with a targeted message.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}
700
/// Trap callback: a float written without a trailing digit (e.g. `5.`)
/// always errors with a targeted message.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
706
707fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
709 let s = lex.slice();
710
711 match s.to_lowercase().as_str() {
714 "true" | "false" if s != "true" && s != "false" => {
715 return Err(LexerError::AmbiguousBoolean(s.to_string()));
716 }
717 _ => {}
718 }
719
720 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
722 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
723 }
724
725 Ok(s.to_string())
726}
727
/// Logos callback for `--name` flags; strips the `--` prefix.
fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[2..].to_string()
}
733
/// Logos callback for `-x` flags; strips the `-` prefix.
fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}
739
/// Logos callback for `+x` flags; strips the `+` prefix.
fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}
745
/// Logos callback for `+`-leading non-flag text (e.g. `+5`); kept verbatim.
fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}
750
/// Logos callback for `-`-leading non-flag text; kept verbatim.
fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}
755
/// Logos callback for absolute-path-like tokens (start with `/`); verbatim.
fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}
760
/// Human-readable rendering: fixed keywords/operators print as their
/// source spelling; data-carrying tokens print in an uppercase
/// `NAME(value)` diagnostic form. Used for parser error messages and
/// debugging output — the exact strings are part of that surface.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            // Braces are doubled to escape the format-string syntax.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            // Flags re-attach the prefix their callbacks stripped.
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(s) => write!(f, "HEREDOC({:?})", s),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
857
impl Token {
    /// Structural keyword check.
    ///
    /// NOTE(review): `While`, `Break`, `Continue`, `Return`, and `Exit`
    /// are treated as keywords by `category()` but are absent here —
    /// confirm whether this asymmetry is intentional before relying on it.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            Token::Set
                | Token::Local
                | Token::If
                | Token::Then
                | Token::Else
                | Token::Elif
                | Token::Fi
                | Token::For
                | Token::In
                | Token::Do
                | Token::Done
                | Token::Case
                | Token::Esac
                | Token::Function
                | Token::True
                | Token::False
        )
    }

    /// True for the built-in type-name tokens (`string`/`int`/`float`/`bool`).
    pub fn is_type(&self) -> bool {
        matches!(
            self,
            Token::TypeString
                | Token::TypeInt
                | Token::TypeFloat
                | Token::TypeBool
        )
    }

    /// True for tokens that can begin a statement.
    ///
    /// NOTE(review): `While` is missing even though `If`/`For`/`Case` are
    /// present — verify against the parser's actual entry points.
    pub fn starts_statement(&self) -> bool {
        matches!(
            self,
            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
        )
    }

    /// True for tokens that can appear where a value/expression is expected
    /// (literals, variables, command substitution start, paths, `$?`, `$$`).
    pub fn is_value(&self) -> bool {
        matches!(
            self,
            Token::String(_)
                | Token::SingleString(_)
                | Token::HereDoc(_)
                | Token::Arithmetic(_)
                | Token::Int(_)
                | Token::Float(_)
                | Token::True
                | Token::False
                | Token::VarRef(_)
                | Token::SimpleVarRef(_)
                | Token::CmdSubstStart
                | Token::Path(_)
                | Token::LastExitCode
                | Token::CurrentPid
        )
    }
}
922
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    /// Source text with each `$(( expr ))` replaced by a unique marker.
    text: String,
    /// `(marker, expression)` pairs, in order of appearance.
    arithmetics: Vec<(String, String)>,
    /// Span bookkeeping so `correct_span` can map back to the original.
    replacements: Vec<SpanReplacement>,
}
932
/// Replaces every `$(( expr ))` in `source` with a unique marker
/// identifier, returning the rewritten text, the collected
/// `(marker, expr)` pairs, and the `SpanReplacement` records that
/// `correct_span` needs to map spans back to original coordinates.
///
/// Inner `(`-nesting is allowed up to `MAX_PAREN_DEPTH`; deeper nesting
/// returns `LexerError::NestingTooDeep`. An unterminated `$((` consumes
/// the rest of the input as the expression (no error is raised here).
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte position in `source`; tracked separately from `i` because `i`
    // counts chars while spans are measured in bytes.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Lookahead for the 3-char opener `$((`.
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Marker will be appended at this byte offset of the output.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip `$((` — three ASCII chars, so +3 bytes.
            i += 3;
            source_pos += 3;

            let mut expr = String::new();
            // Depth of parens *inside* the expression (the outer `((` is
            // not counted).
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner paren, stays in the expression.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // `))` at depth 0 terminates the substitution.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Lone `)` at depth 0: keep it as expression text.
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Byte length of the whole `$(( ... ))` region in the original.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1037
/// Extracts each heredoc body (`<<DELIM` ... `DELIM`) from `source`,
/// replacing it with `<<MARKER\n` so the logos pass never sees raw heredoc
/// content; returns the rewritten text plus `(marker, content)` pairs that
/// `tokenize` later splices back in as `Token::HereDoc`.
///
/// Supports `<<-` (strips leading tabs when matching the delimiter and in
/// captured lines' comparison) and quoted delimiters (`<<'EOF'`/`<<"EOF"`).
/// NOTE(review): unlike the arithmetic pass, no `SpanReplacement`s are
/// recorded, so spans after a heredoc are approximate.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '<' && chars.peek() == Some(&'<') {
            // Consume the second '<'; a following '-' selects tab-stripping.
            chars.next(); let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip blanks between `<<` and the delimiter word.
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Read the delimiter; a quoted delimiter ends at the matching
            // quote, an unquoted one at the first whitespace.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            // No delimiter: re-emit `<<` (and `-`) and treat it as plain
            // operator text. NOTE(review): blanks skipped above are lost.
            if delimiter.is_empty() {
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Copy the rest of the opener line through (e.g. redirections),
            // consuming but not emitting the line terminator.
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    result.push(ch);
                }
            }

            // Accumulate body lines until a line equal to the delimiter
            // (after optional tab-stripping) or end of input.
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        // NOTE(review): a leading empty line (both content
                        // and current_line empty) is dropped here — confirm
                        // that is intended.
                        if !content.is_empty() || !current_line.is_empty() {
                            content.push_str(&current_line);
                            content.push('\n');
                        }
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Normalize CRLF/CR line endings to '\n'.
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !content.is_empty() || !current_line.is_empty() {
                            content.push_str(&current_line);
                            content.push('\n');
                        }
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF: an unterminated heredoc keeps whatever body
                        // was collected so far (no error raised here).
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // NOTE(review): trims *all* trailing newlines, collapsing
            // intentional trailing blank lines — confirm acceptable.
            let content = content.trim_end_matches('\n').to_string();

            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content));

            // Emit `<<MARKER\n` so the lexer sees HereDocStart + Ident.
            result.push_str("<<");
            result.push_str(&marker);
            result.push('\n'); } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1202
/// Full tokenization pipeline: arithmetic preprocessing, heredoc
/// preprocessing, logos lexing, span correction, then resubstitution of
/// the preprocessing markers (`Token::Arithmetic`, `Token::HereDoc`, and
/// arithmetic markers embedded inside double-quoted strings).
///
/// `Comment` and `LineContinuation` tokens are filtered out. If any token
/// fails to lex, all errors are returned and no tokens are produced.
///
/// NOTE(review): only arithmetic replacements feed `correct_span`; the
/// heredoc rewrite records none, so spans located after a heredoc may
/// drift relative to the original source — confirm acceptable.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // A preprocessing failure (nesting too deep) is reported as a single
    // error spanning the whole input.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Map spans from preprocessed coordinates back to the original.
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Second pass: swap marker identifiers back to their real tokens.
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // An arithmetic marker lexed as an Ident becomes Token::Arithmetic.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
            && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
            final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
            i += 1;
            continue;
        }

        // `<<` followed by a heredoc marker becomes HereDocStart + HereDoc.
        if matches!(tokens[i].token, Token::HereDocStart) {
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                if let Some((_, content)) = heredocs.iter().find(|(marker, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                    final_tokens.push(Spanned::new(Token::HereDoc(content.clone()), tokens[i + 1].span.clone()));
                    i += 2;
                    continue;
                }
            }
        }

        // Arithmetic markers that ended up *inside* a double-quoted string
        // are rewritten to the `${__ARITH:expr__}` interpolation form.
        let token = if let Token::String(ref s) = tokens[i].token {
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(final_tokens)
}
1301
1302pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1306 let lexer = Token::lexer(source);
1307 let mut tokens = Vec::new();
1308 let mut errors = Vec::new();
1309
1310 for (result, span) in lexer.spanned() {
1311 match result {
1312 Ok(token) => {
1313 tokens.push(Spanned::new(token, span));
1314 }
1315 Err(err) => {
1316 errors.push(Spanned::new(err, span));
1317 }
1318 }
1319 }
1320
1321 if errors.is_empty() {
1322 Ok(tokens)
1323 } else {
1324 Err(errors)
1325 }
1326}
1327
/// Decodes a double-quoted string literal; `source` must include the
/// surrounding quotes.
///
/// Recognized escapes: `\n`, `\t`, `\r`, `\\`, `\"`, `\uXXXX` (exactly
/// four hex digits), and `\$` — the latter is rewritten to the internal
/// `__KAISH_ESCAPED_DOLLAR__` marker so later interpolation leaves it
/// alone. Unknown escapes are preserved verbatim (backslash plus the
/// following character); a trailing lone backslash is `InvalidEscape`.
pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
        return Err(LexerError::UnterminatedString);
    }

    let inner = &source[1..source.len() - 1];
    let mut result = String::with_capacity(inner.len());
    let mut chars = inner.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '\\' {
            match chars.next() {
                Some('n') => result.push('\n'),
                Some('t') => result.push('\t'),
                Some('r') => result.push('\r'),
                Some('\\') => result.push('\\'),
                Some('"') => result.push('"'),
                // Sentinel consumed by the interpolation stage.
                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
                Some('u') => {
                    // Exactly four hex digits; fewer is an error.
                    let mut hex = String::with_capacity(4);
                    for _ in 0..4 {
                        match chars.next() {
                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
                            _ => return Err(LexerError::InvalidEscape),
                        }
                    }
                    let codepoint = u32::from_str_radix(&hex, 16)
                        .map_err(|_| LexerError::InvalidEscape)?;
                    // Rejects surrogate halves / out-of-range codepoints.
                    let ch = char::from_u32(codepoint)
                        .ok_or(LexerError::InvalidEscape)?;
                    result.push(ch);
                }
                Some(next) => {
                    // Unknown escape: keep it verbatim.
                    result.push('\\');
                    result.push(next);
                }
                None => return Err(LexerError::InvalidEscape),
            }
        } else {
            result.push(ch);
        }
    }

    Ok(result)
}
1379
1380pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1383 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1385 return Err(LexerError::UnterminatedVarRef);
1386 }
1387
1388 let inner = &source[2..source.len() - 1];
1389
1390 if inner == "?" {
1392 return Ok(vec!["?".to_string()]);
1393 }
1394
1395 let mut segments = Vec::new();
1396 let mut current = String::new();
1397 let mut chars = inner.chars().peekable();
1398
1399 while let Some(ch) = chars.next() {
1400 match ch {
1401 '.' => {
1402 if !current.is_empty() {
1403 segments.push(current.clone());
1404 current.clear();
1405 }
1406 }
1407 '[' => {
1408 if !current.is_empty() {
1409 segments.push(current.clone());
1410 current.clear();
1411 }
1412 let mut index = String::from("[");
1414 while let Some(&c) = chars.peek() {
1415 if let Some(c) = chars.next() {
1416 index.push(c);
1417 }
1418 if c == ']' {
1419 break;
1420 }
1421 }
1422 segments.push(index);
1423 }
1424 _ => {
1425 current.push(ch);
1426 }
1427 }
1428 }
1429
1430 if !current.is_empty() {
1431 segments.push(current);
1432 }
1433
1434 Ok(segments)
1435}
1436
/// Parses an integer literal, mapping any failure (including overflow) to
/// `LexerError::InvalidNumber`.
pub fn parse_int(source: &str) -> Result<i64, LexerError> {
    source.parse().map_err(|_| LexerError::InvalidNumber)
}
1441
/// Parses a float literal, mapping any failure to
/// `LexerError::InvalidNumber`.
pub fn parse_float(source: &str) -> Result<f64, LexerError> {
    source.parse().map_err(|_| LexerError::InvalidNumber)
}
1446
1447#[cfg(test)]
1448mod tests {
1449 use super::*;
1450
1451 fn lex(source: &str) -> Vec<Token> {
1452 tokenize(source)
1453 .expect("lexer should succeed")
1454 .into_iter()
1455 .map(|s| s.token)
1456 .collect()
1457 }
1458
1459 #[test]
1464 fn keywords() {
1465 assert_eq!(lex("set"), vec![Token::Set]);
1466 assert_eq!(lex("if"), vec![Token::If]);
1467 assert_eq!(lex("then"), vec![Token::Then]);
1468 assert_eq!(lex("else"), vec![Token::Else]);
1469 assert_eq!(lex("elif"), vec![Token::Elif]);
1470 assert_eq!(lex("fi"), vec![Token::Fi]);
1471 assert_eq!(lex("for"), vec![Token::For]);
1472 assert_eq!(lex("in"), vec![Token::In]);
1473 assert_eq!(lex("do"), vec![Token::Do]);
1474 assert_eq!(lex("done"), vec![Token::Done]);
1475 assert_eq!(lex("case"), vec![Token::Case]);
1476 assert_eq!(lex("esac"), vec![Token::Esac]);
1477 assert_eq!(lex("function"), vec![Token::Function]);
1478 assert_eq!(lex("true"), vec![Token::True]);
1479 assert_eq!(lex("false"), vec![Token::False]);
1480 }
1481
1482 #[test]
1483 fn double_semicolon() {
1484 assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
1485 assert_eq!(lex("echo \"hi\";;"), vec![
1487 Token::Ident("echo".to_string()),
1488 Token::String("hi".to_string()),
1489 Token::DoubleSemi,
1490 ]);
1491 }
1492
1493 #[test]
1494 fn type_keywords() {
1495 assert_eq!(lex("string"), vec![Token::TypeString]);
1496 assert_eq!(lex("int"), vec![Token::TypeInt]);
1497 assert_eq!(lex("float"), vec![Token::TypeFloat]);
1498 assert_eq!(lex("bool"), vec![Token::TypeBool]);
1499 }
1500
1501 #[test]
1506 fn single_char_operators() {
1507 assert_eq!(lex("="), vec![Token::Eq]);
1508 assert_eq!(lex("|"), vec![Token::Pipe]);
1509 assert_eq!(lex("&"), vec![Token::Amp]);
1510 assert_eq!(lex(">"), vec![Token::Gt]);
1511 assert_eq!(lex("<"), vec![Token::Lt]);
1512 assert_eq!(lex(";"), vec![Token::Semi]);
1513 assert_eq!(lex(":"), vec![Token::Colon]);
1514 assert_eq!(lex(","), vec![Token::Comma]);
1515 assert_eq!(lex("."), vec![Token::Dot]);
1516 }
1517
1518 #[test]
1519 fn multi_char_operators() {
1520 assert_eq!(lex("&&"), vec![Token::And]);
1521 assert_eq!(lex("||"), vec![Token::Or]);
1522 assert_eq!(lex("=="), vec![Token::EqEq]);
1523 assert_eq!(lex("!="), vec![Token::NotEq]);
1524 assert_eq!(lex("=~"), vec![Token::Match]);
1525 assert_eq!(lex("!~"), vec![Token::NotMatch]);
1526 assert_eq!(lex(">="), vec![Token::GtEq]);
1527 assert_eq!(lex("<="), vec![Token::LtEq]);
1528 assert_eq!(lex(">>"), vec![Token::GtGt]);
1529 assert_eq!(lex("2>"), vec![Token::Stderr]);
1530 assert_eq!(lex("&>"), vec![Token::Both]);
1531 }
1532
1533 #[test]
1534 fn brackets() {
1535 assert_eq!(lex("{"), vec![Token::LBrace]);
1536 assert_eq!(lex("}"), vec![Token::RBrace]);
1537 assert_eq!(lex("["), vec![Token::LBracket]);
1538 assert_eq!(lex("]"), vec![Token::RBracket]);
1539 assert_eq!(lex("("), vec![Token::LParen]);
1540 assert_eq!(lex(")"), vec![Token::RParen]);
1541 }
1542
1543 #[test]
1548 fn integers() {
1549 assert_eq!(lex("0"), vec![Token::Int(0)]);
1550 assert_eq!(lex("42"), vec![Token::Int(42)]);
1551 assert_eq!(lex("-1"), vec![Token::Int(-1)]);
1552 assert_eq!(lex("999999"), vec![Token::Int(999999)]);
1553 }
1554
1555 #[test]
1556 fn floats() {
1557 assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
1558 assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
1559 assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
1560 }
1561
1562 #[test]
1563 fn strings() {
1564 assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1565 assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1566 assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1568 assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1569 }
1570
1571 #[test]
1572 fn var_refs() {
1573 assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
1574 assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
1575 assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
1576 assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
1577 assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
1578 }
1579
1580 #[test]
1585 fn identifiers() {
1586 assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
1587 assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
1588 assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
1589 assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
1590 assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
1591 }
1592
1593 #[test]
1594 fn keyword_prefix_identifiers() {
1595 assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
1597 assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
1598 assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
1599 assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
1600 assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
1601 }
1602
1603 #[test]
1608 fn assignment() {
1609 assert_eq!(
1610 lex("set X = 5"),
1611 vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
1612 );
1613 }
1614
1615 #[test]
1616 fn command_simple() {
1617 assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
1618 assert_eq!(
1619 lex(r#"echo "hello""#),
1620 vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
1621 );
1622 }
1623
1624 #[test]
1625 fn command_with_args() {
1626 assert_eq!(
1627 lex("cmd arg1 arg2"),
1628 vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
1629 );
1630 }
1631
1632 #[test]
1633 fn command_with_named_args() {
1634 assert_eq!(
1635 lex("cmd key=value"),
1636 vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
1637 );
1638 }
1639
1640 #[test]
1641 fn pipeline() {
1642 assert_eq!(
1643 lex("a | b | c"),
1644 vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
1645 );
1646 }
1647
1648 #[test]
1649 fn if_statement() {
1650 assert_eq!(
1651 lex("if true; then echo; fi"),
1652 vec![
1653 Token::If,
1654 Token::True,
1655 Token::Semi,
1656 Token::Then,
1657 Token::Ident("echo".to_string()),
1658 Token::Semi,
1659 Token::Fi
1660 ]
1661 );
1662 }
1663
1664 #[test]
1665 fn for_loop() {
1666 assert_eq!(
1667 lex("for X in items; do echo; done"),
1668 vec![
1669 Token::For,
1670 Token::Ident("X".to_string()),
1671 Token::In,
1672 Token::Ident("items".to_string()),
1673 Token::Semi,
1674 Token::Do,
1675 Token::Ident("echo".to_string()),
1676 Token::Semi,
1677 Token::Done
1678 ]
1679 );
1680 }
1681
1682 #[test]
1687 fn whitespace_ignored() {
1688 assert_eq!(lex(" set X = 5 "), lex("set X = 5"));
1689 }
1690
1691 #[test]
1692 fn newlines_preserved() {
1693 let tokens = lex("a\nb");
1694 assert_eq!(
1695 tokens,
1696 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1697 );
1698 }
1699
1700 #[test]
1701 fn multiple_newlines() {
1702 let tokens = lex("a\n\n\nb");
1703 assert_eq!(
1704 tokens,
1705 vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
1706 );
1707 }
1708
1709 #[test]
1714 fn comments_skipped() {
1715 assert_eq!(lex("# comment"), vec![]);
1716 assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
1717 assert_eq!(
1718 lex("a # comment\nb"),
1719 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1720 );
1721 }
1722
1723 #[test]
1724 fn comments_preserved_when_requested() {
1725 let tokens = tokenize_with_comments("a # comment")
1726 .expect("should succeed")
1727 .into_iter()
1728 .map(|s| s.token)
1729 .collect::<Vec<_>>();
1730 assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
1731 }
1732
1733 #[test]
1738 fn parse_simple_string() {
1739 assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
1740 }
1741
1742 #[test]
1743 fn parse_string_with_escapes() {
1744 assert_eq!(
1745 parse_string_literal(r#""hello\nworld""#).expect("ok"),
1746 "hello\nworld"
1747 );
1748 assert_eq!(
1749 parse_string_literal(r#""tab\there""#).expect("ok"),
1750 "tab\there"
1751 );
1752 assert_eq!(
1753 parse_string_literal(r#""quote\"here""#).expect("ok"),
1754 "quote\"here"
1755 );
1756 }
1757
1758 #[test]
1759 fn parse_string_with_unicode() {
1760 assert_eq!(
1761 parse_string_literal(r#""emoji \u2764""#).expect("ok"),
1762 "emoji ❤"
1763 );
1764 }
1765
1766 #[test]
1767 fn parse_string_with_escaped_dollar() {
1768 assert_eq!(
1771 parse_string_literal(r#""\$VAR""#).expect("ok"),
1772 "__KAISH_ESCAPED_DOLLAR__VAR"
1773 );
1774 assert_eq!(
1775 parse_string_literal(r#""cost: \$100""#).expect("ok"),
1776 "cost: __KAISH_ESCAPED_DOLLAR__100"
1777 );
1778 }
1779
1780 #[test]
1785 fn parse_simple_var() {
1786 assert_eq!(
1787 parse_var_ref("${X}").expect("ok"),
1788 vec!["X"]
1789 );
1790 }
1791
1792 #[test]
1793 fn parse_var_with_field() {
1794 assert_eq!(
1795 parse_var_ref("${VAR.field}").expect("ok"),
1796 vec!["VAR", "field"]
1797 );
1798 }
1799
1800 #[test]
1801 fn parse_var_with_index() {
1802 assert_eq!(
1803 parse_var_ref("${VAR[0]}").expect("ok"),
1804 vec!["VAR", "[0]"]
1805 );
1806 }
1807
1808 #[test]
1809 fn parse_var_nested() {
1810 assert_eq!(
1811 parse_var_ref("${VAR.field[0].nested}").expect("ok"),
1812 vec!["VAR", "field", "[0]", "nested"]
1813 );
1814 }
1815
1816 #[test]
1817 fn parse_last_result() {
1818 assert_eq!(
1819 parse_var_ref("${?}").expect("ok"),
1820 vec!["?"]
1821 );
1822 assert_eq!(
1823 parse_var_ref("${?.ok}").expect("ok"),
1824 vec!["?", "ok"]
1825 );
1826 }
1827
1828 #[test]
1833 fn parse_integers() {
1834 assert_eq!(parse_int("0").expect("ok"), 0);
1835 assert_eq!(parse_int("42").expect("ok"), 42);
1836 assert_eq!(parse_int("-1").expect("ok"), -1);
1837 }
1838
1839 #[test]
1840 fn parse_floats() {
1841 assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
1842 assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
1843 }
1844
1845 #[test]
1850 fn empty_input() {
1851 assert_eq!(lex(""), vec![]);
1852 }
1853
1854 #[test]
1855 fn only_whitespace() {
1856 assert_eq!(lex(" \t\t "), vec![]);
1857 }
1858
1859 #[test]
1860 fn json_array() {
1861 assert_eq!(
1862 lex(r#"[1, 2, 3]"#),
1863 vec![
1864 Token::LBracket,
1865 Token::Int(1),
1866 Token::Comma,
1867 Token::Int(2),
1868 Token::Comma,
1869 Token::Int(3),
1870 Token::RBracket
1871 ]
1872 );
1873 }
1874
1875 #[test]
1876 fn json_object() {
1877 assert_eq!(
1878 lex(r#"{"key": "value"}"#),
1879 vec![
1880 Token::LBrace,
1881 Token::String("key".to_string()),
1882 Token::Colon,
1883 Token::String("value".to_string()),
1884 Token::RBrace
1885 ]
1886 );
1887 }
1888
1889 #[test]
1890 fn redirect_operators() {
1891 assert_eq!(
1892 lex("cmd > file"),
1893 vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
1894 );
1895 assert_eq!(
1896 lex("cmd >> file"),
1897 vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
1898 );
1899 assert_eq!(
1900 lex("cmd 2> err"),
1901 vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
1902 );
1903 assert_eq!(
1904 lex("cmd &> all"),
1905 vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
1906 );
1907 }
1908
1909 #[test]
1910 fn background_job() {
1911 assert_eq!(
1912 lex("cmd &"),
1913 vec![Token::Ident("cmd".to_string()), Token::Amp]
1914 );
1915 }
1916
1917 #[test]
1918 fn command_substitution() {
1919 assert_eq!(
1920 lex("$(cmd)"),
1921 vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
1922 );
1923 assert_eq!(
1924 lex("$(cmd arg)"),
1925 vec![
1926 Token::CmdSubstStart,
1927 Token::Ident("cmd".to_string()),
1928 Token::Ident("arg".to_string()),
1929 Token::RParen
1930 ]
1931 );
1932 assert_eq!(
1933 lex("$(a | b)"),
1934 vec![
1935 Token::CmdSubstStart,
1936 Token::Ident("a".to_string()),
1937 Token::Pipe,
1938 Token::Ident("b".to_string()),
1939 Token::RParen
1940 ]
1941 );
1942 }
1943
1944 #[test]
1945 fn complex_pipeline() {
1946 assert_eq!(
1947 lex(r#"cat file | grep pattern="foo" | head count=10"#),
1948 vec![
1949 Token::Ident("cat".to_string()),
1950 Token::Ident("file".to_string()),
1951 Token::Pipe,
1952 Token::Ident("grep".to_string()),
1953 Token::Ident("pattern".to_string()),
1954 Token::Eq,
1955 Token::String("foo".to_string()),
1956 Token::Pipe,
1957 Token::Ident("head".to_string()),
1958 Token::Ident("count".to_string()),
1959 Token::Eq,
1960 Token::Int(10),
1961 ]
1962 );
1963 }
1964
1965 #[test]
1970 fn short_flag() {
1971 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
1972 assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
1973 assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
1974 }
1975
1976 #[test]
1977 fn short_flag_combined() {
1978 assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
1980 assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
1981 }
1982
1983 #[test]
1984 fn long_flag() {
1985 assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
1986 assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
1987 assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
1988 }
1989
1990 #[test]
1991 fn double_dash() {
1992 assert_eq!(lex("--"), vec![Token::DoubleDash]);
1994 }
1995
1996 #[test]
1997 fn flags_vs_negative_numbers() {
1998 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2000 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2002 assert_eq!(
2005 lex("-1 a"),
2006 vec![Token::Int(-1), Token::Ident("a".to_string())]
2007 );
2008 }
2009
2010 #[test]
2011 fn command_with_flags() {
2012 assert_eq!(
2013 lex("ls -l"),
2014 vec![
2015 Token::Ident("ls".to_string()),
2016 Token::ShortFlag("l".to_string()),
2017 ]
2018 );
2019 assert_eq!(
2020 lex("git commit -m"),
2021 vec![
2022 Token::Ident("git".to_string()),
2023 Token::Ident("commit".to_string()),
2024 Token::ShortFlag("m".to_string()),
2025 ]
2026 );
2027 assert_eq!(
2028 lex("git push --force"),
2029 vec![
2030 Token::Ident("git".to_string()),
2031 Token::Ident("push".to_string()),
2032 Token::LongFlag("force".to_string()),
2033 ]
2034 );
2035 }
2036
2037 #[test]
2038 fn flag_with_value() {
2039 assert_eq!(
2040 lex(r#"git commit -m "message""#),
2041 vec![
2042 Token::Ident("git".to_string()),
2043 Token::Ident("commit".to_string()),
2044 Token::ShortFlag("m".to_string()),
2045 Token::String("message".to_string()),
2046 ]
2047 );
2048 assert_eq!(
2049 lex(r#"--message="hello""#),
2050 vec![
2051 Token::LongFlag("message".to_string()),
2052 Token::Eq,
2053 Token::String("hello".to_string()),
2054 ]
2055 );
2056 }
2057
2058 #[test]
2059 fn end_of_flags_marker() {
2060 assert_eq!(
2061 lex("git checkout -- file"),
2062 vec![
2063 Token::Ident("git".to_string()),
2064 Token::Ident("checkout".to_string()),
2065 Token::DoubleDash,
2066 Token::Ident("file".to_string()),
2067 ]
2068 );
2069 }
2070
2071 #[test]
2076 fn local_keyword() {
2077 assert_eq!(lex("local"), vec![Token::Local]);
2078 assert_eq!(
2079 lex("local X = 5"),
2080 vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2081 );
2082 }
2083
2084 #[test]
2085 fn simple_var_ref() {
2086 assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2087 assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2088 assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2089 assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2090 }
2091
2092 #[test]
2093 fn simple_var_ref_in_command() {
2094 assert_eq!(
2095 lex("echo $NAME"),
2096 vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2097 );
2098 }
2099
2100 #[test]
2101 fn single_quoted_strings() {
2102 assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2103 assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2104 assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2105 assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2107 assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2108 }
2109
2110 #[test]
2111 fn test_brackets() {
2112 assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2114 assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2115 assert_eq!(
2116 lex("[[ -f file ]]"),
2117 vec![
2118 Token::LBracket,
2119 Token::LBracket,
2120 Token::ShortFlag("f".to_string()),
2121 Token::Ident("file".to_string()),
2122 Token::RBracket,
2123 Token::RBracket
2124 ]
2125 );
2126 }
2127
2128 #[test]
2129 fn test_expression_syntax() {
2130 assert_eq!(
2131 lex(r#"[[ $X == "value" ]]"#),
2132 vec![
2133 Token::LBracket,
2134 Token::LBracket,
2135 Token::SimpleVarRef("X".to_string()),
2136 Token::EqEq,
2137 Token::String("value".to_string()),
2138 Token::RBracket,
2139 Token::RBracket
2140 ]
2141 );
2142 }
2143
2144 #[test]
2145 fn bash_style_assignment() {
2146 assert_eq!(
2148 lex(r#"NAME="value""#),
2149 vec![
2150 Token::Ident("NAME".to_string()),
2151 Token::Eq,
2152 Token::String("value".to_string())
2153 ]
2154 );
2155 }
2156
2157 #[test]
2158 fn positional_params() {
2159 assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2160 assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2161 assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2162 assert_eq!(lex("$@"), vec![Token::AllArgs]);
2163 assert_eq!(lex("$#"), vec![Token::ArgCount]);
2164 }
2165
2166 #[test]
2167 fn positional_in_context() {
2168 assert_eq!(
2169 lex("echo $1 $2"),
2170 vec![
2171 Token::Ident("echo".to_string()),
2172 Token::Positional(1),
2173 Token::Positional(2),
2174 ]
2175 );
2176 }
2177
2178 #[test]
2179 fn var_length() {
2180 assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2181 assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2182 assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2183 }
2184
2185 #[test]
2186 fn var_length_in_context() {
2187 assert_eq!(
2188 lex("echo ${#NAME}"),
2189 vec![
2190 Token::Ident("echo".to_string()),
2191 Token::VarLength("NAME".to_string()),
2192 ]
2193 );
2194 }
2195
2196 #[test]
2201 fn plus_flag() {
2202 assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2204 assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2205 assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2206 }
2207
2208 #[test]
2209 fn set_with_plus_flag() {
2210 assert_eq!(
2211 lex("set +e"),
2212 vec![
2213 Token::Set,
2214 Token::PlusFlag("e".to_string()),
2215 ]
2216 );
2217 }
2218
2219 #[test]
2220 fn set_with_multiple_flags() {
2221 assert_eq!(
2222 lex("set -e -u"),
2223 vec![
2224 Token::Set,
2225 Token::ShortFlag("e".to_string()),
2226 Token::ShortFlag("u".to_string()),
2227 ]
2228 );
2229 }
2230
2231 #[test]
2232 fn flags_vs_negative_numbers_edge_cases() {
2233 assert_eq!(
2235 lex("-1 a"),
2236 vec![Token::Int(-1), Token::Ident("a".to_string())]
2237 );
2238 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2240 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2242 }
2243
2244 #[test]
2245 fn single_dash_is_minus_alone() {
2246 let result = tokenize("-").expect("should lex");
2248 assert_eq!(result.len(), 1);
2249 assert!(matches!(result[0].token, Token::MinusAlone));
2250 }
2251
2252 #[test]
2253 fn plus_bare_for_date_format() {
2254 let result = tokenize("+%s").expect("should lex");
2256 assert_eq!(result.len(), 1);
2257 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2258
2259 let result = tokenize("+%Y-%m-%d").expect("should lex");
2261 assert_eq!(result.len(), 1);
2262 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2263 }
2264
2265 #[test]
2266 fn plus_flag_still_works() {
2267 let result = tokenize("+e").expect("should lex");
2269 assert_eq!(result.len(), 1);
2270 assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2271 }
2272
2273 #[test]
2274 fn while_keyword_vs_while_loop() {
2275 assert_eq!(lex("while"), vec![Token::While]);
2277 assert_eq!(
2279 lex("while true"),
2280 vec![Token::While, Token::True]
2281 );
2282 }
2283
2284 #[test]
2285 fn control_flow_keywords() {
2286 assert_eq!(lex("break"), vec![Token::Break]);
2287 assert_eq!(lex("continue"), vec![Token::Continue]);
2288 assert_eq!(lex("return"), vec![Token::Return]);
2289 assert_eq!(lex("exit"), vec![Token::Exit]);
2290 }
2291
2292 #[test]
2293 fn control_flow_with_numbers() {
2294 assert_eq!(
2295 lex("break 2"),
2296 vec![Token::Break, Token::Int(2)]
2297 );
2298 assert_eq!(
2299 lex("continue 3"),
2300 vec![Token::Continue, Token::Int(3)]
2301 );
2302 assert_eq!(
2303 lex("exit 1"),
2304 vec![Token::Exit, Token::Int(1)]
2305 );
2306 }
2307
2308 #[test]
2313 fn heredoc_simple() {
2314 let source = "cat <<EOF\nhello\nworld\nEOF";
2315 let tokens = lex(source);
2316 assert_eq!(tokens, vec![
2317 Token::Ident("cat".to_string()),
2318 Token::HereDocStart,
2319 Token::HereDoc("hello\nworld".to_string()),
2320 Token::Newline,
2321 ]);
2322 }
2323
2324 #[test]
2325 fn heredoc_empty() {
2326 let source = "cat <<EOF\nEOF";
2327 let tokens = lex(source);
2328 assert_eq!(tokens, vec![
2329 Token::Ident("cat".to_string()),
2330 Token::HereDocStart,
2331 Token::HereDoc("".to_string()),
2332 Token::Newline,
2333 ]);
2334 }
2335
2336 #[test]
2337 fn heredoc_with_special_chars() {
2338 let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2339 let tokens = lex(source);
2340 assert_eq!(tokens, vec![
2341 Token::Ident("cat".to_string()),
2342 Token::HereDocStart,
2343 Token::HereDoc("$VAR and \"quoted\" 'single'".to_string()),
2344 Token::Newline,
2345 ]);
2346 }
2347
2348 #[test]
2349 fn heredoc_multiline() {
2350 let source = "cat <<END\nline1\nline2\nline3\nEND";
2351 let tokens = lex(source);
2352 assert_eq!(tokens, vec![
2353 Token::Ident("cat".to_string()),
2354 Token::HereDocStart,
2355 Token::HereDoc("line1\nline2\nline3".to_string()),
2356 Token::Newline,
2357 ]);
2358 }
2359
2360 #[test]
2361 fn heredoc_in_command() {
2362 let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2363 let tokens = lex(source);
2364 assert_eq!(tokens, vec![
2365 Token::Ident("cat".to_string()),
2366 Token::HereDocStart,
2367 Token::HereDoc("hello".to_string()),
2368 Token::Newline,
2369 Token::Ident("echo".to_string()),
2370 Token::Ident("goodbye".to_string()),
2371 ]);
2372 }
2373
2374 #[test]
2375 fn heredoc_strip_tabs() {
2376 let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2377 let tokens = lex(source);
2378 assert_eq!(tokens, vec![
2380 Token::Ident("cat".to_string()),
2381 Token::HereDocStart,
2382 Token::HereDoc("\thello\n\tworld".to_string()),
2383 Token::Newline,
2384 ]);
2385 }
2386
2387 #[test]
2392 fn arithmetic_simple() {
2393 let source = "$((1 + 2))";
2394 let tokens = lex(source);
2395 assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2396 }
2397
2398 #[test]
2399 fn arithmetic_in_assignment() {
2400 let source = "X=$((5 * 3))";
2401 let tokens = lex(source);
2402 assert_eq!(tokens, vec![
2403 Token::Ident("X".to_string()),
2404 Token::Eq,
2405 Token::Arithmetic("5 * 3".to_string()),
2406 ]);
2407 }
2408
2409 #[test]
2410 fn arithmetic_with_nested_parens() {
2411 let source = "$((2 * (3 + 4)))";
2412 let tokens = lex(source);
2413 assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2414 }
2415
2416 #[test]
2417 fn arithmetic_with_variable() {
2418 let source = "$((X + 1))";
2419 let tokens = lex(source);
2420 assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2421 }
2422
2423 #[test]
2424 fn arithmetic_command_subst_not_confused() {
2425 let source = "$(echo hello)";
2427 let tokens = lex(source);
2428 assert_eq!(tokens, vec![
2429 Token::CmdSubstStart,
2430 Token::Ident("echo".to_string()),
2431 Token::Ident("hello".to_string()),
2432 Token::RParen,
2433 ]);
2434 }
2435
2436 #[test]
2437 fn arithmetic_nesting_limit() {
2438 let open_parens = "(".repeat(300);
2440 let close_parens = ")".repeat(300);
2441 let source = format!("$(({}1{}))", open_parens, close_parens);
2442 let result = tokenize(&source);
2443 assert!(result.is_err());
2444 let errors = result.unwrap_err();
2445 assert_eq!(errors.len(), 1);
2446 assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2447 }
2448
2449 #[test]
2450 fn arithmetic_nesting_within_limit() {
2451 let source = "$((((1 + 2) * 3)))";
2453 let tokens = lex(source);
2454 assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2455 }
2456
2457 #[test]
2462 fn token_categories() {
2463 assert_eq!(Token::If.category(), TokenCategory::Keyword);
2465 assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2466 assert_eq!(Token::For.category(), TokenCategory::Keyword);
2467 assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2468 assert_eq!(Token::True.category(), TokenCategory::Keyword);
2469 assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2470
2471 assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2473 assert_eq!(Token::And.category(), TokenCategory::Operator);
2474 assert_eq!(Token::Or.category(), TokenCategory::Operator);
2475 assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2476 assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2477
2478 assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2480 assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2481 assert_eq!(Token::HereDoc("test".to_string()).category(), TokenCategory::String);
2482
2483 assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2485 assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2486 assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2487
2488 assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2490 assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2491 assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2492 assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2493 assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2494 assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2495 assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2496
2497 assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2499 assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2500 assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2501 assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2502
2503 assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2505 assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2506 assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2507 assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2508
2509 assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2511
2512 assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2514
2515 assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2517
2518 assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2520 assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2521 assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2522 }
2523}