1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Process-wide counter that `unique_marker_id` increments on every call, so
/// that two ids generated within the same nanosecond still differ.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);
23
/// Maximum parenthesis nesting accepted inside a `$((...))` expression.
/// `preprocess_arithmetic` returns `LexerError::NestingTooDeep` once exceeded.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Bookkeeping for one marker substitution made during preprocessing, used by
/// `correct_span` to translate spans in the preprocessed text back to the
/// original source.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Byte offset in the preprocessed text at which the marker starts.
    preprocessed_pos: usize,
    /// Byte length of the marker that was inserted.
    marker_len: usize,
    /// Byte length of the original source text that the marker replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 #[cfg(target_os = "wasi")]
82 let pid = 0u32;
83 #[cfg(not(target_os = "wasi"))]
84 let pid = std::process::id();
85 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
86}
87
/// A value paired with the source span it was produced from.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    /// The wrapped value (a token, or an error in `preprocess_heredocs`).
    pub token: T,
    /// Byte range associated with `token`.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs `token` with `span`.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
100
/// Errors reported while lexing. The user-facing wording lives in the
/// `Display` implementation.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Default variant; presumably what logos reports for input that matches
    /// no token pattern (`#[logos(error = LexerError)]`) — confirm.
    #[default]
    UnexpectedCharacter,
    /// A string literal was not closed.
    UnterminatedString,
    /// A variable reference was not closed.
    UnterminatedVarRef,
    /// An invalid escape sequence was found.
    InvalidEscape,
    /// Numeric text could not be parsed (returned by `lex_int` / `lex_float`).
    InvalidNumber,
    /// `true` / `false` written in a non-lowercase spelling; carries the
    /// spelling as written.
    AmbiguousBoolean(String),
    /// `yes` / `no` in any ASCII case; carries the spelling as written.
    AmbiguousBooleanLike(String),
    /// Float without a leading digit, e.g. `.5`.
    InvalidFloatNoLeading,
    /// Float without a trailing digit, e.g. `5.`.
    InvalidFloatNoTrailing,
    /// Parenthesis nesting exceeded `MAX_PAREN_DEPTH`.
    NestingTooDeep,
    /// A heredoc body reached end of input before its closing delimiter line.
    UnterminatedHeredoc { delimiter: String },
}
121
122impl fmt::Display for LexerError {
123 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124 match self {
125 LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
126 LexerError::UnterminatedString => write!(f, "unterminated string"),
127 LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
128 LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
129 LexerError::InvalidNumber => write!(f, "invalid number"),
130 LexerError::AmbiguousBoolean(s) => {
131 write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
132 }
133 LexerError::AmbiguousBooleanLike(s) => {
134 let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
135 write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
136 }
137 LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
138 LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
139 LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
140 LexerError::UnterminatedHeredoc { delimiter } => {
141 write!(f, "unterminated heredoc, expected closing delimiter `{}` on its own line", delimiter)
142 }
143 }
144 }
145}
146
/// Payload carried by `Token::HereDoc`.
///
/// NOTE(review): the fields mirror `HeredocReplacement`, so this is presumably
/// filled from the output of `preprocess_heredocs` — the conversion is not in
/// this part of the file; confirm.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// Heredoc body text.
    pub content: String,
    /// Presumably true when the delimiter was quoted — confirm against the
    /// code that builds this value.
    pub literal: bool,
    /// Presumably true for the `<<-` form — confirm against the code that
    /// builds this value.
    pub strip_tabs: bool,
    /// Presumably the byte offset of the body's first character in the
    /// original source — confirm.
    pub body_start_offset: usize,
}
176
/// Lexical tokens, recognized by the `logos`-derived lexer.
///
/// Spaces and tabs between tokens are skipped; lexing failures are reported
/// as `LexerError`. Variants without a `#[token]` / `#[regex]` attribute
/// (`GlobWord`, `Arithmetic`, `HereDoc`) are never produced by pattern
/// matching and must be constructed by other code.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ---- Keywords ----
    #[token("set")]
    Set,

    #[token("local")]
    Local,

    #[token("if")]
    If,

    #[token("then")]
    Then,

    #[token("else")]
    Else,

    #[token("elif")]
    Elif,

    #[token("fi")]
    Fi,

    #[token("for")]
    For,

    #[token("while")]
    While,

    #[token("in")]
    In,

    #[token("do")]
    Do,

    #[token("done")]
    Done,

    #[token("case")]
    Case,

    #[token("esac")]
    Esac,

    #[token("function")]
    Function,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("exit")]
    Exit,

    #[token("true")]
    True,

    #[token("false")]
    False,

    // ---- Type names ----
    #[token("string")]
    TypeString,

    #[token("int")]
    TypeInt,

    #[token("float")]
    TypeFloat,

    #[token("bool")]
    TypeBool,

    // ---- Multi-character operators ----
    #[token("&&")]
    And,

    #[token("||")]
    Or,

    #[token("==")]
    EqEq,

    #[token("!=")]
    NotEq,

    #[token("=~")]
    Match,

    #[token("!~")]
    NotMatch,

    #[token(">=")]
    GtEq,

    #[token("<=")]
    LtEq,

    #[token(">>")]
    GtGt,

    #[token("2>&1")]
    StderrToStdout,

    #[token("1>&2")]
    StdoutToStderr,

    /// Short spelling `>&2`; displayed differently from `StdoutToStderr`.
    #[token(">&2")]
    StdoutToStderr2,

    #[token("2>")]
    Stderr,

    #[token("&>")]
    Both,

    #[token("<<<")]
    HereString,

    #[token("<<")]
    HereDocStart,

    #[token(";;")]
    DoubleSemi,

    // ---- Single-character operators and punctuation ----
    #[token("=")]
    Eq,

    #[token("|")]
    Pipe,

    #[token("&")]
    Amp,

    #[token(">")]
    Gt,

    #[token("<")]
    Lt,

    #[token(";")]
    Semi,

    #[token(":")]
    Colon,

    #[token(",")]
    Comma,

    #[token("..")]
    DotDot,

    #[token(".")]
    Dot,

    // ---- Paths ----
    /// `~` immediately followed by path characters; holds the full matched
    /// text, including the `~`.
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// A bare `~`.
    #[token("~")]
    Tilde,

    /// Either `../…` or `name/…`; holds the full matched text.
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*/[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// `./…`; holds the full matched text.
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    /// A word starting with `.` followed by a letter or underscore; holds the
    /// full matched text, including the leading dot.
    #[regex(r"\.[a-zA-Z_][a-zA-Z0-9_.-]*", lex_dotted_ident, priority = 3)]
    DottedIdent(String),

    // ---- Brackets and glob characters ----
    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("*")]
    Star,

    #[token("!")]
    Bang,

    #[token("?")]
    Question,

    /// Has no lexer pattern; built by `flush_glob_run` from a run of adjacent
    /// tokens that contains a glob character.
    GlobWord(String),

    /// Has no lexer pattern. Presumably created from the markers that
    /// `preprocess_arithmetic` substitutes for `$((...))` — confirm.
    Arithmetic(String),

    // ---- Command substitution and flags ----
    #[token("$(")]
    CmdSubstStart,

    /// `--name`; holds the text after the two dashes.
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// `-x…`; holds the text after the dash.
    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// `+x…`; holds the text after the plus sign.
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// A bare `--`.
    #[token("--")]
    DoubleDash,

    /// `+` followed by a non-letter, non-whitespace character and any further
    /// non-whitespace; holds the full matched text.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// `-` followed by a character that is not alphanumeric, whitespace or
    /// `-`, then any further non-whitespace; holds the full matched text.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// A bare `-`.
    #[token("-")]
    MinusAlone,

    // ---- Strings ----
    /// Double-quoted string with backslash escapes; the payload is whatever
    /// `parse_string_literal` returns for the quoted text.
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string; holds the text between the quotes, unprocessed.
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    // ---- Variables ----
    /// `${…}`; holds the full matched text, including `${` and `}`.
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// `$name`; holds the name without the `$`.
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// `$0` … `$9`; holds the digit as a number.
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    #[token("$@")]
    AllArgs,

    #[token("$#")]
    ArgCount,

    #[token("$?")]
    LastExitCode,

    #[token("$$")]
    CurrentPid,

    /// `${#name}`; holds the variable name only.
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Has no lexer pattern. Presumably created from the markers that
    /// `preprocess_heredocs` substitutes for heredoc bodies — confirm.
    HereDoc(HereDocData),

    // ---- Numbers ----
    /// Optionally negative integer, parsed as `i64`.
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Optionally negative decimal with digits on both sides of the point.
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    /// Digits immediately followed by identifier characters; holds the full
    /// matched text.
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_.-]*", lex_number_ident, priority = 3)]
    NumberIdent(String),

    /// `.5`-style input. The callback always returns
    /// `LexerError::InvalidFloatNoLeading`.
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// `5.`-style input. The callback always returns
    /// `LexerError::InvalidFloatNoTrailing`.
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ---- Absolute paths and identifiers ----
    /// `/` followed by zero or more path characters; holds the full text.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    /// Identifier. `lex_ident` rejects non-lowercase spellings of
    /// `true`/`false` and any spelling of `yes`/`no`.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ---- Comments and line structure ----
    /// `#` through the end of the line (line terminator not included).
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// `\n` or `\r\n`.
    #[regex(r"\n|\r\n")]
    Newline,

    /// Backslash, optional spaces/tabs, then a line terminator.
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,
}
563
/// Coarse classification of a `Token`, as returned by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Control-flow words, `set`/`local`, boolean literals and type names.
    Keyword,
    /// Pipes, logical/comparison operators and redirections.
    Operator,
    /// Double-quoted, single-quoted and heredoc strings.
    String,
    /// Integers, floats and arithmetic expressions.
    Number,
    /// Variable references and the special `$`-parameters.
    Variable,
    /// `#` comments.
    Comment,
    /// Separators, brackets, newlines and similar structural tokens.
    Punctuation,
    /// Identifiers and other bare words.
    Command,
    /// Path-like tokens, including glob words.
    Path,
    /// Long, short and plus flags, and `--`.
    Flag,
    /// Tokens that represent invalid input (malformed floats).
    Error,
}
593
594impl Token {
595 pub fn category(&self) -> TokenCategory {
597 match self {
598 Token::If
600 | Token::Then
601 | Token::Else
602 | Token::Elif
603 | Token::Fi
604 | Token::For
605 | Token::In
606 | Token::Do
607 | Token::Done
608 | Token::While
609 | Token::Case
610 | Token::Esac
611 | Token::Function
612 | Token::Return
613 | Token::Break
614 | Token::Continue
615 | Token::Exit
616 | Token::Set
617 | Token::Local
618 | Token::True
619 | Token::False
620 | Token::TypeString
621 | Token::TypeInt
622 | Token::TypeFloat
623 | Token::TypeBool => TokenCategory::Keyword,
624
625 Token::Pipe
627 | Token::And
628 | Token::Or
629 | Token::Amp
630 | Token::Eq
631 | Token::EqEq
632 | Token::NotEq
633 | Token::Match
634 | Token::NotMatch
635 | Token::Lt
636 | Token::Gt
637 | Token::LtEq
638 | Token::GtEq
639 | Token::GtGt
640 | Token::Stderr
641 | Token::Both
642 | Token::HereDocStart
643 | Token::HereString
644 | Token::StderrToStdout
645 | Token::StdoutToStderr
646 | Token::StdoutToStderr2 => TokenCategory::Operator,
647
648 Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
650
651 Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
653
654 Token::VarRef(_)
656 | Token::SimpleVarRef(_)
657 | Token::Positional(_)
658 | Token::AllArgs
659 | Token::ArgCount
660 | Token::VarLength(_)
661 | Token::LastExitCode
662 | Token::CurrentPid => TokenCategory::Variable,
663
664 Token::LongFlag(_)
666 | Token::ShortFlag(_)
667 | Token::PlusFlag(_)
668 | Token::DoubleDash => TokenCategory::Flag,
669
670 Token::Semi
672 | Token::DoubleSemi
673 | Token::Colon
674 | Token::Comma
675 | Token::Dot
676 | Token::LParen
677 | Token::RParen
678 | Token::LBrace
679 | Token::RBrace
680 | Token::LBracket
681 | Token::RBracket
682 | Token::Bang
683 | Token::Question
684 | Token::Star
685 | Token::Newline
686 | Token::LineContinuation
687 | Token::CmdSubstStart => TokenCategory::Punctuation,
688
689 Token::GlobWord(_) => TokenCategory::Path,
691
692 Token::Comment => TokenCategory::Comment,
694
695 Token::Path(_)
697 | Token::TildePath(_)
698 | Token::RelativePath(_)
699 | Token::Tilde
700 | Token::DotDot
701 | Token::DotSlashPath(_) => TokenCategory::Path,
702
703 Token::Ident(_)
705 | Token::PlusBare(_)
706 | Token::MinusBare(_)
707 | Token::MinusAlone
708 | Token::NumberIdent(_)
709 | Token::DottedIdent(_) => TokenCategory::Command,
710
711 Token::InvalidFloatNoLeading
713 | Token::InvalidFloatNoTrailing => TokenCategory::Error,
714 }
715 }
716}
717
718fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
720 parse_string_literal(lex.slice())
721}
722
723fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
725 let s = lex.slice();
726 s[1..s.len() - 1].to_string()
728}
729
730fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
732 lex.slice().to_string()
734}
735
736fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
738 lex.slice()[1..].to_string()
740}
741
742fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
744 lex.slice()[1..].parse().unwrap_or(0)
746}
747
748fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
750 let s = lex.slice();
752 s[3..s.len() - 1].to_string()
753}
754
755fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
757 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
758}
759
760fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
762 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
763}
764
765fn lex_number_ident(lex: &mut logos::Lexer<Token>) -> String {
769 lex.slice().to_string()
770}
771
772fn lex_dotted_ident(lex: &mut logos::Lexer<Token>) -> String {
774 lex.slice().to_string()
775}
776
777fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
780 Err(LexerError::InvalidFloatNoLeading)
781}
782
783fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
786 Err(LexerError::InvalidFloatNoTrailing)
787}
788
789fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
791 let s = lex.slice();
792
793 match s.to_lowercase().as_str() {
796 "true" | "false" if s != "true" && s != "false" => {
797 return Err(LexerError::AmbiguousBoolean(s.to_string()));
798 }
799 _ => {}
800 }
801
802 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
804 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
805 }
806
807 Ok(s.to_string())
808}
809
810fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
812 lex.slice()[2..].to_string()
814}
815
816fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
818 lex.slice()[1..].to_string()
820}
821
822fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
824 lex.slice()[1..].to_string()
826}
827
828fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
830 lex.slice().to_string()
831}
832
833fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
835 lex.slice().to_string()
836}
837
838fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
840 lex.slice().to_string()
841}
842
843fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
845 lex.slice().to_string()
846}
847
848fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
850 lex.slice().to_string()
851}
852
853fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
855 lex.slice().to_string()
856}
857
858impl fmt::Display for Token {
859 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
860 match self {
861 Token::Set => write!(f, "set"),
862 Token::Local => write!(f, "local"),
863 Token::If => write!(f, "if"),
864 Token::Then => write!(f, "then"),
865 Token::Else => write!(f, "else"),
866 Token::Elif => write!(f, "elif"),
867 Token::Fi => write!(f, "fi"),
868 Token::For => write!(f, "for"),
869 Token::While => write!(f, "while"),
870 Token::In => write!(f, "in"),
871 Token::Do => write!(f, "do"),
872 Token::Done => write!(f, "done"),
873 Token::Case => write!(f, "case"),
874 Token::Esac => write!(f, "esac"),
875 Token::Function => write!(f, "function"),
876 Token::Break => write!(f, "break"),
877 Token::Continue => write!(f, "continue"),
878 Token::Return => write!(f, "return"),
879 Token::Exit => write!(f, "exit"),
880 Token::True => write!(f, "true"),
881 Token::False => write!(f, "false"),
882 Token::TypeString => write!(f, "string"),
883 Token::TypeInt => write!(f, "int"),
884 Token::TypeFloat => write!(f, "float"),
885 Token::TypeBool => write!(f, "bool"),
886 Token::And => write!(f, "&&"),
887 Token::Or => write!(f, "||"),
888 Token::EqEq => write!(f, "=="),
889 Token::NotEq => write!(f, "!="),
890 Token::Match => write!(f, "=~"),
891 Token::NotMatch => write!(f, "!~"),
892 Token::GtEq => write!(f, ">="),
893 Token::LtEq => write!(f, "<="),
894 Token::GtGt => write!(f, ">>"),
895 Token::StderrToStdout => write!(f, "2>&1"),
896 Token::StdoutToStderr => write!(f, "1>&2"),
897 Token::StdoutToStderr2 => write!(f, ">&2"),
898 Token::Stderr => write!(f, "2>"),
899 Token::Both => write!(f, "&>"),
900 Token::HereDocStart => write!(f, "<<"),
901 Token::HereString => write!(f, "<<<"),
902 Token::DoubleSemi => write!(f, ";;"),
903 Token::Eq => write!(f, "="),
904 Token::Pipe => write!(f, "|"),
905 Token::Amp => write!(f, "&"),
906 Token::Gt => write!(f, ">"),
907 Token::Lt => write!(f, "<"),
908 Token::Semi => write!(f, ";"),
909 Token::Colon => write!(f, ":"),
910 Token::Comma => write!(f, ","),
911 Token::Dot => write!(f, "."),
912 Token::DotDot => write!(f, ".."),
913 Token::Tilde => write!(f, "~"),
914 Token::TildePath(s) => write!(f, "{}", s),
915 Token::RelativePath(s) => write!(f, "{}", s),
916 Token::DotSlashPath(s) => write!(f, "{}", s),
917 Token::LBrace => write!(f, "{{"),
918 Token::RBrace => write!(f, "}}"),
919 Token::LBracket => write!(f, "["),
920 Token::RBracket => write!(f, "]"),
921 Token::LParen => write!(f, "("),
922 Token::RParen => write!(f, ")"),
923 Token::Star => write!(f, "*"),
924 Token::Bang => write!(f, "!"),
925 Token::Question => write!(f, "?"),
926 Token::GlobWord(s) => write!(f, "GLOB({})", s),
927 Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
928 Token::CmdSubstStart => write!(f, "$("),
929 Token::LongFlag(s) => write!(f, "--{}", s),
930 Token::ShortFlag(s) => write!(f, "-{}", s),
931 Token::PlusFlag(s) => write!(f, "+{}", s),
932 Token::DoubleDash => write!(f, "--"),
933 Token::PlusBare(s) => write!(f, "{}", s),
934 Token::MinusBare(s) => write!(f, "{}", s),
935 Token::MinusAlone => write!(f, "-"),
936 Token::String(s) => write!(f, "STRING({:?})", s),
937 Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
938 Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
939 Token::VarRef(v) => write!(f, "VARREF({})", v),
940 Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
941 Token::Positional(n) => write!(f, "${}", n),
942 Token::AllArgs => write!(f, "$@"),
943 Token::ArgCount => write!(f, "$#"),
944 Token::LastExitCode => write!(f, "$?"),
945 Token::CurrentPid => write!(f, "$$"),
946 Token::VarLength(v) => write!(f, "${{#{}}}", v),
947 Token::Int(n) => write!(f, "INT({})", n),
948 Token::Float(n) => write!(f, "FLOAT({})", n),
949 Token::Path(s) => write!(f, "PATH({})", s),
950 Token::Ident(s) => write!(f, "IDENT({})", s),
951 Token::NumberIdent(s) => write!(f, "NUMIDENT({})", s),
952 Token::DottedIdent(s) => write!(f, "DOTIDENT({})", s),
953 Token::Comment => write!(f, "COMMENT"),
954 Token::Newline => write!(f, "NEWLINE"),
955 Token::LineContinuation => write!(f, "LINECONT"),
956 Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
958 Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
959 }
960 }
961}
962
963impl Token {
964 pub fn is_keyword(&self) -> bool {
969 matches!(
970 self,
971 Token::Set
972 | Token::Local
973 | Token::If
974 | Token::Then
975 | Token::Else
976 | Token::Elif
977 | Token::Fi
978 | Token::For
979 | Token::In
980 | Token::Do
981 | Token::Done
982 | Token::While
983 | Token::Case
984 | Token::Esac
985 | Token::Function
986 | Token::Return
987 | Token::Break
988 | Token::Continue
989 | Token::Exit
990 | Token::True
991 | Token::False
992 )
993 }
994
995 pub fn is_type(&self) -> bool {
997 matches!(
998 self,
999 Token::TypeString
1000 | Token::TypeInt
1001 | Token::TypeFloat
1002 | Token::TypeBool
1003 )
1004 }
1005
1006 pub fn starts_statement(&self) -> bool {
1009 matches!(
1010 self,
1011 Token::Set
1012 | Token::Local
1013 | Token::Function
1014 | Token::If
1015 | Token::For
1016 | Token::While
1017 | Token::Case
1018 | Token::Ident(_)
1019 | Token::LBracket
1020 )
1021 }
1022
1023 pub fn is_value(&self) -> bool {
1025 matches!(
1026 self,
1027 Token::String(_)
1028 | Token::SingleString(_)
1029 | Token::HereDoc(_)
1030 | Token::Arithmetic(_)
1031 | Token::Int(_)
1032 | Token::Float(_)
1033 | Token::True
1034 | Token::False
1035 | Token::VarRef(_)
1036 | Token::SimpleVarRef(_)
1037 | Token::CmdSubstStart
1038 | Token::Path(_)
1039 | Token::GlobWord(_)
1040 | Token::LastExitCode
1041 | Token::CurrentPid
1042 )
1043 }
1044}
1045
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    /// Source text with every `$((...))` replaced by a unique marker.
    text: String,
    /// `(marker, expression)` pairs, in order of appearance; the expression
    /// is the text between `$((` and `))`.
    arithmetics: Vec<(String, String)>,
    /// One entry per marker, recording where it sits in `text` and how long
    /// the original expression was.
    replacements: Vec<SpanReplacement>,
}
1055
/// Copies a `$( ... )` command substitution verbatim into `result`.
///
/// On entry `chars[*i]` and `chars[*i + 1]` are expected to be `$(`; they are
/// emitted unconditionally. Scanning then continues until the matching `)` is
/// found or the input ends, tracking:
///
/// * nested parentheses outside quotes,
/// * single-quoted text (copied until the next `'`),
/// * double-quoted text (where `\"`, `\\`, `\$` and `` \` `` are copied as
///   pairs so an escaped quote does not end the string), and
/// * backslash escapes outside quotes (the escaped character is copied
///   without being interpreted).
///
/// `*i` advances by characters, `*source_pos` by UTF-8 bytes.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    /// Which kind of quoted text, if any, the scanner is currently inside.
    #[derive(Clone, Copy)]
    enum Quoting {
        Unquoted,
        Single,
        Double,
    }

    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1;
    let mut quoting = Quoting::Unquoted;

    while *i < chars.len() && open_parens > 0 {
        let current = chars[*i];
        let following = chars.get(*i + 1).copied();

        // Number of characters copied this step: 2 for an escape pair, else 1.
        let mut width = 1;

        match quoting {
            Quoting::Single => {
                if current == '\'' {
                    quoting = Quoting::Unquoted;
                }
            }
            Quoting::Double => match (current, following) {
                ('\\', Some('"' | '\\' | '$' | '`')) => width = 2,
                ('"', _) => quoting = Quoting::Unquoted,
                _ => {}
            },
            Quoting::Unquoted => match (current, following) {
                ('\'', _) => quoting = Quoting::Single,
                ('"', _) => quoting = Quoting::Double,
                ('\\', Some(_)) => width = 2,
                ('(', _) => open_parens += 1,
                (')', _) => open_parens -= 1,
                _ => {}
            },
        }

        for &copied in &chars[*i..*i + width] {
            result.push(copied);
            *source_pos += copied.len_utf8();
        }
        *i += width;
    }
}
1153
/// Replaces every `$((expr))` in `source` with a unique
/// `__KAISH_ARITH_<id>__` marker.
///
/// Returns the rewritten text, the `(marker, expr)` pairs, and one
/// `SpanReplacement` per marker so spans can later be mapped back with
/// `correct_span`.
///
/// Text that is left untouched:
/// * backslash escapes outside double quotes (both characters copied),
/// * single-quoted strings (outside double quotes),
/// * `\"`, `\\`, `\$` and `` \` `` inside double quotes,
/// * `$( ... )` command substitutions, copied by `skip_command_substitution`.
///
/// # Errors
/// `LexerError::NestingTooDeep` if parentheses inside an expression nest
/// deeper than `MAX_PAREN_DEPTH`.
///
/// NOTE(review): an expression that is never closed with `))` is not an error
/// here; everything up to end of input becomes the expression.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into `source`; `i` below indexes characters.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Outside double quotes a backslash escapes the next character.
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single-quoted string: copy through the closing quote (or to end of
        // input if there is none).
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1;
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                // Closing quote.
                result.push(chars_vec[i]); source_pos += 1;
                i += 1;
            }
            continue;
        }

        // An unescaped double quote toggles double-quote mode.
        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Inside double quotes only these four escapes are copied as a pair;
        // any other backslash falls through and is copied on its own.
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // `$(` that is not `$((`: a command substitution, copied verbatim.
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // `$((`: start of an arithmetic expression.
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip the `$((` opener.
            i += 3;
            source_pos += 3;

            let mut expr = String::new();
            // Depth of parentheses opened *inside* the expression.
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes a parenthesis opened inside the expression.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // `))` at depth zero terminates the expression.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Lone `)` at depth zero: kept as part of the expression.
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Bytes of source covered, from `$((` through `))`.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            // Ordinary character: copy as-is.
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1318
/// One heredoc extracted by `preprocess_heredocs`.
#[derive(Debug, Clone)]
struct HeredocReplacement {
    /// The `__KAISH_HEREDOC_<id>__` marker written in place of the delimiter.
    marker: String,
    /// Body text, with line endings as they appeared in the source and
    /// without the closing delimiter line. Leading tabs are *not* removed
    /// here, even when `strip_tabs` is set.
    body: String,
    /// True when the delimiter was wrapped in single or double quotes.
    literal: bool,
    /// True for the `<<-` form.
    strip_tabs: bool,
    /// Byte offset in the original source of the first body character.
    body_start_offset: usize,
}
1341
1342fn preprocess_heredocs(source: &str) -> Result<(String, Vec<HeredocReplacement>), Spanned<LexerError>> {
1354 let mut result = String::with_capacity(source.len());
1355 let mut heredocs: Vec<HeredocReplacement> = Vec::new();
1356 let chars_vec: Vec<char> = source.chars().collect();
1357 let mut i = 0;
1358 let mut pos: usize = 0;
1362
1363 while i < chars_vec.len() {
1364 let ch = chars_vec[i];
1365
1366 if ch == '<'
1370 && chars_vec.get(i + 1) == Some(&'<')
1371 && chars_vec.get(i + 2) == Some(&'<')
1372 {
1373 result.push_str("<<<");
1374 i += 3;
1375 pos += 3;
1376 continue;
1377 }
1378
1379 if ch == '<' && chars_vec.get(i + 1) == Some(&'<') {
1381 let introducer_start = pos;
1384 i += 2; pos += 2;
1386
1387 let strip_tabs = chars_vec.get(i) == Some(&'-');
1389 if strip_tabs {
1390 i += 1;
1391 pos += 1;
1392 }
1393
1394 while let Some(&c) = chars_vec.get(i) {
1396 if c == ' ' || c == '\t' {
1397 i += 1;
1398 pos += 1;
1399 } else {
1400 break;
1401 }
1402 }
1403
1404 let mut delimiter = String::new();
1406 let quoted = chars_vec.get(i) == Some(&'\'') || chars_vec.get(i) == Some(&'"');
1407 let quote_char = if quoted {
1408 let q = chars_vec.get(i).copied();
1409 i += 1;
1410 pos += 1;
1411 q
1412 } else {
1413 None
1414 };
1415
1416 while let Some(&c) = chars_vec.get(i) {
1417 if quoted {
1418 if Some(c) == quote_char {
1419 i += 1; pos += 1;
1421 break;
1422 }
1423 } else if c.is_whitespace() || c == '\n' || c == '\r' {
1424 break;
1425 }
1426 delimiter.push(c);
1427 i += 1;
1428 pos += c.len_utf8();
1429 }
1430
1431 if delimiter.is_empty() {
1432 result.push_str("<<");
1434 if strip_tabs {
1435 result.push('-');
1436 }
1437 continue;
1438 }
1439
1440 let mut after_delimiter = String::new();
1443 while let Some(&c) = chars_vec.get(i) {
1444 if c == '\n' {
1445 i += 1;
1446 pos += 1;
1447 break;
1448 } else if c == '\r' {
1449 i += 1;
1450 pos += 1;
1451 if chars_vec.get(i) == Some(&'\n') {
1452 i += 1;
1453 pos += 1;
1454 }
1455 break;
1456 }
1457 after_delimiter.push(c);
1458 i += 1;
1459 pos += c.len_utf8();
1460 }
1461
1462 let body_start_offset = pos;
1468 let mut content = String::new();
1469 let mut current_line = String::new();
1470
1471 loop {
1472 let next = chars_vec.get(i).copied();
1473 match next {
1474 Some('\n') => {
1475 i += 1;
1476 pos += 1;
1477 let trimmed = if strip_tabs {
1479 current_line.trim_start_matches('\t')
1480 } else {
1481 ¤t_line
1482 };
1483 if trimmed == delimiter {
1484 break;
1486 }
1487 content.push_str(¤t_line);
1489 content.push('\n');
1490 current_line.clear();
1491 }
1492 Some('\r') => {
1493 i += 1;
1494 pos += 1;
1495 let crlf = chars_vec.get(i) == Some(&'\n');
1501 if crlf {
1502 i += 1;
1503 pos += 1;
1504 }
1505 let trimmed = if strip_tabs {
1506 current_line.trim_start_matches('\t')
1507 } else {
1508 ¤t_line
1509 };
1510 if trimmed == delimiter {
1511 break;
1512 }
1513 content.push_str(¤t_line);
1514 content.push_str(if crlf { "\r\n" } else { "\r" });
1515 current_line.clear();
1516 }
1517 Some(c) => {
1518 current_line.push(c);
1519 i += 1;
1520 pos += c.len_utf8();
1521 }
1522 None => {
1523 let trimmed = if strip_tabs {
1526 current_line.trim_start_matches('\t')
1527 } else {
1528 ¤t_line
1529 };
1530 if trimmed == delimiter {
1531 break;
1532 }
1533 let span_end = introducer_start
1538 + 2
1539 + if strip_tabs { 1 } else { 0 }
1540 + delimiter.len();
1541 return Err(Spanned::new(
1542 LexerError::UnterminatedHeredoc {
1543 delimiter: delimiter.clone(),
1544 },
1545 introducer_start..span_end,
1546 ));
1547 }
1548 }
1549 }
1550
1551 let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1553 heredocs.push(HeredocReplacement {
1554 marker: marker.clone(),
1555 body: content,
1556 literal: quoted,
1557 strip_tabs,
1558 body_start_offset,
1559 });
1560
1561 result.push_str("<<");
1564 result.push_str(&marker);
1565 result.push_str(&after_delimiter);
1566 result.push('\n');
1567 } else {
1568 result.push(ch);
1569 i += 1;
1570 pos += ch.len_utf8();
1571 }
1572 }
1573
1574 Ok((result, heredocs))
1575}
1576
1577fn mergeable_text(token: &Token) -> Option<String> {
1582 match token {
1583 Token::Ident(s) => Some(s.clone()),
1584 Token::NumberIdent(s) => Some(s.clone()),
1585 Token::DottedIdent(s) => Some(s.clone()),
1586 Token::Colon => Some(":".to_string()),
1587 Token::Int(n) => Some(n.to_string()),
1588 Token::Path(p) => Some(p.clone()),
1589 Token::Float(f) => Some(f.to_string()),
1590 _ => None,
1591 }
1592}
1593
1594fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1603 if tokens.is_empty() {
1604 return tokens;
1605 }
1606
1607 let mut result = Vec::with_capacity(tokens.len());
1608 let mut run: Vec<&Spanned<Token>> = Vec::new();
1609
1610 for token in &tokens {
1611 if run.is_empty() {
1612 if mergeable_text(&token.token).is_some() {
1613 run.push(token);
1614 } else {
1615 result.push(token.clone());
1616 }
1617 continue;
1618 }
1619
1620 let Some(last) = run.last() else { unreachable!() };
1623 let adjacent = last.span.end == token.span.start;
1624
1625 if adjacent && mergeable_text(&token.token).is_some() {
1626 run.push(token);
1627 } else {
1628 flush_colon_run(&mut run, &mut result);
1629 if mergeable_text(&token.token).is_some() {
1630 run.push(token);
1631 } else {
1632 result.push(token.clone());
1633 }
1634 }
1635 }
1636
1637 flush_colon_run(&mut run, &mut result);
1638
1639 result
1640}
1641
1642fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1644 if run.is_empty() {
1645 return;
1646 }
1647
1648 let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1649
1650 if run.len() >= 2 && has_colon {
1651 let text: String = run
1652 .iter()
1653 .filter_map(|t| mergeable_text(&t.token))
1654 .collect();
1655 let start = run.first().map(|t| t.span.start).unwrap_or(0);
1657 let end = run.last().map(|t| t.span.end).unwrap_or(0);
1658 result.push(Spanned::new(Token::Ident(text), start..end));
1659 } else {
1660 for t in run.iter() {
1661 result.push((*t).clone());
1662 }
1663 }
1664
1665 run.clear();
1666}
1667
1668fn glob_mergeable_text(token: &Token) -> Option<String> {
1673 match token {
1674 Token::Star => Some("*".to_string()),
1675 Token::Question => Some("?".to_string()),
1676 Token::Dot => Some(".".to_string()),
1677 Token::DotDot => Some("..".to_string()),
1678 Token::Ident(s) => Some(s.clone()),
1679 Token::NumberIdent(s) => Some(s.clone()),
1680 Token::DottedIdent(s) => Some(s.clone()),
1681 Token::Path(s) => Some(s.clone()),
1682 Token::Int(n) => Some(n.to_string()),
1683 Token::LBracket => Some("[".to_string()),
1684 Token::RBracket => Some("]".to_string()),
1685 Token::Bang => Some("!".to_string()),
1686 Token::DotSlashPath(s) => Some(s.clone()),
1687 Token::RelativePath(s) => Some(s.clone()),
1688 Token::TildePath(s) => Some(s.clone()),
1689 Token::Tilde => Some("~".to_string()),
1690 Token::LBrace => Some("{".to_string()),
1691 Token::RBrace => Some("}".to_string()),
1692 Token::Comma => Some(",".to_string()),
1693 _ => None,
1694 }
1695}
1696
1697fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1705 if tokens.is_empty() {
1706 return tokens;
1707 }
1708
1709 let mut result = Vec::with_capacity(tokens.len());
1710 let mut run: Vec<&Spanned<Token>> = Vec::new();
1711
1712 for token in &tokens {
1713 if run.is_empty() {
1714 if glob_mergeable_text(&token.token).is_some() {
1715 run.push(token);
1716 } else {
1717 result.push(token.clone());
1718 }
1719 continue;
1720 }
1721
1722 let Some(last) = run.last() else { unreachable!() };
1724 let adjacent = last.span.end == token.span.start;
1725
1726 if adjacent && glob_mergeable_text(&token.token).is_some() {
1727 run.push(token);
1728 } else {
1729 flush_glob_run(&mut run, &mut result);
1730 if glob_mergeable_text(&token.token).is_some() {
1731 run.push(token);
1732 } else {
1733 result.push(token.clone());
1734 }
1735 }
1736 }
1737
1738 flush_glob_run(&mut run, &mut result);
1739
1740 result
1741}
1742
1743fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1745 if run.is_empty() {
1746 return;
1747 }
1748
1749 let has_glob = run.iter().any(|t| {
1750 matches!(t.token, Token::Star | Token::Question)
1751 }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1752 && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1753
1754 if run.len() >= 2 && has_glob {
1755 let text: String = run
1756 .iter()
1757 .filter_map(|t| glob_mergeable_text(&t.token))
1758 .collect();
1759 let start = run.first().map(|t| t.span.start).unwrap_or(0);
1760 let end = run.last().map(|t| t.span.end).unwrap_or(0);
1761 result.push(Spanned::new(Token::GlobWord(text), start..end));
1762 } else {
1763 for t in run.iter() {
1764 result.push((*t).clone());
1765 }
1766 }
1767
1768 run.clear();
1769}
1770
1771pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1781 let arith_result = preprocess_arithmetic(source)
1783 .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1784
1785 let span_replacements = arith_result.replacements;
1789 let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text)
1790 .map_err(|e| {
1791 let span = correct_span(e.span, &span_replacements);
1792 vec![Spanned::new(e.token, span)]
1793 })?;
1794
1795 let lexer = Token::lexer(&preprocessed);
1796 let mut tokens = Vec::new();
1797 let mut errors = Vec::new();
1798
1799 for (result, span) in lexer.spanned() {
1800 let corrected_span = correct_span(span, &span_replacements);
1802 match result {
1803 Ok(token) => {
1804 if !matches!(token, Token::Comment | Token::LineContinuation) {
1806 tokens.push(Spanned::new(token, corrected_span));
1807 }
1808 }
1809 Err(err) => {
1810 errors.push(Spanned::new(err, corrected_span));
1811 }
1812 }
1813 }
1814
1815 if !errors.is_empty() {
1816 return Err(errors);
1817 }
1818
1819 let mut final_tokens = Vec::with_capacity(tokens.len());
1821 let mut i = 0;
1822
1823 while i < tokens.len() {
1824 if let Token::Ident(ref name) = tokens[i].token
1826 && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1827 && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1828 final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1829 i += 1;
1830 continue;
1831 }
1832
1833 if matches!(tokens[i].token, Token::HereDocStart) {
1835 if i + 1 < tokens.len()
1837 && let Token::Ident(ref name) = tokens[i + 1].token
1838 && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1839 if let Some(hd) = heredocs.iter().find(|h| h.marker == *name) {
1841 let mut content = hd.body.clone();
1853 for (marker, expr) in &arith_result.arithmetics {
1854 if content.contains(marker) {
1855 let replacement = if hd.literal {
1856 format!("$(({}))", expr)
1857 } else {
1858 format!("${{__ARITH:{}__}}", expr)
1859 };
1860 content = content.replace(marker, &replacement);
1861 }
1862 }
1863 final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1864 final_tokens.push(Spanned::new(
1865 Token::HereDoc(HereDocData {
1866 content,
1867 literal: hd.literal,
1868 strip_tabs: hd.strip_tabs,
1869 body_start_offset: hd.body_start_offset,
1870 }),
1871 tokens[i + 1].span.clone(),
1872 ));
1873 i += 2;
1874 continue;
1875 }
1876 }
1877 }
1878
1879 let token = if let Token::String(ref s) = tokens[i].token {
1881 let mut new_content = s.clone();
1883 for (marker, expr) in &arith_result.arithmetics {
1884 if new_content.contains(marker) {
1885 new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1888 }
1889 }
1890 if new_content != *s {
1891 Spanned::new(Token::String(new_content), tokens[i].span.clone())
1892 } else {
1893 tokens[i].clone()
1894 }
1895 } else {
1896 tokens[i].clone()
1897 };
1898 final_tokens.push(token);
1899 i += 1;
1900 }
1901
1902 Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
1903}
1904
1905pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1909 let lexer = Token::lexer(source);
1910 let mut tokens = Vec::new();
1911 let mut errors = Vec::new();
1912
1913 for (result, span) in lexer.spanned() {
1914 match result {
1915 Ok(token) => {
1916 tokens.push(Spanned::new(token, span));
1917 }
1918 Err(err) => {
1919 errors.push(Spanned::new(err, span));
1920 }
1921 }
1922 }
1923
1924 if errors.is_empty() {
1925 Ok(tokens)
1926 } else {
1927 Err(errors)
1928 }
1929}
1930
1931pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1933 if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1935 return Err(LexerError::UnterminatedString);
1936 }
1937
1938 let inner = &source[1..source.len() - 1];
1939 let mut result = String::with_capacity(inner.len());
1940 let mut chars = inner.chars().peekable();
1941
1942 while let Some(ch) = chars.next() {
1943 if ch == '\\' {
1944 match chars.next() {
1945 Some('n') => result.push('\n'),
1946 Some('t') => result.push('\t'),
1947 Some('r') => result.push('\r'),
1948 Some('\\') => result.push('\\'),
1949 Some('"') => result.push('"'),
1950 Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1953 Some('u') => {
1954 let mut hex = String::with_capacity(4);
1956 for _ in 0..4 {
1957 match chars.next() {
1958 Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1959 _ => return Err(LexerError::InvalidEscape),
1960 }
1961 }
1962 let codepoint = u32::from_str_radix(&hex, 16)
1963 .map_err(|_| LexerError::InvalidEscape)?;
1964 let ch = char::from_u32(codepoint)
1965 .ok_or(LexerError::InvalidEscape)?;
1966 result.push(ch);
1967 }
1968 Some(next) => {
1970 result.push('\\');
1971 result.push(next);
1972 }
1973 None => return Err(LexerError::InvalidEscape),
1974 }
1975 } else {
1976 result.push(ch);
1977 }
1978 }
1979
1980 Ok(result)
1981}
1982
1983pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1986 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1988 return Err(LexerError::UnterminatedVarRef);
1989 }
1990
1991 let inner = &source[2..source.len() - 1];
1992
1993 if inner == "?" {
1995 return Ok(vec!["?".to_string()]);
1996 }
1997
1998 let mut segments = Vec::new();
1999 let mut current = String::new();
2000 let mut chars = inner.chars().peekable();
2001
2002 while let Some(ch) = chars.next() {
2003 match ch {
2004 '.' => {
2005 if !current.is_empty() {
2006 segments.push(current.clone());
2007 current.clear();
2008 }
2009 }
2010 '[' => {
2011 if !current.is_empty() {
2012 segments.push(current.clone());
2013 current.clear();
2014 }
2015 let mut index = String::from("[");
2017 while let Some(&c) = chars.peek() {
2018 if let Some(c) = chars.next() {
2019 index.push(c);
2020 }
2021 if c == ']' {
2022 break;
2023 }
2024 }
2025 segments.push(index);
2026 }
2027 _ => {
2028 current.push(ch);
2029 }
2030 }
2031 }
2032
2033 if !current.is_empty() {
2034 segments.push(current);
2035 }
2036
2037 Ok(segments)
2038}
2039
2040pub fn parse_int(source: &str) -> Result<i64, LexerError> {
2042 source.parse().map_err(|_| LexerError::InvalidNumber)
2043}
2044
2045pub fn parse_float(source: &str) -> Result<f64, LexerError> {
2047 source.parse().map_err(|_| LexerError::InvalidNumber)
2048}
2049
2050#[cfg(test)]
2051mod tests {
2052 use super::*;
2053
2054 fn lex(source: &str) -> Vec<Token> {
2055 tokenize(source)
2056 .expect("lexer should succeed")
2057 .into_iter()
2058 .map(|s| s.token)
2059 .collect()
2060 }
2061
2062 #[test]
2067 fn keywords() {
2068 assert_eq!(lex("set"), vec![Token::Set]);
2069 assert_eq!(lex("if"), vec![Token::If]);
2070 assert_eq!(lex("then"), vec![Token::Then]);
2071 assert_eq!(lex("else"), vec![Token::Else]);
2072 assert_eq!(lex("elif"), vec![Token::Elif]);
2073 assert_eq!(lex("fi"), vec![Token::Fi]);
2074 assert_eq!(lex("for"), vec![Token::For]);
2075 assert_eq!(lex("in"), vec![Token::In]);
2076 assert_eq!(lex("do"), vec![Token::Do]);
2077 assert_eq!(lex("done"), vec![Token::Done]);
2078 assert_eq!(lex("case"), vec![Token::Case]);
2079 assert_eq!(lex("esac"), vec![Token::Esac]);
2080 assert_eq!(lex("function"), vec![Token::Function]);
2081 assert_eq!(lex("true"), vec![Token::True]);
2082 assert_eq!(lex("false"), vec![Token::False]);
2083 }
2084
2085 #[test]
2086 fn double_semicolon() {
2087 assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
2088 assert_eq!(lex("echo \"hi\";;"), vec![
2090 Token::Ident("echo".to_string()),
2091 Token::String("hi".to_string()),
2092 Token::DoubleSemi,
2093 ]);
2094 }
2095
2096 #[test]
2097 fn type_keywords() {
2098 assert_eq!(lex("string"), vec![Token::TypeString]);
2099 assert_eq!(lex("int"), vec![Token::TypeInt]);
2100 assert_eq!(lex("float"), vec![Token::TypeFloat]);
2101 assert_eq!(lex("bool"), vec![Token::TypeBool]);
2102 }
2103
2104 #[test]
2109 fn single_char_operators() {
2110 assert_eq!(lex("="), vec![Token::Eq]);
2111 assert_eq!(lex("|"), vec![Token::Pipe]);
2112 assert_eq!(lex("&"), vec![Token::Amp]);
2113 assert_eq!(lex(">"), vec![Token::Gt]);
2114 assert_eq!(lex("<"), vec![Token::Lt]);
2115 assert_eq!(lex(";"), vec![Token::Semi]);
2116 assert_eq!(lex(":"), vec![Token::Colon]);
2117 assert_eq!(lex(","), vec![Token::Comma]);
2118 assert_eq!(lex("."), vec![Token::Dot]);
2119 }
2120
2121 #[test]
2122 fn multi_char_operators() {
2123 assert_eq!(lex("&&"), vec![Token::And]);
2124 assert_eq!(lex("||"), vec![Token::Or]);
2125 assert_eq!(lex("=="), vec![Token::EqEq]);
2126 assert_eq!(lex("!="), vec![Token::NotEq]);
2127 assert_eq!(lex("=~"), vec![Token::Match]);
2128 assert_eq!(lex("!~"), vec![Token::NotMatch]);
2129 assert_eq!(lex(">="), vec![Token::GtEq]);
2130 assert_eq!(lex("<="), vec![Token::LtEq]);
2131 assert_eq!(lex(">>"), vec![Token::GtGt]);
2132 assert_eq!(lex("2>"), vec![Token::Stderr]);
2133 assert_eq!(lex("&>"), vec![Token::Both]);
2134 }
2135
2136 #[test]
2137 fn brackets() {
2138 assert_eq!(lex("{"), vec![Token::LBrace]);
2139 assert_eq!(lex("}"), vec![Token::RBrace]);
2140 assert_eq!(lex("["), vec![Token::LBracket]);
2141 assert_eq!(lex("]"), vec![Token::RBracket]);
2142 assert_eq!(lex("("), vec![Token::LParen]);
2143 assert_eq!(lex(")"), vec![Token::RParen]);
2144 }
2145
2146 #[test]
2151 fn integers() {
2152 assert_eq!(lex("0"), vec![Token::Int(0)]);
2153 assert_eq!(lex("42"), vec![Token::Int(42)]);
2154 assert_eq!(lex("-1"), vec![Token::Int(-1)]);
2155 assert_eq!(lex("999999"), vec![Token::Int(999999)]);
2156 }
2157
2158 #[test]
2159 fn floats() {
2160 assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
2161 assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
2162 assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
2163 }
2164
2165 #[test]
2166 fn strings() {
2167 assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
2168 assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
2169 assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
2171 assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
2172 }
2173
2174 #[test]
2175 fn var_refs() {
2176 assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
2177 assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
2178 assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
2179 assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
2180 }
2181
2182 #[test]
2187 fn identifiers() {
2188 assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
2189 assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
2190 assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2191 assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2192 assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2193 }
2194
2195 #[test]
2196 fn keyword_prefix_identifiers() {
2197 assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2199 assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2200 assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2201 assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2202 assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2203 }
2204
2205 #[test]
2210 fn assignment() {
2211 assert_eq!(
2212 lex("set X = 5"),
2213 vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2214 );
2215 }
2216
2217 #[test]
2218 fn command_simple() {
2219 assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2220 assert_eq!(
2221 lex(r#"echo "hello""#),
2222 vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2223 );
2224 }
2225
2226 #[test]
2227 fn command_with_args() {
2228 assert_eq!(
2229 lex("cmd arg1 arg2"),
2230 vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2231 );
2232 }
2233
2234 #[test]
2235 fn command_with_named_args() {
2236 assert_eq!(
2237 lex("cmd key=value"),
2238 vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2239 );
2240 }
2241
2242 #[test]
2243 fn pipeline() {
2244 assert_eq!(
2245 lex("a | b | c"),
2246 vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2247 );
2248 }
2249
2250 #[test]
2251 fn if_statement() {
2252 assert_eq!(
2253 lex("if true; then echo; fi"),
2254 vec![
2255 Token::If,
2256 Token::True,
2257 Token::Semi,
2258 Token::Then,
2259 Token::Ident("echo".to_string()),
2260 Token::Semi,
2261 Token::Fi
2262 ]
2263 );
2264 }
2265
2266 #[test]
2267 fn for_loop() {
2268 assert_eq!(
2269 lex("for X in items; do echo; done"),
2270 vec![
2271 Token::For,
2272 Token::Ident("X".to_string()),
2273 Token::In,
2274 Token::Ident("items".to_string()),
2275 Token::Semi,
2276 Token::Do,
2277 Token::Ident("echo".to_string()),
2278 Token::Semi,
2279 Token::Done
2280 ]
2281 );
2282 }
2283
2284 #[test]
2289 fn whitespace_ignored() {
2290 assert_eq!(lex(" set X = 5 "), lex("set X = 5"));
2291 }
2292
2293 #[test]
2294 fn newlines_preserved() {
2295 let tokens = lex("a\nb");
2296 assert_eq!(
2297 tokens,
2298 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2299 );
2300 }
2301
2302 #[test]
2303 fn multiple_newlines() {
2304 let tokens = lex("a\n\n\nb");
2305 assert_eq!(
2306 tokens,
2307 vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2308 );
2309 }
2310
2311 #[test]
2316 fn comments_skipped() {
2317 assert_eq!(lex("# comment"), vec![]);
2318 assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2319 assert_eq!(
2320 lex("a # comment\nb"),
2321 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2322 );
2323 }
2324
2325 #[test]
2326 fn comments_preserved_when_requested() {
2327 let tokens = tokenize_with_comments("a # comment")
2328 .expect("should succeed")
2329 .into_iter()
2330 .map(|s| s.token)
2331 .collect::<Vec<_>>();
2332 assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2333 }
2334
2335 #[test]
2340 fn parse_simple_string() {
2341 assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2342 }
2343
2344 #[test]
2345 fn parse_string_with_escapes() {
2346 assert_eq!(
2347 parse_string_literal(r#""hello\nworld""#).expect("ok"),
2348 "hello\nworld"
2349 );
2350 assert_eq!(
2351 parse_string_literal(r#""tab\there""#).expect("ok"),
2352 "tab\there"
2353 );
2354 assert_eq!(
2355 parse_string_literal(r#""quote\"here""#).expect("ok"),
2356 "quote\"here"
2357 );
2358 }
2359
2360 #[test]
2361 fn parse_string_with_unicode() {
2362 assert_eq!(
2363 parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2364 "emoji ❤"
2365 );
2366 }
2367
2368 #[test]
2369 fn parse_string_with_escaped_dollar() {
2370 assert_eq!(
2373 parse_string_literal(r#""\$VAR""#).expect("ok"),
2374 "__KAISH_ESCAPED_DOLLAR__VAR"
2375 );
2376 assert_eq!(
2377 parse_string_literal(r#""cost: \$100""#).expect("ok"),
2378 "cost: __KAISH_ESCAPED_DOLLAR__100"
2379 );
2380 }
2381
2382 #[test]
2387 fn parse_simple_var() {
2388 assert_eq!(
2389 parse_var_ref("${X}").expect("ok"),
2390 vec!["X"]
2391 );
2392 }
2393
2394 #[test]
2395 fn parse_var_with_field() {
2396 assert_eq!(
2397 parse_var_ref("${VAR.field}").expect("ok"),
2398 vec!["VAR", "field"]
2399 );
2400 }
2401
2402 #[test]
2403 fn parse_var_with_index() {
2404 assert_eq!(
2405 parse_var_ref("${VAR[0]}").expect("ok"),
2406 vec!["VAR", "[0]"]
2407 );
2408 }
2409
2410 #[test]
2411 fn parse_var_nested() {
2412 assert_eq!(
2413 parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2414 vec!["VAR", "field", "[0]", "nested"]
2415 );
2416 }
2417
2418 #[test]
2419 fn parse_last_result() {
2420 assert_eq!(
2421 parse_var_ref("${?}").expect("ok"),
2422 vec!["?"]
2423 );
2424 }
2425
2426 #[test]
2431 fn parse_integers() {
2432 assert_eq!(parse_int("0").expect("ok"), 0);
2433 assert_eq!(parse_int("42").expect("ok"), 42);
2434 assert_eq!(parse_int("-1").expect("ok"), -1);
2435 }
2436
2437 #[test]
2438 fn parse_floats() {
2439 assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2440 assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2441 }
2442
2443 #[test]
2448 fn empty_input() {
2449 assert_eq!(lex(""), vec![]);
2450 }
2451
2452 #[test]
2453 fn only_whitespace() {
2454 assert_eq!(lex(" \t\t "), vec![]);
2455 }
2456
2457 #[test]
2458 fn json_array() {
2459 assert_eq!(
2460 lex(r#"[1, 2, 3]"#),
2461 vec![
2462 Token::LBracket,
2463 Token::Int(1),
2464 Token::Comma,
2465 Token::Int(2),
2466 Token::Comma,
2467 Token::Int(3),
2468 Token::RBracket
2469 ]
2470 );
2471 }
2472
2473 #[test]
2474 fn json_object() {
2475 assert_eq!(
2476 lex(r#"{"key": "value"}"#),
2477 vec![
2478 Token::LBrace,
2479 Token::String("key".to_string()),
2480 Token::Colon,
2481 Token::String("value".to_string()),
2482 Token::RBrace
2483 ]
2484 );
2485 }
2486
2487 #[test]
2488 fn redirect_operators() {
2489 assert_eq!(
2490 lex("cmd > file"),
2491 vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2492 );
2493 assert_eq!(
2494 lex("cmd >> file"),
2495 vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2496 );
2497 assert_eq!(
2498 lex("cmd 2> err"),
2499 vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2500 );
2501 assert_eq!(
2502 lex("cmd &> all"),
2503 vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2504 );
2505 }
2506
2507 #[test]
2508 fn background_job() {
2509 assert_eq!(
2510 lex("cmd &"),
2511 vec![Token::Ident("cmd".to_string()), Token::Amp]
2512 );
2513 }
2514
2515 #[test]
2516 fn command_substitution() {
2517 assert_eq!(
2518 lex("$(cmd)"),
2519 vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2520 );
2521 assert_eq!(
2522 lex("$(cmd arg)"),
2523 vec![
2524 Token::CmdSubstStart,
2525 Token::Ident("cmd".to_string()),
2526 Token::Ident("arg".to_string()),
2527 Token::RParen
2528 ]
2529 );
2530 assert_eq!(
2531 lex("$(a | b)"),
2532 vec![
2533 Token::CmdSubstStart,
2534 Token::Ident("a".to_string()),
2535 Token::Pipe,
2536 Token::Ident("b".to_string()),
2537 Token::RParen
2538 ]
2539 );
2540 }
2541
2542 #[test]
2543 fn complex_pipeline() {
2544 assert_eq!(
2545 lex(r#"cat file | grep pattern="foo" | head count=10"#),
2546 vec![
2547 Token::Ident("cat".to_string()),
2548 Token::Ident("file".to_string()),
2549 Token::Pipe,
2550 Token::Ident("grep".to_string()),
2551 Token::Ident("pattern".to_string()),
2552 Token::Eq,
2553 Token::String("foo".to_string()),
2554 Token::Pipe,
2555 Token::Ident("head".to_string()),
2556 Token::Ident("count".to_string()),
2557 Token::Eq,
2558 Token::Int(10),
2559 ]
2560 );
2561 }
2562
2563 #[test]
2568 fn short_flag() {
2569 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2570 assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2571 assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2572 }
2573
2574 #[test]
2575 fn short_flag_combined() {
2576 assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2578 assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2579 }
2580
2581 #[test]
2582 fn long_flag() {
2583 assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2584 assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2585 assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2586 }
2587
2588 #[test]
2589 fn double_dash() {
2590 assert_eq!(lex("--"), vec![Token::DoubleDash]);
2592 }
2593
2594 #[test]
2595 fn flags_vs_negative_numbers() {
2596 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2598 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2600 assert_eq!(
2603 lex("-1 a"),
2604 vec![Token::Int(-1), Token::Ident("a".to_string())]
2605 );
2606 }
2607
2608 #[test]
2609 fn command_with_flags() {
2610 assert_eq!(
2611 lex("ls -l"),
2612 vec![
2613 Token::Ident("ls".to_string()),
2614 Token::ShortFlag("l".to_string()),
2615 ]
2616 );
2617 assert_eq!(
2618 lex("git commit -m"),
2619 vec![
2620 Token::Ident("git".to_string()),
2621 Token::Ident("commit".to_string()),
2622 Token::ShortFlag("m".to_string()),
2623 ]
2624 );
2625 assert_eq!(
2626 lex("git push --force"),
2627 vec![
2628 Token::Ident("git".to_string()),
2629 Token::Ident("push".to_string()),
2630 Token::LongFlag("force".to_string()),
2631 ]
2632 );
2633 }
2634
2635 #[test]
2636 fn flag_with_value() {
2637 assert_eq!(
2638 lex(r#"git commit -m "message""#),
2639 vec![
2640 Token::Ident("git".to_string()),
2641 Token::Ident("commit".to_string()),
2642 Token::ShortFlag("m".to_string()),
2643 Token::String("message".to_string()),
2644 ]
2645 );
2646 assert_eq!(
2647 lex(r#"--message="hello""#),
2648 vec![
2649 Token::LongFlag("message".to_string()),
2650 Token::Eq,
2651 Token::String("hello".to_string()),
2652 ]
2653 );
2654 }
2655
2656 #[test]
2657 fn end_of_flags_marker() {
2658 assert_eq!(
2659 lex("git checkout -- file"),
2660 vec![
2661 Token::Ident("git".to_string()),
2662 Token::Ident("checkout".to_string()),
2663 Token::DoubleDash,
2664 Token::Ident("file".to_string()),
2665 ]
2666 );
2667 }
2668
2669 #[test]
2674 fn local_keyword() {
2675 assert_eq!(lex("local"), vec![Token::Local]);
2676 assert_eq!(
2677 lex("local X = 5"),
2678 vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2679 );
2680 }
2681
2682 #[test]
2683 fn simple_var_ref() {
2684 assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2685 assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2686 assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2687 assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2688 }
2689
2690 #[test]
2691 fn simple_var_ref_in_command() {
2692 assert_eq!(
2693 lex("echo $NAME"),
2694 vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2695 );
2696 }
2697
2698 #[test]
2699 fn single_quoted_strings() {
2700 assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2701 assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2702 assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2703 assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2705 assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2706 }
2707
2708 #[test]
2709 fn test_brackets() {
2710 assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2712 assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2713 assert_eq!(
2714 lex("[[ -f file ]]"),
2715 vec![
2716 Token::LBracket,
2717 Token::LBracket,
2718 Token::ShortFlag("f".to_string()),
2719 Token::Ident("file".to_string()),
2720 Token::RBracket,
2721 Token::RBracket
2722 ]
2723 );
2724 }
2725
2726 #[test]
2727 fn test_expression_syntax() {
2728 assert_eq!(
2729 lex(r#"[[ $X == "value" ]]"#),
2730 vec![
2731 Token::LBracket,
2732 Token::LBracket,
2733 Token::SimpleVarRef("X".to_string()),
2734 Token::EqEq,
2735 Token::String("value".to_string()),
2736 Token::RBracket,
2737 Token::RBracket
2738 ]
2739 );
2740 }
2741
2742 #[test]
2743 fn bash_style_assignment() {
2744 assert_eq!(
2746 lex(r#"NAME="value""#),
2747 vec![
2748 Token::Ident("NAME".to_string()),
2749 Token::Eq,
2750 Token::String("value".to_string())
2751 ]
2752 );
2753 }
2754
2755 #[test]
2756 fn positional_params() {
2757 assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2758 assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2759 assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2760 assert_eq!(lex("$@"), vec![Token::AllArgs]);
2761 assert_eq!(lex("$#"), vec![Token::ArgCount]);
2762 }
2763
2764 #[test]
2765 fn positional_in_context() {
2766 assert_eq!(
2767 lex("echo $1 $2"),
2768 vec![
2769 Token::Ident("echo".to_string()),
2770 Token::Positional(1),
2771 Token::Positional(2),
2772 ]
2773 );
2774 }
2775
2776 #[test]
2777 fn var_length() {
2778 assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2779 assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2780 assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2781 }
2782
2783 #[test]
2784 fn var_length_in_context() {
2785 assert_eq!(
2786 lex("echo ${#NAME}"),
2787 vec![
2788 Token::Ident("echo".to_string()),
2789 Token::VarLength("NAME".to_string()),
2790 ]
2791 );
2792 }
2793
2794 #[test]
2799 fn plus_flag() {
2800 assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2802 assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2803 assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2804 }
2805
2806 #[test]
2807 fn set_with_plus_flag() {
2808 assert_eq!(
2809 lex("set +e"),
2810 vec![
2811 Token::Set,
2812 Token::PlusFlag("e".to_string()),
2813 ]
2814 );
2815 }
2816
2817 #[test]
2818 fn set_with_multiple_flags() {
2819 assert_eq!(
2820 lex("set -e -u"),
2821 vec![
2822 Token::Set,
2823 Token::ShortFlag("e".to_string()),
2824 Token::ShortFlag("u".to_string()),
2825 ]
2826 );
2827 }
2828
2829 #[test]
2830 fn flags_vs_negative_numbers_edge_cases() {
2831 assert_eq!(
2833 lex("-1 a"),
2834 vec![Token::Int(-1), Token::Ident("a".to_string())]
2835 );
2836 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2838 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2840 }
2841
2842 #[test]
2843 fn single_dash_is_minus_alone() {
2844 let result = tokenize("-").expect("should lex");
2846 assert_eq!(result.len(), 1);
2847 assert!(matches!(result[0].token, Token::MinusAlone));
2848 }
2849
2850 #[test]
2851 fn plus_bare_for_date_format() {
2852 let result = tokenize("+%s").expect("should lex");
2854 assert_eq!(result.len(), 1);
2855 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2856
2857 let result = tokenize("+%Y-%m-%d").expect("should lex");
2859 assert_eq!(result.len(), 1);
2860 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2861 }
2862
2863 #[test]
2864 fn plus_flag_still_works() {
2865 let result = tokenize("+e").expect("should lex");
2867 assert_eq!(result.len(), 1);
2868 assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2869 }
2870
2871 #[test]
2872 fn while_keyword_vs_while_loop() {
2873 assert_eq!(lex("while"), vec![Token::While]);
2875 assert_eq!(
2877 lex("while true"),
2878 vec![Token::While, Token::True]
2879 );
2880 }
2881
2882 #[test]
2883 fn control_flow_keywords() {
2884 assert_eq!(lex("break"), vec![Token::Break]);
2885 assert_eq!(lex("continue"), vec![Token::Continue]);
2886 assert_eq!(lex("return"), vec![Token::Return]);
2887 assert_eq!(lex("exit"), vec![Token::Exit]);
2888 }
2889
2890 #[test]
2891 fn control_flow_with_numbers() {
2892 assert_eq!(
2893 lex("break 2"),
2894 vec![Token::Break, Token::Int(2)]
2895 );
2896 assert_eq!(
2897 lex("continue 3"),
2898 vec![Token::Continue, Token::Int(3)]
2899 );
2900 assert_eq!(
2901 lex("exit 1"),
2902 vec![Token::Exit, Token::Int(1)]
2903 );
2904 }
2905
2906 #[test]
2911 fn heredoc_simple() {
2912 let source = "cat <<EOF\nhello\nworld\nEOF";
2913 let tokens = lex(source);
2914 assert_eq!(tokens, vec![
2916 Token::Ident("cat".to_string()),
2917 Token::HereDocStart,
2918 Token::HereDoc(HereDocData {
2919 content: "hello\nworld\n".to_string(),
2920 literal: false,
2921 strip_tabs: false,
2922 body_start_offset: 10,
2923 }),
2924 Token::Newline,
2925 ]);
2926 }
2927
2928 #[test]
2929 fn heredoc_empty() {
2930 let source = "cat <<EOF\nEOF";
2931 let tokens = lex(source);
2932 assert_eq!(tokens, vec![
2933 Token::Ident("cat".to_string()),
2934 Token::HereDocStart,
2935 Token::HereDoc(HereDocData {
2936 content: "".to_string(),
2937 literal: false,
2938 strip_tabs: false,
2939 body_start_offset: 10,
2940 }),
2941 Token::Newline,
2942 ]);
2943 }
2944
2945 #[test]
2946 fn heredoc_with_special_chars() {
2947 let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2948 let tokens = lex(source);
2949 assert_eq!(tokens, vec![
2950 Token::Ident("cat".to_string()),
2951 Token::HereDocStart,
2952 Token::HereDoc(HereDocData {
2953 content: "$VAR and \"quoted\" 'single'\n".to_string(),
2954 literal: false,
2955 strip_tabs: false,
2956 body_start_offset: 10,
2957 }),
2958 Token::Newline,
2959 ]);
2960 }
2961
2962 #[test]
2963 fn heredoc_multiline() {
2964 let source = "cat <<END\nline1\nline2\nline3\nEND";
2965 let tokens = lex(source);
2966 assert_eq!(tokens, vec![
2967 Token::Ident("cat".to_string()),
2968 Token::HereDocStart,
2969 Token::HereDoc(HereDocData {
2970 content: "line1\nline2\nline3\n".to_string(),
2971 literal: false,
2972 strip_tabs: false,
2973 body_start_offset: 10,
2974 }),
2975 Token::Newline,
2976 ]);
2977 }
2978
2979 #[test]
2980 fn heredoc_in_command() {
2981 let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2982 let tokens = lex(source);
2983 assert_eq!(tokens, vec![
2984 Token::Ident("cat".to_string()),
2985 Token::HereDocStart,
2986 Token::HereDoc(HereDocData {
2987 content: "hello\n".to_string(),
2988 literal: false,
2989 strip_tabs: false,
2990 body_start_offset: 10,
2991 }),
2992 Token::Newline,
2993 Token::Ident("echo".to_string()),
2994 Token::Ident("goodbye".to_string()),
2995 ]);
2996 }
2997
/// The `<<-` form sets `strip_tabs` on the token. The expected content still
/// contains the leading tabs, and a tab-indented terminator is accepted.
#[test]
fn heredoc_strip_tabs() {
    let expected_body = HereDocData {
        content: String::from("\thello\n\tworld\n"),
        literal: false,
        strip_tabs: true,
        // `cat <<-EOF\n` is 11 bytes long, one more than the plain `<<` form.
        body_start_offset: 11,
    };
    let expected = vec![
        Token::Ident(String::from("cat")),
        Token::HereDocStart,
        Token::HereDoc(expected_body),
        Token::Newline,
    ];

    let actual = lex("cat <<-EOF\n\thello\n\tworld\n\tEOF");
    assert_eq!(actual, expected);
}
3017
/// `$(( ... ))` lexes to a single `Arithmetic` token holding the inner
/// expression text without the delimiters.
#[test]
fn arithmetic_simple() {
    let expected = vec![Token::Arithmetic(String::from("1 + 2"))];
    assert_eq!(lex("$((1 + 2))"), expected);
}
3028
/// An arithmetic expansion on the right-hand side of `=` is lexed as
/// identifier, `Eq`, then one `Arithmetic` token.
#[test]
fn arithmetic_in_assignment() {
    let expected = vec![
        Token::Ident(String::from("X")),
        Token::Eq,
        Token::Arithmetic(String::from("5 * 3")),
    ];
    assert_eq!(lex("X=$((5 * 3))"), expected);
}
3039
/// Parentheses nested inside the expression stay part of the `Arithmetic`
/// payload and do not end the token early.
#[test]
fn arithmetic_with_nested_parens() {
    let expected = vec![Token::Arithmetic(String::from("2 * (3 + 4)"))];
    assert_eq!(lex("$((2 * (3 + 4)))"), expected);
}
3046
/// A bare variable name inside `$(( ))` is kept as raw text in the
/// `Arithmetic` payload.
#[test]
fn arithmetic_with_variable() {
    let expected = vec![Token::Arithmetic(String::from("X + 1"))];
    assert_eq!(lex("$((X + 1))"), expected);
}
3053
/// A single-paren `$( ... )` is command substitution, not arithmetic: it
/// must lex as `CmdSubstStart`, the inner command tokens, then `RParen`.
#[test]
fn arithmetic_command_subst_not_confused() {
    let expected = vec![
        Token::CmdSubstStart,
        Token::Ident(String::from("echo")),
        Token::Ident(String::from("hello")),
        Token::RParen,
    ];
    assert_eq!(lex("$(echo hello)"), expected);
}
3066
/// Very deep parenthesis nesting inside `$(( ))` must be rejected with
/// exactly one `NestingTooDeep` error.
#[test]
fn arithmetic_nesting_limit() {
    // 300 levels; presumably chosen to exceed the file's `MAX_PAREN_DEPTH`
    // constant (256), which looks like the limit being enforced.
    let depth = 300;
    let source = format!("$(({}1{}))", "(".repeat(depth), ")".repeat(depth));

    let errors = tokenize(&source).expect_err("over-deep nesting should fail to tokenize");

    assert_eq!(errors.len(), 1);
    assert_eq!(errors[0].token, LexerError::NestingTooDeep);
}
3079
/// Modest nesting is accepted, and the inner parentheses are preserved in
/// the `Arithmetic` payload.
#[test]
fn arithmetic_nesting_within_limit() {
    let expected = vec![Token::Arithmetic(String::from("((1 + 2) * 3)"))];
    assert_eq!(lex("$((((1 + 2) * 3)))"), expected);
}
3087
/// Spot-checks `Token::category` with representative tokens for each
/// `TokenCategory` exercised here.
///
/// The cases are table-driven and evaluated in order, so the first wrong
/// mapping fails the test and names the offending token.
#[test]
fn token_categories() {
    let cases = [
        // Keywords
        (Token::If, TokenCategory::Keyword),
        (Token::Then, TokenCategory::Keyword),
        (Token::For, TokenCategory::Keyword),
        (Token::Function, TokenCategory::Keyword),
        (Token::True, TokenCategory::Keyword),
        (Token::TypeString, TokenCategory::Keyword),
        // Operators
        (Token::Pipe, TokenCategory::Operator),
        (Token::And, TokenCategory::Operator),
        (Token::Or, TokenCategory::Operator),
        (Token::StderrToStdout, TokenCategory::Operator),
        (Token::GtGt, TokenCategory::Operator),
        // Strings (heredocs are expected to count as strings too)
        (Token::String("test".to_string()), TokenCategory::String),
        (Token::SingleString("test".to_string()), TokenCategory::String),
        (
            Token::HereDoc(HereDocData {
                content: "test".to_string(),
                literal: false,
                strip_tabs: false,
                body_start_offset: 0,
            }),
            TokenCategory::String,
        ),
        // Numbers (arithmetic expansions are expected to count as numbers)
        (Token::Int(42), TokenCategory::Number),
        (Token::Float(3.14), TokenCategory::Number),
        (Token::Arithmetic("1+2".to_string()), TokenCategory::Number),
        // Variables
        (Token::SimpleVarRef("X".to_string()), TokenCategory::Variable),
        (Token::VarRef("${X}".to_string()), TokenCategory::Variable),
        (Token::Positional(1), TokenCategory::Variable),
        (Token::AllArgs, TokenCategory::Variable),
        (Token::ArgCount, TokenCategory::Variable),
        (Token::LastExitCode, TokenCategory::Variable),
        (Token::CurrentPid, TokenCategory::Variable),
        // Flags
        (Token::ShortFlag("l".to_string()), TokenCategory::Flag),
        (Token::LongFlag("verbose".to_string()), TokenCategory::Flag),
        (Token::PlusFlag("e".to_string()), TokenCategory::Flag),
        (Token::DoubleDash, TokenCategory::Flag),
        // Punctuation
        (Token::Semi, TokenCategory::Punctuation),
        (Token::LParen, TokenCategory::Punctuation),
        (Token::LBracket, TokenCategory::Punctuation),
        (Token::Newline, TokenCategory::Punctuation),
        // Comments
        (Token::Comment, TokenCategory::Comment),
        // Paths
        (Token::Path("/tmp/file".to_string()), TokenCategory::Path),
        // Commands (identifier-like words)
        (Token::Ident("echo".to_string()), TokenCategory::Command),
        (Token::NumberIdent("019dda1c".to_string()), TokenCategory::Command),
        (Token::DottedIdent(".gitignore".to_string()), TokenCategory::Command),
        // Error tokens
        (Token::InvalidFloatNoLeading, TokenCategory::Error),
        (Token::InvalidFloatNoTrailing, TokenCategory::Error),
    ];

    for (token, expected) in cases {
        // Render the token before calling `category()` so the failure message
        // does not depend on whether that method borrows or consumes it.
        let shown = format!("{token:?}");
        assert_eq!(token.category(), expected, "wrong category for {shown}");
    }
}
3163
/// When a heredoc opener is followed by `| cmd` on the same line, the
/// `HereDoc` token must appear before the `Pipe` token in the output stream.
#[test]
fn test_heredoc_piped_to_command() {
    let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();

    let heredoc_at = tokens
        .iter()
        .position(|t| matches!(t.token, Token::HereDoc(_)))
        .expect("should have a heredoc token");
    let pipe_at = tokens
        .iter()
        .position(|t| matches!(t.token, Token::Pipe))
        .expect("should have a pipe token");

    assert!(
        heredoc_at < pipe_at,
        "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
        heredoc_at, pipe_at, tokens,
    );
}
3179
/// A heredoc with no pipe on the opener line still yields a `HereDoc` token,
/// and no `Pipe` token appears in the stream.
#[test]
fn test_heredoc_standalone_still_works() {
    let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();

    let has_heredoc = tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_)));
    let has_pipe = tokens.iter().any(|t| matches!(t.token, Token::Pipe));

    assert!(has_heredoc);
    assert!(!has_pipe);
}
3187
/// An empty first line of a heredoc body is content, not a separator: it
/// must survive as a leading `\n` in the token.
#[test]
fn test_heredoc_preserves_leading_empty_lines() {
    let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(body) => Some(body),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(
        data.content.starts_with('\n'),
        "leading empty line must be preserved, got: {:?}",
        data.content
    );
    assert_eq!(data.content, "\nhello\n");
}
3204
/// Quoting the delimiter (`<<'EOF'`) marks the heredoc as literal, and the
/// body text, including `$HOME`, is kept unchanged.
#[test]
fn test_heredoc_quoted_delimiter_sets_literal() {
    let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(body) => Some(body),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(data.literal, "quoted delimiter should set literal=true");
    assert_eq!(data.content, "hello $HOME\n");
}
3221
/// An unquoted delimiter (`<<EOF`) leaves the heredoc's `literal` flag unset.
#[test]
fn test_heredoc_unquoted_delimiter_not_literal() {
    let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(body) => Some(body),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(!data.literal, "unquoted delimiter should have literal=false");
}
3237
/// `::` inside a word does not split it: the whole word is one `Ident`.
#[test]
fn colon_double_in_word() {
    let expected = vec![Token::Ident(String::from("foo::bar"))];
    assert_eq!(lex("foo::bar"), expected);
}
3246
/// Single colons between word characters are part of the same `Ident`.
#[test]
fn colon_single_in_word() {
    let expected = vec![Token::Ident(String::from("a:b:c"))];
    assert_eq!(lex("a:b:c"), expected);
}
3251
/// A `host:port` word, with digits after the colon, is a single `Ident`.
#[test]
fn colon_with_port() {
    let expected = vec![Token::Ident(String::from("host:8080"))];
    assert_eq!(lex("host:8080"), expected);
}
3256
/// A lone `:` lexes as the dedicated `Colon` token.
#[test]
fn colon_standalone() {
    let tokens = lex(":");
    assert_eq!(tokens, vec![Token::Colon]);
}
3261
/// A colon separated from its neighbours by whitespace is not merged into
/// either word; it is emitted as its own `Colon` token.
#[test]
fn colon_spaced_no_merge() {
    let expected = vec![
        Token::Ident(String::from("foo")),
        Token::Colon,
        Token::Ident(String::from("bar")),
    ];
    assert_eq!(lex("foo : bar"), expected);
}
3273
/// Colon-containing words are kept whole when they appear as a command
/// argument, not only at the start of the input.
#[test]
fn colon_in_command_arg() {
    let expected = vec![
        Token::Ident(String::from("echo")),
        Token::Ident(String::from("foo::bar")),
    ];
    assert_eq!(lex("echo foo::bar"), expected);
}
3284
/// A colon attached to the end of a word stays part of that `Ident`.
#[test]
fn colon_trailing() {
    let expected = vec![Token::Ident(String::from("foo:"))];
    assert_eq!(lex("foo:"), expected);
}
3290
/// A colon attached to the start of a word stays part of that `Ident`.
#[test]
fn colon_leading() {
    let expected = vec![Token::Ident(String::from(":foo"))];
    assert_eq!(lex(":foo"), expected);
}
3296
/// A path-like word that also contains a colon is expected to lex as one
/// `Ident` covering the entire text.
#[test]
fn colon_with_path() {
    let expected = vec![Token::Ident(String::from("/usr/bin:8080"))];
    assert_eq!(lex("/usr/bin:8080"), expected);
}
3305
/// `is_keyword` must return true for the loop and control-flow tokens
/// listed here.
#[test]
fn is_keyword_covers_control_flow() {
    let control_flow = [
        Token::While,
        Token::Return,
        Token::Break,
        Token::Continue,
        Token::Exit,
    ];

    for token in &control_flow {
        assert!(token.is_keyword(), "{token:?} should be a keyword");
    }
}
3322
/// `While` must be recognised as a token that can begin a statement.
#[test]
fn starts_statement_covers_while() {
    let keyword = Token::While;
    assert!(keyword.starts_statement());
}
3327
/// `is_keyword` must return false for operator and punctuation tokens.
#[test]
fn is_keyword_rejects_operators() {
    let non_keywords = [Token::Pipe, Token::Amp, Token::Eq, Token::LBrace];

    for token in &non_keywords {
        assert!(!token.is_keyword(), "{token:?} should not be a keyword");
    }
}
3334}