1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Process-wide counter folded into preprocessor marker names so that two
/// markers generated in the same nanosecond still differ.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum `(`-nesting depth accepted inside `$(( ... ))` arithmetic before
/// lexing aborts with [`LexerError::NestingTooDeep`].
const MAX_PAREN_DEPTH: usize = 256;
27
/// Records one text substitution made during preprocessing, so token spans
/// measured on the preprocessed text can be mapped back to source offsets.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Byte offset of the inserted marker within the *preprocessed* text.
    preprocessed_pos: usize,
    /// Length in bytes of the marker that was inserted.
    marker_len: usize,
    /// Length in bytes of the original source text the marker replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 let pid = std::process::id();
82 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token (or error) paired with the byte range it was lexed from.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    // The wrapped value (a Token or a LexerError).
    pub token: T,
    // Byte offsets into the original source.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs `token` with its source byte range.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Errors the lexer can report; each is delivered with a span by `tokenize`.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Fallback for input no token rule matches (logos' default error).
    #[default]
    UnexpectedCharacter,
    /// A `"` string that never closes.
    UnterminatedString,
    /// A `${...}` reference that never closes.
    UnterminatedVarRef,
    /// A backslash escape the string parser does not accept (e.g. bad `\u`).
    InvalidEscape,
    /// Digits that fail to parse as i64/f64.
    InvalidNumber,
    /// `True`/`FALSE` etc. — boolean keyword in the wrong case.
    AmbiguousBoolean(String),
    /// Bare `yes`/`no` in any case; could be a boolean or a string.
    AmbiguousBooleanLike(String),
    /// An identifier beginning with a digit, e.g. `1abc`.
    InvalidNumberIdent(String),
    /// A float written without a leading digit, e.g. `.5`.
    InvalidFloatNoLeading,
    /// A float written without a trailing digit, e.g. `5.`.
    InvalidFloatNoTrailing,
    /// `$(( ... ))` parentheses nested deeper than MAX_PAREN_DEPTH.
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118 match self {
119 LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120 LexerError::UnterminatedString => write!(f, "unterminated string"),
121 LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122 LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123 LexerError::InvalidNumber => write!(f, "invalid number"),
124 LexerError::AmbiguousBoolean(s) => {
125 write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126 }
127 LexerError::AmbiguousBooleanLike(s) => {
128 let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129 write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130 }
131 LexerError::InvalidNumberIdent(s) => {
132 write!(f, "identifier cannot start with digit: {}", s)
133 }
134 LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135 LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136 LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137 }
138 }
139}
140
/// Payload of a heredoc token, reassembled after preprocessing.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// Body text with the trailing newline(s) trimmed.
    pub content: String,
    /// True when the delimiter was quoted (`<<'EOF'` / `<<"EOF"`); callers
    /// presumably skip expansion in that case — set from the quote detection
    /// in `preprocess_heredocs`.
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162 #[token("set")]
166 Set,
167
168 #[token("local")]
169 Local,
170
171 #[token("if")]
172 If,
173
174 #[token("then")]
175 Then,
176
177 #[token("else")]
178 Else,
179
180 #[token("elif")]
181 Elif,
182
183 #[token("fi")]
184 Fi,
185
186 #[token("for")]
187 For,
188
189 #[token("while")]
190 While,
191
192 #[token("in")]
193 In,
194
195 #[token("do")]
196 Do,
197
198 #[token("done")]
199 Done,
200
201 #[token("case")]
202 Case,
203
204 #[token("esac")]
205 Esac,
206
207 #[token("function")]
208 Function,
209
210 #[token("break")]
211 Break,
212
213 #[token("continue")]
214 Continue,
215
216 #[token("return")]
217 Return,
218
219 #[token("exit")]
220 Exit,
221
222 #[token("true")]
223 True,
224
225 #[token("false")]
226 False,
227
228 #[token("string")]
232 TypeString,
233
234 #[token("int")]
235 TypeInt,
236
237 #[token("float")]
238 TypeFloat,
239
240 #[token("bool")]
241 TypeBool,
242
243 #[token("&&")]
247 And,
248
249 #[token("||")]
250 Or,
251
252 #[token("==")]
253 EqEq,
254
255 #[token("!=")]
256 NotEq,
257
258 #[token("=~")]
259 Match,
260
261 #[token("!~")]
262 NotMatch,
263
264 #[token(">=")]
265 GtEq,
266
267 #[token("<=")]
268 LtEq,
269
270 #[token(">>")]
271 GtGt,
272
273 #[token("2>&1")]
274 StderrToStdout,
275
276 #[token("1>&2")]
277 StdoutToStderr,
278
279 #[token(">&2")]
280 StdoutToStderr2,
281
282 #[token("2>")]
283 Stderr,
284
285 #[token("&>")]
286 Both,
287
288 #[token("<<")]
289 HereDocStart,
290
291 #[token(";;")]
292 DoubleSemi,
293
294 #[token("=")]
298 Eq,
299
300 #[token("|")]
301 Pipe,
302
303 #[token("&")]
304 Amp,
305
306 #[token(">")]
307 Gt,
308
309 #[token("<")]
310 Lt,
311
312 #[token(";")]
313 Semi,
314
315 #[token(":")]
316 Colon,
317
318 #[token(",")]
319 Comma,
320
321 #[token("..")]
322 DotDot,
323
324 #[token(".")]
325 Dot,
326
327 #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
329 TildePath(String),
330
331 #[token("~")]
333 Tilde,
334
335 #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
337 RelativePath(String),
338
339 #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
341 DotSlashPath(String),
342
343 #[token("{")]
344 LBrace,
345
346 #[token("}")]
347 RBrace,
348
349 #[token("[")]
350 LBracket,
351
352 #[token("]")]
353 RBracket,
354
355 #[token("(")]
356 LParen,
357
358 #[token(")")]
359 RParen,
360
361 #[token("*")]
362 Star,
363
364 #[token("!")]
365 Bang,
366
367 #[token("?")]
368 Question,
369
370 Arithmetic(String),
377
378 #[token("$(")]
380 CmdSubstStart,
381
382 #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
388 LongFlag(String),
389
390 #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
392 ShortFlag(String),
393
394 #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
396 PlusFlag(String),
397
398 #[token("--")]
400 DoubleDash,
401
402 #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
405 PlusBare(String),
406
407 #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
411 MinusBare(String),
412
413 #[token("-")]
417 MinusAlone,
418
419 #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
425 String(String),
426
427 #[regex(r"'[^']*'", lex_single_string)]
429 SingleString(String),
430
431 #[regex(r"\$\{[^}]+\}", lex_varref)]
433 VarRef(String),
434
435 #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
437 SimpleVarRef(String),
438
439 #[regex(r"\$[0-9]", lex_positional)]
441 Positional(usize),
442
443 #[token("$@")]
445 AllArgs,
446
447 #[token("$#")]
449 ArgCount,
450
451 #[token("$?")]
453 LastExitCode,
454
455 #[token("$$")]
457 CurrentPid,
458
459 #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
461 VarLength(String),
462
463 HereDoc(HereDocData),
466
467 #[regex(r"-?[0-9]+", lex_int, priority = 2)]
469 Int(i64),
470
471 #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
473 Float(f64),
474
475 #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
481 InvalidNumberIdent,
482
483 #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
485 InvalidFloatNoLeading,
486
487 #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
490 InvalidFloatNoTrailing,
491
492 #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
498 Path(String),
499
500 #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
507 Ident(String),
508
509 #[regex(r"#[^\n\r]*", allow_greedy = true)]
515 Comment,
516
517 #[regex(r"\n|\r\n")]
519 Newline,
520
521 #[regex(r"\\[ \t]*(\n|\r\n)")]
523 LineContinuation,
524}
525
/// Coarse grouping of tokens, as assigned by [`Token::category`] (e.g. for
/// syntax highlighting).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    Keyword,
    Operator,
    String,
    Number,
    Variable,
    Comment,
    Punctuation,
    // Bare identifiers and `-`/`+` words, i.e. likely command names/args.
    Command,
    Path,
    Flag,
    // Tokens whose regex matched but whose callback always errors.
    Error,
}
555
impl Token {
    /// Maps every token variant to its [`TokenCategory`].
    ///
    /// The match is exhaustive on purpose: adding a `Token` variant forces a
    /// decision here at compile time.
    pub fn category(&self) -> TokenCategory {
        match self {
            // Control-flow, declaration, boolean, and type keywords.
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Logical, comparison, assignment, and redirection operators.
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Arithmetic expressions count as numeric content.
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            // All `$`-prefixed forms.
            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            Token::Comment => TokenCategory::Comment,

            Token::Path(_)
            | Token::TildePath(_)
            | Token::RelativePath(_)
            | Token::Tilde
            | Token::DotDot
            | Token::DotSlashPath(_) => TokenCategory::Path,

            // Bare words in command position.
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            // Variants whose lexer callbacks always produce an error.
            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
674
675fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
677 parse_string_literal(lex.slice())
678}
679
680fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
682 let s = lex.slice();
683 s[1..s.len() - 1].to_string()
685}
686
687fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
689 lex.slice().to_string()
691}
692
693fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
695 lex.slice()[1..].to_string()
697}
698
699fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
701 lex.slice()[1..].parse().unwrap_or(0)
703}
704
705fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
707 let s = lex.slice();
709 s[3..s.len() - 1].to_string()
710}
711
712fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
714 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
715}
716
717fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
719 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
720}
721
/// `123abc`-style tokens always error: identifiers must not start with a digit.
fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
}

/// `.5`-style floats always error: a leading digit is required.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}

/// `5.`-style floats always error: a trailing digit is required.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
739
740fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
742 let s = lex.slice();
743
744 match s.to_lowercase().as_str() {
747 "true" | "false" if s != "true" && s != "false" => {
748 return Err(LexerError::AmbiguousBoolean(s.to_string()));
749 }
750 _ => {}
751 }
752
753 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
755 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
756 }
757
758 Ok(s.to_string())
759}
760
761fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
763 lex.slice()[2..].to_string()
765}
766
767fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
769 lex.slice()[1..].to_string()
771}
772
773fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
775 lex.slice()[1..].to_string()
777}
778
779fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
781 lex.slice().to_string()
782}
783
784fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
786 lex.slice().to_string()
787}
788
789fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
791 lex.slice().to_string()
792}
793
794fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
796 lex.slice().to_string()
797}
798
799fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
801 lex.slice().to_string()
802}
803
804fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
806 lex.slice().to_string()
807}
808
impl fmt::Display for Token {
    /// Renders keywords/operators as their source text; value-carrying tokens
    /// use an `UPPERCASE(payload)` debug-ish form.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            // Keywords render as themselves.
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            // Operators and punctuation render as their source spelling.
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Tilde => write!(f, "~"),
            Token::TildePath(s) => write!(f, "{}", s),
            Token::RelativePath(s) => write!(f, "{}", s),
            Token::DotSlashPath(s) => write!(f, "{}", s),
            // Braces are doubled to escape them in the format string.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            // Flags re-attach their prefix.
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            // Payload-carrying tokens use a tagged debug form.
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
910
impl Token {
    /// True for a subset of the keyword tokens.
    ///
    /// NOTE(review): `While`, `Break`, `Continue`, `Return`, and `Exit` are
    /// classed as keywords by `category()` but are absent here — confirm
    /// whether this narrower set is intentional for its call sites.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            Token::Set
                | Token::Local
                | Token::If
                | Token::Then
                | Token::Else
                | Token::Elif
                | Token::Fi
                | Token::For
                | Token::In
                | Token::Do
                | Token::Done
                | Token::Case
                | Token::Esac
                | Token::Function
                | Token::True
                | Token::False
        )
    }

    /// True for the four type-annotation keywords.
    pub fn is_type(&self) -> bool {
        matches!(
            self,
            Token::TypeString
                | Token::TypeInt
                | Token::TypeFloat
                | Token::TypeBool
        )
    }

    /// True for tokens that can open a statement.
    ///
    /// NOTE(review): `While` is missing even though `If`/`For`/`Case` are
    /// present — verify against the parser's expectations.
    pub fn starts_statement(&self) -> bool {
        matches!(
            self,
            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
        )
    }

    /// True for tokens that can appear where a value/expression is expected.
    pub fn is_value(&self) -> bool {
        matches!(
            self,
            Token::String(_)
                | Token::SingleString(_)
                | Token::HereDoc(_)
                | Token::Arithmetic(_)
                | Token::Int(_)
                | Token::Float(_)
                | Token::True
                | Token::False
                | Token::VarRef(_)
                | Token::SimpleVarRef(_)
                | Token::CmdSubstStart
                | Token::Path(_)
                | Token::LastExitCode
                | Token::CurrentPid
        )
    }
}
975
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    /// Source text with every `$(( expr ))` replaced by a unique marker.
    text: String,
    /// `(marker, expression)` pairs, in order of appearance.
    arithmetics: Vec<(String, String)>,
    /// Bookkeeping for mapping spans back to source offsets (see `correct_span`).
    replacements: Vec<SpanReplacement>,
}
985
/// Copies a `$( ... )` command substitution through to `result` unchanged,
/// advancing `i` (index into `chars`) and `source_pos` (byte offset) past it.
///
/// Tracks quote state so that parentheses inside '...' or "..." do not affect
/// the nesting depth, and copies backslash escapes as-is. Called with `i`
/// pointing at the `$` of `$(`.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Emit the opening `$(` and step past it.
    result.push('$');
    result.push('(');
    *i += 2;
    *source_pos += 2;

    // depth counts unbalanced '(' seen outside quotes; 0 means we are done.
    let mut depth: usize = 1;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    while *i < chars.len() && depth > 0 {
        let c = chars[*i];

        if in_single_quote {
            // Inside '...': everything is literal; only ' ends the quote.
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            if c == '\'' {
                in_single_quote = false;
            }
            continue;
        }

        if in_double_quote {
            // Inside "...": copy recognized escapes as a pair so the escaped
            // char is not misread as a quote/paren on the next iteration.
            if c == '\\' && *i + 1 < chars.len() {
                let next = chars[*i + 1];
                if next == '"' || next == '\\' || next == '$' || next == '`' {
                    result.push(c);
                    result.push(next);
                    *source_pos += c.len_utf8() + next.len_utf8();
                    *i += 2;
                    continue;
                }
            }
            if c == '"' {
                in_double_quote = false;
            }
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            continue;
        }

        // Unquoted context.
        match c {
            '\'' => {
                in_single_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            '"' => {
                in_double_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            // Backslash escapes the next char, so copy both together.
            '\\' if *i + 1 < chars.len() => {
                result.push(c);
                result.push(chars[*i + 1]);
                *source_pos += c.len_utf8() + chars[*i + 1].len_utf8();
                *i += 2;
            }
            '(' => {
                depth += 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            ')' => {
                depth -= 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            _ => {
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
        }
    }
}
1083
/// Replaces every top-level `$(( expr ))` with a unique `__KAISH_ARITH_*__`
/// marker so that the regex lexer never sees arithmetic syntax.
///
/// Quoted text ('...', and escape-aware "...") is copied verbatim, as are
/// plain `$( ... )` command substitutions. Each replacement is recorded for
/// later span correction. Errors only with `NestingTooDeep`.
///
/// NOTE(review): `$(( ... ))` inside double quotes is still rewritten (only
/// the backslash-escape pairs are skipped) — `tokenize` later relies on this
/// when it rewrites markers found inside string literals.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into the ORIGINAL source (result.len() tracks the output).
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Unquoted backslash: copy the escaped pair together so an escaped
        // quote or `$` cannot change state on the next iteration.
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single-quoted text is fully literal: copy through to the closing quote.
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1;
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                // Closing quote is ASCII, so += 1 is its exact byte length.
                result.push(chars_vec[i]); source_pos += 1;
                i += 1;
            }
            continue;
        }

        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Inside "...": copy recognized escape pairs atomically.
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // `$(` not followed by another `(` is a command substitution: copy it
        // through untouched (it may itself contain `$(( ... ))` text).
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // `$((` opens an arithmetic expression.
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Where the marker will land in the preprocessed text, and where
            // the original `$((` started in the source.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            i += 3;
            source_pos += 3;

            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            // Collect up to the matching `))`, balancing inner parens.
            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner paren; still inside the expr.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // `))` at depth 0 terminates the arithmetic.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Lone `)` at depth 0: keep it as expr text.
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Bytes of source consumed by the whole `$(( expr ))`.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1248
/// Rewrites every `<< DELIM ... DELIM` heredoc into `<<__KAISH_HEREDOC_*__`
/// plus a recorded `(marker, body, literal)` triple, so the line-oriented
/// body never reaches the regex lexer.
///
/// Supports `<<-` (strip leading tabs when matching the delimiter and from
/// body lines) and quoted delimiters (`<<'EOF'` / `<<"EOF"`), which set the
/// `literal` flag. An unterminated heredoc consumes to end of input.
///
/// NOTE(review): this changes text lengths but records no SpanReplacement,
/// so spans after a heredoc are not mapped back to source — confirm whether
/// that is acceptable for diagnostics.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '<' && chars.peek() == Some(&'<') {
            // Consume the second '<'; `<<-` requests tab stripping.
            chars.next(); let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip blanks between `<<` and the delimiter (not re-emitted).
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // A quoted delimiter marks the heredoc as literal.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            // Read the delimiter word (to closing quote, or to whitespace).
            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            // No delimiter: re-emit the operator and let the lexer handle it.
            if delimiter.is_empty() {
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Preserve the rest of the line (e.g. a trailing redirect) so it
            // is re-emitted after the marker; the newline itself is consumed.
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Collect body lines until a line equals the delimiter
            // (after optional tab stripping for `<<-`).
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        // Note: the untrimmed line is stored even for `<<-`.
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Treat \r or \r\n as a line ending (normalized to \n).
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF: accept a final delimiter line without newline,
                        // otherwise keep the partial last line.
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Drop trailing newline(s) from the stored body.
            let content = content.trim_end_matches('\n').to_string();

            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Emit `<<MARKER`, then the preserved rest-of-line and a newline.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1413
1414fn mergeable_text(token: &Token) -> Option<String> {
1419 match token {
1420 Token::Ident(s) => Some(s.clone()),
1421 Token::Colon => Some(":".to_string()),
1422 Token::Int(n) => Some(n.to_string()),
1423 Token::Path(p) => Some(p.clone()),
1424 Token::Float(f) => Some(f.to_string()),
1425 _ => None,
1426 }
1427}
1428
1429fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1438 if tokens.is_empty() {
1439 return tokens;
1440 }
1441
1442 let mut result = Vec::with_capacity(tokens.len());
1443 let mut run: Vec<&Spanned<Token>> = Vec::new();
1444
1445 for token in &tokens {
1446 if run.is_empty() {
1447 if mergeable_text(&token.token).is_some() {
1448 run.push(token);
1449 } else {
1450 result.push(token.clone());
1451 }
1452 continue;
1453 }
1454
1455 let Some(last) = run.last() else { unreachable!() };
1458 let adjacent = last.span.end == token.span.start;
1459
1460 if adjacent && mergeable_text(&token.token).is_some() {
1461 run.push(token);
1462 } else {
1463 flush_colon_run(&mut run, &mut result);
1464 if mergeable_text(&token.token).is_some() {
1465 run.push(token);
1466 } else {
1467 result.push(token.clone());
1468 }
1469 }
1470 }
1471
1472 flush_colon_run(&mut run, &mut result);
1473
1474 result
1475}
1476
1477fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1479 if run.is_empty() {
1480 return;
1481 }
1482
1483 let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1484
1485 if run.len() >= 2 && has_colon {
1486 let text: String = run
1487 .iter()
1488 .filter_map(|t| mergeable_text(&t.token))
1489 .collect();
1490 let start = run.first().map(|t| t.span.start).unwrap_or(0);
1492 let end = run.last().map(|t| t.span.end).unwrap_or(0);
1493 result.push(Spanned::new(Token::Ident(text), start..end));
1494 } else {
1495 for t in run.iter() {
1496 result.push((*t).clone());
1497 }
1498 }
1499
1500 run.clear();
1501}
1502
/// Full tokenization pipeline: preprocess arithmetic and heredocs, lex the
/// rewritten text, map spans back to the source, re-inflate markers into
/// `Arithmetic`/`HereDoc` tokens, and merge colon-adjacent words.
///
/// Comments and line continuations are dropped (see `tokenize_with_comments`
/// for a lexer that keeps them). Returns all lexer errors at once.
///
/// NOTE(review): spans are corrected only for arithmetic replacements;
/// heredoc preprocessing also changes text lengths, so spans of tokens after
/// a heredoc may not map back to original source offsets — TODO confirm.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // A preprocessing failure (NestingTooDeep) is reported over the whole input.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Map offsets on the preprocessed text back to the source.
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                // Comments and continuations carry no syntax; drop them here.
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // An ident shaped like __KAISH_ARITH_*__ that matches a recorded
        // marker becomes an Arithmetic token carrying the original expression.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
            && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
            final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
            i += 1;
            continue;
        }

        // `<<` followed by a recorded heredoc marker becomes the pair
        // HereDocStart + HereDoc(body).
        if matches!(tokens[i].token, Token::HereDocStart) {
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                    final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                    i += 2;
                    continue;
                }
            }
        }

        // Arithmetic markers that ended up INSIDE a double-quoted string are
        // rewritten to a `${__ARITH:expr__}` placeholder for later expansion.
        let token = if let Token::String(ref s) = tokens[i].token {
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(merge_colon_adjacent(final_tokens))
}
1602
1603pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1607 let lexer = Token::lexer(source);
1608 let mut tokens = Vec::new();
1609 let mut errors = Vec::new();
1610
1611 for (result, span) in lexer.spanned() {
1612 match result {
1613 Ok(token) => {
1614 tokens.push(Spanned::new(token, span));
1615 }
1616 Err(err) => {
1617 errors.push(Spanned::new(err, span));
1618 }
1619 }
1620 }
1621
1622 if errors.is_empty() {
1623 Ok(tokens)
1624 } else {
1625 Err(errors)
1626 }
1627}
1628
1629pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1631 if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1633 return Err(LexerError::UnterminatedString);
1634 }
1635
1636 let inner = &source[1..source.len() - 1];
1637 let mut result = String::with_capacity(inner.len());
1638 let mut chars = inner.chars().peekable();
1639
1640 while let Some(ch) = chars.next() {
1641 if ch == '\\' {
1642 match chars.next() {
1643 Some('n') => result.push('\n'),
1644 Some('t') => result.push('\t'),
1645 Some('r') => result.push('\r'),
1646 Some('\\') => result.push('\\'),
1647 Some('"') => result.push('"'),
1648 Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1651 Some('u') => {
1652 let mut hex = String::with_capacity(4);
1654 for _ in 0..4 {
1655 match chars.next() {
1656 Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1657 _ => return Err(LexerError::InvalidEscape),
1658 }
1659 }
1660 let codepoint = u32::from_str_radix(&hex, 16)
1661 .map_err(|_| LexerError::InvalidEscape)?;
1662 let ch = char::from_u32(codepoint)
1663 .ok_or(LexerError::InvalidEscape)?;
1664 result.push(ch);
1665 }
1666 Some(next) => {
1668 result.push('\\');
1669 result.push(next);
1670 }
1671 None => return Err(LexerError::InvalidEscape),
1672 }
1673 } else {
1674 result.push(ch);
1675 }
1676 }
1677
1678 Ok(result)
1679}
1680
1681pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1684 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1686 return Err(LexerError::UnterminatedVarRef);
1687 }
1688
1689 let inner = &source[2..source.len() - 1];
1690
1691 if inner == "?" {
1693 return Ok(vec!["?".to_string()]);
1694 }
1695
1696 let mut segments = Vec::new();
1697 let mut current = String::new();
1698 let mut chars = inner.chars().peekable();
1699
1700 while let Some(ch) = chars.next() {
1701 match ch {
1702 '.' => {
1703 if !current.is_empty() {
1704 segments.push(current.clone());
1705 current.clear();
1706 }
1707 }
1708 '[' => {
1709 if !current.is_empty() {
1710 segments.push(current.clone());
1711 current.clear();
1712 }
1713 let mut index = String::from("[");
1715 while let Some(&c) = chars.peek() {
1716 if let Some(c) = chars.next() {
1717 index.push(c);
1718 }
1719 if c == ']' {
1720 break;
1721 }
1722 }
1723 segments.push(index);
1724 }
1725 _ => {
1726 current.push(ch);
1727 }
1728 }
1729 }
1730
1731 if !current.is_empty() {
1732 segments.push(current);
1733 }
1734
1735 Ok(segments)
1736}
1737
1738pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1740 source.parse().map_err(|_| LexerError::InvalidNumber)
1741}
1742
1743pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1745 source.parse().map_err(|_| LexerError::InvalidNumber)
1746}
1747
1748#[cfg(test)]
1749mod tests {
1750 use super::*;
1751
1752 fn lex(source: &str) -> Vec<Token> {
1753 tokenize(source)
1754 .expect("lexer should succeed")
1755 .into_iter()
1756 .map(|s| s.token)
1757 .collect()
1758 }
1759
1760 #[test]
1765 fn keywords() {
1766 assert_eq!(lex("set"), vec![Token::Set]);
1767 assert_eq!(lex("if"), vec![Token::If]);
1768 assert_eq!(lex("then"), vec![Token::Then]);
1769 assert_eq!(lex("else"), vec![Token::Else]);
1770 assert_eq!(lex("elif"), vec![Token::Elif]);
1771 assert_eq!(lex("fi"), vec![Token::Fi]);
1772 assert_eq!(lex("for"), vec![Token::For]);
1773 assert_eq!(lex("in"), vec![Token::In]);
1774 assert_eq!(lex("do"), vec![Token::Do]);
1775 assert_eq!(lex("done"), vec![Token::Done]);
1776 assert_eq!(lex("case"), vec![Token::Case]);
1777 assert_eq!(lex("esac"), vec![Token::Esac]);
1778 assert_eq!(lex("function"), vec![Token::Function]);
1779 assert_eq!(lex("true"), vec![Token::True]);
1780 assert_eq!(lex("false"), vec![Token::False]);
1781 }
1782
1783 #[test]
1784 fn double_semicolon() {
1785 assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
1786 assert_eq!(lex("echo \"hi\";;"), vec![
1788 Token::Ident("echo".to_string()),
1789 Token::String("hi".to_string()),
1790 Token::DoubleSemi,
1791 ]);
1792 }
1793
1794 #[test]
1795 fn type_keywords() {
1796 assert_eq!(lex("string"), vec![Token::TypeString]);
1797 assert_eq!(lex("int"), vec![Token::TypeInt]);
1798 assert_eq!(lex("float"), vec![Token::TypeFloat]);
1799 assert_eq!(lex("bool"), vec![Token::TypeBool]);
1800 }
1801
1802 #[test]
1807 fn single_char_operators() {
1808 assert_eq!(lex("="), vec![Token::Eq]);
1809 assert_eq!(lex("|"), vec![Token::Pipe]);
1810 assert_eq!(lex("&"), vec![Token::Amp]);
1811 assert_eq!(lex(">"), vec![Token::Gt]);
1812 assert_eq!(lex("<"), vec![Token::Lt]);
1813 assert_eq!(lex(";"), vec![Token::Semi]);
1814 assert_eq!(lex(":"), vec![Token::Colon]);
1815 assert_eq!(lex(","), vec![Token::Comma]);
1816 assert_eq!(lex("."), vec![Token::Dot]);
1817 }
1818
1819 #[test]
1820 fn multi_char_operators() {
1821 assert_eq!(lex("&&"), vec![Token::And]);
1822 assert_eq!(lex("||"), vec![Token::Or]);
1823 assert_eq!(lex("=="), vec![Token::EqEq]);
1824 assert_eq!(lex("!="), vec![Token::NotEq]);
1825 assert_eq!(lex("=~"), vec![Token::Match]);
1826 assert_eq!(lex("!~"), vec![Token::NotMatch]);
1827 assert_eq!(lex(">="), vec![Token::GtEq]);
1828 assert_eq!(lex("<="), vec![Token::LtEq]);
1829 assert_eq!(lex(">>"), vec![Token::GtGt]);
1830 assert_eq!(lex("2>"), vec![Token::Stderr]);
1831 assert_eq!(lex("&>"), vec![Token::Both]);
1832 }
1833
1834 #[test]
1835 fn brackets() {
1836 assert_eq!(lex("{"), vec![Token::LBrace]);
1837 assert_eq!(lex("}"), vec![Token::RBrace]);
1838 assert_eq!(lex("["), vec![Token::LBracket]);
1839 assert_eq!(lex("]"), vec![Token::RBracket]);
1840 assert_eq!(lex("("), vec![Token::LParen]);
1841 assert_eq!(lex(")"), vec![Token::RParen]);
1842 }
1843
1844 #[test]
1849 fn integers() {
1850 assert_eq!(lex("0"), vec![Token::Int(0)]);
1851 assert_eq!(lex("42"), vec![Token::Int(42)]);
1852 assert_eq!(lex("-1"), vec![Token::Int(-1)]);
1853 assert_eq!(lex("999999"), vec![Token::Int(999999)]);
1854 }
1855
1856 #[test]
1857 fn floats() {
1858 assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
1859 assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
1860 assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
1861 }
1862
1863 #[test]
1864 fn strings() {
1865 assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1866 assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1867 assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1869 assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1870 }
1871
1872 #[test]
1873 fn var_refs() {
1874 assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
1875 assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
1876 assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
1877 assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
1878 assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
1879 }
1880
1881 #[test]
1886 fn identifiers() {
1887 assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
1888 assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
1889 assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
1890 assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
1891 assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
1892 }
1893
1894 #[test]
1895 fn keyword_prefix_identifiers() {
1896 assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
1898 assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
1899 assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
1900 assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
1901 assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
1902 }
1903
1904 #[test]
1909 fn assignment() {
1910 assert_eq!(
1911 lex("set X = 5"),
1912 vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
1913 );
1914 }
1915
1916 #[test]
1917 fn command_simple() {
1918 assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
1919 assert_eq!(
1920 lex(r#"echo "hello""#),
1921 vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
1922 );
1923 }
1924
1925 #[test]
1926 fn command_with_args() {
1927 assert_eq!(
1928 lex("cmd arg1 arg2"),
1929 vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
1930 );
1931 }
1932
1933 #[test]
1934 fn command_with_named_args() {
1935 assert_eq!(
1936 lex("cmd key=value"),
1937 vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
1938 );
1939 }
1940
1941 #[test]
1942 fn pipeline() {
1943 assert_eq!(
1944 lex("a | b | c"),
1945 vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
1946 );
1947 }
1948
1949 #[test]
1950 fn if_statement() {
1951 assert_eq!(
1952 lex("if true; then echo; fi"),
1953 vec![
1954 Token::If,
1955 Token::True,
1956 Token::Semi,
1957 Token::Then,
1958 Token::Ident("echo".to_string()),
1959 Token::Semi,
1960 Token::Fi
1961 ]
1962 );
1963 }
1964
1965 #[test]
1966 fn for_loop() {
1967 assert_eq!(
1968 lex("for X in items; do echo; done"),
1969 vec![
1970 Token::For,
1971 Token::Ident("X".to_string()),
1972 Token::In,
1973 Token::Ident("items".to_string()),
1974 Token::Semi,
1975 Token::Do,
1976 Token::Ident("echo".to_string()),
1977 Token::Semi,
1978 Token::Done
1979 ]
1980 );
1981 }
1982
1983 #[test]
1988 fn whitespace_ignored() {
1989 assert_eq!(lex(" set X = 5 "), lex("set X = 5"));
1990 }
1991
1992 #[test]
1993 fn newlines_preserved() {
1994 let tokens = lex("a\nb");
1995 assert_eq!(
1996 tokens,
1997 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1998 );
1999 }
2000
2001 #[test]
2002 fn multiple_newlines() {
2003 let tokens = lex("a\n\n\nb");
2004 assert_eq!(
2005 tokens,
2006 vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2007 );
2008 }
2009
2010 #[test]
2015 fn comments_skipped() {
2016 assert_eq!(lex("# comment"), vec![]);
2017 assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2018 assert_eq!(
2019 lex("a # comment\nb"),
2020 vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2021 );
2022 }
2023
2024 #[test]
2025 fn comments_preserved_when_requested() {
2026 let tokens = tokenize_with_comments("a # comment")
2027 .expect("should succeed")
2028 .into_iter()
2029 .map(|s| s.token)
2030 .collect::<Vec<_>>();
2031 assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2032 }
2033
2034 #[test]
2039 fn parse_simple_string() {
2040 assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2041 }
2042
2043 #[test]
2044 fn parse_string_with_escapes() {
2045 assert_eq!(
2046 parse_string_literal(r#""hello\nworld""#).expect("ok"),
2047 "hello\nworld"
2048 );
2049 assert_eq!(
2050 parse_string_literal(r#""tab\there""#).expect("ok"),
2051 "tab\there"
2052 );
2053 assert_eq!(
2054 parse_string_literal(r#""quote\"here""#).expect("ok"),
2055 "quote\"here"
2056 );
2057 }
2058
2059 #[test]
2060 fn parse_string_with_unicode() {
2061 assert_eq!(
2062 parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2063 "emoji ❤"
2064 );
2065 }
2066
2067 #[test]
2068 fn parse_string_with_escaped_dollar() {
2069 assert_eq!(
2072 parse_string_literal(r#""\$VAR""#).expect("ok"),
2073 "__KAISH_ESCAPED_DOLLAR__VAR"
2074 );
2075 assert_eq!(
2076 parse_string_literal(r#""cost: \$100""#).expect("ok"),
2077 "cost: __KAISH_ESCAPED_DOLLAR__100"
2078 );
2079 }
2080
2081 #[test]
2086 fn parse_simple_var() {
2087 assert_eq!(
2088 parse_var_ref("${X}").expect("ok"),
2089 vec!["X"]
2090 );
2091 }
2092
2093 #[test]
2094 fn parse_var_with_field() {
2095 assert_eq!(
2096 parse_var_ref("${VAR.field}").expect("ok"),
2097 vec!["VAR", "field"]
2098 );
2099 }
2100
2101 #[test]
2102 fn parse_var_with_index() {
2103 assert_eq!(
2104 parse_var_ref("${VAR[0]}").expect("ok"),
2105 vec!["VAR", "[0]"]
2106 );
2107 }
2108
2109 #[test]
2110 fn parse_var_nested() {
2111 assert_eq!(
2112 parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2113 vec!["VAR", "field", "[0]", "nested"]
2114 );
2115 }
2116
2117 #[test]
2118 fn parse_last_result() {
2119 assert_eq!(
2120 parse_var_ref("${?}").expect("ok"),
2121 vec!["?"]
2122 );
2123 assert_eq!(
2124 parse_var_ref("${?.ok}").expect("ok"),
2125 vec!["?", "ok"]
2126 );
2127 }
2128
2129 #[test]
2134 fn parse_integers() {
2135 assert_eq!(parse_int("0").expect("ok"), 0);
2136 assert_eq!(parse_int("42").expect("ok"), 42);
2137 assert_eq!(parse_int("-1").expect("ok"), -1);
2138 }
2139
2140 #[test]
2141 fn parse_floats() {
2142 assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2143 assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2144 }
2145
2146 #[test]
2151 fn empty_input() {
2152 assert_eq!(lex(""), vec![]);
2153 }
2154
2155 #[test]
2156 fn only_whitespace() {
2157 assert_eq!(lex(" \t\t "), vec![]);
2158 }
2159
2160 #[test]
2161 fn json_array() {
2162 assert_eq!(
2163 lex(r#"[1, 2, 3]"#),
2164 vec![
2165 Token::LBracket,
2166 Token::Int(1),
2167 Token::Comma,
2168 Token::Int(2),
2169 Token::Comma,
2170 Token::Int(3),
2171 Token::RBracket
2172 ]
2173 );
2174 }
2175
2176 #[test]
2177 fn json_object() {
2178 assert_eq!(
2179 lex(r#"{"key": "value"}"#),
2180 vec![
2181 Token::LBrace,
2182 Token::String("key".to_string()),
2183 Token::Colon,
2184 Token::String("value".to_string()),
2185 Token::RBrace
2186 ]
2187 );
2188 }
2189
2190 #[test]
2191 fn redirect_operators() {
2192 assert_eq!(
2193 lex("cmd > file"),
2194 vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2195 );
2196 assert_eq!(
2197 lex("cmd >> file"),
2198 vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2199 );
2200 assert_eq!(
2201 lex("cmd 2> err"),
2202 vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2203 );
2204 assert_eq!(
2205 lex("cmd &> all"),
2206 vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2207 );
2208 }
2209
2210 #[test]
2211 fn background_job() {
2212 assert_eq!(
2213 lex("cmd &"),
2214 vec![Token::Ident("cmd".to_string()), Token::Amp]
2215 );
2216 }
2217
2218 #[test]
2219 fn command_substitution() {
2220 assert_eq!(
2221 lex("$(cmd)"),
2222 vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2223 );
2224 assert_eq!(
2225 lex("$(cmd arg)"),
2226 vec![
2227 Token::CmdSubstStart,
2228 Token::Ident("cmd".to_string()),
2229 Token::Ident("arg".to_string()),
2230 Token::RParen
2231 ]
2232 );
2233 assert_eq!(
2234 lex("$(a | b)"),
2235 vec![
2236 Token::CmdSubstStart,
2237 Token::Ident("a".to_string()),
2238 Token::Pipe,
2239 Token::Ident("b".to_string()),
2240 Token::RParen
2241 ]
2242 );
2243 }
2244
2245 #[test]
2246 fn complex_pipeline() {
2247 assert_eq!(
2248 lex(r#"cat file | grep pattern="foo" | head count=10"#),
2249 vec![
2250 Token::Ident("cat".to_string()),
2251 Token::Ident("file".to_string()),
2252 Token::Pipe,
2253 Token::Ident("grep".to_string()),
2254 Token::Ident("pattern".to_string()),
2255 Token::Eq,
2256 Token::String("foo".to_string()),
2257 Token::Pipe,
2258 Token::Ident("head".to_string()),
2259 Token::Ident("count".to_string()),
2260 Token::Eq,
2261 Token::Int(10),
2262 ]
2263 );
2264 }
2265
2266 #[test]
2271 fn short_flag() {
2272 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2273 assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2274 assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2275 }
2276
2277 #[test]
2278 fn short_flag_combined() {
2279 assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2281 assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2282 }
2283
2284 #[test]
2285 fn long_flag() {
2286 assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2287 assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2288 assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2289 }
2290
2291 #[test]
2292 fn double_dash() {
2293 assert_eq!(lex("--"), vec![Token::DoubleDash]);
2295 }
2296
2297 #[test]
2298 fn flags_vs_negative_numbers() {
2299 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2301 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2303 assert_eq!(
2306 lex("-1 a"),
2307 vec![Token::Int(-1), Token::Ident("a".to_string())]
2308 );
2309 }
2310
2311 #[test]
2312 fn command_with_flags() {
2313 assert_eq!(
2314 lex("ls -l"),
2315 vec![
2316 Token::Ident("ls".to_string()),
2317 Token::ShortFlag("l".to_string()),
2318 ]
2319 );
2320 assert_eq!(
2321 lex("git commit -m"),
2322 vec![
2323 Token::Ident("git".to_string()),
2324 Token::Ident("commit".to_string()),
2325 Token::ShortFlag("m".to_string()),
2326 ]
2327 );
2328 assert_eq!(
2329 lex("git push --force"),
2330 vec![
2331 Token::Ident("git".to_string()),
2332 Token::Ident("push".to_string()),
2333 Token::LongFlag("force".to_string()),
2334 ]
2335 );
2336 }
2337
2338 #[test]
2339 fn flag_with_value() {
2340 assert_eq!(
2341 lex(r#"git commit -m "message""#),
2342 vec![
2343 Token::Ident("git".to_string()),
2344 Token::Ident("commit".to_string()),
2345 Token::ShortFlag("m".to_string()),
2346 Token::String("message".to_string()),
2347 ]
2348 );
2349 assert_eq!(
2350 lex(r#"--message="hello""#),
2351 vec![
2352 Token::LongFlag("message".to_string()),
2353 Token::Eq,
2354 Token::String("hello".to_string()),
2355 ]
2356 );
2357 }
2358
2359 #[test]
2360 fn end_of_flags_marker() {
2361 assert_eq!(
2362 lex("git checkout -- file"),
2363 vec![
2364 Token::Ident("git".to_string()),
2365 Token::Ident("checkout".to_string()),
2366 Token::DoubleDash,
2367 Token::Ident("file".to_string()),
2368 ]
2369 );
2370 }
2371
2372 #[test]
2377 fn local_keyword() {
2378 assert_eq!(lex("local"), vec![Token::Local]);
2379 assert_eq!(
2380 lex("local X = 5"),
2381 vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2382 );
2383 }
2384
2385 #[test]
2386 fn simple_var_ref() {
2387 assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2388 assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2389 assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2390 assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2391 }
2392
2393 #[test]
2394 fn simple_var_ref_in_command() {
2395 assert_eq!(
2396 lex("echo $NAME"),
2397 vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2398 );
2399 }
2400
2401 #[test]
2402 fn single_quoted_strings() {
2403 assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2404 assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2405 assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2406 assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2408 assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2409 }
2410
2411 #[test]
2412 fn test_brackets() {
2413 assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2415 assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2416 assert_eq!(
2417 lex("[[ -f file ]]"),
2418 vec![
2419 Token::LBracket,
2420 Token::LBracket,
2421 Token::ShortFlag("f".to_string()),
2422 Token::Ident("file".to_string()),
2423 Token::RBracket,
2424 Token::RBracket
2425 ]
2426 );
2427 }
2428
2429 #[test]
2430 fn test_expression_syntax() {
2431 assert_eq!(
2432 lex(r#"[[ $X == "value" ]]"#),
2433 vec![
2434 Token::LBracket,
2435 Token::LBracket,
2436 Token::SimpleVarRef("X".to_string()),
2437 Token::EqEq,
2438 Token::String("value".to_string()),
2439 Token::RBracket,
2440 Token::RBracket
2441 ]
2442 );
2443 }
2444
2445 #[test]
2446 fn bash_style_assignment() {
2447 assert_eq!(
2449 lex(r#"NAME="value""#),
2450 vec![
2451 Token::Ident("NAME".to_string()),
2452 Token::Eq,
2453 Token::String("value".to_string())
2454 ]
2455 );
2456 }
2457
2458 #[test]
2459 fn positional_params() {
2460 assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2461 assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2462 assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2463 assert_eq!(lex("$@"), vec![Token::AllArgs]);
2464 assert_eq!(lex("$#"), vec![Token::ArgCount]);
2465 }
2466
2467 #[test]
2468 fn positional_in_context() {
2469 assert_eq!(
2470 lex("echo $1 $2"),
2471 vec![
2472 Token::Ident("echo".to_string()),
2473 Token::Positional(1),
2474 Token::Positional(2),
2475 ]
2476 );
2477 }
2478
2479 #[test]
2480 fn var_length() {
2481 assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2482 assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2483 assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2484 }
2485
2486 #[test]
2487 fn var_length_in_context() {
2488 assert_eq!(
2489 lex("echo ${#NAME}"),
2490 vec![
2491 Token::Ident("echo".to_string()),
2492 Token::VarLength("NAME".to_string()),
2493 ]
2494 );
2495 }
2496
2497 #[test]
2502 fn plus_flag() {
2503 assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2505 assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2506 assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2507 }
2508
2509 #[test]
2510 fn set_with_plus_flag() {
2511 assert_eq!(
2512 lex("set +e"),
2513 vec![
2514 Token::Set,
2515 Token::PlusFlag("e".to_string()),
2516 ]
2517 );
2518 }
2519
2520 #[test]
2521 fn set_with_multiple_flags() {
2522 assert_eq!(
2523 lex("set -e -u"),
2524 vec![
2525 Token::Set,
2526 Token::ShortFlag("e".to_string()),
2527 Token::ShortFlag("u".to_string()),
2528 ]
2529 );
2530 }
2531
2532 #[test]
2533 fn flags_vs_negative_numbers_edge_cases() {
2534 assert_eq!(
2536 lex("-1 a"),
2537 vec![Token::Int(-1), Token::Ident("a".to_string())]
2538 );
2539 assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2541 assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2543 }
2544
2545 #[test]
2546 fn single_dash_is_minus_alone() {
2547 let result = tokenize("-").expect("should lex");
2549 assert_eq!(result.len(), 1);
2550 assert!(matches!(result[0].token, Token::MinusAlone));
2551 }
2552
2553 #[test]
2554 fn plus_bare_for_date_format() {
2555 let result = tokenize("+%s").expect("should lex");
2557 assert_eq!(result.len(), 1);
2558 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2559
2560 let result = tokenize("+%Y-%m-%d").expect("should lex");
2562 assert_eq!(result.len(), 1);
2563 assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2564 }
2565
2566 #[test]
2567 fn plus_flag_still_works() {
2568 let result = tokenize("+e").expect("should lex");
2570 assert_eq!(result.len(), 1);
2571 assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2572 }
2573
2574 #[test]
2575 fn while_keyword_vs_while_loop() {
2576 assert_eq!(lex("while"), vec![Token::While]);
2578 assert_eq!(
2580 lex("while true"),
2581 vec![Token::While, Token::True]
2582 );
2583 }
2584
2585 #[test]
2586 fn control_flow_keywords() {
2587 assert_eq!(lex("break"), vec![Token::Break]);
2588 assert_eq!(lex("continue"), vec![Token::Continue]);
2589 assert_eq!(lex("return"), vec![Token::Return]);
2590 assert_eq!(lex("exit"), vec![Token::Exit]);
2591 }
2592
2593 #[test]
2594 fn control_flow_with_numbers() {
2595 assert_eq!(
2596 lex("break 2"),
2597 vec![Token::Break, Token::Int(2)]
2598 );
2599 assert_eq!(
2600 lex("continue 3"),
2601 vec![Token::Continue, Token::Int(3)]
2602 );
2603 assert_eq!(
2604 lex("exit 1"),
2605 vec![Token::Exit, Token::Int(1)]
2606 );
2607 }
2608
2609 #[test]
2614 fn heredoc_simple() {
2615 let source = "cat <<EOF\nhello\nworld\nEOF";
2616 let tokens = lex(source);
2617 assert_eq!(tokens, vec![
2618 Token::Ident("cat".to_string()),
2619 Token::HereDocStart,
2620 Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2621 Token::Newline,
2622 ]);
2623 }
2624
2625 #[test]
2626 fn heredoc_empty() {
2627 let source = "cat <<EOF\nEOF";
2628 let tokens = lex(source);
2629 assert_eq!(tokens, vec![
2630 Token::Ident("cat".to_string()),
2631 Token::HereDocStart,
2632 Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2633 Token::Newline,
2634 ]);
2635 }
2636
2637 #[test]
2638 fn heredoc_with_special_chars() {
2639 let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2640 let tokens = lex(source);
2641 assert_eq!(tokens, vec![
2642 Token::Ident("cat".to_string()),
2643 Token::HereDocStart,
2644 Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2645 Token::Newline,
2646 ]);
2647 }
2648
2649 #[test]
2650 fn heredoc_multiline() {
2651 let source = "cat <<END\nline1\nline2\nline3\nEND";
2652 let tokens = lex(source);
2653 assert_eq!(tokens, vec![
2654 Token::Ident("cat".to_string()),
2655 Token::HereDocStart,
2656 Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2657 Token::Newline,
2658 ]);
2659 }
2660
2661 #[test]
2662 fn heredoc_in_command() {
2663 let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2664 let tokens = lex(source);
2665 assert_eq!(tokens, vec![
2666 Token::Ident("cat".to_string()),
2667 Token::HereDocStart,
2668 Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2669 Token::Newline,
2670 Token::Ident("echo".to_string()),
2671 Token::Ident("goodbye".to_string()),
2672 ]);
2673 }
2674
2675 #[test]
2676 fn heredoc_strip_tabs() {
2677 let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2678 let tokens = lex(source);
2679 assert_eq!(tokens, vec![
2681 Token::Ident("cat".to_string()),
2682 Token::HereDocStart,
2683 Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2684 Token::Newline,
2685 ]);
2686 }
2687
2688 #[test]
2693 fn arithmetic_simple() {
2694 let source = "$((1 + 2))";
2695 let tokens = lex(source);
2696 assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2697 }
2698
2699 #[test]
2700 fn arithmetic_in_assignment() {
2701 let source = "X=$((5 * 3))";
2702 let tokens = lex(source);
2703 assert_eq!(tokens, vec![
2704 Token::Ident("X".to_string()),
2705 Token::Eq,
2706 Token::Arithmetic("5 * 3".to_string()),
2707 ]);
2708 }
2709
2710 #[test]
2711 fn arithmetic_with_nested_parens() {
2712 let source = "$((2 * (3 + 4)))";
2713 let tokens = lex(source);
2714 assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2715 }
2716
2717 #[test]
2718 fn arithmetic_with_variable() {
2719 let source = "$((X + 1))";
2720 let tokens = lex(source);
2721 assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2722 }
2723
2724 #[test]
2725 fn arithmetic_command_subst_not_confused() {
2726 let source = "$(echo hello)";
2728 let tokens = lex(source);
2729 assert_eq!(tokens, vec![
2730 Token::CmdSubstStart,
2731 Token::Ident("echo".to_string()),
2732 Token::Ident("hello".to_string()),
2733 Token::RParen,
2734 ]);
2735 }
2736
2737 #[test]
2738 fn arithmetic_nesting_limit() {
2739 let open_parens = "(".repeat(300);
2741 let close_parens = ")".repeat(300);
2742 let source = format!("$(({}1{}))", open_parens, close_parens);
2743 let result = tokenize(&source);
2744 assert!(result.is_err());
2745 let errors = result.unwrap_err();
2746 assert_eq!(errors.len(), 1);
2747 assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2748 }
2749
2750 #[test]
2751 fn arithmetic_nesting_within_limit() {
2752 let source = "$((((1 + 2) * 3)))";
2754 let tokens = lex(source);
2755 assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2756 }
2757
2758 #[test]
2763 fn token_categories() {
2764 assert_eq!(Token::If.category(), TokenCategory::Keyword);
2766 assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2767 assert_eq!(Token::For.category(), TokenCategory::Keyword);
2768 assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2769 assert_eq!(Token::True.category(), TokenCategory::Keyword);
2770 assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2771
2772 assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2774 assert_eq!(Token::And.category(), TokenCategory::Operator);
2775 assert_eq!(Token::Or.category(), TokenCategory::Operator);
2776 assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2777 assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2778
2779 assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2781 assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2782 assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2783
2784 assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2786 assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2787 assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2788
2789 assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2791 assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2792 assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2793 assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2794 assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2795 assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2796 assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2797
2798 assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2800 assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2801 assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2802 assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2803
2804 assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2806 assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2807 assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2808 assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2809
2810 assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2812
2813 assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2815
2816 assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2818
2819 assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2821 assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2822 assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2823 }
2824
2825 #[test]
2826 fn test_heredoc_piped_to_command() {
2827 let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2830 let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2831 let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2832 assert!(heredoc_pos.is_some(), "should have a heredoc token");
2833 assert!(pipe_pos.is_some(), "should have a pipe token");
2834 assert!(
2835 pipe_pos.unwrap() > heredoc_pos.unwrap(),
2836 "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2837 heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2838 );
2839 }
2840
2841 #[test]
2842 fn test_heredoc_standalone_still_works() {
2843 let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2845 assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2846 assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2847 }
2848
2849 #[test]
2850 fn test_heredoc_preserves_leading_empty_lines() {
2851 let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2853 let heredoc = tokens.iter().find_map(|t| {
2854 if let Token::HereDoc(data) = &t.token {
2855 Some(data.clone())
2856 } else {
2857 None
2858 }
2859 });
2860 assert!(heredoc.is_some(), "should have a heredoc token");
2861 let data = heredoc.unwrap();
2862 assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2863 assert_eq!(data.content, "\nhello");
2864 }
2865
2866 #[test]
2867 fn test_heredoc_quoted_delimiter_sets_literal() {
2868 let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2870 let heredoc = tokens.iter().find_map(|t| {
2871 if let Token::HereDoc(data) = &t.token {
2872 Some(data.clone())
2873 } else {
2874 None
2875 }
2876 });
2877 assert!(heredoc.is_some(), "should have a heredoc token");
2878 let data = heredoc.unwrap();
2879 assert!(data.literal, "quoted delimiter should set literal=true");
2880 assert_eq!(data.content, "hello $HOME");
2881 }
2882
2883 #[test]
2884 fn test_heredoc_unquoted_delimiter_not_literal() {
2885 let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2887 let heredoc = tokens.iter().find_map(|t| {
2888 if let Token::HereDoc(data) = &t.token {
2889 Some(data.clone())
2890 } else {
2891 None
2892 }
2893 });
2894 assert!(heredoc.is_some(), "should have a heredoc token");
2895 let data = heredoc.unwrap();
2896 assert!(!data.literal, "unquoted delimiter should have literal=false");
2897 }
2898
2899 #[test]
2904 fn colon_double_in_word() {
2905 assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
2906 }
2907
2908 #[test]
2909 fn colon_single_in_word() {
2910 assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
2911 }
2912
2913 #[test]
2914 fn colon_with_port() {
2915 assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
2916 }
2917
2918 #[test]
2919 fn colon_standalone() {
2920 assert_eq!(lex(":"), vec![Token::Colon]);
2921 }
2922
2923 #[test]
2924 fn colon_spaced_no_merge() {
2925 assert_eq!(
2926 lex("foo : bar"),
2927 vec![
2928 Token::Ident("foo".into()),
2929 Token::Colon,
2930 Token::Ident("bar".into()),
2931 ]
2932 );
2933 }
2934
2935 #[test]
2936 fn colon_in_command_arg() {
2937 assert_eq!(
2938 lex("echo foo::bar"),
2939 vec![
2940 Token::Ident("echo".into()),
2941 Token::Ident("foo::bar".into()),
2942 ]
2943 );
2944 }
2945
2946 #[test]
2947 fn colon_trailing() {
2948 assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
2950 }
2951
2952 #[test]
2953 fn colon_leading() {
2954 assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
2956 }
2957
2958 #[test]
2959 fn colon_with_path() {
2960 assert_eq!(
2962 lex("/usr/bin:8080"),
2963 vec![Token::Ident("/usr/bin:8080".into())]
2964 );
2965 }
2966}