1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
21static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);
23
24const MAX_PAREN_DEPTH: usize = 256;
27
/// Records one marker substitution made by `preprocess_arithmetic` so that
/// token spans over the preprocessed text can be mapped back to the original
/// source (see `correct_span`).
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Byte offset of the inserted marker within the *preprocessed* text.
    preprocessed_pos: usize,
    /// Byte length of the inserted marker.
    marker_len: usize,
    /// Byte length of the original `$(( ... ))` text the marker replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 let pid = std::process::id();
82 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A value (token or error) paired with the byte range it was lexed from.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    /// The wrapped token or error.
    pub token: T,
    /// Half-open byte range into the text this item came from.
    pub span: Span,
}
91
92impl<T> Spanned<T> {
93 pub fn new(token: T, span: Span) -> Self {
94 Self { token, span }
95 }
96}
97
/// Reasons the lexer can reject input.
///
/// `UnexpectedCharacter` is `#[default]` because logos reports it for any
/// input no rule matches.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// No lexer rule matched the input.
    #[default]
    UnexpectedCharacter,
    /// A double-quoted string was not properly delimited.
    UnterminatedString,
    /// A `${...}` reference was not properly delimited.
    UnterminatedVarRef,
    /// Malformed escape in a double-quoted string (`\u` without four hex
    /// digits, an invalid codepoint, or a dangling backslash).
    InvalidEscape,
    /// Digits that failed to parse as `i64` / `f64`.
    InvalidNumber,
    /// Non-lowercase spelling of `true`/`false`; payload is the offending word.
    AmbiguousBoolean(String),
    /// `yes`/`no` in any case; payload is the offending word.
    AmbiguousBooleanLike(String),
    /// An identifier starting with digits (e.g. `1abc`); payload is the text.
    InvalidNumberIdent(String),
    /// A float written like `.5` (no digit before the dot).
    InvalidFloatNoLeading,
    /// A float written like `5.` (no digit after the dot).
    InvalidFloatNoTrailing,
    /// `$(( ... ))` parentheses nested deeper than `MAX_PAREN_DEPTH`.
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118 match self {
119 LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120 LexerError::UnterminatedString => write!(f, "unterminated string"),
121 LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122 LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123 LexerError::InvalidNumber => write!(f, "invalid number"),
124 LexerError::AmbiguousBoolean(s) => {
125 write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126 }
127 LexerError::AmbiguousBooleanLike(s) => {
128 let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129 write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130 }
131 LexerError::InvalidNumberIdent(s) => {
132 write!(f, "identifier cannot start with digit: {}", s)
133 }
134 LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135 LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136 LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137 }
138 }
139}
140
/// Body of a heredoc captured during preprocessing and reattached to the
/// token stream by `tokenize`.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// The heredoc body, with trailing newlines trimmed.
    pub content: String,
    /// True when the delimiter was quoted (`<<'EOF'` / `<<"EOF"`) —
    /// presumably meaning the body is taken literally without expansion;
    /// confirm against the consumer of this flag.
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162 #[token("set")]
166 Set,
167
168 #[token("local")]
169 Local,
170
171 #[token("if")]
172 If,
173
174 #[token("then")]
175 Then,
176
177 #[token("else")]
178 Else,
179
180 #[token("elif")]
181 Elif,
182
183 #[token("fi")]
184 Fi,
185
186 #[token("for")]
187 For,
188
189 #[token("while")]
190 While,
191
192 #[token("in")]
193 In,
194
195 #[token("do")]
196 Do,
197
198 #[token("done")]
199 Done,
200
201 #[token("case")]
202 Case,
203
204 #[token("esac")]
205 Esac,
206
207 #[token("function")]
208 Function,
209
210 #[token("break")]
211 Break,
212
213 #[token("continue")]
214 Continue,
215
216 #[token("return")]
217 Return,
218
219 #[token("exit")]
220 Exit,
221
222 #[token("true")]
223 True,
224
225 #[token("false")]
226 False,
227
228 #[token("string")]
232 TypeString,
233
234 #[token("int")]
235 TypeInt,
236
237 #[token("float")]
238 TypeFloat,
239
240 #[token("bool")]
241 TypeBool,
242
243 #[token("&&")]
247 And,
248
249 #[token("||")]
250 Or,
251
252 #[token("==")]
253 EqEq,
254
255 #[token("!=")]
256 NotEq,
257
258 #[token("=~")]
259 Match,
260
261 #[token("!~")]
262 NotMatch,
263
264 #[token(">=")]
265 GtEq,
266
267 #[token("<=")]
268 LtEq,
269
270 #[token(">>")]
271 GtGt,
272
273 #[token("2>&1")]
274 StderrToStdout,
275
276 #[token("1>&2")]
277 StdoutToStderr,
278
279 #[token(">&2")]
280 StdoutToStderr2,
281
282 #[token("2>")]
283 Stderr,
284
285 #[token("&>")]
286 Both,
287
288 #[token("<<")]
289 HereDocStart,
290
291 #[token(";;")]
292 DoubleSemi,
293
294 #[token("=")]
298 Eq,
299
300 #[token("|")]
301 Pipe,
302
303 #[token("&")]
304 Amp,
305
306 #[token(">")]
307 Gt,
308
309 #[token("<")]
310 Lt,
311
312 #[token(";")]
313 Semi,
314
315 #[token(":")]
316 Colon,
317
318 #[token(",")]
319 Comma,
320
321 #[token("..")]
322 DotDot,
323
324 #[token(".")]
325 Dot,
326
327 #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
329 TildePath(String),
330
331 #[token("~")]
333 Tilde,
334
335 #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
337 RelativePath(String),
338
339 #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
341 DotSlashPath(String),
342
343 #[token("{")]
344 LBrace,
345
346 #[token("}")]
347 RBrace,
348
349 #[token("[")]
350 LBracket,
351
352 #[token("]")]
353 RBracket,
354
355 #[token("(")]
356 LParen,
357
358 #[token(")")]
359 RParen,
360
361 #[token("*")]
362 Star,
363
364 #[token("!")]
365 Bang,
366
367 #[token("?")]
368 Question,
369
370 Arithmetic(String),
377
378 #[token("$(")]
380 CmdSubstStart,
381
382 #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
388 LongFlag(String),
389
390 #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
392 ShortFlag(String),
393
394 #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
396 PlusFlag(String),
397
398 #[token("--")]
400 DoubleDash,
401
402 #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
405 PlusBare(String),
406
407 #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
411 MinusBare(String),
412
413 #[token("-")]
417 MinusAlone,
418
419 #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
425 String(String),
426
427 #[regex(r"'[^']*'", lex_single_string)]
429 SingleString(String),
430
431 #[regex(r"\$\{[^}]+\}", lex_varref)]
433 VarRef(String),
434
435 #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
437 SimpleVarRef(String),
438
439 #[regex(r"\$[0-9]", lex_positional)]
441 Positional(usize),
442
443 #[token("$@")]
445 AllArgs,
446
447 #[token("$#")]
449 ArgCount,
450
451 #[token("$?")]
453 LastExitCode,
454
455 #[token("$$")]
457 CurrentPid,
458
459 #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
461 VarLength(String),
462
463 HereDoc(HereDocData),
466
467 #[regex(r"-?[0-9]+", lex_int, priority = 2)]
469 Int(i64),
470
471 #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
473 Float(f64),
474
475 #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
481 InvalidNumberIdent,
482
483 #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
485 InvalidFloatNoLeading,
486
487 #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
490 InvalidFloatNoTrailing,
491
492 #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
498 Path(String),
499
500 #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
507 Ident(String),
508
509 #[regex(r"#[^\n\r]*", allow_greedy = true)]
515 Comment,
516
517 #[regex(r"\n|\r\n")]
519 Newline,
520
521 #[regex(r"\\[ \t]*(\n|\r\n)")]
523 LineContinuation,
524}
525
/// Coarse token classes assigned by [`Token::category`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Reserved words, including booleans and type names.
    Keyword,
    /// Logical/comparison/redirection operators.
    Operator,
    /// String-like tokens, including heredoc bodies.
    String,
    /// Numeric literals and arithmetic expressions.
    Number,
    /// Variable references (`$x`, `${x}`, `$@`, `$?`, …).
    Variable,
    /// `#` comments.
    Comment,
    /// Brackets, separators, newlines, and similar structure.
    Punctuation,
    /// Bare words (likely command names) and lone `-`/`+` words.
    Command,
    /// Filesystem-path tokens.
    Path,
    /// Command-line flags (`-x`, `--long`, `+x`, `--`).
    Flag,
    /// Recognized-but-invalid input (number/float error rules).
    Error,
}
555
impl Token {
    /// Classifies this token into a coarse [`TokenCategory`] bucket.
    ///
    /// The match is exhaustive over every variant (no `_` arm), so adding a
    /// `Token` variant forces this mapping to be updated.
    pub fn category(&self) -> TokenCategory {
        match self {
            // All reserved words — including `while`, control-flow words,
            // booleans, and type names.
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Logical, comparison, assignment, and redirection operators.
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Arithmetic expressions are grouped with numbers.
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            Token::Comment => TokenCategory::Comment,

            Token::Path(_)
            | Token::TildePath(_)
            | Token::RelativePath(_)
            | Token::Tilde
            | Token::DotDot
            | Token::DotSlashPath(_) => TokenCategory::Path,

            // Bare words and lone sign-prefixed words read as commands/args.
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
674
675fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
677 parse_string_literal(lex.slice())
678}
679
680fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
682 let s = lex.slice();
683 s[1..s.len() - 1].to_string()
685}
686
687fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
689 lex.slice().to_string()
691}
692
693fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
695 lex.slice()[1..].to_string()
697}
698
699fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
701 lex.slice()[1..].parse().unwrap_or(0)
703}
704
705fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
707 let s = lex.slice();
709 s[3..s.len() - 1].to_string()
710}
711
712fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
714 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
715}
716
717fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
719 lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
720}
721
722fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
725 Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
726}
727
/// Error callback for floats written without a leading digit (e.g. `.5`);
/// always fails so the lexer reports a targeted diagnostic.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}
733
/// Error callback for floats written without a trailing digit (e.g. `5.`);
/// always fails so the lexer reports a targeted diagnostic.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
739
740fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
742 let s = lex.slice();
743
744 match s.to_lowercase().as_str() {
747 "true" | "false" if s != "true" && s != "false" => {
748 return Err(LexerError::AmbiguousBoolean(s.to_string()));
749 }
750 _ => {}
751 }
752
753 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
755 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
756 }
757
758 Ok(s.to_string())
759}
760
761fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
763 lex.slice()[2..].to_string()
765}
766
767fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
769 lex.slice()[1..].to_string()
771}
772
773fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
775 lex.slice()[1..].to_string()
777}
778
779fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
781 lex.slice().to_string()
782}
783
784fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
786 lex.slice().to_string()
787}
788
789fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
791 lex.slice().to_string()
792}
793
794fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
796 lex.slice().to_string()
797}
798
799fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
801 lex.slice().to_string()
802}
803
804fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
806 lex.slice().to_string()
807}
808
impl fmt::Display for Token {
    /// Writes the token either as its literal source spelling (keywords,
    /// operators, punctuation, paths, flags) or as an uppercase tagged form
    /// such as `STRING(..)` / `INT(..)` for tokens carrying data.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Tilde => write!(f, "~"),
            Token::TildePath(s) => write!(f, "{}", s),
            Token::RelativePath(s) => write!(f, "{}", s),
            Token::DotSlashPath(s) => write!(f, "{}", s),
            // Braces must be doubled to escape the format-string syntax.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            // Flags are reprinted with their sign prefix restored.
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
910
impl Token {
    /// True for the structural keywords this predicate tracks.
    ///
    /// NOTE(review): `While`, `Break`, `Continue`, `Return`, `Exit`, and the
    /// type keywords are classified as keywords by `category()` but are NOT
    /// listed here — confirm the narrower set is intentional for the callers
    /// of this predicate.
    pub fn is_keyword(&self) -> bool {
        matches!(
            self,
            Token::Set
                | Token::Local
                | Token::If
                | Token::Then
                | Token::Else
                | Token::Elif
                | Token::Fi
                | Token::For
                | Token::In
                | Token::Do
                | Token::Done
                | Token::Case
                | Token::Esac
                | Token::Function
                | Token::True
                | Token::False
        )
    }

    /// True for the built-in type-name keywords (`string`, `int`, `float`,
    /// `bool`).
    pub fn is_type(&self) -> bool {
        matches!(
            self,
            Token::TypeString
                | Token::TypeInt
                | Token::TypeFloat
                | Token::TypeBool
        )
    }

    /// True for tokens that can begin a statement.
    ///
    /// NOTE(review): `While` is absent even though `while` loops presumably
    /// start statements — confirm against the parser.
    pub fn starts_statement(&self) -> bool {
        matches!(
            self,
            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
        )
    }

    /// True for tokens that can appear where a value is expected (literals,
    /// variable references, and the start of a command substitution).
    pub fn is_value(&self) -> bool {
        matches!(
            self,
            Token::String(_)
                | Token::SingleString(_)
                | Token::HereDoc(_)
                | Token::Arithmetic(_)
                | Token::Int(_)
                | Token::Float(_)
                | Token::True
                | Token::False
                | Token::VarRef(_)
                | Token::SimpleVarRef(_)
                | Token::CmdSubstStart
                | Token::Path(_)
                | Token::LastExitCode
                | Token::CurrentPid
        )
    }
}
975
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    /// Source text with each `$(( ... ))` replaced by a unique marker.
    text: String,
    /// `(marker, expression text)` pairs in order of appearance.
    arithmetics: Vec<(String, String)>,
    /// Bookkeeping for mapping spans in `text` back to the original source.
    replacements: Vec<SpanReplacement>,
}
985
/// Copies a `$( ... )` command substitution verbatim from `chars` into
/// `result`, advancing `*i` (char index) and `*source_pos` (byte offset)
/// past it.
///
/// Called with `*i` pointing at the `$` of `$(`. Tracks nested parentheses
/// and single/double-quote state so quotes, escapes, and parentheses inside
/// the substitution do not terminate it early. If the closing `)` is never
/// found, copying simply stops at end of input.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Emit the `$(` opener and step past it (both chars are 1 byte).
    result.push('$');
    result.push('(');
    *i += 2;
    *source_pos += 2;

    let mut depth: usize = 1;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    while *i < chars.len() && depth > 0 {
        let c = chars[*i];

        // Inside single quotes everything is literal until the closing quote.
        if in_single_quote {
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            if c == '\'' {
                in_single_quote = false;
            }
            continue;
        }

        // Inside double quotes, only \" \\ \$ \` act as escape pairs; other
        // backslashes fall through and are copied as single characters.
        if in_double_quote {
            if c == '\\' && *i + 1 < chars.len() {
                let next = chars[*i + 1];
                if next == '"' || next == '\\' || next == '$' || next == '`' {
                    result.push(c);
                    result.push(next);
                    *source_pos += c.len_utf8() + next.len_utf8();
                    *i += 2;
                    continue;
                }
            }
            if c == '"' {
                in_double_quote = false;
            }
            result.push(c);
            *source_pos += c.len_utf8();
            *i += 1;
            continue;
        }

        // Unquoted context: track quote entry, escapes, and paren nesting.
        match c {
            '\'' => {
                in_single_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            '"' => {
                in_double_quote = true;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            // Unquoted backslash escapes the next character unconditionally.
            '\\' if *i + 1 < chars.len() => {
                result.push(c);
                result.push(chars[*i + 1]);
                *source_pos += c.len_utf8() + chars[*i + 1].len_utf8();
                *i += 2;
            }
            '(' => {
                depth += 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            ')' => {
                // depth reaching 0 ends the loop after copying this `)`.
                depth -= 1;
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
            _ => {
                result.push(c);
                *source_pos += c.len_utf8();
                *i += 1;
            }
        }
    }
}
1083
/// Replaces every top-level `$(( expr ))` in `source` with a unique
/// `__KAISH_ARITH_<id>__` marker, returning the rewritten text, the
/// extracted expressions, and the span bookkeeping needed by `correct_span`.
///
/// Quoting rules mirror shell behavior: `$((` inside single quotes is
/// literal, and escaped characters never start a substitution. Plain `$( )`
/// command substitutions are copied verbatim (see
/// `skip_command_substitution`) so their contents are not mistaken for
/// arithmetic.
///
/// # Errors
/// `NestingTooDeep` when parentheses inside an arithmetic expression nest
/// deeper than `MAX_PAREN_DEPTH`.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into the ORIGINAL source; `result.len()` tracks the
    // preprocessed text. The two diverge once a marker is emitted.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Unquoted escape pair: copy both chars, never treat `\$` as `$((`.
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single-quoted text is copied verbatim through the closing quote.
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1;
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                // Closing quote (1 byte).
                result.push(chars_vec[i]); source_pos += 1;
                i += 1;
            }
            continue;
        }

        // Double quotes toggle state but are otherwise copied through;
        // note `$((` INSIDE double quotes is still substituted below.
        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Inside double quotes only \" \\ \$ \` form escape pairs.
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // `$(` NOT followed by another `(` is a command substitution —
        // copy it wholesale so its body is never scanned for `$((`.
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // `$((` starts an arithmetic expression.
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Marker position in the PREPROCESSED text; original start in
            // SOURCE bytes.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Step past `$((` (3 ASCII bytes).
            i += 3;
            source_pos += 3;

            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            // Collect until the matching `))`; inner parens are balanced so
            // a single `)` at depth 0 that is not followed by `)` is kept as
            // part of the expression text.
            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // The terminating `))`.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Bytes of original text consumed by the whole `$(( ... ))`.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1248
/// Extracts every heredoc body from `source`, replacing each
/// `<<[-][quote]DELIM` header with `<<__KAISH_HEREDOC_<id>__` and collecting
/// `(marker, body, literal)` triples that `tokenize` later reattaches.
///
/// Supports `<<-` (leading tabs stripped when matching the delimiter line)
/// and quoted delimiters (`<<'EOF'` / `<<"EOF"`), which set the `literal`
/// flag. An unterminated heredoc consumes to end of input.
///
/// NOTE(review): `<<` is recognized anywhere in the text — including inside
/// string literals — since this pass has no quote tracking; confirm that is
/// acceptable. Also, with `<<-` tabs are stripped only for the delimiter
/// comparison, not from the stored body lines (bash strips both) — confirm
/// intended.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '<' && chars.peek() == Some(&'<') {
            // Consume the second `<`, then check for the tab-strip modifier.
            chars.next(); let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip horizontal whitespace before the delimiter word.
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Read the (optionally quoted) delimiter.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        // Consume the closing quote; delimiter is complete.
                        chars.next(); break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            // No delimiter (e.g. `<<` at end of line): emit the operator
            // unchanged and keep lexing normally.
            if delimiter.is_empty() {
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Preserve the rest of the header line (e.g. `> file`) so it is
            // re-emitted after the marker.
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Accumulate body lines until a line equal to the delimiter
            // (after optional tab-stripping) or end of input.
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Normalize CRLF / lone CR to a single line break.
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF: a final un-newlined line may still be the
                        // delimiter; otherwise it belongs to the body.
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Drop trailing blank lines from the stored body.
            let content = content.trim_end_matches('\n').to_string();

            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Re-emit the header with the marker standing in for the body.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1413
/// Full lexing pipeline: preprocess `$(( ... ))` and heredocs into markers,
/// lex with logos, correct spans for the arithmetic substitutions, then swap
/// the marker tokens back into `Arithmetic` / `HereDoc` tokens.
///
/// `Comment` and `LineContinuation` tokens are dropped from the output.
///
/// # Errors
/// All lex errors with their (corrected) spans; a preprocessing failure is
/// reported as a single error spanning the whole input.
///
/// NOTE(review): spans are corrected only for arithmetic replacements —
/// heredoc marker substitution also changes text length but is not reflected
/// in `correct_span`; confirm span accuracy after a heredoc is acceptable.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    // Heredoc extraction runs on the arithmetic-preprocessed text.
    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Map the span over the preprocessed text back to source coordinates.
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Second pass: resolve preprocessing markers back into real tokens.
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // An identifier shaped like an arithmetic marker becomes an
        // `Arithmetic` token carrying the extracted expression.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
            && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
            final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
            i += 1;
            continue;
        }

        // `<<` followed by a heredoc marker becomes `<<` + `HereDoc(body)`.
        if matches!(tokens[i].token, Token::HereDocStart) {
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                    final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                    i += 2;
                    continue;
                }
            }
        }

        // Arithmetic markers that ended up INSIDE a double-quoted string are
        // rewritten to an `${__ARITH:expr__}` placeholder for later expansion.
        let token = if let Token::String(ref s) = tokens[i].token {
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(final_tokens)
}
1512
1513pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1517 let lexer = Token::lexer(source);
1518 let mut tokens = Vec::new();
1519 let mut errors = Vec::new();
1520
1521 for (result, span) in lexer.spanned() {
1522 match result {
1523 Ok(token) => {
1524 tokens.push(Spanned::new(token, span));
1525 }
1526 Err(err) => {
1527 errors.push(Spanned::new(err, span));
1528 }
1529 }
1530 }
1531
1532 if errors.is_empty() {
1533 Ok(tokens)
1534 } else {
1535 Err(errors)
1536 }
1537}
1538
1539pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1541 if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1543 return Err(LexerError::UnterminatedString);
1544 }
1545
1546 let inner = &source[1..source.len() - 1];
1547 let mut result = String::with_capacity(inner.len());
1548 let mut chars = inner.chars().peekable();
1549
1550 while let Some(ch) = chars.next() {
1551 if ch == '\\' {
1552 match chars.next() {
1553 Some('n') => result.push('\n'),
1554 Some('t') => result.push('\t'),
1555 Some('r') => result.push('\r'),
1556 Some('\\') => result.push('\\'),
1557 Some('"') => result.push('"'),
1558 Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1561 Some('u') => {
1562 let mut hex = String::with_capacity(4);
1564 for _ in 0..4 {
1565 match chars.next() {
1566 Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1567 _ => return Err(LexerError::InvalidEscape),
1568 }
1569 }
1570 let codepoint = u32::from_str_radix(&hex, 16)
1571 .map_err(|_| LexerError::InvalidEscape)?;
1572 let ch = char::from_u32(codepoint)
1573 .ok_or(LexerError::InvalidEscape)?;
1574 result.push(ch);
1575 }
1576 Some(next) => {
1578 result.push('\\');
1579 result.push(next);
1580 }
1581 None => return Err(LexerError::InvalidEscape),
1582 }
1583 } else {
1584 result.push(ch);
1585 }
1586 }
1587
1588 Ok(result)
1589}
1590
1591pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1594 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1596 return Err(LexerError::UnterminatedVarRef);
1597 }
1598
1599 let inner = &source[2..source.len() - 1];
1600
1601 if inner == "?" {
1603 return Ok(vec!["?".to_string()]);
1604 }
1605
1606 let mut segments = Vec::new();
1607 let mut current = String::new();
1608 let mut chars = inner.chars().peekable();
1609
1610 while let Some(ch) = chars.next() {
1611 match ch {
1612 '.' => {
1613 if !current.is_empty() {
1614 segments.push(current.clone());
1615 current.clear();
1616 }
1617 }
1618 '[' => {
1619 if !current.is_empty() {
1620 segments.push(current.clone());
1621 current.clear();
1622 }
1623 let mut index = String::from("[");
1625 while let Some(&c) = chars.peek() {
1626 if let Some(c) = chars.next() {
1627 index.push(c);
1628 }
1629 if c == ']' {
1630 break;
1631 }
1632 }
1633 segments.push(index);
1634 }
1635 _ => {
1636 current.push(ch);
1637 }
1638 }
1639 }
1640
1641 if !current.is_empty() {
1642 segments.push(current);
1643 }
1644
1645 Ok(segments)
1646}
1647
1648pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1650 source.parse().map_err(|_| LexerError::InvalidNumber)
1651}
1652
1653pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1655 source.parse().map_err(|_| LexerError::InvalidNumber)
1656}
1657
#[cfg(test)]
mod tests {
    // Unit tests for the lexer: token recognition, span-free token
    // comparison via the `lex` helper, and the standalone parse_* helpers.
    use super::*;

    /// Lexes `source` and returns just the tokens (spans dropped),
    /// panicking on any lexer error. Tests that expect errors call
    /// `tokenize` directly instead.
    fn lex(source: &str) -> Vec<Token> {
        tokenize(source)
            .expect("lexer should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect()
    }

    // ---- keywords & operators -------------------------------------------

    #[test]
    fn keywords() {
        assert_eq!(lex("set"), vec![Token::Set]);
        assert_eq!(lex("if"), vec![Token::If]);
        assert_eq!(lex("then"), vec![Token::Then]);
        assert_eq!(lex("else"), vec![Token::Else]);
        assert_eq!(lex("elif"), vec![Token::Elif]);
        assert_eq!(lex("fi"), vec![Token::Fi]);
        assert_eq!(lex("for"), vec![Token::For]);
        assert_eq!(lex("in"), vec![Token::In]);
        assert_eq!(lex("do"), vec![Token::Do]);
        assert_eq!(lex("done"), vec![Token::Done]);
        assert_eq!(lex("case"), vec![Token::Case]);
        assert_eq!(lex("esac"), vec![Token::Esac]);
        assert_eq!(lex("function"), vec![Token::Function]);
        assert_eq!(lex("true"), vec![Token::True]);
        assert_eq!(lex("false"), vec![Token::False]);
    }

    #[test]
    fn double_semicolon() {
        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
        assert_eq!(lex("echo \"hi\";;"), vec![
            Token::Ident("echo".to_string()),
            Token::String("hi".to_string()),
            Token::DoubleSemi,
        ]);
    }

    #[test]
    fn type_keywords() {
        assert_eq!(lex("string"), vec![Token::TypeString]);
        assert_eq!(lex("int"), vec![Token::TypeInt]);
        assert_eq!(lex("float"), vec![Token::TypeFloat]);
        assert_eq!(lex("bool"), vec![Token::TypeBool]);
    }

    #[test]
    fn single_char_operators() {
        assert_eq!(lex("="), vec![Token::Eq]);
        assert_eq!(lex("|"), vec![Token::Pipe]);
        assert_eq!(lex("&"), vec![Token::Amp]);
        assert_eq!(lex(">"), vec![Token::Gt]);
        assert_eq!(lex("<"), vec![Token::Lt]);
        assert_eq!(lex(";"), vec![Token::Semi]);
        assert_eq!(lex(":"), vec![Token::Colon]);
        assert_eq!(lex(","), vec![Token::Comma]);
        assert_eq!(lex("."), vec![Token::Dot]);
    }

    #[test]
    fn multi_char_operators() {
        assert_eq!(lex("&&"), vec![Token::And]);
        assert_eq!(lex("||"), vec![Token::Or]);
        assert_eq!(lex("=="), vec![Token::EqEq]);
        assert_eq!(lex("!="), vec![Token::NotEq]);
        assert_eq!(lex("=~"), vec![Token::Match]);
        assert_eq!(lex("!~"), vec![Token::NotMatch]);
        assert_eq!(lex(">="), vec![Token::GtEq]);
        assert_eq!(lex("<="), vec![Token::LtEq]);
        assert_eq!(lex(">>"), vec![Token::GtGt]);
        assert_eq!(lex("2>"), vec![Token::Stderr]);
        assert_eq!(lex("&>"), vec![Token::Both]);
    }

    #[test]
    fn brackets() {
        assert_eq!(lex("{"), vec![Token::LBrace]);
        assert_eq!(lex("}"), vec![Token::RBrace]);
        assert_eq!(lex("["), vec![Token::LBracket]);
        assert_eq!(lex("]"), vec![Token::RBracket]);
        assert_eq!(lex("("), vec![Token::LParen]);
        assert_eq!(lex(")"), vec![Token::RParen]);
    }

    // ---- literals -------------------------------------------------------

    #[test]
    fn integers() {
        assert_eq!(lex("0"), vec![Token::Int(0)]);
        assert_eq!(lex("42"), vec![Token::Int(42)]);
        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
    }

    #[test]
    fn floats() {
        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
    }

    #[test]
    fn strings() {
        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
    }

    #[test]
    fn var_refs() {
        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
    }

    // ---- identifiers ----------------------------------------------------

    #[test]
    fn identifiers() {
        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
    }

    // Identifiers that merely start with a keyword must not be split.
    #[test]
    fn keyword_prefix_identifiers() {
        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
        assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
    }

    // ---- statements & commands ------------------------------------------

    #[test]
    fn assignment() {
        assert_eq!(
            lex("set X = 5"),
            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    #[test]
    fn command_simple() {
        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
        assert_eq!(
            lex(r#"echo "hello""#),
            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
        );
    }

    #[test]
    fn command_with_args() {
        assert_eq!(
            lex("cmd arg1 arg2"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
        );
    }

    #[test]
    fn command_with_named_args() {
        assert_eq!(
            lex("cmd key=value"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
        );
    }

    #[test]
    fn pipeline() {
        assert_eq!(
            lex("a | b | c"),
            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
        );
    }

    #[test]
    fn if_statement() {
        assert_eq!(
            lex("if true; then echo; fi"),
            vec![
                Token::If,
                Token::True,
                Token::Semi,
                Token::Then,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Fi
            ]
        );
    }

    #[test]
    fn for_loop() {
        assert_eq!(
            lex("for X in items; do echo; done"),
            vec![
                Token::For,
                Token::Ident("X".to_string()),
                Token::In,
                Token::Ident("items".to_string()),
                Token::Semi,
                Token::Do,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Done
            ]
        );
    }

    // ---- whitespace & newlines ------------------------------------------

    #[test]
    fn whitespace_ignored() {
        assert_eq!(lex("  set   X  =  5  "), lex("set X = 5"));
    }

    #[test]
    fn newlines_preserved() {
        let tokens = lex("a\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    #[test]
    fn multiple_newlines() {
        let tokens = lex("a\n\n\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // ---- comments -------------------------------------------------------

    #[test]
    fn comments_skipped() {
        assert_eq!(lex("# comment"), vec![]);
        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
        assert_eq!(
            lex("a # comment\nb"),
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    #[test]
    fn comments_preserved_when_requested() {
        let tokens = tokenize_with_comments("a # comment")
            .expect("should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect::<Vec<_>>();
        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
    }

    // ---- parse_string_literal -------------------------------------------

    #[test]
    fn parse_simple_string() {
        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
    }

    #[test]
    fn parse_string_with_escapes() {
        assert_eq!(
            parse_string_literal(r#""hello\nworld""#).expect("ok"),
            "hello\nworld"
        );
        assert_eq!(
            parse_string_literal(r#""tab\there""#).expect("ok"),
            "tab\there"
        );
        assert_eq!(
            parse_string_literal(r#""quote\"here""#).expect("ok"),
            "quote\"here"
        );
    }

    #[test]
    fn parse_string_with_unicode() {
        assert_eq!(
            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
            "emoji ❤"
        );
    }

    // `\$` expands to the internal marker, not a literal dollar sign.
    #[test]
    fn parse_string_with_escaped_dollar() {
        assert_eq!(
            parse_string_literal(r#""\$VAR""#).expect("ok"),
            "__KAISH_ESCAPED_DOLLAR__VAR"
        );
        assert_eq!(
            parse_string_literal(r#""cost: \$100""#).expect("ok"),
            "cost: __KAISH_ESCAPED_DOLLAR__100"
        );
    }

    // ---- parse_var_ref --------------------------------------------------

    #[test]
    fn parse_simple_var() {
        assert_eq!(
            parse_var_ref("${X}").expect("ok"),
            vec!["X"]
        );
    }

    #[test]
    fn parse_var_with_field() {
        assert_eq!(
            parse_var_ref("${VAR.field}").expect("ok"),
            vec!["VAR", "field"]
        );
    }

    #[test]
    fn parse_var_with_index() {
        assert_eq!(
            parse_var_ref("${VAR[0]}").expect("ok"),
            vec!["VAR", "[0]"]
        );
    }

    #[test]
    fn parse_var_nested() {
        assert_eq!(
            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
            vec!["VAR", "field", "[0]", "nested"]
        );
    }

    #[test]
    fn parse_last_result() {
        assert_eq!(
            parse_var_ref("${?}").expect("ok"),
            vec!["?"]
        );
        assert_eq!(
            parse_var_ref("${?.ok}").expect("ok"),
            vec!["?", "ok"]
        );
    }

    // ---- parse_int / parse_float ----------------------------------------

    #[test]
    fn parse_integers() {
        assert_eq!(parse_int("0").expect("ok"), 0);
        assert_eq!(parse_int("42").expect("ok"), 42);
        assert_eq!(parse_int("-1").expect("ok"), -1);
    }

    #[test]
    fn parse_floats() {
        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
    }

    // ---- edge cases & structured input ----------------------------------

    #[test]
    fn empty_input() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    fn only_whitespace() {
        assert_eq!(lex("   \t\t  "), vec![]);
    }

    #[test]
    fn json_array() {
        assert_eq!(
            lex(r#"[1, 2, 3]"#),
            vec![
                Token::LBracket,
                Token::Int(1),
                Token::Comma,
                Token::Int(2),
                Token::Comma,
                Token::Int(3),
                Token::RBracket
            ]
        );
    }

    #[test]
    fn json_object() {
        assert_eq!(
            lex(r#"{"key": "value"}"#),
            vec![
                Token::LBrace,
                Token::String("key".to_string()),
                Token::Colon,
                Token::String("value".to_string()),
                Token::RBrace
            ]
        );
    }

    #[test]
    fn redirect_operators() {
        assert_eq!(
            lex("cmd > file"),
            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd >> file"),
            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd 2> err"),
            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
        );
        assert_eq!(
            lex("cmd &> all"),
            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
        );
    }

    #[test]
    fn background_job() {
        assert_eq!(
            lex("cmd &"),
            vec![Token::Ident("cmd".to_string()), Token::Amp]
        );
    }

    #[test]
    fn command_substitution() {
        assert_eq!(
            lex("$(cmd)"),
            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
        );
        assert_eq!(
            lex("$(cmd arg)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("cmd".to_string()),
                Token::Ident("arg".to_string()),
                Token::RParen
            ]
        );
        assert_eq!(
            lex("$(a | b)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("a".to_string()),
                Token::Pipe,
                Token::Ident("b".to_string()),
                Token::RParen
            ]
        );
    }

    #[test]
    fn complex_pipeline() {
        assert_eq!(
            lex(r#"cat file | grep pattern="foo" | head count=10"#),
            vec![
                Token::Ident("cat".to_string()),
                Token::Ident("file".to_string()),
                Token::Pipe,
                Token::Ident("grep".to_string()),
                Token::Ident("pattern".to_string()),
                Token::Eq,
                Token::String("foo".to_string()),
                Token::Pipe,
                Token::Ident("head".to_string()),
                Token::Ident("count".to_string()),
                Token::Eq,
                Token::Int(10),
            ]
        );
    }

    // ---- flags ----------------------------------------------------------

    #[test]
    fn short_flag() {
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
    }

    #[test]
    fn short_flag_combined() {
        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
    }

    #[test]
    fn long_flag() {
        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
    }

    #[test]
    fn double_dash() {
        assert_eq!(lex("--"), vec![Token::DoubleDash]);
    }

    // A leading `-` followed by digits is a negative number, not a flag.
    #[test]
    fn flags_vs_negative_numbers() {
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
    }

    #[test]
    fn command_with_flags() {
        assert_eq!(
            lex("ls -l"),
            vec![
                Token::Ident("ls".to_string()),
                Token::ShortFlag("l".to_string()),
            ]
        );
        assert_eq!(
            lex("git commit -m"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
            ]
        );
        assert_eq!(
            lex("git push --force"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("push".to_string()),
                Token::LongFlag("force".to_string()),
            ]
        );
    }

    #[test]
    fn flag_with_value() {
        assert_eq!(
            lex(r#"git commit -m "message""#),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
                Token::String("message".to_string()),
            ]
        );
        assert_eq!(
            lex(r#"--message="hello""#),
            vec![
                Token::LongFlag("message".to_string()),
                Token::Eq,
                Token::String("hello".to_string()),
            ]
        );
    }

    #[test]
    fn end_of_flags_marker() {
        assert_eq!(
            lex("git checkout -- file"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("checkout".to_string()),
                Token::DoubleDash,
                Token::Ident("file".to_string()),
            ]
        );
    }

    // ---- additional syntax: local, $VAR, single quotes, [[ ]] -----------

    #[test]
    fn local_keyword() {
        assert_eq!(lex("local"), vec![Token::Local]);
        assert_eq!(
            lex("local X = 5"),
            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    #[test]
    fn simple_var_ref() {
        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
    }

    #[test]
    fn simple_var_ref_in_command() {
        assert_eq!(
            lex("echo $NAME"),
            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
        );
    }

    #[test]
    fn single_quoted_strings() {
        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
    }

    #[test]
    fn test_brackets() {
        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
        assert_eq!(
            lex("[[ -f file ]]"),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::ShortFlag("f".to_string()),
                Token::Ident("file".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }

    #[test]
    fn test_expression_syntax() {
        assert_eq!(
            lex(r#"[[ $X == "value" ]]"#),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::SimpleVarRef("X".to_string()),
                Token::EqEq,
                Token::String("value".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }

    #[test]
    fn bash_style_assignment() {
        assert_eq!(
            lex(r#"NAME="value""#),
            vec![
                Token::Ident("NAME".to_string()),
                Token::Eq,
                Token::String("value".to_string())
            ]
        );
    }

    // ---- positional parameters & var length ------------------------------

    #[test]
    fn positional_params() {
        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
        assert_eq!(lex("$@"), vec![Token::AllArgs]);
        assert_eq!(lex("$#"), vec![Token::ArgCount]);
    }

    #[test]
    fn positional_in_context() {
        assert_eq!(
            lex("echo $1 $2"),
            vec![
                Token::Ident("echo".to_string()),
                Token::Positional(1),
                Token::Positional(2),
            ]
        );
    }

    #[test]
    fn var_length() {
        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
    }

    #[test]
    fn var_length_in_context() {
        assert_eq!(
            lex("echo ${#NAME}"),
            vec![
                Token::Ident("echo".to_string()),
                Token::VarLength("NAME".to_string()),
            ]
        );
    }

    // ---- plus flags & bare plus ------------------------------------------

    #[test]
    fn plus_flag() {
        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
    }

    #[test]
    fn set_with_plus_flag() {
        assert_eq!(
            lex("set +e"),
            vec![
                Token::Set,
                Token::PlusFlag("e".to_string()),
            ]
        );
    }

    #[test]
    fn set_with_multiple_flags() {
        assert_eq!(
            lex("set -e -u"),
            vec![
                Token::Set,
                Token::ShortFlag("e".to_string()),
                Token::ShortFlag("u".to_string()),
            ]
        );
    }

    #[test]
    fn flags_vs_negative_numbers_edge_cases() {
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
    }

    #[test]
    fn single_dash_is_minus_alone() {
        let result = tokenize("-").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::MinusAlone));
    }

    // Bare `+...` (e.g. `date +%s` format strings) lexes as PlusBare.
    #[test]
    fn plus_bare_for_date_format() {
        let result = tokenize("+%s").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));

        let result = tokenize("+%Y-%m-%d").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
    }

    #[test]
    fn plus_flag_still_works() {
        let result = tokenize("+e").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
    }

    // ---- control flow keywords -------------------------------------------

    #[test]
    fn while_keyword_vs_while_loop() {
        assert_eq!(lex("while"), vec![Token::While]);
        assert_eq!(
            lex("while true"),
            vec![Token::While, Token::True]
        );
    }

    #[test]
    fn control_flow_keywords() {
        assert_eq!(lex("break"), vec![Token::Break]);
        assert_eq!(lex("continue"), vec![Token::Continue]);
        assert_eq!(lex("return"), vec![Token::Return]);
        assert_eq!(lex("exit"), vec![Token::Exit]);
    }

    #[test]
    fn control_flow_with_numbers() {
        assert_eq!(
            lex("break 2"),
            vec![Token::Break, Token::Int(2)]
        );
        assert_eq!(
            lex("continue 3"),
            vec![Token::Continue, Token::Int(3)]
        );
        assert_eq!(
            lex("exit 1"),
            vec![Token::Exit, Token::Int(1)]
        );
    }

    // ---- heredocs --------------------------------------------------------

    #[test]
    fn heredoc_simple() {
        let source = "cat <<EOF\nhello\nworld\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    #[test]
    fn heredoc_empty() {
        let source = "cat <<EOF\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    #[test]
    fn heredoc_with_special_chars() {
        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    #[test]
    fn heredoc_multiline() {
        let source = "cat <<END\nline1\nline2\nline3\nEND";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    #[test]
    fn heredoc_in_command() {
        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
            Token::Newline,
            Token::Ident("echo".to_string()),
            Token::Ident("goodbye".to_string()),
        ]);
    }

    // NOTE(review): the expected content here still contains the leading
    // tabs, so `<<-` does not appear to strip body tabs (only, presumably,
    // those before the delimiter) — confirm against the lexer's heredoc rule.
    #[test]
    fn heredoc_strip_tabs() {
        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // ---- arithmetic expansion --------------------------------------------

    #[test]
    fn arithmetic_simple() {
        let source = "$((1 + 2))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
    }

    #[test]
    fn arithmetic_in_assignment() {
        let source = "X=$((5 * 3))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("X".to_string()),
            Token::Eq,
            Token::Arithmetic("5 * 3".to_string()),
        ]);
    }

    #[test]
    fn arithmetic_with_nested_parens() {
        let source = "$((2 * (3 + 4)))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
    }

    #[test]
    fn arithmetic_with_variable() {
        let source = "$((X + 1))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
    }

    #[test]
    fn arithmetic_command_subst_not_confused() {
        let source = "$(echo hello)";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::CmdSubstStart,
            Token::Ident("echo".to_string()),
            Token::Ident("hello".to_string()),
            Token::RParen,
        ]);
    }

    // 300 nested parens exceeds the MAX_PAREN_DEPTH (256) guard.
    #[test]
    fn arithmetic_nesting_limit() {
        let open_parens = "(".repeat(300);
        let close_parens = ")".repeat(300);
        let source = format!("$(({}1{}))", open_parens, close_parens);
        let result = tokenize(&source);
        assert!(result.is_err());
        let errors = result.unwrap_err();
        assert_eq!(errors.len(), 1);
        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
    }

    #[test]
    fn arithmetic_nesting_within_limit() {
        let source = "$((((1 + 2) * 3)))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
    }

    // ---- token categories (for highlighting / diagnostics) ---------------

    #[test]
    fn token_categories() {
        assert_eq!(Token::If.category(), TokenCategory::Keyword);
        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
        assert_eq!(Token::For.category(), TokenCategory::Keyword);
        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
        assert_eq!(Token::True.category(), TokenCategory::Keyword);
        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);

        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
        assert_eq!(Token::And.category(), TokenCategory::Operator);
        assert_eq!(Token::Or.category(), TokenCategory::Operator);
        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);

        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);

        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);

        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);

        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);

        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);

        assert_eq!(Token::Comment.category(), TokenCategory::Comment);

        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);

        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);

        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
    }

    // ---- heredoc regression tests ----------------------------------------

    #[test]
    fn test_heredoc_piped_to_command() {
        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
        assert!(heredoc_pos.is_some(), "should have a heredoc token");
        assert!(pipe_pos.is_some(), "should have a pipe token");
        assert!(
            pipe_pos.unwrap() > heredoc_pos.unwrap(),
            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
        );
    }

    #[test]
    fn test_heredoc_standalone_still_works() {
        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
    }

    #[test]
    fn test_heredoc_preserves_leading_empty_lines() {
        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
        assert_eq!(data.content, "\nhello");
    }

    #[test]
    fn test_heredoc_quoted_delimiter_sets_literal() {
        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(data.literal, "quoted delimiter should set literal=true");
        assert_eq!(data.content, "hello $HOME");
    }

    #[test]
    fn test_heredoc_unquoted_delimiter_not_literal() {
        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(!data.literal, "unquoted delimiter should have literal=false");
    }
}