1use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
21static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);
23
24const MAX_PAREN_DEPTH: usize = 256;
27
/// Records one marker substitution made during preprocessing so token spans
/// over the preprocessed text can be mapped back toward original offsets.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Byte offset of the marker within the preprocessed text.
    preprocessed_pos: usize,
    /// Byte length of the inserted marker.
    marker_len: usize,
    /// Byte length of the original construct the marker replaced.
    original_len: usize,
}
40
41fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43 let mut start_adjustment: isize = 0;
44 let mut end_adjustment: isize = 0;
45
46 for r in replacements {
47 let delta = r.original_len as isize - r.marker_len as isize;
49
50 if span.start > r.preprocessed_pos + r.marker_len {
52 start_adjustment += delta;
53 } else if span.start > r.preprocessed_pos {
54 start_adjustment += delta;
57 }
58
59 if span.end > r.preprocessed_pos + r.marker_len {
61 end_adjustment += delta;
62 } else if span.end > r.preprocessed_pos {
63 end_adjustment += delta;
65 }
66 }
67
68 let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69 let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70 new_start..new_end
71}
72
73fn unique_marker_id() -> String {
76 let timestamp = SystemTime::now()
77 .duration_since(UNIX_EPOCH)
78 .map(|d| d.as_nanos())
79 .unwrap_or(0);
80 let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81 let pid = std::process::id();
82 format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token (or error) paired with its byte span in the lexed text.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    pub token: T,
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs `token` with the span it was lexed from.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Errors the lexer can report, one per rejected token.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Fallback when no rule matches (logos' default error, hence `#[default]`).
    #[default]
    UnexpectedCharacter,
    /// A double-quoted string is missing its closing quote.
    UnterminatedString,
    /// A `${…}` reference is missing its closing brace.
    UnterminatedVarRef,
    /// Malformed escape sequence inside a string literal.
    InvalidEscape,
    /// Numeric literal that does not parse (e.g. i64 overflow).
    InvalidNumber,
    /// Mixed/upper-case boolean spelling such as `True`; payload is the lexeme.
    AmbiguousBoolean(String),
    /// Bare `yes`/`no` (any case); payload is the lexeme.
    AmbiguousBooleanLike(String),
    /// Identifier starting with digits, e.g. `1abc`; payload is the lexeme.
    InvalidNumberIdent(String),
    /// Float written without a leading digit, e.g. `.5`.
    InvalidFloatNoLeading,
    /// Float written without a trailing digit, e.g. `5.`.
    InvalidFloatNoTrailing,
    /// Parentheses inside `$(( … ))` nested beyond `MAX_PAREN_DEPTH`.
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118 match self {
119 LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120 LexerError::UnterminatedString => write!(f, "unterminated string"),
121 LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122 LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123 LexerError::InvalidNumber => write!(f, "invalid number"),
124 LexerError::AmbiguousBoolean(s) => {
125 write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126 }
127 LexerError::AmbiguousBooleanLike(s) => {
128 let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129 write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130 }
131 LexerError::InvalidNumberIdent(s) => {
132 write!(f, "identifier cannot start with digit: {}", s)
133 }
134 LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135 LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136 LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137 }
138 }
139}
140
/// Body of a heredoc plus whether its delimiter was quoted.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// The heredoc body with trailing newline(s) removed.
    pub content: String,
    /// True when the delimiter was quoted ('…' or "…"); presumably the body
    /// is then taken literally (no expansion) — confirm against the parser.
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162 #[token("set")]
166 Set,
167
168 #[token("local")]
169 Local,
170
171 #[token("if")]
172 If,
173
174 #[token("then")]
175 Then,
176
177 #[token("else")]
178 Else,
179
180 #[token("elif")]
181 Elif,
182
183 #[token("fi")]
184 Fi,
185
186 #[token("for")]
187 For,
188
189 #[token("while")]
190 While,
191
192 #[token("in")]
193 In,
194
195 #[token("do")]
196 Do,
197
198 #[token("done")]
199 Done,
200
201 #[token("case")]
202 Case,
203
204 #[token("esac")]
205 Esac,
206
207 #[token("function")]
208 Function,
209
210 #[token("break")]
211 Break,
212
213 #[token("continue")]
214 Continue,
215
216 #[token("return")]
217 Return,
218
219 #[token("exit")]
220 Exit,
221
222 #[token("true")]
223 True,
224
225 #[token("false")]
226 False,
227
228 #[token("string")]
232 TypeString,
233
234 #[token("int")]
235 TypeInt,
236
237 #[token("float")]
238 TypeFloat,
239
240 #[token("bool")]
241 TypeBool,
242
243 #[token("&&")]
247 And,
248
249 #[token("||")]
250 Or,
251
252 #[token("==")]
253 EqEq,
254
255 #[token("!=")]
256 NotEq,
257
258 #[token("=~")]
259 Match,
260
261 #[token("!~")]
262 NotMatch,
263
264 #[token(">=")]
265 GtEq,
266
267 #[token("<=")]
268 LtEq,
269
270 #[token(">>")]
271 GtGt,
272
273 #[token("2>&1")]
274 StderrToStdout,
275
276 #[token("1>&2")]
277 StdoutToStderr,
278
279 #[token(">&2")]
280 StdoutToStderr2,
281
282 #[token("2>")]
283 Stderr,
284
285 #[token("&>")]
286 Both,
287
288 #[token("<<")]
289 HereDocStart,
290
291 #[token(";;")]
292 DoubleSemi,
293
294 #[token("=")]
298 Eq,
299
300 #[token("|")]
301 Pipe,
302
303 #[token("&")]
304 Amp,
305
306 #[token(">")]
307 Gt,
308
309 #[token("<")]
310 Lt,
311
312 #[token(";")]
313 Semi,
314
315 #[token(":")]
316 Colon,
317
318 #[token(",")]
319 Comma,
320
321 #[token(".")]
322 Dot,
323
324 #[token("{")]
325 LBrace,
326
327 #[token("}")]
328 RBrace,
329
330 #[token("[")]
331 LBracket,
332
333 #[token("]")]
334 RBracket,
335
336 #[token("(")]
337 LParen,
338
339 #[token(")")]
340 RParen,
341
342 #[token("*")]
343 Star,
344
345 #[token("!")]
346 Bang,
347
348 #[token("?")]
349 Question,
350
351 Arithmetic(String),
358
359 #[token("$(")]
361 CmdSubstStart,
362
363 #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
369 LongFlag(String),
370
371 #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
373 ShortFlag(String),
374
375 #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
377 PlusFlag(String),
378
379 #[token("--")]
381 DoubleDash,
382
383 #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
386 PlusBare(String),
387
388 #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
392 MinusBare(String),
393
394 #[token("-")]
398 MinusAlone,
399
400 #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
406 String(String),
407
408 #[regex(r"'[^']*'", lex_single_string)]
410 SingleString(String),
411
412 #[regex(r"\$\{[^}]+\}", lex_varref)]
414 VarRef(String),
415
416 #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
418 SimpleVarRef(String),
419
420 #[regex(r"\$[0-9]", lex_positional)]
422 Positional(usize),
423
424 #[token("$@")]
426 AllArgs,
427
428 #[token("$#")]
430 ArgCount,
431
432 #[token("$?")]
434 LastExitCode,
435
436 #[token("$$")]
438 CurrentPid,
439
440 #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
442 VarLength(String),
443
444 HereDoc(HereDocData),
447
448 #[regex(r"-?[0-9]+", lex_int, priority = 2)]
450 Int(i64),
451
452 #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
454 Float(f64),
455
456 #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
462 InvalidNumberIdent,
463
464 #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
466 InvalidFloatNoLeading,
467
468 #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
471 InvalidFloatNoTrailing,
472
473 #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
479 Path(String),
480
481 #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
488 Ident(String),
489
490 #[regex(r"#[^\n\r]*", allow_greedy = true)]
496 Comment,
497
498 #[regex(r"\n|\r\n")]
500 Newline,
501
502 #[regex(r"\\[ \t]*(\n|\r\n)")]
504 LineContinuation,
505}
506
/// Coarse token classes, e.g. for syntax-highlighting themes.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    Keyword,
    Operator,
    String,
    Number,
    Variable,
    Comment,
    Punctuation,
    /// Bare words / command names.
    Command,
    Path,
    Flag,
    /// Tokens that always lex as errors (kept so tools can still color them).
    Error,
}
536
impl Token {
    /// Coarse classification of this token (see `TokenCategory`); the match
    /// is exhaustive so adding a variant forces an update here.
    pub fn category(&self) -> TokenCategory {
        match self {
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Arithmetic expressions are grouped with numeric literals.
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            Token::Comment => TokenCategory::Comment,

            Token::Path(_) => TokenCategory::Path,

            // Bare words (including `+…`/`-…` non-flags) read as commands.
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
650
// --- logos callbacks: extract/validate each pattern's payload ---

/// Unescapes a double-quoted string literal (slice includes both quotes).
fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
    parse_string_literal(lex.slice())
}

/// Returns the raw content between single quotes (no escape processing).
fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
    let s = lex.slice();
    // The quotes are ASCII, so stripping one byte each side is safe.
    s[1..s.len() - 1].to_string()
}

/// Keeps the full `${…}` text; it is decomposed later by `parse_var_ref`.
fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}

/// `$name` -> `name`.
fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}

/// `$N` -> N; the pattern guarantees a single digit, so parse cannot fail.
fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
    lex.slice()[1..].parse().unwrap_or(0)
}

/// `${#name}` -> `name` (drops the 3-byte `${#` prefix and the `}`).
fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
    let s = lex.slice();
    s[3..s.len() - 1].to_string()
}

/// Parses an integer literal; i64 overflow reports `InvalidNumber`.
fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
}

/// Parses a float literal.
fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
}
697
/// `123abc`-style tokens always fail; the payload echoes the offending slice.
fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
}

/// `.5` — floats must carry a leading digit.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}

/// `5.` — floats must carry a trailing digit.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
715
716fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
718 let s = lex.slice();
719
720 match s.to_lowercase().as_str() {
723 "true" | "false" if s != "true" && s != "false" => {
724 return Err(LexerError::AmbiguousBoolean(s.to_string()));
725 }
726 _ => {}
727 }
728
729 if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
731 return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
732 }
733
734 Ok(s.to_string())
735}
736
/// `--name` -> `name`.
fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[2..].to_string()
}

/// `-abc` -> `abc`.
fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}

/// `+x` -> `x`.
fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice()[1..].to_string()
}

/// Non-flag word starting with '+'; kept verbatim (prefix included).
fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}

/// Non-flag word starting with '-'; kept verbatim (prefix included).
fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}

/// Absolute path beginning with '/'; kept verbatim.
fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
    lex.slice().to_string()
}
769
impl fmt::Display for Token {
    /// Debug-oriented rendering: keywords/operators print their lexeme,
    /// data-carrying tokens print an UPPERCASE tag with their payload.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            // Braces are doubled to escape them in the format string.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
866
867impl Token {
868 pub fn is_keyword(&self) -> bool {
870 matches!(
871 self,
872 Token::Set
873 | Token::Local
874 | Token::If
875 | Token::Then
876 | Token::Else
877 | Token::Elif
878 | Token::Fi
879 | Token::For
880 | Token::In
881 | Token::Do
882 | Token::Done
883 | Token::Case
884 | Token::Esac
885 | Token::Function
886 | Token::True
887 | Token::False
888 )
889 }
890
891 pub fn is_type(&self) -> bool {
893 matches!(
894 self,
895 Token::TypeString
896 | Token::TypeInt
897 | Token::TypeFloat
898 | Token::TypeBool
899 )
900 }
901
902 pub fn starts_statement(&self) -> bool {
904 matches!(
905 self,
906 Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
907 )
908 }
909
910 pub fn is_value(&self) -> bool {
912 matches!(
913 self,
914 Token::String(_)
915 | Token::SingleString(_)
916 | Token::HereDoc(_)
917 | Token::Arithmetic(_)
918 | Token::Int(_)
919 | Token::Float(_)
920 | Token::True
921 | Token::False
922 | Token::VarRef(_)
923 | Token::SimpleVarRef(_)
924 | Token::CmdSubstStart
925 | Token::Path(_)
926 | Token::LastExitCode
927 | Token::CurrentPid
928 )
929 }
930}
931
/// Output of `preprocess_arithmetic`.
struct ArithmeticPreprocessResult {
    /// Source text with each `$(( … ))` replaced by a marker identifier.
    text: String,
    /// (marker, expression-text) pairs, in order of appearance.
    arithmetics: Vec<(String, String)>,
    /// Data for mapping spans over `text` back to original offsets.
    replacements: Vec<SpanReplacement>,
}
941
/// Replaces each `$(( … ))` arithmetic substitution in `source` with a
/// unique marker identifier so the token rules (which have no arithmetic
/// pattern) can lex the remainder, recording the expression text and the
/// span-correction data needed by `correct_span`.
///
/// # Errors
/// `LexerError::NestingTooDeep` when parentheses inside an expression nest
/// deeper than `MAX_PAREN_DEPTH`.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into `source`; `i` indexes `chars_vec` (char positions).
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip the 3-byte ASCII opener `$((`.
            i += 3;
            source_pos += 3;

            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            // Collect until the matching `))`, tracking inner parens.
            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner paren, not the substitution.
                            // ')' is ASCII, so += 1 matches len_utf8().
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // The closing `))` — done with this expression.
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Lone ')' at depth 0: treat as expression text.
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Bytes consumed from the original `$(( … ))` construct.
            // NOTE: an unterminated `$((` silently consumes to end-of-input.
            let original_len = source_pos - original_start;

            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1046
/// Extracts heredoc bodies (`<< DELIM` … `DELIM`) from `source`, replacing
/// each with `<<` plus a unique marker identifier on a single line.
///
/// Returns the rewritten text plus `(marker, content, literal)` tuples,
/// where `literal` is true when the delimiter was quoted ('…' or "…").
///
/// NOTE(review): this rewriting shifts offsets but records no
/// `SpanReplacement`s, so spans over heredoc-bearing source are approximate
/// downstream — confirm that is acceptable.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        if ch == '<' && chars.peek() == Some(&'<') {
            // Consume the second '<'; `<<-` requests leading-tab stripping.
            chars.next(); let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip blanks between `<<` and the delimiter word.
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Read the (possibly quoted) delimiter word.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            // No delimiter: emit `<<` (and `-`) verbatim; the lexer will
            // report whatever follows.
            if delimiter.is_empty() {
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Preserve the remainder of the introducer line (e.g. `> file`).
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Accumulate body lines until a line equal to the delimiter
            // (after optional tab stripping for `<<-`).
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // CRLF endings are normalized to '\n' in the body.
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF: accept a trailing delimiter without newline,
                        // otherwise keep the partial line as content.
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Drop the body's trailing newline(s).
            let content = content.trim_end_matches('\n').to_string();

            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Re-emit as `<<MARKER` so it lexes as HereDocStart + Ident.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1211
/// Tokenizes `source` into spanned tokens, or collects every lexer error.
///
/// Pipeline:
/// 1. extract `$(( … ))` into `__KAISH_ARITH_*__` markers,
/// 2. extract heredoc bodies into `__KAISH_HEREDOC_*__` markers,
/// 3. run the logos lexer over the preprocessed text,
/// 4. swap marker idents back into `Arithmetic` / `HereDoc` tokens.
///
/// Comments and line continuations are dropped from the returned stream.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // A nesting-depth error has no precise location; span the whole input.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    // NOTE(review): only arithmetic replacements feed span correction;
    // heredoc rewriting also shifts offsets — confirm span accuracy there.
    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Map spans over the preprocessed text back toward source offsets.
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Second pass: resolve marker identifiers back into synthetic tokens.
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // Arithmetic marker ident -> Arithmetic token carrying the expr.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
            && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
            final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
            i += 1;
            continue;
        }

        // `<<` followed by a heredoc marker -> HereDocStart + HereDoc pair.
        if matches!(tokens[i].token, Token::HereDocStart) {
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                    final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                    i += 2;
                    continue;
                }
            }
        }

        // Arithmetic markers can also land inside double-quoted strings;
        // rewrite them to `${__ARITH:expr__}` placeholders for later
        // interpolation.
        let token = if let Token::String(ref s) = tokens[i].token {
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(final_tokens)
}
1310
1311pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1315 let lexer = Token::lexer(source);
1316 let mut tokens = Vec::new();
1317 let mut errors = Vec::new();
1318
1319 for (result, span) in lexer.spanned() {
1320 match result {
1321 Ok(token) => {
1322 tokens.push(Spanned::new(token, span));
1323 }
1324 Err(err) => {
1325 errors.push(Spanned::new(err, span));
1326 }
1327 }
1328 }
1329
1330 if errors.is_empty() {
1331 Ok(tokens)
1332 } else {
1333 Err(errors)
1334 }
1335}
1336
1337pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1339 if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1341 return Err(LexerError::UnterminatedString);
1342 }
1343
1344 let inner = &source[1..source.len() - 1];
1345 let mut result = String::with_capacity(inner.len());
1346 let mut chars = inner.chars().peekable();
1347
1348 while let Some(ch) = chars.next() {
1349 if ch == '\\' {
1350 match chars.next() {
1351 Some('n') => result.push('\n'),
1352 Some('t') => result.push('\t'),
1353 Some('r') => result.push('\r'),
1354 Some('\\') => result.push('\\'),
1355 Some('"') => result.push('"'),
1356 Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1359 Some('u') => {
1360 let mut hex = String::with_capacity(4);
1362 for _ in 0..4 {
1363 match chars.next() {
1364 Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1365 _ => return Err(LexerError::InvalidEscape),
1366 }
1367 }
1368 let codepoint = u32::from_str_radix(&hex, 16)
1369 .map_err(|_| LexerError::InvalidEscape)?;
1370 let ch = char::from_u32(codepoint)
1371 .ok_or(LexerError::InvalidEscape)?;
1372 result.push(ch);
1373 }
1374 Some(next) => {
1376 result.push('\\');
1377 result.push(next);
1378 }
1379 None => return Err(LexerError::InvalidEscape),
1380 }
1381 } else {
1382 result.push(ch);
1383 }
1384 }
1385
1386 Ok(result)
1387}
1388
1389pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1392 if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1394 return Err(LexerError::UnterminatedVarRef);
1395 }
1396
1397 let inner = &source[2..source.len() - 1];
1398
1399 if inner == "?" {
1401 return Ok(vec!["?".to_string()]);
1402 }
1403
1404 let mut segments = Vec::new();
1405 let mut current = String::new();
1406 let mut chars = inner.chars().peekable();
1407
1408 while let Some(ch) = chars.next() {
1409 match ch {
1410 '.' => {
1411 if !current.is_empty() {
1412 segments.push(current.clone());
1413 current.clear();
1414 }
1415 }
1416 '[' => {
1417 if !current.is_empty() {
1418 segments.push(current.clone());
1419 current.clear();
1420 }
1421 let mut index = String::from("[");
1423 while let Some(&c) = chars.peek() {
1424 if let Some(c) = chars.next() {
1425 index.push(c);
1426 }
1427 if c == ']' {
1428 break;
1429 }
1430 }
1431 segments.push(index);
1432 }
1433 _ => {
1434 current.push(ch);
1435 }
1436 }
1437 }
1438
1439 if !current.is_empty() {
1440 segments.push(current);
1441 }
1442
1443 Ok(segments)
1444}
1445
/// Parses an integer literal, mapping any failure (including i64 overflow)
/// to `InvalidNumber`.
pub fn parse_int(source: &str) -> Result<i64, LexerError> {
    source.parse().map_err(|_| LexerError::InvalidNumber)
}

/// Parses a float literal, mapping any failure to `InvalidNumber`.
pub fn parse_float(source: &str) -> Result<f64, LexerError> {
    source.parse().map_err(|_| LexerError::InvalidNumber)
}
1455
1456#[cfg(test)]
1457mod tests {
1458 use super::*;
1459
    /// Lexes `source` and strips spans, panicking on any lexer error.
    fn lex(source: &str) -> Vec<Token> {
        tokenize(source)
            .expect("lexer should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect()
    }

    // Each reserved word lexes to its dedicated keyword token.
    // NOTE(review): `while` is not covered here.
    #[test]
    fn keywords() {
        assert_eq!(lex("set"), vec![Token::Set]);
        assert_eq!(lex("if"), vec![Token::If]);
        assert_eq!(lex("then"), vec![Token::Then]);
        assert_eq!(lex("else"), vec![Token::Else]);
        assert_eq!(lex("elif"), vec![Token::Elif]);
        assert_eq!(lex("fi"), vec![Token::Fi]);
        assert_eq!(lex("for"), vec![Token::For]);
        assert_eq!(lex("in"), vec![Token::In]);
        assert_eq!(lex("do"), vec![Token::Do]);
        assert_eq!(lex("done"), vec![Token::Done]);
        assert_eq!(lex("case"), vec![Token::Case]);
        assert_eq!(lex("esac"), vec![Token::Esac]);
        assert_eq!(lex("function"), vec![Token::Function]);
        assert_eq!(lex("true"), vec![Token::True]);
        assert_eq!(lex("false"), vec![Token::False]);
    }

    // `;;` lexes as one token, alone and after other tokens.
    #[test]
    fn double_semicolon() {
        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
        assert_eq!(lex("echo \"hi\";;"), vec![
            Token::Ident("echo".to_string()),
            Token::String("hi".to_string()),
            Token::DoubleSemi,
        ]);
    }

    // The four type names lex as type keywords, not identifiers.
    #[test]
    fn type_keywords() {
        assert_eq!(lex("string"), vec![Token::TypeString]);
        assert_eq!(lex("int"), vec![Token::TypeInt]);
        assert_eq!(lex("float"), vec![Token::TypeFloat]);
        assert_eq!(lex("bool"), vec![Token::TypeBool]);
    }
1509
    // Single-character operators and punctuation.
    #[test]
    fn single_char_operators() {
        assert_eq!(lex("="), vec![Token::Eq]);
        assert_eq!(lex("|"), vec![Token::Pipe]);
        assert_eq!(lex("&"), vec![Token::Amp]);
        assert_eq!(lex(">"), vec![Token::Gt]);
        assert_eq!(lex("<"), vec![Token::Lt]);
        assert_eq!(lex(";"), vec![Token::Semi]);
        assert_eq!(lex(":"), vec![Token::Colon]);
        assert_eq!(lex(","), vec![Token::Comma]);
        assert_eq!(lex("."), vec![Token::Dot]);
    }

    // Multi-character operators must win over their single-char prefixes.
    #[test]
    fn multi_char_operators() {
        assert_eq!(lex("&&"), vec![Token::And]);
        assert_eq!(lex("||"), vec![Token::Or]);
        assert_eq!(lex("=="), vec![Token::EqEq]);
        assert_eq!(lex("!="), vec![Token::NotEq]);
        assert_eq!(lex("=~"), vec![Token::Match]);
        assert_eq!(lex("!~"), vec![Token::NotMatch]);
        assert_eq!(lex(">="), vec![Token::GtEq]);
        assert_eq!(lex("<="), vec![Token::LtEq]);
        assert_eq!(lex(">>"), vec![Token::GtGt]);
        assert_eq!(lex("2>"), vec![Token::Stderr]);
        assert_eq!(lex("&>"), vec![Token::Both]);
    }

    // All bracket/brace/paren tokens.
    #[test]
    fn brackets() {
        assert_eq!(lex("{"), vec![Token::LBrace]);
        assert_eq!(lex("}"), vec![Token::RBrace]);
        assert_eq!(lex("["), vec![Token::LBracket]);
        assert_eq!(lex("]"), vec![Token::RBracket]);
        assert_eq!(lex("("), vec![Token::LParen]);
        assert_eq!(lex(")"), vec![Token::RParen]);
    }
1551
    // Integer literals, including the negative form matched by the regex.
    #[test]
    fn integers() {
        assert_eq!(lex("0"), vec![Token::Int(0)]);
        assert_eq!(lex("42"), vec![Token::Int(42)]);
        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
    }

    // Float literals (digits required on both sides of the dot).
    #[test]
    fn floats() {
        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
    }
1570
1571 #[test]
1572 fn strings() {
1573 assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1574 assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1575 assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1577 assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1578 }
1579
    // Braced variable references are kept verbatim (including `${}`); splitting
    // into path segments happens later in parse_var_ref.
    #[test]
    fn var_refs() {
        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
    }

    // Identifiers may contain underscores, hyphens, and trailing digits.
    #[test]
    fn identifiers() {
        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
    }

    // Words that merely start with a keyword ("set" in "setup", "if" in "iffy")
    // must lex as identifiers, not as the keyword plus a remainder.
    #[test]
    fn keyword_prefix_identifiers() {
        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
        assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
    }
1611
    // `set X = 5`: keyword, identifier, `=`, literal.
    #[test]
    fn assignment() {
        assert_eq!(
            lex("set X = 5"),
            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    // A bare command word and a command with a string argument.
    #[test]
    fn command_simple() {
        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
        assert_eq!(
            lex(r#"echo "hello""#),
            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
        );
    }

    #[test]
    fn command_with_args() {
        assert_eq!(
            lex("cmd arg1 arg2"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
        );
    }

    // `key=value` lexes as three tokens; pairing them back up is the parser's job.
    #[test]
    fn command_with_named_args() {
        assert_eq!(
            lex("cmd key=value"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
        );
    }

    #[test]
    fn pipeline() {
        assert_eq!(
            lex("a | b | c"),
            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
        );
    }

    // Shell-style `if …; then …; fi` — all keywords get dedicated tokens.
    #[test]
    fn if_statement() {
        assert_eq!(
            lex("if true; then echo; fi"),
            vec![
                Token::If,
                Token::True,
                Token::Semi,
                Token::Then,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Fi
            ]
        );
    }

    // Shell-style `for X in …; do …; done`.
    #[test]
    fn for_loop() {
        assert_eq!(
            lex("for X in items; do echo; done"),
            vec![
                Token::For,
                Token::Ident("X".to_string()),
                Token::In,
                Token::Ident("items".to_string()),
                Token::Semi,
                Token::Do,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Done
            ]
        );
    }
1690
    // Horizontal padding around a statement does not change the token stream.
    #[test]
    fn whitespace_ignored() {
        assert_eq!(lex("  set X = 5  "), lex("set X = 5"));
    }

    // Unlike spaces/tabs, newlines are significant and emitted as tokens.
    #[test]
    fn newlines_preserved() {
        let tokens = lex("a\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // Consecutive newlines are NOT collapsed — one token per newline.
    #[test]
    fn multiple_newlines() {
        let tokens = lex("a\n\n\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // `lex` drops comments, but the newline terminating a comment survives.
    #[test]
    fn comments_skipped() {
        assert_eq!(lex("# comment"), vec![]);
        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
        assert_eq!(
            lex("a # comment\nb"),
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // tokenize_with_comments keeps comments as Token::Comment (used by tooling
    // such as highlighting, where comments must not disappear).
    #[test]
    fn comments_preserved_when_requested() {
        let tokens = tokenize_with_comments("a # comment")
            .expect("should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect::<Vec<_>>();
        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
    }
1741
    // parse_string_literal: strips the surrounding quotes and resolves escapes.
    #[test]
    fn parse_simple_string() {
        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
    }

    #[test]
    fn parse_string_with_escapes() {
        assert_eq!(
            parse_string_literal(r#""hello\nworld""#).expect("ok"),
            "hello\nworld"
        );
        assert_eq!(
            parse_string_literal(r#""tab\there""#).expect("ok"),
            "tab\there"
        );
        assert_eq!(
            parse_string_literal(r#""quote\"here""#).expect("ok"),
            "quote\"here"
        );
    }

    // \uXXXX escapes decode to the corresponding Unicode scalar value.
    #[test]
    fn parse_string_with_unicode() {
        assert_eq!(
            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
            "emoji ❤"
        );
    }

    // `\$` is rewritten to an internal sentinel so a later expansion pass can
    // tell an escaped dollar apart from a real variable reference.
    #[test]
    fn parse_string_with_escaped_dollar() {
        assert_eq!(
            parse_string_literal(r#""\$VAR""#).expect("ok"),
            "__KAISH_ESCAPED_DOLLAR__VAR"
        );
        assert_eq!(
            parse_string_literal(r#""cost: \$100""#).expect("ok"),
            "cost: __KAISH_ESCAPED_DOLLAR__100"
        );
    }

    // parse_var_ref: splits `${...}` into access-path segments.
    #[test]
    fn parse_simple_var() {
        assert_eq!(
            parse_var_ref("${X}").expect("ok"),
            vec!["X"]
        );
    }

    #[test]
    fn parse_var_with_field() {
        assert_eq!(
            parse_var_ref("${VAR.field}").expect("ok"),
            vec!["VAR", "field"]
        );
    }

    // Index accesses keep their brackets as a distinct `[0]` segment.
    #[test]
    fn parse_var_with_index() {
        assert_eq!(
            parse_var_ref("${VAR[0]}").expect("ok"),
            vec!["VAR", "[0]"]
        );
    }

    #[test]
    fn parse_var_nested() {
        assert_eq!(
            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
            vec!["VAR", "field", "[0]", "nested"]
        );
    }

    // `?` refers to the last command result and can take field accesses.
    #[test]
    fn parse_last_result() {
        assert_eq!(
            parse_var_ref("${?}").expect("ok"),
            vec!["?"]
        );
        assert_eq!(
            parse_var_ref("${?.ok}").expect("ok"),
            vec!["?", "ok"]
        );
    }
1836
    // parse_int / parse_float: numeric literal text to values.
    #[test]
    fn parse_integers() {
        assert_eq!(parse_int("0").expect("ok"), 0);
        assert_eq!(parse_int("42").expect("ok"), 42);
        assert_eq!(parse_int("-1").expect("ok"), -1);
    }

    // Floats compared with an epsilon tolerance rather than exact equality.
    #[test]
    fn parse_floats() {
        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
    }

    // Degenerate inputs: no tokens at all.
    #[test]
    fn empty_input() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    fn only_whitespace() {
        assert_eq!(lex("  \t\t  "), vec![]);
    }
1867
    // JSON-looking input lexes into ordinary bracket/comma/literal tokens —
    // there is no special JSON mode in the lexer.
    #[test]
    fn json_array() {
        assert_eq!(
            lex(r#"[1, 2, 3]"#),
            vec![
                Token::LBracket,
                Token::Int(1),
                Token::Comma,
                Token::Int(2),
                Token::Comma,
                Token::Int(3),
                Token::RBracket
            ]
        );
    }

    #[test]
    fn json_object() {
        assert_eq!(
            lex(r#"{"key": "value"}"#),
            vec![
                Token::LBrace,
                Token::String("key".to_string()),
                Token::Colon,
                Token::String("value".to_string()),
                Token::RBrace
            ]
        );
    }
1897
    // The four redirect forms: `>`, `>>` (append), `2>` (stderr), `&>` (both).
    #[test]
    fn redirect_operators() {
        assert_eq!(
            lex("cmd > file"),
            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd >> file"),
            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd 2> err"),
            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
        );
        assert_eq!(
            lex("cmd &> all"),
            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
        );
    }

    // A trailing lone `&` (background job) lexes as Amp, not as part of `&&`/`&>`.
    #[test]
    fn background_job() {
        assert_eq!(
            lex("cmd &"),
            vec![Token::Ident("cmd".to_string()), Token::Amp]
        );
    }

    // `$(...)` opens with a dedicated CmdSubstStart token and closes with a
    // plain RParen; the interior lexes normally.
    #[test]
    fn command_substitution() {
        assert_eq!(
            lex("$(cmd)"),
            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
        );
        assert_eq!(
            lex("$(cmd arg)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("cmd".to_string()),
                Token::Ident("arg".to_string()),
                Token::RParen
            ]
        );
        assert_eq!(
            lex("$(a | b)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("a".to_string()),
                Token::Pipe,
                Token::Ident("b".to_string()),
                Token::RParen
            ]
        );
    }

    // A realistic pipeline mixing words, named args, strings, and ints.
    #[test]
    fn complex_pipeline() {
        assert_eq!(
            lex(r#"cat file | grep pattern="foo" | head count=10"#),
            vec![
                Token::Ident("cat".to_string()),
                Token::Ident("file".to_string()),
                Token::Pipe,
                Token::Ident("grep".to_string()),
                Token::Ident("pattern".to_string()),
                Token::Eq,
                Token::String("foo".to_string()),
                Token::Pipe,
                Token::Ident("head".to_string()),
                Token::Ident("count".to_string()),
                Token::Eq,
                Token::Int(10),
            ]
        );
    }
1973
    // `-x` style flags; the token payload excludes the leading dash.
    #[test]
    fn short_flag() {
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
    }

    // Combined short flags (`-la`) stay as ONE token; splitting is left to
    // whoever interprets the flag.
    #[test]
    fn short_flag_combined() {
        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
    }

    // `--name` flags; payload excludes the dashes, hyphens inside are kept.
    #[test]
    fn long_flag() {
        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
    }

    // A bare `--` is the end-of-flags marker, not an empty long flag.
    #[test]
    fn double_dash() {
        assert_eq!(lex("--"), vec![Token::DoubleDash]);
    }

    // `-` followed by digits is a negative number; followed by letters, a flag.
    #[test]
    fn flags_vs_negative_numbers() {
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
    }
2018
    // Flags in command position: short and long flags after command words.
    #[test]
    fn command_with_flags() {
        assert_eq!(
            lex("ls -l"),
            vec![
                Token::Ident("ls".to_string()),
                Token::ShortFlag("l".to_string()),
            ]
        );
        assert_eq!(
            lex("git commit -m"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
            ]
        );
        assert_eq!(
            lex("git push --force"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("push".to_string()),
                Token::LongFlag("force".to_string()),
            ]
        );
    }

    // Flag values: space-separated (`-m "msg"`) and `--flag="value"` forms.
    #[test]
    fn flag_with_value() {
        assert_eq!(
            lex(r#"git commit -m "message""#),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
                Token::String("message".to_string()),
            ]
        );
        assert_eq!(
            lex(r#"--message="hello""#),
            vec![
                Token::LongFlag("message".to_string()),
                Token::Eq,
                Token::String("hello".to_string()),
            ]
        );
    }

    // `--` mid-command still lexes as the end-of-flags marker.
    #[test]
    fn end_of_flags_marker() {
        assert_eq!(
            lex("git checkout -- file"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("checkout".to_string()),
                Token::DoubleDash,
                Token::Ident("file".to_string()),
            ]
        );
    }
2079
    // `local` is a keyword both alone and in a declaration statement.
    #[test]
    fn local_keyword() {
        assert_eq!(lex("local"), vec![Token::Local]);
        assert_eq!(
            lex("local X = 5"),
            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    // Unbraced `$name` references; payload is the name without the `$`.
    #[test]
    fn simple_var_ref() {
        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
    }

    #[test]
    fn simple_var_ref_in_command() {
        assert_eq!(
            lex("echo $NAME"),
            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
        );
    }
2108
    // Single-quoted strings are fully literal: no variable expansion and no
    // escape processing inside.
    #[test]
    fn single_quoted_strings() {
        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
    }

    // `[[` / `]]` lex as two bracket tokens each — there is no dedicated
    // double-bracket token; test syntax is reassembled by the parser.
    #[test]
    fn test_brackets() {
        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
        assert_eq!(
            lex("[[ -f file ]]"),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::ShortFlag("f".to_string()),
                Token::Ident("file".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }

    // Comparison inside `[[ ]]` lexes as ordinary var/operator/string tokens.
    #[test]
    fn test_expression_syntax() {
        assert_eq!(
            lex(r#"[[ $X == "value" ]]"#),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::SimpleVarRef("X".to_string()),
                Token::EqEq,
                Token::String("value".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }
2152
    // Bash-style `NAME="value"` (no `set` keyword) — lexes as ident/eq/string.
    #[test]
    fn bash_style_assignment() {
        assert_eq!(
            lex(r#"NAME="value""#),
            vec![
                Token::Ident("NAME".to_string()),
                Token::Eq,
                Token::String("value".to_string())
            ]
        );
    }

    // Positional parameters `$0`..`$9`, plus `$@` (all args) and `$#` (count).
    #[test]
    fn positional_params() {
        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
        assert_eq!(lex("$@"), vec![Token::AllArgs]);
        assert_eq!(lex("$#"), vec![Token::ArgCount]);
    }

    #[test]
    fn positional_in_context() {
        assert_eq!(
            lex("echo $1 $2"),
            vec![
                Token::Ident("echo".to_string()),
                Token::Positional(1),
                Token::Positional(2),
            ]
        );
    }

    // `${#name}` (string/array length); payload is the name alone.
    #[test]
    fn var_length() {
        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
    }

    #[test]
    fn var_length_in_context() {
        assert_eq!(
            lex("echo ${#NAME}"),
            vec![
                Token::Ident("echo".to_string()),
                Token::VarLength("NAME".to_string()),
            ]
        );
    }
2204
    // `+x` style flags (e.g. `set +e`); payload excludes the leading plus.
    #[test]
    fn plus_flag() {
        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
    }

    #[test]
    fn set_with_plus_flag() {
        assert_eq!(
            lex("set +e"),
            vec![
                Token::Set,
                Token::PlusFlag("e".to_string()),
            ]
        );
    }

    #[test]
    fn set_with_multiple_flags() {
        assert_eq!(
            lex("set -e -u"),
            vec![
                Token::Set,
                Token::ShortFlag("e".to_string()),
                Token::ShortFlag("u".to_string()),
            ]
        );
    }

    // NOTE(review): duplicates the assertions in `flags_vs_negative_numbers`
    // above — consider consolidating the two tests.
    #[test]
    fn flags_vs_negative_numbers_edge_cases() {
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
    }
2252
    // A lone `-` (e.g. stdin placeholder) is its own token, not a flag.
    #[test]
    fn single_dash_is_minus_alone() {
        let result = tokenize("-").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::MinusAlone));
    }

    // `+%…` date-format strings (as in `date +%s`) keep the full text,
    // including the `+`, in a single PlusBare token.
    #[test]
    fn plus_bare_for_date_format() {
        let result = tokenize("+%s").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));

        let result = tokenize("+%Y-%m-%d").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
    }

    // PlusBare must not swallow plain `+letters`, which stay PlusFlag.
    #[test]
    fn plus_flag_still_works() {
        let result = tokenize("+e").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
    }
2281
    #[test]
    fn while_keyword_vs_while_loop() {
        assert_eq!(lex("while"), vec![Token::While]);
        assert_eq!(
            lex("while true"),
            vec![Token::While, Token::True]
        );
    }

    // break/continue/return/exit are keywords, not identifiers.
    #[test]
    fn control_flow_keywords() {
        assert_eq!(lex("break"), vec![Token::Break]);
        assert_eq!(lex("continue"), vec![Token::Continue]);
        assert_eq!(lex("return"), vec![Token::Return]);
        assert_eq!(lex("exit"), vec![Token::Exit]);
    }

    // Numeric operands after control-flow keywords lex as ordinary ints.
    #[test]
    fn control_flow_with_numbers() {
        assert_eq!(
            lex("break 2"),
            vec![Token::Break, Token::Int(2)]
        );
        assert_eq!(
            lex("continue 3"),
            vec![Token::Continue, Token::Int(3)]
        );
        assert_eq!(
            lex("exit 1"),
            vec![Token::Exit, Token::Int(1)]
        );
    }
2316
    // Heredocs: `<<EOF` emits HereDocStart, then the body (delimiter lines
    // excluded, no trailing newline) as one HereDoc token, then a Newline.
    #[test]
    fn heredoc_simple() {
        let source = "cat <<EOF\nhello\nworld\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // Delimiter immediately after the start line yields empty content.
    #[test]
    fn heredoc_empty() {
        let source = "cat <<EOF\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // Heredoc bodies are raw: `$`, quotes, etc. are not interpreted here.
    #[test]
    fn heredoc_with_special_chars() {
        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // Any delimiter word works, not just "EOF".
    #[test]
    fn heredoc_multiline() {
        let source = "cat <<END\nline1\nline2\nline3\nEND";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // Lexing resumes normally on the line after the closing delimiter.
    #[test]
    fn heredoc_in_command() {
        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
            Token::Newline,
        ]);
    }

    // NOTE(review): despite the name, the expected content here still carries
    // the leading tabs — the lexer apparently only uses `<<-` to match an
    // indented delimiter, deferring (or skipping) tab-stripping of body lines.
    // Confirm against the heredoc implementation whether this is intentional.
    #[test]
    fn heredoc_strip_tabs() {
        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("cat".to_string()),
            Token::HereDocStart,
            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
            Token::Newline,
        ]);
    }
2395
    // `$((expr))` captures the inner expression text verbatim (outer parens
    // stripped) in a single Arithmetic token.
    #[test]
    fn arithmetic_simple() {
        let source = "$((1 + 2))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
    }

    #[test]
    fn arithmetic_in_assignment() {
        let source = "X=$((5 * 3))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::Ident("X".to_string()),
            Token::Eq,
            Token::Arithmetic("5 * 3".to_string()),
        ]);
    }

    // Inner parens are balanced-matched, not terminated at the first `))`.
    #[test]
    fn arithmetic_with_nested_parens() {
        let source = "$((2 * (3 + 4)))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
    }

    #[test]
    fn arithmetic_with_variable() {
        let source = "$((X + 1))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
    }

    // A single-paren `$( … )` must still be command substitution, not a
    // misparsed arithmetic expression.
    #[test]
    fn arithmetic_command_subst_not_confused() {
        let source = "$(echo hello)";
        let tokens = lex(source);
        assert_eq!(tokens, vec![
            Token::CmdSubstStart,
            Token::Ident("echo".to_string()),
            Token::Ident("hello".to_string()),
            Token::RParen,
        ]);
    }

    // Exceeding the paren-nesting cap (MAX_PAREN_DEPTH = 256; 300 here) must
    // produce a single NestingTooDeep error rather than recursing/overflowing.
    #[test]
    fn arithmetic_nesting_limit() {
        let open_parens = "(".repeat(300);
        let close_parens = ")".repeat(300);
        let source = format!("$(({}1{}))", open_parens, close_parens);
        let result = tokenize(&source);
        assert!(result.is_err());
        let errors = result.unwrap_err();
        assert_eq!(errors.len(), 1);
        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
    }

    // Modest nesting well under the cap still lexes fine.
    #[test]
    fn arithmetic_nesting_within_limit() {
        let source = "$((((1 + 2) * 3)))";
        let tokens = lex(source);
        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
    }
2465
    // One representative assertion per Token variant family, pinning the
    // Token::category() mapping used for classification/highlighting.
    #[test]
    fn token_categories() {
        assert_eq!(Token::If.category(), TokenCategory::Keyword);
        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
        assert_eq!(Token::For.category(), TokenCategory::Keyword);
        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
        assert_eq!(Token::True.category(), TokenCategory::Keyword);
        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);

        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
        assert_eq!(Token::And.category(), TokenCategory::Operator);
        assert_eq!(Token::Or.category(), TokenCategory::Operator);
        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);

        // All three string-bearing variants share the String category.
        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);

        // Arithmetic expressions count as Number, like int/float literals.
        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);

        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);

        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);

        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);

        assert_eq!(Token::Comment.category(), TokenCategory::Comment);

        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);

        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);

        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
    }
2532
    // Regression: `cat <<EOF | jq` — the heredoc token must be emitted before
    // the pipe so the pipeline structure survives the heredoc rewrite.
    #[test]
    fn test_heredoc_piped_to_command() {
        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
        assert!(heredoc_pos.is_some(), "should have a heredoc token");
        assert!(pipe_pos.is_some(), "should have a pipe token");
        assert!(
            pipe_pos.unwrap() > heredoc_pos.unwrap(),
            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
        );
    }

    // The pipe-handling change must not break a plain heredoc (no pipe token).
    #[test]
    fn test_heredoc_standalone_still_works() {
        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
    }

    // A blank first body line must survive as a leading '\n' in the content.
    #[test]
    fn test_heredoc_preserves_leading_empty_lines() {
        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
        assert_eq!(data.content, "\nhello");
    }

    // <<'EOF' (quoted delimiter) marks the body literal: no later expansion.
    #[test]
    fn test_heredoc_quoted_delimiter_sets_literal() {
        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(data.literal, "quoted delimiter should set literal=true");
        assert_eq!(data.content, "hello $HOME");
    }

    // Unquoted delimiter leaves literal=false (body eligible for expansion).
    #[test]
    fn test_heredoc_unquoted_delimiter_not_literal() {
        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
        let heredoc = tokens.iter().find_map(|t| {
            if let Token::HereDoc(data) = &t.token {
                Some(data.clone())
            } else {
                None
            }
        });
        assert!(heredoc.is_some(), "should have a heredoc token");
        let data = heredoc.unwrap();
        assert!(!data.literal, "unquoted delimiter should have literal=false");
    }
2606}