Skip to main content

kaish_kernel/
lexer.rs

1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
7//! # Token Categories
8//!
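//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`,
//!   `do`, `done`, `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`
//! - **Type keywords**: `string`, `int`, `float`, `bool`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `==`, `!=`, `=~`, `!~`, `<`, `<=`, `>`, `>=`, `|`, `&`, `&&`, `||`,
//!   `>>`, `<<`, `2>`, `&>`, `2>&1`, `1>&2`, `>&2`
//! - **Punctuation**: `;`, `;;`, `:`, `,`, `.`, `..`, `{`, `}`, `[`, `]`, `(`, `)`
//! - **Variable references**: `${...}` with nested path access, `$NAME`, `${#NAME}`,
//!   `$0`-`$9`, `$@`, `$#`, `$?`, `$$`
//! - **Flags and paths**: `--long`, `-s`, `+x`, `/abs/path`, `~/path`, `./path`, `../path`
//! - **Identifiers**: command names, variable names, parameter names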
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
///
/// `unique_marker_id` bumps it by one on every call, so two IDs produced by
/// the same process differ in their counter component.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like `$((((((...`.
/// The value is also reported in the `LexerError::NestingTooDeep` message.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// `correct_span` consumes a slice of these records to do that adjustment.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    /// `original_len - marker_len` is the shift applied to positions after the marker.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    let pid = std::process::id();
82    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
85/// A token with its span in the source text.
86#[derive(Debug, Clone, PartialEq)]
87pub struct Spanned<T> {
88    pub token: T,
89    pub span: Span,
90}
91
92impl<T> Spanned<T> {
93    pub fn new(token: T, span: Span) -> Self {
94        Self { token, span }
95    }
96}
97
/// Lexer error types.
///
/// Used as the Logos error type for `Token` (`#[logos(error = LexerError)]`).
/// The user-facing wording of each variant lives in the `Display` impl.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// The `Default` value.
    /// NOTE(review): presumably what Logos reports for input no pattern matches — confirm.
    #[default]
    UnexpectedCharacter,
    /// A string literal was not closed.
    /// NOTE(review): not raised by the callbacks visible here; presumably comes from string parsing elsewhere.
    UnterminatedString,
    /// A `${...}` reference was not closed.
    /// NOTE(review): not raised by the callbacks visible here.
    UnterminatedVarRef,
    /// A string contained an unsupported escape sequence.
    /// NOTE(review): not raised by the callbacks visible here.
    InvalidEscape,
    /// An integer or float literal matched its pattern but could not be parsed
    /// (raised by `lex_int` / `lex_float`).
    InvalidNumber,
    /// An identifier that equals `true`/`false` ignoring case but is not
    /// all-lowercase (e.g. `TRUE`); carries the offending text.
    AmbiguousBoolean(String),
    /// `yes`/`no` in any casing; carries the offending text.
    AmbiguousBooleanLike(String),
    /// Digits immediately followed by identifier characters (e.g. `123abc`);
    /// carries the offending text.
    InvalidNumberIdent(String),
    /// A float written without a leading digit (e.g. `.5`).
    InvalidFloatNoLeading,
    /// A float written without a trailing digit (e.g. `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118        match self {
119            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120            LexerError::UnterminatedString => write!(f, "unterminated string"),
121            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123            LexerError::InvalidNumber => write!(f, "invalid number"),
124            LexerError::AmbiguousBoolean(s) => {
125                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126            }
127            LexerError::AmbiguousBooleanLike(s) => {
128                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130            }
131            LexerError::InvalidNumberIdent(s) => {
132                write!(f, "identifier cannot start with digit: {}", s)
133            }
134            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137        }
138    }
139}
140
/// Here-doc content data.
/// `literal` is true when the delimiter was quoted (<<'EOF' or <<"EOF"),
/// meaning no variable expansion should occur.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// The here-doc body text.
    pub content: String,
    /// Whether the delimiter was quoted (see the type-level note).
    pub literal: bool,
}

/// Tokens produced by the kaish lexer.
///
/// Variants are grouped so that more specific patterns (like keywords) are
/// listed before more general ones (like identifiers).
/// NOTE(review): Logos documents overlap resolution as longest match first,
/// then pattern priority (explicit `priority = N` or computed from the
/// pattern), not declaration order — confirm the ordering here is for
/// readability only.
///
/// Tokens that carry semantic values (strings, numbers, identifiers) include
/// the parsed value directly. This ensures the parser has access to actual
/// data, not just token types.
///
/// Spaces and tabs between tokens are skipped; newlines are significant and
/// produce `Newline` tokens.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ═══════════════════════════════════════════════════════════════════
    // Keywords (listed before Ident; see the priority note on the enum)
    // ═══════════════════════════════════════════════════════════════════
    #[token("set")]
    Set,

    #[token("local")]
    Local,

    #[token("if")]
    If,

    #[token("then")]
    Then,

    #[token("else")]
    Else,

    #[token("elif")]
    Elif,

    #[token("fi")]
    Fi,

    #[token("for")]
    For,

    #[token("while")]
    While,

    #[token("in")]
    In,

    #[token("do")]
    Do,

    #[token("done")]
    Done,

    #[token("case")]
    Case,

    #[token("esac")]
    Esac,

    #[token("function")]
    Function,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("exit")]
    Exit,

    /// Boolean literal `true` (lowercase only; other casings are rejected by `lex_ident`)
    #[token("true")]
    True,

    /// Boolean literal `false` (lowercase only; other casings are rejected by `lex_ident`)
    #[token("false")]
    False,

    // ═══════════════════════════════════════════════════════════════════
    // Type keywords (for tool parameters)
    // ═══════════════════════════════════════════════════════════════════
    #[token("string")]
    TypeString,

    #[token("int")]
    TypeInt,

    #[token("float")]
    TypeFloat,

    #[token("bool")]
    TypeBool,

    // ═══════════════════════════════════════════════════════════════════
    // Multi-character operators (listed before their single-char versions)
    // ═══════════════════════════════════════════════════════════════════
    #[token("&&")]
    And,

    #[token("||")]
    Or,

    #[token("==")]
    EqEq,

    #[token("!=")]
    NotEq,

    /// Match operator `=~`
    #[token("=~")]
    Match,

    /// Negated match operator `!~`
    #[token("!~")]
    NotMatch,

    #[token(">=")]
    GtEq,

    #[token("<=")]
    LtEq,

    #[token(">>")]
    GtGt,

    /// Redirection `2>&1`
    #[token("2>&1")]
    StderrToStdout,

    /// Redirection `1>&2`
    #[token("1>&2")]
    StdoutToStderr,

    /// Redirection `>&2` (short spelling without the leading `1`)
    #[token(">&2")]
    StdoutToStderr2,

    /// Redirection `2>`
    #[token("2>")]
    Stderr,

    /// Redirection `&>`
    #[token("&>")]
    Both,

    /// Here-doc introducer `<<`
    #[token("<<")]
    HereDocStart,

    #[token(";;")]
    DoubleSemi,

    // ═══════════════════════════════════════════════════════════════════
    // Single-character operators and punctuation
    // ═══════════════════════════════════════════════════════════════════
    #[token("=")]
    Eq,

    #[token("|")]
    Pipe,

    #[token("&")]
    Amp,

    #[token(">")]
    Gt,

    #[token("<")]
    Lt,

    #[token(";")]
    Semi,

    #[token(":")]
    Colon,

    #[token(",")]
    Comma,

    #[token("..")]
    DotDot,

    #[token(".")]
    Dot,

    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// Bare tilde: `~` alone (expands to $HOME)
    #[token("~")]
    Tilde,

    /// Relative path starting with `../`: `../foo/bar` - value is the full string
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// Dot-slash path: `./foo`, `./script.sh` - value is the full string
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("*")]
    Star,

    #[token("!")]
    Bang,

    #[token("?")]
    Question,

    // ═══════════════════════════════════════════════════════════════════
    // Arithmetic and command substitution
    // ═══════════════════════════════════════════════════════════════════

    /// Arithmetic expression content: synthesized by preprocessing.
    /// Contains the expression string between `$((` and `))`.
    /// Has no Logos pattern, so the generated lexer never emits it directly.
    Arithmetic(String),

    /// Command substitution start: `$(` - begins a command substitution
    #[token("$(")]
    CmdSubstStart,

    // ═══════════════════════════════════════════════════════════════════
    // Flags (must come before Int to win over negative numbers)
    // ═══════════════════════════════════════════════════════════════════

    /// Long flag: `--name` or `--foo-bar` - value is the name without the leading `--`
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// Short flag: `-l` or `-la` (combined short flags) - value omits the leading `-`
    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// Plus flag: `+e` or `+x` (for set +e to disable options) - value omits the leading `+`
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// Double dash: `--` alone marks end of flags
    #[token("--")]
    DoubleDash,

    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
    /// For date format strings and similar. Lower priority than PlusFlag.
    /// Value keeps the leading `+`.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
    /// Excludes - after first - to avoid matching --name patterns.
    /// Value keeps the leading `-`.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
    /// The pattern itself is a plain `-`; it is what remains when no longer
    /// `-`-prefixed pattern matches (presumably `-` followed by whitespace or end).
    /// This is handled specially in the parser as a positional arg.
    #[token("-")]
    MinusAlone,

    // ═══════════════════════════════════════════════════════════════════
    // Literals (with values)
    // ═══════════════════════════════════════════════════════════════════

    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string: `'...'` - literal content, no escape processing
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the full
    /// matched text, `${` and `}` included (path segments are parsed later)
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// Simple variable reference: `$NAME` - just the identifier
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// Positional parameter: `$0` through `$9`
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    /// All positional parameters: `$@`
    #[token("$@")]
    AllArgs,

    /// Number of positional parameters: `$#`
    #[token("$#")]
    ArgCount,

    /// Last exit code: `$?`
    #[token("$?")]
    LastExitCode,

    /// Current shell PID: `$$`
    #[token("$$")]
    CurrentPid,

    /// Variable string length: `${#VAR}` - value is the variable name only
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Here-doc content: synthesized by preprocessing, not directly lexed.
    /// Contains the full content of the here-doc (without the delimiter lines).
    HereDoc(HereDocData),

    /// Integer literal - value is the parsed i64
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Float literal - value is the parsed f64
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    // ═══════════════════════════════════════════════════════════════════
    // Invalid patterns (caught before valid tokens for better errors)
    // Their callbacks always return `Err`, so these variants are never
    // produced as tokens.
    // ═══════════════════════════════════════════════════════════════════

    /// Invalid: number followed by identifier characters (like 123abc)
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
    InvalidNumberIdent,

    /// Invalid: float without leading digit (like .5)
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// Invalid: float without trailing digit (like 5.)
    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ═══════════════════════════════════════════════════════════════════
    // Paths (absolute paths starting with /)
    // ═══════════════════════════════════════════════════════════════════

    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
    /// The pattern also accepts a lone `/`.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    // ═══════════════════════════════════════════════════════════════════
    // Identifiers (command names, variable names, etc.)
    // ═══════════════════════════════════════════════════════════════════

    /// Identifier - value is the identifier string
    /// Allows dots for filenames like `script.kai`
    /// `lex_ident` rejects mixed-case `true`/`false` and any casing of `yes`/`no`.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ═══════════════════════════════════════════════════════════════════
    // Structural tokens
    // ═══════════════════════════════════════════════════════════════════

    /// Comment: `# ...` to end of line
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// Newline (significant in kaish - ends statements)
    #[regex(r"\n|\r\n")]
    Newline,

    /// Line continuation: backslash at end of line
    /// (optional spaces/tabs are allowed between the backslash and the newline)
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,
}
525
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// The token-to-category mapping is defined by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.,
    /// plus true/false and the type keywords (string, int, float, bool)
    Keyword,
    /// Operators and redirections: |, &, &&, ||, =, ==, !=, =~, !~, <, >, <=, >=,
    /// >>, <<, 2>, &>, 2>&1, 1>&2, >&2
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, ${#bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; ;; : , . ( ) { } [ ] ! ? * and `$(`,
    /// plus newlines and line continuations
    Punctuation,
    /// Identifiers (wherever they appear) and bare words: +%s, -%, lone -
    Command,
    /// Paths: /foo/bar, ~/foo, ./foo, ../foo, and bare ~ and ..
    Path,
    /// Flags: --long, -s, +x, and the -- end-of-flags marker
    Flag,
    /// Invalid tokens
    Error,
}
555
556impl Token {
557    /// Returns the semantic category for syntax highlighting.
558    pub fn category(&self) -> TokenCategory {
559        match self {
560            // Keywords
561            Token::If
562            | Token::Then
563            | Token::Else
564            | Token::Elif
565            | Token::Fi
566            | Token::For
567            | Token::In
568            | Token::Do
569            | Token::Done
570            | Token::While
571            | Token::Case
572            | Token::Esac
573            | Token::Function
574            | Token::Return
575            | Token::Break
576            | Token::Continue
577            | Token::Exit
578            | Token::Set
579            | Token::Local
580            | Token::True
581            | Token::False
582            | Token::TypeString
583            | Token::TypeInt
584            | Token::TypeFloat
585            | Token::TypeBool => TokenCategory::Keyword,
586
587            // Operators and redirections
588            Token::Pipe
589            | Token::And
590            | Token::Or
591            | Token::Amp
592            | Token::Eq
593            | Token::EqEq
594            | Token::NotEq
595            | Token::Match
596            | Token::NotMatch
597            | Token::Lt
598            | Token::Gt
599            | Token::LtEq
600            | Token::GtEq
601            | Token::GtGt
602            | Token::Stderr
603            | Token::Both
604            | Token::HereDocStart
605            | Token::StderrToStdout
606            | Token::StdoutToStderr
607            | Token::StdoutToStderr2 => TokenCategory::Operator,
608
609            // Strings
610            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
611
612            // Numbers
613            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
614
615            // Variables
616            Token::VarRef(_)
617            | Token::SimpleVarRef(_)
618            | Token::Positional(_)
619            | Token::AllArgs
620            | Token::ArgCount
621            | Token::VarLength(_)
622            | Token::LastExitCode
623            | Token::CurrentPid => TokenCategory::Variable,
624
625            // Flags
626            Token::LongFlag(_)
627            | Token::ShortFlag(_)
628            | Token::PlusFlag(_)
629            | Token::DoubleDash => TokenCategory::Flag,
630
631            // Punctuation
632            Token::Semi
633            | Token::DoubleSemi
634            | Token::Colon
635            | Token::Comma
636            | Token::Dot
637            | Token::LParen
638            | Token::RParen
639            | Token::LBrace
640            | Token::RBrace
641            | Token::LBracket
642            | Token::RBracket
643            | Token::Bang
644            | Token::Question
645            | Token::Star
646            | Token::Newline
647            | Token::LineContinuation
648            | Token::CmdSubstStart => TokenCategory::Punctuation,
649
650            // Comments
651            Token::Comment => TokenCategory::Comment,
652
653            // Paths
654            Token::Path(_)
655            | Token::TildePath(_)
656            | Token::RelativePath(_)
657            | Token::Tilde
658            | Token::DotDot
659            | Token::DotSlashPath(_) => TokenCategory::Path,
660
661            // Commands/identifiers (and bare words)
662            Token::Ident(_)
663            | Token::PlusBare(_)
664            | Token::MinusBare(_)
665            | Token::MinusAlone => TokenCategory::Command,
666
667            // Errors
668            Token::InvalidNumberIdent
669            | Token::InvalidFloatNoLeading
670            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
671        }
672    }
673}
674
675/// Lex a double-quoted string literal, processing escape sequences.
676fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
677    parse_string_literal(lex.slice())
678}
679
680/// Lex a single-quoted string literal (no escape processing).
681fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
682    let s = lex.slice();
683    // Strip the surrounding single quotes
684    s[1..s.len() - 1].to_string()
685}
686
687/// Lex a braced variable reference, extracting the inner content.
688fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
689    // Keep the full ${...} for later parsing of path segments
690    lex.slice().to_string()
691}
692
693/// Lex a simple variable reference: `$NAME` → `NAME`
694fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
695    // Strip the leading `$`
696    lex.slice()[1..].to_string()
697}
698
699/// Lex a positional parameter: `$1` → 1
700fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
701    // Strip the leading `$` and parse the digit
702    lex.slice()[1..].parse().unwrap_or(0)
703}
704
705/// Lex a variable length: `${#VAR}` → "VAR"
706fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
707    // Strip the leading `${#` and trailing `}`
708    let s = lex.slice();
709    s[3..s.len() - 1].to_string()
710}
711
712/// Lex an integer literal.
713fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
714    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
715}
716
717/// Lex a float literal.
718fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
719    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
720}
721
722/// Lex an invalid number-identifier pattern (like 123abc).
723/// Always returns Err to produce a lexer error instead of a token.
724fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
725    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
726}
727
728/// Lex an invalid float without leading digit (like .5).
729/// Always returns Err to produce a lexer error instead of a token.
730fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
731    Err(LexerError::InvalidFloatNoLeading)
732}
733
734/// Lex an invalid float without trailing digit (like 5.).
735/// Always returns Err to produce a lexer error instead of a token.
736fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
737    Err(LexerError::InvalidFloatNoTrailing)
738}
739
740/// Lex an identifier, rejecting ambiguous boolean-like values.
741fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
742    let s = lex.slice();
743
744    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
745    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
746    match s.to_lowercase().as_str() {
747        "true" | "false" if s != "true" && s != "false" => {
748            return Err(LexerError::AmbiguousBoolean(s.to_string()));
749        }
750        _ => {}
751    }
752
753    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
754    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
755        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
756    }
757
758    Ok(s.to_string())
759}
760
761/// Lex a long flag: `--name` → `name`
762fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
763    // Strip the leading `--`
764    lex.slice()[2..].to_string()
765}
766
767/// Lex a short flag: `-l` → `l`, `-la` → `la`
768fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
769    // Strip the leading `-`
770    lex.slice()[1..].to_string()
771}
772
773/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
774fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
775    // Strip the leading `+`
776    lex.slice()[1..].to_string()
777}
778
779/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
780fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
781    lex.slice().to_string()
782}
783
784/// Lex a minus bare word: `-%` → `-%` (keep the full string)
785fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
786    lex.slice().to_string()
787}
788
789/// Lex an absolute path: `/tmp/out` → `/tmp/out`
790fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
791    lex.slice().to_string()
792}
793
794/// Lex a tilde path: `~/foo` → `~/foo`
795fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
796    lex.slice().to_string()
797}
798
799/// Lex a relative path: `../foo` → `../foo`
800fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
801    lex.slice().to_string()
802}
803
804/// Lex a dot-slash path: `./foo` → `./foo`
805fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
806    lex.slice().to_string()
807}
808
809impl fmt::Display for Token {
810    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
811        match self {
812            Token::Set => write!(f, "set"),
813            Token::Local => write!(f, "local"),
814            Token::If => write!(f, "if"),
815            Token::Then => write!(f, "then"),
816            Token::Else => write!(f, "else"),
817            Token::Elif => write!(f, "elif"),
818            Token::Fi => write!(f, "fi"),
819            Token::For => write!(f, "for"),
820            Token::While => write!(f, "while"),
821            Token::In => write!(f, "in"),
822            Token::Do => write!(f, "do"),
823            Token::Done => write!(f, "done"),
824            Token::Case => write!(f, "case"),
825            Token::Esac => write!(f, "esac"),
826            Token::Function => write!(f, "function"),
827            Token::Break => write!(f, "break"),
828            Token::Continue => write!(f, "continue"),
829            Token::Return => write!(f, "return"),
830            Token::Exit => write!(f, "exit"),
831            Token::True => write!(f, "true"),
832            Token::False => write!(f, "false"),
833            Token::TypeString => write!(f, "string"),
834            Token::TypeInt => write!(f, "int"),
835            Token::TypeFloat => write!(f, "float"),
836            Token::TypeBool => write!(f, "bool"),
837            Token::And => write!(f, "&&"),
838            Token::Or => write!(f, "||"),
839            Token::EqEq => write!(f, "=="),
840            Token::NotEq => write!(f, "!="),
841            Token::Match => write!(f, "=~"),
842            Token::NotMatch => write!(f, "!~"),
843            Token::GtEq => write!(f, ">="),
844            Token::LtEq => write!(f, "<="),
845            Token::GtGt => write!(f, ">>"),
846            Token::StderrToStdout => write!(f, "2>&1"),
847            Token::StdoutToStderr => write!(f, "1>&2"),
848            Token::StdoutToStderr2 => write!(f, ">&2"),
849            Token::Stderr => write!(f, "2>"),
850            Token::Both => write!(f, "&>"),
851            Token::HereDocStart => write!(f, "<<"),
852            Token::DoubleSemi => write!(f, ";;"),
853            Token::Eq => write!(f, "="),
854            Token::Pipe => write!(f, "|"),
855            Token::Amp => write!(f, "&"),
856            Token::Gt => write!(f, ">"),
857            Token::Lt => write!(f, "<"),
858            Token::Semi => write!(f, ";"),
859            Token::Colon => write!(f, ":"),
860            Token::Comma => write!(f, ","),
861            Token::Dot => write!(f, "."),
862            Token::DotDot => write!(f, ".."),
863            Token::Tilde => write!(f, "~"),
864            Token::TildePath(s) => write!(f, "{}", s),
865            Token::RelativePath(s) => write!(f, "{}", s),
866            Token::DotSlashPath(s) => write!(f, "{}", s),
867            Token::LBrace => write!(f, "{{"),
868            Token::RBrace => write!(f, "}}"),
869            Token::LBracket => write!(f, "["),
870            Token::RBracket => write!(f, "]"),
871            Token::LParen => write!(f, "("),
872            Token::RParen => write!(f, ")"),
873            Token::Star => write!(f, "*"),
874            Token::Bang => write!(f, "!"),
875            Token::Question => write!(f, "?"),
876            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
877            Token::CmdSubstStart => write!(f, "$("),
878            Token::LongFlag(s) => write!(f, "--{}", s),
879            Token::ShortFlag(s) => write!(f, "-{}", s),
880            Token::PlusFlag(s) => write!(f, "+{}", s),
881            Token::DoubleDash => write!(f, "--"),
882            Token::PlusBare(s) => write!(f, "{}", s),
883            Token::MinusBare(s) => write!(f, "{}", s),
884            Token::MinusAlone => write!(f, "-"),
885            Token::String(s) => write!(f, "STRING({:?})", s),
886            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
887            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
888            Token::VarRef(v) => write!(f, "VARREF({})", v),
889            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
890            Token::Positional(n) => write!(f, "${}", n),
891            Token::AllArgs => write!(f, "$@"),
892            Token::ArgCount => write!(f, "$#"),
893            Token::LastExitCode => write!(f, "$?"),
894            Token::CurrentPid => write!(f, "$$"),
895            Token::VarLength(v) => write!(f, "${{#{}}}", v),
896            Token::Int(n) => write!(f, "INT({})", n),
897            Token::Float(n) => write!(f, "FLOAT({})", n),
898            Token::Path(s) => write!(f, "PATH({})", s),
899            Token::Ident(s) => write!(f, "IDENT({})", s),
900            Token::Comment => write!(f, "COMMENT"),
901            Token::Newline => write!(f, "NEWLINE"),
902            Token::LineContinuation => write!(f, "LINECONT"),
903            // These variants should never be produced - their callbacks always return errors
904            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
905            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
906            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
907        }
908    }
909}
910
911impl Token {
912    /// Returns true if this token is a keyword.
913    pub fn is_keyword(&self) -> bool {
914        matches!(
915            self,
916            Token::Set
917                | Token::Local
918                | Token::If
919                | Token::Then
920                | Token::Else
921                | Token::Elif
922                | Token::Fi
923                | Token::For
924                | Token::In
925                | Token::Do
926                | Token::Done
927                | Token::Case
928                | Token::Esac
929                | Token::Function
930                | Token::True
931                | Token::False
932        )
933    }
934
935    /// Returns true if this token is a type keyword.
936    pub fn is_type(&self) -> bool {
937        matches!(
938            self,
939            Token::TypeString
940                | Token::TypeInt
941                | Token::TypeFloat
942                | Token::TypeBool
943        )
944    }
945
946    /// Returns true if this token starts a statement.
947    pub fn starts_statement(&self) -> bool {
948        matches!(
949            self,
950            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
951        )
952    }
953
954    /// Returns true if this token can appear in an expression.
955    pub fn is_value(&self) -> bool {
956        matches!(
957            self,
958            Token::String(_)
959                | Token::SingleString(_)
960                | Token::HereDoc(_)
961                | Token::Arithmetic(_)
962                | Token::Int(_)
963                | Token::Float(_)
964                | Token::True
965                | Token::False
966                | Token::VarRef(_)
967                | Token::SimpleVarRef(_)
968                | Token::CmdSubstStart
969                | Token::Path(_)
970                | Token::LastExitCode
971                | Token::CurrentPid
972        )
973    }
974}
975
976/// Result of preprocessing arithmetic expressions.
977struct ArithmeticPreprocessResult {
978    /// Preprocessed source with markers replacing $((expr)).
979    text: String,
980    /// Vector of (marker, expression_content) pairs.
981    arithmetics: Vec<(String, String)>,
982    /// Span replacements for correcting token positions.
983    replacements: Vec<SpanReplacement>,
984}
985
/// Copy one `$(...)` command substitution to `result` without interpreting it.
///
/// Parentheses are matched with awareness of quoting: text inside single
/// quotes, text inside double quotes, and backslash-escaped characters never
/// affect the nesting depth. Inside double quotes a backslash only escapes
/// `"`, `\`, `$` and `` ` ``; outside quotes it escapes whatever follows it.
///
/// `i` must index the `$` of `$(` on entry. On return it indexes the position
/// just after the matching `)`, or `chars.len()` if the substitution is never
/// closed. `source_pos` is advanced by the UTF-8 byte length of everything
/// that was copied.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    #[derive(Clone, Copy, PartialEq)]
    enum Quote {
        None,
        Single,
        Double,
    }

    // The opening `$(` is two one-byte characters.
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1;
    let mut quote = Quote::None;

    while open_parens > 0 && *i < chars.len() {
        let c = chars[*i];
        let following = chars.get(*i + 1).copied();

        // Does this backslash escape the character after it?
        let escaped = match (quote, c, following) {
            (Quote::None, '\\', Some(next)) => Some(next),
            (Quote::Double, '\\', Some(next)) if matches!(next, '"' | '\\' | '$' | '`') => {
                Some(next)
            }
            _ => None,
        };
        if let Some(next) = escaped {
            // Copy the pair untouched so the escaped char is never examined.
            result.push(c);
            result.push(next);
            *source_pos += c.len_utf8() + next.len_utf8();
            *i += 2;
            continue;
        }

        // Update quoting / nesting state; the character itself is always copied.
        match (quote, c) {
            (Quote::Single, '\'') | (Quote::Double, '"') => quote = Quote::None,
            (Quote::None, '\'') => quote = Quote::Single,
            (Quote::None, '"') => quote = Quote::Double,
            (Quote::None, '(') => open_parens += 1,
            (Quote::None, ')') => open_parens -= 1,
            _ => {}
        }
        result.push(c);
        *source_pos += c.len_utf8();
        *i += 1;
    }
}
1083
1084/// Preprocess arithmetic expressions in source code.
1085///
1086/// Finds `$((expr))` patterns and replaces them with markers.
1087/// Returns the preprocessed source, arithmetic contents, and span replacement info.
1088///
1089/// Example:
1090///   `X=$((1 + 2))`
1091/// Becomes:
1092///   `X=__KAISH_ARITH_{id}__`
1093/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
1094///
1095/// # Errors
1096/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
1097fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
1098    let mut result = String::with_capacity(source.len());
1099    let mut arithmetics: Vec<(String, String)> = Vec::new();
1100    let mut replacements: Vec<SpanReplacement> = Vec::new();
1101    let mut source_pos: usize = 0;
1102    let chars_vec: Vec<char> = source.chars().collect();
1103    let mut i = 0;
1104
1105    // Whether we're currently inside double quotes. Single quotes inside
1106    // double quotes are literal characters, not quote delimiters.
1107    let mut in_double_quote = false;
1108
1109    while i < chars_vec.len() {
1110        let ch = chars_vec[i];
1111
1112        // Backslash escape outside quotes — skip both chars verbatim
1113        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1114            result.push(ch);
1115            result.push(chars_vec[i + 1]);
1116            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
1117            i += 2;
1118            continue;
1119        }
1120
1121        // Single quote — only starts quote mode when NOT inside double quotes
1122        if ch == '\'' && !in_double_quote {
1123            result.push(ch);
1124            i += 1;
1125            source_pos += 1;
1126            while i < chars_vec.len() && chars_vec[i] != '\'' {
1127                result.push(chars_vec[i]);
1128                source_pos += chars_vec[i].len_utf8();
1129                i += 1;
1130            }
1131            if i < chars_vec.len() {
1132                result.push(chars_vec[i]); // closing quote
1133                source_pos += 1;
1134                i += 1;
1135            }
1136            continue;
1137        }
1138
1139        // Double quote — toggle state (arithmetic is still expanded inside)
1140        if ch == '"' {
1141            in_double_quote = !in_double_quote;
1142            result.push(ch);
1143            i += 1;
1144            source_pos += 1;
1145            continue;
1146        }
1147
1148        // Backslash escape inside double quotes — only \" and \\ are special
1149        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1150            let next = chars_vec[i + 1];
1151            if next == '"' || next == '\\' || next == '$' || next == '`' {
1152                result.push(ch);
1153                result.push(next);
1154                source_pos += ch.len_utf8() + next.len_utf8();
1155                i += 2;
1156                continue;
1157            }
1158        }
1159
1160        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand
1161        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
1162            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
1163        {
1164            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
1165            continue;
1166        }
1167
1168        // Look for $(( (potential arithmetic)
1169        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
1170            let arith_start_pos = result.len();
1171            let original_start = source_pos;
1172
1173            // Skip $((
1174            i += 3;
1175            source_pos += 3;
1176
1177            // Collect expression until matching ))
1178            let mut expr = String::new();
1179            let mut paren_depth: usize = 0;
1180
1181            while i < chars_vec.len() {
1182                let c = chars_vec[i];
1183                match c {
1184                    '(' => {
1185                        paren_depth += 1;
1186                        if paren_depth > MAX_PAREN_DEPTH {
1187                            return Err(LexerError::NestingTooDeep);
1188                        }
1189                        expr.push('(');
1190                        i += 1;
1191                        source_pos += c.len_utf8();
1192                    }
1193                    ')' => {
1194                        if paren_depth > 0 {
1195                            paren_depth -= 1;
1196                            expr.push(')');
1197                            i += 1;
1198                            source_pos += 1;
1199                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
1200                            // Found closing ))
1201                            i += 2;
1202                            source_pos += 2;
1203                            break;
1204                        } else {
1205                            // Single ) inside - keep going
1206                            expr.push(')');
1207                            i += 1;
1208                            source_pos += 1;
1209                        }
1210                    }
1211                    _ => {
1212                        expr.push(c);
1213                        i += 1;
1214                        source_pos += c.len_utf8();
1215                    }
1216                }
1217            }
1218
1219            // Calculate original length: from $$(( to ))
1220            let original_len = source_pos - original_start;
1221
1222            // Create a unique marker for this arithmetic (collision-resistant)
1223            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
1224            let marker_len = marker.len();
1225
1226            // Record the replacement for span correction
1227            replacements.push(SpanReplacement {
1228                preprocessed_pos: arith_start_pos,
1229                marker_len,
1230                original_len,
1231            });
1232
1233            arithmetics.push((marker.clone(), expr));
1234            result.push_str(&marker);
1235        } else {
1236            result.push(ch);
1237            i += 1;
1238            source_pos += ch.len_utf8();
1239        }
1240    }
1241
1242    Ok(ArithmeticPreprocessResult {
1243        text: result,
1244        arithmetics,
1245        replacements,
1246    })
1247}
1248
1249/// Preprocess here-docs in source code.
1250///
1251/// Finds `<<WORD` patterns and collects content until the delimiter line.
1252/// Returns the preprocessed source and a vector of (marker, content) pairs.
1253///
1254/// Example:
1255///   `cat <<EOF\nhello\nworld\nEOF`
1256/// Becomes:
1257///   `cat <<__HEREDOC_0__`
1258/// With heredocs[0] = ("__HEREDOC_0__", "hello\nworld")
1259fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
1260    let mut result = String::with_capacity(source.len());
1261    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
1262    let mut chars = source.chars().peekable();
1263
1264    while let Some(ch) = chars.next() {
1265        // Look for << (potential here-doc)
1266        if ch == '<' && chars.peek() == Some(&'<') {
1267            chars.next(); // consume second <
1268
1269            // Check for optional - (strip leading tabs)
1270            let strip_tabs = chars.peek() == Some(&'-');
1271            if strip_tabs {
1272                chars.next();
1273            }
1274
1275            // Skip whitespace before delimiter
1276            while let Some(&c) = chars.peek() {
1277                if c == ' ' || c == '\t' {
1278                    chars.next();
1279                } else {
1280                    break;
1281                }
1282            }
1283
1284            // Collect the delimiter word
1285            let mut delimiter = String::new();
1286            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
1287            let quote_char = if quoted { chars.next() } else { None };
1288
1289            while let Some(&c) = chars.peek() {
1290                if quoted {
1291                    if Some(c) == quote_char {
1292                        chars.next(); // consume closing quote
1293                        break;
1294                    }
1295                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1296                    break;
1297                }
1298                if let Some(ch) = chars.next() {
1299                    delimiter.push(ch);
1300                }
1301            }
1302
1303            if delimiter.is_empty() {
1304                // Not a valid here-doc, output << literally
1305                result.push_str("<<");
1306                if strip_tabs {
1307                    result.push('-');
1308                }
1309                continue;
1310            }
1311
1312            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1313            // This must be emitted AFTER the heredoc marker, not before.
1314            let mut after_delimiter = String::new();
1315            while let Some(&c) = chars.peek() {
1316                if c == '\n' {
1317                    chars.next();
1318                    break;
1319                } else if c == '\r' {
1320                    chars.next();
1321                    if chars.peek() == Some(&'\n') {
1322                        chars.next();
1323                    }
1324                    break;
1325                }
1326                if let Some(ch) = chars.next() {
1327                    after_delimiter.push(ch);
1328                }
1329            }
1330
1331            // Collect content until delimiter on its own line
1332            let mut content = String::new();
1333            let mut current_line = String::new();
1334
1335            loop {
1336                match chars.next() {
1337                    Some('\n') => {
1338                        // Check if this line is the delimiter
1339                        let trimmed = if strip_tabs {
1340                            current_line.trim_start_matches('\t')
1341                        } else {
1342                            &current_line
1343                        };
1344                        if trimmed == delimiter {
1345                            // Found end of here-doc
1346                            break;
1347                        }
1348                        // Add line to content (including empty lines)
1349                        content.push_str(&current_line);
1350                        content.push('\n');
1351                        current_line.clear();
1352                    }
1353                    Some('\r') => {
1354                        // Handle \r\n
1355                        if chars.peek() == Some(&'\n') {
1356                            chars.next();
1357                        }
1358                        let trimmed = if strip_tabs {
1359                            current_line.trim_start_matches('\t')
1360                        } else {
1361                            &current_line
1362                        };
1363                        if trimmed == delimiter {
1364                            break;
1365                        }
1366                        content.push_str(&current_line);
1367                        content.push('\n');
1368                        current_line.clear();
1369                    }
1370                    Some(c) => {
1371                        current_line.push(c);
1372                    }
1373                    None => {
1374                        // EOF - check if current line is the delimiter
1375                        let trimmed = if strip_tabs {
1376                            current_line.trim_start_matches('\t')
1377                        } else {
1378                            &current_line
1379                        };
1380                        if trimmed == delimiter {
1381                            // Found delimiter at EOF
1382                            break;
1383                        }
1384                        // Not a delimiter - include remaining content
1385                        if !current_line.is_empty() {
1386                            content.push_str(&current_line);
1387                        }
1388                        break;
1389                    }
1390                }
1391            }
1392
1393            // Remove trailing newline from content (we'll add it when needed)
1394            let content = content.trim_end_matches('\n').to_string();
1395
1396            // Create a unique marker for this here-doc (collision-resistant)
1397            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1398            heredocs.push((marker.clone(), content, quoted));
1399
1400            // Output <<marker first, then any text that followed the delimiter
1401            // (e.g., " | jq") so the heredoc attaches to the correct command.
1402            result.push_str("<<");
1403            result.push_str(&marker);
1404            result.push_str(&after_delimiter);
1405            result.push('\n');
1406        } else {
1407            result.push(ch);
1408        }
1409    }
1410
1411    (result, heredocs)
1412}
1413
1414/// Tokenize source code into a vector of spanned tokens.
1415///
1416/// Skips whitespace and comments (unless you need them for formatting).
1417/// Returns errors with their positions for nice error messages.
1418///
1419/// Handles:
1420/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
1421/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
1422pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1423    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
1424    let arith_result = preprocess_arithmetic(source)
1425        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1426
1427    // Then preprocess here-docs (heredoc span tracking is not implemented for simplicity)
1428    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);
1429
1430    // Combine replacements for span correction (arithmetic only for now)
1431    let span_replacements = arith_result.replacements;
1432
1433    let lexer = Token::lexer(&preprocessed);
1434    let mut tokens = Vec::new();
1435    let mut errors = Vec::new();
1436
1437    for (result, span) in lexer.spanned() {
1438        // Correct the span from preprocessed coordinates to original coordinates
1439        let corrected_span = correct_span(span, &span_replacements);
1440        match result {
1441            Ok(token) => {
1442                // Skip comments and line continuations - they're not needed for parsing
1443                if !matches!(token, Token::Comment | Token::LineContinuation) {
1444                    tokens.push(Spanned::new(token, corrected_span));
1445                }
1446            }
1447            Err(err) => {
1448                errors.push(Spanned::new(err, corrected_span));
1449            }
1450        }
1451    }
1452
1453    if !errors.is_empty() {
1454        return Err(errors);
1455    }
1456
1457    // Post-process: replace markers with actual token content
1458    let mut final_tokens = Vec::with_capacity(tokens.len());
1459    let mut i = 0;
1460
1461    while i < tokens.len() {
1462        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
1463        if let Token::Ident(ref name) = tokens[i].token
1464            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1465                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1466                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1467                    i += 1;
1468                    continue;
1469                }
1470
1471        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
1472        if matches!(tokens[i].token, Token::HereDocStart) {
1473            // Check if next token is a heredoc marker
1474            if i + 1 < tokens.len()
1475                && let Token::Ident(ref name) = tokens[i + 1].token
1476                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1477                        // Find the corresponding content
1478                        if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
1479                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1480                            final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
1481                            i += 2;
1482                            continue;
1483                        }
1484                    }
1485        }
1486
1487        // Check for arithmetic markers inside string content
1488        let token = if let Token::String(ref s) = tokens[i].token {
1489            // Check if string contains any arithmetic markers
1490            let mut new_content = s.clone();
1491            for (marker, expr) in &arith_result.arithmetics {
1492                if new_content.contains(marker) {
1493                    // Replace marker with the special format that parse_interpolated_string can detect
1494                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
1495                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1496                }
1497            }
1498            if new_content != *s {
1499                Spanned::new(Token::String(new_content), tokens[i].span.clone())
1500            } else {
1501                tokens[i].clone()
1502            }
1503        } else {
1504            tokens[i].clone()
1505        };
1506        final_tokens.push(token);
1507        i += 1;
1508    }
1509
1510    Ok(final_tokens)
1511}
1512
1513/// Tokenize source code, preserving comments.
1514///
1515/// Useful for pretty-printing or formatting tools that need to preserve comments.
1516pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1517    let lexer = Token::lexer(source);
1518    let mut tokens = Vec::new();
1519    let mut errors = Vec::new();
1520
1521    for (result, span) in lexer.spanned() {
1522        match result {
1523            Ok(token) => {
1524                tokens.push(Spanned::new(token, span));
1525            }
1526            Err(err) => {
1527                errors.push(Spanned::new(err, span));
1528            }
1529        }
1530    }
1531
1532    if errors.is_empty() {
1533        Ok(tokens)
1534    } else {
1535        Err(errors)
1536    }
1537}
1538
1539/// Extract the string content from a string token (removes quotes, processes escapes).
1540pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1541    // Remove surrounding quotes
1542    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1543        return Err(LexerError::UnterminatedString);
1544    }
1545
1546    let inner = &source[1..source.len() - 1];
1547    let mut result = String::with_capacity(inner.len());
1548    let mut chars = inner.chars().peekable();
1549
1550    while let Some(ch) = chars.next() {
1551        if ch == '\\' {
1552            match chars.next() {
1553                Some('n') => result.push('\n'),
1554                Some('t') => result.push('\t'),
1555                Some('r') => result.push('\r'),
1556                Some('\\') => result.push('\\'),
1557                Some('"') => result.push('"'),
1558                // Use a unique marker for escaped dollar that won't be re-interpreted
1559                // parse_interpolated_string will convert this back to $
1560                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1561                Some('u') => {
1562                    // Unicode escape: \uXXXX
1563                    let mut hex = String::with_capacity(4);
1564                    for _ in 0..4 {
1565                        match chars.next() {
1566                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1567                            _ => return Err(LexerError::InvalidEscape),
1568                        }
1569                    }
1570                    let codepoint = u32::from_str_radix(&hex, 16)
1571                        .map_err(|_| LexerError::InvalidEscape)?;
1572                    let ch = char::from_u32(codepoint)
1573                        .ok_or(LexerError::InvalidEscape)?;
1574                    result.push(ch);
1575                }
1576                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1577                Some(next) => {
1578                    result.push('\\');
1579                    result.push(next);
1580                }
1581                None => return Err(LexerError::InvalidEscape),
1582            }
1583        } else {
1584            result.push(ch);
1585        }
1586    }
1587
1588    Ok(result)
1589}
1590
1591/// Parse a variable reference, extracting the path segments.
1592/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1593pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1594    // Remove ${ and }
1595    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1596        return Err(LexerError::UnterminatedVarRef);
1597    }
1598
1599    let inner = &source[2..source.len() - 1];
1600
1601    // Special case: $? (last result)
1602    if inner == "?" {
1603        return Ok(vec!["?".to_string()]);
1604    }
1605
1606    let mut segments = Vec::new();
1607    let mut current = String::new();
1608    let mut chars = inner.chars().peekable();
1609
1610    while let Some(ch) = chars.next() {
1611        match ch {
1612            '.' => {
1613                if !current.is_empty() {
1614                    segments.push(current.clone());
1615                    current.clear();
1616                }
1617            }
1618            '[' => {
1619                if !current.is_empty() {
1620                    segments.push(current.clone());
1621                    current.clear();
1622                }
1623                // Collect the index
1624                let mut index = String::from("[");
1625                while let Some(&c) = chars.peek() {
1626                    if let Some(c) = chars.next() {
1627                        index.push(c);
1628                    }
1629                    if c == ']' {
1630                        break;
1631                    }
1632                }
1633                segments.push(index);
1634            }
1635            _ => {
1636                current.push(ch);
1637            }
1638        }
1639    }
1640
1641    if !current.is_empty() {
1642        segments.push(current);
1643    }
1644
1645    Ok(segments)
1646}
1647
1648/// Parse an integer literal.
1649pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1650    source.parse().map_err(|_| LexerError::InvalidNumber)
1651}
1652
1653/// Parse a float literal.
1654pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1655    source.parse().map_err(|_| LexerError::InvalidNumber)
1656}
1657
1658#[cfg(test)]
1659mod tests {
1660    use super::*;
1661
1662    fn lex(source: &str) -> Vec<Token> {
1663        tokenize(source)
1664            .expect("lexer should succeed")
1665            .into_iter()
1666            .map(|s| s.token)
1667            .collect()
1668    }
1669
1670    // ═══════════════════════════════════════════════════════════════════
1671    // Keyword tests
1672    // ═══════════════════════════════════════════════════════════════════
1673
1674    #[test]
1675    fn keywords() {
1676        assert_eq!(lex("set"), vec![Token::Set]);
1677        assert_eq!(lex("if"), vec![Token::If]);
1678        assert_eq!(lex("then"), vec![Token::Then]);
1679        assert_eq!(lex("else"), vec![Token::Else]);
1680        assert_eq!(lex("elif"), vec![Token::Elif]);
1681        assert_eq!(lex("fi"), vec![Token::Fi]);
1682        assert_eq!(lex("for"), vec![Token::For]);
1683        assert_eq!(lex("in"), vec![Token::In]);
1684        assert_eq!(lex("do"), vec![Token::Do]);
1685        assert_eq!(lex("done"), vec![Token::Done]);
1686        assert_eq!(lex("case"), vec![Token::Case]);
1687        assert_eq!(lex("esac"), vec![Token::Esac]);
1688        assert_eq!(lex("function"), vec![Token::Function]);
1689        assert_eq!(lex("true"), vec![Token::True]);
1690        assert_eq!(lex("false"), vec![Token::False]);
1691    }
1692
1693    #[test]
1694    fn double_semicolon() {
1695        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
1696        // In case pattern context
1697        assert_eq!(lex("echo \"hi\";;"), vec![
1698            Token::Ident("echo".to_string()),
1699            Token::String("hi".to_string()),
1700            Token::DoubleSemi,
1701        ]);
1702    }
1703
1704    #[test]
1705    fn type_keywords() {
1706        assert_eq!(lex("string"), vec![Token::TypeString]);
1707        assert_eq!(lex("int"), vec![Token::TypeInt]);
1708        assert_eq!(lex("float"), vec![Token::TypeFloat]);
1709        assert_eq!(lex("bool"), vec![Token::TypeBool]);
1710    }
1711
1712    // ═══════════════════════════════════════════════════════════════════
1713    // Operator tests
1714    // ═══════════════════════════════════════════════════════════════════
1715
1716    #[test]
1717    fn single_char_operators() {
1718        assert_eq!(lex("="), vec![Token::Eq]);
1719        assert_eq!(lex("|"), vec![Token::Pipe]);
1720        assert_eq!(lex("&"), vec![Token::Amp]);
1721        assert_eq!(lex(">"), vec![Token::Gt]);
1722        assert_eq!(lex("<"), vec![Token::Lt]);
1723        assert_eq!(lex(";"), vec![Token::Semi]);
1724        assert_eq!(lex(":"), vec![Token::Colon]);
1725        assert_eq!(lex(","), vec![Token::Comma]);
1726        assert_eq!(lex("."), vec![Token::Dot]);
1727    }
1728
1729    #[test]
1730    fn multi_char_operators() {
1731        assert_eq!(lex("&&"), vec![Token::And]);
1732        assert_eq!(lex("||"), vec![Token::Or]);
1733        assert_eq!(lex("=="), vec![Token::EqEq]);
1734        assert_eq!(lex("!="), vec![Token::NotEq]);
1735        assert_eq!(lex("=~"), vec![Token::Match]);
1736        assert_eq!(lex("!~"), vec![Token::NotMatch]);
1737        assert_eq!(lex(">="), vec![Token::GtEq]);
1738        assert_eq!(lex("<="), vec![Token::LtEq]);
1739        assert_eq!(lex(">>"), vec![Token::GtGt]);
1740        assert_eq!(lex("2>"), vec![Token::Stderr]);
1741        assert_eq!(lex("&>"), vec![Token::Both]);
1742    }
1743
1744    #[test]
1745    fn brackets() {
1746        assert_eq!(lex("{"), vec![Token::LBrace]);
1747        assert_eq!(lex("}"), vec![Token::RBrace]);
1748        assert_eq!(lex("["), vec![Token::LBracket]);
1749        assert_eq!(lex("]"), vec![Token::RBracket]);
1750        assert_eq!(lex("("), vec![Token::LParen]);
1751        assert_eq!(lex(")"), vec![Token::RParen]);
1752    }
1753
1754    // ═══════════════════════════════════════════════════════════════════
1755    // Literal tests
1756    // ═══════════════════════════════════════════════════════════════════
1757
1758    #[test]
1759    fn integers() {
1760        assert_eq!(lex("0"), vec![Token::Int(0)]);
1761        assert_eq!(lex("42"), vec![Token::Int(42)]);
1762        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
1763        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
1764    }
1765
1766    #[test]
1767    fn floats() {
1768        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
1769        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
1770        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
1771    }
1772
1773    #[test]
1774    fn strings() {
1775        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1776        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1777        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
1778        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1779        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1780    }
1781
1782    #[test]
1783    fn var_refs() {
1784        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
1785        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
1786        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
1787        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
1788        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
1789    }
1790
1791    // ═══════════════════════════════════════════════════════════════════
1792    // Identifier tests
1793    // ═══════════════════════════════════════════════════════════════════
1794
1795    #[test]
1796    fn identifiers() {
1797        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
1798        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
1799        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
1800        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
1801        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
1802    }
1803
1804    #[test]
1805    fn keyword_prefix_identifiers() {
1806        // Identifiers that start with keywords but aren't keywords
1807        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
1808        assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
1809        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
1810        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
1811        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
1812    }
1813
1814    // ═══════════════════════════════════════════════════════════════════
1815    // Statement tests
1816    // ═══════════════════════════════════════════════════════════════════
1817
1818    #[test]
1819    fn assignment() {
1820        assert_eq!(
1821            lex("set X = 5"),
1822            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
1823        );
1824    }
1825
1826    #[test]
1827    fn command_simple() {
1828        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
1829        assert_eq!(
1830            lex(r#"echo "hello""#),
1831            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
1832        );
1833    }
1834
1835    #[test]
1836    fn command_with_args() {
1837        assert_eq!(
1838            lex("cmd arg1 arg2"),
1839            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
1840        );
1841    }
1842
1843    #[test]
1844    fn command_with_named_args() {
1845        assert_eq!(
1846            lex("cmd key=value"),
1847            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
1848        );
1849    }
1850
1851    #[test]
1852    fn pipeline() {
1853        assert_eq!(
1854            lex("a | b | c"),
1855            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
1856        );
1857    }
1858
1859    #[test]
1860    fn if_statement() {
1861        assert_eq!(
1862            lex("if true; then echo; fi"),
1863            vec![
1864                Token::If,
1865                Token::True,
1866                Token::Semi,
1867                Token::Then,
1868                Token::Ident("echo".to_string()),
1869                Token::Semi,
1870                Token::Fi
1871            ]
1872        );
1873    }
1874
1875    #[test]
1876    fn for_loop() {
1877        assert_eq!(
1878            lex("for X in items; do echo; done"),
1879            vec![
1880                Token::For,
1881                Token::Ident("X".to_string()),
1882                Token::In,
1883                Token::Ident("items".to_string()),
1884                Token::Semi,
1885                Token::Do,
1886                Token::Ident("echo".to_string()),
1887                Token::Semi,
1888                Token::Done
1889            ]
1890        );
1891    }
1892
1893    // ═══════════════════════════════════════════════════════════════════
1894    // Whitespace and newlines
1895    // ═══════════════════════════════════════════════════════════════════
1896
1897    #[test]
1898    fn whitespace_ignored() {
1899        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
1900    }
1901
1902    #[test]
1903    fn newlines_preserved() {
1904        let tokens = lex("a\nb");
1905        assert_eq!(
1906            tokens,
1907            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1908        );
1909    }
1910
1911    #[test]
1912    fn multiple_newlines() {
1913        let tokens = lex("a\n\n\nb");
1914        assert_eq!(
1915            tokens,
1916            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
1917        );
1918    }
1919
1920    // ═══════════════════════════════════════════════════════════════════
1921    // Comments
1922    // ═══════════════════════════════════════════════════════════════════
1923
1924    #[test]
1925    fn comments_skipped() {
1926        assert_eq!(lex("# comment"), vec![]);
1927        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
1928        assert_eq!(
1929            lex("a # comment\nb"),
1930            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1931        );
1932    }
1933
1934    #[test]
1935    fn comments_preserved_when_requested() {
1936        let tokens = tokenize_with_comments("a # comment")
1937            .expect("should succeed")
1938            .into_iter()
1939            .map(|s| s.token)
1940            .collect::<Vec<_>>();
1941        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
1942    }
1943
1944    // ═══════════════════════════════════════════════════════════════════
1945    // String parsing
1946    // ═══════════════════════════════════════════════════════════════════
1947
1948    #[test]
1949    fn parse_simple_string() {
1950        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
1951    }
1952
1953    #[test]
1954    fn parse_string_with_escapes() {
1955        assert_eq!(
1956            parse_string_literal(r#""hello\nworld""#).expect("ok"),
1957            "hello\nworld"
1958        );
1959        assert_eq!(
1960            parse_string_literal(r#""tab\there""#).expect("ok"),
1961            "tab\there"
1962        );
1963        assert_eq!(
1964            parse_string_literal(r#""quote\"here""#).expect("ok"),
1965            "quote\"here"
1966        );
1967    }
1968
1969    #[test]
1970    fn parse_string_with_unicode() {
1971        assert_eq!(
1972            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
1973            "emoji ❤"
1974        );
1975    }
1976
1977    #[test]
1978    fn parse_string_with_escaped_dollar() {
1979        // \$ produces a marker that parse_interpolated_string will convert to $
1980        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
1981        assert_eq!(
1982            parse_string_literal(r#""\$VAR""#).expect("ok"),
1983            "__KAISH_ESCAPED_DOLLAR__VAR"
1984        );
1985        assert_eq!(
1986            parse_string_literal(r#""cost: \$100""#).expect("ok"),
1987            "cost: __KAISH_ESCAPED_DOLLAR__100"
1988        );
1989    }
1990
1991    // ═══════════════════════════════════════════════════════════════════
1992    // Variable reference parsing
1993    // ═══════════════════════════════════════════════════════════════════
1994
1995    #[test]
1996    fn parse_simple_var() {
1997        assert_eq!(
1998            parse_var_ref("${X}").expect("ok"),
1999            vec!["X"]
2000        );
2001    }
2002
2003    #[test]
2004    fn parse_var_with_field() {
2005        assert_eq!(
2006            parse_var_ref("${VAR.field}").expect("ok"),
2007            vec!["VAR", "field"]
2008        );
2009    }
2010
2011    #[test]
2012    fn parse_var_with_index() {
2013        assert_eq!(
2014            parse_var_ref("${VAR[0]}").expect("ok"),
2015            vec!["VAR", "[0]"]
2016        );
2017    }
2018
2019    #[test]
2020    fn parse_var_nested() {
2021        assert_eq!(
2022            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2023            vec!["VAR", "field", "[0]", "nested"]
2024        );
2025    }
2026
2027    #[test]
2028    fn parse_last_result() {
2029        assert_eq!(
2030            parse_var_ref("${?}").expect("ok"),
2031            vec!["?"]
2032        );
2033        assert_eq!(
2034            parse_var_ref("${?.ok}").expect("ok"),
2035            vec!["?", "ok"]
2036        );
2037    }
2038
2039    // ═══════════════════════════════════════════════════════════════════
2040    // Number parsing
2041    // ═══════════════════════════════════════════════════════════════════
2042
2043    #[test]
2044    fn parse_integers() {
2045        assert_eq!(parse_int("0").expect("ok"), 0);
2046        assert_eq!(parse_int("42").expect("ok"), 42);
2047        assert_eq!(parse_int("-1").expect("ok"), -1);
2048    }
2049
2050    #[test]
2051    fn parse_floats() {
2052        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2053        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2054    }
2055
2056    // ═══════════════════════════════════════════════════════════════════
2057    // Edge cases and errors
2058    // ═══════════════════════════════════════════════════════════════════
2059
2060    #[test]
2061    fn empty_input() {
2062        assert_eq!(lex(""), vec![]);
2063    }
2064
2065    #[test]
2066    fn only_whitespace() {
2067        assert_eq!(lex("   \t\t   "), vec![]);
2068    }
2069
2070    #[test]
2071    fn json_array() {
2072        assert_eq!(
2073            lex(r#"[1, 2, 3]"#),
2074            vec![
2075                Token::LBracket,
2076                Token::Int(1),
2077                Token::Comma,
2078                Token::Int(2),
2079                Token::Comma,
2080                Token::Int(3),
2081                Token::RBracket
2082            ]
2083        );
2084    }
2085
2086    #[test]
2087    fn json_object() {
2088        assert_eq!(
2089            lex(r#"{"key": "value"}"#),
2090            vec![
2091                Token::LBrace,
2092                Token::String("key".to_string()),
2093                Token::Colon,
2094                Token::String("value".to_string()),
2095                Token::RBrace
2096            ]
2097        );
2098    }
2099
2100    #[test]
2101    fn redirect_operators() {
2102        assert_eq!(
2103            lex("cmd > file"),
2104            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2105        );
2106        assert_eq!(
2107            lex("cmd >> file"),
2108            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2109        );
2110        assert_eq!(
2111            lex("cmd 2> err"),
2112            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2113        );
2114        assert_eq!(
2115            lex("cmd &> all"),
2116            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2117        );
2118    }
2119
2120    #[test]
2121    fn background_job() {
2122        assert_eq!(
2123            lex("cmd &"),
2124            vec![Token::Ident("cmd".to_string()), Token::Amp]
2125        );
2126    }
2127
2128    #[test]
2129    fn command_substitution() {
2130        assert_eq!(
2131            lex("$(cmd)"),
2132            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2133        );
2134        assert_eq!(
2135            lex("$(cmd arg)"),
2136            vec![
2137                Token::CmdSubstStart,
2138                Token::Ident("cmd".to_string()),
2139                Token::Ident("arg".to_string()),
2140                Token::RParen
2141            ]
2142        );
2143        assert_eq!(
2144            lex("$(a | b)"),
2145            vec![
2146                Token::CmdSubstStart,
2147                Token::Ident("a".to_string()),
2148                Token::Pipe,
2149                Token::Ident("b".to_string()),
2150                Token::RParen
2151            ]
2152        );
2153    }
2154
2155    #[test]
2156    fn complex_pipeline() {
2157        assert_eq!(
2158            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2159            vec![
2160                Token::Ident("cat".to_string()),
2161                Token::Ident("file".to_string()),
2162                Token::Pipe,
2163                Token::Ident("grep".to_string()),
2164                Token::Ident("pattern".to_string()),
2165                Token::Eq,
2166                Token::String("foo".to_string()),
2167                Token::Pipe,
2168                Token::Ident("head".to_string()),
2169                Token::Ident("count".to_string()),
2170                Token::Eq,
2171                Token::Int(10),
2172            ]
2173        );
2174    }
2175
2176    // ═══════════════════════════════════════════════════════════════════
2177    // Flag tests
2178    // ═══════════════════════════════════════════════════════════════════
2179
2180    #[test]
2181    fn short_flag() {
2182        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2183        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2184        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2185    }
2186
2187    #[test]
2188    fn short_flag_combined() {
2189        // Combined short flags like -la
2190        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2191        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2192    }
2193
2194    #[test]
2195    fn long_flag() {
2196        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2197        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2198        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2199    }
2200
2201    #[test]
2202    fn double_dash() {
2203        // -- alone marks end of flags
2204        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2205    }
2206
2207    #[test]
2208    fn flags_vs_negative_numbers() {
2209        // -123 should be a negative integer, not a flag
2210        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2211        // -l should be a flag
2212        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2213        // -1a is ambiguous - should be Int(-1) then Ident(a)
2214        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2215        assert_eq!(
2216            lex("-1 a"),
2217            vec![Token::Int(-1), Token::Ident("a".to_string())]
2218        );
2219    }
2220
2221    #[test]
2222    fn command_with_flags() {
2223        assert_eq!(
2224            lex("ls -l"),
2225            vec![
2226                Token::Ident("ls".to_string()),
2227                Token::ShortFlag("l".to_string()),
2228            ]
2229        );
2230        assert_eq!(
2231            lex("git commit -m"),
2232            vec![
2233                Token::Ident("git".to_string()),
2234                Token::Ident("commit".to_string()),
2235                Token::ShortFlag("m".to_string()),
2236            ]
2237        );
2238        assert_eq!(
2239            lex("git push --force"),
2240            vec![
2241                Token::Ident("git".to_string()),
2242                Token::Ident("push".to_string()),
2243                Token::LongFlag("force".to_string()),
2244            ]
2245        );
2246    }
2247
2248    #[test]
2249    fn flag_with_value() {
2250        assert_eq!(
2251            lex(r#"git commit -m "message""#),
2252            vec![
2253                Token::Ident("git".to_string()),
2254                Token::Ident("commit".to_string()),
2255                Token::ShortFlag("m".to_string()),
2256                Token::String("message".to_string()),
2257            ]
2258        );
2259        assert_eq!(
2260            lex(r#"--message="hello""#),
2261            vec![
2262                Token::LongFlag("message".to_string()),
2263                Token::Eq,
2264                Token::String("hello".to_string()),
2265            ]
2266        );
2267    }
2268
2269    #[test]
2270    fn end_of_flags_marker() {
2271        assert_eq!(
2272            lex("git checkout -- file"),
2273            vec![
2274                Token::Ident("git".to_string()),
2275                Token::Ident("checkout".to_string()),
2276                Token::DoubleDash,
2277                Token::Ident("file".to_string()),
2278            ]
2279        );
2280    }
2281
2282    // ═══════════════════════════════════════════════════════════════════
2283    // Bash compatibility tokens
2284    // ═══════════════════════════════════════════════════════════════════
2285
2286    #[test]
2287    fn local_keyword() {
2288        assert_eq!(lex("local"), vec![Token::Local]);
2289        assert_eq!(
2290            lex("local X = 5"),
2291            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2292        );
2293    }
2294
2295    #[test]
2296    fn simple_var_ref() {
2297        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2298        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2299        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2300        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2301    }
2302
2303    #[test]
2304    fn simple_var_ref_in_command() {
2305        assert_eq!(
2306            lex("echo $NAME"),
2307            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2308        );
2309    }
2310
2311    #[test]
2312    fn single_quoted_strings() {
2313        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2314        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2315        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2316        // Single quotes don't process escapes or variables
2317        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2318        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2319    }
2320
2321    #[test]
2322    fn test_brackets() {
2323        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2324        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2325        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2326        assert_eq!(
2327            lex("[[ -f file ]]"),
2328            vec![
2329                Token::LBracket,
2330                Token::LBracket,
2331                Token::ShortFlag("f".to_string()),
2332                Token::Ident("file".to_string()),
2333                Token::RBracket,
2334                Token::RBracket
2335            ]
2336        );
2337    }
2338
2339    #[test]
2340    fn test_expression_syntax() {
2341        assert_eq!(
2342            lex(r#"[[ $X == "value" ]]"#),
2343            vec![
2344                Token::LBracket,
2345                Token::LBracket,
2346                Token::SimpleVarRef("X".to_string()),
2347                Token::EqEq,
2348                Token::String("value".to_string()),
2349                Token::RBracket,
2350                Token::RBracket
2351            ]
2352        );
2353    }
2354
2355    #[test]
2356    fn bash_style_assignment() {
2357        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2358        assert_eq!(
2359            lex(r#"NAME="value""#),
2360            vec![
2361                Token::Ident("NAME".to_string()),
2362                Token::Eq,
2363                Token::String("value".to_string())
2364            ]
2365        );
2366    }
2367
2368    #[test]
2369    fn positional_params() {
2370        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2371        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2372        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2373        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2374        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2375    }
2376
2377    #[test]
2378    fn positional_in_context() {
2379        assert_eq!(
2380            lex("echo $1 $2"),
2381            vec![
2382                Token::Ident("echo".to_string()),
2383                Token::Positional(1),
2384                Token::Positional(2),
2385            ]
2386        );
2387    }
2388
2389    #[test]
2390    fn var_length() {
2391        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2392        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2393        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2394    }
2395
2396    #[test]
2397    fn var_length_in_context() {
2398        assert_eq!(
2399            lex("echo ${#NAME}"),
2400            vec![
2401                Token::Ident("echo".to_string()),
2402                Token::VarLength("NAME".to_string()),
2403            ]
2404        );
2405    }
2406
2407    // ═══════════════════════════════════════════════════════════════════
2408    // Edge case tests: Flag ambiguities
2409    // ═══════════════════════════════════════════════════════════════════
2410
2411    #[test]
2412    fn plus_flag() {
2413        // Plus flags for set +e
2414        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2415        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2416        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2417    }
2418
2419    #[test]
2420    fn set_with_plus_flag() {
2421        assert_eq!(
2422            lex("set +e"),
2423            vec![
2424                Token::Set,
2425                Token::PlusFlag("e".to_string()),
2426            ]
2427        );
2428    }
2429
2430    #[test]
2431    fn set_with_multiple_flags() {
2432        assert_eq!(
2433            lex("set -e -u"),
2434            vec![
2435                Token::Set,
2436                Token::ShortFlag("e".to_string()),
2437                Token::ShortFlag("u".to_string()),
2438            ]
2439        );
2440    }
2441
2442    #[test]
2443    fn flags_vs_negative_numbers_edge_cases() {
2444        // -1a should be negative int followed by ident
2445        assert_eq!(
2446            lex("-1 a"),
2447            vec![Token::Int(-1), Token::Ident("a".to_string())]
2448        );
2449        // -l is a flag
2450        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2451        // -123 is negative number
2452        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2453    }
2454
2455    #[test]
2456    fn single_dash_is_minus_alone() {
2457        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2458        let result = tokenize("-").expect("should lex");
2459        assert_eq!(result.len(), 1);
2460        assert!(matches!(result[0].token, Token::MinusAlone));
2461    }
2462
2463    #[test]
2464    fn plus_bare_for_date_format() {
2465        // `date +%s` - the +%s should be PlusBare
2466        let result = tokenize("+%s").expect("should lex");
2467        assert_eq!(result.len(), 1);
2468        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2469
2470        // `date +%Y-%m-%d` - format string with dashes
2471        let result = tokenize("+%Y-%m-%d").expect("should lex");
2472        assert_eq!(result.len(), 1);
2473        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2474    }
2475
2476    #[test]
2477    fn plus_flag_still_works() {
2478        // `set +e` - should still be PlusFlag
2479        let result = tokenize("+e").expect("should lex");
2480        assert_eq!(result.len(), 1);
2481        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2482    }
2483
2484    #[test]
2485    fn while_keyword_vs_while_loop() {
2486        // 'while' as keyword in loop context
2487        assert_eq!(lex("while"), vec![Token::While]);
2488        // 'while' at start followed by condition
2489        assert_eq!(
2490            lex("while true"),
2491            vec![Token::While, Token::True]
2492        );
2493    }
2494
2495    #[test]
2496    fn control_flow_keywords() {
2497        assert_eq!(lex("break"), vec![Token::Break]);
2498        assert_eq!(lex("continue"), vec![Token::Continue]);
2499        assert_eq!(lex("return"), vec![Token::Return]);
2500        assert_eq!(lex("exit"), vec![Token::Exit]);
2501    }
2502
2503    #[test]
2504    fn control_flow_with_numbers() {
2505        assert_eq!(
2506            lex("break 2"),
2507            vec![Token::Break, Token::Int(2)]
2508        );
2509        assert_eq!(
2510            lex("continue 3"),
2511            vec![Token::Continue, Token::Int(3)]
2512        );
2513        assert_eq!(
2514            lex("exit 1"),
2515            vec![Token::Exit, Token::Int(1)]
2516        );
2517    }
2518
2519    // ═══════════════════════════════════════════════════════════════════
2520    // Here-doc tests
2521    // ═══════════════════════════════════════════════════════════════════
2522
2523    #[test]
2524    fn heredoc_simple() {
2525        let source = "cat <<EOF\nhello\nworld\nEOF";
2526        let tokens = lex(source);
2527        assert_eq!(tokens, vec![
2528            Token::Ident("cat".to_string()),
2529            Token::HereDocStart,
2530            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2531            Token::Newline,
2532        ]);
2533    }
2534
2535    #[test]
2536    fn heredoc_empty() {
2537        let source = "cat <<EOF\nEOF";
2538        let tokens = lex(source);
2539        assert_eq!(tokens, vec![
2540            Token::Ident("cat".to_string()),
2541            Token::HereDocStart,
2542            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2543            Token::Newline,
2544        ]);
2545    }
2546
2547    #[test]
2548    fn heredoc_with_special_chars() {
2549        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2550        let tokens = lex(source);
2551        assert_eq!(tokens, vec![
2552            Token::Ident("cat".to_string()),
2553            Token::HereDocStart,
2554            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2555            Token::Newline,
2556        ]);
2557    }
2558
2559    #[test]
2560    fn heredoc_multiline() {
2561        let source = "cat <<END\nline1\nline2\nline3\nEND";
2562        let tokens = lex(source);
2563        assert_eq!(tokens, vec![
2564            Token::Ident("cat".to_string()),
2565            Token::HereDocStart,
2566            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2567            Token::Newline,
2568        ]);
2569    }
2570
2571    #[test]
2572    fn heredoc_in_command() {
2573        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2574        let tokens = lex(source);
2575        assert_eq!(tokens, vec![
2576            Token::Ident("cat".to_string()),
2577            Token::HereDocStart,
2578            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2579            Token::Newline,
2580            Token::Ident("echo".to_string()),
2581            Token::Ident("goodbye".to_string()),
2582        ]);
2583    }
2584
2585    #[test]
2586    fn heredoc_strip_tabs() {
2587        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2588        let tokens = lex(source);
2589        // Content has tabs preserved, only delimiter matching strips tabs
2590        assert_eq!(tokens, vec![
2591            Token::Ident("cat".to_string()),
2592            Token::HereDocStart,
2593            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2594            Token::Newline,
2595        ]);
2596    }
2597
2598    // ═══════════════════════════════════════════════════════════════════
2599    // Arithmetic expression tests
2600    // ═══════════════════════════════════════════════════════════════════
2601
2602    #[test]
2603    fn arithmetic_simple() {
2604        let source = "$((1 + 2))";
2605        let tokens = lex(source);
2606        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2607    }
2608
2609    #[test]
2610    fn arithmetic_in_assignment() {
2611        let source = "X=$((5 * 3))";
2612        let tokens = lex(source);
2613        assert_eq!(tokens, vec![
2614            Token::Ident("X".to_string()),
2615            Token::Eq,
2616            Token::Arithmetic("5 * 3".to_string()),
2617        ]);
2618    }
2619
2620    #[test]
2621    fn arithmetic_with_nested_parens() {
2622        let source = "$((2 * (3 + 4)))";
2623        let tokens = lex(source);
2624        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2625    }
2626
2627    #[test]
2628    fn arithmetic_with_variable() {
2629        let source = "$((X + 1))";
2630        let tokens = lex(source);
2631        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2632    }
2633
2634    #[test]
2635    fn arithmetic_command_subst_not_confused() {
2636        // $( should not be treated as arithmetic
2637        let source = "$(echo hello)";
2638        let tokens = lex(source);
2639        assert_eq!(tokens, vec![
2640            Token::CmdSubstStart,
2641            Token::Ident("echo".to_string()),
2642            Token::Ident("hello".to_string()),
2643            Token::RParen,
2644        ]);
2645    }
2646
2647    #[test]
2648    fn arithmetic_nesting_limit() {
2649        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
2650        let open_parens = "(".repeat(300);
2651        let close_parens = ")".repeat(300);
2652        let source = format!("$(({}1{}))", open_parens, close_parens);
2653        let result = tokenize(&source);
2654        assert!(result.is_err());
2655        let errors = result.unwrap_err();
2656        assert_eq!(errors.len(), 1);
2657        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2658    }
2659
2660    #[test]
2661    fn arithmetic_nesting_within_limit() {
2662        // Nesting within limit should work
2663        let source = "$((((1 + 2) * 3)))";
2664        let tokens = lex(source);
2665        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2666    }
2667
2668    // ═══════════════════════════════════════════════════════════════════
2669    // Token category tests
2670    // ═══════════════════════════════════════════════════════════════════
2671
2672    #[test]
2673    fn token_categories() {
2674        // Keywords
2675        assert_eq!(Token::If.category(), TokenCategory::Keyword);
2676        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2677        assert_eq!(Token::For.category(), TokenCategory::Keyword);
2678        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2679        assert_eq!(Token::True.category(), TokenCategory::Keyword);
2680        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2681
2682        // Operators
2683        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2684        assert_eq!(Token::And.category(), TokenCategory::Operator);
2685        assert_eq!(Token::Or.category(), TokenCategory::Operator);
2686        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2687        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2688
2689        // Strings
2690        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2691        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2692        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2693
2694        // Numbers
2695        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2696        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2697        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2698
2699        // Variables
2700        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2701        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2702        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2703        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2704        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2705        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2706        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2707
2708        // Flags
2709        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2710        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2711        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2712        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2713
2714        // Punctuation
2715        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2716        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2717        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2718        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2719
2720        // Comments
2721        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2722
2723        // Paths
2724        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2725
2726        // Commands
2727        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2728
2729        // Errors
2730        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2731        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2732        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2733    }
2734
2735    #[test]
2736    fn test_heredoc_piped_to_command() {
2737        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
2738        // Not: cat | jq <<heredoc
2739        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2740        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2741        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2742        assert!(heredoc_pos.is_some(), "should have a heredoc token");
2743        assert!(pipe_pos.is_some(), "should have a pipe token");
2744        assert!(
2745            pipe_pos.unwrap() > heredoc_pos.unwrap(),
2746            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2747            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2748        );
2749    }
2750
2751    #[test]
2752    fn test_heredoc_standalone_still_works() {
2753        // Regression: standalone heredoc (no pipe) must still work
2754        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2755        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2756        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2757    }
2758
2759    #[test]
2760    fn test_heredoc_preserves_leading_empty_lines() {
2761        // Bug B: heredoc starting with a blank line must preserve it
2762        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2763        let heredoc = tokens.iter().find_map(|t| {
2764            if let Token::HereDoc(data) = &t.token {
2765                Some(data.clone())
2766            } else {
2767                None
2768            }
2769        });
2770        assert!(heredoc.is_some(), "should have a heredoc token");
2771        let data = heredoc.unwrap();
2772        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2773        assert_eq!(data.content, "\nhello");
2774    }
2775
2776    #[test]
2777    fn test_heredoc_quoted_delimiter_sets_literal() {
2778        // Bug N: quoted delimiter (<<'EOF') should set literal=true
2779        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2780        let heredoc = tokens.iter().find_map(|t| {
2781            if let Token::HereDoc(data) = &t.token {
2782                Some(data.clone())
2783            } else {
2784                None
2785            }
2786        });
2787        assert!(heredoc.is_some(), "should have a heredoc token");
2788        let data = heredoc.unwrap();
2789        assert!(data.literal, "quoted delimiter should set literal=true");
2790        assert_eq!(data.content, "hello $HOME");
2791    }
2792
2793    #[test]
2794    fn test_heredoc_unquoted_delimiter_not_literal() {
2795        // Bug N: unquoted delimiter (<<EOF) should have literal=false
2796        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2797        let heredoc = tokens.iter().find_map(|t| {
2798            if let Token::HereDoc(data) = &t.token {
2799                Some(data.clone())
2800            } else {
2801                None
2802            }
2803        });
2804        assert!(heredoc.is_some(), "should have a heredoc token");
2805        let data = heredoc.unwrap();
2806        assert!(!data.literal, "unquoted delimiter should have literal=false");
2807    }
2808}