Skip to main content

kaish_kernel/
lexer.rs

//! Lexer for kaish source code.
//!
//! Converts source text into a stream of tokens using the logos lexer generator.
//! The lexer is designed to be unambiguous: every valid input produces exactly
//! one token sequence, and invalid input produces clear errors.
//!
//! # Token Categories
//!
//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`, `do`, `done`, `case`, `esac`, `function`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `|`, `&`, `>`, `>>`, `<`, `2>`, `&>`, `&&`, `||`
//! - **Punctuation**: `;`, `:`, `,`, `.`, `{`, `}`, `[`, `]`
//! - **Variable references**: `${...}` with nested path access
//! - **Identifiers**: command names, variable names, parameter names
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
/// Incremented with a relaxed `fetch_add`; uniqueness comes from combining
/// this counter with a timestamp and the process id (see `unique_marker_id`).
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// All positions here are byte offsets in the *preprocessed* text.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    let pid = std::process::id();
82    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token with its span in the source text.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    /// The wrapped token value.
    pub token: T,
    /// Byte range the token occupies in the source text.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs a token with the byte span it was lexed from.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Lexer error types.
///
/// `UnexpectedCharacter` carries the `#[default]` attribute so it is the
/// error produced when input matches no token pattern.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Input contained a character that matches no token pattern.
    #[default]
    UnexpectedCharacter,
    /// A string literal was opened but never closed.
    UnterminatedString,
    /// A `${...}` variable reference was opened but never closed.
    UnterminatedVarRef,
    /// A backslash escape inside a string was not recognized.
    InvalidEscape,
    /// A numeric literal could not be parsed.
    InvalidNumber,
    /// Mixed-case boolean such as `TRUE`; carries the offending text.
    AmbiguousBoolean(String),
    /// A `yes`/`no` style word that looks boolean; carries the offending text.
    AmbiguousBooleanLike(String),
    /// An identifier starting with a digit (like `123abc`); carries the text.
    InvalidNumberIdent(String),
    /// Float literal missing its leading digit (like `.5`).
    InvalidFloatNoLeading,
    /// Float literal missing its trailing digit (like `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118        match self {
119            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120            LexerError::UnterminatedString => write!(f, "unterminated string"),
121            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123            LexerError::InvalidNumber => write!(f, "invalid number"),
124            LexerError::AmbiguousBoolean(s) => {
125                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126            }
127            LexerError::AmbiguousBooleanLike(s) => {
128                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130            }
131            LexerError::InvalidNumberIdent(s) => {
132                write!(f, "identifier cannot start with digit: {}", s)
133            }
134            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137        }
138    }
139}
140
141/// Tokens produced by the kaish lexer.
142///
143/// The order of variants matters for logos priority. More specific patterns
144/// (like keywords) should come before more general ones (like identifiers).
145///
146/// Tokens that carry semantic values (strings, numbers, identifiers) include
147/// the parsed value directly. This ensures the parser has access to actual
148/// data, not just token types.
149/// Here-doc content data.
150/// `literal` is true when the delimiter was quoted (<<'EOF' or <<"EOF"),
151/// meaning no variable expansion should occur.
152#[derive(Debug, Clone, PartialEq)]
153pub struct HereDocData {
154    pub content: String,
155    pub literal: bool,
156}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162    // ═══════════════════════════════════════════════════════════════════
163    // Keywords (must come before Ident for priority)
164    // ═══════════════════════════════════════════════════════════════════
165    #[token("set")]
166    Set,
167
168    #[token("local")]
169    Local,
170
171    #[token("if")]
172    If,
173
174    #[token("then")]
175    Then,
176
177    #[token("else")]
178    Else,
179
180    #[token("elif")]
181    Elif,
182
183    #[token("fi")]
184    Fi,
185
186    #[token("for")]
187    For,
188
189    #[token("while")]
190    While,
191
192    #[token("in")]
193    In,
194
195    #[token("do")]
196    Do,
197
198    #[token("done")]
199    Done,
200
201    #[token("case")]
202    Case,
203
204    #[token("esac")]
205    Esac,
206
207    #[token("function")]
208    Function,
209
210    #[token("break")]
211    Break,
212
213    #[token("continue")]
214    Continue,
215
216    #[token("return")]
217    Return,
218
219    #[token("exit")]
220    Exit,
221
222    #[token("true")]
223    True,
224
225    #[token("false")]
226    False,
227
228    // ═══════════════════════════════════════════════════════════════════
229    // Type keywords (for tool parameters)
230    // ═══════════════════════════════════════════════════════════════════
231    #[token("string")]
232    TypeString,
233
234    #[token("int")]
235    TypeInt,
236
237    #[token("float")]
238    TypeFloat,
239
240    #[token("bool")]
241    TypeBool,
242
243    // ═══════════════════════════════════════════════════════════════════
244    // Multi-character operators (must come before single-char versions)
245    // ═══════════════════════════════════════════════════════════════════
246    #[token("&&")]
247    And,
248
249    #[token("||")]
250    Or,
251
252    #[token("==")]
253    EqEq,
254
255    #[token("!=")]
256    NotEq,
257
258    #[token("=~")]
259    Match,
260
261    #[token("!~")]
262    NotMatch,
263
264    #[token(">=")]
265    GtEq,
266
267    #[token("<=")]
268    LtEq,
269
270    #[token(">>")]
271    GtGt,
272
273    #[token("2>&1")]
274    StderrToStdout,
275
276    #[token("1>&2")]
277    StdoutToStderr,
278
279    #[token(">&2")]
280    StdoutToStderr2,
281
282    #[token("2>")]
283    Stderr,
284
285    #[token("&>")]
286    Both,
287
288    #[token("<<")]
289    HereDocStart,
290
291    #[token(";;")]
292    DoubleSemi,
293
294    // ═══════════════════════════════════════════════════════════════════
295    // Single-character operators and punctuation
296    // ═══════════════════════════════════════════════════════════════════
297    #[token("=")]
298    Eq,
299
300    #[token("|")]
301    Pipe,
302
303    #[token("&")]
304    Amp,
305
306    #[token(">")]
307    Gt,
308
309    #[token("<")]
310    Lt,
311
312    #[token(";")]
313    Semi,
314
315    #[token(":")]
316    Colon,
317
318    #[token(",")]
319    Comma,
320
321    #[token("..")]
322    DotDot,
323
324    #[token(".")]
325    Dot,
326
327    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
328    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
329    TildePath(String),
330
331    /// Bare tilde: `~` alone (expands to $HOME)
332    #[token("~")]
333    Tilde,
334
335    /// Relative path starting with `../`: `../foo/bar`
336    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
337    RelativePath(String),
338
339    /// Dot-slash path: `./foo`, `./script.sh`
340    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
341    DotSlashPath(String),
342
343    #[token("{")]
344    LBrace,
345
346    #[token("}")]
347    RBrace,
348
349    #[token("[")]
350    LBracket,
351
352    #[token("]")]
353    RBracket,
354
355    #[token("(")]
356    LParen,
357
358    #[token(")")]
359    RParen,
360
361    #[token("*")]
362    Star,
363
364    #[token("!")]
365    Bang,
366
367    #[token("?")]
368    Question,
369
370    /// Merged glob word: span-adjacent tokens containing `*`, `?`, or `[...]`.
371    /// Synthesized by `merge_glob_adjacent()`, never produced by logos directly.
372    GlobWord(String),
373
374    // ═══════════════════════════════════════════════════════════════════
375    // Command substitution
376    // ═══════════════════════════════════════════════════════════════════
377
378    /// Arithmetic expression content: synthesized by preprocessing.
379    /// Contains the expression string between `$((` and `))`.
380    Arithmetic(String),
381
382    /// Command substitution start: `$(` - begins a command substitution
383    #[token("$(")]
384    CmdSubstStart,
385
386    // ═══════════════════════════════════════════════════════════════════
387    // Flags (must come before Int to win over negative numbers)
388    // ═══════════════════════════════════════════════════════════════════
389
390    /// Long flag: `--name` or `--foo-bar`
391    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
392    LongFlag(String),
393
394    /// Short flag: `-l` or `-la` (combined short flags)
395    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
396    ShortFlag(String),
397
398    /// Plus flag: `+e` or `+x` (for set +e to disable options)
399    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
400    PlusFlag(String),
401
402    /// Double dash: `--` alone marks end of flags
403    #[token("--")]
404    DoubleDash,
405
406    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
407    /// For date format strings and similar. Lower priority than PlusFlag.
408    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
409    PlusBare(String),
410
411    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
412    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
413    /// Excludes - after first - to avoid matching --name patterns.
414    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
415    MinusBare(String),
416
417    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
418    /// Only matches when followed by whitespace or end.
419    /// This is handled specially in the parser as a positional arg.
420    #[token("-")]
421    MinusAlone,
422
423    // ═══════════════════════════════════════════════════════════════════
424    // Literals (with values)
425    // ═══════════════════════════════════════════════════════════════════
426
427    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
428    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
429    String(String),
430
431    /// Single-quoted string: `'...'` - literal content, no escape processing
432    #[regex(r"'[^']*'", lex_single_string)]
433    SingleString(String),
434
435    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the raw inner content
436    #[regex(r"\$\{[^}]+\}", lex_varref)]
437    VarRef(String),
438
439    /// Simple variable reference: `$NAME` - just the identifier
440    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
441    SimpleVarRef(String),
442
443    /// Positional parameter: `$0` through `$9`
444    #[regex(r"\$[0-9]", lex_positional)]
445    Positional(usize),
446
447    /// All positional parameters: `$@`
448    #[token("$@")]
449    AllArgs,
450
451    /// Number of positional parameters: `$#`
452    #[token("$#")]
453    ArgCount,
454
455    /// Last exit code: `$?`
456    #[token("$?")]
457    LastExitCode,
458
459    /// Current shell PID: `$$`
460    #[token("$$")]
461    CurrentPid,
462
463    /// Variable string length: `${#VAR}`
464    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
465    VarLength(String),
466
467    /// Here-doc content: synthesized by preprocessing, not directly lexed.
468    /// Contains the full content of the here-doc (without the delimiter lines).
469    HereDoc(HereDocData),
470
471    /// Integer literal - value is the parsed i64
472    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
473    Int(i64),
474
475    /// Float literal - value is the parsed f64
476    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
477    Float(f64),
478
479    // ═══════════════════════════════════════════════════════════════════
480    // Invalid patterns (caught before valid tokens for better errors)
481    // ═══════════════════════════════════════════════════════════════════
482
483    /// Invalid: number followed by identifier characters (like 123abc)
484    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
485    InvalidNumberIdent,
486
487    /// Invalid: float without leading digit (like .5)
488    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
489    InvalidFloatNoLeading,
490
491    /// Invalid: float without trailing digit (like 5.)
492    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
493    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
494    InvalidFloatNoTrailing,
495
496    // ═══════════════════════════════════════════════════════════════════
497    // Paths (absolute paths starting with /)
498    // ═══════════════════════════════════════════════════════════════════
499
500    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
501    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
502    Path(String),
503
504    // ═══════════════════════════════════════════════════════════════════
505    // Identifiers (command names, variable names, etc.)
506    // ═══════════════════════════════════════════════════════════════════
507
508    /// Identifier - value is the identifier string
509    /// Allows dots for filenames like `script.kai`
510    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
511    Ident(String),
512
513    // ═══════════════════════════════════════════════════════════════════
514    // Structural tokens
515    // ═══════════════════════════════════════════════════════════════════
516
517    /// Comment: `# ...` to end of line
518    #[regex(r"#[^\n\r]*", allow_greedy = true)]
519    Comment,
520
521    /// Newline (significant in kaish - ends statements)
522    #[regex(r"\n|\r\n")]
523    Newline,
524
525    /// Line continuation: backslash at end of line
526    #[regex(r"\\[ \t]*(\n|\r\n)")]
527    LineContinuation,
528}
529
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; , . ( ) { } [ ]
    Punctuation,
    /// Identifiers in command position
    Command,
    /// Absolute paths: /foo/bar (glob words are also mapped here)
    Path,
    /// Flags: --long, -s, +x
    Flag,
    /// Invalid tokens
    Error,
}
559
560impl Token {
561    /// Returns the semantic category for syntax highlighting.
562    pub fn category(&self) -> TokenCategory {
563        match self {
564            // Keywords
565            Token::If
566            | Token::Then
567            | Token::Else
568            | Token::Elif
569            | Token::Fi
570            | Token::For
571            | Token::In
572            | Token::Do
573            | Token::Done
574            | Token::While
575            | Token::Case
576            | Token::Esac
577            | Token::Function
578            | Token::Return
579            | Token::Break
580            | Token::Continue
581            | Token::Exit
582            | Token::Set
583            | Token::Local
584            | Token::True
585            | Token::False
586            | Token::TypeString
587            | Token::TypeInt
588            | Token::TypeFloat
589            | Token::TypeBool => TokenCategory::Keyword,
590
591            // Operators and redirections
592            Token::Pipe
593            | Token::And
594            | Token::Or
595            | Token::Amp
596            | Token::Eq
597            | Token::EqEq
598            | Token::NotEq
599            | Token::Match
600            | Token::NotMatch
601            | Token::Lt
602            | Token::Gt
603            | Token::LtEq
604            | Token::GtEq
605            | Token::GtGt
606            | Token::Stderr
607            | Token::Both
608            | Token::HereDocStart
609            | Token::StderrToStdout
610            | Token::StdoutToStderr
611            | Token::StdoutToStderr2 => TokenCategory::Operator,
612
613            // Strings
614            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
615
616            // Numbers
617            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
618
619            // Variables
620            Token::VarRef(_)
621            | Token::SimpleVarRef(_)
622            | Token::Positional(_)
623            | Token::AllArgs
624            | Token::ArgCount
625            | Token::VarLength(_)
626            | Token::LastExitCode
627            | Token::CurrentPid => TokenCategory::Variable,
628
629            // Flags
630            Token::LongFlag(_)
631            | Token::ShortFlag(_)
632            | Token::PlusFlag(_)
633            | Token::DoubleDash => TokenCategory::Flag,
634
635            // Punctuation
636            Token::Semi
637            | Token::DoubleSemi
638            | Token::Colon
639            | Token::Comma
640            | Token::Dot
641            | Token::LParen
642            | Token::RParen
643            | Token::LBrace
644            | Token::RBrace
645            | Token::LBracket
646            | Token::RBracket
647            | Token::Bang
648            | Token::Question
649            | Token::Star
650            | Token::Newline
651            | Token::LineContinuation
652            | Token::CmdSubstStart => TokenCategory::Punctuation,
653
654            // Glob words (merged tokens containing wildcards)
655            Token::GlobWord(_) => TokenCategory::Path,
656
657            // Comments
658            Token::Comment => TokenCategory::Comment,
659
660            // Paths
661            Token::Path(_)
662            | Token::TildePath(_)
663            | Token::RelativePath(_)
664            | Token::Tilde
665            | Token::DotDot
666            | Token::DotSlashPath(_) => TokenCategory::Path,
667
668            // Commands/identifiers (and bare words)
669            Token::Ident(_)
670            | Token::PlusBare(_)
671            | Token::MinusBare(_)
672            | Token::MinusAlone => TokenCategory::Command,
673
674            // Errors
675            Token::InvalidNumberIdent
676            | Token::InvalidFloatNoLeading
677            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
678        }
679    }
680}
681
682/// Lex a double-quoted string literal, processing escape sequences.
683fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
684    parse_string_literal(lex.slice())
685}
686
687/// Lex a single-quoted string literal (no escape processing).
688fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
689    let s = lex.slice();
690    // Strip the surrounding single quotes
691    s[1..s.len() - 1].to_string()
692}
693
694/// Lex a braced variable reference, extracting the inner content.
695fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
696    // Keep the full ${...} for later parsing of path segments
697    lex.slice().to_string()
698}
699
700/// Lex a simple variable reference: `$NAME` → `NAME`
701fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
702    // Strip the leading `$`
703    lex.slice()[1..].to_string()
704}
705
706/// Lex a positional parameter: `$1` → 1
707fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
708    // Strip the leading `$` and parse the digit
709    lex.slice()[1..].parse().unwrap_or(0)
710}
711
712/// Lex a variable length: `${#VAR}` → "VAR"
713fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
714    // Strip the leading `${#` and trailing `}`
715    let s = lex.slice();
716    s[3..s.len() - 1].to_string()
717}
718
719/// Lex an integer literal.
720fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
721    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
722}
723
724/// Lex a float literal.
725fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
726    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
727}
728
729/// Lex an invalid number-identifier pattern (like 123abc).
730/// Always returns Err to produce a lexer error instead of a token.
731fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
732    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
733}
734
735/// Lex an invalid float without leading digit (like .5).
736/// Always returns Err to produce a lexer error instead of a token.
737fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
738    Err(LexerError::InvalidFloatNoLeading)
739}
740
741/// Lex an invalid float without trailing digit (like 5.).
742/// Always returns Err to produce a lexer error instead of a token.
743fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
744    Err(LexerError::InvalidFloatNoTrailing)
745}
746
747/// Lex an identifier, rejecting ambiguous boolean-like values.
748fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
749    let s = lex.slice();
750
751    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
752    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
753    match s.to_lowercase().as_str() {
754        "true" | "false" if s != "true" && s != "false" => {
755            return Err(LexerError::AmbiguousBoolean(s.to_string()));
756        }
757        _ => {}
758    }
759
760    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
761    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
762        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
763    }
764
765    Ok(s.to_string())
766}
767
768/// Lex a long flag: `--name` → `name`
769fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
770    // Strip the leading `--`
771    lex.slice()[2..].to_string()
772}
773
774/// Lex a short flag: `-l` → `l`, `-la` → `la`
775fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
776    // Strip the leading `-`
777    lex.slice()[1..].to_string()
778}
779
780/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
781fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
782    // Strip the leading `+`
783    lex.slice()[1..].to_string()
784}
785
786/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
787fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
788    lex.slice().to_string()
789}
790
791/// Lex a minus bare word: `-%` → `-%` (keep the full string)
792fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
793    lex.slice().to_string()
794}
795
796/// Lex an absolute path: `/tmp/out` → `/tmp/out`
797fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
798    lex.slice().to_string()
799}
800
801/// Lex a tilde path: `~/foo` → `~/foo`
802fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
803    lex.slice().to_string()
804}
805
806/// Lex a relative path: `../foo` → `../foo`
807fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
808    lex.slice().to_string()
809}
810
811/// Lex a dot-slash path: `./foo` → `./foo`
812fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
813    lex.slice().to_string()
814}
815
// Human-readable rendering of tokens, used in diagnostics and debug output.
// Fixed tokens render as their source text; value-carrying tokens render as
// NAME(value) so the payload is visible.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Tilde => write!(f, "~"),
            Token::TildePath(s) => write!(f, "{}", s),
            Token::RelativePath(s) => write!(f, "{}", s),
            Token::DotSlashPath(s) => write!(f, "{}", s),
            // Literal braces must be doubled inside format strings.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::GlobWord(s) => write!(f, "GLOB({})", s),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            // These variants should never be produced - their callbacks always return errors
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
918
919impl Token {
920    /// Returns true if this token is a keyword.
921    pub fn is_keyword(&self) -> bool {
922        matches!(
923            self,
924            Token::Set
925                | Token::Local
926                | Token::If
927                | Token::Then
928                | Token::Else
929                | Token::Elif
930                | Token::Fi
931                | Token::For
932                | Token::In
933                | Token::Do
934                | Token::Done
935                | Token::Case
936                | Token::Esac
937                | Token::Function
938                | Token::True
939                | Token::False
940        )
941    }
942
943    /// Returns true if this token is a type keyword.
944    pub fn is_type(&self) -> bool {
945        matches!(
946            self,
947            Token::TypeString
948                | Token::TypeInt
949                | Token::TypeFloat
950                | Token::TypeBool
951        )
952    }
953
954    /// Returns true if this token starts a statement.
955    pub fn starts_statement(&self) -> bool {
956        matches!(
957            self,
958            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
959        )
960    }
961
962    /// Returns true if this token can appear in an expression.
963    pub fn is_value(&self) -> bool {
964        matches!(
965            self,
966            Token::String(_)
967                | Token::SingleString(_)
968                | Token::HereDoc(_)
969                | Token::Arithmetic(_)
970                | Token::Int(_)
971                | Token::Float(_)
972                | Token::True
973                | Token::False
974                | Token::VarRef(_)
975                | Token::SimpleVarRef(_)
976                | Token::CmdSubstStart
977                | Token::Path(_)
978                | Token::GlobWord(_)
979                | Token::LastExitCode
980                | Token::CurrentPid
981        )
982    }
983}
984
/// Result of preprocessing arithmetic expressions.
///
/// Produced by `preprocess_arithmetic` and consumed by `tokenize`, which
/// lexes `text`, swaps each marker back for a `Token::Arithmetic`, and uses
/// `replacements` to map token spans back to original-source coordinates.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions.
    replacements: Vec<SpanReplacement>,
}
994
/// Skip a `$(...)` command substitution, copying it verbatim to `result`.
///
/// Paren matching is quote-aware: parentheses inside single quotes, inside
/// double quotes, or behind a backslash escape do not affect the nesting
/// depth, so substitutions like `$(echo ")")` are skipped as one unit.
///
/// On entry, `i` must point at the `$` of `$(`. On exit, `i` (and the byte
/// counter `source_pos`) point just past the matching `)`, or at end of
/// input when the substitution is unterminated.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Emit the leading `$(` and step past it (two ASCII bytes).
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1; // unmatched '(' count; 0 once we're done
    let mut in_single = false; // inside '...'
    let mut in_double = false; // inside "..."

    while open_parens > 0 {
        let Some(&ch) = chars.get(*i) else { break };

        // Single quotes: everything is literal until the closing quote.
        if in_single {
            if ch == '\'' {
                in_single = false;
            }
            result.push(ch);
            *source_pos += ch.len_utf8();
            *i += 1;
            continue;
        }

        // Double quotes: only \" \\ \$ \` form escape pairs; parens are literal.
        if in_double {
            if ch == '\\' {
                if let Some(&next) = chars.get(*i + 1) {
                    if matches!(next, '"' | '\\' | '$' | '`') {
                        result.push(ch);
                        result.push(next);
                        *source_pos += ch.len_utf8() + next.len_utf8();
                        *i += 2;
                        continue;
                    }
                }
            }
            if ch == '"' {
                in_double = false;
            }
            result.push(ch);
            *source_pos += ch.len_utf8();
            *i += 1;
            continue;
        }

        // Unquoted: a backslash escapes the following char verbatim.
        if ch == '\\' {
            if let Some(&next) = chars.get(*i + 1) {
                result.push(ch);
                result.push(next);
                *source_pos += ch.len_utf8() + next.len_utf8();
                *i += 2;
                continue;
            }
        }

        // Unquoted: track quote entry and paren depth; everything copies through.
        match ch {
            '\'' => in_single = true,
            '"' => in_double = true,
            '(' => open_parens += 1,
            ')' => open_parens -= 1,
            _ => {}
        }
        result.push(ch);
        *source_pos += ch.len_utf8();
        *i += 1;
    }
}
1092
/// Preprocess arithmetic expressions in source code.
///
/// Finds `$((expr))` patterns and replaces them with markers.
/// Returns the preprocessed source, arithmetic contents, and span replacement info.
///
/// Quote rules mirror shell semantics: `$((` inside single quotes is literal
/// (copied through untouched), while arithmetic inside double quotes IS
/// extracted. `$(...)` command substitutions are copied verbatim so any
/// `$((` within them belongs to the nested command, not to this pass.
///
/// Example:
///   `X=$((1 + 2))`
/// Becomes:
///   `X=__KAISH_ARITH_{id}__`
/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
///
/// # Errors
/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into the ORIGINAL source (advanced by len_utf8), used to
    // compute how many original bytes each marker replaces.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    // Whether we're currently inside double quotes. Single quotes inside
    // double quotes are literal characters, not quote delimiters.
    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Backslash escape outside quotes — copy both chars verbatim so an
        // escaped `\$` can never start an arithmetic expansion.
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single quote — only starts quote mode when NOT inside double quotes.
        // Everything up to the closing quote is copied verbatim.
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1;
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                result.push(chars_vec[i]); // closing quote
                source_pos += 1;
                i += 1;
            }
            continue;
        }

        // Double quote — toggle state (arithmetic is still expanded inside)
        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Backslash escape inside double quotes — only \" \\ \$ \` are
        // escape pairs; any other \x is left for later passes to interpret.
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // Skip $(...) command substitutions — inner arithmetic belongs to the
        // subcommand. The third-char check excludes `$((`, which is ours.
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // Look for $(( (potential arithmetic)
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip $(( (three ASCII bytes)
            i += 3;
            source_pos += 3;

            // Collect expression until matching ))
            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // Found closing ))
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Single ) inside - keep going
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Calculate original length: from $(( through the closing )).
            // NOTE(review): an unterminated `$((expr` reaches EOF without a
            // `))` and still emits a marker with the partial expression —
            // confirm this lenient recovery is intended rather than an error.
            let original_len = source_pos - original_start;

            // Create a unique marker for this arithmetic (collision-resistant)
            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            // Record the replacement for span correction
            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1257
1258/// Preprocess here-docs in source code.
1259///
1260/// Finds `<<WORD` patterns and collects content until the delimiter line.
1261/// Returns the preprocessed source and a vector of (marker, content) pairs.
1262///
1263/// Example:
1264///   `cat <<EOF\nhello\nworld\nEOF`
1265/// Becomes:
1266///   `cat <<__HEREDOC_0__`
1267/// With heredocs[0] = ("__HEREDOC_0__", "hello\nworld")
1268fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
1269    let mut result = String::with_capacity(source.len());
1270    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
1271    let mut chars = source.chars().peekable();
1272
1273    while let Some(ch) = chars.next() {
1274        // Look for << (potential here-doc)
1275        if ch == '<' && chars.peek() == Some(&'<') {
1276            chars.next(); // consume second <
1277
1278            // Check for optional - (strip leading tabs)
1279            let strip_tabs = chars.peek() == Some(&'-');
1280            if strip_tabs {
1281                chars.next();
1282            }
1283
1284            // Skip whitespace before delimiter
1285            while let Some(&c) = chars.peek() {
1286                if c == ' ' || c == '\t' {
1287                    chars.next();
1288                } else {
1289                    break;
1290                }
1291            }
1292
1293            // Collect the delimiter word
1294            let mut delimiter = String::new();
1295            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
1296            let quote_char = if quoted { chars.next() } else { None };
1297
1298            while let Some(&c) = chars.peek() {
1299                if quoted {
1300                    if Some(c) == quote_char {
1301                        chars.next(); // consume closing quote
1302                        break;
1303                    }
1304                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1305                    break;
1306                }
1307                if let Some(ch) = chars.next() {
1308                    delimiter.push(ch);
1309                }
1310            }
1311
1312            if delimiter.is_empty() {
1313                // Not a valid here-doc, output << literally
1314                result.push_str("<<");
1315                if strip_tabs {
1316                    result.push('-');
1317                }
1318                continue;
1319            }
1320
1321            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1322            // This must be emitted AFTER the heredoc marker, not before.
1323            let mut after_delimiter = String::new();
1324            while let Some(&c) = chars.peek() {
1325                if c == '\n' {
1326                    chars.next();
1327                    break;
1328                } else if c == '\r' {
1329                    chars.next();
1330                    if chars.peek() == Some(&'\n') {
1331                        chars.next();
1332                    }
1333                    break;
1334                }
1335                if let Some(ch) = chars.next() {
1336                    after_delimiter.push(ch);
1337                }
1338            }
1339
1340            // Collect content until delimiter on its own line
1341            let mut content = String::new();
1342            let mut current_line = String::new();
1343
1344            loop {
1345                match chars.next() {
1346                    Some('\n') => {
1347                        // Check if this line is the delimiter
1348                        let trimmed = if strip_tabs {
1349                            current_line.trim_start_matches('\t')
1350                        } else {
1351                            &current_line
1352                        };
1353                        if trimmed == delimiter {
1354                            // Found end of here-doc
1355                            break;
1356                        }
1357                        // Add line to content (including empty lines)
1358                        content.push_str(&current_line);
1359                        content.push('\n');
1360                        current_line.clear();
1361                    }
1362                    Some('\r') => {
1363                        // Handle \r\n
1364                        if chars.peek() == Some(&'\n') {
1365                            chars.next();
1366                        }
1367                        let trimmed = if strip_tabs {
1368                            current_line.trim_start_matches('\t')
1369                        } else {
1370                            &current_line
1371                        };
1372                        if trimmed == delimiter {
1373                            break;
1374                        }
1375                        content.push_str(&current_line);
1376                        content.push('\n');
1377                        current_line.clear();
1378                    }
1379                    Some(c) => {
1380                        current_line.push(c);
1381                    }
1382                    None => {
1383                        // EOF - check if current line is the delimiter
1384                        let trimmed = if strip_tabs {
1385                            current_line.trim_start_matches('\t')
1386                        } else {
1387                            &current_line
1388                        };
1389                        if trimmed == delimiter {
1390                            // Found delimiter at EOF
1391                            break;
1392                        }
1393                        // Not a delimiter - include remaining content
1394                        if !current_line.is_empty() {
1395                            content.push_str(&current_line);
1396                        }
1397                        break;
1398                    }
1399                }
1400            }
1401
1402            // Remove trailing newline from content (we'll add it when needed)
1403            let content = content.trim_end_matches('\n').to_string();
1404
1405            // Create a unique marker for this here-doc (collision-resistant)
1406            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1407            heredocs.push((marker.clone(), content, quoted));
1408
1409            // Output <<marker first, then any text that followed the delimiter
1410            // (e.g., " | jq") so the heredoc attaches to the correct command.
1411            result.push_str("<<");
1412            result.push_str(&marker);
1413            result.push_str(&after_delimiter);
1414            result.push('\n');
1415        } else {
1416            result.push(ch);
1417        }
1418    }
1419
1420    (result, heredocs)
1421}
1422
1423/// Extract the text contribution of a token for colon-adjacent merging.
1424///
1425/// Returns `Some(text)` for token types that can participate in word-like
1426/// merging, `None` for everything else.
1427fn mergeable_text(token: &Token) -> Option<String> {
1428    match token {
1429        Token::Ident(s) => Some(s.clone()),
1430        Token::Colon => Some(":".to_string()),
1431        Token::Int(n) => Some(n.to_string()),
1432        Token::Path(p) => Some(p.clone()),
1433        Token::Float(f) => Some(f.to_string()),
1434        _ => None,
1435    }
1436}
1437
1438/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1439///
1440/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1441/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1442///
1443/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1444/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1445/// colons or standalone tokens pass through unchanged.
1446fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1447    if tokens.is_empty() {
1448        return tokens;
1449    }
1450
1451    let mut result = Vec::with_capacity(tokens.len());
1452    let mut run: Vec<&Spanned<Token>> = Vec::new();
1453
1454    for token in &tokens {
1455        if run.is_empty() {
1456            if mergeable_text(&token.token).is_some() {
1457                run.push(token);
1458            } else {
1459                result.push(token.clone());
1460            }
1461            continue;
1462        }
1463
1464        // Check span adjacency: previous run's last token ends where this one starts
1465        // Safety: run is non-empty (checked above)
1466        let Some(last) = run.last() else { unreachable!() };
1467        let adjacent = last.span.end == token.span.start;
1468
1469        if adjacent && mergeable_text(&token.token).is_some() {
1470            run.push(token);
1471        } else {
1472            flush_colon_run(&mut run, &mut result);
1473            if mergeable_text(&token.token).is_some() {
1474                run.push(token);
1475            } else {
1476                result.push(token.clone());
1477            }
1478        }
1479    }
1480
1481    flush_colon_run(&mut run, &mut result);
1482
1483    result
1484}
1485
1486/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1487fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1488    if run.is_empty() {
1489        return;
1490    }
1491
1492    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1493
1494    if run.len() >= 2 && has_colon {
1495        let text: String = run
1496            .iter()
1497            .filter_map(|t| mergeable_text(&t.token))
1498            .collect();
1499        // Safety: run.len() >= 2 so first/last exist
1500        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1501        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1502        result.push(Spanned::new(Token::Ident(text), start..end));
1503    } else {
1504        for t in run.iter() {
1505            result.push((*t).clone());
1506        }
1507    }
1508
1509    run.clear();
1510}
1511
1512/// Extract the text contribution of a token that can participate in a glob word.
1513///
1514/// Returns `Some(text)` for tokens that can be part of a glob pattern (identifiers,
1515/// wildcard chars, brackets, paths, etc.), `None` for structural tokens.
1516fn glob_mergeable_text(token: &Token) -> Option<String> {
1517    match token {
1518        Token::Star => Some("*".to_string()),
1519        Token::Question => Some("?".to_string()),
1520        Token::Dot => Some(".".to_string()),
1521        Token::DotDot => Some("..".to_string()),
1522        Token::Ident(s) => Some(s.clone()),
1523        Token::Path(s) => Some(s.clone()),
1524        Token::Int(n) => Some(n.to_string()),
1525        Token::LBracket => Some("[".to_string()),
1526        Token::RBracket => Some("]".to_string()),
1527        Token::Bang => Some("!".to_string()),
1528        Token::DotSlashPath(s) => Some(s.clone()),
1529        Token::RelativePath(s) => Some(s.clone()),
1530        Token::TildePath(s) => Some(s.clone()),
1531        Token::Tilde => Some("~".to_string()),
1532        Token::LBrace => Some("{".to_string()),
1533        Token::RBrace => Some("}".to_string()),
1534        Token::Comma => Some(",".to_string()),
1535        _ => None,
1536    }
1537}
1538
1539/// Merge span-adjacent token runs containing glob metacharacters into `GlobWord` tokens.
1540///
1541/// A run is merged into `GlobWord` when it contains at least one `Star`, `Question`,
1542/// or a `LBracket`+`RBracket` pair. Runs without glob chars pass through unchanged.
1543///
1544/// Runs after colon merge: `foo::bar` stays as `Ident("foo::bar")` because colon merge
1545/// already fused it before this pass sees it.
1546fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1547    if tokens.is_empty() {
1548        return tokens;
1549    }
1550
1551    let mut result = Vec::with_capacity(tokens.len());
1552    let mut run: Vec<&Spanned<Token>> = Vec::new();
1553
1554    for token in &tokens {
1555        if run.is_empty() {
1556            if glob_mergeable_text(&token.token).is_some() {
1557                run.push(token);
1558            } else {
1559                result.push(token.clone());
1560            }
1561            continue;
1562        }
1563
1564        // Safety: run is non-empty (checked at top of loop)
1565        let Some(last) = run.last() else { unreachable!() };
1566        let adjacent = last.span.end == token.span.start;
1567
1568        if adjacent && glob_mergeable_text(&token.token).is_some() {
1569            run.push(token);
1570        } else {
1571            flush_glob_run(&mut run, &mut result);
1572            if glob_mergeable_text(&token.token).is_some() {
1573                run.push(token);
1574            } else {
1575                result.push(token.clone());
1576            }
1577        }
1578    }
1579
1580    flush_glob_run(&mut run, &mut result);
1581
1582    result
1583}
1584
1585/// Flush a run of glob-mergeable tokens: merge if it contains glob metacharacters.
1586fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1587    if run.is_empty() {
1588        return;
1589    }
1590
1591    let has_glob = run.iter().any(|t| {
1592        matches!(t.token, Token::Star | Token::Question)
1593    }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1594        && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1595
1596    if run.len() >= 2 && has_glob {
1597        let text: String = run
1598            .iter()
1599            .filter_map(|t| glob_mergeable_text(&t.token))
1600            .collect();
1601        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1602        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1603        result.push(Spanned::new(Token::GlobWord(text), start..end));
1604    } else {
1605        for t in run.iter() {
1606            result.push((*t).clone());
1607        }
1608    }
1609
1610    run.clear();
1611}
1612
/// Tokenize source code into a vector of spanned tokens.
///
/// Skips whitespace, comments, and line continuations.
/// Returns errors with their positions for nice error messages.
///
/// Pipeline:
/// 1. `preprocess_arithmetic` swaps `$((expr))` for unique markers.
/// 2. `preprocess_heredocs` swaps here-doc bodies for unique markers.
/// 3. logos lexes the preprocessed text; spans are mapped back to
///    original-source coordinates via the recorded arithmetic replacements
///    (here-doc spans are NOT corrected — see below).
/// 4. Markers are swapped back for `Arithmetic` / `HereDoc` tokens.
/// 5. Colon-merge then glob-merge passes fuse adjacent word-like tokens.
///
/// Handles:
/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // Preprocess arithmetic first (heredoc bodies may themselves contain "$((").
    // NOTE(review): because this runs BEFORE here-doc extraction, a `$((...))`
    // inside a quoted (literal) here-doc body is still pulled out, and its
    // marker lands in the here-doc content where nothing expands it back —
    // confirm whether literal here-docs should suppress arithmetic.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    // Then preprocess here-docs (heredoc span tracking is not implemented for simplicity)
    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    // Combine replacements for span correction (arithmetic only for now)
    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Correct the span from preprocessed coordinates to original coordinates
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                // Skip comments and line continuations - they're not needed for parsing
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                // Collect every lexer error so the caller can report them all at once.
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Post-process: replace markers with actual token content
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
                    i += 1;
                    continue;
                }

        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
        if matches!(tokens[i].token, Token::HereDocStart) {
            // Check if next token is a heredoc marker
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                        // Find the corresponding content
                        if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                            final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                            i += 2;
                            continue;
                        }
                    }
        }

        // Check for arithmetic markers inside string content: arithmetic inside
        // double quotes was replaced by a marker that the lexer swallowed into
        // a String token, so it must be rewritten here.
        let token = if let Token::String(ref s) = tokens[i].token {
            // Check if string contains any arithmetic markers
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    // Replace marker with the special format that parse_interpolated_string can detect
                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    // Colon merge runs before glob merge so `foo::bar` fuses into one Ident
    // before the glob pass considers it.
    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
}
1712
1713/// Tokenize source code, preserving comments.
1714///
1715/// Useful for pretty-printing or formatting tools that need to preserve comments.
1716pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1717    let lexer = Token::lexer(source);
1718    let mut tokens = Vec::new();
1719    let mut errors = Vec::new();
1720
1721    for (result, span) in lexer.spanned() {
1722        match result {
1723            Ok(token) => {
1724                tokens.push(Spanned::new(token, span));
1725            }
1726            Err(err) => {
1727                errors.push(Spanned::new(err, span));
1728            }
1729        }
1730    }
1731
1732    if errors.is_empty() {
1733        Ok(tokens)
1734    } else {
1735        Err(errors)
1736    }
1737}
1738
1739/// Extract the string content from a string token (removes quotes, processes escapes).
1740pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1741    // Remove surrounding quotes
1742    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1743        return Err(LexerError::UnterminatedString);
1744    }
1745
1746    let inner = &source[1..source.len() - 1];
1747    let mut result = String::with_capacity(inner.len());
1748    let mut chars = inner.chars().peekable();
1749
1750    while let Some(ch) = chars.next() {
1751        if ch == '\\' {
1752            match chars.next() {
1753                Some('n') => result.push('\n'),
1754                Some('t') => result.push('\t'),
1755                Some('r') => result.push('\r'),
1756                Some('\\') => result.push('\\'),
1757                Some('"') => result.push('"'),
1758                // Use a unique marker for escaped dollar that won't be re-interpreted
1759                // parse_interpolated_string will convert this back to $
1760                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1761                Some('u') => {
1762                    // Unicode escape: \uXXXX
1763                    let mut hex = String::with_capacity(4);
1764                    for _ in 0..4 {
1765                        match chars.next() {
1766                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1767                            _ => return Err(LexerError::InvalidEscape),
1768                        }
1769                    }
1770                    let codepoint = u32::from_str_radix(&hex, 16)
1771                        .map_err(|_| LexerError::InvalidEscape)?;
1772                    let ch = char::from_u32(codepoint)
1773                        .ok_or(LexerError::InvalidEscape)?;
1774                    result.push(ch);
1775                }
1776                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1777                Some(next) => {
1778                    result.push('\\');
1779                    result.push(next);
1780                }
1781                None => return Err(LexerError::InvalidEscape),
1782            }
1783        } else {
1784            result.push(ch);
1785        }
1786    }
1787
1788    Ok(result)
1789}
1790
1791/// Parse a variable reference, extracting the path segments.
1792/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1793pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1794    // Remove ${ and }
1795    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1796        return Err(LexerError::UnterminatedVarRef);
1797    }
1798
1799    let inner = &source[2..source.len() - 1];
1800
1801    // Special case: $? (last result)
1802    if inner == "?" {
1803        return Ok(vec!["?".to_string()]);
1804    }
1805
1806    let mut segments = Vec::new();
1807    let mut current = String::new();
1808    let mut chars = inner.chars().peekable();
1809
1810    while let Some(ch) = chars.next() {
1811        match ch {
1812            '.' => {
1813                if !current.is_empty() {
1814                    segments.push(current.clone());
1815                    current.clear();
1816                }
1817            }
1818            '[' => {
1819                if !current.is_empty() {
1820                    segments.push(current.clone());
1821                    current.clear();
1822                }
1823                // Collect the index
1824                let mut index = String::from("[");
1825                while let Some(&c) = chars.peek() {
1826                    if let Some(c) = chars.next() {
1827                        index.push(c);
1828                    }
1829                    if c == ']' {
1830                        break;
1831                    }
1832                }
1833                segments.push(index);
1834            }
1835            _ => {
1836                current.push(ch);
1837            }
1838        }
1839    }
1840
1841    if !current.is_empty() {
1842        segments.push(current);
1843    }
1844
1845    Ok(segments)
1846}
1847
1848/// Parse an integer literal.
1849pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1850    source.parse().map_err(|_| LexerError::InvalidNumber)
1851}
1852
1853/// Parse a float literal.
1854pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1855    source.parse().map_err(|_| LexerError::InvalidNumber)
1856}
1857
1858#[cfg(test)]
1859mod tests {
1860    use super::*;
1861
1862    fn lex(source: &str) -> Vec<Token> {
1863        tokenize(source)
1864            .expect("lexer should succeed")
1865            .into_iter()
1866            .map(|s| s.token)
1867            .collect()
1868    }
1869
    // ═══════════════════════════════════════════════════════════════════
    // Keyword tests
    // ═══════════════════════════════════════════════════════════════════

    // Every reserved word lexes to its dedicated token, never to Ident.
    #[test]
    fn keywords() {
        assert_eq!(lex("set"), vec![Token::Set]);
        assert_eq!(lex("if"), vec![Token::If]);
        assert_eq!(lex("then"), vec![Token::Then]);
        assert_eq!(lex("else"), vec![Token::Else]);
        assert_eq!(lex("elif"), vec![Token::Elif]);
        assert_eq!(lex("fi"), vec![Token::Fi]);
        assert_eq!(lex("for"), vec![Token::For]);
        assert_eq!(lex("in"), vec![Token::In]);
        assert_eq!(lex("do"), vec![Token::Do]);
        assert_eq!(lex("done"), vec![Token::Done]);
        assert_eq!(lex("case"), vec![Token::Case]);
        assert_eq!(lex("esac"), vec![Token::Esac]);
        assert_eq!(lex("function"), vec![Token::Function]);
        assert_eq!(lex("true"), vec![Token::True]);
        assert_eq!(lex("false"), vec![Token::False]);
    }

    // `;;` is one DoubleSemi token (case-pattern terminator), not two Semis.
    #[test]
    fn double_semicolon() {
        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
        // In case pattern context
        assert_eq!(lex("echo \"hi\";;"), vec![
            Token::Ident("echo".to_string()),
            Token::String("hi".to_string()),
            Token::DoubleSemi,
        ]);
    }

    #[test]
    fn type_keywords() {
        assert_eq!(lex("string"), vec![Token::TypeString]);
        assert_eq!(lex("int"), vec![Token::TypeInt]);
        assert_eq!(lex("float"), vec![Token::TypeFloat]);
        assert_eq!(lex("bool"), vec![Token::TypeBool]);
    }
1911
    // ═══════════════════════════════════════════════════════════════════
    // Operator tests
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn single_char_operators() {
        assert_eq!(lex("="), vec![Token::Eq]);
        assert_eq!(lex("|"), vec![Token::Pipe]);
        assert_eq!(lex("&"), vec![Token::Amp]);
        assert_eq!(lex(">"), vec![Token::Gt]);
        assert_eq!(lex("<"), vec![Token::Lt]);
        assert_eq!(lex(";"), vec![Token::Semi]);
        assert_eq!(lex(":"), vec![Token::Colon]);
        assert_eq!(lex(","), vec![Token::Comma]);
        assert_eq!(lex("."), vec![Token::Dot]);
    }

    // Two-character operators must win over their single-character prefixes
    // (e.g. `>=` is GtEq, not Gt then Eq).
    #[test]
    fn multi_char_operators() {
        assert_eq!(lex("&&"), vec![Token::And]);
        assert_eq!(lex("||"), vec![Token::Or]);
        assert_eq!(lex("=="), vec![Token::EqEq]);
        assert_eq!(lex("!="), vec![Token::NotEq]);
        assert_eq!(lex("=~"), vec![Token::Match]);
        assert_eq!(lex("!~"), vec![Token::NotMatch]);
        assert_eq!(lex(">="), vec![Token::GtEq]);
        assert_eq!(lex("<="), vec![Token::LtEq]);
        assert_eq!(lex(">>"), vec![Token::GtGt]);
        assert_eq!(lex("2>"), vec![Token::Stderr]);
        assert_eq!(lex("&>"), vec![Token::Both]);
    }

    #[test]
    fn brackets() {
        assert_eq!(lex("{"), vec![Token::LBrace]);
        assert_eq!(lex("}"), vec![Token::RBrace]);
        assert_eq!(lex("["), vec![Token::LBracket]);
        assert_eq!(lex("]"), vec![Token::RBracket]);
        assert_eq!(lex("("), vec![Token::LParen]);
        assert_eq!(lex(")"), vec![Token::RParen]);
    }
1953
    // ═══════════════════════════════════════════════════════════════════
    // Literal tests
    // ═══════════════════════════════════════════════════════════════════

    // Negative numbers lex as a single Int token, not Minus + Int.
    #[test]
    fn integers() {
        assert_eq!(lex("0"), vec![Token::Int(0)]);
        assert_eq!(lex("42"), vec![Token::Int(42)]);
        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
    }

    #[test]
    fn floats() {
        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
    }

    // String tokens carry their unescaped content, quotes stripped.
    #[test]
    fn strings() {
        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
    }

    // VarRef tokens keep the full `${...}` source text for later parsing.
    #[test]
    fn var_refs() {
        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
    }
1990
    // ═══════════════════════════════════════════════════════════════════
    // Identifier tests
    // ═══════════════════════════════════════════════════════════════════

    // Identifiers may contain underscores, dashes, and trailing digits.
    #[test]
    fn identifiers() {
        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
    }

    // Keyword matching must be whole-word: `setup` is not `set` + `up`.
    #[test]
    fn keyword_prefix_identifiers() {
        // Identifiers that start with keywords but aren't keywords
        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
    }
2013
    // ═══════════════════════════════════════════════════════════════════
    // Statement tests
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn assignment() {
        assert_eq!(
            lex("set X = 5"),
            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    #[test]
    fn command_simple() {
        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
        assert_eq!(
            lex(r#"echo "hello""#),
            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
        );
    }

    #[test]
    fn command_with_args() {
        assert_eq!(
            lex("cmd arg1 arg2"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
        );
    }

    // `key=value` splits into three tokens; the parser reassembles named args.
    #[test]
    fn command_with_named_args() {
        assert_eq!(
            lex("cmd key=value"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
        );
    }

    #[test]
    fn pipeline() {
        assert_eq!(
            lex("a | b | c"),
            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
        );
    }

    #[test]
    fn if_statement() {
        assert_eq!(
            lex("if true; then echo; fi"),
            vec![
                Token::If,
                Token::True,
                Token::Semi,
                Token::Then,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Fi
            ]
        );
    }

    #[test]
    fn for_loop() {
        assert_eq!(
            lex("for X in items; do echo; done"),
            vec![
                Token::For,
                Token::Ident("X".to_string()),
                Token::In,
                Token::Ident("items".to_string()),
                Token::Semi,
                Token::Do,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Done
            ]
        );
    }
2092
    // ═══════════════════════════════════════════════════════════════════
    // Whitespace and newlines
    // ═══════════════════════════════════════════════════════════════════

    // Horizontal whitespace is insignificant between tokens.
    #[test]
    fn whitespace_ignored() {
        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
    }

    // Newlines are statement separators, so they survive as tokens.
    #[test]
    fn newlines_preserved() {
        let tokens = lex("a\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // Consecutive blank lines are not collapsed: one Newline token each.
    #[test]
    fn multiple_newlines() {
        let tokens = lex("a\n\n\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
        );
    }
2119
    // ═══════════════════════════════════════════════════════════════════
    // Comments
    // ═══════════════════════════════════════════════════════════════════

    // `tokenize` drops comments but keeps the newline that ends them.
    #[test]
    fn comments_skipped() {
        assert_eq!(lex("# comment"), vec![]);
        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
        assert_eq!(
            lex("a # comment\nb"),
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    // `tokenize_with_comments` keeps a Comment token for formatters.
    #[test]
    fn comments_preserved_when_requested() {
        let tokens = tokenize_with_comments("a # comment")
            .expect("should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect::<Vec<_>>();
        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
    }
2143
    // ═══════════════════════════════════════════════════════════════════
    // String parsing
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn parse_simple_string() {
        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
    }

    #[test]
    fn parse_string_with_escapes() {
        assert_eq!(
            parse_string_literal(r#""hello\nworld""#).expect("ok"),
            "hello\nworld"
        );
        assert_eq!(
            parse_string_literal(r#""tab\there""#).expect("ok"),
            "tab\there"
        );
        assert_eq!(
            parse_string_literal(r#""quote\"here""#).expect("ok"),
            "quote\"here"
        );
    }

    // \uXXXX decodes to the corresponding Unicode scalar value.
    #[test]
    fn parse_string_with_unicode() {
        assert_eq!(
            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
            "emoji ❤"
        );
    }

    #[test]
    fn parse_string_with_escaped_dollar() {
        // \$ produces a marker that parse_interpolated_string will convert to $
        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
        assert_eq!(
            parse_string_literal(r#""\$VAR""#).expect("ok"),
            "__KAISH_ESCAPED_DOLLAR__VAR"
        );
        assert_eq!(
            parse_string_literal(r#""cost: \$100""#).expect("ok"),
            "cost: __KAISH_ESCAPED_DOLLAR__100"
        );
    }
2190
    // ═══════════════════════════════════════════════════════════════════
    // Variable reference parsing
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn parse_simple_var() {
        assert_eq!(
            parse_var_ref("${X}").expect("ok"),
            vec!["X"]
        );
    }

    #[test]
    fn parse_var_with_field() {
        assert_eq!(
            parse_var_ref("${VAR.field}").expect("ok"),
            vec!["VAR", "field"]
        );
    }

    // Index segments keep their brackets: "[0]" is one segment.
    #[test]
    fn parse_var_with_index() {
        assert_eq!(
            parse_var_ref("${VAR[0]}").expect("ok"),
            vec!["VAR", "[0]"]
        );
    }

    #[test]
    fn parse_var_nested() {
        assert_eq!(
            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
            vec!["VAR", "field", "[0]", "nested"]
        );
    }

    // `?` (last result) is valid both alone and as a path head.
    #[test]
    fn parse_last_result() {
        assert_eq!(
            parse_var_ref("${?}").expect("ok"),
            vec!["?"]
        );
        assert_eq!(
            parse_var_ref("${?.ok}").expect("ok"),
            vec!["?", "ok"]
        );
    }
2238
    // ═══════════════════════════════════════════════════════════════════
    // Number parsing
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn parse_integers() {
        assert_eq!(parse_int("0").expect("ok"), 0);
        assert_eq!(parse_int("42").expect("ok"), 42);
        assert_eq!(parse_int("-1").expect("ok"), -1);
    }

    // Floats are compared within epsilon rather than with exact equality.
    #[test]
    fn parse_floats() {
        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
    }
2255
    // ═══════════════════════════════════════════════════════════════════
    // Edge cases and errors
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn empty_input() {
        assert_eq!(lex(""), vec![]);
    }

    #[test]
    fn only_whitespace() {
        assert_eq!(lex("   \t\t   "), vec![]);
    }

    // JSON-style literals reuse the shell's bracket/punctuation tokens.
    #[test]
    fn json_array() {
        assert_eq!(
            lex(r#"[1, 2, 3]"#),
            vec![
                Token::LBracket,
                Token::Int(1),
                Token::Comma,
                Token::Int(2),
                Token::Comma,
                Token::Int(3),
                Token::RBracket
            ]
        );
    }

    #[test]
    fn json_object() {
        assert_eq!(
            lex(r#"{"key": "value"}"#),
            vec![
                Token::LBrace,
                Token::String("key".to_string()),
                Token::Colon,
                Token::String("value".to_string()),
                Token::RBrace
            ]
        );
    }

    #[test]
    fn redirect_operators() {
        assert_eq!(
            lex("cmd > file"),
            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd >> file"),
            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
        );
        assert_eq!(
            lex("cmd 2> err"),
            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
        );
        assert_eq!(
            lex("cmd &> all"),
            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
        );
    }

    #[test]
    fn background_job() {
        assert_eq!(
            lex("cmd &"),
            vec![Token::Ident("cmd".to_string()), Token::Amp]
        );
    }

    // `$(` opens a command substitution; the closing paren is a plain RParen.
    #[test]
    fn command_substitution() {
        assert_eq!(
            lex("$(cmd)"),
            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
        );
        assert_eq!(
            lex("$(cmd arg)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("cmd".to_string()),
                Token::Ident("arg".to_string()),
                Token::RParen
            ]
        );
        assert_eq!(
            lex("$(a | b)"),
            vec![
                Token::CmdSubstStart,
                Token::Ident("a".to_string()),
                Token::Pipe,
                Token::Ident("b".to_string()),
                Token::RParen
            ]
        );
    }

    #[test]
    fn complex_pipeline() {
        assert_eq!(
            lex(r#"cat file | grep pattern="foo" | head count=10"#),
            vec![
                Token::Ident("cat".to_string()),
                Token::Ident("file".to_string()),
                Token::Pipe,
                Token::Ident("grep".to_string()),
                Token::Ident("pattern".to_string()),
                Token::Eq,
                Token::String("foo".to_string()),
                Token::Pipe,
                Token::Ident("head".to_string()),
                Token::Ident("count".to_string()),
                Token::Eq,
                Token::Int(10),
            ]
        );
    }
2375
    // ═══════════════════════════════════════════════════════════════════
    // Flag tests
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn short_flag() {
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
    }

    // A run of letters after one dash stays a single ShortFlag token.
    #[test]
    fn short_flag_combined() {
        // Combined short flags like -la
        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
    }

    #[test]
    fn long_flag() {
        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
    }

    #[test]
    fn double_dash() {
        // -- alone marks end of flags
        assert_eq!(lex("--"), vec![Token::DoubleDash]);
    }

    #[test]
    fn flags_vs_negative_numbers() {
        // -123 should be a negative integer, not a flag
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
        // -l should be a flag
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        // -1a is ambiguous - should be Int(-1) then Ident(a)
        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
    }

    #[test]
    fn command_with_flags() {
        assert_eq!(
            lex("ls -l"),
            vec![
                Token::Ident("ls".to_string()),
                Token::ShortFlag("l".to_string()),
            ]
        );
        assert_eq!(
            lex("git commit -m"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
            ]
        );
        assert_eq!(
            lex("git push --force"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("push".to_string()),
                Token::LongFlag("force".to_string()),
            ]
        );
    }

    #[test]
    fn flag_with_value() {
        assert_eq!(
            lex(r#"git commit -m "message""#),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("commit".to_string()),
                Token::ShortFlag("m".to_string()),
                Token::String("message".to_string()),
            ]
        );
        assert_eq!(
            lex(r#"--message="hello""#),
            vec![
                Token::LongFlag("message".to_string()),
                Token::Eq,
                Token::String("hello".to_string()),
            ]
        );
    }

    #[test]
    fn end_of_flags_marker() {
        assert_eq!(
            lex("git checkout -- file"),
            vec![
                Token::Ident("git".to_string()),
                Token::Ident("checkout".to_string()),
                Token::DoubleDash,
                Token::Ident("file".to_string()),
            ]
        );
    }
2481
    // ═══════════════════════════════════════════════════════════════════
    // Bash compatibility tokens
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn local_keyword() {
        assert_eq!(lex("local"), vec![Token::Local]);
        assert_eq!(
            lex("local X = 5"),
            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    // Bare `$NAME` (no braces) lexes as SimpleVarRef holding only the name.
    #[test]
    fn simple_var_ref() {
        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
    }

    #[test]
    fn simple_var_ref_in_command() {
        assert_eq!(
            lex("echo $NAME"),
            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
        );
    }

    // Single-quoted strings are fully literal: no escapes, no interpolation.
    #[test]
    fn single_quoted_strings() {
        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
        // Single quotes don't process escapes or variables
        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
    }

    #[test]
    fn test_brackets() {
        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
        assert_eq!(
            lex("[[ -f file ]]"),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::ShortFlag("f".to_string()),
                Token::Ident("file".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }

    #[test]
    fn test_expression_syntax() {
        assert_eq!(
            lex(r#"[[ $X == "value" ]]"#),
            vec![
                Token::LBracket,
                Token::LBracket,
                Token::SimpleVarRef("X".to_string()),
                Token::EqEq,
                Token::String("value".to_string()),
                Token::RBracket,
                Token::RBracket
            ]
        );
    }

    #[test]
    fn bash_style_assignment() {
        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
        assert_eq!(
            lex(r#"NAME="value""#),
            vec![
                Token::Ident("NAME".to_string()),
                Token::Eq,
                Token::String("value".to_string())
            ]
        );
    }

    // $0-$9, $@ and $# each get a dedicated token kind.
    #[test]
    fn positional_params() {
        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
        assert_eq!(lex("$@"), vec![Token::AllArgs]);
        assert_eq!(lex("$#"), vec![Token::ArgCount]);
    }

    #[test]
    fn positional_in_context() {
        assert_eq!(
            lex("echo $1 $2"),
            vec![
                Token::Ident("echo".to_string()),
                Token::Positional(1),
                Token::Positional(2),
            ]
        );
    }

    // ${#NAME} is the length form; the token carries only the name.
    #[test]
    fn var_length() {
        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
    }

    #[test]
    fn var_length_in_context() {
        assert_eq!(
            lex("echo ${#NAME}"),
            vec![
                Token::Ident("echo".to_string()),
                Token::VarLength("NAME".to_string()),
            ]
        );
    }
2606
    // ═══════════════════════════════════════════════════════════════════
    // Edge case tests: Flag ambiguities
    // ═══════════════════════════════════════════════════════════════════

    #[test]
    fn plus_flag() {
        // Plus flags for set +e
        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
    }

    #[test]
    fn set_with_plus_flag() {
        assert_eq!(
            lex("set +e"),
            vec![
                Token::Set,
                Token::PlusFlag("e".to_string()),
            ]
        );
    }

    #[test]
    fn set_with_multiple_flags() {
        assert_eq!(
            lex("set -e -u"),
            vec![
                Token::Set,
                Token::ShortFlag("e".to_string()),
                Token::ShortFlag("u".to_string()),
            ]
        );
    }

    // NOTE(review): this test duplicates `flags_vs_negative_numbers` above —
    // consider consolidating the two into one test.
    #[test]
    fn flags_vs_negative_numbers_edge_cases() {
        // -1a should be negative int followed by ident
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
        // -l is a flag
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        // -123 is negative number
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
    }

    #[test]
    fn single_dash_is_minus_alone() {
        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
        let result = tokenize("-").expect("should lex");
        assert_eq!(result.len(), 1);
        assert!(matches!(result[0].token, Token::MinusAlone));
    }
2662
2663    #[test]
2664    fn plus_bare_for_date_format() {
2665        // `date +%s` - the +%s should be PlusBare
2666        let result = tokenize("+%s").expect("should lex");
2667        assert_eq!(result.len(), 1);
2668        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2669
2670        // `date +%Y-%m-%d` - format string with dashes
2671        let result = tokenize("+%Y-%m-%d").expect("should lex");
2672        assert_eq!(result.len(), 1);
2673        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2674    }
2675
2676    #[test]
2677    fn plus_flag_still_works() {
2678        // `set +e` - should still be PlusFlag
2679        let result = tokenize("+e").expect("should lex");
2680        assert_eq!(result.len(), 1);
2681        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2682    }
2683
2684    #[test]
2685    fn while_keyword_vs_while_loop() {
2686        // 'while' as keyword in loop context
2687        assert_eq!(lex("while"), vec![Token::While]);
2688        // 'while' at start followed by condition
2689        assert_eq!(
2690            lex("while true"),
2691            vec![Token::While, Token::True]
2692        );
2693    }
2694
2695    #[test]
2696    fn control_flow_keywords() {
2697        assert_eq!(lex("break"), vec![Token::Break]);
2698        assert_eq!(lex("continue"), vec![Token::Continue]);
2699        assert_eq!(lex("return"), vec![Token::Return]);
2700        assert_eq!(lex("exit"), vec![Token::Exit]);
2701    }
2702
2703    #[test]
2704    fn control_flow_with_numbers() {
2705        assert_eq!(
2706            lex("break 2"),
2707            vec![Token::Break, Token::Int(2)]
2708        );
2709        assert_eq!(
2710            lex("continue 3"),
2711            vec![Token::Continue, Token::Int(3)]
2712        );
2713        assert_eq!(
2714            lex("exit 1"),
2715            vec![Token::Exit, Token::Int(1)]
2716        );
2717    }
2718
2719    // ═══════════════════════════════════════════════════════════════════
2720    // Here-doc tests
2721    // ═══════════════════════════════════════════════════════════════════
2722
2723    #[test]
2724    fn heredoc_simple() {
2725        let source = "cat <<EOF\nhello\nworld\nEOF";
2726        let tokens = lex(source);
2727        assert_eq!(tokens, vec![
2728            Token::Ident("cat".to_string()),
2729            Token::HereDocStart,
2730            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2731            Token::Newline,
2732        ]);
2733    }
2734
2735    #[test]
2736    fn heredoc_empty() {
2737        let source = "cat <<EOF\nEOF";
2738        let tokens = lex(source);
2739        assert_eq!(tokens, vec![
2740            Token::Ident("cat".to_string()),
2741            Token::HereDocStart,
2742            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2743            Token::Newline,
2744        ]);
2745    }
2746
2747    #[test]
2748    fn heredoc_with_special_chars() {
2749        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2750        let tokens = lex(source);
2751        assert_eq!(tokens, vec![
2752            Token::Ident("cat".to_string()),
2753            Token::HereDocStart,
2754            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2755            Token::Newline,
2756        ]);
2757    }
2758
2759    #[test]
2760    fn heredoc_multiline() {
2761        let source = "cat <<END\nline1\nline2\nline3\nEND";
2762        let tokens = lex(source);
2763        assert_eq!(tokens, vec![
2764            Token::Ident("cat".to_string()),
2765            Token::HereDocStart,
2766            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2767            Token::Newline,
2768        ]);
2769    }
2770
2771    #[test]
2772    fn heredoc_in_command() {
2773        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2774        let tokens = lex(source);
2775        assert_eq!(tokens, vec![
2776            Token::Ident("cat".to_string()),
2777            Token::HereDocStart,
2778            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2779            Token::Newline,
2780            Token::Ident("echo".to_string()),
2781            Token::Ident("goodbye".to_string()),
2782        ]);
2783    }
2784
2785    #[test]
2786    fn heredoc_strip_tabs() {
2787        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2788        let tokens = lex(source);
2789        // Content has tabs preserved, only delimiter matching strips tabs
2790        assert_eq!(tokens, vec![
2791            Token::Ident("cat".to_string()),
2792            Token::HereDocStart,
2793            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2794            Token::Newline,
2795        ]);
2796    }
2797
2798    // ═══════════════════════════════════════════════════════════════════
2799    // Arithmetic expression tests
2800    // ═══════════════════════════════════════════════════════════════════
2801
2802    #[test]
2803    fn arithmetic_simple() {
2804        let source = "$((1 + 2))";
2805        let tokens = lex(source);
2806        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2807    }
2808
2809    #[test]
2810    fn arithmetic_in_assignment() {
2811        let source = "X=$((5 * 3))";
2812        let tokens = lex(source);
2813        assert_eq!(tokens, vec![
2814            Token::Ident("X".to_string()),
2815            Token::Eq,
2816            Token::Arithmetic("5 * 3".to_string()),
2817        ]);
2818    }
2819
2820    #[test]
2821    fn arithmetic_with_nested_parens() {
2822        let source = "$((2 * (3 + 4)))";
2823        let tokens = lex(source);
2824        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2825    }
2826
2827    #[test]
2828    fn arithmetic_with_variable() {
2829        let source = "$((X + 1))";
2830        let tokens = lex(source);
2831        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2832    }
2833
2834    #[test]
2835    fn arithmetic_command_subst_not_confused() {
2836        // $( should not be treated as arithmetic
2837        let source = "$(echo hello)";
2838        let tokens = lex(source);
2839        assert_eq!(tokens, vec![
2840            Token::CmdSubstStart,
2841            Token::Ident("echo".to_string()),
2842            Token::Ident("hello".to_string()),
2843            Token::RParen,
2844        ]);
2845    }
2846
2847    #[test]
2848    fn arithmetic_nesting_limit() {
2849        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
2850        let open_parens = "(".repeat(300);
2851        let close_parens = ")".repeat(300);
2852        let source = format!("$(({}1{}))", open_parens, close_parens);
2853        let result = tokenize(&source);
2854        assert!(result.is_err());
2855        let errors = result.unwrap_err();
2856        assert_eq!(errors.len(), 1);
2857        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2858    }
2859
2860    #[test]
2861    fn arithmetic_nesting_within_limit() {
2862        // Nesting within limit should work
2863        let source = "$((((1 + 2) * 3)))";
2864        let tokens = lex(source);
2865        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2866    }
2867
2868    // ═══════════════════════════════════════════════════════════════════
2869    // Token category tests
2870    // ═══════════════════════════════════════════════════════════════════
2871
2872    #[test]
2873    fn token_categories() {
2874        // Keywords
2875        assert_eq!(Token::If.category(), TokenCategory::Keyword);
2876        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2877        assert_eq!(Token::For.category(), TokenCategory::Keyword);
2878        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2879        assert_eq!(Token::True.category(), TokenCategory::Keyword);
2880        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2881
2882        // Operators
2883        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2884        assert_eq!(Token::And.category(), TokenCategory::Operator);
2885        assert_eq!(Token::Or.category(), TokenCategory::Operator);
2886        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2887        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2888
2889        // Strings
2890        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2891        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2892        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2893
2894        // Numbers
2895        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2896        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2897        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2898
2899        // Variables
2900        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2901        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2902        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2903        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2904        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2905        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2906        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2907
2908        // Flags
2909        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2910        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2911        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2912        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2913
2914        // Punctuation
2915        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2916        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2917        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2918        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2919
2920        // Comments
2921        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2922
2923        // Paths
2924        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2925
2926        // Commands
2927        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2928
2929        // Errors
2930        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2931        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2932        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2933    }
2934
2935    #[test]
2936    fn test_heredoc_piped_to_command() {
2937        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
2938        // Not: cat | jq <<heredoc
2939        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2940        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2941        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2942        assert!(heredoc_pos.is_some(), "should have a heredoc token");
2943        assert!(pipe_pos.is_some(), "should have a pipe token");
2944        assert!(
2945            pipe_pos.unwrap() > heredoc_pos.unwrap(),
2946            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2947            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2948        );
2949    }
2950
2951    #[test]
2952    fn test_heredoc_standalone_still_works() {
2953        // Regression: standalone heredoc (no pipe) must still work
2954        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2955        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2956        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2957    }
2958
2959    #[test]
2960    fn test_heredoc_preserves_leading_empty_lines() {
2961        // Bug B: heredoc starting with a blank line must preserve it
2962        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2963        let heredoc = tokens.iter().find_map(|t| {
2964            if let Token::HereDoc(data) = &t.token {
2965                Some(data.clone())
2966            } else {
2967                None
2968            }
2969        });
2970        assert!(heredoc.is_some(), "should have a heredoc token");
2971        let data = heredoc.unwrap();
2972        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2973        assert_eq!(data.content, "\nhello");
2974    }
2975
2976    #[test]
2977    fn test_heredoc_quoted_delimiter_sets_literal() {
2978        // Bug N: quoted delimiter (<<'EOF') should set literal=true
2979        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2980        let heredoc = tokens.iter().find_map(|t| {
2981            if let Token::HereDoc(data) = &t.token {
2982                Some(data.clone())
2983            } else {
2984                None
2985            }
2986        });
2987        assert!(heredoc.is_some(), "should have a heredoc token");
2988        let data = heredoc.unwrap();
2989        assert!(data.literal, "quoted delimiter should set literal=true");
2990        assert_eq!(data.content, "hello $HOME");
2991    }
2992
2993    #[test]
2994    fn test_heredoc_unquoted_delimiter_not_literal() {
2995        // Bug N: unquoted delimiter (<<EOF) should have literal=false
2996        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2997        let heredoc = tokens.iter().find_map(|t| {
2998            if let Token::HereDoc(data) = &t.token {
2999                Some(data.clone())
3000            } else {
3001                None
3002            }
3003        });
3004        assert!(heredoc.is_some(), "should have a heredoc token");
3005        let data = heredoc.unwrap();
3006        assert!(!data.literal, "unquoted delimiter should have literal=false");
3007    }
3008
3009    // ═══════════════════════════════════════════════════════════════════
3010    // Colon merge tests
3011    // ═══════════════════════════════════════════════════════════════════
3012
3013    #[test]
3014    fn colon_double_in_word() {
3015        assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
3016    }
3017
3018    #[test]
3019    fn colon_single_in_word() {
3020        assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
3021    }
3022
3023    #[test]
3024    fn colon_with_port() {
3025        assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
3026    }
3027
3028    #[test]
3029    fn colon_standalone() {
3030        assert_eq!(lex(":"), vec![Token::Colon]);
3031    }
3032
3033    #[test]
3034    fn colon_spaced_no_merge() {
3035        assert_eq!(
3036            lex("foo : bar"),
3037            vec![
3038                Token::Ident("foo".into()),
3039                Token::Colon,
3040                Token::Ident("bar".into()),
3041            ]
3042        );
3043    }
3044
3045    #[test]
3046    fn colon_in_command_arg() {
3047        assert_eq!(
3048            lex("echo foo::bar"),
3049            vec![
3050                Token::Ident("echo".into()),
3051                Token::Ident("foo::bar".into()),
3052            ]
3053        );
3054    }
3055
3056    #[test]
3057    fn colon_trailing() {
3058        // Trailing colon merges with preceding ident
3059        assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
3060    }
3061
3062    #[test]
3063    fn colon_leading() {
3064        // Leading colon merges with following ident
3065        assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
3066    }
3067
3068    #[test]
3069    fn colon_with_path() {
3070        // Path token + colon + int
3071        assert_eq!(
3072            lex("/usr/bin:8080"),
3073            vec![Token::Ident("/usr/bin:8080".into())]
3074        );
3075    }
3076}