kaish_kernel/
lexer.rs

1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
7//! # Token Categories
8//!
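//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`, `do`, `done`,
//!   `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`, plus the type keywords
//!   `string`, `int`, `float`, `bool` (used for tool parameters)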
10//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
11//! - **Operators**: `=`, `|`, `&`, `>`, `>>`, `<`, `2>`, `&>`, `&&`, `||`
12//! - **Punctuation**: `;`, `:`, `,`, `.`, `{`, `}`, `[`, `]`
13//! - **Variable references**: `${...}` with nested path access
14//! - **Identifiers**: command names, variable names, parameter names
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
21/// Global counter for generating unique markers across all tokenize calls.
22static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);
23
24/// Maximum nesting depth for parentheses in arithmetic expressions.
25/// Prevents stack overflow from pathologically nested inputs like $((((((...
26const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// the marker and the text it replaced usually differ in length, so spans
/// produced after the replacement must be shifted by that difference.
/// `correct_span` consumes a slice of these records to do the shifting.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    #[cfg(target_os = "wasi")]
82    let pid = 0u32;
83    #[cfg(not(target_os = "wasi"))]
84    let pid = std::process::id();
85    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
86}
87
88/// A token with its span in the source text.
89#[derive(Debug, Clone, PartialEq)]
90pub struct Spanned<T> {
91    pub token: T,
92    pub span: Span,
93}
94
95impl<T> Spanned<T> {
96    pub fn new(token: T, span: Span) -> Self {
97        Self { token, span }
98    }
99}
100
/// Lexer error types.
///
/// Used as the logos error type for `Token` (`#[logos(error = LexerError)]`).
/// Human-readable messages come from the `Display` implementation.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Default error value ("unexpected character").
    /// NOTE(review): presumably what logos reports when no pattern matches — confirm.
    #[default]
    UnexpectedCharacter,
    /// A string literal was not closed.
    UnterminatedString,
    /// A variable reference was not closed.
    UnterminatedVarRef,
    /// A string contained an invalid escape sequence.
    InvalidEscape,
    /// Numeric text could not be parsed; returned by `lex_int` and `lex_float`.
    InvalidNumber,
    /// A non-lowercase spelling of `true`/`false` (e.g. `TRUE`, `False`);
    /// carries the spelling as written. Returned by `lex_ident`.
    AmbiguousBoolean(String),
    /// `yes`/`no` in any letter case; carries the spelling as written.
    /// Returned by `lex_ident`.
    AmbiguousBooleanLike(String),
    /// Float without a leading digit (like `.5`).
    InvalidFloatNoLeading,
    /// Float without a trailing digit (like `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
    /// Heredoc body ended without seeing the closing delimiter on its own line.
    /// The user almost certainly meant to type the delimiter — silently using
    /// whatever was collected up to EOF would mask missing data.
    UnterminatedHeredoc { delimiter: String },
}
121
122impl fmt::Display for LexerError {
123    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
124        match self {
125            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
126            LexerError::UnterminatedString => write!(f, "unterminated string"),
127            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
128            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
129            LexerError::InvalidNumber => write!(f, "invalid number"),
130            LexerError::AmbiguousBoolean(s) => {
131                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
132            }
133            LexerError::AmbiguousBooleanLike(s) => {
134                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
135                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
136            }
137            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
138            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
139            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
140            LexerError::UnterminatedHeredoc { delimiter } => {
141                write!(f, "unterminated heredoc, expected closing delimiter `{}` on its own line", delimiter)
142            }
143        }
144    }
145}
146
/// Here-doc content data.
///
/// - `literal` is true when the delimiter was quoted (`<<'EOF'` or `<<"EOF"`),
///   meaning no variable expansion should occur.
/// - `strip_tabs` is true for the `<<-EOF` form. Per POSIX, leading tabs on
///   each body line are stripped at materialization time. Stripping happens
///   downstream of the parser so byte offsets in `content` stay aligned with
///   their original-source positions for span-tracking purposes.
/// - `body_start_offset` is the byte offset of the first character of `content`
///   in the source string fed into the lexer's `tokenize`. This lets the parser
///   compute absolute spans for parts found inside the body during interpolation.
///   In sources without arithmetic preprocessing rewrites, this equals the
///   original-source offset; with arithmetic before the heredoc, line numbers
///   may shift slightly until full preprocessing-layer composition lands.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// Body text of the here-doc (without the delimiter lines).
    pub content: String,
    /// True when the delimiter was quoted; no variable expansion should occur.
    pub literal: bool,
    /// True for the `<<-EOF` form; leading tabs are stripped downstream.
    pub strip_tabs: bool,
    /// Byte offset of the first character of `content` in the lexed source.
    pub body_start_offset: usize,
}

/// Tokens produced by the kaish lexer.
///
/// Tokens that carry semantic values (strings, numbers, identifiers) include
/// the parsed value directly. This ensures the parser has access to actual
/// data, not just token types.
///
/// The section comments below describe the intended precedence between
/// overlapping patterns (keywords over identifiers, multi-character operators
/// over single-character ones, flags over numbers). Several patterns also
/// carry an explicit `priority = N` attribute.
///
/// NOTE(review): the section comments phrase precedence as "must come before";
/// logos is documented as resolving overlapping patterns by match length and
/// pattern priority — confirm before relying on declaration order.
///
/// `GlobWord`, `Arithmetic` and `HereDoc` have no logos pattern; they are
/// synthesized outside the generated lexer.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ═══════════════════════════════════════════════════════════════════
    // Keywords (must come before Ident for priority)
    // ═══════════════════════════════════════════════════════════════════
    #[token("set")]
    Set,

    #[token("local")]
    Local,

    #[token("if")]
    If,

    #[token("then")]
    Then,

    #[token("else")]
    Else,

    #[token("elif")]
    Elif,

    #[token("fi")]
    Fi,

    #[token("for")]
    For,

    #[token("while")]
    While,

    #[token("in")]
    In,

    #[token("do")]
    Do,

    #[token("done")]
    Done,

    #[token("case")]
    Case,

    #[token("esac")]
    Esac,

    #[token("function")]
    Function,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("exit")]
    Exit,

    #[token("true")]
    True,

    #[token("false")]
    False,

    // ═══════════════════════════════════════════════════════════════════
    // Type keywords (for tool parameters)
    // ═══════════════════════════════════════════════════════════════════
    #[token("string")]
    TypeString,

    #[token("int")]
    TypeInt,

    #[token("float")]
    TypeFloat,

    #[token("bool")]
    TypeBool,

    // ═══════════════════════════════════════════════════════════════════
    // Multi-character operators (must come before single-char versions)
    // ═══════════════════════════════════════════════════════════════════
    #[token("&&")]
    And,

    #[token("||")]
    Or,

    #[token("==")]
    EqEq,

    #[token("!=")]
    NotEq,

    #[token("=~")]
    Match,

    #[token("!~")]
    NotMatch,

    #[token(">=")]
    GtEq,

    #[token("<=")]
    LtEq,

    #[token(">>")]
    GtGt,

    #[token("2>&1")]
    StderrToStdout,

    #[token("1>&2")]
    StdoutToStderr,

    #[token(">&2")]
    StdoutToStderr2,

    #[token("2>")]
    Stderr,

    #[token("&>")]
    Both,

    #[token("<<<")]
    HereString,

    #[token("<<")]
    HereDocStart,

    #[token(";;")]
    DoubleSemi,

    // ═══════════════════════════════════════════════════════════════════
    // Single-character operators and punctuation
    // ═══════════════════════════════════════════════════════════════════
    #[token("=")]
    Eq,

    #[token("|")]
    Pipe,

    #[token("&")]
    Amp,

    #[token(">")]
    Gt,

    #[token("<")]
    Lt,

    #[token(";")]
    Semi,

    #[token(":")]
    Colon,

    #[token(",")]
    Comma,

    #[token("..")]
    DotDot,

    #[token(".")]
    Dot,

    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// Bare tilde: `~` alone (expands to $HOME)
    #[token("~")]
    Tilde,

    /// Relative path: `../foo/bar` or bare `src/kaish` (ident containing `/`)
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*/[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// Dot-slash path: `./foo`, `./script.sh`
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    /// Dot-prefixed bareword: `.parent`, `.gitignore`, `.foo.bar`.
    /// Treated as an opaque string in argv position. Distinct from `Token::Dot`
    /// (the POSIX `.` source alias) which only matches a bare `.` — the source
    /// alias requires whitespace before its file argument (`. script`), so
    /// `.parent` (no space) is unambiguously a single bareword.
    #[regex(r"\.[a-zA-Z_][a-zA-Z0-9_.-]*", lex_dotted_ident, priority = 3)]
    DottedIdent(String),

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("*")]
    Star,

    #[token("!")]
    Bang,

    #[token("?")]
    Question,

    /// Merged glob word: span-adjacent tokens containing `*`, `?`, or `[...]`.
    /// Synthesized by `merge_glob_adjacent()`, never produced by logos directly.
    GlobWord(String),

    // ═══════════════════════════════════════════════════════════════════
    // Arithmetic and command substitution
    // ═══════════════════════════════════════════════════════════════════

    /// Arithmetic expression content: synthesized by preprocessing.
    /// Contains the expression string between `$((` and `))`.
    Arithmetic(String),

    /// Command substitution start: `$(` - begins a command substitution
    #[token("$(")]
    CmdSubstStart,

    // ═══════════════════════════════════════════════════════════════════
    // Flags (must come before Int to win over negative numbers)
    // ═══════════════════════════════════════════════════════════════════

    /// Long flag: `--name` or `--foo-bar` - value is the name without the leading `--`
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// Short flag: `-l` or `-la` (combined short flags) - value omits the leading `-`
    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// Plus flag: `+e` or `+x` (for set +e to disable options) - value omits the leading `+`
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// Double dash: `--` alone marks end of flags
    #[token("--")]
    DoubleDash,

    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
    /// For date format strings and similar. Lower priority than PlusFlag.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
    /// Excludes - after first - to avoid matching --name patterns.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
    /// Only matches when followed by whitespace or end.
    /// This is handled specially in the parser as a positional arg.
    #[token("-")]
    MinusAlone,

    // ═══════════════════════════════════════════════════════════════════
    // Literals (with values)
    // ═══════════════════════════════════════════════════════════════════

    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string: `'...'` - literal content, no escape processing
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the full
    /// `${...}` text as written (braces included), kept for later parsing of
    /// path segments
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// Simple variable reference: `$NAME` - just the identifier
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// Positional parameter: `$0` through `$9`
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    /// All positional parameters: `$@`
    #[token("$@")]
    AllArgs,

    /// Number of positional parameters: `$#`
    #[token("$#")]
    ArgCount,

    /// Last exit code: `$?`
    #[token("$?")]
    LastExitCode,

    /// Current shell PID: `$$`
    #[token("$$")]
    CurrentPid,

    /// Variable string length: `${#VAR}` - value is the variable name only
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Here-doc content: synthesized by preprocessing, not directly lexed.
    /// Contains the full content of the here-doc (without the delimiter lines).
    HereDoc(HereDocData),

    /// Integer literal - value is the parsed i64
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Float literal - value is the parsed f64
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    // ═══════════════════════════════════════════════════════════════════
    // Digit-leading barewords and invalid numeric patterns
    // (invalid patterns are caught before valid tokens for better errors)
    // ═══════════════════════════════════════════════════════════════════

    /// Digit-leading bareword: `019dda1c` (SHA prefix), UUIDs, version-ish
    /// strings. Distinguished from `Int` because at least one alpha character
    /// follows the leading digits — the lexer commits to "this is a string,
    /// not a number." Treated as a bareword string in expression position.
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_.-]*", lex_number_ident, priority = 3)]
    NumberIdent(String),

    /// Invalid: float without leading digit (like .5)
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// Invalid: float without trailing digit (like 5.)
    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ═══════════════════════════════════════════════════════════════════
    // Paths (absolute paths starting with /)
    // ═══════════════════════════════════════════════════════════════════

    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    // ═══════════════════════════════════════════════════════════════════
    // Identifiers (command names, variable names, etc.)
    // ═══════════════════════════════════════════════════════════════════

    /// Identifier - value is the identifier string
    /// Allows dots for filenames like `script.kai`
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ═══════════════════════════════════════════════════════════════════
    // Structural tokens
    // ═══════════════════════════════════════════════════════════════════

    /// Comment: `# ...` to end of line
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// Newline (significant in kaish - ends statements)
    #[regex(r"\n|\r\n")]
    Newline,

    /// Line continuation: backslash at end of line
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,
}
563
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// The token-to-category mapping is defined by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    /// Also covers `true`/`false` and the type keywords (string, int, float, bool).
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc. (includes redirections
    /// and the here-doc / here-string introducers)
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, ${#bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; ;; : , . ( ) { } [ ] ! ? * and `$(`, plus newlines and
    /// line continuations
    Punctuation,
    /// Identifiers and other bare words (`+%s`, `-`, digit-leading and
    /// dot-prefixed barewords)
    Command,
    /// Paths: absolute (/foo/bar), relative, `./`, `~`-based, `..`, and glob words
    Path,
    /// Flags: --long, -s, +x, and the bare `--`
    Flag,
    /// Invalid tokens
    Error,
}
593
594impl Token {
595    /// Returns the semantic category for syntax highlighting.
596    pub fn category(&self) -> TokenCategory {
597        match self {
598            // Keywords
599            Token::If
600            | Token::Then
601            | Token::Else
602            | Token::Elif
603            | Token::Fi
604            | Token::For
605            | Token::In
606            | Token::Do
607            | Token::Done
608            | Token::While
609            | Token::Case
610            | Token::Esac
611            | Token::Function
612            | Token::Return
613            | Token::Break
614            | Token::Continue
615            | Token::Exit
616            | Token::Set
617            | Token::Local
618            | Token::True
619            | Token::False
620            | Token::TypeString
621            | Token::TypeInt
622            | Token::TypeFloat
623            | Token::TypeBool => TokenCategory::Keyword,
624
625            // Operators and redirections
626            Token::Pipe
627            | Token::And
628            | Token::Or
629            | Token::Amp
630            | Token::Eq
631            | Token::EqEq
632            | Token::NotEq
633            | Token::Match
634            | Token::NotMatch
635            | Token::Lt
636            | Token::Gt
637            | Token::LtEq
638            | Token::GtEq
639            | Token::GtGt
640            | Token::Stderr
641            | Token::Both
642            | Token::HereDocStart
643            | Token::HereString
644            | Token::StderrToStdout
645            | Token::StdoutToStderr
646            | Token::StdoutToStderr2 => TokenCategory::Operator,
647
648            // Strings
649            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
650
651            // Numbers
652            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
653
654            // Variables
655            Token::VarRef(_)
656            | Token::SimpleVarRef(_)
657            | Token::Positional(_)
658            | Token::AllArgs
659            | Token::ArgCount
660            | Token::VarLength(_)
661            | Token::LastExitCode
662            | Token::CurrentPid => TokenCategory::Variable,
663
664            // Flags
665            Token::LongFlag(_)
666            | Token::ShortFlag(_)
667            | Token::PlusFlag(_)
668            | Token::DoubleDash => TokenCategory::Flag,
669
670            // Punctuation
671            Token::Semi
672            | Token::DoubleSemi
673            | Token::Colon
674            | Token::Comma
675            | Token::Dot
676            | Token::LParen
677            | Token::RParen
678            | Token::LBrace
679            | Token::RBrace
680            | Token::LBracket
681            | Token::RBracket
682            | Token::Bang
683            | Token::Question
684            | Token::Star
685            | Token::Newline
686            | Token::LineContinuation
687            | Token::CmdSubstStart => TokenCategory::Punctuation,
688
689            // Glob words (merged tokens containing wildcards)
690            Token::GlobWord(_) => TokenCategory::Path,
691
692            // Comments
693            Token::Comment => TokenCategory::Comment,
694
695            // Paths
696            Token::Path(_)
697            | Token::TildePath(_)
698            | Token::RelativePath(_)
699            | Token::Tilde
700            | Token::DotDot
701            | Token::DotSlashPath(_) => TokenCategory::Path,
702
703            // Commands/identifiers (and bare words)
704            Token::Ident(_)
705            | Token::PlusBare(_)
706            | Token::MinusBare(_)
707            | Token::MinusAlone
708            | Token::NumberIdent(_)
709            | Token::DottedIdent(_) => TokenCategory::Command,
710
711            // Errors
712            Token::InvalidFloatNoLeading
713            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
714        }
715    }
716}
717
718/// Lex a double-quoted string literal, processing escape sequences.
719fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
720    parse_string_literal(lex.slice())
721}
722
723/// Lex a single-quoted string literal (no escape processing).
724fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
725    let s = lex.slice();
726    // Strip the surrounding single quotes
727    s[1..s.len() - 1].to_string()
728}
729
730/// Lex a braced variable reference, extracting the inner content.
731fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
732    // Keep the full ${...} for later parsing of path segments
733    lex.slice().to_string()
734}
735
736/// Lex a simple variable reference: `$NAME` → `NAME`
737fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
738    // Strip the leading `$`
739    lex.slice()[1..].to_string()
740}
741
742/// Lex a positional parameter: `$1` → 1
743fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
744    // Strip the leading `$` and parse the digit
745    lex.slice()[1..].parse().unwrap_or(0)
746}
747
748/// Lex a variable length: `${#VAR}` → "VAR"
749fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
750    // Strip the leading `${#` and trailing `}`
751    let s = lex.slice();
752    s[3..s.len() - 1].to_string()
753}
754
755/// Lex an integer literal.
756fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
757    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
758}
759
760/// Lex a float literal.
761fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
762    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
763}
764
765/// Lex a digit-leading bareword like `019dda1c` or `019dda1c-5b3f-7000`.
766/// Distinguished from `Int` because at least one alpha character follows the
767/// leading digits — the slice is treated as a string, not a number.
768fn lex_number_ident(lex: &mut logos::Lexer<Token>) -> String {
769    lex.slice().to_string()
770}
771
772/// Lex a dot-prefixed bareword like `.gitignore` or `.parent.parent`.
773fn lex_dotted_ident(lex: &mut logos::Lexer<Token>) -> String {
774    lex.slice().to_string()
775}
776
777/// Lex an invalid float without leading digit (like .5).
778/// Always returns Err to produce a lexer error instead of a token.
779fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
780    Err(LexerError::InvalidFloatNoLeading)
781}
782
783/// Lex an invalid float without trailing digit (like 5.).
784/// Always returns Err to produce a lexer error instead of a token.
785fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
786    Err(LexerError::InvalidFloatNoTrailing)
787}
788
789/// Lex an identifier, rejecting ambiguous boolean-like values.
790fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
791    let s = lex.slice();
792
793    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
794    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
795    match s.to_lowercase().as_str() {
796        "true" | "false" if s != "true" && s != "false" => {
797            return Err(LexerError::AmbiguousBoolean(s.to_string()));
798        }
799        _ => {}
800    }
801
802    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
803    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
804        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
805    }
806
807    Ok(s.to_string())
808}
809
810/// Lex a long flag: `--name` → `name`
811fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
812    // Strip the leading `--`
813    lex.slice()[2..].to_string()
814}
815
816/// Lex a short flag: `-l` → `l`, `-la` → `la`
817fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
818    // Strip the leading `-`
819    lex.slice()[1..].to_string()
820}
821
822/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
823fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
824    // Strip the leading `+`
825    lex.slice()[1..].to_string()
826}
827
828/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
829fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
830    lex.slice().to_string()
831}
832
833/// Lex a minus bare word: `-%` → `-%` (keep the full string)
834fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
835    lex.slice().to_string()
836}
837
838/// Lex an absolute path: `/tmp/out` → `/tmp/out`
839fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
840    lex.slice().to_string()
841}
842
843/// Lex a tilde path: `~/foo` → `~/foo`
844fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
845    lex.slice().to_string()
846}
847
848/// Lex a relative path: `../foo` → `../foo`
849fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
850    lex.slice().to_string()
851}
852
853/// Lex a dot-slash path: `./foo` → `./foo`
854fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
855    lex.slice().to_string()
856}
857
858impl fmt::Display for Token {
859    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
860        match self {
861            Token::Set => write!(f, "set"),
862            Token::Local => write!(f, "local"),
863            Token::If => write!(f, "if"),
864            Token::Then => write!(f, "then"),
865            Token::Else => write!(f, "else"),
866            Token::Elif => write!(f, "elif"),
867            Token::Fi => write!(f, "fi"),
868            Token::For => write!(f, "for"),
869            Token::While => write!(f, "while"),
870            Token::In => write!(f, "in"),
871            Token::Do => write!(f, "do"),
872            Token::Done => write!(f, "done"),
873            Token::Case => write!(f, "case"),
874            Token::Esac => write!(f, "esac"),
875            Token::Function => write!(f, "function"),
876            Token::Break => write!(f, "break"),
877            Token::Continue => write!(f, "continue"),
878            Token::Return => write!(f, "return"),
879            Token::Exit => write!(f, "exit"),
880            Token::True => write!(f, "true"),
881            Token::False => write!(f, "false"),
882            Token::TypeString => write!(f, "string"),
883            Token::TypeInt => write!(f, "int"),
884            Token::TypeFloat => write!(f, "float"),
885            Token::TypeBool => write!(f, "bool"),
886            Token::And => write!(f, "&&"),
887            Token::Or => write!(f, "||"),
888            Token::EqEq => write!(f, "=="),
889            Token::NotEq => write!(f, "!="),
890            Token::Match => write!(f, "=~"),
891            Token::NotMatch => write!(f, "!~"),
892            Token::GtEq => write!(f, ">="),
893            Token::LtEq => write!(f, "<="),
894            Token::GtGt => write!(f, ">>"),
895            Token::StderrToStdout => write!(f, "2>&1"),
896            Token::StdoutToStderr => write!(f, "1>&2"),
897            Token::StdoutToStderr2 => write!(f, ">&2"),
898            Token::Stderr => write!(f, "2>"),
899            Token::Both => write!(f, "&>"),
900            Token::HereDocStart => write!(f, "<<"),
901            Token::HereString => write!(f, "<<<"),
902            Token::DoubleSemi => write!(f, ";;"),
903            Token::Eq => write!(f, "="),
904            Token::Pipe => write!(f, "|"),
905            Token::Amp => write!(f, "&"),
906            Token::Gt => write!(f, ">"),
907            Token::Lt => write!(f, "<"),
908            Token::Semi => write!(f, ";"),
909            Token::Colon => write!(f, ":"),
910            Token::Comma => write!(f, ","),
911            Token::Dot => write!(f, "."),
912            Token::DotDot => write!(f, ".."),
913            Token::Tilde => write!(f, "~"),
914            Token::TildePath(s) => write!(f, "{}", s),
915            Token::RelativePath(s) => write!(f, "{}", s),
916            Token::DotSlashPath(s) => write!(f, "{}", s),
917            Token::LBrace => write!(f, "{{"),
918            Token::RBrace => write!(f, "}}"),
919            Token::LBracket => write!(f, "["),
920            Token::RBracket => write!(f, "]"),
921            Token::LParen => write!(f, "("),
922            Token::RParen => write!(f, ")"),
923            Token::Star => write!(f, "*"),
924            Token::Bang => write!(f, "!"),
925            Token::Question => write!(f, "?"),
926            Token::GlobWord(s) => write!(f, "GLOB({})", s),
927            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
928            Token::CmdSubstStart => write!(f, "$("),
929            Token::LongFlag(s) => write!(f, "--{}", s),
930            Token::ShortFlag(s) => write!(f, "-{}", s),
931            Token::PlusFlag(s) => write!(f, "+{}", s),
932            Token::DoubleDash => write!(f, "--"),
933            Token::PlusBare(s) => write!(f, "{}", s),
934            Token::MinusBare(s) => write!(f, "{}", s),
935            Token::MinusAlone => write!(f, "-"),
936            Token::String(s) => write!(f, "STRING({:?})", s),
937            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
938            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
939            Token::VarRef(v) => write!(f, "VARREF({})", v),
940            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
941            Token::Positional(n) => write!(f, "${}", n),
942            Token::AllArgs => write!(f, "$@"),
943            Token::ArgCount => write!(f, "$#"),
944            Token::LastExitCode => write!(f, "$?"),
945            Token::CurrentPid => write!(f, "$$"),
946            Token::VarLength(v) => write!(f, "${{#{}}}", v),
947            Token::Int(n) => write!(f, "INT({})", n),
948            Token::Float(n) => write!(f, "FLOAT({})", n),
949            Token::Path(s) => write!(f, "PATH({})", s),
950            Token::Ident(s) => write!(f, "IDENT({})", s),
951            Token::NumberIdent(s) => write!(f, "NUMIDENT({})", s),
952            Token::DottedIdent(s) => write!(f, "DOTIDENT({})", s),
953            Token::Comment => write!(f, "COMMENT"),
954            Token::Newline => write!(f, "NEWLINE"),
955            Token::LineContinuation => write!(f, "LINECONT"),
956            // These variants should never be produced — their callbacks always return errors
957            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
958            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
959        }
960    }
961}
962
963impl Token {
964    /// Returns true if this token is a keyword.
965    // Must match the Keyword variants in `Token::category()` (minus the
966    // TypeX variants, which `is_type()` covers separately). Currently
967    // uncalled — kept exhaustive so future callers don't get wrong answers.
968    pub fn is_keyword(&self) -> bool {
969        matches!(
970            self,
971            Token::Set
972                | Token::Local
973                | Token::If
974                | Token::Then
975                | Token::Else
976                | Token::Elif
977                | Token::Fi
978                | Token::For
979                | Token::In
980                | Token::Do
981                | Token::Done
982                | Token::While
983                | Token::Case
984                | Token::Esac
985                | Token::Function
986                | Token::Return
987                | Token::Break
988                | Token::Continue
989                | Token::Exit
990                | Token::True
991                | Token::False
992        )
993    }
994
995    /// Returns true if this token is a type keyword.
996    pub fn is_type(&self) -> bool {
997        matches!(
998            self,
999            Token::TypeString
1000                | Token::TypeInt
1001                | Token::TypeFloat
1002                | Token::TypeBool
1003        )
1004    }
1005
1006    /// Returns true if this token starts a statement.
1007    // Currently uncalled — kept exhaustive so future callers don't get wrong answers.
1008    pub fn starts_statement(&self) -> bool {
1009        matches!(
1010            self,
1011            Token::Set
1012                | Token::Local
1013                | Token::Function
1014                | Token::If
1015                | Token::For
1016                | Token::While
1017                | Token::Case
1018                | Token::Ident(_)
1019                | Token::LBracket
1020        )
1021    }
1022
1023    /// Returns true if this token can appear in an expression.
1024    pub fn is_value(&self) -> bool {
1025        matches!(
1026            self,
1027            Token::String(_)
1028                | Token::SingleString(_)
1029                | Token::HereDoc(_)
1030                | Token::Arithmetic(_)
1031                | Token::Int(_)
1032                | Token::Float(_)
1033                | Token::True
1034                | Token::False
1035                | Token::VarRef(_)
1036                | Token::SimpleVarRef(_)
1037                | Token::CmdSubstStart
1038                | Token::Path(_)
1039                | Token::GlobWord(_)
1040                | Token::LastExitCode
1041                | Token::CurrentPid
1042        )
1043    }
1044}
1045
/// Result of preprocessing arithmetic expressions.
///
/// Built by `preprocess_arithmetic` in a single pass over the source; every
/// `$((expr))` that is replaced contributes one entry to `arithmetics` and one
/// entry to `replacements`, pushed in the order the expressions are encountered.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs, where the expression
    /// content is the text found between `$((` and the closing `))`.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions: for each marker, its
    /// byte position and length in `text` plus the byte length of the original
    /// `$((...))` text it stands for.
    replacements: Vec<SpanReplacement>,
}
1055
/// Copy one `$(...)` command substitution to `result` without interpreting it.
///
/// The scan is quote-aware so that parentheses inside strings do not disturb
/// the nesting count:
/// - inside single quotes everything up to the closing `'` is literal;
/// - inside double quotes a backslash followed by `"`, `\`, `$` or a backtick
///   forms an escape pair, and only an unescaped `"` closes the string;
/// - outside quotes a backslash escapes whatever character follows it.
///
/// # Parameters
/// - `chars`: the full source as a char slice.
/// - `i`: char index; on entry it points at the `$` of `$(`, on exit it points
///   just past the matching `)` (or at the end of `chars` if the substitution
///   is never closed).
/// - `source_pos`: byte offset kept in step with `i`; advanced by the UTF-8
///   length of every char consumed.
/// - `result`: output buffer receiving the consumed text verbatim.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    /// Quoting context the scanner is currently in.
    enum Quote {
        None,
        Single,
        Double,
    }

    // The caller has already established that the next two chars are `$(`.
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1;
    let mut quote = Quote::None;

    while open_parens > 0 {
        let Some(&c) = chars.get(*i) else { break };
        let following = chars.get(*i + 1).copied();

        // Update the scanner state and decide how many chars this step
        // consumes: 2 for an escape pair, otherwise 1.
        let take = match quote {
            Quote::Single => {
                if c == '\'' {
                    quote = Quote::None;
                }
                1
            }
            Quote::Double => match (c, following) {
                ('\\', Some('"' | '\\' | '$' | '`')) => 2,
                ('"', _) => {
                    quote = Quote::None;
                    1
                }
                _ => 1,
            },
            Quote::None => match (c, following) {
                ('\'', _) => {
                    quote = Quote::Single;
                    1
                }
                ('"', _) => {
                    quote = Quote::Double;
                    1
                }
                // A trailing backslash with nothing after it is copied alone.
                ('\\', Some(_)) => 2,
                ('(', _) => {
                    open_parens += 1;
                    1
                }
                (')', _) => {
                    open_parens -= 1;
                    1
                }
                _ => 1,
            },
        };

        // `take` is 2 only when `following` exists, so the slice is in bounds.
        for &copied in &chars[*i..*i + take] {
            result.push(copied);
            *source_pos += copied.len_utf8();
        }
        *i += take;
    }
}
1153
1154/// Preprocess arithmetic expressions in source code.
1155///
1156/// Finds `$((expr))` patterns and replaces them with markers.
1157/// Returns the preprocessed source, arithmetic contents, and span replacement info.
1158///
1159/// Example:
1160///   `X=$((1 + 2))`
1161/// Becomes:
1162///   `X=__KAISH_ARITH_{id}__`
1163/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
1164///
1165/// # Errors
1166/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
1167fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
1168    let mut result = String::with_capacity(source.len());
1169    let mut arithmetics: Vec<(String, String)> = Vec::new();
1170    let mut replacements: Vec<SpanReplacement> = Vec::new();
1171    let mut source_pos: usize = 0;
1172    let chars_vec: Vec<char> = source.chars().collect();
1173    let mut i = 0;
1174
1175    // Whether we're currently inside double quotes. Single quotes inside
1176    // double quotes are literal characters, not quote delimiters.
1177    let mut in_double_quote = false;
1178
1179    while i < chars_vec.len() {
1180        let ch = chars_vec[i];
1181
1182        // Backslash escape outside quotes — skip both chars verbatim
1183        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1184            result.push(ch);
1185            result.push(chars_vec[i + 1]);
1186            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
1187            i += 2;
1188            continue;
1189        }
1190
1191        // Single quote — only starts quote mode when NOT inside double quotes
1192        if ch == '\'' && !in_double_quote {
1193            result.push(ch);
1194            i += 1;
1195            source_pos += 1;
1196            while i < chars_vec.len() && chars_vec[i] != '\'' {
1197                result.push(chars_vec[i]);
1198                source_pos += chars_vec[i].len_utf8();
1199                i += 1;
1200            }
1201            if i < chars_vec.len() {
1202                result.push(chars_vec[i]); // closing quote
1203                source_pos += 1;
1204                i += 1;
1205            }
1206            continue;
1207        }
1208
1209        // Double quote — toggle state (arithmetic is still expanded inside)
1210        if ch == '"' {
1211            in_double_quote = !in_double_quote;
1212            result.push(ch);
1213            i += 1;
1214            source_pos += 1;
1215            continue;
1216        }
1217
1218        // Backslash escape inside double quotes — only \" and \\ are special
1219        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1220            let next = chars_vec[i + 1];
1221            if next == '"' || next == '\\' || next == '$' || next == '`' {
1222                result.push(ch);
1223                result.push(next);
1224                source_pos += ch.len_utf8() + next.len_utf8();
1225                i += 2;
1226                continue;
1227            }
1228        }
1229
1230        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand
1231        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
1232            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
1233        {
1234            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
1235            continue;
1236        }
1237
1238        // Look for $(( (potential arithmetic)
1239        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
1240            let arith_start_pos = result.len();
1241            let original_start = source_pos;
1242
1243            // Skip $((
1244            i += 3;
1245            source_pos += 3;
1246
1247            // Collect expression until matching ))
1248            let mut expr = String::new();
1249            let mut paren_depth: usize = 0;
1250
1251            while i < chars_vec.len() {
1252                let c = chars_vec[i];
1253                match c {
1254                    '(' => {
1255                        paren_depth += 1;
1256                        if paren_depth > MAX_PAREN_DEPTH {
1257                            return Err(LexerError::NestingTooDeep);
1258                        }
1259                        expr.push('(');
1260                        i += 1;
1261                        source_pos += c.len_utf8();
1262                    }
1263                    ')' => {
1264                        if paren_depth > 0 {
1265                            paren_depth -= 1;
1266                            expr.push(')');
1267                            i += 1;
1268                            source_pos += 1;
1269                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
1270                            // Found closing ))
1271                            i += 2;
1272                            source_pos += 2;
1273                            break;
1274                        } else {
1275                            // Single ) inside - keep going
1276                            expr.push(')');
1277                            i += 1;
1278                            source_pos += 1;
1279                        }
1280                    }
1281                    _ => {
1282                        expr.push(c);
1283                        i += 1;
1284                        source_pos += c.len_utf8();
1285                    }
1286                }
1287            }
1288
1289            // Calculate original length: from $$(( to ))
1290            let original_len = source_pos - original_start;
1291
1292            // Create a unique marker for this arithmetic (collision-resistant)
1293            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
1294            let marker_len = marker.len();
1295
1296            // Record the replacement for span correction
1297            replacements.push(SpanReplacement {
1298                preprocessed_pos: arith_start_pos,
1299                marker_len,
1300                original_len,
1301            });
1302
1303            arithmetics.push((marker.clone(), expr));
1304            result.push_str(&marker);
1305        } else {
1306            result.push(ch);
1307            i += 1;
1308            source_pos += ch.len_utf8();
1309        }
1310    }
1311
1312    Ok(ArithmeticPreprocessResult {
1313        text: result,
1314        arithmetics,
1315        replacements,
1316    })
1317}
1318
/// Per-heredoc metadata collected during preprocessing.
///
/// Stored verbatim alongside the substituted marker so the parser, validator,
/// and interpreter can reconstitute the body with correct semantics:
/// - `body` is the raw body bytes; tab stripping for `<<-` is applied later
///   (at materialization), so byte offsets stay aligned with the original
///   source for span tracking.
/// - `strip_tabs` records whether the `<<-` form was used.
/// - `literal` records whether the delimiter was quoted (no interpolation).
/// - `body_start_offset` is the byte offset of the first body character in
///   the source string passed to `preprocess_heredocs`. When heredocs are
///   preprocessed AFTER arithmetic, this is in arith-preprocessed coordinates;
///   in the common case (no arithmetic before the heredoc) this equals the
///   original-source offset. See span-correction notes in `tokenize`.
#[derive(Debug, Clone)]
struct HeredocReplacement {
    /// Unique marker text that `preprocess_heredocs` writes after `<<` in
    /// place of the delimiter and body.
    marker: String,
    /// Body lines exactly as collected, each followed by the line ending it
    /// had in the source; the terminating delimiter line is not included.
    body: String,
    /// True when the delimiter was wrapped in single or double quotes.
    literal: bool,
    /// True when the introducer was `<<-`.
    strip_tabs: bool,
    /// Byte offset of the first body character in the preprocessor's input.
    body_start_offset: usize,
}
1341
1342/// Preprocess here-docs in source code.
1343///
1344/// Finds `<<WORD` patterns and collects content until the delimiter line.
1345/// Returns the preprocessed source and a vector of replacement records.
1346///
1347/// Example:
1348///   `cat <<EOF\nhello\nworld\nEOF`
1349/// Becomes:
1350///   `cat <<__HEREDOC_0__`
1351/// With heredocs[0] = HeredocReplacement { marker: "__HEREDOC_0__",
1352/// body: "hello\nworld", literal: false, strip_tabs: false }
1353fn preprocess_heredocs(source: &str) -> Result<(String, Vec<HeredocReplacement>), Spanned<LexerError>> {
1354    let mut result = String::with_capacity(source.len());
1355    let mut heredocs: Vec<HeredocReplacement> = Vec::new();
1356    let chars_vec: Vec<char> = source.chars().collect();
1357    let mut i = 0;
1358    // `pos` tracks the byte offset into `source` corresponding to chars_vec[i].
1359    // `result` accumulates output; we record body offsets in `pos` (input-side)
1360    // and emit positions via `result.len()` (output-side) where needed.
1361    let mut pos: usize = 0;
1362
1363    while i < chars_vec.len() {
1364        let ch = chars_vec[i];
1365
1366        // Pass <<< through verbatim so the logos tokenizer sees the here-string
1367        // operator. If we fell through naively, the next iteration would see
1368        // the remaining `<<` and misfire heredoc preprocessing.
1369        if ch == '<'
1370            && chars_vec.get(i + 1) == Some(&'<')
1371            && chars_vec.get(i + 2) == Some(&'<')
1372        {
1373            result.push_str("<<<");
1374            i += 3;
1375            pos += 3;
1376            continue;
1377        }
1378
1379        // Look for << (potential here-doc).
1380        if ch == '<' && chars_vec.get(i + 1) == Some(&'<') {
1381            // Remember where the `<<` started so an unterminated-heredoc
1382            // error can point back at the introducer rather than at EOF.
1383            let introducer_start = pos;
1384            i += 2; // consume both '<'
1385            pos += 2;
1386
1387            // Check for optional - (strip leading tabs)
1388            let strip_tabs = chars_vec.get(i) == Some(&'-');
1389            if strip_tabs {
1390                i += 1;
1391                pos += 1;
1392            }
1393
1394            // Skip whitespace before delimiter
1395            while let Some(&c) = chars_vec.get(i) {
1396                if c == ' ' || c == '\t' {
1397                    i += 1;
1398                    pos += 1;
1399                } else {
1400                    break;
1401                }
1402            }
1403
1404            // Collect the delimiter word
1405            let mut delimiter = String::new();
1406            let quoted = chars_vec.get(i) == Some(&'\'') || chars_vec.get(i) == Some(&'"');
1407            let quote_char = if quoted {
1408                let q = chars_vec.get(i).copied();
1409                i += 1;
1410                pos += 1;
1411                q
1412            } else {
1413                None
1414            };
1415
1416            while let Some(&c) = chars_vec.get(i) {
1417                if quoted {
1418                    if Some(c) == quote_char {
1419                        i += 1; // consume closing quote
1420                        pos += 1;
1421                        break;
1422                    }
1423                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1424                    break;
1425                }
1426                delimiter.push(c);
1427                i += 1;
1428                pos += c.len_utf8();
1429            }
1430
1431            if delimiter.is_empty() {
1432                // Not a valid here-doc, output << literally
1433                result.push_str("<<");
1434                if strip_tabs {
1435                    result.push('-');
1436                }
1437                continue;
1438            }
1439
1440            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1441            // This must be emitted AFTER the heredoc marker, not before.
1442            let mut after_delimiter = String::new();
1443            while let Some(&c) = chars_vec.get(i) {
1444                if c == '\n' {
1445                    i += 1;
1446                    pos += 1;
1447                    break;
1448                } else if c == '\r' {
1449                    i += 1;
1450                    pos += 1;
1451                    if chars_vec.get(i) == Some(&'\n') {
1452                        i += 1;
1453                        pos += 1;
1454                    }
1455                    break;
1456                }
1457                after_delimiter.push(c);
1458                i += 1;
1459                pos += c.len_utf8();
1460            }
1461
1462            // Collect content until delimiter on its own line.
1463            // `body_start_offset` is the byte position of the first char of
1464            // the body in the source — first char after the newline that
1465            // ended the delimiter line. See HeredocReplacement docs for
1466            // coordinate-system caveat (arith-preprocessed, not original).
1467            let body_start_offset = pos;
1468            let mut content = String::new();
1469            let mut current_line = String::new();
1470
1471            loop {
1472                let next = chars_vec.get(i).copied();
1473                match next {
1474                    Some('\n') => {
1475                        i += 1;
1476                        pos += 1;
1477                        // Check if this line is the delimiter
1478                        let trimmed = if strip_tabs {
1479                            current_line.trim_start_matches('\t')
1480                        } else {
1481                            &current_line
1482                        };
1483                        if trimmed == delimiter {
1484                            // Found end of here-doc
1485                            break;
1486                        }
1487                        // Add line to content (including empty lines)
1488                        content.push_str(&current_line);
1489                        content.push('\n');
1490                        current_line.clear();
1491                    }
1492                    Some('\r') => {
1493                        i += 1;
1494                        pos += 1;
1495                        // Detect CRLF vs bare CR. We strip the line ending
1496                        // for delimiter matching (so `EOF\r` still matches
1497                        // `EOF`) but preserve the original byte sequence in
1498                        // the body content — the user's input is honored
1499                        // verbatim.
1500                        let crlf = chars_vec.get(i) == Some(&'\n');
1501                        if crlf {
1502                            i += 1;
1503                            pos += 1;
1504                        }
1505                        let trimmed = if strip_tabs {
1506                            current_line.trim_start_matches('\t')
1507                        } else {
1508                            &current_line
1509                        };
1510                        if trimmed == delimiter {
1511                            break;
1512                        }
1513                        content.push_str(&current_line);
1514                        content.push_str(if crlf { "\r\n" } else { "\r" });
1515                        current_line.clear();
1516                    }
1517                    Some(c) => {
1518                        current_line.push(c);
1519                        i += 1;
1520                        pos += c.len_utf8();
1521                    }
1522                    None => {
1523                        // EOF — check if current line is the delimiter (matches
1524                        // when the source ends without a trailing newline).
1525                        let trimmed = if strip_tabs {
1526                            current_line.trim_start_matches('\t')
1527                        } else {
1528                            &current_line
1529                        };
1530                        if trimmed == delimiter {
1531                            break;
1532                        }
1533                        // Not a delimiter — the heredoc was never closed.
1534                        // Crash rather than silently using whatever we
1535                        // collected: missing data is exactly the failure
1536                        // mode where silent fallback masks the bug.
1537                        let span_end = introducer_start
1538                            + 2
1539                            + if strip_tabs { 1 } else { 0 }
1540                            + delimiter.len();
1541                        return Err(Spanned::new(
1542                            LexerError::UnterminatedHeredoc {
1543                                delimiter: delimiter.clone(),
1544                            },
1545                            introducer_start..span_end,
1546                        ));
1547                    }
1548                }
1549            }
1550
1551            // Create a unique marker for this here-doc (collision-resistant)
1552            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1553            heredocs.push(HeredocReplacement {
1554                marker: marker.clone(),
1555                body: content,
1556                literal: quoted,
1557                strip_tabs,
1558                body_start_offset,
1559            });
1560
1561            // Output <<marker first, then any text that followed the delimiter
1562            // (e.g., " | jq") so the heredoc attaches to the correct command.
1563            result.push_str("<<");
1564            result.push_str(&marker);
1565            result.push_str(&after_delimiter);
1566            result.push('\n');
1567        } else {
1568            result.push(ch);
1569            i += 1;
1570            pos += ch.len_utf8();
1571        }
1572    }
1573
1574    Ok((result, heredocs))
1575}
1576
1577/// Extract the text contribution of a token for colon-adjacent merging.
1578///
1579/// Returns `Some(text)` for token types that can participate in word-like
1580/// merging, `None` for everything else.
1581fn mergeable_text(token: &Token) -> Option<String> {
1582    match token {
1583        Token::Ident(s) => Some(s.clone()),
1584        Token::NumberIdent(s) => Some(s.clone()),
1585        Token::DottedIdent(s) => Some(s.clone()),
1586        Token::Colon => Some(":".to_string()),
1587        Token::Int(n) => Some(n.to_string()),
1588        Token::Path(p) => Some(p.clone()),
1589        Token::Float(f) => Some(f.to_string()),
1590        _ => None,
1591    }
1592}
1593
1594/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1595///
1596/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1597/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1598///
1599/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1600/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1601/// colons or standalone tokens pass through unchanged.
1602fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1603    if tokens.is_empty() {
1604        return tokens;
1605    }
1606
1607    let mut result = Vec::with_capacity(tokens.len());
1608    let mut run: Vec<&Spanned<Token>> = Vec::new();
1609
1610    for token in &tokens {
1611        if run.is_empty() {
1612            if mergeable_text(&token.token).is_some() {
1613                run.push(token);
1614            } else {
1615                result.push(token.clone());
1616            }
1617            continue;
1618        }
1619
1620        // Check span adjacency: previous run's last token ends where this one starts
1621        // Safety: run is non-empty (checked above)
1622        let Some(last) = run.last() else { unreachable!() };
1623        let adjacent = last.span.end == token.span.start;
1624
1625        if adjacent && mergeable_text(&token.token).is_some() {
1626            run.push(token);
1627        } else {
1628            flush_colon_run(&mut run, &mut result);
1629            if mergeable_text(&token.token).is_some() {
1630                run.push(token);
1631            } else {
1632                result.push(token.clone());
1633            }
1634        }
1635    }
1636
1637    flush_colon_run(&mut run, &mut result);
1638
1639    result
1640}
1641
1642/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1643fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1644    if run.is_empty() {
1645        return;
1646    }
1647
1648    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1649
1650    if run.len() >= 2 && has_colon {
1651        let text: String = run
1652            .iter()
1653            .filter_map(|t| mergeable_text(&t.token))
1654            .collect();
1655        // Safety: run.len() >= 2 so first/last exist
1656        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1657        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1658        result.push(Spanned::new(Token::Ident(text), start..end));
1659    } else {
1660        for t in run.iter() {
1661            result.push((*t).clone());
1662        }
1663    }
1664
1665    run.clear();
1666}
1667
1668/// Extract the text contribution of a token that can participate in a glob word.
1669///
1670/// Returns `Some(text)` for tokens that can be part of a glob pattern (identifiers,
1671/// wildcard chars, brackets, paths, etc.), `None` for structural tokens.
1672fn glob_mergeable_text(token: &Token) -> Option<String> {
1673    match token {
1674        Token::Star => Some("*".to_string()),
1675        Token::Question => Some("?".to_string()),
1676        Token::Dot => Some(".".to_string()),
1677        Token::DotDot => Some("..".to_string()),
1678        Token::Ident(s) => Some(s.clone()),
1679        Token::NumberIdent(s) => Some(s.clone()),
1680        Token::DottedIdent(s) => Some(s.clone()),
1681        Token::Path(s) => Some(s.clone()),
1682        Token::Int(n) => Some(n.to_string()),
1683        Token::LBracket => Some("[".to_string()),
1684        Token::RBracket => Some("]".to_string()),
1685        Token::Bang => Some("!".to_string()),
1686        Token::DotSlashPath(s) => Some(s.clone()),
1687        Token::RelativePath(s) => Some(s.clone()),
1688        Token::TildePath(s) => Some(s.clone()),
1689        Token::Tilde => Some("~".to_string()),
1690        Token::LBrace => Some("{".to_string()),
1691        Token::RBrace => Some("}".to_string()),
1692        Token::Comma => Some(",".to_string()),
1693        _ => None,
1694    }
1695}
1696
1697/// Merge span-adjacent token runs containing glob metacharacters into `GlobWord` tokens.
1698///
1699/// A run is merged into `GlobWord` when it contains at least one `Star`, `Question`,
1700/// or a `LBracket`+`RBracket` pair. Runs without glob chars pass through unchanged.
1701///
1702/// Runs after colon merge: `foo::bar` stays as `Ident("foo::bar")` because colon merge
1703/// already fused it before this pass sees it.
1704fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1705    if tokens.is_empty() {
1706        return tokens;
1707    }
1708
1709    let mut result = Vec::with_capacity(tokens.len());
1710    let mut run: Vec<&Spanned<Token>> = Vec::new();
1711
1712    for token in &tokens {
1713        if run.is_empty() {
1714            if glob_mergeable_text(&token.token).is_some() {
1715                run.push(token);
1716            } else {
1717                result.push(token.clone());
1718            }
1719            continue;
1720        }
1721
1722        // Safety: run is non-empty (checked at top of loop)
1723        let Some(last) = run.last() else { unreachable!() };
1724        let adjacent = last.span.end == token.span.start;
1725
1726        if adjacent && glob_mergeable_text(&token.token).is_some() {
1727            run.push(token);
1728        } else {
1729            flush_glob_run(&mut run, &mut result);
1730            if glob_mergeable_text(&token.token).is_some() {
1731                run.push(token);
1732            } else {
1733                result.push(token.clone());
1734            }
1735        }
1736    }
1737
1738    flush_glob_run(&mut run, &mut result);
1739
1740    result
1741}
1742
1743/// Flush a run of glob-mergeable tokens: merge if it contains glob metacharacters.
1744fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1745    if run.is_empty() {
1746        return;
1747    }
1748
1749    let has_glob = run.iter().any(|t| {
1750        matches!(t.token, Token::Star | Token::Question)
1751    }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1752        && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1753
1754    if run.len() >= 2 && has_glob {
1755        let text: String = run
1756            .iter()
1757            .filter_map(|t| glob_mergeable_text(&t.token))
1758            .collect();
1759        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1760        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1761        result.push(Spanned::new(Token::GlobWord(text), start..end));
1762    } else {
1763        for t in run.iter() {
1764            result.push((*t).clone());
1765        }
1766    }
1767
1768    run.clear();
1769}
1770
1771/// Tokenize source code into a vector of spanned tokens.
1772///
1773/// Skips whitespace and comments (unless you need them for formatting).
1774/// Returns errors with their positions for nice error messages.
1775///
1776/// Handles:
1777/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
1778/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
1779/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
1780pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1781    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
1782    let arith_result = preprocess_arithmetic(source)
1783        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1784
1785    // Then preprocess here-docs. Spans inside the heredoc preprocessor are in
1786    // arith-preprocessed coords; correct back to original-source coords before
1787    // surfacing the error to keep parser diagnostics aligned with source.
1788    let span_replacements = arith_result.replacements;
1789    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text)
1790        .map_err(|e| {
1791            let span = correct_span(e.span, &span_replacements);
1792            vec![Spanned::new(e.token, span)]
1793        })?;
1794
1795    let lexer = Token::lexer(&preprocessed);
1796    let mut tokens = Vec::new();
1797    let mut errors = Vec::new();
1798
1799    for (result, span) in lexer.spanned() {
1800        // Correct the span from preprocessed coordinates to original coordinates
1801        let corrected_span = correct_span(span, &span_replacements);
1802        match result {
1803            Ok(token) => {
1804                // Skip comments and line continuations - they're not needed for parsing
1805                if !matches!(token, Token::Comment | Token::LineContinuation) {
1806                    tokens.push(Spanned::new(token, corrected_span));
1807                }
1808            }
1809            Err(err) => {
1810                errors.push(Spanned::new(err, corrected_span));
1811            }
1812        }
1813    }
1814
1815    if !errors.is_empty() {
1816        return Err(errors);
1817    }
1818
1819    // Post-process: replace markers with actual token content
1820    let mut final_tokens = Vec::with_capacity(tokens.len());
1821    let mut i = 0;
1822
1823    while i < tokens.len() {
1824        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
1825        if let Token::Ident(ref name) = tokens[i].token
1826            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1827                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1828                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1829                    i += 1;
1830                    continue;
1831                }
1832
1833        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
1834        if matches!(tokens[i].token, Token::HereDocStart) {
1835            // Check if next token is a heredoc marker
1836            if i + 1 < tokens.len()
1837                && let Token::Ident(ref name) = tokens[i + 1].token
1838                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1839                        // Find the corresponding content
1840                        if let Some(hd) = heredocs.iter().find(|h| h.marker == *name) {
1841                            // Re-thread arithmetic markers that the arith
1842                            // preprocessor planted in the source — without
1843                            // this, `<<EOF\n$((1+2))\nEOF` materializes the
1844                            // marker text instead of `3`. Mirrors the
1845                            // String-content translation a few lines below.
1846                            // - Literal heredocs (no expansion): restore the
1847                            //   original `$((expr))` text verbatim.
1848                            // - Interpolated heredocs: wrap as
1849                            //   `${__ARITH:expr__}` so the spanned
1850                            //   interpolation parser turns it into a
1851                            //   StringPart::Arithmetic.
1852                            let mut content = hd.body.clone();
1853                            for (marker, expr) in &arith_result.arithmetics {
1854                                if content.contains(marker) {
1855                                    let replacement = if hd.literal {
1856                                        format!("$(({}))", expr)
1857                                    } else {
1858                                        format!("${{__ARITH:{}__}}", expr)
1859                                    };
1860                                    content = content.replace(marker, &replacement);
1861                                }
1862                            }
1863                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1864                            final_tokens.push(Spanned::new(
1865                                Token::HereDoc(HereDocData {
1866                                    content,
1867                                    literal: hd.literal,
1868                                    strip_tabs: hd.strip_tabs,
1869                                    body_start_offset: hd.body_start_offset,
1870                                }),
1871                                tokens[i + 1].span.clone(),
1872                            ));
1873                            i += 2;
1874                            continue;
1875                        }
1876                    }
1877        }
1878
1879        // Check for arithmetic markers inside string content
1880        let token = if let Token::String(ref s) = tokens[i].token {
1881            // Check if string contains any arithmetic markers
1882            let mut new_content = s.clone();
1883            for (marker, expr) in &arith_result.arithmetics {
1884                if new_content.contains(marker) {
1885                    // Replace marker with the special format that parse_interpolated_string can detect
1886                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
1887                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1888                }
1889            }
1890            if new_content != *s {
1891                Spanned::new(Token::String(new_content), tokens[i].span.clone())
1892            } else {
1893                tokens[i].clone()
1894            }
1895        } else {
1896            tokens[i].clone()
1897        };
1898        final_tokens.push(token);
1899        i += 1;
1900    }
1901
1902    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
1903}
1904
1905/// Tokenize source code, preserving comments.
1906///
1907/// Useful for pretty-printing or formatting tools that need to preserve comments.
1908pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1909    let lexer = Token::lexer(source);
1910    let mut tokens = Vec::new();
1911    let mut errors = Vec::new();
1912
1913    for (result, span) in lexer.spanned() {
1914        match result {
1915            Ok(token) => {
1916                tokens.push(Spanned::new(token, span));
1917            }
1918            Err(err) => {
1919                errors.push(Spanned::new(err, span));
1920            }
1921        }
1922    }
1923
1924    if errors.is_empty() {
1925        Ok(tokens)
1926    } else {
1927        Err(errors)
1928    }
1929}
1930
1931/// Extract the string content from a string token (removes quotes, processes escapes).
1932pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1933    // Remove surrounding quotes
1934    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1935        return Err(LexerError::UnterminatedString);
1936    }
1937
1938    let inner = &source[1..source.len() - 1];
1939    let mut result = String::with_capacity(inner.len());
1940    let mut chars = inner.chars().peekable();
1941
1942    while let Some(ch) = chars.next() {
1943        if ch == '\\' {
1944            match chars.next() {
1945                Some('n') => result.push('\n'),
1946                Some('t') => result.push('\t'),
1947                Some('r') => result.push('\r'),
1948                Some('\\') => result.push('\\'),
1949                Some('"') => result.push('"'),
1950                // Use a unique marker for escaped dollar that won't be re-interpreted
1951                // parse_interpolated_string will convert this back to $
1952                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1953                Some('u') => {
1954                    // Unicode escape: \uXXXX
1955                    let mut hex = String::with_capacity(4);
1956                    for _ in 0..4 {
1957                        match chars.next() {
1958                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1959                            _ => return Err(LexerError::InvalidEscape),
1960                        }
1961                    }
1962                    let codepoint = u32::from_str_radix(&hex, 16)
1963                        .map_err(|_| LexerError::InvalidEscape)?;
1964                    let ch = char::from_u32(codepoint)
1965                        .ok_or(LexerError::InvalidEscape)?;
1966                    result.push(ch);
1967                }
1968                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1969                Some(next) => {
1970                    result.push('\\');
1971                    result.push(next);
1972                }
1973                None => return Err(LexerError::InvalidEscape),
1974            }
1975        } else {
1976            result.push(ch);
1977        }
1978    }
1979
1980    Ok(result)
1981}
1982
1983/// Parse a variable reference, extracting the path segments.
1984/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1985pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1986    // Remove ${ and }
1987    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1988        return Err(LexerError::UnterminatedVarRef);
1989    }
1990
1991    let inner = &source[2..source.len() - 1];
1992
1993    // Special case: $? (last result)
1994    if inner == "?" {
1995        return Ok(vec!["?".to_string()]);
1996    }
1997
1998    let mut segments = Vec::new();
1999    let mut current = String::new();
2000    let mut chars = inner.chars().peekable();
2001
2002    while let Some(ch) = chars.next() {
2003        match ch {
2004            '.' => {
2005                if !current.is_empty() {
2006                    segments.push(current.clone());
2007                    current.clear();
2008                }
2009            }
2010            '[' => {
2011                if !current.is_empty() {
2012                    segments.push(current.clone());
2013                    current.clear();
2014                }
2015                // Collect the index
2016                let mut index = String::from("[");
2017                while let Some(&c) = chars.peek() {
2018                    if let Some(c) = chars.next() {
2019                        index.push(c);
2020                    }
2021                    if c == ']' {
2022                        break;
2023                    }
2024                }
2025                segments.push(index);
2026            }
2027            _ => {
2028                current.push(ch);
2029            }
2030        }
2031    }
2032
2033    if !current.is_empty() {
2034        segments.push(current);
2035    }
2036
2037    Ok(segments)
2038}
2039
2040/// Parse an integer literal.
2041pub fn parse_int(source: &str) -> Result<i64, LexerError> {
2042    source.parse().map_err(|_| LexerError::InvalidNumber)
2043}
2044
2045/// Parse a float literal.
2046pub fn parse_float(source: &str) -> Result<f64, LexerError> {
2047    source.parse().map_err(|_| LexerError::InvalidNumber)
2048}
2049
2050#[cfg(test)]
2051mod tests {
2052    use super::*;
2053
2054    fn lex(source: &str) -> Vec<Token> {
2055        tokenize(source)
2056            .expect("lexer should succeed")
2057            .into_iter()
2058            .map(|s| s.token)
2059            .collect()
2060    }
2061
2062    // ═══════════════════════════════════════════════════════════════════
2063    // Keyword tests
2064    // ═══════════════════════════════════════════════════════════════════
2065
2066    #[test]
2067    fn keywords() {
2068        assert_eq!(lex("set"), vec![Token::Set]);
2069        assert_eq!(lex("if"), vec![Token::If]);
2070        assert_eq!(lex("then"), vec![Token::Then]);
2071        assert_eq!(lex("else"), vec![Token::Else]);
2072        assert_eq!(lex("elif"), vec![Token::Elif]);
2073        assert_eq!(lex("fi"), vec![Token::Fi]);
2074        assert_eq!(lex("for"), vec![Token::For]);
2075        assert_eq!(lex("in"), vec![Token::In]);
2076        assert_eq!(lex("do"), vec![Token::Do]);
2077        assert_eq!(lex("done"), vec![Token::Done]);
2078        assert_eq!(lex("case"), vec![Token::Case]);
2079        assert_eq!(lex("esac"), vec![Token::Esac]);
2080        assert_eq!(lex("function"), vec![Token::Function]);
2081        assert_eq!(lex("true"), vec![Token::True]);
2082        assert_eq!(lex("false"), vec![Token::False]);
2083    }
2084
2085    #[test]
2086    fn double_semicolon() {
2087        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
2088        // In case pattern context
2089        assert_eq!(lex("echo \"hi\";;"), vec![
2090            Token::Ident("echo".to_string()),
2091            Token::String("hi".to_string()),
2092            Token::DoubleSemi,
2093        ]);
2094    }
2095
2096    #[test]
2097    fn type_keywords() {
2098        assert_eq!(lex("string"), vec![Token::TypeString]);
2099        assert_eq!(lex("int"), vec![Token::TypeInt]);
2100        assert_eq!(lex("float"), vec![Token::TypeFloat]);
2101        assert_eq!(lex("bool"), vec![Token::TypeBool]);
2102    }
2103
2104    // ═══════════════════════════════════════════════════════════════════
2105    // Operator tests
2106    // ═══════════════════════════════════════════════════════════════════
2107
2108    #[test]
2109    fn single_char_operators() {
2110        assert_eq!(lex("="), vec![Token::Eq]);
2111        assert_eq!(lex("|"), vec![Token::Pipe]);
2112        assert_eq!(lex("&"), vec![Token::Amp]);
2113        assert_eq!(lex(">"), vec![Token::Gt]);
2114        assert_eq!(lex("<"), vec![Token::Lt]);
2115        assert_eq!(lex(";"), vec![Token::Semi]);
2116        assert_eq!(lex(":"), vec![Token::Colon]);
2117        assert_eq!(lex(","), vec![Token::Comma]);
2118        assert_eq!(lex("."), vec![Token::Dot]);
2119    }
2120
2121    #[test]
2122    fn multi_char_operators() {
2123        assert_eq!(lex("&&"), vec![Token::And]);
2124        assert_eq!(lex("||"), vec![Token::Or]);
2125        assert_eq!(lex("=="), vec![Token::EqEq]);
2126        assert_eq!(lex("!="), vec![Token::NotEq]);
2127        assert_eq!(lex("=~"), vec![Token::Match]);
2128        assert_eq!(lex("!~"), vec![Token::NotMatch]);
2129        assert_eq!(lex(">="), vec![Token::GtEq]);
2130        assert_eq!(lex("<="), vec![Token::LtEq]);
2131        assert_eq!(lex(">>"), vec![Token::GtGt]);
2132        assert_eq!(lex("2>"), vec![Token::Stderr]);
2133        assert_eq!(lex("&>"), vec![Token::Both]);
2134    }
2135
2136    #[test]
2137    fn brackets() {
2138        assert_eq!(lex("{"), vec![Token::LBrace]);
2139        assert_eq!(lex("}"), vec![Token::RBrace]);
2140        assert_eq!(lex("["), vec![Token::LBracket]);
2141        assert_eq!(lex("]"), vec![Token::RBracket]);
2142        assert_eq!(lex("("), vec![Token::LParen]);
2143        assert_eq!(lex(")"), vec![Token::RParen]);
2144    }
2145
2146    // ═══════════════════════════════════════════════════════════════════
2147    // Literal tests
2148    // ═══════════════════════════════════════════════════════════════════
2149
2150    #[test]
2151    fn integers() {
2152        assert_eq!(lex("0"), vec![Token::Int(0)]);
2153        assert_eq!(lex("42"), vec![Token::Int(42)]);
2154        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
2155        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
2156    }
2157
2158    #[test]
2159    fn floats() {
2160        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
2161        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
2162        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
2163    }
2164
2165    #[test]
2166    fn strings() {
2167        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
2168        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
2169        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
2170        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
2171        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
2172    }
2173
2174    #[test]
2175    fn var_refs() {
2176        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
2177        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
2178        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
2179        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
2180    }
2181
2182    // ═══════════════════════════════════════════════════════════════════
2183    // Identifier tests
2184    // ═══════════════════════════════════════════════════════════════════
2185
2186    #[test]
2187    fn identifiers() {
2188        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
2189        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
2190        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2191        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2192        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2193    }
2194
2195    #[test]
2196    fn keyword_prefix_identifiers() {
2197        // Identifiers that start with keywords but aren't keywords
2198        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2199        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2200        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2201        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2202        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2203    }
2204
2205    // ═══════════════════════════════════════════════════════════════════
2206    // Statement tests
2207    // ═══════════════════════════════════════════════════════════════════
2208
2209    #[test]
2210    fn assignment() {
2211        assert_eq!(
2212            lex("set X = 5"),
2213            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2214        );
2215    }
2216
2217    #[test]
2218    fn command_simple() {
2219        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2220        assert_eq!(
2221            lex(r#"echo "hello""#),
2222            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2223        );
2224    }
2225
2226    #[test]
2227    fn command_with_args() {
2228        assert_eq!(
2229            lex("cmd arg1 arg2"),
2230            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2231        );
2232    }
2233
2234    #[test]
2235    fn command_with_named_args() {
2236        assert_eq!(
2237            lex("cmd key=value"),
2238            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2239        );
2240    }
2241
2242    #[test]
2243    fn pipeline() {
2244        assert_eq!(
2245            lex("a | b | c"),
2246            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2247        );
2248    }
2249
2250    #[test]
2251    fn if_statement() {
2252        assert_eq!(
2253            lex("if true; then echo; fi"),
2254            vec![
2255                Token::If,
2256                Token::True,
2257                Token::Semi,
2258                Token::Then,
2259                Token::Ident("echo".to_string()),
2260                Token::Semi,
2261                Token::Fi
2262            ]
2263        );
2264    }
2265
2266    #[test]
2267    fn for_loop() {
2268        assert_eq!(
2269            lex("for X in items; do echo; done"),
2270            vec![
2271                Token::For,
2272                Token::Ident("X".to_string()),
2273                Token::In,
2274                Token::Ident("items".to_string()),
2275                Token::Semi,
2276                Token::Do,
2277                Token::Ident("echo".to_string()),
2278                Token::Semi,
2279                Token::Done
2280            ]
2281        );
2282    }
2283
2284    // ═══════════════════════════════════════════════════════════════════
2285    // Whitespace and newlines
2286    // ═══════════════════════════════════════════════════════════════════
2287
2288    #[test]
2289    fn whitespace_ignored() {
2290        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
2291    }
2292
2293    #[test]
2294    fn newlines_preserved() {
2295        let tokens = lex("a\nb");
2296        assert_eq!(
2297            tokens,
2298            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2299        );
2300    }
2301
2302    #[test]
2303    fn multiple_newlines() {
2304        let tokens = lex("a\n\n\nb");
2305        assert_eq!(
2306            tokens,
2307            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2308        );
2309    }
2310
2311    // ═══════════════════════════════════════════════════════════════════
2312    // Comments
2313    // ═══════════════════════════════════════════════════════════════════
2314
2315    #[test]
2316    fn comments_skipped() {
2317        assert_eq!(lex("# comment"), vec![]);
2318        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2319        assert_eq!(
2320            lex("a # comment\nb"),
2321            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2322        );
2323    }
2324
2325    #[test]
2326    fn comments_preserved_when_requested() {
2327        let tokens = tokenize_with_comments("a # comment")
2328            .expect("should succeed")
2329            .into_iter()
2330            .map(|s| s.token)
2331            .collect::<Vec<_>>();
2332        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2333    }
2334
2335    // ═══════════════════════════════════════════════════════════════════
2336    // String parsing
2337    // ═══════════════════════════════════════════════════════════════════
2338
2339    #[test]
2340    fn parse_simple_string() {
2341        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2342    }
2343
2344    #[test]
2345    fn parse_string_with_escapes() {
2346        assert_eq!(
2347            parse_string_literal(r#""hello\nworld""#).expect("ok"),
2348            "hello\nworld"
2349        );
2350        assert_eq!(
2351            parse_string_literal(r#""tab\there""#).expect("ok"),
2352            "tab\there"
2353        );
2354        assert_eq!(
2355            parse_string_literal(r#""quote\"here""#).expect("ok"),
2356            "quote\"here"
2357        );
2358    }
2359
2360    #[test]
2361    fn parse_string_with_unicode() {
2362        assert_eq!(
2363            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2364            "emoji ❤"
2365        );
2366    }
2367
2368    #[test]
2369    fn parse_string_with_escaped_dollar() {
2370        // \$ produces a marker that parse_interpolated_string will convert to $
2371        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
2372        assert_eq!(
2373            parse_string_literal(r#""\$VAR""#).expect("ok"),
2374            "__KAISH_ESCAPED_DOLLAR__VAR"
2375        );
2376        assert_eq!(
2377            parse_string_literal(r#""cost: \$100""#).expect("ok"),
2378            "cost: __KAISH_ESCAPED_DOLLAR__100"
2379        );
2380    }
2381
2382    // ═══════════════════════════════════════════════════════════════════
2383    // Variable reference parsing
2384    // ═══════════════════════════════════════════════════════════════════
2385
2386    #[test]
2387    fn parse_simple_var() {
2388        assert_eq!(
2389            parse_var_ref("${X}").expect("ok"),
2390            vec!["X"]
2391        );
2392    }
2393
2394    #[test]
2395    fn parse_var_with_field() {
2396        assert_eq!(
2397            parse_var_ref("${VAR.field}").expect("ok"),
2398            vec!["VAR", "field"]
2399        );
2400    }
2401
2402    #[test]
2403    fn parse_var_with_index() {
2404        assert_eq!(
2405            parse_var_ref("${VAR[0]}").expect("ok"),
2406            vec!["VAR", "[0]"]
2407        );
2408    }
2409
2410    #[test]
2411    fn parse_var_nested() {
2412        assert_eq!(
2413            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2414            vec!["VAR", "field", "[0]", "nested"]
2415        );
2416    }
2417
2418    #[test]
2419    fn parse_last_result() {
2420        assert_eq!(
2421            parse_var_ref("${?}").expect("ok"),
2422            vec!["?"]
2423        );
2424    }
2425
2426    // ═══════════════════════════════════════════════════════════════════
2427    // Number parsing
2428    // ═══════════════════════════════════════════════════════════════════
2429
2430    #[test]
2431    fn parse_integers() {
2432        assert_eq!(parse_int("0").expect("ok"), 0);
2433        assert_eq!(parse_int("42").expect("ok"), 42);
2434        assert_eq!(parse_int("-1").expect("ok"), -1);
2435    }
2436
2437    #[test]
2438    fn parse_floats() {
2439        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2440        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2441    }
2442
2443    // ═══════════════════════════════════════════════════════════════════
2444    // Edge cases and errors
2445    // ═══════════════════════════════════════════════════════════════════
2446
2447    #[test]
2448    fn empty_input() {
2449        assert_eq!(lex(""), vec![]);
2450    }
2451
2452    #[test]
2453    fn only_whitespace() {
2454        assert_eq!(lex("   \t\t   "), vec![]);
2455    }
2456
2457    #[test]
2458    fn json_array() {
2459        assert_eq!(
2460            lex(r#"[1, 2, 3]"#),
2461            vec![
2462                Token::LBracket,
2463                Token::Int(1),
2464                Token::Comma,
2465                Token::Int(2),
2466                Token::Comma,
2467                Token::Int(3),
2468                Token::RBracket
2469            ]
2470        );
2471    }
2472
2473    #[test]
2474    fn json_object() {
2475        assert_eq!(
2476            lex(r#"{"key": "value"}"#),
2477            vec![
2478                Token::LBrace,
2479                Token::String("key".to_string()),
2480                Token::Colon,
2481                Token::String("value".to_string()),
2482                Token::RBrace
2483            ]
2484        );
2485    }
2486
2487    #[test]
2488    fn redirect_operators() {
2489        assert_eq!(
2490            lex("cmd > file"),
2491            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2492        );
2493        assert_eq!(
2494            lex("cmd >> file"),
2495            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2496        );
2497        assert_eq!(
2498            lex("cmd 2> err"),
2499            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2500        );
2501        assert_eq!(
2502            lex("cmd &> all"),
2503            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2504        );
2505    }
2506
2507    #[test]
2508    fn background_job() {
2509        assert_eq!(
2510            lex("cmd &"),
2511            vec![Token::Ident("cmd".to_string()), Token::Amp]
2512        );
2513    }
2514
2515    #[test]
2516    fn command_substitution() {
2517        assert_eq!(
2518            lex("$(cmd)"),
2519            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2520        );
2521        assert_eq!(
2522            lex("$(cmd arg)"),
2523            vec![
2524                Token::CmdSubstStart,
2525                Token::Ident("cmd".to_string()),
2526                Token::Ident("arg".to_string()),
2527                Token::RParen
2528            ]
2529        );
2530        assert_eq!(
2531            lex("$(a | b)"),
2532            vec![
2533                Token::CmdSubstStart,
2534                Token::Ident("a".to_string()),
2535                Token::Pipe,
2536                Token::Ident("b".to_string()),
2537                Token::RParen
2538            ]
2539        );
2540    }
2541
2542    #[test]
2543    fn complex_pipeline() {
2544        assert_eq!(
2545            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2546            vec![
2547                Token::Ident("cat".to_string()),
2548                Token::Ident("file".to_string()),
2549                Token::Pipe,
2550                Token::Ident("grep".to_string()),
2551                Token::Ident("pattern".to_string()),
2552                Token::Eq,
2553                Token::String("foo".to_string()),
2554                Token::Pipe,
2555                Token::Ident("head".to_string()),
2556                Token::Ident("count".to_string()),
2557                Token::Eq,
2558                Token::Int(10),
2559            ]
2560        );
2561    }
2562
2563    // ═══════════════════════════════════════════════════════════════════
2564    // Flag tests
2565    // ═══════════════════════════════════════════════════════════════════
2566
2567    #[test]
2568    fn short_flag() {
2569        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2570        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2571        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2572    }
2573
2574    #[test]
2575    fn short_flag_combined() {
2576        // Combined short flags like -la
2577        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2578        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2579    }
2580
2581    #[test]
2582    fn long_flag() {
2583        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2584        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2585        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2586    }
2587
2588    #[test]
2589    fn double_dash() {
2590        // -- alone marks end of flags
2591        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2592    }
2593
2594    #[test]
2595    fn flags_vs_negative_numbers() {
2596        // -123 should be a negative integer, not a flag
2597        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2598        // -l should be a flag
2599        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2600        // -1a is ambiguous - should be Int(-1) then Ident(a)
2601        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2602        assert_eq!(
2603            lex("-1 a"),
2604            vec![Token::Int(-1), Token::Ident("a".to_string())]
2605        );
2606    }
2607
2608    #[test]
2609    fn command_with_flags() {
2610        assert_eq!(
2611            lex("ls -l"),
2612            vec![
2613                Token::Ident("ls".to_string()),
2614                Token::ShortFlag("l".to_string()),
2615            ]
2616        );
2617        assert_eq!(
2618            lex("git commit -m"),
2619            vec![
2620                Token::Ident("git".to_string()),
2621                Token::Ident("commit".to_string()),
2622                Token::ShortFlag("m".to_string()),
2623            ]
2624        );
2625        assert_eq!(
2626            lex("git push --force"),
2627            vec![
2628                Token::Ident("git".to_string()),
2629                Token::Ident("push".to_string()),
2630                Token::LongFlag("force".to_string()),
2631            ]
2632        );
2633    }
2634
2635    #[test]
2636    fn flag_with_value() {
2637        assert_eq!(
2638            lex(r#"git commit -m "message""#),
2639            vec![
2640                Token::Ident("git".to_string()),
2641                Token::Ident("commit".to_string()),
2642                Token::ShortFlag("m".to_string()),
2643                Token::String("message".to_string()),
2644            ]
2645        );
2646        assert_eq!(
2647            lex(r#"--message="hello""#),
2648            vec![
2649                Token::LongFlag("message".to_string()),
2650                Token::Eq,
2651                Token::String("hello".to_string()),
2652            ]
2653        );
2654    }
2655
2656    #[test]
2657    fn end_of_flags_marker() {
2658        assert_eq!(
2659            lex("git checkout -- file"),
2660            vec![
2661                Token::Ident("git".to_string()),
2662                Token::Ident("checkout".to_string()),
2663                Token::DoubleDash,
2664                Token::Ident("file".to_string()),
2665            ]
2666        );
2667    }
2668
2669    // ═══════════════════════════════════════════════════════════════════
2670    // Bash compatibility tokens
2671    // ═══════════════════════════════════════════════════════════════════
2672
2673    #[test]
2674    fn local_keyword() {
2675        assert_eq!(lex("local"), vec![Token::Local]);
2676        assert_eq!(
2677            lex("local X = 5"),
2678            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2679        );
2680    }
2681
2682    #[test]
2683    fn simple_var_ref() {
2684        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2685        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2686        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2687        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2688    }
2689
2690    #[test]
2691    fn simple_var_ref_in_command() {
2692        assert_eq!(
2693            lex("echo $NAME"),
2694            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2695        );
2696    }
2697
2698    #[test]
2699    fn single_quoted_strings() {
2700        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2701        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2702        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2703        // Single quotes don't process escapes or variables
2704        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2705        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2706    }
2707
2708    #[test]
2709    fn test_brackets() {
2710        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2711        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2712        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2713        assert_eq!(
2714            lex("[[ -f file ]]"),
2715            vec![
2716                Token::LBracket,
2717                Token::LBracket,
2718                Token::ShortFlag("f".to_string()),
2719                Token::Ident("file".to_string()),
2720                Token::RBracket,
2721                Token::RBracket
2722            ]
2723        );
2724    }
2725
2726    #[test]
2727    fn test_expression_syntax() {
2728        assert_eq!(
2729            lex(r#"[[ $X == "value" ]]"#),
2730            vec![
2731                Token::LBracket,
2732                Token::LBracket,
2733                Token::SimpleVarRef("X".to_string()),
2734                Token::EqEq,
2735                Token::String("value".to_string()),
2736                Token::RBracket,
2737                Token::RBracket
2738            ]
2739        );
2740    }
2741
2742    #[test]
2743    fn bash_style_assignment() {
2744        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2745        assert_eq!(
2746            lex(r#"NAME="value""#),
2747            vec![
2748                Token::Ident("NAME".to_string()),
2749                Token::Eq,
2750                Token::String("value".to_string())
2751            ]
2752        );
2753    }
2754
2755    #[test]
2756    fn positional_params() {
2757        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2758        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2759        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2760        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2761        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2762    }
2763
2764    #[test]
2765    fn positional_in_context() {
2766        assert_eq!(
2767            lex("echo $1 $2"),
2768            vec![
2769                Token::Ident("echo".to_string()),
2770                Token::Positional(1),
2771                Token::Positional(2),
2772            ]
2773        );
2774    }
2775
2776    #[test]
2777    fn var_length() {
2778        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2779        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2780        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2781    }
2782
2783    #[test]
2784    fn var_length_in_context() {
2785        assert_eq!(
2786            lex("echo ${#NAME}"),
2787            vec![
2788                Token::Ident("echo".to_string()),
2789                Token::VarLength("NAME".to_string()),
2790            ]
2791        );
2792    }
2793
2794    // ═══════════════════════════════════════════════════════════════════
2795    // Edge case tests: Flag ambiguities
2796    // ═══════════════════════════════════════════════════════════════════
2797
2798    #[test]
2799    fn plus_flag() {
2800        // Plus flags for set +e
2801        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2802        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2803        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2804    }
2805
2806    #[test]
2807    fn set_with_plus_flag() {
2808        assert_eq!(
2809            lex("set +e"),
2810            vec![
2811                Token::Set,
2812                Token::PlusFlag("e".to_string()),
2813            ]
2814        );
2815    }
2816
2817    #[test]
2818    fn set_with_multiple_flags() {
2819        assert_eq!(
2820            lex("set -e -u"),
2821            vec![
2822                Token::Set,
2823                Token::ShortFlag("e".to_string()),
2824                Token::ShortFlag("u".to_string()),
2825            ]
2826        );
2827    }
2828
2829    #[test]
2830    fn flags_vs_negative_numbers_edge_cases() {
2831        // -1a should be negative int followed by ident
2832        assert_eq!(
2833            lex("-1 a"),
2834            vec![Token::Int(-1), Token::Ident("a".to_string())]
2835        );
2836        // -l is a flag
2837        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2838        // -123 is negative number
2839        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2840    }
2841
2842    #[test]
2843    fn single_dash_is_minus_alone() {
2844        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2845        let result = tokenize("-").expect("should lex");
2846        assert_eq!(result.len(), 1);
2847        assert!(matches!(result[0].token, Token::MinusAlone));
2848    }
2849
2850    #[test]
2851    fn plus_bare_for_date_format() {
2852        // `date +%s` - the +%s should be PlusBare
2853        let result = tokenize("+%s").expect("should lex");
2854        assert_eq!(result.len(), 1);
2855        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2856
2857        // `date +%Y-%m-%d` - format string with dashes
2858        let result = tokenize("+%Y-%m-%d").expect("should lex");
2859        assert_eq!(result.len(), 1);
2860        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2861    }
2862
2863    #[test]
2864    fn plus_flag_still_works() {
2865        // `set +e` - should still be PlusFlag
2866        let result = tokenize("+e").expect("should lex");
2867        assert_eq!(result.len(), 1);
2868        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2869    }
2870
2871    #[test]
2872    fn while_keyword_vs_while_loop() {
2873        // 'while' as keyword in loop context
2874        assert_eq!(lex("while"), vec![Token::While]);
2875        // 'while' at start followed by condition
2876        assert_eq!(
2877            lex("while true"),
2878            vec![Token::While, Token::True]
2879        );
2880    }
2881
2882    #[test]
2883    fn control_flow_keywords() {
2884        assert_eq!(lex("break"), vec![Token::Break]);
2885        assert_eq!(lex("continue"), vec![Token::Continue]);
2886        assert_eq!(lex("return"), vec![Token::Return]);
2887        assert_eq!(lex("exit"), vec![Token::Exit]);
2888    }
2889
2890    #[test]
2891    fn control_flow_with_numbers() {
2892        assert_eq!(
2893            lex("break 2"),
2894            vec![Token::Break, Token::Int(2)]
2895        );
2896        assert_eq!(
2897            lex("continue 3"),
2898            vec![Token::Continue, Token::Int(3)]
2899        );
2900        assert_eq!(
2901            lex("exit 1"),
2902            vec![Token::Exit, Token::Int(1)]
2903        );
2904    }
2905
2906    // ═══════════════════════════════════════════════════════════════════
2907    // Here-doc tests
2908    // ═══════════════════════════════════════════════════════════════════
2909
2910    #[test]
2911    fn heredoc_simple() {
2912        let source = "cat <<EOF\nhello\nworld\nEOF";
2913        let tokens = lex(source);
2914        // body_start_offset = byte offset of 'h' in "hello", i.e. just after "cat <<EOF\n"
2915        assert_eq!(tokens, vec![
2916            Token::Ident("cat".to_string()),
2917            Token::HereDocStart,
2918            Token::HereDoc(HereDocData {
2919                content: "hello\nworld\n".to_string(),
2920                literal: false,
2921                strip_tabs: false,
2922                body_start_offset: 10,
2923            }),
2924            Token::Newline,
2925        ]);
2926    }
2927
2928    #[test]
2929    fn heredoc_empty() {
2930        let source = "cat <<EOF\nEOF";
2931        let tokens = lex(source);
2932        assert_eq!(tokens, vec![
2933            Token::Ident("cat".to_string()),
2934            Token::HereDocStart,
2935            Token::HereDoc(HereDocData {
2936                content: "".to_string(),
2937                literal: false,
2938                strip_tabs: false,
2939                body_start_offset: 10,
2940            }),
2941            Token::Newline,
2942        ]);
2943    }
2944
2945    #[test]
2946    fn heredoc_with_special_chars() {
2947        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2948        let tokens = lex(source);
2949        assert_eq!(tokens, vec![
2950            Token::Ident("cat".to_string()),
2951            Token::HereDocStart,
2952            Token::HereDoc(HereDocData {
2953                content: "$VAR and \"quoted\" 'single'\n".to_string(),
2954                literal: false,
2955                strip_tabs: false,
2956                body_start_offset: 10,
2957            }),
2958            Token::Newline,
2959        ]);
2960    }
2961
2962    #[test]
2963    fn heredoc_multiline() {
2964        let source = "cat <<END\nline1\nline2\nline3\nEND";
2965        let tokens = lex(source);
2966        assert_eq!(tokens, vec![
2967            Token::Ident("cat".to_string()),
2968            Token::HereDocStart,
2969            Token::HereDoc(HereDocData {
2970                content: "line1\nline2\nline3\n".to_string(),
2971                literal: false,
2972                strip_tabs: false,
2973                body_start_offset: 10,
2974            }),
2975            Token::Newline,
2976        ]);
2977    }
2978
2979    #[test]
2980    fn heredoc_in_command() {
2981        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2982        let tokens = lex(source);
2983        assert_eq!(tokens, vec![
2984            Token::Ident("cat".to_string()),
2985            Token::HereDocStart,
2986            Token::HereDoc(HereDocData {
2987                content: "hello\n".to_string(),
2988                literal: false,
2989                strip_tabs: false,
2990                body_start_offset: 10,
2991            }),
2992            Token::Newline,
2993            Token::Ident("echo".to_string()),
2994            Token::Ident("goodbye".to_string()),
2995        ]);
2996    }
2997
2998    #[test]
2999    fn heredoc_strip_tabs() {
3000        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
3001        let tokens = lex(source);
3002        // Content keeps tabs verbatim — strip_tabs is recorded on the token so
3003        // the interpreter can apply POSIX leading-tab stripping at materialization
3004        // without disturbing source byte offsets used for span tracking.
3005        assert_eq!(tokens, vec![
3006            Token::Ident("cat".to_string()),
3007            Token::HereDocStart,
3008            Token::HereDoc(HereDocData {
3009                content: "\thello\n\tworld\n".to_string(),
3010                literal: false,
3011                strip_tabs: true,
3012                body_start_offset: 11,
3013            }),
3014            Token::Newline,
3015        ]);
3016    }
3017
3018    // ═══════════════════════════════════════════════════════════════════
3019    // Arithmetic expression tests
3020    // ═══════════════════════════════════════════════════════════════════
3021
3022    #[test]
3023    fn arithmetic_simple() {
3024        let source = "$((1 + 2))";
3025        let tokens = lex(source);
3026        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
3027    }
3028
3029    #[test]
3030    fn arithmetic_in_assignment() {
3031        let source = "X=$((5 * 3))";
3032        let tokens = lex(source);
3033        assert_eq!(tokens, vec![
3034            Token::Ident("X".to_string()),
3035            Token::Eq,
3036            Token::Arithmetic("5 * 3".to_string()),
3037        ]);
3038    }
3039
3040    #[test]
3041    fn arithmetic_with_nested_parens() {
3042        let source = "$((2 * (3 + 4)))";
3043        let tokens = lex(source);
3044        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
3045    }
3046
3047    #[test]
3048    fn arithmetic_with_variable() {
3049        let source = "$((X + 1))";
3050        let tokens = lex(source);
3051        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
3052    }
3053
3054    #[test]
3055    fn arithmetic_command_subst_not_confused() {
3056        // $( should not be treated as arithmetic
3057        let source = "$(echo hello)";
3058        let tokens = lex(source);
3059        assert_eq!(tokens, vec![
3060            Token::CmdSubstStart,
3061            Token::Ident("echo".to_string()),
3062            Token::Ident("hello".to_string()),
3063            Token::RParen,
3064        ]);
3065    }
3066
3067    #[test]
3068    fn arithmetic_nesting_limit() {
3069        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
3070        let open_parens = "(".repeat(300);
3071        let close_parens = ")".repeat(300);
3072        let source = format!("$(({}1{}))", open_parens, close_parens);
3073        let result = tokenize(&source);
3074        assert!(result.is_err());
3075        let errors = result.unwrap_err();
3076        assert_eq!(errors.len(), 1);
3077        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
3078    }
3079
3080    #[test]
3081    fn arithmetic_nesting_within_limit() {
3082        // Nesting within limit should work
3083        let source = "$((((1 + 2) * 3)))";
3084        let tokens = lex(source);
3085        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
3086    }
3087
3088    // ═══════════════════════════════════════════════════════════════════
3089    // Token category tests
3090    // ═══════════════════════════════════════════════════════════════════
3091
3092    #[test]
3093    fn token_categories() {
3094        // Keywords
3095        assert_eq!(Token::If.category(), TokenCategory::Keyword);
3096        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
3097        assert_eq!(Token::For.category(), TokenCategory::Keyword);
3098        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
3099        assert_eq!(Token::True.category(), TokenCategory::Keyword);
3100        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
3101
3102        // Operators
3103        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
3104        assert_eq!(Token::And.category(), TokenCategory::Operator);
3105        assert_eq!(Token::Or.category(), TokenCategory::Operator);
3106        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
3107        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
3108
3109        // Strings
3110        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
3111        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
3112        assert_eq!(
3113            Token::HereDoc(HereDocData {
3114                content: "test".to_string(),
3115                literal: false,
3116                strip_tabs: false,
3117                body_start_offset: 0,
3118            }).category(),
3119            TokenCategory::String,
3120        );
3121
3122        // Numbers
3123        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
3124        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
3125        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
3126
3127        // Variables
3128        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
3129        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
3130        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
3131        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
3132        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
3133        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
3134        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
3135
3136        // Flags
3137        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
3138        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
3139        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
3140        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
3141
3142        // Punctuation
3143        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
3144        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
3145        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
3146        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
3147
3148        // Comments
3149        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
3150
3151        // Paths
3152        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
3153
3154        // Commands
3155        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
3156        assert_eq!(Token::NumberIdent("019dda1c".to_string()).category(), TokenCategory::Command);
3157        assert_eq!(Token::DottedIdent(".gitignore".to_string()).category(), TokenCategory::Command);
3158
3159        // Errors
3160        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
3161        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
3162    }
3163
3164    #[test]
3165    fn test_heredoc_piped_to_command() {
3166        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
3167        // Not: cat | jq <<heredoc
3168        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
3169        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
3170        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
3171        assert!(heredoc_pos.is_some(), "should have a heredoc token");
3172        assert!(pipe_pos.is_some(), "should have a pipe token");
3173        assert!(
3174            pipe_pos.unwrap() > heredoc_pos.unwrap(),
3175            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
3176            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
3177        );
3178    }
3179
3180    #[test]
3181    fn test_heredoc_standalone_still_works() {
3182        // Regression: standalone heredoc (no pipe) must still work
3183        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
3184        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
3185        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
3186    }
3187
3188    #[test]
3189    fn test_heredoc_preserves_leading_empty_lines() {
3190        // Bug B: heredoc starting with a blank line must preserve it
3191        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
3192        let heredoc = tokens.iter().find_map(|t| {
3193            if let Token::HereDoc(data) = &t.token {
3194                Some(data.clone())
3195            } else {
3196                None
3197            }
3198        });
3199        assert!(heredoc.is_some(), "should have a heredoc token");
3200        let data = heredoc.unwrap();
3201        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
3202        assert_eq!(data.content, "\nhello\n");
3203    }
3204
3205    #[test]
3206    fn test_heredoc_quoted_delimiter_sets_literal() {
3207        // Bug N: quoted delimiter (<<'EOF') should set literal=true
3208        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
3209        let heredoc = tokens.iter().find_map(|t| {
3210            if let Token::HereDoc(data) = &t.token {
3211                Some(data.clone())
3212            } else {
3213                None
3214            }
3215        });
3216        assert!(heredoc.is_some(), "should have a heredoc token");
3217        let data = heredoc.unwrap();
3218        assert!(data.literal, "quoted delimiter should set literal=true");
3219        assert_eq!(data.content, "hello $HOME\n");
3220    }
3221
3222    #[test]
3223    fn test_heredoc_unquoted_delimiter_not_literal() {
3224        // Bug N: unquoted delimiter (<<EOF) should have literal=false
3225        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
3226        let heredoc = tokens.iter().find_map(|t| {
3227            if let Token::HereDoc(data) = &t.token {
3228                Some(data.clone())
3229            } else {
3230                None
3231            }
3232        });
3233        assert!(heredoc.is_some(), "should have a heredoc token");
3234        let data = heredoc.unwrap();
3235        assert!(!data.literal, "unquoted delimiter should have literal=false");
3236    }
3237
3238    // ═══════════════════════════════════════════════════════════════════
3239    // Colon merge tests
3240    // ═══════════════════════════════════════════════════════════════════
3241
3242    #[test]
3243    fn colon_double_in_word() {
3244        assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
3245    }
3246
3247    #[test]
3248    fn colon_single_in_word() {
3249        assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
3250    }
3251
3252    #[test]
3253    fn colon_with_port() {
3254        assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
3255    }
3256
3257    #[test]
3258    fn colon_standalone() {
3259        assert_eq!(lex(":"), vec![Token::Colon]);
3260    }
3261
3262    #[test]
3263    fn colon_spaced_no_merge() {
3264        assert_eq!(
3265            lex("foo : bar"),
3266            vec![
3267                Token::Ident("foo".into()),
3268                Token::Colon,
3269                Token::Ident("bar".into()),
3270            ]
3271        );
3272    }
3273
3274    #[test]
3275    fn colon_in_command_arg() {
3276        assert_eq!(
3277            lex("echo foo::bar"),
3278            vec![
3279                Token::Ident("echo".into()),
3280                Token::Ident("foo::bar".into()),
3281            ]
3282        );
3283    }
3284
3285    #[test]
3286    fn colon_trailing() {
3287        // Trailing colon merges with preceding ident
3288        assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
3289    }
3290
3291    #[test]
3292    fn colon_leading() {
3293        // Leading colon merges with following ident
3294        assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
3295    }
3296
3297    #[test]
3298    fn colon_with_path() {
3299        // Path token + colon + int
3300        assert_eq!(
3301            lex("/usr/bin:8080"),
3302            vec![Token::Ident("/usr/bin:8080".into())]
3303        );
3304    }
3305
3306    // ═══════════════════════════════════════════════════════════════════
3307    // Token predicate coverage (is_keyword / starts_statement)
3308    // ═══════════════════════════════════════════════════════════════════
3309
3310    #[test]
3311    fn is_keyword_covers_control_flow() {
3312        for t in [
3313            Token::While,
3314            Token::Return,
3315            Token::Break,
3316            Token::Continue,
3317            Token::Exit,
3318        ] {
3319            assert!(t.is_keyword(), "{t:?} should be a keyword");
3320        }
3321    }
3322
3323    #[test]
3324    fn starts_statement_covers_while() {
3325        assert!(Token::While.starts_statement());
3326    }
3327
3328    #[test]
3329    fn is_keyword_rejects_operators() {
3330        for t in [Token::Pipe, Token::Amp, Token::Eq, Token::LBrace] {
3331            assert!(!t.is_keyword(), "{t:?} should not be a keyword");
3332        }
3333    }
3334}
kaish_kernel/lexer.rs

kaish_kernel/
lexer.rs