kaish_kernel/
lexer.rs

1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
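//! # Token Categories
//!
//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`,
//!   `do`, `done`, `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`
//! - **Type keywords** (for tool parameters): `string`, `int`, `float`, `bool`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `==`, `!=`, `=~`, `!~`, `<`, `>`, `<=`, `>=`, `|`, `&`, `&&`, `||`
//! - **Redirections**: `>`, `>>`, `<`, `2>`, `&>`, `2>&1`, `1>&2`, `>&2`, `<<`, `<<<`
//! - **Punctuation**: `;`, `;;`, `:`, `,`, `.`, `..`, `{`, `}`, `[`, `]`, `(`, `)`, `*`, `!`, `?`
//! - **Variable references**: `${...}` with nested path access, `$NAME`, `$0`-`$9`,
//!   `$@`, `$#`, `$?`, `$$`, `${#VAR}`
//! - **Flags and paths**: `--long`, `-s`, `+x`, `/abs/path`, `~/path`, `./path`, `../path`
//! - **Identifiers**: command names, variable names, parameter names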
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
///
/// `unique_marker_id` increments it once per generated ID, so two IDs created
/// within the same process never share a counter component.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);
23
/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
///
/// The value is also quoted in the `LexerError::NestingTooDeep` message.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// `correct_span` consumes a slice of these records and shifts each span
/// boundary by `original_len - marker_len` for every replacement that starts
/// before that boundary.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    /// NOTE(review): presumably a byte offset, like the spans it is compared against — confirm.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    #[cfg(target_os = "wasi")]
82    let pid = 0u32;
83    #[cfg(not(target_os = "wasi"))]
84    let pid = std::process::id();
85    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
86}
87
88/// A token with its span in the source text.
89#[derive(Debug, Clone, PartialEq)]
90pub struct Spanned<T> {
91    pub token: T,
92    pub span: Span,
93}
94
95impl<T> Spanned<T> {
96    pub fn new(token: T, span: Span) -> Self {
97        Self { token, span }
98    }
99}
100
/// Lexer error types.
///
/// Used as the logos error type for `Token` (`#[logos(error = LexerError)]`).
/// The `Display` impl provides the user-facing message for each variant.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Generic "unexpected character" error; this is the `Default` value.
    /// NOTE(review): presumably what logos reports for input matching no pattern — confirm.
    #[default]
    UnexpectedCharacter,
    /// A string literal was not terminated.
    UnterminatedString,
    /// A variable reference was not terminated.
    UnterminatedVarRef,
    /// A string contained an invalid escape sequence.
    InvalidEscape,
    /// Numeric text failed to parse; returned by `lex_int` and `lex_float`.
    InvalidNumber,
    /// A case variant of `true`/`false` (e.g. `TRUE`, `False`) was used as a
    /// bare word; carries the offending text. Returned by `lex_ident`.
    AmbiguousBoolean(String),
    /// `yes`/`no` in any letter case was used as a bare word; carries the
    /// offending text. Returned by `lex_ident`.
    AmbiguousBooleanLike(String),
    /// Float without a leading digit (like `.5`).
    InvalidFloatNoLeading,
    /// Float without a trailing digit (like `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
    /// Heredoc body ended without seeing the closing delimiter on its own line.
    /// The user almost certainly meant to type the delimiter — silently using
    /// whatever was collected up to EOF would mask missing data.
    UnterminatedHeredoc { delimiter: String },
    /// Backtick command substitution. Kaish drops backticks intentionally —
    /// they're listed in `docs/LANGUAGE.md` and the help system as not supported.
    /// We surface this as a dedicated error (rather than `UnexpectedCharacter`)
    /// so the message can point users at the `$(cmd)` replacement.
    BackticksNotSupported,
}
126
127impl fmt::Display for LexerError {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        match self {
130            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
131            LexerError::UnterminatedString => write!(f, "unterminated string"),
132            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
133            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
134            LexerError::InvalidNumber => write!(f, "invalid number"),
135            LexerError::AmbiguousBoolean(s) => {
136                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
137            }
138            LexerError::AmbiguousBooleanLike(s) => {
139                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
140                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
141            }
142            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
143            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
144            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
145            LexerError::UnterminatedHeredoc { delimiter } => {
146                write!(f, "unterminated heredoc, expected closing delimiter `{}` on its own line", delimiter)
147            }
148            LexerError::BackticksNotSupported => {
149                write!(f, "backticks are not supported in kaish; use $(cmd) instead")
150            }
151        }
152    }
153}
154
// NOTE: the paragraphs below describe the `Token` enum declared further down,
// not `HereDocData`. They are kept as plain comments so rustdoc does not
// attach them to this struct.
//
// Tokens produced by the kaish lexer.
//
// The order of variants matters for logos priority. More specific patterns
// (like keywords) should come before more general ones (like identifiers).
//
// Tokens that carry semantic values (strings, numbers, identifiers) include
// the parsed value directly. This ensures the parser has access to actual
// data, not just token types.

/// Here-doc content data.
///
/// - `literal` is true when the delimiter was quoted (`<<'EOF'` or `<<"EOF"`),
///   meaning no variable expansion should occur.
/// - `strip_tabs` is true for the `<<-EOF` form. Per POSIX, leading tabs on
///   each body line are stripped at materialization time. Stripping happens
///   downstream of the parser so byte offsets in `content` stay aligned with
///   their original-source positions for span-tracking purposes.
/// - `body_start_offset` is the byte offset of the first character of `content`
///   in the source string fed into the lexer's `tokenize`. This lets the parser
///   compute absolute spans for parts found inside the body during interpolation.
///   In sources without arithmetic preprocessing rewrites, this equals the
///   original-source offset; with arithmetic before the heredoc, line numbers
///   may shift slightly until full preprocessing-layer composition lands.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// The here-doc body text (the payload carried by `Token::HereDoc`).
    pub content: String,
    /// True when the delimiter was quoted, so no variable expansion should occur.
    pub literal: bool,
    /// True for the `<<-EOF` form; leading tabs are stripped later, not here.
    pub strip_tabs: bool,
    /// Byte offset of the first character of `content` in the source given to `tokenize`.
    pub body_start_offset: usize,
}
184
/// Tokens produced by the kaish lexer.
///
/// Variants that carry a payload (strings, numbers, identifiers, paths, flags)
/// hold the value computed by their `lex_*` callback, so the parser works with
/// parsed data rather than raw slices. Runs of spaces and tabs between tokens
/// are skipped; newlines are significant and surface as `Token::Newline`.
///
/// Three variants have no logos pattern and are synthesized outside the
/// derive: `GlobWord`, `Arithmetic` and `HereDoc`.
///
/// NOTE(review): several section comments below say a group "must come
/// before" another. logos normally disambiguates by match length and by
/// pattern priority (explicit `priority = N` or computed from the pattern),
/// not by declaration order — confirm against the logos version in use.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ═══════════════════════════════════════════════════════════════════
    // Keywords (must come before Ident for priority)
    // ═══════════════════════════════════════════════════════════════════
    /// `set` keyword.
    #[token("set")]
    Set,

    /// `local` keyword.
    #[token("local")]
    Local,

    /// `if` keyword.
    #[token("if")]
    If,

    /// `then` keyword.
    #[token("then")]
    Then,

    /// `else` keyword.
    #[token("else")]
    Else,

    /// `elif` keyword.
    #[token("elif")]
    Elif,

    /// `fi` keyword.
    #[token("fi")]
    Fi,

    /// `for` keyword.
    #[token("for")]
    For,

    /// `while` keyword.
    #[token("while")]
    While,

    /// `in` keyword.
    #[token("in")]
    In,

    /// `do` keyword.
    #[token("do")]
    Do,

    /// `done` keyword.
    #[token("done")]
    Done,

    /// `case` keyword.
    #[token("case")]
    Case,

    /// `esac` keyword.
    #[token("esac")]
    Esac,

    /// `function` keyword.
    #[token("function")]
    Function,

    /// `break` keyword.
    #[token("break")]
    Break,

    /// `continue` keyword.
    #[token("continue")]
    Continue,

    /// `return` keyword.
    #[token("return")]
    Return,

    /// `exit` keyword.
    #[token("exit")]
    Exit,

    /// Boolean literal `true` (lowercase only; other casings are rejected by `lex_ident`).
    #[token("true")]
    True,

    /// Boolean literal `false` (lowercase only; other casings are rejected by `lex_ident`).
    #[token("false")]
    False,

    // ═══════════════════════════════════════════════════════════════════
    // Type keywords (for tool parameters)
    // ═══════════════════════════════════════════════════════════════════
    /// Type keyword `string`.
    #[token("string")]
    TypeString,

    /// Type keyword `int`.
    #[token("int")]
    TypeInt,

    /// Type keyword `float`.
    #[token("float")]
    TypeFloat,

    /// Type keyword `bool`.
    #[token("bool")]
    TypeBool,

    // ═══════════════════════════════════════════════════════════════════
    // Multi-character operators (must come before single-char versions)
    // ═══════════════════════════════════════════════════════════════════
    /// `&&`
    #[token("&&")]
    And,

    /// `||`
    #[token("||")]
    Or,

    /// `==`
    #[token("==")]
    EqEq,

    /// `!=`
    #[token("!=")]
    NotEq,

    /// `=~`
    #[token("=~")]
    Match,

    /// `!~`
    #[token("!~")]
    NotMatch,

    /// `>=`
    #[token(">=")]
    GtEq,

    /// `<=`
    #[token("<=")]
    LtEq,

    /// `>>`
    #[token(">>")]
    GtGt,

    /// `2>&1`
    #[token("2>&1")]
    StderrToStdout,

    /// `1>&2`
    #[token("1>&2")]
    StdoutToStderr,

    /// `>&2` (distinct token from the explicit `1>&2` spelling)
    #[token(">&2")]
    StdoutToStderr2,

    /// `2>`
    #[token("2>")]
    Stderr,

    /// `&>`
    #[token("&>")]
    Both,

    /// `<<<`
    #[token("<<<")]
    HereString,

    /// `<<`
    #[token("<<")]
    HereDocStart,

    /// `;;`
    #[token(";;")]
    DoubleSemi,

    // ═══════════════════════════════════════════════════════════════════
    // Single-character operators and punctuation
    // ═══════════════════════════════════════════════════════════════════
    /// `=`
    #[token("=")]
    Eq,

    /// `|`
    #[token("|")]
    Pipe,

    /// `&`
    #[token("&")]
    Amp,

    /// `>`
    #[token(">")]
    Gt,

    /// `<`
    #[token("<")]
    Lt,

    /// `;`
    #[token(";")]
    Semi,

    /// `:`
    #[token(":")]
    Colon,

    /// `,`
    #[token(",")]
    Comma,

    /// `..`
    #[token("..")]
    DotDot,

    /// A bare `.`
    #[token(".")]
    Dot,

    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// Bare tilde: `~` alone (expands to $HOME)
    #[token("~")]
    Tilde,

    /// Relative path: `../foo/bar`, bare `src/kaish` (ident containing `/`),
    /// or a directory reference with a trailing slash like `dest/`. The
    /// trailing-slash form uses `*` (not `+`) after the slash so `dest/`
    /// lexes as one token instead of `Ident("dest")` + `Path("/")` — the
    /// latter split silently turned `cp a b dest/` into a 4-operand command.
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*/[a-zA-Z0-9_./-]*", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// Dot-slash path: `./foo`, `./script.sh`
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    /// Dot-prefixed bareword: `.parent`, `.gitignore`, `.foo.bar`.
    /// Treated as an opaque string in argv position. Distinct from `Token::Dot`
    /// (the POSIX `.` source alias) which only matches a bare `.` — the source
    /// alias requires whitespace before its file argument (`. script`), so
    /// `.parent` (no space) is unambiguously a single bareword.
    #[regex(r"\.[a-zA-Z_][a-zA-Z0-9_.-]*", lex_dotted_ident, priority = 3)]
    DottedIdent(String),

    /// `{`
    #[token("{")]
    LBrace,

    /// `}`
    #[token("}")]
    RBrace,

    /// `[`
    #[token("[")]
    LBracket,

    /// `]`
    #[token("]")]
    RBracket,

    /// `(`
    #[token("(")]
    LParen,

    /// `)`
    #[token(")")]
    RParen,

    /// `*`
    #[token("*")]
    Star,

    /// `!`
    #[token("!")]
    Bang,

    /// `?`
    #[token("?")]
    Question,

    /// Merged glob word: span-adjacent tokens containing `*`, `?`, or `[...]`.
    /// Synthesized by `merge_glob_adjacent()`, never produced by logos directly.
    GlobWord(String),

    // ═══════════════════════════════════════════════════════════════════
    // Command substitution
    // ═══════════════════════════════════════════════════════════════════

    /// Arithmetic expression content: synthesized by preprocessing.
    /// Contains the expression string between `$((` and `))`.
    Arithmetic(String),

    /// Command substitution start: `$(` - begins a command substitution
    #[token("$(")]
    CmdSubstStart,

    // ═══════════════════════════════════════════════════════════════════
    // Flags (must come before Int to win over negative numbers)
    // ═══════════════════════════════════════════════════════════════════

    /// Long flag: `--name` or `--foo-bar`. The value omits the leading `--`.
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// Short flag: `-l` or `-la` (combined short flags). The value omits the leading `-`.
    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// Plus flag: `+e` or `+x` (for set +e to disable options). The value omits the leading `+`.
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// Double dash: `--` alone marks end of flags
    #[token("--")]
    DoubleDash,

    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
    /// For date format strings and similar. Lower priority than PlusFlag.
    /// The value keeps the leading `+`.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
    /// Excludes - after first - to avoid matching --name patterns.
    /// The value keeps the leading `-`.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
    /// Only matches when followed by whitespace or end.
    /// This is handled specially in the parser as a positional arg.
    /// NOTE(review): the pattern itself is just the literal `-`; the
    /// "followed by whitespace or end" behaviour presumably comes from the
    /// longer flag/number/bare-word patterns winning otherwise — confirm.
    #[token("-")]
    MinusAlone,

    // ═══════════════════════════════════════════════════════════════════
    // Literals (with values)
    // ═══════════════════════════════════════════════════════════════════

    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string: `'...'` - literal content, no escape processing
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the
    /// full matched text including `${` and `}` (`lex_varref` keeps the whole
    /// slice so path segments can be parsed later)
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// Simple variable reference: `$NAME` - just the identifier
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// Positional parameter: `$0` through `$9`
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    /// All positional parameters: `$@`
    #[token("$@")]
    AllArgs,

    /// Number of positional parameters: `$#`
    #[token("$#")]
    ArgCount,

    /// Last exit code: `$?`
    #[token("$?")]
    LastExitCode,

    /// Current shell PID: `$$`
    #[token("$$")]
    CurrentPid,

    /// Variable string length: `${#VAR}` - value is the variable name only
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Here-doc content: synthesized by preprocessing, not directly lexed.
    /// Contains the full content of the here-doc (without the delimiter lines).
    HereDoc(HereDocData),

    /// Integer literal - value is the parsed i64
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Float literal - value is the parsed f64
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    // ═══════════════════════════════════════════════════════════════════
    // Digit-leading barewords and invalid numeric patterns
    // (the invalid ones are caught before valid tokens for better errors)
    // ═══════════════════════════════════════════════════════════════════

    /// Digit-leading bareword: `019dda1c` (SHA prefix), UUIDs, version-ish
    /// strings. Distinguished from `Int` because at least one alpha character
    /// follows the leading digits — the lexer commits to "this is a string,
    /// not a number." Treated as a bareword string in expression position.
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_.-]*", lex_number_ident, priority = 3)]
    NumberIdent(String),

    /// Invalid: float without leading digit (like .5)
    /// The callback always errors, so this variant is never emitted as a token.
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// Invalid: float without trailing digit (like 5.)
    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
    /// The callback always errors, so this variant is never emitted as a token.
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ═══════════════════════════════════════════════════════════════════
    // Paths (absolute paths starting with /)
    // ═══════════════════════════════════════════════════════════════════

    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
    /// The character class after the slash is optional (`*`), so a lone `/` also matches.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    // ═══════════════════════════════════════════════════════════════════
    // Identifiers (command names, variable names, etc.)
    // ═══════════════════════════════════════════════════════════════════

    /// Identifier - value is the identifier string
    /// Allows dots for filenames like `script.kai`
    /// `lex_ident` rejects case variants of `true`/`false` and any casing of `yes`/`no`.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ═══════════════════════════════════════════════════════════════════
    // Structural tokens
    // ═══════════════════════════════════════════════════════════════════

    /// Comment: `# ...` to end of line
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// Newline (significant in kaish - ends statements)
    #[regex(r"\n|\r\n")]
    Newline,

    /// Line continuation: backslash at end of line
    /// (spaces or tabs between the backslash and the newline are allowed)
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,

    /// Backtick command substitution — explicitly rejected. Kaish drops
    /// backticks; the callback always errors so users get a dedicated
    /// `BackticksNotSupported` message instead of the generic
    /// `UnexpectedCharacter` they would have hit before. Backticks inside
    /// single/double-quoted strings, heredoc bodies, and comments don't
    /// reach this match — those tokens are matched as a single unit
    /// (strings) or extracted before logos runs (heredocs) or skipped to
    /// EOL (comments).
    #[token("`", reject_backtick)]
    BacktickRejected,
}
586
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// The mapping from tokens to categories is defined by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    /// Also covers `true`/`false` and the type keywords (string, int, float, bool).
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, $1, $@, $#, $?, $$, ${#VAR}
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; ;; : , . ( ) { } [ ] ! ? * — plus newlines, line
    /// continuations and the `$(` command-substitution opener
    Punctuation,
    /// Identifiers and other bare words: `+%s`-style and `-%`-style words,
    /// a lone `-`, digit-leading and dot-prefixed barewords
    Command,
    /// Paths: absolute (/foo/bar), tilde, relative, `./`, `..`, and merged glob words
    Path,
    /// Flags: --long, -s, +x, and the bare `--`
    Flag,
    /// Invalid tokens: malformed floats and rejected backticks
    Error,
}
616
617impl Token {
618    /// Returns the semantic category for syntax highlighting.
619    pub fn category(&self) -> TokenCategory {
620        match self {
621            // Keywords
622            Token::If
623            | Token::Then
624            | Token::Else
625            | Token::Elif
626            | Token::Fi
627            | Token::For
628            | Token::In
629            | Token::Do
630            | Token::Done
631            | Token::While
632            | Token::Case
633            | Token::Esac
634            | Token::Function
635            | Token::Return
636            | Token::Break
637            | Token::Continue
638            | Token::Exit
639            | Token::Set
640            | Token::Local
641            | Token::True
642            | Token::False
643            | Token::TypeString
644            | Token::TypeInt
645            | Token::TypeFloat
646            | Token::TypeBool => TokenCategory::Keyword,
647
648            // Operators and redirections
649            Token::Pipe
650            | Token::And
651            | Token::Or
652            | Token::Amp
653            | Token::Eq
654            | Token::EqEq
655            | Token::NotEq
656            | Token::Match
657            | Token::NotMatch
658            | Token::Lt
659            | Token::Gt
660            | Token::LtEq
661            | Token::GtEq
662            | Token::GtGt
663            | Token::Stderr
664            | Token::Both
665            | Token::HereDocStart
666            | Token::HereString
667            | Token::StderrToStdout
668            | Token::StdoutToStderr
669            | Token::StdoutToStderr2 => TokenCategory::Operator,
670
671            // Strings
672            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
673
674            // Numbers
675            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
676
677            // Variables
678            Token::VarRef(_)
679            | Token::SimpleVarRef(_)
680            | Token::Positional(_)
681            | Token::AllArgs
682            | Token::ArgCount
683            | Token::VarLength(_)
684            | Token::LastExitCode
685            | Token::CurrentPid => TokenCategory::Variable,
686
687            // Flags
688            Token::LongFlag(_)
689            | Token::ShortFlag(_)
690            | Token::PlusFlag(_)
691            | Token::DoubleDash => TokenCategory::Flag,
692
693            // Punctuation
694            Token::Semi
695            | Token::DoubleSemi
696            | Token::Colon
697            | Token::Comma
698            | Token::Dot
699            | Token::LParen
700            | Token::RParen
701            | Token::LBrace
702            | Token::RBrace
703            | Token::LBracket
704            | Token::RBracket
705            | Token::Bang
706            | Token::Question
707            | Token::Star
708            | Token::Newline
709            | Token::LineContinuation
710            | Token::CmdSubstStart => TokenCategory::Punctuation,
711
712            // Glob words (merged tokens containing wildcards)
713            Token::GlobWord(_) => TokenCategory::Path,
714
715            // Comments
716            Token::Comment => TokenCategory::Comment,
717
718            // Paths
719            Token::Path(_)
720            | Token::TildePath(_)
721            | Token::RelativePath(_)
722            | Token::Tilde
723            | Token::DotDot
724            | Token::DotSlashPath(_) => TokenCategory::Path,
725
726            // Commands/identifiers (and bare words)
727            Token::Ident(_)
728            | Token::PlusBare(_)
729            | Token::MinusBare(_)
730            | Token::MinusAlone
731            | Token::NumberIdent(_)
732            | Token::DottedIdent(_) => TokenCategory::Command,
733
734            // Errors
735            Token::InvalidFloatNoLeading
736            | Token::InvalidFloatNoTrailing
737            | Token::BacktickRejected => TokenCategory::Error,
738        }
739    }
740}
741
742/// Lex a double-quoted string literal, processing escape sequences.
743fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
744    parse_string_literal(lex.slice())
745}
746
747/// Lex a single-quoted string literal (no escape processing).
748fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
749    let s = lex.slice();
750    // Strip the surrounding single quotes
751    s[1..s.len() - 1].to_string()
752}
753
754/// Lex a braced variable reference, extracting the inner content.
755fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
756    // Keep the full ${...} for later parsing of path segments
757    lex.slice().to_string()
758}
759
760/// Lex a simple variable reference: `$NAME` → `NAME`
761fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
762    // Strip the leading `$`
763    lex.slice()[1..].to_string()
764}
765
766/// Lex a positional parameter: `$1` → 1
767fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
768    // Strip the leading `$` and parse the digit
769    lex.slice()[1..].parse().unwrap_or(0)
770}
771
772/// Lex a variable length: `${#VAR}` → "VAR"
773fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
774    // Strip the leading `${#` and trailing `}`
775    let s = lex.slice();
776    s[3..s.len() - 1].to_string()
777}
778
779/// Lex an integer literal.
780fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
781    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
782}
783
784/// Lex a float literal.
785fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
786    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
787}
788
789/// Lex a digit-leading bareword like `019dda1c` or `019dda1c-5b3f-7000`.
790/// Distinguished from `Int` because at least one alpha character follows the
791/// leading digits — the slice is treated as a string, not a number.
792fn lex_number_ident(lex: &mut logos::Lexer<Token>) -> String {
793    lex.slice().to_string()
794}
795
796/// Lex a dot-prefixed bareword like `.gitignore` or `.parent.parent`.
797fn lex_dotted_ident(lex: &mut logos::Lexer<Token>) -> String {
798    lex.slice().to_string()
799}
800
/// Lex an invalid float without leading digit (like .5).
/// Always returns Err to produce a lexer error instead of a token.
///
/// Callback for `Token::InvalidFloatNoLeading`; the matched text is not
/// inspected, hence the unused `_lex` parameter.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}
806
/// Reject a backtick — kaish doesn't support backtick command substitution.
/// The dedicated error gives the user a `$(cmd)` hint instead of the generic
/// `UnexpectedCharacter` they would have hit otherwise.
///
/// Callback for `Token::BacktickRejected`; it always returns
/// `LexerError::BackticksNotSupported` and never inspects the lexer.
fn reject_backtick(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::BackticksNotSupported)
}
813
/// Lex an invalid float without trailing digit (like 5.).
/// Always returns Err to produce a lexer error instead of a token.
///
/// Callback for `Token::InvalidFloatNoTrailing`; the matched text is not
/// inspected, hence the unused `_lex` parameter.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
819
820/// Lex an identifier, rejecting ambiguous boolean-like values.
821fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
822    let s = lex.slice();
823
824    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
825    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
826    match s.to_lowercase().as_str() {
827        "true" | "false" if s != "true" && s != "false" => {
828            return Err(LexerError::AmbiguousBoolean(s.to_string()));
829        }
830        _ => {}
831    }
832
833    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
834    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
835        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
836    }
837
838    Ok(s.to_string())
839}
840
841/// Lex a long flag: `--name` → `name`
842fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
843    // Strip the leading `--`
844    lex.slice()[2..].to_string()
845}
846
847/// Lex a short flag: `-l` → `l`, `-la` → `la`
848fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
849    // Strip the leading `-`
850    lex.slice()[1..].to_string()
851}
852
853/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
854fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
855    // Strip the leading `+`
856    lex.slice()[1..].to_string()
857}
858
859/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
860fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
861    lex.slice().to_string()
862}
863
864/// Lex a minus bare word: `-%` → `-%` (keep the full string)
865fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
866    lex.slice().to_string()
867}
868
869/// Lex an absolute path: `/tmp/out` → `/tmp/out`
870fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
871    lex.slice().to_string()
872}
873
874/// Lex a tilde path: `~/foo` → `~/foo`
875fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
876    lex.slice().to_string()
877}
878
879/// Lex a relative path: `../foo` → `../foo`
880fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
881    lex.slice().to_string()
882}
883
884/// Lex a dot-slash path: `./foo` → `./foo`
885fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
886    lex.slice().to_string()
887}
888
889impl fmt::Display for Token {
890    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
891        match self {
892            Token::Set => write!(f, "set"),
893            Token::Local => write!(f, "local"),
894            Token::If => write!(f, "if"),
895            Token::Then => write!(f, "then"),
896            Token::Else => write!(f, "else"),
897            Token::Elif => write!(f, "elif"),
898            Token::Fi => write!(f, "fi"),
899            Token::For => write!(f, "for"),
900            Token::While => write!(f, "while"),
901            Token::In => write!(f, "in"),
902            Token::Do => write!(f, "do"),
903            Token::Done => write!(f, "done"),
904            Token::Case => write!(f, "case"),
905            Token::Esac => write!(f, "esac"),
906            Token::Function => write!(f, "function"),
907            Token::Break => write!(f, "break"),
908            Token::Continue => write!(f, "continue"),
909            Token::Return => write!(f, "return"),
910            Token::Exit => write!(f, "exit"),
911            Token::True => write!(f, "true"),
912            Token::False => write!(f, "false"),
913            Token::TypeString => write!(f, "string"),
914            Token::TypeInt => write!(f, "int"),
915            Token::TypeFloat => write!(f, "float"),
916            Token::TypeBool => write!(f, "bool"),
917            Token::And => write!(f, "&&"),
918            Token::Or => write!(f, "||"),
919            Token::EqEq => write!(f, "=="),
920            Token::NotEq => write!(f, "!="),
921            Token::Match => write!(f, "=~"),
922            Token::NotMatch => write!(f, "!~"),
923            Token::GtEq => write!(f, ">="),
924            Token::LtEq => write!(f, "<="),
925            Token::GtGt => write!(f, ">>"),
926            Token::StderrToStdout => write!(f, "2>&1"),
927            Token::StdoutToStderr => write!(f, "1>&2"),
928            Token::StdoutToStderr2 => write!(f, ">&2"),
929            Token::Stderr => write!(f, "2>"),
930            Token::Both => write!(f, "&>"),
931            Token::HereDocStart => write!(f, "<<"),
932            Token::HereString => write!(f, "<<<"),
933            Token::DoubleSemi => write!(f, ";;"),
934            Token::Eq => write!(f, "="),
935            Token::Pipe => write!(f, "|"),
936            Token::Amp => write!(f, "&"),
937            Token::Gt => write!(f, ">"),
938            Token::Lt => write!(f, "<"),
939            Token::Semi => write!(f, ";"),
940            Token::Colon => write!(f, ":"),
941            Token::Comma => write!(f, ","),
942            Token::Dot => write!(f, "."),
943            Token::DotDot => write!(f, ".."),
944            Token::Tilde => write!(f, "~"),
945            Token::TildePath(s) => write!(f, "{}", s),
946            Token::RelativePath(s) => write!(f, "{}", s),
947            Token::DotSlashPath(s) => write!(f, "{}", s),
948            Token::LBrace => write!(f, "{{"),
949            Token::RBrace => write!(f, "}}"),
950            Token::LBracket => write!(f, "["),
951            Token::RBracket => write!(f, "]"),
952            Token::LParen => write!(f, "("),
953            Token::RParen => write!(f, ")"),
954            Token::Star => write!(f, "*"),
955            Token::Bang => write!(f, "!"),
956            Token::Question => write!(f, "?"),
957            Token::GlobWord(s) => write!(f, "GLOB({})", s),
958            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
959            Token::CmdSubstStart => write!(f, "$("),
960            Token::LongFlag(s) => write!(f, "--{}", s),
961            Token::ShortFlag(s) => write!(f, "-{}", s),
962            Token::PlusFlag(s) => write!(f, "+{}", s),
963            Token::DoubleDash => write!(f, "--"),
964            Token::PlusBare(s) => write!(f, "{}", s),
965            Token::MinusBare(s) => write!(f, "{}", s),
966            Token::MinusAlone => write!(f, "-"),
967            Token::String(s) => write!(f, "STRING({:?})", s),
968            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
969            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
970            Token::VarRef(v) => write!(f, "VARREF({})", v),
971            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
972            Token::Positional(n) => write!(f, "${}", n),
973            Token::AllArgs => write!(f, "$@"),
974            Token::ArgCount => write!(f, "$#"),
975            Token::LastExitCode => write!(f, "$?"),
976            Token::CurrentPid => write!(f, "$$"),
977            Token::VarLength(v) => write!(f, "${{#{}}}", v),
978            Token::Int(n) => write!(f, "INT({})", n),
979            Token::Float(n) => write!(f, "FLOAT({})", n),
980            Token::Path(s) => write!(f, "PATH({})", s),
981            Token::Ident(s) => write!(f, "IDENT({})", s),
982            Token::NumberIdent(s) => write!(f, "NUMIDENT({})", s),
983            Token::DottedIdent(s) => write!(f, "DOTIDENT({})", s),
984            Token::Comment => write!(f, "COMMENT"),
985            Token::Newline => write!(f, "NEWLINE"),
986            Token::LineContinuation => write!(f, "LINECONT"),
987            // These variants should never be produced — their callbacks always return errors
988            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
989            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
990            Token::BacktickRejected => write!(f, "BACKTICK_REJECTED"),
991        }
992    }
993}
994
995impl Token {
996    /// Returns true if this token is a keyword.
997    // Must match the Keyword variants in `Token::category()` (minus the
998    // TypeX variants, which `is_type()` covers separately). Currently
999    // uncalled — kept exhaustive so future callers don't get wrong answers.
1000    pub fn is_keyword(&self) -> bool {
1001        matches!(
1002            self,
1003            Token::Set
1004                | Token::Local
1005                | Token::If
1006                | Token::Then
1007                | Token::Else
1008                | Token::Elif
1009                | Token::Fi
1010                | Token::For
1011                | Token::In
1012                | Token::Do
1013                | Token::Done
1014                | Token::While
1015                | Token::Case
1016                | Token::Esac
1017                | Token::Function
1018                | Token::Return
1019                | Token::Break
1020                | Token::Continue
1021                | Token::Exit
1022                | Token::True
1023                | Token::False
1024        )
1025    }
1026
1027    /// Returns true if this token is a type keyword.
1028    pub fn is_type(&self) -> bool {
1029        matches!(
1030            self,
1031            Token::TypeString
1032                | Token::TypeInt
1033                | Token::TypeFloat
1034                | Token::TypeBool
1035        )
1036    }
1037
1038    /// Returns true if this token starts a statement.
1039    // Currently uncalled — kept exhaustive so future callers don't get wrong answers.
1040    pub fn starts_statement(&self) -> bool {
1041        matches!(
1042            self,
1043            Token::Set
1044                | Token::Local
1045                | Token::Function
1046                | Token::If
1047                | Token::For
1048                | Token::While
1049                | Token::Case
1050                | Token::Ident(_)
1051                | Token::LBracket
1052        )
1053    }
1054
1055    /// Returns true if this token can appear in an expression.
1056    pub fn is_value(&self) -> bool {
1057        matches!(
1058            self,
1059            Token::String(_)
1060                | Token::SingleString(_)
1061                | Token::HereDoc(_)
1062                | Token::Arithmetic(_)
1063                | Token::Int(_)
1064                | Token::Float(_)
1065                | Token::True
1066                | Token::False
1067                | Token::VarRef(_)
1068                | Token::SimpleVarRef(_)
1069                | Token::CmdSubstStart
1070                | Token::Path(_)
1071                | Token::GlobWord(_)
1072                | Token::LastExitCode
1073                | Token::CurrentPid
1074        )
1075    }
1076}
1077
1078/// Result of preprocessing arithmetic expressions.
1079struct ArithmeticPreprocessResult {
1080    /// Preprocessed source with markers replacing $((expr)).
1081    text: String,
1082    /// Vector of (marker, expression_content) pairs.
1083    arithmetics: Vec<(String, String)>,
1084    /// Span replacements for correcting token positions.
1085    replacements: Vec<SpanReplacement>,
1086}
1087
/// Copy one `$(...)` command substitution to `result` without interpreting it.
///
/// The closing parenthesis is found by counting parentheses, but only those
/// that appear outside single quotes, double quotes and backslash escapes,
/// so a `(` or `)` inside a string does not disturb the count.
///
/// * `chars` — the whole input as a char slice.
/// * `i` — on entry, index of the `$` of `$(`; on exit, index just past the
///   matching `)` (or `chars.len()` if the substitution is never closed).
/// * `source_pos` — byte offset kept in step with `i`.
/// * `result` — receives every consumed character verbatim.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    /// Quoting context of the scanner.
    #[derive(Clone, Copy)]
    enum Quoting {
        Bare,
        Single,
        Double,
    }

    /// Copy `count` chars starting at `*i`, advancing both cursors.
    fn emit(
        chars: &[char],
        i: &mut usize,
        source_pos: &mut usize,
        result: &mut String,
        count: usize,
    ) {
        for &ch in &chars[*i..*i + count] {
            result.push(ch);
            *source_pos += ch.len_utf8();
        }
        *i += count;
    }

    // The caller guarantees we are sitting on `$(`.
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1;
    let mut mode = Quoting::Bare;

    while open_parens > 0 && *i < chars.len() {
        let current = chars[*i];
        let following = chars.get(*i + 1).copied();

        // Decide how many chars this step consumes and update the state.
        let width = match mode {
            Quoting::Single => {
                // No escapes inside single quotes; only `'` ends them.
                if current == '\'' {
                    mode = Quoting::Bare;
                }
                1
            }
            Quoting::Double => match (current, following) {
                // Only these four escapes are taken as a pair.
                ('\\', Some('"' | '\\' | '$' | '`')) => 2,
                ('"', _) => {
                    mode = Quoting::Bare;
                    1
                }
                _ => 1,
            },
            Quoting::Bare => match (current, following) {
                ('\'', _) => {
                    mode = Quoting::Single;
                    1
                }
                ('"', _) => {
                    mode = Quoting::Double;
                    1
                }
                // Any escaped char is copied together with its backslash.
                ('\\', Some(_)) => 2,
                ('(', _) => {
                    open_parens += 1;
                    1
                }
                (')', _) => {
                    open_parens -= 1;
                    1
                }
                _ => 1,
            },
        };

        emit(chars, i, source_pos, result, width);
    }
}
1185
1186/// Preprocess arithmetic expressions in source code.
1187///
1188/// Finds `$((expr))` patterns and replaces them with markers.
1189/// Returns the preprocessed source, arithmetic contents, and span replacement info.
1190///
1191/// Example:
1192///   `X=$((1 + 2))`
1193/// Becomes:
1194///   `X=__KAISH_ARITH_{id}__`
1195/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
1196///
1197/// # Errors
1198/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
1199fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
1200    let mut result = String::with_capacity(source.len());
1201    let mut arithmetics: Vec<(String, String)> = Vec::new();
1202    let mut replacements: Vec<SpanReplacement> = Vec::new();
1203    let mut source_pos: usize = 0;
1204    let chars_vec: Vec<char> = source.chars().collect();
1205    let mut i = 0;
1206
1207    // Whether we're currently inside double quotes. Single quotes inside
1208    // double quotes are literal characters, not quote delimiters.
1209    let mut in_double_quote = false;
1210
1211    while i < chars_vec.len() {
1212        let ch = chars_vec[i];
1213
1214        // Backslash escape outside quotes — skip both chars verbatim
1215        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1216            result.push(ch);
1217            result.push(chars_vec[i + 1]);
1218            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
1219            i += 2;
1220            continue;
1221        }
1222
1223        // Single quote — only starts quote mode when NOT inside double quotes
1224        if ch == '\'' && !in_double_quote {
1225            result.push(ch);
1226            i += 1;
1227            source_pos += 1;
1228            while i < chars_vec.len() && chars_vec[i] != '\'' {
1229                result.push(chars_vec[i]);
1230                source_pos += chars_vec[i].len_utf8();
1231                i += 1;
1232            }
1233            if i < chars_vec.len() {
1234                result.push(chars_vec[i]); // closing quote
1235                source_pos += 1;
1236                i += 1;
1237            }
1238            continue;
1239        }
1240
1241        // Double quote — toggle state (arithmetic is still expanded inside)
1242        if ch == '"' {
1243            in_double_quote = !in_double_quote;
1244            result.push(ch);
1245            i += 1;
1246            source_pos += 1;
1247            continue;
1248        }
1249
1250        // Backslash escape inside double quotes — only \" and \\ are special
1251        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1252            let next = chars_vec[i + 1];
1253            if next == '"' || next == '\\' || next == '$' || next == '`' {
1254                result.push(ch);
1255                result.push(next);
1256                source_pos += ch.len_utf8() + next.len_utf8();
1257                i += 2;
1258                continue;
1259            }
1260        }
1261
1262        // Comment — copy verbatim from `#` through end-of-line so apostrophes
1263        // and `$((..))` inside the comment body don't get processed. Logos's
1264        // own comment regex `#[^\n\r]*` doesn't require a word boundary, so
1265        // we match that: any `#` outside double quotes (and outside single
1266        // quotes — those are consumed above as a single run) starts a comment.
1267        // The newline is left for the next iteration so newline-significance
1268        // and span tracking are preserved.
1269        if ch == '#' && !in_double_quote {
1270            while i < chars_vec.len() && chars_vec[i] != '\n' && chars_vec[i] != '\r' {
1271                result.push(chars_vec[i]);
1272                source_pos += chars_vec[i].len_utf8();
1273                i += 1;
1274            }
1275            continue;
1276        }
1277
1278        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand
1279        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
1280            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
1281        {
1282            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
1283            continue;
1284        }
1285
1286        // Look for $(( (potential arithmetic)
1287        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
1288            let arith_start_pos = result.len();
1289            let original_start = source_pos;
1290
1291            // Skip $((
1292            i += 3;
1293            source_pos += 3;
1294
1295            // Collect expression until matching ))
1296            let mut expr = String::new();
1297            let mut paren_depth: usize = 0;
1298
1299            while i < chars_vec.len() {
1300                let c = chars_vec[i];
1301                match c {
1302                    '(' => {
1303                        paren_depth += 1;
1304                        if paren_depth > MAX_PAREN_DEPTH {
1305                            return Err(LexerError::NestingTooDeep);
1306                        }
1307                        expr.push('(');
1308                        i += 1;
1309                        source_pos += c.len_utf8();
1310                    }
1311                    ')' => {
1312                        if paren_depth > 0 {
1313                            paren_depth -= 1;
1314                            expr.push(')');
1315                            i += 1;
1316                            source_pos += 1;
1317                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
1318                            // Found closing ))
1319                            i += 2;
1320                            source_pos += 2;
1321                            break;
1322                        } else {
1323                            // Single ) inside - keep going
1324                            expr.push(')');
1325                            i += 1;
1326                            source_pos += 1;
1327                        }
1328                    }
1329                    _ => {
1330                        expr.push(c);
1331                        i += 1;
1332                        source_pos += c.len_utf8();
1333                    }
1334                }
1335            }
1336
1337            // Calculate original length: from $$(( to ))
1338            let original_len = source_pos - original_start;
1339
1340            // Create a unique marker for this arithmetic (collision-resistant)
1341            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
1342            let marker_len = marker.len();
1343
1344            // Record the replacement for span correction
1345            replacements.push(SpanReplacement {
1346                preprocessed_pos: arith_start_pos,
1347                marker_len,
1348                original_len,
1349            });
1350
1351            arithmetics.push((marker.clone(), expr));
1352            result.push_str(&marker);
1353        } else {
1354            result.push(ch);
1355            i += 1;
1356            source_pos += ch.len_utf8();
1357        }
1358    }
1359
1360    Ok(ArithmeticPreprocessResult {
1361        text: result,
1362        arithmetics,
1363        replacements,
1364    })
1365}
1366
/// One here-document found by `preprocess_heredocs`, kept alongside the
/// marker that replaced it.
///
/// The record carries everything the parser, validator and interpreter need
/// to rebuild the body with the right semantics.
#[derive(Debug, Clone)]
struct HeredocReplacement {
    /// Marker text substituted for the delimiter in the preprocessed source.
    marker: String,
    /// Raw body text. Tab stripping for `<<-` is NOT applied here; it is
    /// deferred to materialization so byte offsets stay aligned with the
    /// original source for span tracking.
    body: String,
    /// True when the delimiter was quoted, i.e. the body is not interpolated.
    literal: bool,
    /// True when the `<<-` form was used.
    strip_tabs: bool,
    /// Byte offset of the first body character within the string passed to
    /// `preprocess_heredocs`. If heredocs are preprocessed after arithmetic,
    /// this is in arith-preprocessed coordinates; when no arithmetic precedes
    /// the heredoc it equals the original-source offset. See the
    /// span-correction notes in `tokenize`.
    body_start_offset: usize,
}
1389
1390/// Preprocess here-docs in source code.
1391///
1392/// Finds `<<WORD` patterns and collects content until the delimiter line.
1393/// Returns the preprocessed source and a vector of replacement records.
1394///
1395/// Example:
1396///   `cat <<EOF\nhello\nworld\nEOF`
1397/// Becomes:
1398///   `cat <<__HEREDOC_0__`
1399/// With heredocs[0] = HeredocReplacement { marker: "__HEREDOC_0__",
1400/// body: "hello\nworld", literal: false, strip_tabs: false }
1401fn preprocess_heredocs(source: &str) -> Result<(String, Vec<HeredocReplacement>), Spanned<LexerError>> {
1402    let mut result = String::with_capacity(source.len());
1403    let mut heredocs: Vec<HeredocReplacement> = Vec::new();
1404    let chars_vec: Vec<char> = source.chars().collect();
1405    let mut i = 0;
1406    // `pos` tracks the byte offset into `source` corresponding to chars_vec[i].
1407    // `result` accumulates output; we record body offsets in `pos` (input-side)
1408    // and emit positions via `result.len()` (output-side) where needed.
1409    let mut pos: usize = 0;
1410
1411    while i < chars_vec.len() {
1412        let ch = chars_vec[i];
1413
1414        // Pass <<< through verbatim so the logos tokenizer sees the here-string
1415        // operator. If we fell through naively, the next iteration would see
1416        // the remaining `<<` and misfire heredoc preprocessing.
1417        if ch == '<'
1418            && chars_vec.get(i + 1) == Some(&'<')
1419            && chars_vec.get(i + 2) == Some(&'<')
1420        {
1421            result.push_str("<<<");
1422            i += 3;
1423            pos += 3;
1424            continue;
1425        }
1426
1427        // Look for << (potential here-doc).
1428        if ch == '<' && chars_vec.get(i + 1) == Some(&'<') {
1429            // Remember where the `<<` started so an unterminated-heredoc
1430            // error can point back at the introducer rather than at EOF.
1431            let introducer_start = pos;
1432            i += 2; // consume both '<'
1433            pos += 2;
1434
1435            // Check for optional - (strip leading tabs)
1436            let strip_tabs = chars_vec.get(i) == Some(&'-');
1437            if strip_tabs {
1438                i += 1;
1439                pos += 1;
1440            }
1441
1442            // Skip whitespace before delimiter
1443            while let Some(&c) = chars_vec.get(i) {
1444                if c == ' ' || c == '\t' {
1445                    i += 1;
1446                    pos += 1;
1447                } else {
1448                    break;
1449                }
1450            }
1451
1452            // Collect the delimiter word
1453            let mut delimiter = String::new();
1454            let quoted = chars_vec.get(i) == Some(&'\'') || chars_vec.get(i) == Some(&'"');
1455            let quote_char = if quoted {
1456                let q = chars_vec.get(i).copied();
1457                i += 1;
1458                pos += 1;
1459                q
1460            } else {
1461                None
1462            };
1463
1464            while let Some(&c) = chars_vec.get(i) {
1465                if quoted {
1466                    if Some(c) == quote_char {
1467                        i += 1; // consume closing quote
1468                        pos += 1;
1469                        break;
1470                    }
1471                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1472                    break;
1473                }
1474                delimiter.push(c);
1475                i += 1;
1476                pos += c.len_utf8();
1477            }
1478
1479            if delimiter.is_empty() {
1480                // Not a valid here-doc, output << literally
1481                result.push_str("<<");
1482                if strip_tabs {
1483                    result.push('-');
1484                }
1485                continue;
1486            }
1487
1488            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1489            // This must be emitted AFTER the heredoc marker, not before.
1490            let mut after_delimiter = String::new();
1491            while let Some(&c) = chars_vec.get(i) {
1492                if c == '\n' {
1493                    i += 1;
1494                    pos += 1;
1495                    break;
1496                } else if c == '\r' {
1497                    i += 1;
1498                    pos += 1;
1499                    if chars_vec.get(i) == Some(&'\n') {
1500                        i += 1;
1501                        pos += 1;
1502                    }
1503                    break;
1504                }
1505                after_delimiter.push(c);
1506                i += 1;
1507                pos += c.len_utf8();
1508            }
1509
1510            // Collect content until delimiter on its own line.
1511            // `body_start_offset` is the byte position of the first char of
1512            // the body in the source — first char after the newline that
1513            // ended the delimiter line. See HeredocReplacement docs for
1514            // coordinate-system caveat (arith-preprocessed, not original).
1515            let body_start_offset = pos;
1516            let mut content = String::new();
1517            let mut current_line = String::new();
1518
1519            loop {
1520                let next = chars_vec.get(i).copied();
1521                match next {
1522                    Some('\n') => {
1523                        i += 1;
1524                        pos += 1;
1525                        // Check if this line is the delimiter
1526                        let trimmed = if strip_tabs {
1527                            current_line.trim_start_matches('\t')
1528                        } else {
1529                            &current_line
1530                        };
1531                        if trimmed == delimiter {
1532                            // Found end of here-doc
1533                            break;
1534                        }
1535                        // Add line to content (including empty lines)
1536                        content.push_str(&current_line);
1537                        content.push('\n');
1538                        current_line.clear();
1539                    }
1540                    Some('\r') => {
1541                        i += 1;
1542                        pos += 1;
1543                        // Detect CRLF vs bare CR. We strip the line ending
1544                        // for delimiter matching (so `EOF\r` still matches
1545                        // `EOF`) but preserve the original byte sequence in
1546                        // the body content — the user's input is honored
1547                        // verbatim.
1548                        let crlf = chars_vec.get(i) == Some(&'\n');
1549                        if crlf {
1550                            i += 1;
1551                            pos += 1;
1552                        }
1553                        let trimmed = if strip_tabs {
1554                            current_line.trim_start_matches('\t')
1555                        } else {
1556                            &current_line
1557                        };
1558                        if trimmed == delimiter {
1559                            break;
1560                        }
1561                        content.push_str(&current_line);
1562                        content.push_str(if crlf { "\r\n" } else { "\r" });
1563                        current_line.clear();
1564                    }
1565                    Some(c) => {
1566                        current_line.push(c);
1567                        i += 1;
1568                        pos += c.len_utf8();
1569                    }
1570                    None => {
1571                        // EOF — check if current line is the delimiter (matches
1572                        // when the source ends without a trailing newline).
1573                        let trimmed = if strip_tabs {
1574                            current_line.trim_start_matches('\t')
1575                        } else {
1576                            &current_line
1577                        };
1578                        if trimmed == delimiter {
1579                            break;
1580                        }
1581                        // Not a delimiter — the heredoc was never closed.
1582                        // Crash rather than silently using whatever we
1583                        // collected: missing data is exactly the failure
1584                        // mode where silent fallback masks the bug.
1585                        let span_end = introducer_start
1586                            + 2
1587                            + if strip_tabs { 1 } else { 0 }
1588                            + delimiter.len();
1589                        return Err(Spanned::new(
1590                            LexerError::UnterminatedHeredoc {
1591                                delimiter: delimiter.clone(),
1592                            },
1593                            introducer_start..span_end,
1594                        ));
1595                    }
1596                }
1597            }
1598
1599            // Create a unique marker for this here-doc (collision-resistant)
1600            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1601            heredocs.push(HeredocReplacement {
1602                marker: marker.clone(),
1603                body: content,
1604                literal: quoted,
1605                strip_tabs,
1606                body_start_offset,
1607            });
1608
1609            // Output <<marker first, then any text that followed the delimiter
1610            // (e.g., " | jq") so the heredoc attaches to the correct command.
1611            result.push_str("<<");
1612            result.push_str(&marker);
1613            result.push_str(&after_delimiter);
1614            result.push('\n');
1615        } else {
1616            result.push(ch);
1617            i += 1;
1618            pos += ch.len_utf8();
1619        }
1620    }
1621
1622    Ok((result, heredocs))
1623}
1624
1625/// Extract the text contribution of a token for colon-adjacent merging.
1626///
1627/// Returns `Some(text)` for token types that can participate in word-like
1628/// merging, `None` for everything else.
1629fn mergeable_text(token: &Token) -> Option<String> {
1630    match token {
1631        Token::Ident(s) => Some(s.clone()),
1632        Token::NumberIdent(s) => Some(s.clone()),
1633        Token::DottedIdent(s) => Some(s.clone()),
1634        Token::Colon => Some(":".to_string()),
1635        Token::Int(n) => Some(n.to_string()),
1636        Token::Path(p) => Some(p.clone()),
1637        Token::Float(f) => Some(f.to_string()),
1638        _ => None,
1639    }
1640}
1641
1642/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1643///
1644/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1645/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1646///
1647/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1648/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1649/// colons or standalone tokens pass through unchanged.
1650fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1651    if tokens.is_empty() {
1652        return tokens;
1653    }
1654
1655    let mut result = Vec::with_capacity(tokens.len());
1656    let mut run: Vec<&Spanned<Token>> = Vec::new();
1657
1658    for token in &tokens {
1659        if run.is_empty() {
1660            if mergeable_text(&token.token).is_some() {
1661                run.push(token);
1662            } else {
1663                result.push(token.clone());
1664            }
1665            continue;
1666        }
1667
1668        // Check span adjacency: previous run's last token ends where this one starts
1669        // Safety: run is non-empty (checked above)
1670        let Some(last) = run.last() else { unreachable!() };
1671        let adjacent = last.span.end == token.span.start;
1672
1673        if adjacent && mergeable_text(&token.token).is_some() {
1674            run.push(token);
1675        } else {
1676            flush_colon_run(&mut run, &mut result);
1677            if mergeable_text(&token.token).is_some() {
1678                run.push(token);
1679            } else {
1680                result.push(token.clone());
1681            }
1682        }
1683    }
1684
1685    flush_colon_run(&mut run, &mut result);
1686
1687    result
1688}
1689
1690/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1691fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1692    if run.is_empty() {
1693        return;
1694    }
1695
1696    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1697
1698    if run.len() >= 2 && has_colon {
1699        let text: String = run
1700            .iter()
1701            .filter_map(|t| mergeable_text(&t.token))
1702            .collect();
1703        // Safety: run.len() >= 2 so first/last exist
1704        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1705        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1706        result.push(Spanned::new(Token::Ident(text), start..end));
1707    } else {
1708        for t in run.iter() {
1709            result.push((*t).clone());
1710        }
1711    }
1712
1713    run.clear();
1714}
1715
1716/// Extract the text contribution of a token that can participate in a glob word.
1717///
1718/// Returns `Some(text)` for tokens that can be part of a glob pattern (identifiers,
1719/// wildcard chars, brackets, paths, etc.), `None` for structural tokens.
1720fn glob_mergeable_text(token: &Token) -> Option<String> {
1721    match token {
1722        Token::Star => Some("*".to_string()),
1723        Token::Question => Some("?".to_string()),
1724        Token::Dot => Some(".".to_string()),
1725        Token::DotDot => Some("..".to_string()),
1726        Token::Ident(s) => Some(s.clone()),
1727        Token::NumberIdent(s) => Some(s.clone()),
1728        Token::DottedIdent(s) => Some(s.clone()),
1729        Token::Path(s) => Some(s.clone()),
1730        Token::Int(n) => Some(n.to_string()),
1731        Token::LBracket => Some("[".to_string()),
1732        Token::RBracket => Some("]".to_string()),
1733        Token::Bang => Some("!".to_string()),
1734        Token::DotSlashPath(s) => Some(s.clone()),
1735        Token::RelativePath(s) => Some(s.clone()),
1736        Token::TildePath(s) => Some(s.clone()),
1737        Token::Tilde => Some("~".to_string()),
1738        Token::LBrace => Some("{".to_string()),
1739        Token::RBrace => Some("}".to_string()),
1740        Token::Comma => Some(",".to_string()),
1741        _ => None,
1742    }
1743}
1744
1745/// Merge span-adjacent token runs containing glob metacharacters into `GlobWord` tokens.
1746///
1747/// A run is merged into `GlobWord` when it contains at least one `Star`, `Question`,
1748/// or a `LBracket`+`RBracket` pair. Runs without glob chars pass through unchanged.
1749///
1750/// Runs after colon merge: `foo::bar` stays as `Ident("foo::bar")` because colon merge
1751/// already fused it before this pass sees it.
1752fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1753    if tokens.is_empty() {
1754        return tokens;
1755    }
1756
1757    let mut result = Vec::with_capacity(tokens.len());
1758    let mut run: Vec<&Spanned<Token>> = Vec::new();
1759
1760    for token in &tokens {
1761        if run.is_empty() {
1762            if glob_mergeable_text(&token.token).is_some() {
1763                run.push(token);
1764            } else {
1765                result.push(token.clone());
1766            }
1767            continue;
1768        }
1769
1770        // Safety: run is non-empty (checked at top of loop)
1771        let Some(last) = run.last() else { unreachable!() };
1772        let adjacent = last.span.end == token.span.start;
1773
1774        if adjacent && glob_mergeable_text(&token.token).is_some() {
1775            run.push(token);
1776        } else {
1777            flush_glob_run(&mut run, &mut result);
1778            if glob_mergeable_text(&token.token).is_some() {
1779                run.push(token);
1780            } else {
1781                result.push(token.clone());
1782            }
1783        }
1784    }
1785
1786    flush_glob_run(&mut run, &mut result);
1787
1788    result
1789}
1790
1791/// Flush a run of glob-mergeable tokens: merge if it contains glob metacharacters.
1792fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1793    if run.is_empty() {
1794        return;
1795    }
1796
1797    let has_glob = run.iter().any(|t| {
1798        matches!(t.token, Token::Star | Token::Question)
1799    }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1800        && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1801
1802    if run.len() >= 2 && has_glob {
1803        let text: String = run
1804            .iter()
1805            .filter_map(|t| glob_mergeable_text(&t.token))
1806            .collect();
1807        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1808        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1809        result.push(Spanned::new(Token::GlobWord(text), start..end));
1810    } else {
1811        for t in run.iter() {
1812            result.push((*t).clone());
1813        }
1814    }
1815
1816    run.clear();
1817}
1818
1819/// Tokenize source code into a vector of spanned tokens.
1820///
1821/// Skips whitespace and comments (unless you need them for formatting).
1822/// Returns errors with their positions for nice error messages.
1823///
1824/// Handles:
1825/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
1826/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
1827/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
1828pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1829    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
1830    let arith_result = preprocess_arithmetic(source)
1831        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1832
1833    // Then preprocess here-docs. Spans inside the heredoc preprocessor are in
1834    // arith-preprocessed coords; correct back to original-source coords before
1835    // surfacing the error to keep parser diagnostics aligned with source.
1836    let span_replacements = arith_result.replacements;
1837    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text)
1838        .map_err(|e| {
1839            let span = correct_span(e.span, &span_replacements);
1840            vec![Spanned::new(e.token, span)]
1841        })?;
1842
1843    let lexer = Token::lexer(&preprocessed);
1844    let mut tokens = Vec::new();
1845    let mut errors = Vec::new();
1846
1847    for (result, span) in lexer.spanned() {
1848        // Correct the span from preprocessed coordinates to original coordinates
1849        let corrected_span = correct_span(span, &span_replacements);
1850        match result {
1851            Ok(token) => {
1852                // Skip comments and line continuations - they're not needed for parsing
1853                if !matches!(token, Token::Comment | Token::LineContinuation) {
1854                    tokens.push(Spanned::new(token, corrected_span));
1855                }
1856            }
1857            Err(err) => {
1858                errors.push(Spanned::new(err, corrected_span));
1859            }
1860        }
1861    }
1862
1863    if !errors.is_empty() {
1864        return Err(errors);
1865    }
1866
1867    // Post-process: replace markers with actual token content
1868    let mut final_tokens = Vec::with_capacity(tokens.len());
1869    let mut i = 0;
1870
1871    while i < tokens.len() {
1872        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
1873        if let Token::Ident(ref name) = tokens[i].token
1874            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1875                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1876                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1877                    i += 1;
1878                    continue;
1879                }
1880
1881        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
1882        if matches!(tokens[i].token, Token::HereDocStart) {
1883            // Check if next token is a heredoc marker
1884            if i + 1 < tokens.len()
1885                && let Token::Ident(ref name) = tokens[i + 1].token
1886                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1887                        // Find the corresponding content
1888                        if let Some(hd) = heredocs.iter().find(|h| h.marker == *name) {
1889                            // Re-thread arithmetic markers that the arith
1890                            // preprocessor planted in the source — without
1891                            // this, `<<EOF\n$((1+2))\nEOF` materializes the
1892                            // marker text instead of `3`. Mirrors the
1893                            // String-content translation a few lines below.
1894                            // - Literal heredocs (no expansion): restore the
1895                            //   original `$((expr))` text verbatim.
1896                            // - Interpolated heredocs: wrap as
1897                            //   `${__ARITH:expr__}` so the spanned
1898                            //   interpolation parser turns it into a
1899                            //   StringPart::Arithmetic.
1900                            let mut content = hd.body.clone();
1901                            for (marker, expr) in &arith_result.arithmetics {
1902                                if content.contains(marker) {
1903                                    let replacement = if hd.literal {
1904                                        format!("$(({}))", expr)
1905                                    } else {
1906                                        format!("${{__ARITH:{}__}}", expr)
1907                                    };
1908                                    content = content.replace(marker, &replacement);
1909                                }
1910                            }
1911                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1912                            final_tokens.push(Spanned::new(
1913                                Token::HereDoc(HereDocData {
1914                                    content,
1915                                    literal: hd.literal,
1916                                    strip_tabs: hd.strip_tabs,
1917                                    body_start_offset: hd.body_start_offset,
1918                                }),
1919                                tokens[i + 1].span.clone(),
1920                            ));
1921                            i += 2;
1922                            continue;
1923                        }
1924                    }
1925        }
1926
1927        // Check for arithmetic markers inside string content
1928        let token = if let Token::String(ref s) = tokens[i].token {
1929            // Check if string contains any arithmetic markers
1930            let mut new_content = s.clone();
1931            for (marker, expr) in &arith_result.arithmetics {
1932                if new_content.contains(marker) {
1933                    // Replace marker with the special format that parse_interpolated_string can detect
1934                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
1935                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1936                }
1937            }
1938            if new_content != *s {
1939                Spanned::new(Token::String(new_content), tokens[i].span.clone())
1940            } else {
1941                tokens[i].clone()
1942            }
1943        } else {
1944            tokens[i].clone()
1945        };
1946        final_tokens.push(token);
1947        i += 1;
1948    }
1949
1950    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
1951}
1952
1953/// Tokenize source code, preserving comments.
1954///
1955/// Useful for pretty-printing or formatting tools that need to preserve comments.
1956pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1957    let lexer = Token::lexer(source);
1958    let mut tokens = Vec::new();
1959    let mut errors = Vec::new();
1960
1961    for (result, span) in lexer.spanned() {
1962        match result {
1963            Ok(token) => {
1964                tokens.push(Spanned::new(token, span));
1965            }
1966            Err(err) => {
1967                errors.push(Spanned::new(err, span));
1968            }
1969        }
1970    }
1971
1972    if errors.is_empty() {
1973        Ok(tokens)
1974    } else {
1975        Err(errors)
1976    }
1977}
1978
1979/// Extract the string content from a string token (removes quotes, processes escapes).
1980pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1981    // Remove surrounding quotes
1982    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1983        return Err(LexerError::UnterminatedString);
1984    }
1985
1986    let inner = &source[1..source.len() - 1];
1987    let mut result = String::with_capacity(inner.len());
1988    let mut chars = inner.chars().peekable();
1989
1990    while let Some(ch) = chars.next() {
1991        if ch == '\\' {
1992            match chars.next() {
1993                Some('n') => result.push('\n'),
1994                Some('t') => result.push('\t'),
1995                Some('r') => result.push('\r'),
1996                Some('\\') => result.push('\\'),
1997                Some('"') => result.push('"'),
1998                // Use a unique marker for escaped dollar that won't be re-interpreted
1999                // parse_interpolated_string will convert this back to $
2000                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
2001                Some('u') => {
2002                    // Unicode escape: \uXXXX
2003                    let mut hex = String::with_capacity(4);
2004                    for _ in 0..4 {
2005                        match chars.next() {
2006                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
2007                            _ => return Err(LexerError::InvalidEscape),
2008                        }
2009                    }
2010                    let codepoint = u32::from_str_radix(&hex, 16)
2011                        .map_err(|_| LexerError::InvalidEscape)?;
2012                    let ch = char::from_u32(codepoint)
2013                        .ok_or(LexerError::InvalidEscape)?;
2014                    result.push(ch);
2015                }
2016                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
2017                Some(next) => {
2018                    result.push('\\');
2019                    result.push(next);
2020                }
2021                None => return Err(LexerError::InvalidEscape),
2022            }
2023        } else {
2024            result.push(ch);
2025        }
2026    }
2027
2028    Ok(result)
2029}
2030
2031/// Parse a variable reference, extracting the path segments.
2032/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
2033pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
2034    // Remove ${ and }
2035    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
2036        return Err(LexerError::UnterminatedVarRef);
2037    }
2038
2039    let inner = &source[2..source.len() - 1];
2040
2041    // Special case: $? (last result)
2042    if inner == "?" {
2043        return Ok(vec!["?".to_string()]);
2044    }
2045
2046    let mut segments = Vec::new();
2047    let mut current = String::new();
2048    let mut chars = inner.chars().peekable();
2049
2050    while let Some(ch) = chars.next() {
2051        match ch {
2052            '.' => {
2053                if !current.is_empty() {
2054                    segments.push(current.clone());
2055                    current.clear();
2056                }
2057            }
2058            '[' => {
2059                if !current.is_empty() {
2060                    segments.push(current.clone());
2061                    current.clear();
2062                }
2063                // Collect the index
2064                let mut index = String::from("[");
2065                while let Some(&c) = chars.peek() {
2066                    if let Some(c) = chars.next() {
2067                        index.push(c);
2068                    }
2069                    if c == ']' {
2070                        break;
2071                    }
2072                }
2073                segments.push(index);
2074            }
2075            _ => {
2076                current.push(ch);
2077            }
2078        }
2079    }
2080
2081    if !current.is_empty() {
2082        segments.push(current);
2083    }
2084
2085    Ok(segments)
2086}
2087
2088/// Parse an integer literal.
2089pub fn parse_int(source: &str) -> Result<i64, LexerError> {
2090    source.parse().map_err(|_| LexerError::InvalidNumber)
2091}
2092
2093/// Parse a float literal.
2094pub fn parse_float(source: &str) -> Result<f64, LexerError> {
2095    source.parse().map_err(|_| LexerError::InvalidNumber)
2096}
2097
2098#[cfg(test)]
2099mod tests {
2100    use super::*;
2101
2102    fn lex(source: &str) -> Vec<Token> {
2103        tokenize(source)
2104            .expect("lexer should succeed")
2105            .into_iter()
2106            .map(|s| s.token)
2107            .collect()
2108    }
2109
2110    // ═══════════════════════════════════════════════════════════════════
2111    // Keyword tests
2112    // ═══════════════════════════════════════════════════════════════════
2113
2114    #[test]
2115    fn keywords() {
2116        assert_eq!(lex("set"), vec![Token::Set]);
2117        assert_eq!(lex("if"), vec![Token::If]);
2118        assert_eq!(lex("then"), vec![Token::Then]);
2119        assert_eq!(lex("else"), vec![Token::Else]);
2120        assert_eq!(lex("elif"), vec![Token::Elif]);
2121        assert_eq!(lex("fi"), vec![Token::Fi]);
2122        assert_eq!(lex("for"), vec![Token::For]);
2123        assert_eq!(lex("in"), vec![Token::In]);
2124        assert_eq!(lex("do"), vec![Token::Do]);
2125        assert_eq!(lex("done"), vec![Token::Done]);
2126        assert_eq!(lex("case"), vec![Token::Case]);
2127        assert_eq!(lex("esac"), vec![Token::Esac]);
2128        assert_eq!(lex("function"), vec![Token::Function]);
2129        assert_eq!(lex("true"), vec![Token::True]);
2130        assert_eq!(lex("false"), vec![Token::False]);
2131    }
2132
2133    #[test]
2134    fn double_semicolon() {
2135        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
2136        // In case pattern context
2137        assert_eq!(lex("echo \"hi\";;"), vec![
2138            Token::Ident("echo".to_string()),
2139            Token::String("hi".to_string()),
2140            Token::DoubleSemi,
2141        ]);
2142    }
2143
2144    #[test]
2145    fn type_keywords() {
2146        assert_eq!(lex("string"), vec![Token::TypeString]);
2147        assert_eq!(lex("int"), vec![Token::TypeInt]);
2148        assert_eq!(lex("float"), vec![Token::TypeFloat]);
2149        assert_eq!(lex("bool"), vec![Token::TypeBool]);
2150    }
2151
2152    // ═══════════════════════════════════════════════════════════════════
2153    // Operator tests
2154    // ═══════════════════════════════════════════════════════════════════
2155
2156    #[test]
2157    fn single_char_operators() {
2158        assert_eq!(lex("="), vec![Token::Eq]);
2159        assert_eq!(lex("|"), vec![Token::Pipe]);
2160        assert_eq!(lex("&"), vec![Token::Amp]);
2161        assert_eq!(lex(">"), vec![Token::Gt]);
2162        assert_eq!(lex("<"), vec![Token::Lt]);
2163        assert_eq!(lex(";"), vec![Token::Semi]);
2164        assert_eq!(lex(":"), vec![Token::Colon]);
2165        assert_eq!(lex(","), vec![Token::Comma]);
2166        assert_eq!(lex("."), vec![Token::Dot]);
2167    }
2168
2169    #[test]
2170    fn multi_char_operators() {
2171        assert_eq!(lex("&&"), vec![Token::And]);
2172        assert_eq!(lex("||"), vec![Token::Or]);
2173        assert_eq!(lex("=="), vec![Token::EqEq]);
2174        assert_eq!(lex("!="), vec![Token::NotEq]);
2175        assert_eq!(lex("=~"), vec![Token::Match]);
2176        assert_eq!(lex("!~"), vec![Token::NotMatch]);
2177        assert_eq!(lex(">="), vec![Token::GtEq]);
2178        assert_eq!(lex("<="), vec![Token::LtEq]);
2179        assert_eq!(lex(">>"), vec![Token::GtGt]);
2180        assert_eq!(lex("2>"), vec![Token::Stderr]);
2181        assert_eq!(lex("&>"), vec![Token::Both]);
2182    }
2183
2184    #[test]
2185    fn brackets() {
2186        assert_eq!(lex("{"), vec![Token::LBrace]);
2187        assert_eq!(lex("}"), vec![Token::RBrace]);
2188        assert_eq!(lex("["), vec![Token::LBracket]);
2189        assert_eq!(lex("]"), vec![Token::RBracket]);
2190        assert_eq!(lex("("), vec![Token::LParen]);
2191        assert_eq!(lex(")"), vec![Token::RParen]);
2192    }
2193
2194    // ═══════════════════════════════════════════════════════════════════
2195    // Literal tests
2196    // ═══════════════════════════════════════════════════════════════════
2197
2198    #[test]
2199    fn integers() {
2200        assert_eq!(lex("0"), vec![Token::Int(0)]);
2201        assert_eq!(lex("42"), vec![Token::Int(42)]);
2202        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
2203        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
2204    }
2205
2206    #[test]
2207    fn floats() {
2208        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
2209        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
2210        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
2211    }
2212
2213    #[test]
2214    fn strings() {
2215        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
2216        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
2217        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
2218        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
2219        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
2220    }
2221
2222    #[test]
2223    fn var_refs() {
2224        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
2225        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
2226        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
2227        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
2228    }
2229
2230    // ═══════════════════════════════════════════════════════════════════
2231    // Identifier tests
2232    // ═══════════════════════════════════════════════════════════════════
2233
2234    #[test]
2235    fn identifiers() {
2236        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
2237        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
2238        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2239        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2240        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2241    }
2242
2243    #[test]
2244    fn keyword_prefix_identifiers() {
2245        // Identifiers that start with keywords but aren't keywords
2246        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2247        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2248        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2249        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2250        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2251    }
2252
2253    // ═══════════════════════════════════════════════════════════════════
2254    // Statement tests
2255    // ═══════════════════════════════════════════════════════════════════
2256
2257    #[test]
2258    fn assignment() {
2259        assert_eq!(
2260            lex("set X = 5"),
2261            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2262        );
2263    }
2264
2265    #[test]
2266    fn command_simple() {
2267        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2268        assert_eq!(
2269            lex(r#"echo "hello""#),
2270            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2271        );
2272    }
2273
2274    #[test]
2275    fn command_with_args() {
2276        assert_eq!(
2277            lex("cmd arg1 arg2"),
2278            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2279        );
2280    }
2281
2282    #[test]
2283    fn command_with_named_args() {
2284        assert_eq!(
2285            lex("cmd key=value"),
2286            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2287        );
2288    }
2289
2290    #[test]
2291    fn pipeline() {
2292        assert_eq!(
2293            lex("a | b | c"),
2294            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2295        );
2296    }
2297
2298    #[test]
2299    fn if_statement() {
2300        assert_eq!(
2301            lex("if true; then echo; fi"),
2302            vec![
2303                Token::If,
2304                Token::True,
2305                Token::Semi,
2306                Token::Then,
2307                Token::Ident("echo".to_string()),
2308                Token::Semi,
2309                Token::Fi
2310            ]
2311        );
2312    }
2313
2314    #[test]
2315    fn for_loop() {
2316        assert_eq!(
2317            lex("for X in items; do echo; done"),
2318            vec![
2319                Token::For,
2320                Token::Ident("X".to_string()),
2321                Token::In,
2322                Token::Ident("items".to_string()),
2323                Token::Semi,
2324                Token::Do,
2325                Token::Ident("echo".to_string()),
2326                Token::Semi,
2327                Token::Done
2328            ]
2329        );
2330    }
2331
2332    // ═══════════════════════════════════════════════════════════════════
2333    // Whitespace and newlines
2334    // ═══════════════════════════════════════════════════════════════════
2335
2336    #[test]
2337    fn whitespace_ignored() {
2338        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
2339    }
2340
2341    #[test]
2342    fn newlines_preserved() {
2343        let tokens = lex("a\nb");
2344        assert_eq!(
2345            tokens,
2346            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2347        );
2348    }
2349
2350    #[test]
2351    fn multiple_newlines() {
2352        let tokens = lex("a\n\n\nb");
2353        assert_eq!(
2354            tokens,
2355            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2356        );
2357    }
2358
2359    // ═══════════════════════════════════════════════════════════════════
2360    // Comments
2361    // ═══════════════════════════════════════════════════════════════════
2362
2363    #[test]
2364    fn comments_skipped() {
2365        assert_eq!(lex("# comment"), vec![]);
2366        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2367        assert_eq!(
2368            lex("a # comment\nb"),
2369            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2370        );
2371    }
2372
2373    #[test]
2374    fn comments_preserved_when_requested() {
2375        let tokens = tokenize_with_comments("a # comment")
2376            .expect("should succeed")
2377            .into_iter()
2378            .map(|s| s.token)
2379            .collect::<Vec<_>>();
2380        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2381    }
2382
2383    // ═══════════════════════════════════════════════════════════════════
2384    // String parsing
2385    // ═══════════════════════════════════════════════════════════════════
2386
2387    #[test]
2388    fn parse_simple_string() {
2389        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2390    }
2391
2392    #[test]
2393    fn parse_string_with_escapes() {
2394        assert_eq!(
2395            parse_string_literal(r#""hello\nworld""#).expect("ok"),
2396            "hello\nworld"
2397        );
2398        assert_eq!(
2399            parse_string_literal(r#""tab\there""#).expect("ok"),
2400            "tab\there"
2401        );
2402        assert_eq!(
2403            parse_string_literal(r#""quote\"here""#).expect("ok"),
2404            "quote\"here"
2405        );
2406    }
2407
2408    #[test]
2409    fn parse_string_with_unicode() {
2410        assert_eq!(
2411            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2412            "emoji ❤"
2413        );
2414    }
2415
2416    #[test]
2417    fn parse_string_with_escaped_dollar() {
2418        // \$ produces a marker that parse_interpolated_string will convert to $
2419        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
2420        assert_eq!(
2421            parse_string_literal(r#""\$VAR""#).expect("ok"),
2422            "__KAISH_ESCAPED_DOLLAR__VAR"
2423        );
2424        assert_eq!(
2425            parse_string_literal(r#""cost: \$100""#).expect("ok"),
2426            "cost: __KAISH_ESCAPED_DOLLAR__100"
2427        );
2428    }
2429
2430    // ═══════════════════════════════════════════════════════════════════
2431    // Variable reference parsing
2432    // ═══════════════════════════════════════════════════════════════════
2433
2434    #[test]
2435    fn parse_simple_var() {
2436        assert_eq!(
2437            parse_var_ref("${X}").expect("ok"),
2438            vec!["X"]
2439        );
2440    }
2441
2442    #[test]
2443    fn parse_var_with_field() {
2444        assert_eq!(
2445            parse_var_ref("${VAR.field}").expect("ok"),
2446            vec!["VAR", "field"]
2447        );
2448    }
2449
2450    #[test]
2451    fn parse_var_with_index() {
2452        assert_eq!(
2453            parse_var_ref("${VAR[0]}").expect("ok"),
2454            vec!["VAR", "[0]"]
2455        );
2456    }
2457
2458    #[test]
2459    fn parse_var_nested() {
2460        assert_eq!(
2461            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2462            vec!["VAR", "field", "[0]", "nested"]
2463        );
2464    }
2465
2466    #[test]
2467    fn parse_last_result() {
2468        assert_eq!(
2469            parse_var_ref("${?}").expect("ok"),
2470            vec!["?"]
2471        );
2472    }
2473
2474    // ═══════════════════════════════════════════════════════════════════
2475    // Number parsing
2476    // ═══════════════════════════════════════════════════════════════════
2477
2478    #[test]
2479    fn parse_integers() {
2480        assert_eq!(parse_int("0").expect("ok"), 0);
2481        assert_eq!(parse_int("42").expect("ok"), 42);
2482        assert_eq!(parse_int("-1").expect("ok"), -1);
2483    }
2484
2485    #[test]
2486    fn parse_floats() {
2487        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2488        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2489    }
2490
2491    // ═══════════════════════════════════════════════════════════════════
2492    // Edge cases and errors
2493    // ═══════════════════════════════════════════════════════════════════
2494
2495    #[test]
2496    fn empty_input() {
2497        assert_eq!(lex(""), vec![]);
2498    }
2499
2500    #[test]
2501    fn only_whitespace() {
2502        assert_eq!(lex("   \t\t   "), vec![]);
2503    }
2504
2505    #[test]
2506    fn json_array() {
2507        assert_eq!(
2508            lex(r#"[1, 2, 3]"#),
2509            vec![
2510                Token::LBracket,
2511                Token::Int(1),
2512                Token::Comma,
2513                Token::Int(2),
2514                Token::Comma,
2515                Token::Int(3),
2516                Token::RBracket
2517            ]
2518        );
2519    }
2520
2521    #[test]
2522    fn json_object() {
2523        assert_eq!(
2524            lex(r#"{"key": "value"}"#),
2525            vec![
2526                Token::LBrace,
2527                Token::String("key".to_string()),
2528                Token::Colon,
2529                Token::String("value".to_string()),
2530                Token::RBrace
2531            ]
2532        );
2533    }
2534
2535    #[test]
2536    fn redirect_operators() {
2537        assert_eq!(
2538            lex("cmd > file"),
2539            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2540        );
2541        assert_eq!(
2542            lex("cmd >> file"),
2543            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2544        );
2545        assert_eq!(
2546            lex("cmd 2> err"),
2547            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2548        );
2549        assert_eq!(
2550            lex("cmd &> all"),
2551            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2552        );
2553    }
2554
2555    #[test]
2556    fn background_job() {
2557        assert_eq!(
2558            lex("cmd &"),
2559            vec![Token::Ident("cmd".to_string()), Token::Amp]
2560        );
2561    }
2562
2563    #[test]
2564    fn command_substitution() {
2565        assert_eq!(
2566            lex("$(cmd)"),
2567            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2568        );
2569        assert_eq!(
2570            lex("$(cmd arg)"),
2571            vec![
2572                Token::CmdSubstStart,
2573                Token::Ident("cmd".to_string()),
2574                Token::Ident("arg".to_string()),
2575                Token::RParen
2576            ]
2577        );
2578        assert_eq!(
2579            lex("$(a | b)"),
2580            vec![
2581                Token::CmdSubstStart,
2582                Token::Ident("a".to_string()),
2583                Token::Pipe,
2584                Token::Ident("b".to_string()),
2585                Token::RParen
2586            ]
2587        );
2588    }
2589
2590    #[test]
2591    fn complex_pipeline() {
2592        assert_eq!(
2593            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2594            vec![
2595                Token::Ident("cat".to_string()),
2596                Token::Ident("file".to_string()),
2597                Token::Pipe,
2598                Token::Ident("grep".to_string()),
2599                Token::Ident("pattern".to_string()),
2600                Token::Eq,
2601                Token::String("foo".to_string()),
2602                Token::Pipe,
2603                Token::Ident("head".to_string()),
2604                Token::Ident("count".to_string()),
2605                Token::Eq,
2606                Token::Int(10),
2607            ]
2608        );
2609    }
2610
2611    // ═══════════════════════════════════════════════════════════════════
2612    // Flag tests
2613    // ═══════════════════════════════════════════════════════════════════
2614
2615    #[test]
2616    fn short_flag() {
2617        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2618        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2619        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2620    }
2621
2622    #[test]
2623    fn short_flag_combined() {
2624        // Combined short flags like -la
2625        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2626        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2627    }
2628
2629    #[test]
2630    fn long_flag() {
2631        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2632        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2633        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2634    }
2635
2636    #[test]
2637    fn double_dash() {
2638        // -- alone marks end of flags
2639        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2640    }
2641
2642    #[test]
2643    fn flags_vs_negative_numbers() {
2644        // -123 should be a negative integer, not a flag
2645        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2646        // -l should be a flag
2647        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2648        // -1a is ambiguous - should be Int(-1) then Ident(a)
2649        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2650        assert_eq!(
2651            lex("-1 a"),
2652            vec![Token::Int(-1), Token::Ident("a".to_string())]
2653        );
2654    }
2655
2656    #[test]
2657    fn command_with_flags() {
2658        assert_eq!(
2659            lex("ls -l"),
2660            vec![
2661                Token::Ident("ls".to_string()),
2662                Token::ShortFlag("l".to_string()),
2663            ]
2664        );
2665        assert_eq!(
2666            lex("git commit -m"),
2667            vec![
2668                Token::Ident("git".to_string()),
2669                Token::Ident("commit".to_string()),
2670                Token::ShortFlag("m".to_string()),
2671            ]
2672        );
2673        assert_eq!(
2674            lex("git push --force"),
2675            vec![
2676                Token::Ident("git".to_string()),
2677                Token::Ident("push".to_string()),
2678                Token::LongFlag("force".to_string()),
2679            ]
2680        );
2681    }
2682
2683    #[test]
2684    fn flag_with_value() {
2685        assert_eq!(
2686            lex(r#"git commit -m "message""#),
2687            vec![
2688                Token::Ident("git".to_string()),
2689                Token::Ident("commit".to_string()),
2690                Token::ShortFlag("m".to_string()),
2691                Token::String("message".to_string()),
2692            ]
2693        );
2694        assert_eq!(
2695            lex(r#"--message="hello""#),
2696            vec![
2697                Token::LongFlag("message".to_string()),
2698                Token::Eq,
2699                Token::String("hello".to_string()),
2700            ]
2701        );
2702    }
2703
2704    #[test]
2705    fn end_of_flags_marker() {
2706        assert_eq!(
2707            lex("git checkout -- file"),
2708            vec![
2709                Token::Ident("git".to_string()),
2710                Token::Ident("checkout".to_string()),
2711                Token::DoubleDash,
2712                Token::Ident("file".to_string()),
2713            ]
2714        );
2715    }
2716
2717    // ═══════════════════════════════════════════════════════════════════
2718    // Bash compatibility tokens
2719    // ═══════════════════════════════════════════════════════════════════
2720
2721    #[test]
2722    fn local_keyword() {
2723        assert_eq!(lex("local"), vec![Token::Local]);
2724        assert_eq!(
2725            lex("local X = 5"),
2726            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2727        );
2728    }
2729
2730    #[test]
2731    fn simple_var_ref() {
2732        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2733        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2734        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2735        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2736    }
2737
2738    #[test]
2739    fn simple_var_ref_in_command() {
2740        assert_eq!(
2741            lex("echo $NAME"),
2742            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2743        );
2744    }
2745
2746    #[test]
2747    fn single_quoted_strings() {
2748        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2749        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2750        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2751        // Single quotes don't process escapes or variables
2752        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2753        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2754    }
2755
2756    #[test]
2757    fn test_brackets() {
2758        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2759        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2760        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2761        assert_eq!(
2762            lex("[[ -f file ]]"),
2763            vec![
2764                Token::LBracket,
2765                Token::LBracket,
2766                Token::ShortFlag("f".to_string()),
2767                Token::Ident("file".to_string()),
2768                Token::RBracket,
2769                Token::RBracket
2770            ]
2771        );
2772    }
2773
2774    #[test]
2775    fn test_expression_syntax() {
2776        assert_eq!(
2777            lex(r#"[[ $X == "value" ]]"#),
2778            vec![
2779                Token::LBracket,
2780                Token::LBracket,
2781                Token::SimpleVarRef("X".to_string()),
2782                Token::EqEq,
2783                Token::String("value".to_string()),
2784                Token::RBracket,
2785                Token::RBracket
2786            ]
2787        );
2788    }
2789
2790    #[test]
2791    fn bash_style_assignment() {
2792        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2793        assert_eq!(
2794            lex(r#"NAME="value""#),
2795            vec![
2796                Token::Ident("NAME".to_string()),
2797                Token::Eq,
2798                Token::String("value".to_string())
2799            ]
2800        );
2801    }
2802
2803    #[test]
2804    fn positional_params() {
2805        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2806        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2807        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2808        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2809        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2810    }
2811
2812    #[test]
2813    fn positional_in_context() {
2814        assert_eq!(
2815            lex("echo $1 $2"),
2816            vec![
2817                Token::Ident("echo".to_string()),
2818                Token::Positional(1),
2819                Token::Positional(2),
2820            ]
2821        );
2822    }
2823
2824    #[test]
2825    fn var_length() {
2826        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2827        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2828        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2829    }
2830
2831    #[test]
2832    fn var_length_in_context() {
2833        assert_eq!(
2834            lex("echo ${#NAME}"),
2835            vec![
2836                Token::Ident("echo".to_string()),
2837                Token::VarLength("NAME".to_string()),
2838            ]
2839        );
2840    }
2841
2842    // ═══════════════════════════════════════════════════════════════════
2843    // Edge case tests: Flag ambiguities
2844    // ═══════════════════════════════════════════════════════════════════
2845
2846    #[test]
2847    fn plus_flag() {
2848        // Plus flags for set +e
2849        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2850        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2851        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2852    }
2853
2854    #[test]
2855    fn set_with_plus_flag() {
2856        assert_eq!(
2857            lex("set +e"),
2858            vec![
2859                Token::Set,
2860                Token::PlusFlag("e".to_string()),
2861            ]
2862        );
2863    }
2864
2865    #[test]
2866    fn set_with_multiple_flags() {
2867        assert_eq!(
2868            lex("set -e -u"),
2869            vec![
2870                Token::Set,
2871                Token::ShortFlag("e".to_string()),
2872                Token::ShortFlag("u".to_string()),
2873            ]
2874        );
2875    }
2876
2877    #[test]
2878    fn flags_vs_negative_numbers_edge_cases() {
2879        // -1a should be negative int followed by ident
2880        assert_eq!(
2881            lex("-1 a"),
2882            vec![Token::Int(-1), Token::Ident("a".to_string())]
2883        );
2884        // -l is a flag
2885        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2886        // -123 is negative number
2887        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2888    }
2889
2890    #[test]
2891    fn single_dash_is_minus_alone() {
2892        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2893        let result = tokenize("-").expect("should lex");
2894        assert_eq!(result.len(), 1);
2895        assert!(matches!(result[0].token, Token::MinusAlone));
2896    }
2897
2898    #[test]
2899    fn plus_bare_for_date_format() {
2900        // `date +%s` - the +%s should be PlusBare
2901        let result = tokenize("+%s").expect("should lex");
2902        assert_eq!(result.len(), 1);
2903        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2904
2905        // `date +%Y-%m-%d` - format string with dashes
2906        let result = tokenize("+%Y-%m-%d").expect("should lex");
2907        assert_eq!(result.len(), 1);
2908        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2909    }
2910
2911    #[test]
2912    fn plus_flag_still_works() {
2913        // `set +e` - should still be PlusFlag
2914        let result = tokenize("+e").expect("should lex");
2915        assert_eq!(result.len(), 1);
2916        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2917    }
2918
2919    #[test]
2920    fn while_keyword_vs_while_loop() {
2921        // 'while' as keyword in loop context
2922        assert_eq!(lex("while"), vec![Token::While]);
2923        // 'while' at start followed by condition
2924        assert_eq!(
2925            lex("while true"),
2926            vec![Token::While, Token::True]
2927        );
2928    }
2929
2930    #[test]
2931    fn control_flow_keywords() {
2932        assert_eq!(lex("break"), vec![Token::Break]);
2933        assert_eq!(lex("continue"), vec![Token::Continue]);
2934        assert_eq!(lex("return"), vec![Token::Return]);
2935        assert_eq!(lex("exit"), vec![Token::Exit]);
2936    }
2937
2938    #[test]
2939    fn control_flow_with_numbers() {
2940        assert_eq!(
2941            lex("break 2"),
2942            vec![Token::Break, Token::Int(2)]
2943        );
2944        assert_eq!(
2945            lex("continue 3"),
2946            vec![Token::Continue, Token::Int(3)]
2947        );
2948        assert_eq!(
2949            lex("exit 1"),
2950            vec![Token::Exit, Token::Int(1)]
2951        );
2952    }
2953
2954    // ═══════════════════════════════════════════════════════════════════
2955    // Here-doc tests
2956    // ═══════════════════════════════════════════════════════════════════
2957
2958    #[test]
2959    fn heredoc_simple() {
2960        let source = "cat <<EOF\nhello\nworld\nEOF";
2961        let tokens = lex(source);
2962        // body_start_offset = byte offset of 'h' in "hello", i.e. just after "cat <<EOF\n"
2963        assert_eq!(tokens, vec![
2964            Token::Ident("cat".to_string()),
2965            Token::HereDocStart,
2966            Token::HereDoc(HereDocData {
2967                content: "hello\nworld\n".to_string(),
2968                literal: false,
2969                strip_tabs: false,
2970                body_start_offset: 10,
2971            }),
2972            Token::Newline,
2973        ]);
2974    }
2975
2976    #[test]
2977    fn heredoc_empty() {
2978        let source = "cat <<EOF\nEOF";
2979        let tokens = lex(source);
2980        assert_eq!(tokens, vec![
2981            Token::Ident("cat".to_string()),
2982            Token::HereDocStart,
2983            Token::HereDoc(HereDocData {
2984                content: "".to_string(),
2985                literal: false,
2986                strip_tabs: false,
2987                body_start_offset: 10,
2988            }),
2989            Token::Newline,
2990        ]);
2991    }
2992
2993    #[test]
2994    fn heredoc_with_special_chars() {
2995        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2996        let tokens = lex(source);
2997        assert_eq!(tokens, vec![
2998            Token::Ident("cat".to_string()),
2999            Token::HereDocStart,
3000            Token::HereDoc(HereDocData {
3001                content: "$VAR and \"quoted\" 'single'\n".to_string(),
3002                literal: false,
3003                strip_tabs: false,
3004                body_start_offset: 10,
3005            }),
3006            Token::Newline,
3007        ]);
3008    }
3009
3010    #[test]
3011    fn heredoc_multiline() {
3012        let source = "cat <<END\nline1\nline2\nline3\nEND";
3013        let tokens = lex(source);
3014        assert_eq!(tokens, vec![
3015            Token::Ident("cat".to_string()),
3016            Token::HereDocStart,
3017            Token::HereDoc(HereDocData {
3018                content: "line1\nline2\nline3\n".to_string(),
3019                literal: false,
3020                strip_tabs: false,
3021                body_start_offset: 10,
3022            }),
3023            Token::Newline,
3024        ]);
3025    }
3026
3027    #[test]
3028    fn heredoc_in_command() {
3029        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
3030        let tokens = lex(source);
3031        assert_eq!(tokens, vec![
3032            Token::Ident("cat".to_string()),
3033            Token::HereDocStart,
3034            Token::HereDoc(HereDocData {
3035                content: "hello\n".to_string(),
3036                literal: false,
3037                strip_tabs: false,
3038                body_start_offset: 10,
3039            }),
3040            Token::Newline,
3041            Token::Ident("echo".to_string()),
3042            Token::Ident("goodbye".to_string()),
3043        ]);
3044    }
3045
3046    #[test]
3047    fn heredoc_strip_tabs() {
3048        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
3049        let tokens = lex(source);
3050        // Content keeps tabs verbatim — strip_tabs is recorded on the token so
3051        // the interpreter can apply POSIX leading-tab stripping at materialization
3052        // without disturbing source byte offsets used for span tracking.
3053        assert_eq!(tokens, vec![
3054            Token::Ident("cat".to_string()),
3055            Token::HereDocStart,
3056            Token::HereDoc(HereDocData {
3057                content: "\thello\n\tworld\n".to_string(),
3058                literal: false,
3059                strip_tabs: true,
3060                body_start_offset: 11,
3061            }),
3062            Token::Newline,
3063        ]);
3064    }
3065
3066    // ═══════════════════════════════════════════════════════════════════
3067    // Arithmetic expression tests
3068    // ═══════════════════════════════════════════════════════════════════
3069
3070    #[test]
3071    fn arithmetic_simple() {
3072        let source = "$((1 + 2))";
3073        let tokens = lex(source);
3074        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
3075    }
3076
3077    #[test]
3078    fn arithmetic_in_assignment() {
3079        let source = "X=$((5 * 3))";
3080        let tokens = lex(source);
3081        assert_eq!(tokens, vec![
3082            Token::Ident("X".to_string()),
3083            Token::Eq,
3084            Token::Arithmetic("5 * 3".to_string()),
3085        ]);
3086    }
3087
3088    #[test]
3089    fn arithmetic_with_nested_parens() {
3090        let source = "$((2 * (3 + 4)))";
3091        let tokens = lex(source);
3092        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
3093    }
3094
3095    #[test]
3096    fn arithmetic_with_variable() {
3097        let source = "$((X + 1))";
3098        let tokens = lex(source);
3099        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
3100    }
3101
3102    #[test]
3103    fn arithmetic_command_subst_not_confused() {
3104        // $( should not be treated as arithmetic
3105        let source = "$(echo hello)";
3106        let tokens = lex(source);
3107        assert_eq!(tokens, vec![
3108            Token::CmdSubstStart,
3109            Token::Ident("echo".to_string()),
3110            Token::Ident("hello".to_string()),
3111            Token::RParen,
3112        ]);
3113    }
3114
3115    #[test]
3116    fn arithmetic_nesting_limit() {
3117        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
3118        let open_parens = "(".repeat(300);
3119        let close_parens = ")".repeat(300);
3120        let source = format!("$(({}1{}))", open_parens, close_parens);
3121        let result = tokenize(&source);
3122        assert!(result.is_err());
3123        let errors = result.unwrap_err();
3124        assert_eq!(errors.len(), 1);
3125        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
3126    }
3127
3128    #[test]
3129    fn arithmetic_nesting_within_limit() {
3130        // Nesting within limit should work
3131        let source = "$((((1 + 2) * 3)))";
3132        let tokens = lex(source);
3133        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
3134    }
3135
3136    // ═══════════════════════════════════════════════════════════════════
3137    // Arithmetic preprocessor + comment interaction
3138    //
3139    // The preprocessor used to walk raw characters tracking only quote
3140    // state. An apostrophe inside a `#` comment would open single-quote
3141    // mode and swallow real `$((..))` later in the file; `$((..))` *inside*
3142    // a comment would itself be preprocessed into a marker, misplacing
3143    // tokens. Surfaced from kaijutsu's seed scripts (see gotcha memory
3144    // `gotcha-kaish-comment-arithmetic`).
3145    // ═══════════════════════════════════════════════════════════════════
3146
3147    #[test]
3148    fn arithmetic_after_apostrophe_in_comment() {
3149        // The bare apostrophe in "doesn't" used to open single-quote mode
3150        // in the preprocessor and swallow the $((..)) below.
3151        let source = "# this doesn't work\necho $((1+2))";
3152        let tokens = lex(source);
3153        assert_eq!(tokens, vec![
3154            Token::Newline,
3155            Token::Ident("echo".to_string()),
3156            Token::Arithmetic("1+2".to_string()),
3157        ]);
3158    }
3159
3160    #[test]
3161    fn arithmetic_inside_comment_is_not_expanded() {
3162        // `$((y))` inside a `#` comment must stay comment text.
3163        let source = "# the $((y)) syntax explained\necho hello";
3164        let tokens = lex(source);
3165        assert_eq!(tokens, vec![
3166            Token::Newline,
3167            Token::Ident("echo".to_string()),
3168            Token::Ident("hello".to_string()),
3169        ]);
3170    }
3171
3172    #[test]
3173    fn backticked_arithmetic_in_comment_is_not_expanded() {
3174        // The original kaijutsu repro: `$((x))` inside a comment.
3175        // Backticks-in-comments used to leak the inner $((..)) to the
3176        // preprocessor; with comment-skip they stay inert.
3177        let source = "# the `$((x))` syntax explained\necho $((3+4))";
3178        let tokens = lex(source);
3179        assert_eq!(tokens, vec![
3180            Token::Newline,
3181            Token::Ident("echo".to_string()),
3182            Token::Arithmetic("3+4".to_string()),
3183        ]);
3184    }
3185
3186    #[test]
3187    fn arithmetic_still_works_outside_comments() {
3188        // Regression guard: comment-skip must not shrink the arithmetic
3189        // preprocessor's scope on normal `$((..))` usages.
3190        let source = "X=$((1+2)); Y=$((3*4))";
3191        let tokens = lex(source);
3192        assert_eq!(tokens, vec![
3193            Token::Ident("X".to_string()),
3194            Token::Eq,
3195            Token::Arithmetic("1+2".to_string()),
3196            Token::Semi,
3197            Token::Ident("Y".to_string()),
3198            Token::Eq,
3199            Token::Arithmetic("3*4".to_string()),
3200        ]);
3201    }
3202
3203    #[test]
3204    fn arithmetic_inside_double_quotes_still_expands() {
3205        // `#` inside a double-quoted string is a literal character, not a
3206        // comment introducer — arithmetic must still expand around it.
3207        let source = "echo \"# $((1+2))\"";
3208        let tokens = lex(source);
3209        // The string token contains the `#` and the arithmetic marker;
3210        // the exact post-processing happens at interpret time. What we
3211        // assert here is that lexing succeeds and produces a String token
3212        // (i.e. the comment skip didn't trigger inside the string).
3213        assert_eq!(tokens.len(), 2);
3214        assert!(matches!(tokens[0], Token::Ident(_)));
3215        assert!(matches!(tokens[1], Token::String(_)));
3216    }
3217
3218    // ═══════════════════════════════════════════════════════════════════
3219    // Backtick rejection
3220    //
3221    // Backticks are an explicitly dropped feature (see CLAUDE.md,
3222    // docs/LANGUAGE.md, help/limits.md, help/overview.md). We surface a
3223    // dedicated error rather than the generic `UnexpectedCharacter` so
3224    // users get a hint to use `$(cmd)`. Comments, single-quoted strings,
3225    // double-quoted strings, and heredoc bodies are all matched as single
3226    // tokens (or extracted before logos runs), so the rejection only
3227    // fires on bare backticks in source code.
3228    // ═══════════════════════════════════════════════════════════════════
3229
3230    #[test]
3231    fn backtick_in_source_is_rejected() {
3232        let result = tokenize("echo `date`");
3233        assert!(result.is_err());
3234        let errors = result.unwrap_err();
3235        assert!(errors.iter().any(|e| e.token == LexerError::BackticksNotSupported));
3236    }
3237
3238    #[test]
3239    fn backtick_in_comment_is_just_comment_text() {
3240        // Backticks are only rejected when they reach the top-level
3241        // lexer. Inside a comment they're part of the comment body.
3242        let source = "# use `date` here\necho hi";
3243        let tokens = lex(source);
3244        assert_eq!(tokens, vec![
3245            Token::Newline,
3246            Token::Ident("echo".to_string()),
3247            Token::Ident("hi".to_string()),
3248        ]);
3249    }
3250
3251    #[test]
3252    fn backtick_in_single_quoted_string_is_literal() {
3253        // Single-quoted strings are matched as one token by logos; the
3254        // backticks inside never reach the rejecting matcher.
3255        let source = "echo '`date`'";
3256        let tokens = lex(source);
3257        assert_eq!(tokens, vec![
3258            Token::Ident("echo".to_string()),
3259            Token::SingleString("`date`".to_string()),
3260        ]);
3261    }
3262
3263    #[test]
3264    fn backtick_in_double_quoted_string_is_literal() {
3265        // Kaish does not activate command substitution from backticks
3266        // inside double-quoted strings either — clear divergence from
3267        // POSIX but matches the "backticks don't exist" stance. The
3268        // double-quoted string token absorbs them as literal characters.
3269        let source = "echo \"`date`\"";
3270        let tokens = lex(source);
3271        assert_eq!(tokens.len(), 2);
3272        assert!(matches!(tokens[0], Token::Ident(_)));
3273        match &tokens[1] {
3274            Token::String(s) => assert!(s.contains('`')),
3275            other => panic!("expected Token::String, got {:?}", other),
3276        }
3277    }
3278
3279    #[test]
3280    fn backtick_in_heredoc_body_is_preserved() {
3281        // Heredoc bodies are extracted by preprocess_heredocs before
3282        // logos runs, so backticks inside them survive as content.
3283        let source = "cat <<EOF\n`date`\nEOF\n";
3284        let tokens = lex(source);
3285        let heredoc = tokens.iter().find(|t| matches!(t, Token::HereDoc(_)));
3286        assert!(heredoc.is_some(), "expected a HereDoc token");
3287        if let Some(Token::HereDoc(d)) = heredoc {
3288            assert!(d.content.contains('`'));
3289        }
3290    }
3291
3292    // ═══════════════════════════════════════════════════════════════════
3293    // Token category tests
3294    // ═══════════════════════════════════════════════════════════════════
3295
3296    #[test]
3297    fn token_categories() {
3298        // Keywords
3299        assert_eq!(Token::If.category(), TokenCategory::Keyword);
3300        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
3301        assert_eq!(Token::For.category(), TokenCategory::Keyword);
3302        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
3303        assert_eq!(Token::True.category(), TokenCategory::Keyword);
3304        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
3305
3306        // Operators
3307        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
3308        assert_eq!(Token::And.category(), TokenCategory::Operator);
3309        assert_eq!(Token::Or.category(), TokenCategory::Operator);
3310        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
3311        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
3312
3313        // Strings
3314        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
3315        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
3316        assert_eq!(
3317            Token::HereDoc(HereDocData {
3318                content: "test".to_string(),
3319                literal: false,
3320                strip_tabs: false,
3321                body_start_offset: 0,
3322            }).category(),
3323            TokenCategory::String,
3324        );
3325
3326        // Numbers
3327        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
3328        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
3329        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
3330
3331        // Variables
3332        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
3333        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
3334        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
3335        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
3336        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
3337        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
3338        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
3339
3340        // Flags
3341        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
3342        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
3343        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
3344        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
3345
3346        // Punctuation
3347        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
3348        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
3349        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
3350        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
3351
3352        // Comments
3353        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
3354
3355        // Paths
3356        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
3357
3358        // Commands
3359        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
3360        assert_eq!(Token::NumberIdent("019dda1c".to_string()).category(), TokenCategory::Command);
3361        assert_eq!(Token::DottedIdent(".gitignore".to_string()).category(), TokenCategory::Command);
3362
3363        // Errors
3364        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
3365        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
3366    }
3367
3368    #[test]
3369    fn test_heredoc_piped_to_command() {
3370        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
3371        // Not: cat | jq <<heredoc
3372        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
3373        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
3374        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
3375        assert!(heredoc_pos.is_some(), "should have a heredoc token");
3376        assert!(pipe_pos.is_some(), "should have a pipe token");
3377        assert!(
3378            pipe_pos.unwrap() > heredoc_pos.unwrap(),
3379            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
3380            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
3381        );
3382    }
3383
3384    #[test]
3385    fn test_heredoc_standalone_still_works() {
3386        // Regression: standalone heredoc (no pipe) must still work
3387        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
3388        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
3389        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
3390    }
3391
3392    #[test]
3393    fn test_heredoc_preserves_leading_empty_lines() {
3394        // Bug B: heredoc starting with a blank line must preserve it
3395        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
3396        let heredoc = tokens.iter().find_map(|t| {
3397            if let Token::HereDoc(data) = &t.token {
3398                Some(data.clone())
3399            } else {
3400                None
3401            }
3402        });
3403        assert!(heredoc.is_some(), "should have a heredoc token");
3404        let data = heredoc.unwrap();
3405        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
3406        assert_eq!(data.content, "\nhello\n");
3407    }
3408
3409    #[test]
3410    fn test_heredoc_quoted_delimiter_sets_literal() {
3411        // Bug N: quoted delimiter (<<'EOF') should set literal=true
3412        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
3413        let heredoc = tokens.iter().find_map(|t| {
3414            if let Token::HereDoc(data) = &t.token {
3415                Some(data.clone())
3416            } else {
3417                None
3418            }
3419        });
3420        assert!(heredoc.is_some(), "should have a heredoc token");
3421        let data = heredoc.unwrap();
3422        assert!(data.literal, "quoted delimiter should set literal=true");
3423        assert_eq!(data.content, "hello $HOME\n");
3424    }
3425
3426    #[test]
3427    fn test_heredoc_unquoted_delimiter_not_literal() {
3428        // Bug N: unquoted delimiter (<<EOF) should have literal=false
3429        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
3430        let heredoc = tokens.iter().find_map(|t| {
3431            if let Token::HereDoc(data) = &t.token {
3432                Some(data.clone())
3433            } else {
3434                None
3435            }
3436        });
3437        assert!(heredoc.is_some(), "should have a heredoc token");
3438        let data = heredoc.unwrap();
3439        assert!(!data.literal, "unquoted delimiter should have literal=false");
3440    }
3441
3442    // ═══════════════════════════════════════════════════════════════════
3443    // Colon merge tests
3444    // ═══════════════════════════════════════════════════════════════════
3445
3446    #[test]
3447    fn colon_double_in_word() {
3448        assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
3449    }
3450
3451    #[test]
3452    fn colon_single_in_word() {
3453        assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
3454    }
3455
3456    #[test]
3457    fn colon_with_port() {
3458        assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
3459    }
3460
3461    #[test]
3462    fn colon_standalone() {
3463        assert_eq!(lex(":"), vec![Token::Colon]);
3464    }
3465
3466    #[test]
3467    fn colon_spaced_no_merge() {
3468        assert_eq!(
3469            lex("foo : bar"),
3470            vec![
3471                Token::Ident("foo".into()),
3472                Token::Colon,
3473                Token::Ident("bar".into()),
3474            ]
3475        );
3476    }
3477
3478    #[test]
3479    fn colon_in_command_arg() {
3480        assert_eq!(
3481            lex("echo foo::bar"),
3482            vec![
3483                Token::Ident("echo".into()),
3484                Token::Ident("foo::bar".into()),
3485            ]
3486        );
3487    }
3488
3489    #[test]
3490    fn colon_trailing() {
3491        // Trailing colon merges with preceding ident
3492        assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
3493    }
3494
3495    #[test]
3496    fn colon_leading() {
3497        // Leading colon merges with following ident
3498        assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
3499    }
3500
3501    #[test]
3502    fn colon_with_path() {
3503        // Path token + colon + int
3504        assert_eq!(
3505            lex("/usr/bin:8080"),
3506            vec![Token::Ident("/usr/bin:8080".into())]
3507        );
3508    }
3509
3510    // ═══════════════════════════════════════════════════════════════════
3511    // Token predicate coverage (is_keyword / starts_statement)
3512    // ═══════════════════════════════════════════════════════════════════
3513
3514    #[test]
3515    fn is_keyword_covers_control_flow() {
3516        for t in [
3517            Token::While,
3518            Token::Return,
3519            Token::Break,
3520            Token::Continue,
3521            Token::Exit,
3522        ] {
3523            assert!(t.is_keyword(), "{t:?} should be a keyword");
3524        }
3525    }
3526
3527    #[test]
3528    fn starts_statement_covers_while() {
3529        assert!(Token::While.starts_statement());
3530    }
3531
3532    #[test]
3533    fn is_keyword_rejects_operators() {
3534        for t in [Token::Pipe, Token::Amp, Token::Eq, Token::LBrace] {
3535            assert!(!t.is_keyword(), "{t:?} should not be a keyword");
3536        }
3537    }
3538}
kaish_kernel/lexer.rs

kaish_kernel/
lexer.rs