kaish_kernel/
lexer.rs

1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
7//! # Token Categories
8//!
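//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`,
//!   `do`, `done`, `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`
//! - **Type keywords**: `string`, `int`, `float`, `bool`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `==`, `!=`, `=~`, `!~`, `|`, `&`, `>`, `>=`, `>>`, `<`, `<=`, `<<`,
//!   `<<<`, `2>`, `&>`, `2>&1`, `1>&2`, `>&2`, `&&`, `||`
//! - **Punctuation**: `;`, `;;`, `:`, `,`, `.`, `..`, `{`, `}`, `[`, `]`, `(`, `)`
//! - **Variable references**: `$NAME`, `${...}` with nested path access, and the specials
//!   `$0`–`$9`, `$@`, `$#`, `$?`, `$$`, `${#VAR}`
//! - **Flags**: `--long`, `-s`, `+x`, and the bare `--` end-of-flags marker
//! - **Paths**: absolute (`/tmp/out`), relative (`src/kaish`, `../foo`, `./foo`), tilde (`~/foo`)
//! - **Identifiers**: command names, variable names, parameter names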
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
///
/// `unique_marker_id` bumps it with `fetch_add(1, Ordering::Relaxed)`, so every
/// call made within one process observes a distinct counter value.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
///
/// The value is also interpolated into the `LexerError::NestingTooDeep`
/// message, so the reported limit always matches this constant.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
///
/// `correct_span` consumes a slice of these records: for each one it applies
/// the difference `original_len - marker_len` to span offsets that lie past
/// `preprocessed_pos`. All positions and lengths are byte counts.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    #[cfg(target_os = "wasi")]
82    let pid = 0u32;
83    #[cfg(not(target_os = "wasi"))]
84    let pid = std::process::id();
85    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
86}
87
88/// A token with its span in the source text.
89#[derive(Debug, Clone, PartialEq)]
90pub struct Spanned<T> {
91    pub token: T,
92    pub span: Span,
93}
94
95impl<T> Spanned<T> {
96    pub fn new(token: T, span: Span) -> Self {
97        Self { token, span }
98    }
99}
100
/// Lexer error types.
///
/// Registered as the logos error type via `#[logos(error = LexerError)]` on
/// `Token`. The user-facing wording for each variant lives in the `Display`
/// impl.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Input that matched no token pattern. This is the `Default` value.
    // NOTE(review): presumably logos produces this via `Default` for
    // unmatched input — confirm against the logos documentation.
    #[default]
    UnexpectedCharacter,
    /// A string literal that was never closed.
    UnterminatedString,
    /// A variable reference that was never closed.
    UnterminatedVarRef,
    /// An escape sequence that is not recognized.
    InvalidEscape,
    /// A numeric literal that failed to parse (returned by `lex_int` and
    /// `lex_float` when `str::parse` fails).
    InvalidNumber,
    /// A non-lowercase spelling of `true`/`false` (e.g. `TRUE`, `False`);
    /// carries the text as written. Returned by `lex_ident`.
    AmbiguousBoolean(String),
    /// `yes`/`no` in any letter case; carries the text as written.
    /// Returned by `lex_ident`.
    AmbiguousBooleanLike(String),
    /// A float written without a leading digit (like `.5`).
    InvalidFloatNoLeading,
    /// A float written without a trailing digit (like `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
    /// Heredoc body ended without seeing the closing delimiter on its own line.
    /// The user almost certainly meant to type the delimiter — silently using
    /// whatever was collected up to EOF would mask missing data.
    UnterminatedHeredoc { delimiter: String },
    /// Backtick command substitution. Kaish drops backticks intentionally —
    /// they're listed in `docs/LANGUAGE.md` and the help system as not supported.
    /// We surface this as a dedicated error (rather than `UnexpectedCharacter`)
    /// so the message can point users at the `$(cmd)` replacement.
    BackticksNotSupported,
}
126
127impl fmt::Display for LexerError {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        match self {
130            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
131            LexerError::UnterminatedString => write!(f, "unterminated string"),
132            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
133            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
134            LexerError::InvalidNumber => write!(f, "invalid number"),
135            LexerError::AmbiguousBoolean(s) => {
136                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
137            }
138            LexerError::AmbiguousBooleanLike(s) => {
139                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
140                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
141            }
142            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
143            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
144            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
145            LexerError::UnterminatedHeredoc { delimiter } => {
146                write!(f, "unterminated heredoc, expected closing delimiter `{}` on its own line", delimiter)
147            }
148            LexerError::BackticksNotSupported => {
149                write!(f, "backticks are not supported in kaish; use $(cmd) instead")
150            }
151        }
152    }
153}
154
/// Here-doc content data.
///
/// - `content` is the here-doc body text carried by `Token::HereDoc`.
/// - `literal` is true when the delimiter was quoted (`<<'EOF'` or `<<"EOF"`),
///   meaning no variable expansion should occur.
/// - `strip_tabs` is true for the `<<-EOF` form. Per POSIX, leading tabs on
///   each body line are stripped at materialization time. Stripping happens
///   downstream of the parser so byte offsets in `content` stay aligned with
///   their original-source positions for span-tracking purposes.
/// - `body_start_offset` is the byte offset of the first character of `content`
///   in the source string fed into the lexer's `tokenize`. This lets the parser
///   compute absolute spans for parts found inside the body during interpolation.
///   In sources without arithmetic preprocessing rewrites, this equals the
///   original-source offset; with arithmetic before the heredoc, line numbers
///   may shift slightly until full preprocessing-layer composition lands.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    pub content: String,
    pub literal: bool,
    pub strip_tabs: bool,
    pub body_start_offset: usize,
}

/// Tokens produced by the kaish lexer.
///
/// The order of variants matters for logos priority. More specific patterns
/// (like keywords) should come before more general ones (like identifiers).
///
/// Tokens that carry semantic values (strings, numbers, identifiers) include
/// the parsed value directly. This ensures the parser has access to actual
/// data, not just token types.
///
/// Runs of spaces and tabs are skipped between tokens; newlines are not
/// skipped and surface as `Token::Newline`.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ═══════════════════════════════════════════════════════════════════
    // Keywords (must come before Ident for priority)
    // ═══════════════════════════════════════════════════════════════════
    #[token("set")]
    Set,

    #[token("local")]
    Local,

    #[token("if")]
    If,

    #[token("then")]
    Then,

    #[token("else")]
    Else,

    #[token("elif")]
    Elif,

    #[token("fi")]
    Fi,

    #[token("for")]
    For,

    #[token("while")]
    While,

    #[token("in")]
    In,

    #[token("do")]
    Do,

    #[token("done")]
    Done,

    #[token("case")]
    Case,

    #[token("esac")]
    Esac,

    #[token("function")]
    Function,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("exit")]
    Exit,

    #[token("true")]
    True,

    #[token("false")]
    False,

    // ═══════════════════════════════════════════════════════════════════
    // Type keywords (for tool parameters)
    // ═══════════════════════════════════════════════════════════════════
    #[token("string")]
    TypeString,

    #[token("int")]
    TypeInt,

    #[token("float")]
    TypeFloat,

    #[token("bool")]
    TypeBool,

    // ═══════════════════════════════════════════════════════════════════
    // Multi-character operators (must come before single-char versions)
    // ═══════════════════════════════════════════════════════════════════
    #[token("&&")]
    And,

    #[token("||")]
    Or,

    #[token("==")]
    EqEq,

    #[token("!=")]
    NotEq,

    #[token("=~")]
    Match,

    #[token("!~")]
    NotMatch,

    #[token(">=")]
    GtEq,

    #[token("<=")]
    LtEq,

    #[token(">>")]
    GtGt,

    #[token("2>&1")]
    StderrToStdout,

    #[token("1>&2")]
    StdoutToStderr,

    #[token(">&2")]
    StdoutToStderr2,

    #[token("2>")]
    Stderr,

    #[token("&>")]
    Both,

    #[token("<<<")]
    HereString,

    #[token("<<")]
    HereDocStart,

    #[token(";;")]
    DoubleSemi,

    // ═══════════════════════════════════════════════════════════════════
    // Single-character operators and punctuation
    // ═══════════════════════════════════════════════════════════════════
    #[token("=")]
    Eq,

    #[token("|")]
    Pipe,

    #[token("&")]
    Amp,

    #[token(">")]
    Gt,

    #[token("<")]
    Lt,

    #[token(";")]
    Semi,

    #[token(":")]
    Colon,

    #[token(",")]
    Comma,

    #[token("..")]
    DotDot,

    #[token(".")]
    Dot,

    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// Bare tilde: `~` alone (expands to $HOME)
    #[token("~")]
    Tilde,

    /// Relative path: `../foo/bar`, bare `src/kaish` (ident containing `/`),
    /// or a directory reference with a trailing slash like `dest/`. The
    /// trailing-slash form uses `*` (not `+`) after the slash so `dest/`
    /// lexes as one token instead of `Ident("dest")` + `Path("/")` — the
    /// latter split silently turned `cp a b dest/` into a 4-operand command.
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*/[a-zA-Z0-9_./-]*", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// Dot-slash path: `./foo`, `./script.sh`
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    /// Dot-prefixed bareword: `.parent`, `.gitignore`, `.foo.bar`.
    /// Treated as an opaque string in argv position. Distinct from `Token::Dot`
    /// (the POSIX `.` source alias) which only matches a bare `.` — the source
    /// alias requires whitespace before its file argument (`. script`), so
    /// `.parent` (no space) is unambiguously a single bareword.
    #[regex(r"\.[a-zA-Z_][a-zA-Z0-9_.-]*", lex_dotted_ident, priority = 3)]
    DottedIdent(String),

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("*")]
    Star,

    #[token("!")]
    Bang,

    #[token("?")]
    Question,

    /// Merged glob word: span-adjacent tokens containing `*`, `?`, or `[...]`.
    /// Synthesized by `merge_glob_adjacent()`, never produced by logos directly.
    GlobWord(String),

    // ═══════════════════════════════════════════════════════════════════
    // Command substitution
    // ═══════════════════════════════════════════════════════════════════

    /// Arithmetic expression content: synthesized by preprocessing.
    /// Contains the expression string between `$((` and `))`.
    Arithmetic(String),

    /// Command substitution start: `$(` - begins a command substitution
    #[token("$(")]
    CmdSubstStart,

    // ═══════════════════════════════════════════════════════════════════
    // Flags (must come before Int to win over negative numbers)
    // ═══════════════════════════════════════════════════════════════════

    /// Long flag: `--name` or `--foo-bar` - value is the name without the
    /// leading `--`
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// Short flag: `-l`, `-la` (combined short flags), or a dash-word with
    /// internal hyphens like `-not-a-flag`. Internal hyphens are part of the
    /// single shell word — without them the word fragments into separate flag
    /// tokens, which breaks `echo -- -not-a-flag` and the like. A leading `--`
    /// is still `DoubleDash` (the second char must be a letter here), and
    /// whether the word is a flag or a literal is the binding layer's call.
    /// The value is the text without the leading `-`.
    #[regex(r"-[a-zA-Z][a-zA-Z0-9-]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// Plus flag: `+e` or `+x` (for set +e to disable options) - value is the
    /// text without the leading `+`
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// Double dash: `--` alone marks end of flags
    #[token("--")]
    DoubleDash,

    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
    /// For date format strings and similar. Lower priority than PlusFlag.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
    /// Excludes - after first - to avoid matching --name patterns.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// Job specifier: `%1`, `%2` — the bash idiom for `wait`/`kill` targets.
    /// Keeps the leading `%` (kill uses it to distinguish a job from a PID;
    /// wait strips it). Without this token a bare `%1` is a lexer error.
    #[regex(r"%[0-9]+", lex_job_spec)]
    JobSpec(String),

    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
    /// Only matches when followed by whitespace or end.
    /// This is handled specially in the parser as a positional arg.
    #[token("-")]
    MinusAlone,

    // ═══════════════════════════════════════════════════════════════════
    // Literals (with values)
    // ═══════════════════════════════════════════════════════════════════

    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string: `'...'` - literal content, no escape processing
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the
    /// full matched text, `${` and `}` included; path segments are parsed later
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// Simple variable reference: `$NAME` - just the identifier
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// Positional parameter: `$0` through `$9`
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    /// All positional parameters: `$@`
    #[token("$@")]
    AllArgs,

    /// Number of positional parameters: `$#`
    #[token("$#")]
    ArgCount,

    /// Last exit code: `$?`
    #[token("$?")]
    LastExitCode,

    /// Current shell PID: `$$`
    #[token("$$")]
    CurrentPid,

    /// Variable string length: `${#VAR}` - value is the variable name only
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Here-doc content: synthesized by preprocessing, not directly lexed.
    /// Contains the full content of the here-doc (without the delimiter lines).
    HereDoc(HereDocData),

    /// Integer literal - value is the parsed i64
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Float literal - value is the parsed f64
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    // ═══════════════════════════════════════════════════════════════════
    // Digit-leading barewords, then invalid number patterns (the invalid
    // ones are matched explicitly so they produce dedicated errors)
    // ═══════════════════════════════════════════════════════════════════

    /// Digit-leading bareword: `019dda1c` (SHA prefix), UUIDs, version-ish
    /// strings. Distinguished from `Int` because at least one alpha character
    /// follows the leading digits — the lexer commits to "this is a string,
    /// not a number." Treated as a bareword string in expression position.
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_.-]*", lex_number_ident, priority = 3)]
    NumberIdent(String),

    /// Invalid: float without leading digit (like .5)
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// Invalid: float without trailing digit (like 5.)
    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ═══════════════════════════════════════════════════════════════════
    // Paths (absolute paths starting with /)
    // ═══════════════════════════════════════════════════════════════════

    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    // ═══════════════════════════════════════════════════════════════════
    // Identifiers (command names, variable names, etc.)
    // ═══════════════════════════════════════════════════════════════════

    /// Identifier - value is the identifier string
    /// Allows dots for filenames like `script.kai`
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ═══════════════════════════════════════════════════════════════════
    // Structural tokens
    // ═══════════════════════════════════════════════════════════════════

    /// Comment: `# ...` to end of line
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// Newline (significant in kaish - ends statements)
    #[regex(r"\n|\r\n")]
    Newline,

    /// Line continuation: backslash at end of line
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,

    /// Backtick command substitution — explicitly rejected. Kaish drops
    /// backticks; the callback always errors so users get a dedicated
    /// `BackticksNotSupported` message instead of the generic
    /// `UnexpectedCharacter` they would have hit before. Backticks inside
    /// single/double-quoted strings, heredoc bodies, and comments don't
    /// reach this match — those tokens are matched as a single unit
    /// (strings) or extracted before logos runs (heredocs) or skipped to
    /// EOL (comments).
    #[token("`", reject_backtick)]
    BacktickRejected,
}
597
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// The token-to-category mapping is defined by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    /// Also covers `set`, `local`, `true`/`false`, and the type keywords
    /// (`string`, `int`, `float`, `bool`).
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    /// Includes the redirection and here-doc/here-string operators.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, ${#bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; ;; : , . ( ) { } [ ] ! ? * and `$(`, plus newlines and
    /// line continuations
    Punctuation,
    /// Identifiers and other bare words: plain identifiers, dot-prefixed and
    /// digit-leading barewords, `+`/`-` bare words, a lone `-`, job specs (`%1`)
    Command,
    /// Paths: absolute (/foo/bar), relative, `./` and `~` paths, `..`, and
    /// merged glob words
    Path,
    /// Flags: --long, -s, +x, and the bare `--`
    Flag,
    /// Invalid tokens: malformed floats and rejected backticks
    Error,
}
627
628impl Token {
629    /// Returns the semantic category for syntax highlighting.
630    pub fn category(&self) -> TokenCategory {
631        match self {
632            // Keywords
633            Token::If
634            | Token::Then
635            | Token::Else
636            | Token::Elif
637            | Token::Fi
638            | Token::For
639            | Token::In
640            | Token::Do
641            | Token::Done
642            | Token::While
643            | Token::Case
644            | Token::Esac
645            | Token::Function
646            | Token::Return
647            | Token::Break
648            | Token::Continue
649            | Token::Exit
650            | Token::Set
651            | Token::Local
652            | Token::True
653            | Token::False
654            | Token::TypeString
655            | Token::TypeInt
656            | Token::TypeFloat
657            | Token::TypeBool => TokenCategory::Keyword,
658
659            // Operators and redirections
660            Token::Pipe
661            | Token::And
662            | Token::Or
663            | Token::Amp
664            | Token::Eq
665            | Token::EqEq
666            | Token::NotEq
667            | Token::Match
668            | Token::NotMatch
669            | Token::Lt
670            | Token::Gt
671            | Token::LtEq
672            | Token::GtEq
673            | Token::GtGt
674            | Token::Stderr
675            | Token::Both
676            | Token::HereDocStart
677            | Token::HereString
678            | Token::StderrToStdout
679            | Token::StdoutToStderr
680            | Token::StdoutToStderr2 => TokenCategory::Operator,
681
682            // Strings
683            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
684
685            // Numbers
686            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
687
688            // Variables
689            Token::VarRef(_)
690            | Token::SimpleVarRef(_)
691            | Token::Positional(_)
692            | Token::AllArgs
693            | Token::ArgCount
694            | Token::VarLength(_)
695            | Token::LastExitCode
696            | Token::CurrentPid => TokenCategory::Variable,
697
698            // Flags
699            Token::LongFlag(_)
700            | Token::ShortFlag(_)
701            | Token::PlusFlag(_)
702            | Token::DoubleDash => TokenCategory::Flag,
703
704            // Punctuation
705            Token::Semi
706            | Token::DoubleSemi
707            | Token::Colon
708            | Token::Comma
709            | Token::Dot
710            | Token::LParen
711            | Token::RParen
712            | Token::LBrace
713            | Token::RBrace
714            | Token::LBracket
715            | Token::RBracket
716            | Token::Bang
717            | Token::Question
718            | Token::Star
719            | Token::Newline
720            | Token::LineContinuation
721            | Token::CmdSubstStart => TokenCategory::Punctuation,
722
723            // Glob words (merged tokens containing wildcards)
724            Token::GlobWord(_) => TokenCategory::Path,
725
726            // Comments
727            Token::Comment => TokenCategory::Comment,
728
729            // Paths
730            Token::Path(_)
731            | Token::TildePath(_)
732            | Token::RelativePath(_)
733            | Token::Tilde
734            | Token::DotDot
735            | Token::DotSlashPath(_) => TokenCategory::Path,
736
737            // Commands/identifiers (and bare words)
738            Token::Ident(_)
739            | Token::PlusBare(_)
740            | Token::MinusBare(_)
741            | Token::MinusAlone
742            | Token::NumberIdent(_)
743            | Token::DottedIdent(_)
744            | Token::JobSpec(_) => TokenCategory::Command,
745
746            // Errors
747            Token::InvalidFloatNoLeading
748            | Token::InvalidFloatNoTrailing
749            | Token::BacktickRejected => TokenCategory::Error,
750        }
751    }
752}
753
754/// Lex a double-quoted string literal, processing escape sequences.
755fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
756    parse_string_literal(lex.slice())
757}
758
759/// Lex a single-quoted string literal (no escape processing).
760fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
761    let s = lex.slice();
762    // Strip the surrounding single quotes
763    s[1..s.len() - 1].to_string()
764}
765
766/// Lex a braced variable reference, extracting the inner content.
767fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
768    // Keep the full ${...} for later parsing of path segments
769    lex.slice().to_string()
770}
771
772/// Lex a simple variable reference: `$NAME` → `NAME`
773fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
774    // Strip the leading `$`
775    lex.slice()[1..].to_string()
776}
777
778/// Lex a positional parameter: `$1` → 1
779fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
780    // Strip the leading `$` and parse the digit
781    lex.slice()[1..].parse().unwrap_or(0)
782}
783
784/// Lex a variable length: `${#VAR}` → "VAR"
785fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
786    // Strip the leading `${#` and trailing `}`
787    let s = lex.slice();
788    s[3..s.len() - 1].to_string()
789}
790
791/// Lex an integer literal.
792fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
793    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
794}
795
796/// Lex a float literal.
797fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
798    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
799}
800
801/// Lex a digit-leading bareword like `019dda1c` or `019dda1c-5b3f-7000`.
802/// Distinguished from `Int` because at least one alpha character follows the
803/// leading digits — the slice is treated as a string, not a number.
804fn lex_number_ident(lex: &mut logos::Lexer<Token>) -> String {
805    lex.slice().to_string()
806}
807
808/// Lex a dot-prefixed bareword like `.gitignore` or `.parent.parent`.
809fn lex_dotted_ident(lex: &mut logos::Lexer<Token>) -> String {
810    lex.slice().to_string()
811}
812
813/// Lex an invalid float without leading digit (like .5).
814/// Always returns Err to produce a lexer error instead of a token.
815fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
816    Err(LexerError::InvalidFloatNoLeading)
817}
818
819/// Reject a backtick — kaish doesn't support backtick command substitution.
820/// The dedicated error gives the user a `$(cmd)` hint instead of the generic
821/// `UnexpectedCharacter` they would have hit otherwise.
822fn reject_backtick(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
823    Err(LexerError::BackticksNotSupported)
824}
825
826/// Lex an invalid float without trailing digit (like 5.).
827/// Always returns Err to produce a lexer error instead of a token.
828fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
829    Err(LexerError::InvalidFloatNoTrailing)
830}
831
832/// Lex an identifier, rejecting ambiguous boolean-like values.
833fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
834    let s = lex.slice();
835
836    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
837    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
838    match s.to_lowercase().as_str() {
839        "true" | "false" if s != "true" && s != "false" => {
840            return Err(LexerError::AmbiguousBoolean(s.to_string()));
841        }
842        _ => {}
843    }
844
845    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
846    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
847        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
848    }
849
850    Ok(s.to_string())
851}
852
853/// Lex a long flag: `--name` → `name`
854fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
855    // Strip the leading `--`
856    lex.slice()[2..].to_string()
857}
858
859/// Lex a short flag: `-l` → `l`, `-la` → `la`
860fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
861    // Strip the leading `-`
862    lex.slice()[1..].to_string()
863}
864
865/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
866fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
867    // Strip the leading `+`
868    lex.slice()[1..].to_string()
869}
870
871/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
872fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
873    lex.slice().to_string()
874}
875
876/// Lex a minus bare word: `-%` → `-%` (keep the full string)
877fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
878    lex.slice().to_string()
879}
880
881/// Lex a job specifier: `%1` → `%1` (keep the leading `%`).
882fn lex_job_spec(lex: &mut logos::Lexer<Token>) -> String {
883    lex.slice().to_string()
884}
885
886/// Lex an absolute path: `/tmp/out` → `/tmp/out`
887fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
888    lex.slice().to_string()
889}
890
891/// Lex a tilde path: `~/foo` → `~/foo`
892fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
893    lex.slice().to_string()
894}
895
896/// Lex a relative path: `../foo` → `../foo`
897fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
898    lex.slice().to_string()
899}
900
901/// Lex a dot-slash path: `./foo` → `./foo`
902fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
903    lex.slice().to_string()
904}
905
906impl fmt::Display for Token {
907    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
908        match self {
909            Token::Set => write!(f, "set"),
910            Token::Local => write!(f, "local"),
911            Token::If => write!(f, "if"),
912            Token::Then => write!(f, "then"),
913            Token::Else => write!(f, "else"),
914            Token::Elif => write!(f, "elif"),
915            Token::Fi => write!(f, "fi"),
916            Token::For => write!(f, "for"),
917            Token::While => write!(f, "while"),
918            Token::In => write!(f, "in"),
919            Token::Do => write!(f, "do"),
920            Token::Done => write!(f, "done"),
921            Token::Case => write!(f, "case"),
922            Token::Esac => write!(f, "esac"),
923            Token::Function => write!(f, "function"),
924            Token::Break => write!(f, "break"),
925            Token::Continue => write!(f, "continue"),
926            Token::Return => write!(f, "return"),
927            Token::Exit => write!(f, "exit"),
928            Token::True => write!(f, "true"),
929            Token::False => write!(f, "false"),
930            Token::TypeString => write!(f, "string"),
931            Token::TypeInt => write!(f, "int"),
932            Token::TypeFloat => write!(f, "float"),
933            Token::TypeBool => write!(f, "bool"),
934            Token::And => write!(f, "&&"),
935            Token::Or => write!(f, "||"),
936            Token::EqEq => write!(f, "=="),
937            Token::NotEq => write!(f, "!="),
938            Token::Match => write!(f, "=~"),
939            Token::NotMatch => write!(f, "!~"),
940            Token::GtEq => write!(f, ">="),
941            Token::LtEq => write!(f, "<="),
942            Token::GtGt => write!(f, ">>"),
943            Token::StderrToStdout => write!(f, "2>&1"),
944            Token::StdoutToStderr => write!(f, "1>&2"),
945            Token::StdoutToStderr2 => write!(f, ">&2"),
946            Token::Stderr => write!(f, "2>"),
947            Token::Both => write!(f, "&>"),
948            Token::HereDocStart => write!(f, "<<"),
949            Token::HereString => write!(f, "<<<"),
950            Token::DoubleSemi => write!(f, ";;"),
951            Token::Eq => write!(f, "="),
952            Token::Pipe => write!(f, "|"),
953            Token::Amp => write!(f, "&"),
954            Token::Gt => write!(f, ">"),
955            Token::Lt => write!(f, "<"),
956            Token::Semi => write!(f, ";"),
957            Token::Colon => write!(f, ":"),
958            Token::Comma => write!(f, ","),
959            Token::Dot => write!(f, "."),
960            Token::DotDot => write!(f, ".."),
961            Token::Tilde => write!(f, "~"),
962            Token::TildePath(s) => write!(f, "{}", s),
963            Token::RelativePath(s) => write!(f, "{}", s),
964            Token::DotSlashPath(s) => write!(f, "{}", s),
965            Token::LBrace => write!(f, "{{"),
966            Token::RBrace => write!(f, "}}"),
967            Token::LBracket => write!(f, "["),
968            Token::RBracket => write!(f, "]"),
969            Token::LParen => write!(f, "("),
970            Token::RParen => write!(f, ")"),
971            Token::Star => write!(f, "*"),
972            Token::Bang => write!(f, "!"),
973            Token::Question => write!(f, "?"),
974            Token::GlobWord(s) => write!(f, "GLOB({})", s),
975            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
976            Token::CmdSubstStart => write!(f, "$("),
977            Token::LongFlag(s) => write!(f, "--{}", s),
978            Token::ShortFlag(s) => write!(f, "-{}", s),
979            Token::PlusFlag(s) => write!(f, "+{}", s),
980            Token::DoubleDash => write!(f, "--"),
981            Token::PlusBare(s) => write!(f, "{}", s),
982            Token::MinusBare(s) => write!(f, "{}", s),
983            Token::JobSpec(s) => write!(f, "{}", s),
984            Token::MinusAlone => write!(f, "-"),
985            Token::String(s) => write!(f, "STRING({:?})", s),
986            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
987            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
988            Token::VarRef(v) => write!(f, "VARREF({})", v),
989            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
990            Token::Positional(n) => write!(f, "${}", n),
991            Token::AllArgs => write!(f, "$@"),
992            Token::ArgCount => write!(f, "$#"),
993            Token::LastExitCode => write!(f, "$?"),
994            Token::CurrentPid => write!(f, "$$"),
995            Token::VarLength(v) => write!(f, "${{#{}}}", v),
996            Token::Int(n) => write!(f, "INT({})", n),
997            Token::Float(n) => write!(f, "FLOAT({})", n),
998            Token::Path(s) => write!(f, "PATH({})", s),
999            Token::Ident(s) => write!(f, "IDENT({})", s),
1000            Token::NumberIdent(s) => write!(f, "NUMIDENT({})", s),
1001            Token::DottedIdent(s) => write!(f, "DOTIDENT({})", s),
1002            Token::Comment => write!(f, "COMMENT"),
1003            Token::Newline => write!(f, "NEWLINE"),
1004            Token::LineContinuation => write!(f, "LINECONT"),
1005            // These variants should never be produced — their callbacks always return errors
1006            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
1007            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
1008            Token::BacktickRejected => write!(f, "BACKTICK_REJECTED"),
1009        }
1010    }
1011}
1012
1013impl Token {
1014    /// Returns true if this token is a keyword.
1015    // Must match the Keyword variants in `Token::category()` (minus the
1016    // TypeX variants, which `is_type()` covers separately). Currently
1017    // uncalled — kept exhaustive so future callers don't get wrong answers.
1018    pub fn is_keyword(&self) -> bool {
1019        matches!(
1020            self,
1021            Token::Set
1022                | Token::Local
1023                | Token::If
1024                | Token::Then
1025                | Token::Else
1026                | Token::Elif
1027                | Token::Fi
1028                | Token::For
1029                | Token::In
1030                | Token::Do
1031                | Token::Done
1032                | Token::While
1033                | Token::Case
1034                | Token::Esac
1035                | Token::Function
1036                | Token::Return
1037                | Token::Break
1038                | Token::Continue
1039                | Token::Exit
1040                | Token::True
1041                | Token::False
1042        )
1043    }
1044
1045    /// Returns true if this token is a type keyword.
1046    pub fn is_type(&self) -> bool {
1047        matches!(
1048            self,
1049            Token::TypeString
1050                | Token::TypeInt
1051                | Token::TypeFloat
1052                | Token::TypeBool
1053        )
1054    }
1055
1056    /// Returns true if this token starts a statement.
1057    // Currently uncalled — kept exhaustive so future callers don't get wrong answers.
1058    pub fn starts_statement(&self) -> bool {
1059        matches!(
1060            self,
1061            Token::Set
1062                | Token::Local
1063                | Token::Function
1064                | Token::If
1065                | Token::For
1066                | Token::While
1067                | Token::Case
1068                | Token::Ident(_)
1069                | Token::LBracket
1070        )
1071    }
1072
1073    /// Returns true if this token can appear in an expression.
1074    pub fn is_value(&self) -> bool {
1075        matches!(
1076            self,
1077            Token::String(_)
1078                | Token::SingleString(_)
1079                | Token::HereDoc(_)
1080                | Token::Arithmetic(_)
1081                | Token::Int(_)
1082                | Token::Float(_)
1083                | Token::True
1084                | Token::False
1085                | Token::VarRef(_)
1086                | Token::SimpleVarRef(_)
1087                | Token::CmdSubstStart
1088                | Token::Path(_)
1089                | Token::GlobWord(_)
1090                | Token::LastExitCode
1091                | Token::CurrentPid
1092        )
1093    }
1094}
1095
/// Result of preprocessing arithmetic expressions.
///
/// Produced by `preprocess_arithmetic`, which swaps every `$((expr))` in the
/// source for a generated marker and records what was removed.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs, in source order.
    /// The expression content excludes the surrounding `$((` and `))`.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions, one per marker.
    replacements: Vec<SpanReplacement>,
}
1105
/// Copy one `$(...)` command substitution to `result` without altering it.
///
/// Parentheses are balanced with awareness of quoting, so a `(` or `)` that
/// sits inside single quotes, inside double quotes, or directly after a
/// backslash does not affect the nesting depth.
///
/// On entry `*i` must index the `$` of `$(`. On return `*i` indexes the
/// character after the matching `)`, or equals `chars.len()` if the
/// substitution is never closed. `*source_pos` is advanced by the UTF-8 byte
/// length of everything copied.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    /// Quoting context of the scanner at the current character.
    #[derive(Clone, Copy)]
    enum Quoting {
        Unquoted,
        Single,
        Double,
    }

    // The `$(` introducer itself.
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut open_parens: usize = 1;
    let mut quoting = Quoting::Unquoted;

    while open_parens > 0 {
        let Some(&c) = chars.get(*i) else { break };
        let lookahead = chars.get(*i + 1).copied();

        // Each step first decides how the state changes and how many
        // characters (one, or two for an escape pair) are consumed; the copy
        // itself happens once, below.
        let mut consumed = 1;
        match quoting {
            // Inside '...': nothing is special except the closing quote.
            Quoting::Single => {
                if c == '\'' {
                    quoting = Quoting::Unquoted;
                }
            }
            // Inside "...": a backslash escapes only `"`, `\`, `$` and `` ` ``.
            Quoting::Double => match (c, lookahead) {
                ('\\', Some('"' | '\\' | '$' | '`')) => consumed = 2,
                ('"', _) => quoting = Quoting::Unquoted,
                _ => {}
            },
            Quoting::Unquoted => match c {
                '\'' => quoting = Quoting::Single,
                '"' => quoting = Quoting::Double,
                // A backslash protects whatever character follows it.
                '\\' if lookahead.is_some() => consumed = 2,
                '(' => open_parens += 1,
                ')' => open_parens -= 1,
                _ => {}
            },
        }

        for &copied in &chars[*i..*i + consumed] {
            result.push(copied);
            *source_pos += copied.len_utf8();
        }
        *i += consumed;
    }
}
1203
1204/// Preprocess arithmetic expressions in source code.
1205///
1206/// Finds `$((expr))` patterns and replaces them with markers.
1207/// Returns the preprocessed source, arithmetic contents, and span replacement info.
1208///
1209/// Example:
1210///   `X=$((1 + 2))`
1211/// Becomes:
1212///   `X=__KAISH_ARITH_{id}__`
1213/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
1214///
1215/// # Errors
1216/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
1217fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
1218    let mut result = String::with_capacity(source.len());
1219    let mut arithmetics: Vec<(String, String)> = Vec::new();
1220    let mut replacements: Vec<SpanReplacement> = Vec::new();
1221    let mut source_pos: usize = 0;
1222    let chars_vec: Vec<char> = source.chars().collect();
1223    let mut i = 0;
1224
1225    // Whether we're currently inside double quotes. Single quotes inside
1226    // double quotes are literal characters, not quote delimiters.
1227    let mut in_double_quote = false;
1228
1229    while i < chars_vec.len() {
1230        let ch = chars_vec[i];
1231
1232        // Backslash escape outside quotes — skip both chars verbatim
1233        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1234            result.push(ch);
1235            result.push(chars_vec[i + 1]);
1236            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
1237            i += 2;
1238            continue;
1239        }
1240
1241        // Single quote — only starts quote mode when NOT inside double quotes
1242        if ch == '\'' && !in_double_quote {
1243            result.push(ch);
1244            i += 1;
1245            source_pos += 1;
1246            while i < chars_vec.len() && chars_vec[i] != '\'' {
1247                result.push(chars_vec[i]);
1248                source_pos += chars_vec[i].len_utf8();
1249                i += 1;
1250            }
1251            if i < chars_vec.len() {
1252                result.push(chars_vec[i]); // closing quote
1253                source_pos += 1;
1254                i += 1;
1255            }
1256            continue;
1257        }
1258
1259        // Double quote — toggle state (arithmetic is still expanded inside)
1260        if ch == '"' {
1261            in_double_quote = !in_double_quote;
1262            result.push(ch);
1263            i += 1;
1264            source_pos += 1;
1265            continue;
1266        }
1267
1268        // Backslash escape inside double quotes — only \" and \\ are special
1269        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1270            let next = chars_vec[i + 1];
1271            if next == '"' || next == '\\' || next == '$' || next == '`' {
1272                result.push(ch);
1273                result.push(next);
1274                source_pos += ch.len_utf8() + next.len_utf8();
1275                i += 2;
1276                continue;
1277            }
1278        }
1279
1280        // Comment — copy verbatim from `#` through end-of-line so apostrophes
1281        // and `$((..))` inside the comment body don't get processed. Logos's
1282        // own comment regex `#[^\n\r]*` doesn't require a word boundary, so
1283        // we match that: any `#` outside double quotes (and outside single
1284        // quotes — those are consumed above as a single run) starts a comment.
1285        // The newline is left for the next iteration so newline-significance
1286        // and span tracking are preserved.
1287        if ch == '#' && !in_double_quote {
1288            while i < chars_vec.len() && chars_vec[i] != '\n' && chars_vec[i] != '\r' {
1289                result.push(chars_vec[i]);
1290                source_pos += chars_vec[i].len_utf8();
1291                i += 1;
1292            }
1293            continue;
1294        }
1295
1296        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand
1297        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
1298            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
1299        {
1300            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
1301            continue;
1302        }
1303
1304        // Look for $(( (potential arithmetic)
1305        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
1306            let arith_start_pos = result.len();
1307            let original_start = source_pos;
1308
1309            // Skip $((
1310            i += 3;
1311            source_pos += 3;
1312
1313            // Collect expression until matching ))
1314            let mut expr = String::new();
1315            let mut paren_depth: usize = 0;
1316
1317            while i < chars_vec.len() {
1318                let c = chars_vec[i];
1319                match c {
1320                    '(' => {
1321                        paren_depth += 1;
1322                        if paren_depth > MAX_PAREN_DEPTH {
1323                            return Err(LexerError::NestingTooDeep);
1324                        }
1325                        expr.push('(');
1326                        i += 1;
1327                        source_pos += c.len_utf8();
1328                    }
1329                    ')' => {
1330                        if paren_depth > 0 {
1331                            paren_depth -= 1;
1332                            expr.push(')');
1333                            i += 1;
1334                            source_pos += 1;
1335                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
1336                            // Found closing ))
1337                            i += 2;
1338                            source_pos += 2;
1339                            break;
1340                        } else {
1341                            // Single ) inside - keep going
1342                            expr.push(')');
1343                            i += 1;
1344                            source_pos += 1;
1345                        }
1346                    }
1347                    _ => {
1348                        expr.push(c);
1349                        i += 1;
1350                        source_pos += c.len_utf8();
1351                    }
1352                }
1353            }
1354
1355            // Calculate original length: from $$(( to ))
1356            let original_len = source_pos - original_start;
1357
1358            // Create a unique marker for this arithmetic (collision-resistant)
1359            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
1360            let marker_len = marker.len();
1361
1362            // Record the replacement for span correction
1363            replacements.push(SpanReplacement {
1364                preprocessed_pos: arith_start_pos,
1365                marker_len,
1366                original_len,
1367            });
1368
1369            arithmetics.push((marker.clone(), expr));
1370            result.push_str(&marker);
1371        } else {
1372            result.push(ch);
1373            i += 1;
1374            source_pos += ch.len_utf8();
1375        }
1376    }
1377
1378    Ok(ArithmeticPreprocessResult {
1379        text: result,
1380        arithmetics,
1381        replacements,
1382    })
1383}
1384
/// Per-heredoc metadata collected during preprocessing.
///
/// Stored verbatim alongside the substituted marker so the parser, validator,
/// and interpreter can reconstitute the body with correct semantics:
/// - `marker` is the generated `__KAISH_HEREDOC_{id}__` text that
///   `preprocess_heredocs` writes after `<<` in place of the delimiter and body.
/// - `body` is the raw body bytes; tab stripping for `<<-` is applied later
///   (at materialization), so byte offsets stay aligned with the original
///   source for span tracking.
/// - `strip_tabs` records whether the `<<-` form was used.
/// - `literal` records whether the delimiter was quoted (no interpolation).
/// - `body_start_offset` is the byte offset of the first body character in
///   the source string passed to `preprocess_heredocs`. When heredocs are
///   preprocessed AFTER arithmetic, this is in arith-preprocessed coordinates;
///   in the common case (no arithmetic before the heredoc) this equals the
///   original-source offset. See span-correction notes in `tokenize`.
#[derive(Debug, Clone)]
struct HeredocReplacement {
    marker: String,
    body: String,
    literal: bool,
    strip_tabs: bool,
    body_start_offset: usize,
}
1407
1408/// Preprocess here-docs in source code.
1409///
1410/// Finds `<<WORD` patterns and collects content until the delimiter line.
1411/// Returns the preprocessed source and a vector of replacement records.
1412///
1413/// Example:
1414///   `cat <<EOF\nhello\nworld\nEOF`
1415/// Becomes:
1416///   `cat <<__HEREDOC_0__`
1417/// With heredocs[0] = HeredocReplacement { marker: "__HEREDOC_0__",
1418/// body: "hello\nworld", literal: false, strip_tabs: false }
1419fn preprocess_heredocs(source: &str) -> Result<(String, Vec<HeredocReplacement>), Spanned<LexerError>> {
1420    let mut result = String::with_capacity(source.len());
1421    let mut heredocs: Vec<HeredocReplacement> = Vec::new();
1422    let chars_vec: Vec<char> = source.chars().collect();
1423    let mut i = 0;
1424    // `pos` tracks the byte offset into `source` corresponding to chars_vec[i].
1425    // `result` accumulates output; we record body offsets in `pos` (input-side)
1426    // and emit positions via `result.len()` (output-side) where needed.
1427    let mut pos: usize = 0;
1428
1429    while i < chars_vec.len() {
1430        let ch = chars_vec[i];
1431
1432        // Pass <<< through verbatim so the logos tokenizer sees the here-string
1433        // operator. If we fell through naively, the next iteration would see
1434        // the remaining `<<` and misfire heredoc preprocessing.
1435        if ch == '<'
1436            && chars_vec.get(i + 1) == Some(&'<')
1437            && chars_vec.get(i + 2) == Some(&'<')
1438        {
1439            result.push_str("<<<");
1440            i += 3;
1441            pos += 3;
1442            continue;
1443        }
1444
1445        // Look for << (potential here-doc).
1446        if ch == '<' && chars_vec.get(i + 1) == Some(&'<') {
1447            // Remember where the `<<` started so an unterminated-heredoc
1448            // error can point back at the introducer rather than at EOF.
1449            let introducer_start = pos;
1450            i += 2; // consume both '<'
1451            pos += 2;
1452
1453            // Check for optional - (strip leading tabs)
1454            let strip_tabs = chars_vec.get(i) == Some(&'-');
1455            if strip_tabs {
1456                i += 1;
1457                pos += 1;
1458            }
1459
1460            // Skip whitespace before delimiter
1461            while let Some(&c) = chars_vec.get(i) {
1462                if c == ' ' || c == '\t' {
1463                    i += 1;
1464                    pos += 1;
1465                } else {
1466                    break;
1467                }
1468            }
1469
1470            // Collect the delimiter word
1471            let mut delimiter = String::new();
1472            let quoted = chars_vec.get(i) == Some(&'\'') || chars_vec.get(i) == Some(&'"');
1473            let quote_char = if quoted {
1474                let q = chars_vec.get(i).copied();
1475                i += 1;
1476                pos += 1;
1477                q
1478            } else {
1479                None
1480            };
1481
1482            while let Some(&c) = chars_vec.get(i) {
1483                if quoted {
1484                    if Some(c) == quote_char {
1485                        i += 1; // consume closing quote
1486                        pos += 1;
1487                        break;
1488                    }
1489                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1490                    break;
1491                }
1492                delimiter.push(c);
1493                i += 1;
1494                pos += c.len_utf8();
1495            }
1496
1497            if delimiter.is_empty() {
1498                // Not a valid here-doc, output << literally
1499                result.push_str("<<");
1500                if strip_tabs {
1501                    result.push('-');
1502                }
1503                continue;
1504            }
1505
1506            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1507            // This must be emitted AFTER the heredoc marker, not before.
1508            let mut after_delimiter = String::new();
1509            while let Some(&c) = chars_vec.get(i) {
1510                if c == '\n' {
1511                    i += 1;
1512                    pos += 1;
1513                    break;
1514                } else if c == '\r' {
1515                    i += 1;
1516                    pos += 1;
1517                    if chars_vec.get(i) == Some(&'\n') {
1518                        i += 1;
1519                        pos += 1;
1520                    }
1521                    break;
1522                }
1523                after_delimiter.push(c);
1524                i += 1;
1525                pos += c.len_utf8();
1526            }
1527
1528            // Collect content until delimiter on its own line.
1529            // `body_start_offset` is the byte position of the first char of
1530            // the body in the source — first char after the newline that
1531            // ended the delimiter line. See HeredocReplacement docs for
1532            // coordinate-system caveat (arith-preprocessed, not original).
1533            let body_start_offset = pos;
1534            let mut content = String::new();
1535            let mut current_line = String::new();
1536
1537            loop {
1538                let next = chars_vec.get(i).copied();
1539                match next {
1540                    Some('\n') => {
1541                        i += 1;
1542                        pos += 1;
1543                        // Check if this line is the delimiter
1544                        let trimmed = if strip_tabs {
1545                            current_line.trim_start_matches('\t')
1546                        } else {
1547                            &current_line
1548                        };
1549                        if trimmed == delimiter {
1550                            // Found end of here-doc
1551                            break;
1552                        }
1553                        // Add line to content (including empty lines)
1554                        content.push_str(&current_line);
1555                        content.push('\n');
1556                        current_line.clear();
1557                    }
1558                    Some('\r') => {
1559                        i += 1;
1560                        pos += 1;
1561                        // Detect CRLF vs bare CR. We strip the line ending
1562                        // for delimiter matching (so `EOF\r` still matches
1563                        // `EOF`) but preserve the original byte sequence in
1564                        // the body content — the user's input is honored
1565                        // verbatim.
1566                        let crlf = chars_vec.get(i) == Some(&'\n');
1567                        if crlf {
1568                            i += 1;
1569                            pos += 1;
1570                        }
1571                        let trimmed = if strip_tabs {
1572                            current_line.trim_start_matches('\t')
1573                        } else {
1574                            &current_line
1575                        };
1576                        if trimmed == delimiter {
1577                            break;
1578                        }
1579                        content.push_str(&current_line);
1580                        content.push_str(if crlf { "\r\n" } else { "\r" });
1581                        current_line.clear();
1582                    }
1583                    Some(c) => {
1584                        current_line.push(c);
1585                        i += 1;
1586                        pos += c.len_utf8();
1587                    }
1588                    None => {
1589                        // EOF — check if current line is the delimiter (matches
1590                        // when the source ends without a trailing newline).
1591                        let trimmed = if strip_tabs {
1592                            current_line.trim_start_matches('\t')
1593                        } else {
1594                            &current_line
1595                        };
1596                        if trimmed == delimiter {
1597                            break;
1598                        }
1599                        // Not a delimiter — the heredoc was never closed.
1600                        // Crash rather than silently using whatever we
1601                        // collected: missing data is exactly the failure
1602                        // mode where silent fallback masks the bug.
1603                        let span_end = introducer_start
1604                            + 2
1605                            + if strip_tabs { 1 } else { 0 }
1606                            + delimiter.len();
1607                        return Err(Spanned::new(
1608                            LexerError::UnterminatedHeredoc {
1609                                delimiter: delimiter.clone(),
1610                            },
1611                            introducer_start..span_end,
1612                        ));
1613                    }
1614                }
1615            }
1616
1617            // Create a unique marker for this here-doc (collision-resistant)
1618            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1619            heredocs.push(HeredocReplacement {
1620                marker: marker.clone(),
1621                body: content,
1622                literal: quoted,
1623                strip_tabs,
1624                body_start_offset,
1625            });
1626
1627            // Output <<marker first, then any text that followed the delimiter
1628            // (e.g., " | jq") so the heredoc attaches to the correct command.
1629            result.push_str("<<");
1630            result.push_str(&marker);
1631            result.push_str(&after_delimiter);
1632            result.push('\n');
1633        } else {
1634            result.push(ch);
1635            i += 1;
1636            pos += ch.len_utf8();
1637        }
1638    }
1639
1640    Ok((result, heredocs))
1641}
1642
1643/// Extract the text contribution of a token for colon-adjacent merging.
1644///
1645/// Returns `Some(text)` for token types that can participate in word-like
1646/// merging, `None` for everything else.
1647fn mergeable_text(token: &Token) -> Option<String> {
1648    match token {
1649        Token::Ident(s) => Some(s.clone()),
1650        Token::NumberIdent(s) => Some(s.clone()),
1651        Token::DottedIdent(s) => Some(s.clone()),
1652        Token::Colon => Some(":".to_string()),
1653        Token::Int(n) => Some(n.to_string()),
1654        Token::Path(p) => Some(p.clone()),
1655        Token::Float(f) => Some(f.to_string()),
1656        _ => None,
1657    }
1658}
1659
1660/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1661///
1662/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1663/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1664///
1665/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1666/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1667/// colons or standalone tokens pass through unchanged.
1668fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1669    if tokens.is_empty() {
1670        return tokens;
1671    }
1672
1673    let mut result = Vec::with_capacity(tokens.len());
1674    let mut run: Vec<&Spanned<Token>> = Vec::new();
1675
1676    for token in &tokens {
1677        if run.is_empty() {
1678            if mergeable_text(&token.token).is_some() {
1679                run.push(token);
1680            } else {
1681                result.push(token.clone());
1682            }
1683            continue;
1684        }
1685
1686        // Check span adjacency: previous run's last token ends where this one starts
1687        // Safety: run is non-empty (checked above)
1688        let Some(last) = run.last() else { unreachable!() };
1689        let adjacent = last.span.end == token.span.start;
1690
1691        if adjacent && mergeable_text(&token.token).is_some() {
1692            run.push(token);
1693        } else {
1694            flush_colon_run(&mut run, &mut result);
1695            if mergeable_text(&token.token).is_some() {
1696                run.push(token);
1697            } else {
1698                result.push(token.clone());
1699            }
1700        }
1701    }
1702
1703    flush_colon_run(&mut run, &mut result);
1704
1705    result
1706}
1707
1708/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1709fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1710    if run.is_empty() {
1711        return;
1712    }
1713
1714    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1715
1716    if run.len() >= 2 && has_colon {
1717        let text: String = run
1718            .iter()
1719            .filter_map(|t| mergeable_text(&t.token))
1720            .collect();
1721        // Safety: run.len() >= 2 so first/last exist
1722        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1723        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1724        result.push(Spanned::new(Token::Ident(text), start..end));
1725    } else {
1726        for t in run.iter() {
1727            result.push((*t).clone());
1728        }
1729    }
1730
1731    run.clear();
1732}
1733
1734/// Extract the text contribution of a token that can participate in a glob word.
1735///
1736/// Returns `Some(text)` for tokens that can be part of a glob pattern (identifiers,
1737/// wildcard chars, brackets, paths, etc.), `None` for structural tokens.
1738fn glob_mergeable_text(token: &Token) -> Option<String> {
1739    match token {
1740        Token::Star => Some("*".to_string()),
1741        Token::Question => Some("?".to_string()),
1742        Token::Dot => Some(".".to_string()),
1743        Token::DotDot => Some("..".to_string()),
1744        Token::Ident(s) => Some(s.clone()),
1745        Token::NumberIdent(s) => Some(s.clone()),
1746        Token::DottedIdent(s) => Some(s.clone()),
1747        Token::Path(s) => Some(s.clone()),
1748        Token::Int(n) => Some(n.to_string()),
1749        Token::LBracket => Some("[".to_string()),
1750        Token::RBracket => Some("]".to_string()),
1751        Token::Bang => Some("!".to_string()),
1752        Token::DotSlashPath(s) => Some(s.clone()),
1753        Token::RelativePath(s) => Some(s.clone()),
1754        Token::TildePath(s) => Some(s.clone()),
1755        Token::Tilde => Some("~".to_string()),
1756        Token::LBrace => Some("{".to_string()),
1757        Token::RBrace => Some("}".to_string()),
1758        Token::Comma => Some(",".to_string()),
1759        _ => None,
1760    }
1761}
1762
1763/// Merge span-adjacent token runs containing glob metacharacters into `GlobWord` tokens.
1764///
1765/// A run is merged into `GlobWord` when it contains at least one `Star`, `Question`,
1766/// or a `LBracket`+`RBracket` pair. Runs without glob chars pass through unchanged.
1767///
1768/// Runs after colon merge: `foo::bar` stays as `Ident("foo::bar")` because colon merge
1769/// already fused it before this pass sees it.
1770fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1771    if tokens.is_empty() {
1772        return tokens;
1773    }
1774
1775    let mut result = Vec::with_capacity(tokens.len());
1776    let mut run: Vec<&Spanned<Token>> = Vec::new();
1777
1778    for token in &tokens {
1779        if run.is_empty() {
1780            if glob_mergeable_text(&token.token).is_some() {
1781                run.push(token);
1782            } else {
1783                result.push(token.clone());
1784            }
1785            continue;
1786        }
1787
1788        // Safety: run is non-empty (checked at top of loop)
1789        let Some(last) = run.last() else { unreachable!() };
1790        let adjacent = last.span.end == token.span.start;
1791
1792        if adjacent && glob_mergeable_text(&token.token).is_some() {
1793            run.push(token);
1794        } else {
1795            flush_glob_run(&mut run, &mut result);
1796            if glob_mergeable_text(&token.token).is_some() {
1797                run.push(token);
1798            } else {
1799                result.push(token.clone());
1800            }
1801        }
1802    }
1803
1804    flush_glob_run(&mut run, &mut result);
1805
1806    result
1807}
1808
1809/// Flush a run of glob-mergeable tokens: merge if it contains glob metacharacters.
1810fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1811    if run.is_empty() {
1812        return;
1813    }
1814
1815    let has_glob = run.iter().any(|t| {
1816        matches!(t.token, Token::Star | Token::Question)
1817    }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1818        && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1819
1820    if run.len() >= 2 && has_glob {
1821        let text: String = run
1822            .iter()
1823            .filter_map(|t| glob_mergeable_text(&t.token))
1824            .collect();
1825        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1826        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1827        result.push(Spanned::new(Token::GlobWord(text), start..end));
1828    } else {
1829        for t in run.iter() {
1830            result.push((*t).clone());
1831        }
1832    }
1833
1834    run.clear();
1835}
1836
1837/// Tokenize source code into a vector of spanned tokens.
1838///
1839/// Skips whitespace and comments (unless you need them for formatting).
1840/// Returns errors with their positions for nice error messages.
1841///
1842/// Handles:
1843/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
1844/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
1845/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
1846pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1847    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
1848    let arith_result = preprocess_arithmetic(source)
1849        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1850
1851    // Then preprocess here-docs. Spans inside the heredoc preprocessor are in
1852    // arith-preprocessed coords; correct back to original-source coords before
1853    // surfacing the error to keep parser diagnostics aligned with source.
1854    let span_replacements = arith_result.replacements;
1855    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text)
1856        .map_err(|e| {
1857            let span = correct_span(e.span, &span_replacements);
1858            vec![Spanned::new(e.token, span)]
1859        })?;
1860
1861    let lexer = Token::lexer(&preprocessed);
1862    let mut tokens = Vec::new();
1863    let mut errors = Vec::new();
1864
1865    for (result, span) in lexer.spanned() {
1866        // Correct the span from preprocessed coordinates to original coordinates
1867        let corrected_span = correct_span(span, &span_replacements);
1868        match result {
1869            Ok(token) => {
1870                // Skip comments and line continuations - they're not needed for parsing
1871                if !matches!(token, Token::Comment | Token::LineContinuation) {
1872                    tokens.push(Spanned::new(token, corrected_span));
1873                }
1874            }
1875            Err(err) => {
1876                errors.push(Spanned::new(err, corrected_span));
1877            }
1878        }
1879    }
1880
1881    if !errors.is_empty() {
1882        return Err(errors);
1883    }
1884
1885    // Post-process: replace markers with actual token content
1886    let mut final_tokens = Vec::with_capacity(tokens.len());
1887    let mut i = 0;
1888
1889    while i < tokens.len() {
1890        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
1891        if let Token::Ident(ref name) = tokens[i].token
1892            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1893                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1894                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1895                    i += 1;
1896                    continue;
1897                }
1898
1899        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
1900        if matches!(tokens[i].token, Token::HereDocStart) {
1901            // Check if next token is a heredoc marker
1902            if i + 1 < tokens.len()
1903                && let Token::Ident(ref name) = tokens[i + 1].token
1904                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1905                        // Find the corresponding content
1906                        if let Some(hd) = heredocs.iter().find(|h| h.marker == *name) {
1907                            // Re-thread arithmetic markers that the arith
1908                            // preprocessor planted in the source — without
1909                            // this, `<<EOF\n$((1+2))\nEOF` materializes the
1910                            // marker text instead of `3`. Mirrors the
1911                            // String-content translation a few lines below.
1912                            // - Literal heredocs (no expansion): restore the
1913                            //   original `$((expr))` text verbatim.
1914                            // - Interpolated heredocs: wrap as
1915                            //   `${__ARITH:expr__}` so the spanned
1916                            //   interpolation parser turns it into a
1917                            //   StringPart::Arithmetic.
1918                            let mut content = hd.body.clone();
1919                            for (marker, expr) in &arith_result.arithmetics {
1920                                if content.contains(marker) {
1921                                    let replacement = if hd.literal {
1922                                        format!("$(({}))", expr)
1923                                    } else {
1924                                        format!("${{__ARITH:{}__}}", expr)
1925                                    };
1926                                    content = content.replace(marker, &replacement);
1927                                }
1928                            }
1929                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1930                            final_tokens.push(Spanned::new(
1931                                Token::HereDoc(HereDocData {
1932                                    content,
1933                                    literal: hd.literal,
1934                                    strip_tabs: hd.strip_tabs,
1935                                    body_start_offset: hd.body_start_offset,
1936                                }),
1937                                tokens[i + 1].span.clone(),
1938                            ));
1939                            i += 2;
1940                            continue;
1941                        }
1942                    }
1943        }
1944
1945        // Check for arithmetic markers inside string content
1946        let token = if let Token::String(ref s) = tokens[i].token {
1947            // Check if string contains any arithmetic markers
1948            let mut new_content = s.clone();
1949            for (marker, expr) in &arith_result.arithmetics {
1950                if new_content.contains(marker) {
1951                    // Replace marker with the special format that parse_interpolated_string can detect
1952                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
1953                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1954                }
1955            }
1956            if new_content != *s {
1957                Spanned::new(Token::String(new_content), tokens[i].span.clone())
1958            } else {
1959                tokens[i].clone()
1960            }
1961        } else {
1962            tokens[i].clone()
1963        };
1964        final_tokens.push(token);
1965        i += 1;
1966    }
1967
1968    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
1969}
1970
1971/// Tokenize source code, preserving comments.
1972///
1973/// Useful for pretty-printing or formatting tools that need to preserve comments.
1974pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1975    let lexer = Token::lexer(source);
1976    let mut tokens = Vec::new();
1977    let mut errors = Vec::new();
1978
1979    for (result, span) in lexer.spanned() {
1980        match result {
1981            Ok(token) => {
1982                tokens.push(Spanned::new(token, span));
1983            }
1984            Err(err) => {
1985                errors.push(Spanned::new(err, span));
1986            }
1987        }
1988    }
1989
1990    if errors.is_empty() {
1991        Ok(tokens)
1992    } else {
1993        Err(errors)
1994    }
1995}
1996
1997/// Extract the string content from a string token (removes quotes, processes escapes).
1998pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1999    // Remove surrounding quotes
2000    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
2001        return Err(LexerError::UnterminatedString);
2002    }
2003
2004    let inner = &source[1..source.len() - 1];
2005    let mut result = String::with_capacity(inner.len());
2006    let mut chars = inner.chars().peekable();
2007
2008    while let Some(ch) = chars.next() {
2009        if ch == '\\' {
2010            match chars.next() {
2011                Some('n') => result.push('\n'),
2012                Some('t') => result.push('\t'),
2013                Some('r') => result.push('\r'),
2014                Some('\\') => result.push('\\'),
2015                Some('"') => result.push('"'),
2016                // Use a unique marker for escaped dollar that won't be re-interpreted
2017                // parse_interpolated_string will convert this back to $
2018                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
2019                Some('u') => {
2020                    // Unicode escape: \uXXXX
2021                    let mut hex = String::with_capacity(4);
2022                    for _ in 0..4 {
2023                        match chars.next() {
2024                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
2025                            _ => return Err(LexerError::InvalidEscape),
2026                        }
2027                    }
2028                    let codepoint = u32::from_str_radix(&hex, 16)
2029                        .map_err(|_| LexerError::InvalidEscape)?;
2030                    let ch = char::from_u32(codepoint)
2031                        .ok_or(LexerError::InvalidEscape)?;
2032                    result.push(ch);
2033                }
2034                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
2035                Some(next) => {
2036                    result.push('\\');
2037                    result.push(next);
2038                }
2039                None => return Err(LexerError::InvalidEscape),
2040            }
2041        } else {
2042            result.push(ch);
2043        }
2044    }
2045
2046    Ok(result)
2047}
2048
2049/// Parse a variable reference, extracting the path segments.
2050/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
2051pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
2052    // Remove ${ and }
2053    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
2054        return Err(LexerError::UnterminatedVarRef);
2055    }
2056
2057    let inner = &source[2..source.len() - 1];
2058
2059    // Special case: $? (last result)
2060    if inner == "?" {
2061        return Ok(vec!["?".to_string()]);
2062    }
2063
2064    let mut segments = Vec::new();
2065    let mut current = String::new();
2066    let mut chars = inner.chars().peekable();
2067
2068    while let Some(ch) = chars.next() {
2069        match ch {
2070            '.' => {
2071                if !current.is_empty() {
2072                    segments.push(current.clone());
2073                    current.clear();
2074                }
2075            }
2076            '[' => {
2077                if !current.is_empty() {
2078                    segments.push(current.clone());
2079                    current.clear();
2080                }
2081                // Collect the index
2082                let mut index = String::from("[");
2083                while let Some(&c) = chars.peek() {
2084                    if let Some(c) = chars.next() {
2085                        index.push(c);
2086                    }
2087                    if c == ']' {
2088                        break;
2089                    }
2090                }
2091                segments.push(index);
2092            }
2093            _ => {
2094                current.push(ch);
2095            }
2096        }
2097    }
2098
2099    if !current.is_empty() {
2100        segments.push(current);
2101    }
2102
2103    Ok(segments)
2104}
2105
2106/// Parse an integer literal.
2107pub fn parse_int(source: &str) -> Result<i64, LexerError> {
2108    source.parse().map_err(|_| LexerError::InvalidNumber)
2109}
2110
2111/// Parse a float literal.
2112pub fn parse_float(source: &str) -> Result<f64, LexerError> {
2113    source.parse().map_err(|_| LexerError::InvalidNumber)
2114}
2115
2116#[cfg(test)]
2117#[allow(clippy::approx_constant)]
2118mod tests {
2119    use super::*;
2120
2121    fn lex(source: &str) -> Vec<Token> {
2122        tokenize(source)
2123            .expect("lexer should succeed")
2124            .into_iter()
2125            .map(|s| s.token)
2126            .collect()
2127    }
2128
2129    // ═══════════════════════════════════════════════════════════════════
2130    // Keyword tests
2131    // ═══════════════════════════════════════════════════════════════════
2132
2133    #[test]
2134    fn keywords() {
2135        assert_eq!(lex("set"), vec![Token::Set]);
2136        assert_eq!(lex("if"), vec![Token::If]);
2137        assert_eq!(lex("then"), vec![Token::Then]);
2138        assert_eq!(lex("else"), vec![Token::Else]);
2139        assert_eq!(lex("elif"), vec![Token::Elif]);
2140        assert_eq!(lex("fi"), vec![Token::Fi]);
2141        assert_eq!(lex("for"), vec![Token::For]);
2142        assert_eq!(lex("in"), vec![Token::In]);
2143        assert_eq!(lex("do"), vec![Token::Do]);
2144        assert_eq!(lex("done"), vec![Token::Done]);
2145        assert_eq!(lex("case"), vec![Token::Case]);
2146        assert_eq!(lex("esac"), vec![Token::Esac]);
2147        assert_eq!(lex("function"), vec![Token::Function]);
2148        assert_eq!(lex("true"), vec![Token::True]);
2149        assert_eq!(lex("false"), vec![Token::False]);
2150    }
2151
2152    #[test]
2153    fn double_semicolon() {
2154        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
2155        // In case pattern context
2156        assert_eq!(lex("echo \"hi\";;"), vec![
2157            Token::Ident("echo".to_string()),
2158            Token::String("hi".to_string()),
2159            Token::DoubleSemi,
2160        ]);
2161    }
2162
2163    #[test]
2164    fn type_keywords() {
2165        assert_eq!(lex("string"), vec![Token::TypeString]);
2166        assert_eq!(lex("int"), vec![Token::TypeInt]);
2167        assert_eq!(lex("float"), vec![Token::TypeFloat]);
2168        assert_eq!(lex("bool"), vec![Token::TypeBool]);
2169    }
2170
2171    // ═══════════════════════════════════════════════════════════════════
2172    // Operator tests
2173    // ═══════════════════════════════════════════════════════════════════
2174
2175    #[test]
2176    fn single_char_operators() {
2177        assert_eq!(lex("="), vec![Token::Eq]);
2178        assert_eq!(lex("|"), vec![Token::Pipe]);
2179        assert_eq!(lex("&"), vec![Token::Amp]);
2180        assert_eq!(lex(">"), vec![Token::Gt]);
2181        assert_eq!(lex("<"), vec![Token::Lt]);
2182        assert_eq!(lex(";"), vec![Token::Semi]);
2183        assert_eq!(lex(":"), vec![Token::Colon]);
2184        assert_eq!(lex(","), vec![Token::Comma]);
2185        assert_eq!(lex("."), vec![Token::Dot]);
2186    }
2187
2188    #[test]
2189    fn multi_char_operators() {
2190        assert_eq!(lex("&&"), vec![Token::And]);
2191        assert_eq!(lex("||"), vec![Token::Or]);
2192        assert_eq!(lex("=="), vec![Token::EqEq]);
2193        assert_eq!(lex("!="), vec![Token::NotEq]);
2194        assert_eq!(lex("=~"), vec![Token::Match]);
2195        assert_eq!(lex("!~"), vec![Token::NotMatch]);
2196        assert_eq!(lex(">="), vec![Token::GtEq]);
2197        assert_eq!(lex("<="), vec![Token::LtEq]);
2198        assert_eq!(lex(">>"), vec![Token::GtGt]);
2199        assert_eq!(lex("2>"), vec![Token::Stderr]);
2200        assert_eq!(lex("&>"), vec![Token::Both]);
2201    }
2202
2203    #[test]
2204    fn brackets() {
2205        assert_eq!(lex("{"), vec![Token::LBrace]);
2206        assert_eq!(lex("}"), vec![Token::RBrace]);
2207        assert_eq!(lex("["), vec![Token::LBracket]);
2208        assert_eq!(lex("]"), vec![Token::RBracket]);
2209        assert_eq!(lex("("), vec![Token::LParen]);
2210        assert_eq!(lex(")"), vec![Token::RParen]);
2211    }
2212
2213    // ═══════════════════════════════════════════════════════════════════
2214    // Literal tests
2215    // ═══════════════════════════════════════════════════════════════════
2216
2217    #[test]
2218    fn integers() {
2219        assert_eq!(lex("0"), vec![Token::Int(0)]);
2220        assert_eq!(lex("42"), vec![Token::Int(42)]);
2221        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
2222        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
2223    }
2224
2225    #[test]
2226    fn floats() {
2227        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
2228        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
2229        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
2230    }
2231
2232    #[test]
2233    fn strings() {
2234        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
2235        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
2236        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
2237        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
2238        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
2239    }
2240
2241    #[test]
2242    fn var_refs() {
2243        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
2244        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
2245        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
2246        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
2247    }
2248
2249    // ═══════════════════════════════════════════════════════════════════
2250    // Identifier tests
2251    // ═══════════════════════════════════════════════════════════════════
2252
2253    #[test]
2254    fn identifiers() {
2255        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
2256        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
2257        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2258        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2259        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2260    }
2261
2262    #[test]
2263    fn keyword_prefix_identifiers() {
2264        // Identifiers that start with keywords but aren't keywords
2265        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2266        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2267        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2268        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2269        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2270    }
2271
2272    // ═══════════════════════════════════════════════════════════════════
2273    // Statement tests
2274    // ═══════════════════════════════════════════════════════════════════
2275
2276    #[test]
2277    fn assignment() {
2278        assert_eq!(
2279            lex("set X = 5"),
2280            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2281        );
2282    }
2283
2284    #[test]
2285    fn command_simple() {
2286        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2287        assert_eq!(
2288            lex(r#"echo "hello""#),
2289            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2290        );
2291    }
2292
2293    #[test]
2294    fn command_with_args() {
2295        assert_eq!(
2296            lex("cmd arg1 arg2"),
2297            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2298        );
2299    }
2300
2301    #[test]
2302    fn command_with_named_args() {
2303        assert_eq!(
2304            lex("cmd key=value"),
2305            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2306        );
2307    }
2308
2309    #[test]
2310    fn pipeline() {
2311        assert_eq!(
2312            lex("a | b | c"),
2313            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2314        );
2315    }
2316
2317    #[test]
2318    fn if_statement() {
2319        assert_eq!(
2320            lex("if true; then echo; fi"),
2321            vec![
2322                Token::If,
2323                Token::True,
2324                Token::Semi,
2325                Token::Then,
2326                Token::Ident("echo".to_string()),
2327                Token::Semi,
2328                Token::Fi
2329            ]
2330        );
2331    }
2332
2333    #[test]
2334    fn for_loop() {
2335        assert_eq!(
2336            lex("for X in items; do echo; done"),
2337            vec![
2338                Token::For,
2339                Token::Ident("X".to_string()),
2340                Token::In,
2341                Token::Ident("items".to_string()),
2342                Token::Semi,
2343                Token::Do,
2344                Token::Ident("echo".to_string()),
2345                Token::Semi,
2346                Token::Done
2347            ]
2348        );
2349    }
2350
2351    // ═══════════════════════════════════════════════════════════════════
2352    // Whitespace and newlines
2353    // ═══════════════════════════════════════════════════════════════════
2354
2355    #[test]
2356    fn whitespace_ignored() {
2357        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
2358    }
2359
2360    #[test]
2361    fn newlines_preserved() {
2362        let tokens = lex("a\nb");
2363        assert_eq!(
2364            tokens,
2365            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2366        );
2367    }
2368
2369    #[test]
2370    fn multiple_newlines() {
2371        let tokens = lex("a\n\n\nb");
2372        assert_eq!(
2373            tokens,
2374            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2375        );
2376    }
2377
2378    // ═══════════════════════════════════════════════════════════════════
2379    // Comments
2380    // ═══════════════════════════════════════════════════════════════════
2381
2382    #[test]
2383    fn comments_skipped() {
2384        assert_eq!(lex("# comment"), vec![]);
2385        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2386        assert_eq!(
2387            lex("a # comment\nb"),
2388            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2389        );
2390    }
2391
2392    #[test]
2393    fn comments_preserved_when_requested() {
2394        let tokens = tokenize_with_comments("a # comment")
2395            .expect("should succeed")
2396            .into_iter()
2397            .map(|s| s.token)
2398            .collect::<Vec<_>>();
2399        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2400    }
2401
2402    // ═══════════════════════════════════════════════════════════════════
2403    // String parsing
2404    // ═══════════════════════════════════════════════════════════════════
2405
2406    #[test]
2407    fn parse_simple_string() {
2408        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2409    }
2410
2411    #[test]
2412    fn parse_string_with_escapes() {
2413        assert_eq!(
2414            parse_string_literal(r#""hello\nworld""#).expect("ok"),
2415            "hello\nworld"
2416        );
2417        assert_eq!(
2418            parse_string_literal(r#""tab\there""#).expect("ok"),
2419            "tab\there"
2420        );
2421        assert_eq!(
2422            parse_string_literal(r#""quote\"here""#).expect("ok"),
2423            "quote\"here"
2424        );
2425    }
2426
2427    #[test]
2428    fn parse_string_with_unicode() {
2429        assert_eq!(
2430            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2431            "emoji ❤"
2432        );
2433    }
2434
2435    #[test]
2436    fn parse_string_with_escaped_dollar() {
2437        // \$ produces a marker that parse_interpolated_string will convert to $
2438        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
2439        assert_eq!(
2440            parse_string_literal(r#""\$VAR""#).expect("ok"),
2441            "__KAISH_ESCAPED_DOLLAR__VAR"
2442        );
2443        assert_eq!(
2444            parse_string_literal(r#""cost: \$100""#).expect("ok"),
2445            "cost: __KAISH_ESCAPED_DOLLAR__100"
2446        );
2447    }
2448
2449    // ═══════════════════════════════════════════════════════════════════
2450    // Variable reference parsing
2451    // ═══════════════════════════════════════════════════════════════════
2452
2453    #[test]
2454    fn parse_simple_var() {
2455        assert_eq!(
2456            parse_var_ref("${X}").expect("ok"),
2457            vec!["X"]
2458        );
2459    }
2460
2461    #[test]
2462    fn parse_var_with_field() {
2463        assert_eq!(
2464            parse_var_ref("${VAR.field}").expect("ok"),
2465            vec!["VAR", "field"]
2466        );
2467    }
2468
2469    #[test]
2470    fn parse_var_with_index() {
2471        assert_eq!(
2472            parse_var_ref("${VAR[0]}").expect("ok"),
2473            vec!["VAR", "[0]"]
2474        );
2475    }
2476
2477    #[test]
2478    fn parse_var_nested() {
2479        assert_eq!(
2480            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2481            vec!["VAR", "field", "[0]", "nested"]
2482        );
2483    }
2484
2485    #[test]
2486    fn parse_last_result() {
2487        assert_eq!(
2488            parse_var_ref("${?}").expect("ok"),
2489            vec!["?"]
2490        );
2491    }
2492
2493    // ═══════════════════════════════════════════════════════════════════
2494    // Number parsing
2495    // ═══════════════════════════════════════════════════════════════════
2496
2497    #[test]
2498    fn parse_integers() {
2499        assert_eq!(parse_int("0").expect("ok"), 0);
2500        assert_eq!(parse_int("42").expect("ok"), 42);
2501        assert_eq!(parse_int("-1").expect("ok"), -1);
2502    }
2503
2504    #[test]
2505    fn parse_floats() {
2506        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2507        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2508    }
2509
2510    // ═══════════════════════════════════════════════════════════════════
2511    // Edge cases and errors
2512    // ═══════════════════════════════════════════════════════════════════
2513
2514    #[test]
2515    fn empty_input() {
2516        assert_eq!(lex(""), vec![]);
2517    }
2518
2519    #[test]
2520    fn only_whitespace() {
2521        assert_eq!(lex("   \t\t   "), vec![]);
2522    }
2523
2524    #[test]
2525    fn json_array() {
2526        assert_eq!(
2527            lex(r#"[1, 2, 3]"#),
2528            vec![
2529                Token::LBracket,
2530                Token::Int(1),
2531                Token::Comma,
2532                Token::Int(2),
2533                Token::Comma,
2534                Token::Int(3),
2535                Token::RBracket
2536            ]
2537        );
2538    }
2539
2540    #[test]
2541    fn json_object() {
2542        assert_eq!(
2543            lex(r#"{"key": "value"}"#),
2544            vec![
2545                Token::LBrace,
2546                Token::String("key".to_string()),
2547                Token::Colon,
2548                Token::String("value".to_string()),
2549                Token::RBrace
2550            ]
2551        );
2552    }
2553
2554    #[test]
2555    fn redirect_operators() {
2556        assert_eq!(
2557            lex("cmd > file"),
2558            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2559        );
2560        assert_eq!(
2561            lex("cmd >> file"),
2562            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2563        );
2564        assert_eq!(
2565            lex("cmd 2> err"),
2566            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2567        );
2568        assert_eq!(
2569            lex("cmd &> all"),
2570            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2571        );
2572    }
2573
2574    #[test]
2575    fn background_job() {
2576        assert_eq!(
2577            lex("cmd &"),
2578            vec![Token::Ident("cmd".to_string()), Token::Amp]
2579        );
2580    }
2581
2582    #[test]
2583    fn command_substitution() {
2584        assert_eq!(
2585            lex("$(cmd)"),
2586            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2587        );
2588        assert_eq!(
2589            lex("$(cmd arg)"),
2590            vec![
2591                Token::CmdSubstStart,
2592                Token::Ident("cmd".to_string()),
2593                Token::Ident("arg".to_string()),
2594                Token::RParen
2595            ]
2596        );
2597        assert_eq!(
2598            lex("$(a | b)"),
2599            vec![
2600                Token::CmdSubstStart,
2601                Token::Ident("a".to_string()),
2602                Token::Pipe,
2603                Token::Ident("b".to_string()),
2604                Token::RParen
2605            ]
2606        );
2607    }
2608
2609    #[test]
2610    fn complex_pipeline() {
2611        assert_eq!(
2612            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2613            vec![
2614                Token::Ident("cat".to_string()),
2615                Token::Ident("file".to_string()),
2616                Token::Pipe,
2617                Token::Ident("grep".to_string()),
2618                Token::Ident("pattern".to_string()),
2619                Token::Eq,
2620                Token::String("foo".to_string()),
2621                Token::Pipe,
2622                Token::Ident("head".to_string()),
2623                Token::Ident("count".to_string()),
2624                Token::Eq,
2625                Token::Int(10),
2626            ]
2627        );
2628    }
2629
2630    // ═══════════════════════════════════════════════════════════════════
2631    // Flag tests
2632    // ═══════════════════════════════════════════════════════════════════
2633
2634    #[test]
2635    fn short_flag() {
2636        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2637        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2638        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2639    }
2640
2641    #[test]
2642    fn short_flag_combined() {
2643        // Combined short flags like -la
2644        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2645        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2646    }
2647
2648    #[test]
2649    fn job_spec_lexes_as_one_token() {
2650        // `%N` is the bash jobspec for wait/kill — used to be a lexer error.
2651        assert_eq!(lex("%1"), vec![Token::JobSpec("%1".to_string())]);
2652        assert_eq!(lex("%12"), vec![Token::JobSpec("%12".to_string())]);
2653        assert_eq!(
2654            lex("wait %1 %2"),
2655            vec![
2656                Token::Ident("wait".to_string()),
2657                Token::JobSpec("%1".to_string()),
2658                Token::JobSpec("%2".to_string()),
2659            ]
2660        );
2661    }
2662
2663    #[test]
2664    fn short_flag_with_internal_hyphens_is_one_token() {
2665        // A dash-word with internal hyphens is ONE shell word, not three
2666        // flags — `-not-a-flag` must not fragment into `-not` `-a` `-flag`.
2667        // (Whether it's a flag or a literal is the binding layer's call.)
2668        assert_eq!(
2669            lex("-not-a-flag"),
2670            vec![Token::ShortFlag("not-a-flag".to_string())]
2671        );
2672        // The two-char terminator `--` is still DoubleDash, and a lone `-`
2673        // is still MinusAlone — the second char must be a letter to start a
2674        // short flag.
2675        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2676        assert_eq!(lex("-"), vec![Token::MinusAlone]);
2677    }
2678
2679    #[test]
2680    fn long_flag() {
2681        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2682        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2683        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2684    }
2685
2686    #[test]
2687    fn double_dash() {
2688        // -- alone marks end of flags
2689        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2690    }
2691
2692    #[test]
2693    fn flags_vs_negative_numbers() {
2694        // -123 should be a negative integer, not a flag
2695        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2696        // -l should be a flag
2697        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2698        // -1a is ambiguous - should be Int(-1) then Ident(a)
2699        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2700        assert_eq!(
2701            lex("-1 a"),
2702            vec![Token::Int(-1), Token::Ident("a".to_string())]
2703        );
2704    }
2705
2706    #[test]
2707    fn command_with_flags() {
2708        assert_eq!(
2709            lex("ls -l"),
2710            vec![
2711                Token::Ident("ls".to_string()),
2712                Token::ShortFlag("l".to_string()),
2713            ]
2714        );
2715        assert_eq!(
2716            lex("git commit -m"),
2717            vec![
2718                Token::Ident("git".to_string()),
2719                Token::Ident("commit".to_string()),
2720                Token::ShortFlag("m".to_string()),
2721            ]
2722        );
2723        assert_eq!(
2724            lex("git push --force"),
2725            vec![
2726                Token::Ident("git".to_string()),
2727                Token::Ident("push".to_string()),
2728                Token::LongFlag("force".to_string()),
2729            ]
2730        );
2731    }
2732
2733    #[test]
2734    fn flag_with_value() {
2735        assert_eq!(
2736            lex(r#"git commit -m "message""#),
2737            vec![
2738                Token::Ident("git".to_string()),
2739                Token::Ident("commit".to_string()),
2740                Token::ShortFlag("m".to_string()),
2741                Token::String("message".to_string()),
2742            ]
2743        );
2744        assert_eq!(
2745            lex(r#"--message="hello""#),
2746            vec![
2747                Token::LongFlag("message".to_string()),
2748                Token::Eq,
2749                Token::String("hello".to_string()),
2750            ]
2751        );
2752    }
2753
2754    #[test]
2755    fn end_of_flags_marker() {
2756        assert_eq!(
2757            lex("git checkout -- file"),
2758            vec![
2759                Token::Ident("git".to_string()),
2760                Token::Ident("checkout".to_string()),
2761                Token::DoubleDash,
2762                Token::Ident("file".to_string()),
2763            ]
2764        );
2765    }
2766
2767    // ═══════════════════════════════════════════════════════════════════
2768    // Bash compatibility tokens
2769    // ═══════════════════════════════════════════════════════════════════
2770
2771    #[test]
2772    fn local_keyword() {
2773        assert_eq!(lex("local"), vec![Token::Local]);
2774        assert_eq!(
2775            lex("local X = 5"),
2776            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2777        );
2778    }
2779
2780    #[test]
2781    fn simple_var_ref() {
2782        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2783        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2784        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2785        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2786    }
2787
2788    #[test]
2789    fn simple_var_ref_in_command() {
2790        assert_eq!(
2791            lex("echo $NAME"),
2792            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2793        );
2794    }
2795
2796    #[test]
2797    fn single_quoted_strings() {
2798        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2799        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2800        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2801        // Single quotes don't process escapes or variables
2802        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2803        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2804    }
2805
2806    #[test]
2807    fn test_brackets() {
2808        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2809        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2810        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2811        assert_eq!(
2812            lex("[[ -f file ]]"),
2813            vec![
2814                Token::LBracket,
2815                Token::LBracket,
2816                Token::ShortFlag("f".to_string()),
2817                Token::Ident("file".to_string()),
2818                Token::RBracket,
2819                Token::RBracket
2820            ]
2821        );
2822    }
2823
2824    #[test]
2825    fn test_expression_syntax() {
2826        assert_eq!(
2827            lex(r#"[[ $X == "value" ]]"#),
2828            vec![
2829                Token::LBracket,
2830                Token::LBracket,
2831                Token::SimpleVarRef("X".to_string()),
2832                Token::EqEq,
2833                Token::String("value".to_string()),
2834                Token::RBracket,
2835                Token::RBracket
2836            ]
2837        );
2838    }
2839
2840    #[test]
2841    fn bash_style_assignment() {
2842        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2843        assert_eq!(
2844            lex(r#"NAME="value""#),
2845            vec![
2846                Token::Ident("NAME".to_string()),
2847                Token::Eq,
2848                Token::String("value".to_string())
2849            ]
2850        );
2851    }
2852
2853    #[test]
2854    fn positional_params() {
2855        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2856        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2857        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2858        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2859        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2860    }
2861
2862    #[test]
2863    fn positional_in_context() {
2864        assert_eq!(
2865            lex("echo $1 $2"),
2866            vec![
2867                Token::Ident("echo".to_string()),
2868                Token::Positional(1),
2869                Token::Positional(2),
2870            ]
2871        );
2872    }
2873
2874    #[test]
2875    fn var_length() {
2876        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2877        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2878        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2879    }
2880
2881    #[test]
2882    fn var_length_in_context() {
2883        assert_eq!(
2884            lex("echo ${#NAME}"),
2885            vec![
2886                Token::Ident("echo".to_string()),
2887                Token::VarLength("NAME".to_string()),
2888            ]
2889        );
2890    }
2891
2892    // ═══════════════════════════════════════════════════════════════════
2893    // Edge case tests: Flag ambiguities
2894    // ═══════════════════════════════════════════════════════════════════
2895
2896    #[test]
2897    fn plus_flag() {
2898        // Plus flags for set +e
2899        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2900        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2901        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2902    }
2903
2904    #[test]
2905    fn set_with_plus_flag() {
2906        assert_eq!(
2907            lex("set +e"),
2908            vec![
2909                Token::Set,
2910                Token::PlusFlag("e".to_string()),
2911            ]
2912        );
2913    }
2914
2915    #[test]
2916    fn set_with_multiple_flags() {
2917        assert_eq!(
2918            lex("set -e -u"),
2919            vec![
2920                Token::Set,
2921                Token::ShortFlag("e".to_string()),
2922                Token::ShortFlag("u".to_string()),
2923            ]
2924        );
2925    }
2926
2927    #[test]
2928    fn flags_vs_negative_numbers_edge_cases() {
2929        // -1a should be negative int followed by ident
2930        assert_eq!(
2931            lex("-1 a"),
2932            vec![Token::Int(-1), Token::Ident("a".to_string())]
2933        );
2934        // -l is a flag
2935        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2936        // -123 is negative number
2937        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2938    }
2939
2940    #[test]
2941    fn single_dash_is_minus_alone() {
2942        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2943        let result = tokenize("-").expect("should lex");
2944        assert_eq!(result.len(), 1);
2945        assert!(matches!(result[0].token, Token::MinusAlone));
2946    }
2947
2948    #[test]
2949    fn plus_bare_for_date_format() {
2950        // `date +%s` - the +%s should be PlusBare
2951        let result = tokenize("+%s").expect("should lex");
2952        assert_eq!(result.len(), 1);
2953        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2954
2955        // `date +%Y-%m-%d` - format string with dashes
2956        let result = tokenize("+%Y-%m-%d").expect("should lex");
2957        assert_eq!(result.len(), 1);
2958        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2959    }
2960
2961    #[test]
2962    fn plus_flag_still_works() {
2963        // `set +e` - should still be PlusFlag
2964        let result = tokenize("+e").expect("should lex");
2965        assert_eq!(result.len(), 1);
2966        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2967    }
2968
2969    #[test]
2970    fn while_keyword_vs_while_loop() {
2971        // 'while' as keyword in loop context
2972        assert_eq!(lex("while"), vec![Token::While]);
2973        // 'while' at start followed by condition
2974        assert_eq!(
2975            lex("while true"),
2976            vec![Token::While, Token::True]
2977        );
2978    }
2979
2980    #[test]
2981    fn control_flow_keywords() {
2982        assert_eq!(lex("break"), vec![Token::Break]);
2983        assert_eq!(lex("continue"), vec![Token::Continue]);
2984        assert_eq!(lex("return"), vec![Token::Return]);
2985        assert_eq!(lex("exit"), vec![Token::Exit]);
2986    }
2987
2988    #[test]
2989    fn control_flow_with_numbers() {
2990        assert_eq!(
2991            lex("break 2"),
2992            vec![Token::Break, Token::Int(2)]
2993        );
2994        assert_eq!(
2995            lex("continue 3"),
2996            vec![Token::Continue, Token::Int(3)]
2997        );
2998        assert_eq!(
2999            lex("exit 1"),
3000            vec![Token::Exit, Token::Int(1)]
3001        );
3002    }
3003
3004    // ═══════════════════════════════════════════════════════════════════
3005    // Here-doc tests
3006    // ═══════════════════════════════════════════════════════════════════
3007
3008    #[test]
3009    fn heredoc_simple() {
3010        let source = "cat <<EOF\nhello\nworld\nEOF";
3011        let tokens = lex(source);
3012        // body_start_offset = byte offset of 'h' in "hello", i.e. just after "cat <<EOF\n"
3013        assert_eq!(tokens, vec![
3014            Token::Ident("cat".to_string()),
3015            Token::HereDocStart,
3016            Token::HereDoc(HereDocData {
3017                content: "hello\nworld\n".to_string(),
3018                literal: false,
3019                strip_tabs: false,
3020                body_start_offset: 10,
3021            }),
3022            Token::Newline,
3023        ]);
3024    }
3025
3026    #[test]
3027    fn heredoc_empty() {
3028        let source = "cat <<EOF\nEOF";
3029        let tokens = lex(source);
3030        assert_eq!(tokens, vec![
3031            Token::Ident("cat".to_string()),
3032            Token::HereDocStart,
3033            Token::HereDoc(HereDocData {
3034                content: "".to_string(),
3035                literal: false,
3036                strip_tabs: false,
3037                body_start_offset: 10,
3038            }),
3039            Token::Newline,
3040        ]);
3041    }
3042
3043    #[test]
3044    fn heredoc_with_special_chars() {
3045        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
3046        let tokens = lex(source);
3047        assert_eq!(tokens, vec![
3048            Token::Ident("cat".to_string()),
3049            Token::HereDocStart,
3050            Token::HereDoc(HereDocData {
3051                content: "$VAR and \"quoted\" 'single'\n".to_string(),
3052                literal: false,
3053                strip_tabs: false,
3054                body_start_offset: 10,
3055            }),
3056            Token::Newline,
3057        ]);
3058    }
3059
3060    #[test]
3061    fn heredoc_multiline() {
3062        let source = "cat <<END\nline1\nline2\nline3\nEND";
3063        let tokens = lex(source);
3064        assert_eq!(tokens, vec![
3065            Token::Ident("cat".to_string()),
3066            Token::HereDocStart,
3067            Token::HereDoc(HereDocData {
3068                content: "line1\nline2\nline3\n".to_string(),
3069                literal: false,
3070                strip_tabs: false,
3071                body_start_offset: 10,
3072            }),
3073            Token::Newline,
3074        ]);
3075    }
3076
3077    #[test]
3078    fn heredoc_in_command() {
3079        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
3080        let tokens = lex(source);
3081        assert_eq!(tokens, vec![
3082            Token::Ident("cat".to_string()),
3083            Token::HereDocStart,
3084            Token::HereDoc(HereDocData {
3085                content: "hello\n".to_string(),
3086                literal: false,
3087                strip_tabs: false,
3088                body_start_offset: 10,
3089            }),
3090            Token::Newline,
3091            Token::Ident("echo".to_string()),
3092            Token::Ident("goodbye".to_string()),
3093        ]);
3094    }
3095
3096    #[test]
3097    fn heredoc_strip_tabs() {
3098        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
3099        let tokens = lex(source);
3100        // Content keeps tabs verbatim — strip_tabs is recorded on the token so
3101        // the interpreter can apply POSIX leading-tab stripping at materialization
3102        // without disturbing source byte offsets used for span tracking.
3103        assert_eq!(tokens, vec![
3104            Token::Ident("cat".to_string()),
3105            Token::HereDocStart,
3106            Token::HereDoc(HereDocData {
3107                content: "\thello\n\tworld\n".to_string(),
3108                literal: false,
3109                strip_tabs: true,
3110                body_start_offset: 11,
3111            }),
3112            Token::Newline,
3113        ]);
3114    }
3115
3116    // ═══════════════════════════════════════════════════════════════════
3117    // Arithmetic expression tests
3118    // ═══════════════════════════════════════════════════════════════════
3119
3120    #[test]
3121    fn arithmetic_simple() {
3122        let source = "$((1 + 2))";
3123        let tokens = lex(source);
3124        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
3125    }
3126
3127    #[test]
3128    fn arithmetic_in_assignment() {
3129        let source = "X=$((5 * 3))";
3130        let tokens = lex(source);
3131        assert_eq!(tokens, vec![
3132            Token::Ident("X".to_string()),
3133            Token::Eq,
3134            Token::Arithmetic("5 * 3".to_string()),
3135        ]);
3136    }
3137
3138    #[test]
3139    fn arithmetic_with_nested_parens() {
3140        let source = "$((2 * (3 + 4)))";
3141        let tokens = lex(source);
3142        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
3143    }
3144
3145    #[test]
3146    fn arithmetic_with_variable() {
3147        let source = "$((X + 1))";
3148        let tokens = lex(source);
3149        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
3150    }
3151
3152    #[test]
3153    fn arithmetic_command_subst_not_confused() {
3154        // $( should not be treated as arithmetic
3155        let source = "$(echo hello)";
3156        let tokens = lex(source);
3157        assert_eq!(tokens, vec![
3158            Token::CmdSubstStart,
3159            Token::Ident("echo".to_string()),
3160            Token::Ident("hello".to_string()),
3161            Token::RParen,
3162        ]);
3163    }
3164
3165    #[test]
3166    fn arithmetic_nesting_limit() {
3167        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
3168        let open_parens = "(".repeat(300);
3169        let close_parens = ")".repeat(300);
3170        let source = format!("$(({}1{}))", open_parens, close_parens);
3171        let result = tokenize(&source);
3172        assert!(result.is_err());
3173        let errors = result.unwrap_err();
3174        assert_eq!(errors.len(), 1);
3175        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
3176    }
3177
3178    #[test]
3179    fn arithmetic_nesting_within_limit() {
3180        // Nesting within limit should work
3181        let source = "$((((1 + 2) * 3)))";
3182        let tokens = lex(source);
3183        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
3184    }
3185
3186    // ═══════════════════════════════════════════════════════════════════
3187    // Arithmetic preprocessor + comment interaction
3188    //
3189    // The preprocessor used to walk raw characters tracking only quote
3190    // state. An apostrophe inside a `#` comment would open single-quote
3191    // mode and swallow real `$((..))` later in the file; `$((..))` *inside*
3192    // a comment would itself be preprocessed into a marker, misplacing
3193    // tokens. Surfaced from kaijutsu's seed scripts (see gotcha memory
3194    // `gotcha-kaish-comment-arithmetic`).
3195    // ═══════════════════════════════════════════════════════════════════
3196
3197    #[test]
3198    fn arithmetic_after_apostrophe_in_comment() {
3199        // The bare apostrophe in "doesn't" used to open single-quote mode
3200        // in the preprocessor and swallow the $((..)) below.
3201        let source = "# this doesn't work\necho $((1+2))";
3202        let tokens = lex(source);
3203        assert_eq!(tokens, vec![
3204            Token::Newline,
3205            Token::Ident("echo".to_string()),
3206            Token::Arithmetic("1+2".to_string()),
3207        ]);
3208    }
3209
3210    #[test]
3211    fn arithmetic_inside_comment_is_not_expanded() {
3212        // `$((y))` inside a `#` comment must stay comment text.
3213        let source = "# the $((y)) syntax explained\necho hello";
3214        let tokens = lex(source);
3215        assert_eq!(tokens, vec![
3216            Token::Newline,
3217            Token::Ident("echo".to_string()),
3218            Token::Ident("hello".to_string()),
3219        ]);
3220    }
3221
3222    #[test]
3223    fn backticked_arithmetic_in_comment_is_not_expanded() {
3224        // The original kaijutsu repro: `$((x))` inside a comment.
3225        // Backticks-in-comments used to leak the inner $((..)) to the
3226        // preprocessor; with comment-skip they stay inert.
3227        let source = "# the `$((x))` syntax explained\necho $((3+4))";
3228        let tokens = lex(source);
3229        assert_eq!(tokens, vec![
3230            Token::Newline,
3231            Token::Ident("echo".to_string()),
3232            Token::Arithmetic("3+4".to_string()),
3233        ]);
3234    }
3235
3236    #[test]
3237    fn arithmetic_still_works_outside_comments() {
3238        // Regression guard: comment-skip must not shrink the arithmetic
3239        // preprocessor's scope on normal `$((..))` usages.
3240        let source = "X=$((1+2)); Y=$((3*4))";
3241        let tokens = lex(source);
3242        assert_eq!(tokens, vec![
3243            Token::Ident("X".to_string()),
3244            Token::Eq,
3245            Token::Arithmetic("1+2".to_string()),
3246            Token::Semi,
3247            Token::Ident("Y".to_string()),
3248            Token::Eq,
3249            Token::Arithmetic("3*4".to_string()),
3250        ]);
3251    }
3252
3253    #[test]
3254    fn arithmetic_inside_double_quotes_still_expands() {
3255        // `#` inside a double-quoted string is a literal character, not a
3256        // comment introducer — arithmetic must still expand around it.
3257        let source = "echo \"# $((1+2))\"";
3258        let tokens = lex(source);
3259        // The string token contains the `#` and the arithmetic marker;
3260        // the exact post-processing happens at interpret time. What we
3261        // assert here is that lexing succeeds and produces a String token
3262        // (i.e. the comment skip didn't trigger inside the string).
3263        assert_eq!(tokens.len(), 2);
3264        assert!(matches!(tokens[0], Token::Ident(_)));
3265        assert!(matches!(tokens[1], Token::String(_)));
3266    }
3267
3268    // ═══════════════════════════════════════════════════════════════════
3269    // Backtick rejection
3270    //
3271    // Backticks are an explicitly dropped feature (see CLAUDE.md,
3272    // docs/LANGUAGE.md, help/limits.md, help/overview.md). We surface a
3273    // dedicated error rather than the generic `UnexpectedCharacter` so
3274    // users get a hint to use `$(cmd)`. Comments, single-quoted strings,
3275    // double-quoted strings, and heredoc bodies are all matched as single
3276    // tokens (or extracted before logos runs), so the rejection only
3277    // fires on bare backticks in source code.
3278    // ═══════════════════════════════════════════════════════════════════
3279
3280    #[test]
3281    fn backtick_in_source_is_rejected() {
3282        let result = tokenize("echo `date`");
3283        assert!(result.is_err());
3284        let errors = result.unwrap_err();
3285        assert!(errors.iter().any(|e| e.token == LexerError::BackticksNotSupported));
3286    }
3287
3288    #[test]
3289    fn backtick_in_comment_is_just_comment_text() {
3290        // Backticks are only rejected when they reach the top-level
3291        // lexer. Inside a comment they're part of the comment body.
3292        let source = "# use `date` here\necho hi";
3293        let tokens = lex(source);
3294        assert_eq!(tokens, vec![
3295            Token::Newline,
3296            Token::Ident("echo".to_string()),
3297            Token::Ident("hi".to_string()),
3298        ]);
3299    }
3300
3301    #[test]
3302    fn backtick_in_single_quoted_string_is_literal() {
3303        // Single-quoted strings are matched as one token by logos; the
3304        // backticks inside never reach the rejecting matcher.
3305        let source = "echo '`date`'";
3306        let tokens = lex(source);
3307        assert_eq!(tokens, vec![
3308            Token::Ident("echo".to_string()),
3309            Token::SingleString("`date`".to_string()),
3310        ]);
3311    }
3312
3313    #[test]
3314    fn backtick_in_double_quoted_string_is_literal() {
3315        // Kaish does not activate command substitution from backticks
3316        // inside double-quoted strings either — clear divergence from
3317        // POSIX but matches the "backticks don't exist" stance. The
3318        // double-quoted string token absorbs them as literal characters.
3319        let source = "echo \"`date`\"";
3320        let tokens = lex(source);
3321        assert_eq!(tokens.len(), 2);
3322        assert!(matches!(tokens[0], Token::Ident(_)));
3323        match &tokens[1] {
3324            Token::String(s) => assert!(s.contains('`')),
3325            other => panic!("expected Token::String, got {:?}", other),
3326        }
3327    }
3328
#[test]
fn backtick_in_heredoc_body_is_preserved() {
    // preprocess_heredocs pulls heredoc bodies out before logos runs,
    // so backticks in the body survive as ordinary content.
    let tokens = lex("cat <<EOF\n`date`\nEOF\n");

    match tokens.iter().find(|t| matches!(t, Token::HereDoc(_))) {
        Some(Token::HereDoc(doc)) => assert!(doc.content.contains('`')),
        _ => panic!("expected a HereDoc token"),
    }
}
3341
3342    // ═══════════════════════════════════════════════════════════════════
3343    // Token category tests
3344    // ═══════════════════════════════════════════════════════════════════
3345
#[test]
fn token_categories() {
    // Table of (token, expected category) pairs, grouped by category.
    // Driving the check from a table lets the failure message name the
    // offending token instead of only showing two mismatched categories.
    let cases = [
        // Keywords
        (Token::If, TokenCategory::Keyword),
        (Token::Then, TokenCategory::Keyword),
        (Token::For, TokenCategory::Keyword),
        (Token::Function, TokenCategory::Keyword),
        (Token::True, TokenCategory::Keyword),
        (Token::TypeString, TokenCategory::Keyword),
        // Operators
        (Token::Pipe, TokenCategory::Operator),
        (Token::And, TokenCategory::Operator),
        (Token::Or, TokenCategory::Operator),
        (Token::StderrToStdout, TokenCategory::Operator),
        (Token::GtGt, TokenCategory::Operator),
        // Strings
        (Token::String("test".to_string()), TokenCategory::String),
        (Token::SingleString("test".to_string()), TokenCategory::String),
        (
            Token::HereDoc(HereDocData {
                content: "test".to_string(),
                literal: false,
                strip_tabs: false,
                body_start_offset: 0,
            }),
            TokenCategory::String,
        ),
        // Numbers
        (Token::Int(42), TokenCategory::Number),
        // Any float works here; a PI-like literal such as 3.14 would trip
        // the deny-by-default `clippy::approx_constant` lint.
        (Token::Float(1.5), TokenCategory::Number),
        (Token::Arithmetic("1+2".to_string()), TokenCategory::Number),
        // Variables
        (Token::SimpleVarRef("X".to_string()), TokenCategory::Variable),
        (Token::VarRef("${X}".to_string()), TokenCategory::Variable),
        (Token::Positional(1), TokenCategory::Variable),
        (Token::AllArgs, TokenCategory::Variable),
        (Token::ArgCount, TokenCategory::Variable),
        (Token::LastExitCode, TokenCategory::Variable),
        (Token::CurrentPid, TokenCategory::Variable),
        // Flags
        (Token::ShortFlag("l".to_string()), TokenCategory::Flag),
        (Token::LongFlag("verbose".to_string()), TokenCategory::Flag),
        (Token::PlusFlag("e".to_string()), TokenCategory::Flag),
        (Token::DoubleDash, TokenCategory::Flag),
        // Punctuation
        (Token::Semi, TokenCategory::Punctuation),
        (Token::LParen, TokenCategory::Punctuation),
        (Token::LBracket, TokenCategory::Punctuation),
        (Token::Newline, TokenCategory::Punctuation),
        // Comments
        (Token::Comment, TokenCategory::Comment),
        // Paths
        (Token::Path("/tmp/file".to_string()), TokenCategory::Path),
        // Commands
        (Token::Ident("echo".to_string()), TokenCategory::Command),
        (Token::NumberIdent("019dda1c".to_string()), TokenCategory::Command),
        (Token::DottedIdent(".gitignore".to_string()), TokenCategory::Command),
        // Errors
        (Token::InvalidFloatNoLeading, TokenCategory::Error),
        (Token::InvalidFloatNoTrailing, TokenCategory::Error),
    ];

    for (token, expected) in cases {
        assert_eq!(token.category(), expected, "wrong category for {token:?}");
    }
}
3417
#[test]
fn test_heredoc_piped_to_command() {
    // Bug 4: for "cat <<EOF | jq" the token order must read
    // `cat <<heredoc | jq`, never `cat | jq <<heredoc`.
    let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();

    let heredoc_at = tokens
        .iter()
        .position(|spanned| matches!(spanned.token, Token::HereDoc(_)))
        .expect("should have a heredoc token");
    let pipe_at = tokens
        .iter()
        .position(|spanned| matches!(spanned.token, Token::Pipe))
        .expect("should have a pipe token");

    assert!(
        pipe_at > heredoc_at,
        "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
        heredoc_at, pipe_at, tokens,
    );
}
3433
#[test]
fn test_heredoc_standalone_still_works() {
    // Regression guard: a heredoc with no pipe after it still lexes to a
    // HereDoc token, and no Pipe token appears out of nowhere.
    let spanned = tokenize("cat <<EOF\nhello\nEOF").unwrap();

    let has_heredoc = spanned.iter().any(|t| matches!(t.token, Token::HereDoc(_)));
    let pipe_free = spanned.iter().all(|t| !matches!(t.token, Token::Pipe));

    assert!(has_heredoc);
    assert!(pipe_free);
}
3441
#[test]
fn test_heredoc_preserves_leading_empty_lines() {
    // Bug B: a blank first line in the heredoc body must not be dropped.
    let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(
        data.content.starts_with('\n'),
        "leading empty line must be preserved, got: {:?}",
        data.content
    );
    assert_eq!(data.content, "\nhello\n");
}
3458
#[test]
fn test_heredoc_quoted_delimiter_sets_literal() {
    // Bug N: quoting the delimiter (<<'EOF') must mark the heredoc as
    // literal, and the body is kept verbatim, `$HOME` included.
    let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(data.literal, "quoted delimiter should set literal=true");
    assert_eq!(data.content, "hello $HOME\n");
}
3475
#[test]
fn test_heredoc_unquoted_delimiter_not_literal() {
    // Bug N: a bare delimiter (<<EOF) must leave literal=false.
    let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();

    let data = tokens
        .iter()
        .find_map(|t| match &t.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(!data.literal, "unquoted delimiter should have literal=false");
}
3491
3492    // ═══════════════════════════════════════════════════════════════════
3493    // Colon merge tests
3494    // ═══════════════════════════════════════════════════════════════════
3495
#[test]
fn colon_double_in_word() {
    // A `::` embedded in a word stays inside a single identifier.
    let expected = vec![Token::Ident(String::from("foo::bar"))];
    assert_eq!(lex("foo::bar"), expected);
}
3500
#[test]
fn colon_single_in_word() {
    // Single colons between word parts merge into one identifier.
    let expected = vec![Token::Ident(String::from("a:b:c"))];
    assert_eq!(lex("a:b:c"), expected);
}
3505
#[test]
fn colon_with_port() {
    // A host:port pair lexes as one identifier.
    let expected = vec![Token::Ident(String::from("host:8080"))];
    assert_eq!(lex("host:8080"), expected);
}
3510
#[test]
fn colon_standalone() {
    // A colon on its own remains a plain Colon token.
    let tokens = lex(":");
    assert_eq!(tokens, vec![Token::Colon]);
}
3515
#[test]
fn colon_spaced_no_merge() {
    // Whitespace on both sides of the colon prevents any merging.
    let expected = vec![
        Token::Ident(String::from("foo")),
        Token::Colon,
        Token::Ident(String::from("bar")),
    ];
    assert_eq!(lex("foo : bar"), expected);
}
3527
#[test]
fn colon_in_command_arg() {
    // The merge also applies to an argument following a command name.
    let expected = vec![
        Token::Ident(String::from("echo")),
        Token::Ident(String::from("foo::bar")),
    ];
    assert_eq!(lex("echo foo::bar"), expected);
}
3538
#[test]
fn colon_trailing() {
    // A colon at the end is absorbed into the identifier before it.
    let expected = vec![Token::Ident(String::from("foo:"))];
    assert_eq!(lex("foo:"), expected);
}
3544
#[test]
fn colon_leading() {
    // A colon at the start is absorbed into the identifier after it.
    let expected = vec![Token::Ident(String::from(":foo"))];
    assert_eq!(lex(":foo"), expected);
}
3550
#[test]
fn colon_with_path() {
    // Path, colon and integer written without spaces come out as one
    // identifier token.
    let expected = vec![Token::Ident(String::from("/usr/bin:8080"))];
    assert_eq!(lex("/usr/bin:8080"), expected);
}
3559
3560    // ═══════════════════════════════════════════════════════════════════
3561    // Token predicate coverage (is_keyword / starts_statement)
3562    // ═══════════════════════════════════════════════════════════════════
3563
#[test]
fn is_keyword_covers_control_flow() {
    // Each control-flow token must be reported as a keyword.
    let control_flow = [
        Token::While,
        Token::Return,
        Token::Break,
        Token::Continue,
        Token::Exit,
    ];
    control_flow
        .iter()
        .for_each(|t| assert!(t.is_keyword(), "{t:?} should be a keyword"));
}
3576
#[test]
fn starts_statement_covers_while() {
    // `while` must be recognized as a token that can begin a statement.
    let token = Token::While;
    assert!(token.starts_statement());
}
3581
#[test]
fn is_keyword_rejects_operators() {
    // Operators and punctuation must never be classified as keywords.
    let non_keywords = [Token::Pipe, Token::Amp, Token::Eq, Token::LBrace];
    non_keywords
        .iter()
        .for_each(|t| assert!(!t.is_keyword(), "{t:?} should not be a keyword"));
}
3588}
kaish_kernel/lexer.rs

kaish_kernel/
lexer.rs