Skip to main content

kaish_kernel/
lexer.rs

1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
7//! # Token Categories
8//!
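//! - **Keywords**: `set`, `local`, `if`, `then`, `else`, `elif`, `fi`, `for`, `while`, `in`,
//!   `do`, `done`, `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`,
//!   plus the type keywords `string`, `int`, `float`, `bool`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `==`, `!=`, `=~`, `!~`, `<`, `>`, `<=`, `>=`, `|`, `&`, `&&`, `||`,
//!   and the redirections `>>`, `2>`, `&>`, `2>&1`, `1>&2`, `>&2`, `<<`, `<<<`
//! - **Punctuation**: `;`, `;;`, `:`, `,`, `.`, `..`, `{`, `}`, `[`, `]`, `(`, `)`
//! - **Variable references**: `${...}` with nested path access, `$NAME`, `$0`–`$9`,
//!   `$@`, `$#`, `$?`, `$$`, `${#VAR}`
//! - **Identifiers**: command names, variable names, parameter names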
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
///
/// `unique_marker_id` bumps it by one per call, so two markers generated in the
/// same process never share a counter component.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
///
/// The value is also interpolated into the `LexerError::NestingTooDeep` message.
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// `correct_span` consumes a slice of these records: every span boundary that
/// lies past `preprocessed_pos` is shifted by `original_len - marker_len`.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    #[cfg(target_os = "wasi")]
82    let pid = 0u32;
83    #[cfg(not(target_os = "wasi"))]
84    let pid = std::process::id();
85    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
86}
87
88/// A token with its span in the source text.
89#[derive(Debug, Clone, PartialEq)]
90pub struct Spanned<T> {
91    pub token: T,
92    pub span: Span,
93}
94
95impl<T> Spanned<T> {
96    pub fn new(token: T, span: Span) -> Self {
97        Self { token, span }
98    }
99}
100
/// Lexer error types.
///
/// `UnexpectedCharacter` is the `Default` variant. The `Display` impl supplies
/// the user-facing message for each variant.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Default error variant; displayed as "unexpected character".
    #[default]
    UnexpectedCharacter,
    /// Displayed as "unterminated string".
    UnterminatedString,
    /// Displayed as "unterminated variable reference".
    UnterminatedVarRef,
    /// Displayed as "invalid escape sequence".
    InvalidEscape,
    /// A numeric literal could not be parsed into its target type
    /// (returned by the `lex_int` / `lex_float` callbacks).
    InvalidNumber,
    /// A non-lowercase spelling of `true`/`false` (e.g. `TRUE`, `False`);
    /// carries the offending text.
    AmbiguousBoolean(String),
    /// A case-insensitive `yes`/`no`; carries the offending text.
    AmbiguousBooleanLike(String),
    /// Float written without a leading digit (like `.5`).
    InvalidFloatNoLeading,
    /// Float written without a trailing digit (like `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
    /// Heredoc body ended without seeing the closing delimiter on its own line.
    /// The user almost certainly meant to type the delimiter — silently using
    /// whatever was collected up to EOF would mask missing data.
    UnterminatedHeredoc { delimiter: String },
    /// Backtick command substitution. Kaish drops backticks intentionally —
    /// they're listed in `docs/LANGUAGE.md` and the help system as not supported.
    /// We surface this as a dedicated error (rather than `UnexpectedCharacter`)
    /// so the message can point users at the `$(cmd)` replacement.
    BackticksNotSupported,
}
126
127impl fmt::Display for LexerError {
128    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
129        match self {
130            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
131            LexerError::UnterminatedString => write!(f, "unterminated string"),
132            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
133            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
134            LexerError::InvalidNumber => write!(f, "invalid number"),
135            LexerError::AmbiguousBoolean(s) => {
136                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
137            }
138            LexerError::AmbiguousBooleanLike(s) => {
139                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
140                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
141            }
142            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
143            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
144            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
145            LexerError::UnterminatedHeredoc { delimiter } => {
146                write!(f, "unterminated heredoc, expected closing delimiter `{}` on its own line", delimiter)
147            }
148            LexerError::BackticksNotSupported => {
149                write!(f, "backticks are not supported in kaish; use $(cmd) instead")
150            }
151        }
152    }
153}
154
/// Here-doc content data, carried by `Token::HereDoc`.
///
/// - `literal` is true when the delimiter was quoted (`<<'EOF'` or `<<"EOF"`),
///   meaning no variable expansion should occur.
/// - `strip_tabs` is true for the `<<-EOF` form. Per POSIX, leading tabs on
///   each body line are stripped at materialization time. Stripping happens
///   downstream of the parser so byte offsets in `content` stay aligned with
///   their original-source positions for span-tracking purposes.
/// - `body_start_offset` is the byte offset of the first character of `content`
///   in the source string fed into the lexer's `tokenize`. This lets the parser
///   compute absolute spans for parts found inside the body during interpolation.
///   In sources without arithmetic preprocessing rewrites, this equals the
///   original-source offset; with arithmetic before the heredoc, line numbers
///   may shift slightly until full preprocessing-layer composition lands.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    /// Body of the here-doc, without the delimiter lines.
    pub content: String,
    /// True when the delimiter was quoted, i.e. the body must not be expanded.
    pub literal: bool,
    /// True for the `<<-` form; leading tabs are stripped later, not here.
    pub strip_tabs: bool,
    /// Byte offset of the start of `content` in the text passed to `tokenize`.
    pub body_start_offset: usize,
}
184
/// Tokens produced by the kaish lexer.
///
/// Tokens that carry semantic values (strings, numbers, identifiers) include
/// the parsed value directly. This ensures the parser has access to actual
/// data, not just token types.
///
/// Three variants have no logos pattern and are synthesized outside the
/// generated lexer: `GlobWord`, `Arithmetic` and `HereDoc`.
///
/// NOTE(review): the section headers below say some groups "must come before"
/// others. Per the logos documentation, overlapping patterns are resolved by
/// longest match and then by pattern priority (explicit `priority = N` or the
/// priority computed from the pattern), not by declaration order — confirm
/// whether the ordering notes are still load-bearing.
#[derive(Logos, Debug, Clone, PartialEq)]
#[logos(error = LexerError)]
#[logos(skip r"[ \t]+")]
pub enum Token {
    // ═══════════════════════════════════════════════════════════════════
    // Keywords (must come before Ident for priority)
    // ═══════════════════════════════════════════════════════════════════
    #[token("set")]
    Set,

    #[token("local")]
    Local,

    #[token("if")]
    If,

    #[token("then")]
    Then,

    #[token("else")]
    Else,

    #[token("elif")]
    Elif,

    #[token("fi")]
    Fi,

    #[token("for")]
    For,

    #[token("while")]
    While,

    #[token("in")]
    In,

    #[token("do")]
    Do,

    #[token("done")]
    Done,

    #[token("case")]
    Case,

    #[token("esac")]
    Esac,

    #[token("function")]
    Function,

    #[token("break")]
    Break,

    #[token("continue")]
    Continue,

    #[token("return")]
    Return,

    #[token("exit")]
    Exit,

    #[token("true")]
    True,

    #[token("false")]
    False,

    // ═══════════════════════════════════════════════════════════════════
    // Type keywords (for tool parameters)
    // ═══════════════════════════════════════════════════════════════════
    #[token("string")]
    TypeString,

    #[token("int")]
    TypeInt,

    #[token("float")]
    TypeFloat,

    #[token("bool")]
    TypeBool,

    // ═══════════════════════════════════════════════════════════════════
    // Multi-character operators (must come before single-char versions)
    // ═══════════════════════════════════════════════════════════════════
    #[token("&&")]
    And,

    #[token("||")]
    Or,

    #[token("==")]
    EqEq,

    #[token("!=")]
    NotEq,

    #[token("=~")]
    Match,

    #[token("!~")]
    NotMatch,

    #[token(">=")]
    GtEq,

    #[token("<=")]
    LtEq,

    #[token(">>")]
    GtGt,

    #[token("2>&1")]
    StderrToStdout,

    #[token("1>&2")]
    StdoutToStderr,

    #[token(">&2")]
    StdoutToStderr2,

    #[token("2>")]
    Stderr,

    #[token("&>")]
    Both,

    #[token("<<<")]
    HereString,

    #[token("<<")]
    HereDocStart,

    #[token(";;")]
    DoubleSemi,

    // ═══════════════════════════════════════════════════════════════════
    // Single-character operators and punctuation
    // ═══════════════════════════════════════════════════════════════════
    #[token("=")]
    Eq,

    #[token("|")]
    Pipe,

    #[token("&")]
    Amp,

    #[token(">")]
    Gt,

    #[token("<")]
    Lt,

    #[token(";")]
    Semi,

    #[token(":")]
    Colon,

    #[token(",")]
    Comma,

    #[token("..")]
    DotDot,

    #[token(".")]
    Dot,

    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
    TildePath(String),

    /// Bare tilde: `~` alone (expands to $HOME)
    #[token("~")]
    Tilde,

    /// Relative path: `../foo/bar`, bare `src/kaish` (ident containing `/`),
    /// or a directory reference with a trailing slash like `dest/`. The
    /// trailing-slash form uses `*` (not `+`) after the slash so `dest/`
    /// lexes as one token instead of `Ident("dest")` + `Path("/")` — the
    /// latter split silently turned `cp a b dest/` into a 4-operand command.
    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*/[a-zA-Z0-9_./-]*", lex_relative_path, priority = 3)]
    RelativePath(String),

    /// Dot-slash path: `./foo`, `./script.sh` - value includes the leading `./`
    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
    DotSlashPath(String),

    /// Dot-prefixed bareword: `.parent`, `.gitignore`, `.foo.bar`.
    /// Treated as an opaque string in argv position. Distinct from `Token::Dot`
    /// (the POSIX `.` source alias) which only matches a bare `.` — the source
    /// alias requires whitespace before its file argument (`. script`), so
    /// `.parent` (no space) is unambiguously a single bareword.
    #[regex(r"\.[a-zA-Z_][a-zA-Z0-9_.-]*", lex_dotted_ident, priority = 3)]
    DottedIdent(String),

    #[token("{")]
    LBrace,

    #[token("}")]
    RBrace,

    #[token("[")]
    LBracket,

    #[token("]")]
    RBracket,

    #[token("(")]
    LParen,

    #[token(")")]
    RParen,

    #[token("*")]
    Star,

    #[token("!")]
    Bang,

    #[token("?")]
    Question,

    /// Merged glob word: span-adjacent tokens containing `*`, `?`, or `[...]`.
    /// Synthesized by `merge_glob_adjacent()`, never produced by logos directly.
    GlobWord(String),

    // ═══════════════════════════════════════════════════════════════════
    // Command substitution
    // ═══════════════════════════════════════════════════════════════════

    /// Arithmetic expression content: synthesized by preprocessing.
    /// Contains the expression string between `$((` and `))`.
    Arithmetic(String),

    /// Command substitution start: `$(` - begins a command substitution
    #[token("$(")]
    CmdSubstStart,

    // ═══════════════════════════════════════════════════════════════════
    // Flags (must come before Int to win over negative numbers)
    // ═══════════════════════════════════════════════════════════════════

    /// Long flag: `--name` or `--foo-bar` - value is the name without the `--`
    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
    LongFlag(String),

    /// Short flag: `-l`, `-la` (combined short flags), or a dash-word with
    /// internal hyphens like `-not-a-flag`. Internal hyphens are part of the
    /// single shell word — without them the word fragments into separate flag
    /// tokens, which breaks `echo -- -not-a-flag` and the like. A leading `--`
    /// is still `DoubleDash` (the second char must be a letter here), and
    /// whether the word is a flag or a literal is the binding layer's call.
    /// The value is the text without the leading `-`.
    #[regex(r"-[a-zA-Z][a-zA-Z0-9-]*", lex_short_flag, priority = 3)]
    ShortFlag(String),

    /// Plus flag: `+e` or `+x` (for set +e to disable options) - value is the
    /// text without the leading `+`
    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
    PlusFlag(String),

    /// Double dash: `--` alone marks end of flags
    #[token("--")]
    DoubleDash,

    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
    /// For date format strings and similar. Lower priority than PlusFlag.
    /// The value keeps the leading `+`.
    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
    PlusBare(String),

    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
    /// Excludes - after first - to avoid matching --name patterns.
    /// The value keeps the leading `-`.
    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
    MinusBare(String),

    /// Job specifier: `%1`, `%2` — the bash idiom for `wait`/`kill` targets.
    /// Keeps the leading `%` (kill uses it to distinguish a job from a PID;
    /// wait strips it). Without this token a bare `%1` is a lexer error.
    #[regex(r"%[0-9]+", lex_job_spec)]
    JobSpec(String),

    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
    /// Only matches when followed by whitespace or end.
    /// This is handled specially in the parser as a positional arg.
    #[token("-")]
    MinusAlone,

    // ═══════════════════════════════════════════════════════════════════
    // Literals (with values)
    // ═══════════════════════════════════════════════════════════════════

    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
    String(String),

    /// Single-quoted string: `'...'` - literal content, no escape processing
    #[regex(r"'[^']*'", lex_single_string)]
    SingleString(String),

    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the
    /// full matched text, `${` and `}` included, so path segments can be
    /// parsed later
    #[regex(r"\$\{[^}]+\}", lex_varref)]
    VarRef(String),

    /// Simple variable reference: `$NAME` - just the identifier (no `$`)
    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
    SimpleVarRef(String),

    /// Positional parameter: `$0` through `$9`
    #[regex(r"\$[0-9]", lex_positional)]
    Positional(usize),

    /// All positional parameters: `$@`
    #[token("$@")]
    AllArgs,

    /// Number of positional parameters: `$#`
    #[token("$#")]
    ArgCount,

    /// Last exit code: `$?`
    #[token("$?")]
    LastExitCode,

    /// Current shell PID: `$$`
    #[token("$$")]
    CurrentPid,

    /// Variable string length: `${#VAR}` - value is the variable name only
    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
    VarLength(String),

    /// Here-doc content: synthesized by preprocessing, not directly lexed.
    /// Contains the full content of the here-doc (without the delimiter lines).
    HereDoc(HereDocData),

    /// Integer literal - value is the parsed i64
    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
    Int(i64),

    /// Float literal - value is the parsed f64
    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
    Float(f64),

    // ═══════════════════════════════════════════════════════════════════
    // Digit-leading barewords and invalid number patterns
    // (invalid patterns are caught before valid tokens for better errors)
    // ═══════════════════════════════════════════════════════════════════

    /// Digit-leading bareword: `019dda1c` (SHA prefix), UUIDs, version-ish
    /// strings. Distinguished from `Int` because at least one alpha character
    /// follows the leading digits — the lexer commits to "this is a string,
    /// not a number." Treated as a bareword string in expression position.
    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_.-]*", lex_number_ident, priority = 3)]
    NumberIdent(String),

    /// Invalid: float without leading digit (like .5).
    /// The callback always returns an error, so this variant only labels the pattern.
    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
    InvalidFloatNoLeading,

    /// Invalid: float without trailing digit (like 5.)
    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead.
    /// The callback always returns an error, so this variant only labels the pattern.
    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
    InvalidFloatNoTrailing,

    // ═══════════════════════════════════════════════════════════════════
    // Paths (absolute paths starting with /)
    // ═══════════════════════════════════════════════════════════════════

    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc. A lone `/` also matches.
    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
    Path(String),

    // ═══════════════════════════════════════════════════════════════════
    // Identifiers (command names, variable names, etc.)
    // ═══════════════════════════════════════════════════════════════════

    /// Identifier - value is the identifier string
    /// Allows dots for filenames like `script.kai`.
    /// The callback rejects case variants of `true`/`false` and any-case
    /// `yes`/`no` as ambiguous.
    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
    Ident(String),

    // ═══════════════════════════════════════════════════════════════════
    // Structural tokens
    // ═══════════════════════════════════════════════════════════════════

    /// Comment: `# ...` to end of line
    #[regex(r"#[^\n\r]*", allow_greedy = true)]
    Comment,

    /// Newline (significant in kaish - ends statements)
    #[regex(r"\n|\r\n")]
    Newline,

    /// Line continuation: backslash at end of line
    /// (optional spaces/tabs may sit between the backslash and the newline)
    #[regex(r"\\[ \t]*(\n|\r\n)")]
    LineContinuation,

    /// Backtick command substitution — explicitly rejected. Kaish drops
    /// backticks; the callback always errors so users get a dedicated
    /// `BackticksNotSupported` message instead of the generic
    /// `UnexpectedCharacter` they would have hit before. Backticks inside
    /// single/double-quoted strings, heredoc bodies, and comments don't
    /// reach this match — those tokens are matched as a single unit
    /// (strings) or extracted before logos runs (heredocs) or skipped to
    /// EOL (comments).
    #[token("`", reject_backtick)]
    BacktickRejected,
}
597
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// `Token::category` defines which token lands in which category.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    /// Also covers `true`/`false` and the type keywords (string, int, float, bool).
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, ${#bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; ;; : , . ( ) { } [ ] ! ? *, plus newlines, line
    /// continuations and the `$(` command-substitution opener
    Punctuation,
    /// Identifiers and other bare words: plain identifiers, `+`/`-` bare
    /// words, a lone `-`, digit-leading and dot-prefixed barewords, job specs
    Command,
    /// Paths: absolute (/foo/bar), relative, `./`, tilde paths, `~`, `..`,
    /// and merged glob words
    Path,
    /// Flags: --long, -s, +x, and the bare `--`
    Flag,
    /// Invalid tokens
    Error,
}
627
628impl Token {
629    /// Returns the semantic category for syntax highlighting.
630    pub fn category(&self) -> TokenCategory {
631        match self {
632            // Keywords
633            Token::If
634            | Token::Then
635            | Token::Else
636            | Token::Elif
637            | Token::Fi
638            | Token::For
639            | Token::In
640            | Token::Do
641            | Token::Done
642            | Token::While
643            | Token::Case
644            | Token::Esac
645            | Token::Function
646            | Token::Return
647            | Token::Break
648            | Token::Continue
649            | Token::Exit
650            | Token::Set
651            | Token::Local
652            | Token::True
653            | Token::False
654            | Token::TypeString
655            | Token::TypeInt
656            | Token::TypeFloat
657            | Token::TypeBool => TokenCategory::Keyword,
658
659            // Operators and redirections
660            Token::Pipe
661            | Token::And
662            | Token::Or
663            | Token::Amp
664            | Token::Eq
665            | Token::EqEq
666            | Token::NotEq
667            | Token::Match
668            | Token::NotMatch
669            | Token::Lt
670            | Token::Gt
671            | Token::LtEq
672            | Token::GtEq
673            | Token::GtGt
674            | Token::Stderr
675            | Token::Both
676            | Token::HereDocStart
677            | Token::HereString
678            | Token::StderrToStdout
679            | Token::StdoutToStderr
680            | Token::StdoutToStderr2 => TokenCategory::Operator,
681
682            // Strings
683            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,
684
685            // Numbers
686            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,
687
688            // Variables
689            Token::VarRef(_)
690            | Token::SimpleVarRef(_)
691            | Token::Positional(_)
692            | Token::AllArgs
693            | Token::ArgCount
694            | Token::VarLength(_)
695            | Token::LastExitCode
696            | Token::CurrentPid => TokenCategory::Variable,
697
698            // Flags
699            Token::LongFlag(_)
700            | Token::ShortFlag(_)
701            | Token::PlusFlag(_)
702            | Token::DoubleDash => TokenCategory::Flag,
703
704            // Punctuation
705            Token::Semi
706            | Token::DoubleSemi
707            | Token::Colon
708            | Token::Comma
709            | Token::Dot
710            | Token::LParen
711            | Token::RParen
712            | Token::LBrace
713            | Token::RBrace
714            | Token::LBracket
715            | Token::RBracket
716            | Token::Bang
717            | Token::Question
718            | Token::Star
719            | Token::Newline
720            | Token::LineContinuation
721            | Token::CmdSubstStart => TokenCategory::Punctuation,
722
723            // Glob words (merged tokens containing wildcards)
724            Token::GlobWord(_) => TokenCategory::Path,
725
726            // Comments
727            Token::Comment => TokenCategory::Comment,
728
729            // Paths
730            Token::Path(_)
731            | Token::TildePath(_)
732            | Token::RelativePath(_)
733            | Token::Tilde
734            | Token::DotDot
735            | Token::DotSlashPath(_) => TokenCategory::Path,
736
737            // Commands/identifiers (and bare words)
738            Token::Ident(_)
739            | Token::PlusBare(_)
740            | Token::MinusBare(_)
741            | Token::MinusAlone
742            | Token::NumberIdent(_)
743            | Token::DottedIdent(_)
744            | Token::JobSpec(_) => TokenCategory::Command,
745
746            // Errors
747            Token::InvalidFloatNoLeading
748            | Token::InvalidFloatNoTrailing
749            | Token::BacktickRejected => TokenCategory::Error,
750        }
751    }
752}
753
754/// Lex a double-quoted string literal, processing escape sequences.
755fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
756    parse_string_literal(lex.slice())
757}
758
759/// Lex a single-quoted string literal (no escape processing).
760fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
761    let s = lex.slice();
762    // Strip the surrounding single quotes
763    s[1..s.len() - 1].to_string()
764}
765
766/// Lex a braced variable reference, extracting the inner content.
767fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
768    // Keep the full ${...} for later parsing of path segments
769    lex.slice().to_string()
770}
771
772/// Lex a simple variable reference: `$NAME` → `NAME`
773fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
774    // Strip the leading `$`
775    lex.slice()[1..].to_string()
776}
777
778/// Lex a positional parameter: `$1` → 1
779fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
780    // Strip the leading `$` and parse the digit
781    lex.slice()[1..].parse().unwrap_or(0)
782}
783
784/// Lex a variable length: `${#VAR}` → "VAR"
785fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
786    // Strip the leading `${#` and trailing `}`
787    let s = lex.slice();
788    s[3..s.len() - 1].to_string()
789}
790
791/// Lex an integer literal.
792fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
793    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
794}
795
796/// Lex a float literal.
797fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
798    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
799}
800
801/// Lex a digit-leading bareword like `019dda1c` or `019dda1c-5b3f-7000`.
802/// Distinguished from `Int` because at least one alpha character follows the
803/// leading digits — the slice is treated as a string, not a number.
804fn lex_number_ident(lex: &mut logos::Lexer<Token>) -> String {
805    lex.slice().to_string()
806}
807
808/// Lex a dot-prefixed bareword like `.gitignore` or `.parent.parent`.
809fn lex_dotted_ident(lex: &mut logos::Lexer<Token>) -> String {
810    lex.slice().to_string()
811}
812
813/// Lex an invalid float without leading digit (like .5).
814/// Always returns Err to produce a lexer error instead of a token.
815fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
816    Err(LexerError::InvalidFloatNoLeading)
817}
818
819/// Reject a backtick — kaish doesn't support backtick command substitution.
820/// The dedicated error gives the user a `$(cmd)` hint instead of the generic
821/// `UnexpectedCharacter` they would have hit otherwise.
822fn reject_backtick(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
823    Err(LexerError::BackticksNotSupported)
824}
825
826/// Lex an invalid float without trailing digit (like 5.).
827/// Always returns Err to produce a lexer error instead of a token.
828fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
829    Err(LexerError::InvalidFloatNoTrailing)
830}
831
832/// Lex an identifier, rejecting ambiguous boolean-like values.
833fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
834    let s = lex.slice();
835
836    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
837    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
838    match s.to_lowercase().as_str() {
839        "true" | "false" if s != "true" && s != "false" => {
840            return Err(LexerError::AmbiguousBoolean(s.to_string()));
841        }
842        _ => {}
843    }
844
845    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
846    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
847        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
848    }
849
850    Ok(s.to_string())
851}
852
853/// Lex a long flag: `--name` → `name`
854fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
855    // Strip the leading `--`
856    lex.slice()[2..].to_string()
857}
858
859/// Lex a short flag: `-l` → `l`, `-la` → `la`
860fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
861    // Strip the leading `-`
862    lex.slice()[1..].to_string()
863}
864
865/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
866fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
867    // Strip the leading `+`
868    lex.slice()[1..].to_string()
869}
870
871/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
872fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
873    lex.slice().to_string()
874}
875
876/// Lex a minus bare word: `-%` → `-%` (keep the full string)
877fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
878    lex.slice().to_string()
879}
880
881/// Lex a job specifier: `%1` → `%1` (keep the leading `%`).
882fn lex_job_spec(lex: &mut logos::Lexer<Token>) -> String {
883    lex.slice().to_string()
884}
885
886/// Lex an absolute path: `/tmp/out` → `/tmp/out`
887fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
888    lex.slice().to_string()
889}
890
891/// Lex a tilde path: `~/foo` → `~/foo`
892fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
893    lex.slice().to_string()
894}
895
896/// Lex a relative path: `../foo` → `../foo`
897fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
898    lex.slice().to_string()
899}
900
901/// Lex a dot-slash path: `./foo` → `./foo`
902fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
903    lex.slice().to_string()
904}
905
906impl fmt::Display for Token {
907    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
908        match self {
909            Token::Set => write!(f, "set"),
910            Token::Local => write!(f, "local"),
911            Token::If => write!(f, "if"),
912            Token::Then => write!(f, "then"),
913            Token::Else => write!(f, "else"),
914            Token::Elif => write!(f, "elif"),
915            Token::Fi => write!(f, "fi"),
916            Token::For => write!(f, "for"),
917            Token::While => write!(f, "while"),
918            Token::In => write!(f, "in"),
919            Token::Do => write!(f, "do"),
920            Token::Done => write!(f, "done"),
921            Token::Case => write!(f, "case"),
922            Token::Esac => write!(f, "esac"),
923            Token::Function => write!(f, "function"),
924            Token::Break => write!(f, "break"),
925            Token::Continue => write!(f, "continue"),
926            Token::Return => write!(f, "return"),
927            Token::Exit => write!(f, "exit"),
928            Token::True => write!(f, "true"),
929            Token::False => write!(f, "false"),
930            Token::TypeString => write!(f, "string"),
931            Token::TypeInt => write!(f, "int"),
932            Token::TypeFloat => write!(f, "float"),
933            Token::TypeBool => write!(f, "bool"),
934            Token::And => write!(f, "&&"),
935            Token::Or => write!(f, "||"),
936            Token::EqEq => write!(f, "=="),
937            Token::NotEq => write!(f, "!="),
938            Token::Match => write!(f, "=~"),
939            Token::NotMatch => write!(f, "!~"),
940            Token::GtEq => write!(f, ">="),
941            Token::LtEq => write!(f, "<="),
942            Token::GtGt => write!(f, ">>"),
943            Token::StderrToStdout => write!(f, "2>&1"),
944            Token::StdoutToStderr => write!(f, "1>&2"),
945            Token::StdoutToStderr2 => write!(f, ">&2"),
946            Token::Stderr => write!(f, "2>"),
947            Token::Both => write!(f, "&>"),
948            Token::HereDocStart => write!(f, "<<"),
949            Token::HereString => write!(f, "<<<"),
950            Token::DoubleSemi => write!(f, ";;"),
951            Token::Eq => write!(f, "="),
952            Token::Pipe => write!(f, "|"),
953            Token::Amp => write!(f, "&"),
954            Token::Gt => write!(f, ">"),
955            Token::Lt => write!(f, "<"),
956            Token::Semi => write!(f, ";"),
957            Token::Colon => write!(f, ":"),
958            Token::Comma => write!(f, ","),
959            Token::Dot => write!(f, "."),
960            Token::DotDot => write!(f, ".."),
961            Token::Tilde => write!(f, "~"),
962            Token::TildePath(s) => write!(f, "{}", s),
963            Token::RelativePath(s) => write!(f, "{}", s),
964            Token::DotSlashPath(s) => write!(f, "{}", s),
965            Token::LBrace => write!(f, "{{"),
966            Token::RBrace => write!(f, "}}"),
967            Token::LBracket => write!(f, "["),
968            Token::RBracket => write!(f, "]"),
969            Token::LParen => write!(f, "("),
970            Token::RParen => write!(f, ")"),
971            Token::Star => write!(f, "*"),
972            Token::Bang => write!(f, "!"),
973            Token::Question => write!(f, "?"),
974            Token::GlobWord(s) => write!(f, "GLOB({})", s),
975            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
976            Token::CmdSubstStart => write!(f, "$("),
977            Token::LongFlag(s) => write!(f, "--{}", s),
978            Token::ShortFlag(s) => write!(f, "-{}", s),
979            Token::PlusFlag(s) => write!(f, "+{}", s),
980            Token::DoubleDash => write!(f, "--"),
981            Token::PlusBare(s) => write!(f, "{}", s),
982            Token::MinusBare(s) => write!(f, "{}", s),
983            Token::JobSpec(s) => write!(f, "{}", s),
984            Token::MinusAlone => write!(f, "-"),
985            Token::String(s) => write!(f, "STRING({:?})", s),
986            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
987            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
988            Token::VarRef(v) => write!(f, "VARREF({})", v),
989            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
990            Token::Positional(n) => write!(f, "${}", n),
991            Token::AllArgs => write!(f, "$@"),
992            Token::ArgCount => write!(f, "$#"),
993            Token::LastExitCode => write!(f, "$?"),
994            Token::CurrentPid => write!(f, "$$"),
995            Token::VarLength(v) => write!(f, "${{#{}}}", v),
996            Token::Int(n) => write!(f, "INT({})", n),
997            Token::Float(n) => write!(f, "FLOAT({})", n),
998            Token::Path(s) => write!(f, "PATH({})", s),
999            Token::Ident(s) => write!(f, "IDENT({})", s),
1000            Token::NumberIdent(s) => write!(f, "NUMIDENT({})", s),
1001            Token::DottedIdent(s) => write!(f, "DOTIDENT({})", s),
1002            Token::Comment => write!(f, "COMMENT"),
1003            Token::Newline => write!(f, "NEWLINE"),
1004            Token::LineContinuation => write!(f, "LINECONT"),
1005            // These variants should never be produced — their callbacks always return errors
1006            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
1007            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
1008            Token::BacktickRejected => write!(f, "BACKTICK_REJECTED"),
1009        }
1010    }
1011}
1012
1013impl Token {
1014    /// Returns true if this token is a keyword.
1015    // Must match the Keyword variants in `Token::category()` (minus the
1016    // TypeX variants, which `is_type()` covers separately). Currently
1017    // uncalled — kept exhaustive so future callers don't get wrong answers.
1018    pub fn is_keyword(&self) -> bool {
1019        matches!(
1020            self,
1021            Token::Set
1022                | Token::Local
1023                | Token::If
1024                | Token::Then
1025                | Token::Else
1026                | Token::Elif
1027                | Token::Fi
1028                | Token::For
1029                | Token::In
1030                | Token::Do
1031                | Token::Done
1032                | Token::While
1033                | Token::Case
1034                | Token::Esac
1035                | Token::Function
1036                | Token::Return
1037                | Token::Break
1038                | Token::Continue
1039                | Token::Exit
1040                | Token::True
1041                | Token::False
1042        )
1043    }
1044
1045    /// Returns true if this token is a type keyword.
1046    pub fn is_type(&self) -> bool {
1047        matches!(
1048            self,
1049            Token::TypeString
1050                | Token::TypeInt
1051                | Token::TypeFloat
1052                | Token::TypeBool
1053        )
1054    }
1055
1056    /// Returns true if this token starts a statement.
1057    // Currently uncalled — kept exhaustive so future callers don't get wrong answers.
1058    pub fn starts_statement(&self) -> bool {
1059        matches!(
1060            self,
1061            Token::Set
1062                | Token::Local
1063                | Token::Function
1064                | Token::If
1065                | Token::For
1066                | Token::While
1067                | Token::Case
1068                | Token::Ident(_)
1069                | Token::LBracket
1070        )
1071    }
1072
1073    /// Returns true if this token can appear in an expression.
1074    pub fn is_value(&self) -> bool {
1075        matches!(
1076            self,
1077            Token::String(_)
1078                | Token::SingleString(_)
1079                | Token::HereDoc(_)
1080                | Token::Arithmetic(_)
1081                | Token::Int(_)
1082                | Token::Float(_)
1083                | Token::True
1084                | Token::False
1085                | Token::VarRef(_)
1086                | Token::SimpleVarRef(_)
1087                | Token::CmdSubstStart
1088                | Token::Path(_)
1089                | Token::GlobWord(_)
1090                | Token::LastExitCode
1091                | Token::CurrentPid
1092        )
1093    }
1094}
1095
/// Result of preprocessing arithmetic expressions.
///
/// Built by `preprocess_arithmetic`. `arithmetics` and `replacements` each
/// receive one entry per replaced `$((expr))`, pushed in source order, so the
/// two vectors are index-aligned.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs.
    /// `expression_content` is the text between `$((` and the closing `))`.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions.
    /// Each records where a marker sits in `text` and how long the original
    /// `$((...))` text was.
    replacements: Vec<SpanReplacement>,
}
1105
/// Copy one `$(...)` command substitution to `result` without interpreting it.
///
/// Parentheses are balanced with awareness of quoting, so a `(` or `)` inside
/// single quotes, inside double quotes, or after a backslash does not affect
/// the nesting depth. Inside double quotes a backslash only escapes `"`, `\`,
/// `$` and `` ` ``; inside single quotes nothing is escaped.
///
/// * `chars` — the whole source as characters.
/// * `i` — character index; on entry it points at the `$` of `$(`, on exit it
///   points just past the closing `)` (or at end of input if unterminated).
/// * `source_pos` — byte offset tracked in parallel with `i`; advanced by the
///   UTF-8 length of every character consumed.
/// * `result` — output buffer that receives the copied text.
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Lexical context of the scan position inside the substitution.
    #[derive(Clone, Copy)]
    enum Quoting {
        Bare,
        Single,
        Double,
    }

    // Per the entry contract, `*i` sits on `$(`: emit it and step over it.
    result.push_str("$(");
    *i += 2;
    *source_pos += 2;

    let mut quoting = Quoting::Bare;
    let mut open_parens: usize = 1;

    while open_parens > 0 {
        let Some(&current) = chars.get(*i) else { break };

        // A backslash that escapes the following character: both are copied
        // as a unit so the escaped character cannot change quoting or depth.
        let escaped = match (quoting, current, chars.get(*i + 1)) {
            (Quoting::Bare, '\\', Some(&next)) => Some(next),
            (Quoting::Double, '\\', Some(&next)) if matches!(next, '"' | '\\' | '$' | '`') => {
                Some(next)
            }
            _ => None,
        };
        if let Some(next) = escaped {
            result.push(current);
            result.push(next);
            *source_pos += current.len_utf8() + next.len_utf8();
            *i += 2;
            continue;
        }

        // Update quoting state / nesting depth, then copy the character.
        match (quoting, current) {
            (Quoting::Single, '\'') | (Quoting::Double, '"') => quoting = Quoting::Bare,
            (Quoting::Bare, '\'') => quoting = Quoting::Single,
            (Quoting::Bare, '"') => quoting = Quoting::Double,
            (Quoting::Bare, '(') => open_parens += 1,
            (Quoting::Bare, ')') => open_parens -= 1,
            _ => {}
        }
        result.push(current);
        *source_pos += current.len_utf8();
        *i += 1;
    }
}
1203
1204/// Preprocess arithmetic expressions in source code.
1205///
1206/// Finds `$((expr))` patterns and replaces them with markers.
1207/// Returns the preprocessed source, arithmetic contents, and span replacement info.
1208///
1209/// Example:
1210///   `X=$((1 + 2))`
1211/// Becomes:
1212///   `X=__KAISH_ARITH_{id}__`
1213/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
1214///
1215/// # Errors
1216/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
1217fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
1218    let mut result = String::with_capacity(source.len());
1219    let mut arithmetics: Vec<(String, String)> = Vec::new();
1220    let mut replacements: Vec<SpanReplacement> = Vec::new();
1221    let mut source_pos: usize = 0;
1222    let chars_vec: Vec<char> = source.chars().collect();
1223    let mut i = 0;
1224
1225    // Whether we're currently inside double quotes. Single quotes inside
1226    // double quotes are literal characters, not quote delimiters.
1227    let mut in_double_quote = false;
1228
1229    while i < chars_vec.len() {
1230        let ch = chars_vec[i];
1231
1232        // Backslash escape outside quotes — skip both chars verbatim
1233        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1234            result.push(ch);
1235            result.push(chars_vec[i + 1]);
1236            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
1237            i += 2;
1238            continue;
1239        }
1240
1241        // Single quote — only starts quote mode when NOT inside double quotes
1242        if ch == '\'' && !in_double_quote {
1243            result.push(ch);
1244            i += 1;
1245            source_pos += 1;
1246            while i < chars_vec.len() && chars_vec[i] != '\'' {
1247                result.push(chars_vec[i]);
1248                source_pos += chars_vec[i].len_utf8();
1249                i += 1;
1250            }
1251            if i < chars_vec.len() {
1252                result.push(chars_vec[i]); // closing quote
1253                source_pos += 1;
1254                i += 1;
1255            }
1256            continue;
1257        }
1258
1259        // Double quote — toggle state (arithmetic is still expanded inside)
1260        if ch == '"' {
1261            in_double_quote = !in_double_quote;
1262            result.push(ch);
1263            i += 1;
1264            source_pos += 1;
1265            continue;
1266        }
1267
1268        // Backslash escape inside double quotes — only \" and \\ are special
1269        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
1270            let next = chars_vec[i + 1];
1271            if next == '"' || next == '\\' || next == '$' || next == '`' {
1272                result.push(ch);
1273                result.push(next);
1274                source_pos += ch.len_utf8() + next.len_utf8();
1275                i += 2;
1276                continue;
1277            }
1278        }
1279
1280        // Comment — copy verbatim from `#` through end-of-line so apostrophes
1281        // and `$((..))` inside the comment body don't get processed. Logos's
1282        // own comment regex `#[^\n\r]*` doesn't require a word boundary, so
1283        // we match that: any `#` outside double quotes (and outside single
1284        // quotes — those are consumed above as a single run) starts a comment.
1285        // The newline is left for the next iteration so newline-significance
1286        // and span tracking are preserved.
1287        if ch == '#' && !in_double_quote {
1288            while i < chars_vec.len() && chars_vec[i] != '\n' && chars_vec[i] != '\r' {
1289                result.push(chars_vec[i]);
1290                source_pos += chars_vec[i].len_utf8();
1291                i += 1;
1292            }
1293            continue;
1294        }
1295
1296        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand
1297        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
1298            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
1299        {
1300            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
1301            continue;
1302        }
1303
1304        // Look for $(( (potential arithmetic)
1305        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
1306            let arith_start_pos = result.len();
1307            let original_start = source_pos;
1308
1309            // Skip $((
1310            i += 3;
1311            source_pos += 3;
1312
1313            // Collect expression until matching ))
1314            let mut expr = String::new();
1315            let mut paren_depth: usize = 0;
1316
1317            while i < chars_vec.len() {
1318                let c = chars_vec[i];
1319                match c {
1320                    '(' => {
1321                        paren_depth += 1;
1322                        if paren_depth > MAX_PAREN_DEPTH {
1323                            return Err(LexerError::NestingTooDeep);
1324                        }
1325                        expr.push('(');
1326                        i += 1;
1327                        source_pos += c.len_utf8();
1328                    }
1329                    ')' => {
1330                        if paren_depth > 0 {
1331                            paren_depth -= 1;
1332                            expr.push(')');
1333                            i += 1;
1334                            source_pos += 1;
1335                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
1336                            // Found closing ))
1337                            i += 2;
1338                            source_pos += 2;
1339                            break;
1340                        } else {
1341                            // Single ) inside - keep going
1342                            expr.push(')');
1343                            i += 1;
1344                            source_pos += 1;
1345                        }
1346                    }
1347                    _ => {
1348                        expr.push(c);
1349                        i += 1;
1350                        source_pos += c.len_utf8();
1351                    }
1352                }
1353            }
1354
1355            // Calculate original length: from $$(( to ))
1356            let original_len = source_pos - original_start;
1357
1358            // Create a unique marker for this arithmetic (collision-resistant)
1359            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
1360            let marker_len = marker.len();
1361
1362            // Record the replacement for span correction
1363            replacements.push(SpanReplacement {
1364                preprocessed_pos: arith_start_pos,
1365                marker_len,
1366                original_len,
1367            });
1368
1369            arithmetics.push((marker.clone(), expr));
1370            result.push_str(&marker);
1371        } else {
1372            result.push(ch);
1373            i += 1;
1374            source_pos += ch.len_utf8();
1375        }
1376    }
1377
1378    Ok(ArithmeticPreprocessResult {
1379        text: result,
1380        arithmetics,
1381        replacements,
1382    })
1383}
1384
/// Per-heredoc metadata collected during preprocessing.
///
/// Stored verbatim alongside the substituted marker so the parser, validator,
/// and interpreter can reconstitute the body with correct semantics:
/// - `body` is the raw body bytes; tab stripping for `<<-` is applied later
///   (at materialization), so byte offsets stay aligned with the original
///   source for span tracking.
/// - `strip_tabs` records whether the `<<-` form was used.
/// - `literal` records whether the delimiter was quoted (no interpolation).
/// - `body_start_offset` is the byte offset of the first body character in
///   the source string passed to `preprocess_heredocs`. When heredocs are
///   preprocessed AFTER arithmetic, this is in arith-preprocessed coordinates;
///   in the common case (no arithmetic before the heredoc) this equals the
///   original-source offset. See span-correction notes in `tokenize`.
#[derive(Debug, Clone)]
struct HeredocReplacement {
    /// Marker text (`__KAISH_HEREDOC_{id}__`) written after `<<` in place of
    /// the delimiter word and body.
    marker: String,
    /// Raw body text with its original line endings; the terminating
    /// delimiter line is not included.
    body: String,
    /// True when the delimiter word was quoted with `'` or `"`.
    literal: bool,
    /// True when the introducer was `<<-`.
    strip_tabs: bool,
    /// Byte offset of the first body character in the input of
    /// `preprocess_heredocs`.
    body_start_offset: usize,
}
1407
1408/// Preprocess here-docs in source code.
1409///
1410/// Finds `<<WORD` patterns and collects content until the delimiter line.
1411/// Returns the preprocessed source and a vector of replacement records.
1412///
1413/// Example:
1414///   `cat <<EOF\nhello\nworld\nEOF`
1415/// Becomes:
1416///   `cat <<__HEREDOC_0__`
1417/// With heredocs[0] = HeredocReplacement { marker: "__HEREDOC_0__",
1418/// body: "hello\nworld", literal: false, strip_tabs: false }
1419fn preprocess_heredocs(source: &str) -> Result<(String, Vec<HeredocReplacement>), Spanned<LexerError>> {
1420    let mut result = String::with_capacity(source.len());
1421    let mut heredocs: Vec<HeredocReplacement> = Vec::new();
1422    let chars_vec: Vec<char> = source.chars().collect();
1423    let mut i = 0;
1424    // `pos` tracks the byte offset into `source` corresponding to chars_vec[i].
1425    // `result` accumulates output; we record body offsets in `pos` (input-side)
1426    // and emit positions via `result.len()` (output-side) where needed.
1427    let mut pos: usize = 0;
1428
1429    while i < chars_vec.len() {
1430        let ch = chars_vec[i];
1431
1432        // Pass <<< through verbatim so the logos tokenizer sees the here-string
1433        // operator. If we fell through naively, the next iteration would see
1434        // the remaining `<<` and misfire heredoc preprocessing.
1435        if ch == '<'
1436            && chars_vec.get(i + 1) == Some(&'<')
1437            && chars_vec.get(i + 2) == Some(&'<')
1438        {
1439            result.push_str("<<<");
1440            i += 3;
1441            pos += 3;
1442            continue;
1443        }
1444
1445        // Look for << (potential here-doc).
1446        if ch == '<' && chars_vec.get(i + 1) == Some(&'<') {
1447            // Remember where the `<<` started so an unterminated-heredoc
1448            // error can point back at the introducer rather than at EOF.
1449            let introducer_start = pos;
1450            i += 2; // consume both '<'
1451            pos += 2;
1452
1453            // Check for optional - (strip leading tabs)
1454            let strip_tabs = chars_vec.get(i) == Some(&'-');
1455            if strip_tabs {
1456                i += 1;
1457                pos += 1;
1458            }
1459
1460            // Skip whitespace before delimiter
1461            while let Some(&c) = chars_vec.get(i) {
1462                if c == ' ' || c == '\t' {
1463                    i += 1;
1464                    pos += 1;
1465                } else {
1466                    break;
1467                }
1468            }
1469
1470            // Collect the delimiter word
1471            let mut delimiter = String::new();
1472            let quoted = chars_vec.get(i) == Some(&'\'') || chars_vec.get(i) == Some(&'"');
1473            let quote_char = if quoted {
1474                let q = chars_vec.get(i).copied();
1475                i += 1;
1476                pos += 1;
1477                q
1478            } else {
1479                None
1480            };
1481
1482            while let Some(&c) = chars_vec.get(i) {
1483                if quoted {
1484                    if Some(c) == quote_char {
1485                        i += 1; // consume closing quote
1486                        pos += 1;
1487                        break;
1488                    }
1489                } else if c.is_whitespace() || c == '\n' || c == '\r' {
1490                    break;
1491                }
1492                delimiter.push(c);
1493                i += 1;
1494                pos += c.len_utf8();
1495            }
1496
1497            if delimiter.is_empty() {
1498                // Not a valid here-doc, output << literally
1499                result.push_str("<<");
1500                if strip_tabs {
1501                    result.push('-');
1502                }
1503                continue;
1504            }
1505
1506            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
1507            // This must be emitted AFTER the heredoc marker, not before.
1508            let mut after_delimiter = String::new();
1509            while let Some(&c) = chars_vec.get(i) {
1510                if c == '\n' {
1511                    i += 1;
1512                    pos += 1;
1513                    break;
1514                } else if c == '\r' {
1515                    i += 1;
1516                    pos += 1;
1517                    if chars_vec.get(i) == Some(&'\n') {
1518                        i += 1;
1519                        pos += 1;
1520                    }
1521                    break;
1522                }
1523                after_delimiter.push(c);
1524                i += 1;
1525                pos += c.len_utf8();
1526            }
1527
1528            // Collect content until delimiter on its own line.
1529            // `body_start_offset` is the byte position of the first char of
1530            // the body in the source — first char after the newline that
1531            // ended the delimiter line. See HeredocReplacement docs for
1532            // coordinate-system caveat (arith-preprocessed, not original).
1533            let body_start_offset = pos;
1534            let mut content = String::new();
1535            let mut current_line = String::new();
1536
1537            loop {
1538                let next = chars_vec.get(i).copied();
1539                match next {
1540                    Some('\n') => {
1541                        i += 1;
1542                        pos += 1;
1543                        // Check if this line is the delimiter
1544                        let trimmed = if strip_tabs {
1545                            current_line.trim_start_matches('\t')
1546                        } else {
1547                            &current_line
1548                        };
1549                        if trimmed == delimiter {
1550                            // Found end of here-doc
1551                            break;
1552                        }
1553                        // Add line to content (including empty lines)
1554                        content.push_str(&current_line);
1555                        content.push('\n');
1556                        current_line.clear();
1557                    }
1558                    Some('\r') => {
1559                        i += 1;
1560                        pos += 1;
1561                        // Detect CRLF vs bare CR. We strip the line ending
1562                        // for delimiter matching (so `EOF\r` still matches
1563                        // `EOF`) but preserve the original byte sequence in
1564                        // the body content — the user's input is honored
1565                        // verbatim.
1566                        let crlf = chars_vec.get(i) == Some(&'\n');
1567                        if crlf {
1568                            i += 1;
1569                            pos += 1;
1570                        }
1571                        let trimmed = if strip_tabs {
1572                            current_line.trim_start_matches('\t')
1573                        } else {
1574                            &current_line
1575                        };
1576                        if trimmed == delimiter {
1577                            break;
1578                        }
1579                        content.push_str(&current_line);
1580                        content.push_str(if crlf { "\r\n" } else { "\r" });
1581                        current_line.clear();
1582                    }
1583                    Some(c) => {
1584                        current_line.push(c);
1585                        i += 1;
1586                        pos += c.len_utf8();
1587                    }
1588                    None => {
1589                        // EOF — check if current line is the delimiter (matches
1590                        // when the source ends without a trailing newline).
1591                        let trimmed = if strip_tabs {
1592                            current_line.trim_start_matches('\t')
1593                        } else {
1594                            &current_line
1595                        };
1596                        if trimmed == delimiter {
1597                            break;
1598                        }
1599                        // Not a delimiter — the heredoc was never closed.
1600                        // Crash rather than silently using whatever we
1601                        // collected: missing data is exactly the failure
1602                        // mode where silent fallback masks the bug.
1603                        let span_end = introducer_start
1604                            + 2
1605                            + if strip_tabs { 1 } else { 0 }
1606                            + delimiter.len();
1607                        return Err(Spanned::new(
1608                            LexerError::UnterminatedHeredoc {
1609                                delimiter: delimiter.clone(),
1610                            },
1611                            introducer_start..span_end,
1612                        ));
1613                    }
1614                }
1615            }
1616
1617            // Create a unique marker for this here-doc (collision-resistant)
1618            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
1619            heredocs.push(HeredocReplacement {
1620                marker: marker.clone(),
1621                body: content,
1622                literal: quoted,
1623                strip_tabs,
1624                body_start_offset,
1625            });
1626
1627            // Output <<marker first, then any text that followed the delimiter
1628            // (e.g., " | jq") so the heredoc attaches to the correct command.
1629            result.push_str("<<");
1630            result.push_str(&marker);
1631            result.push_str(&after_delimiter);
1632            result.push('\n');
1633        } else {
1634            result.push(ch);
1635            i += 1;
1636            pos += ch.len_utf8();
1637        }
1638    }
1639
1640    Ok((result, heredocs))
1641}
1642
1643/// Extract the text contribution of a token for colon-adjacent merging.
1644///
1645/// Returns `Some(text)` for token types that can participate in word-like
1646/// merging, `None` for everything else.
1647fn mergeable_text(token: &Token) -> Option<String> {
1648    match token {
1649        Token::Ident(s) => Some(s.clone()),
1650        Token::NumberIdent(s) => Some(s.clone()),
1651        Token::DottedIdent(s) => Some(s.clone()),
1652        Token::Colon => Some(":".to_string()),
1653        Token::Int(n) => Some(n.to_string()),
1654        Token::Path(p) => Some(p.clone()),
1655        Token::Float(f) => Some(f.to_string()),
1656        _ => None,
1657    }
1658}
1659
1660/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1661///
1662/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1663/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1664///
1665/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1666/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1667/// colons or standalone tokens pass through unchanged.
1668fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1669    if tokens.is_empty() {
1670        return tokens;
1671    }
1672
1673    let mut result = Vec::with_capacity(tokens.len());
1674    let mut run: Vec<&Spanned<Token>> = Vec::new();
1675
1676    for token in &tokens {
1677        if run.is_empty() {
1678            if mergeable_text(&token.token).is_some() {
1679                run.push(token);
1680            } else {
1681                result.push(token.clone());
1682            }
1683            continue;
1684        }
1685
1686        // Check span adjacency: previous run's last token ends where this one starts
1687        // Safety: run is non-empty (checked above)
1688        let Some(last) = run.last() else { unreachable!() };
1689        let adjacent = last.span.end == token.span.start;
1690
1691        if adjacent && mergeable_text(&token.token).is_some() {
1692            run.push(token);
1693        } else {
1694            flush_colon_run(&mut run, &mut result);
1695            if mergeable_text(&token.token).is_some() {
1696                run.push(token);
1697            } else {
1698                result.push(token.clone());
1699            }
1700        }
1701    }
1702
1703    flush_colon_run(&mut run, &mut result);
1704
1705    result
1706}
1707
1708/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1709fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1710    if run.is_empty() {
1711        return;
1712    }
1713
1714    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1715
1716    if run.len() >= 2 && has_colon {
1717        let text: String = run
1718            .iter()
1719            .filter_map(|t| mergeable_text(&t.token))
1720            .collect();
1721        // Safety: run.len() >= 2 so first/last exist
1722        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1723        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1724        result.push(Spanned::new(Token::Ident(text), start..end));
1725    } else {
1726        for t in run.iter() {
1727            result.push((*t).clone());
1728        }
1729    }
1730
1731    run.clear();
1732}
1733
1734/// Extract the text contribution of a token that can participate in a glob word.
1735///
1736/// Returns `Some(text)` for tokens that can be part of a glob pattern (identifiers,
1737/// wildcard chars, brackets, paths, etc.), `None` for structural tokens.
1738fn glob_mergeable_text(token: &Token) -> Option<String> {
1739    match token {
1740        Token::Star => Some("*".to_string()),
1741        Token::Question => Some("?".to_string()),
1742        Token::Dot => Some(".".to_string()),
1743        Token::DotDot => Some("..".to_string()),
1744        Token::Ident(s) => Some(s.clone()),
1745        Token::NumberIdent(s) => Some(s.clone()),
1746        Token::DottedIdent(s) => Some(s.clone()),
1747        Token::Path(s) => Some(s.clone()),
1748        Token::Int(n) => Some(n.to_string()),
1749        Token::LBracket => Some("[".to_string()),
1750        Token::RBracket => Some("]".to_string()),
1751        Token::Bang => Some("!".to_string()),
1752        Token::DotSlashPath(s) => Some(s.clone()),
1753        Token::RelativePath(s) => Some(s.clone()),
1754        Token::TildePath(s) => Some(s.clone()),
1755        Token::Tilde => Some("~".to_string()),
1756        Token::LBrace => Some("{".to_string()),
1757        Token::RBrace => Some("}".to_string()),
1758        Token::Comma => Some(",".to_string()),
1759        _ => None,
1760    }
1761}
1762
1763/// Merge span-adjacent token runs containing glob metacharacters into `GlobWord` tokens.
1764///
1765/// A run is merged into `GlobWord` when it contains at least one `Star`, `Question`,
1766/// or a `LBracket`+`RBracket` pair. Runs without glob chars pass through unchanged.
1767///
1768/// Runs after colon merge: `foo::bar` stays as `Ident("foo::bar")` because colon merge
1769/// already fused it before this pass sees it.
1770fn merge_glob_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1771    if tokens.is_empty() {
1772        return tokens;
1773    }
1774
1775    let mut result = Vec::with_capacity(tokens.len());
1776    let mut run: Vec<&Spanned<Token>> = Vec::new();
1777
1778    for token in &tokens {
1779        if run.is_empty() {
1780            if glob_mergeable_text(&token.token).is_some() {
1781                run.push(token);
1782            } else {
1783                result.push(token.clone());
1784            }
1785            continue;
1786        }
1787
1788        // Safety: run is non-empty (checked at top of loop)
1789        let Some(last) = run.last() else { unreachable!() };
1790        let adjacent = last.span.end == token.span.start;
1791
1792        if adjacent && glob_mergeable_text(&token.token).is_some() {
1793            run.push(token);
1794        } else {
1795            flush_glob_run(&mut run, &mut result);
1796            if glob_mergeable_text(&token.token).is_some() {
1797                run.push(token);
1798            } else {
1799                result.push(token.clone());
1800            }
1801        }
1802    }
1803
1804    flush_glob_run(&mut run, &mut result);
1805
1806    result
1807}
1808
1809/// Flush a run of glob-mergeable tokens: merge if it contains glob metacharacters.
1810fn flush_glob_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1811    if run.is_empty() {
1812        return;
1813    }
1814
1815    let has_glob = run.iter().any(|t| {
1816        matches!(t.token, Token::Star | Token::Question)
1817    }) || (run.iter().any(|t| matches!(t.token, Token::LBracket))
1818        && run.iter().any(|t| matches!(t.token, Token::RBracket)));
1819
1820    if run.len() >= 2 && has_glob {
1821        let text: String = run
1822            .iter()
1823            .filter_map(|t| glob_mergeable_text(&t.token))
1824            .collect();
1825        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1826        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1827        result.push(Spanned::new(Token::GlobWord(text), start..end));
1828    } else {
1829        for t in run.iter() {
1830            result.push((*t).clone());
1831        }
1832    }
1833
1834    run.clear();
1835}
1836
1837/// Tokenize source code into a vector of spanned tokens.
1838///
1839/// Skips whitespace and comments (unless you need them for formatting).
1840/// Returns errors with their positions for nice error messages.
1841///
1842/// Handles:
1843/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
1844/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
1845/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
1846pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1847    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
1848    let arith_result = preprocess_arithmetic(source)
1849        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;
1850
1851    // Then preprocess here-docs. Spans inside the heredoc preprocessor are in
1852    // arith-preprocessed coords; correct back to original-source coords before
1853    // surfacing the error to keep parser diagnostics aligned with source.
1854    let span_replacements = arith_result.replacements;
1855    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text)
1856        .map_err(|e| {
1857            let span = correct_span(e.span, &span_replacements);
1858            vec![Spanned::new(e.token, span)]
1859        })?;
1860
1861    let lexer = Token::lexer(&preprocessed);
1862    let mut tokens = Vec::new();
1863    let mut errors = Vec::new();
1864
1865    for (result, span) in lexer.spanned() {
1866        // Correct the span from preprocessed coordinates to original coordinates
1867        let corrected_span = correct_span(span, &span_replacements);
1868        match result {
1869            Ok(token) => {
1870                // Skip comments and line continuations - they're not needed for parsing
1871                if !matches!(token, Token::Comment | Token::LineContinuation) {
1872                    tokens.push(Spanned::new(token, corrected_span));
1873                }
1874            }
1875            Err(err) => {
1876                errors.push(Spanned::new(err, corrected_span));
1877            }
1878        }
1879    }
1880
1881    if !errors.is_empty() {
1882        return Err(errors);
1883    }
1884
1885    // Post-process: replace markers with actual token content
1886    let mut final_tokens = Vec::with_capacity(tokens.len());
1887    let mut i = 0;
1888
1889    while i < tokens.len() {
1890        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
1891        if let Token::Ident(ref name) = tokens[i].token
1892            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
1893                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
1894                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
1895                    i += 1;
1896                    continue;
1897                }
1898
1899        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
1900        if matches!(tokens[i].token, Token::HereDocStart) {
1901            // Check if next token is a heredoc marker
1902            if i + 1 < tokens.len()
1903                && let Token::Ident(ref name) = tokens[i + 1].token
1904                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
1905                        // Find the corresponding content
1906                        if let Some(hd) = heredocs.iter().find(|h| h.marker == *name) {
1907                            // Re-thread arithmetic markers that the arith
1908                            // preprocessor planted in the source — without
1909                            // this, `<<EOF\n$((1+2))\nEOF` materializes the
1910                            // marker text instead of `3`. Mirrors the
1911                            // String-content translation a few lines below.
1912                            // - Literal heredocs (no expansion): restore the
1913                            //   original `$((expr))` text verbatim.
1914                            // - Interpolated heredocs: wrap as
1915                            //   `${__ARITH:expr__}` so the spanned
1916                            //   interpolation parser turns it into a
1917                            //   StringPart::Arithmetic.
1918                            let mut content = hd.body.clone();
1919                            for (marker, expr) in &arith_result.arithmetics {
1920                                if content.contains(marker) {
1921                                    let replacement = if hd.literal {
1922                                        format!("$(({}))", expr)
1923                                    } else {
1924                                        format!("${{__ARITH:{}__}}", expr)
1925                                    };
1926                                    content = content.replace(marker, &replacement);
1927                                }
1928                            }
1929                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
1930                            final_tokens.push(Spanned::new(
1931                                Token::HereDoc(HereDocData {
1932                                    content,
1933                                    literal: hd.literal,
1934                                    strip_tabs: hd.strip_tabs,
1935                                    body_start_offset: hd.body_start_offset,
1936                                }),
1937                                tokens[i + 1].span.clone(),
1938                            ));
1939                            i += 2;
1940                            continue;
1941                        }
1942                    }
1943        }
1944
1945        // Check for arithmetic markers inside string content
1946        let token = if let Token::String(ref s) = tokens[i].token {
1947            // Check if string contains any arithmetic markers
1948            let mut new_content = s.clone();
1949            for (marker, expr) in &arith_result.arithmetics {
1950                if new_content.contains(marker) {
1951                    // Replace marker with the special format that parse_interpolated_string can detect
1952                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
1953                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
1954                }
1955            }
1956            if new_content != *s {
1957                Spanned::new(Token::String(new_content), tokens[i].span.clone())
1958            } else {
1959                tokens[i].clone()
1960            }
1961        } else {
1962            tokens[i].clone()
1963        };
1964        final_tokens.push(token);
1965        i += 1;
1966    }
1967
1968    Ok(merge_glob_adjacent(merge_colon_adjacent(final_tokens)))
1969}
1970
1971/// Tokenize source code, preserving comments.
1972///
1973/// Useful for pretty-printing or formatting tools that need to preserve comments.
1974pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1975    let lexer = Token::lexer(source);
1976    let mut tokens = Vec::new();
1977    let mut errors = Vec::new();
1978
1979    for (result, span) in lexer.spanned() {
1980        match result {
1981            Ok(token) => {
1982                tokens.push(Spanned::new(token, span));
1983            }
1984            Err(err) => {
1985                errors.push(Spanned::new(err, span));
1986            }
1987        }
1988    }
1989
1990    if errors.is_empty() {
1991        Ok(tokens)
1992    } else {
1993        Err(errors)
1994    }
1995}
1996
1997/// Extract the string content from a string token (removes quotes, processes escapes).
1998pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1999    // Remove surrounding quotes
2000    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
2001        return Err(LexerError::UnterminatedString);
2002    }
2003
2004    let inner = &source[1..source.len() - 1];
2005    let mut result = String::with_capacity(inner.len());
2006    let mut chars = inner.chars().peekable();
2007
2008    while let Some(ch) = chars.next() {
2009        if ch == '\\' {
2010            match chars.next() {
2011                Some('n') => result.push('\n'),
2012                Some('t') => result.push('\t'),
2013                Some('r') => result.push('\r'),
2014                Some('\\') => result.push('\\'),
2015                Some('"') => result.push('"'),
2016                // Use a unique marker for escaped dollar that won't be re-interpreted
2017                // parse_interpolated_string will convert this back to $
2018                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
2019                Some('u') => {
2020                    // Unicode escape: \uXXXX
2021                    let mut hex = String::with_capacity(4);
2022                    for _ in 0..4 {
2023                        match chars.next() {
2024                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
2025                            _ => return Err(LexerError::InvalidEscape),
2026                        }
2027                    }
2028                    let codepoint = u32::from_str_radix(&hex, 16)
2029                        .map_err(|_| LexerError::InvalidEscape)?;
2030                    let ch = char::from_u32(codepoint)
2031                        .ok_or(LexerError::InvalidEscape)?;
2032                    result.push(ch);
2033                }
2034                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
2035                Some(next) => {
2036                    result.push('\\');
2037                    result.push(next);
2038                }
2039                None => return Err(LexerError::InvalidEscape),
2040            }
2041        } else {
2042            result.push(ch);
2043        }
2044    }
2045
2046    Ok(result)
2047}
2048
2049/// Parse a variable reference, extracting the path segments.
2050/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
2051pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
2052    // Remove ${ and }
2053    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
2054        return Err(LexerError::UnterminatedVarRef);
2055    }
2056
2057    let inner = &source[2..source.len() - 1];
2058
2059    // Special case: $? (last result)
2060    if inner == "?" {
2061        return Ok(vec!["?".to_string()]);
2062    }
2063
2064    let mut segments = Vec::new();
2065    let mut current = String::new();
2066    let mut chars = inner.chars().peekable();
2067
2068    while let Some(ch) = chars.next() {
2069        match ch {
2070            '.' => {
2071                if !current.is_empty() {
2072                    segments.push(current.clone());
2073                    current.clear();
2074                }
2075            }
2076            '[' => {
2077                if !current.is_empty() {
2078                    segments.push(current.clone());
2079                    current.clear();
2080                }
2081                // Collect the index
2082                let mut index = String::from("[");
2083                while let Some(&c) = chars.peek() {
2084                    if let Some(c) = chars.next() {
2085                        index.push(c);
2086                    }
2087                    if c == ']' {
2088                        break;
2089                    }
2090                }
2091                segments.push(index);
2092            }
2093            _ => {
2094                current.push(ch);
2095            }
2096        }
2097    }
2098
2099    if !current.is_empty() {
2100        segments.push(current);
2101    }
2102
2103    Ok(segments)
2104}
2105
2106/// Parse an integer literal.
2107pub fn parse_int(source: &str) -> Result<i64, LexerError> {
2108    source.parse().map_err(|_| LexerError::InvalidNumber)
2109}
2110
2111/// Parse a float literal.
2112pub fn parse_float(source: &str) -> Result<f64, LexerError> {
2113    source.parse().map_err(|_| LexerError::InvalidNumber)
2114}
2115
2116#[cfg(test)]
2117mod tests {
2118    use super::*;
2119
2120    fn lex(source: &str) -> Vec<Token> {
2121        tokenize(source)
2122            .expect("lexer should succeed")
2123            .into_iter()
2124            .map(|s| s.token)
2125            .collect()
2126    }
2127
2128    // ═══════════════════════════════════════════════════════════════════
2129    // Keyword tests
2130    // ═══════════════════════════════════════════════════════════════════
2131
2132    #[test]
2133    fn keywords() {
2134        assert_eq!(lex("set"), vec![Token::Set]);
2135        assert_eq!(lex("if"), vec![Token::If]);
2136        assert_eq!(lex("then"), vec![Token::Then]);
2137        assert_eq!(lex("else"), vec![Token::Else]);
2138        assert_eq!(lex("elif"), vec![Token::Elif]);
2139        assert_eq!(lex("fi"), vec![Token::Fi]);
2140        assert_eq!(lex("for"), vec![Token::For]);
2141        assert_eq!(lex("in"), vec![Token::In]);
2142        assert_eq!(lex("do"), vec![Token::Do]);
2143        assert_eq!(lex("done"), vec![Token::Done]);
2144        assert_eq!(lex("case"), vec![Token::Case]);
2145        assert_eq!(lex("esac"), vec![Token::Esac]);
2146        assert_eq!(lex("function"), vec![Token::Function]);
2147        assert_eq!(lex("true"), vec![Token::True]);
2148        assert_eq!(lex("false"), vec![Token::False]);
2149    }
2150
2151    #[test]
2152    fn double_semicolon() {
2153        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
2154        // In case pattern context
2155        assert_eq!(lex("echo \"hi\";;"), vec![
2156            Token::Ident("echo".to_string()),
2157            Token::String("hi".to_string()),
2158            Token::DoubleSemi,
2159        ]);
2160    }
2161
2162    #[test]
2163    fn type_keywords() {
2164        assert_eq!(lex("string"), vec![Token::TypeString]);
2165        assert_eq!(lex("int"), vec![Token::TypeInt]);
2166        assert_eq!(lex("float"), vec![Token::TypeFloat]);
2167        assert_eq!(lex("bool"), vec![Token::TypeBool]);
2168    }
2169
2170    // ═══════════════════════════════════════════════════════════════════
2171    // Operator tests
2172    // ═══════════════════════════════════════════════════════════════════
2173
2174    #[test]
2175    fn single_char_operators() {
2176        assert_eq!(lex("="), vec![Token::Eq]);
2177        assert_eq!(lex("|"), vec![Token::Pipe]);
2178        assert_eq!(lex("&"), vec![Token::Amp]);
2179        assert_eq!(lex(">"), vec![Token::Gt]);
2180        assert_eq!(lex("<"), vec![Token::Lt]);
2181        assert_eq!(lex(";"), vec![Token::Semi]);
2182        assert_eq!(lex(":"), vec![Token::Colon]);
2183        assert_eq!(lex(","), vec![Token::Comma]);
2184        assert_eq!(lex("."), vec![Token::Dot]);
2185    }
2186
2187    #[test]
2188    fn multi_char_operators() {
2189        assert_eq!(lex("&&"), vec![Token::And]);
2190        assert_eq!(lex("||"), vec![Token::Or]);
2191        assert_eq!(lex("=="), vec![Token::EqEq]);
2192        assert_eq!(lex("!="), vec![Token::NotEq]);
2193        assert_eq!(lex("=~"), vec![Token::Match]);
2194        assert_eq!(lex("!~"), vec![Token::NotMatch]);
2195        assert_eq!(lex(">="), vec![Token::GtEq]);
2196        assert_eq!(lex("<="), vec![Token::LtEq]);
2197        assert_eq!(lex(">>"), vec![Token::GtGt]);
2198        assert_eq!(lex("2>"), vec![Token::Stderr]);
2199        assert_eq!(lex("&>"), vec![Token::Both]);
2200    }
2201
2202    #[test]
2203    fn brackets() {
2204        assert_eq!(lex("{"), vec![Token::LBrace]);
2205        assert_eq!(lex("}"), vec![Token::RBrace]);
2206        assert_eq!(lex("["), vec![Token::LBracket]);
2207        assert_eq!(lex("]"), vec![Token::RBracket]);
2208        assert_eq!(lex("("), vec![Token::LParen]);
2209        assert_eq!(lex(")"), vec![Token::RParen]);
2210    }
2211
2212    // ═══════════════════════════════════════════════════════════════════
2213    // Literal tests
2214    // ═══════════════════════════════════════════════════════════════════
2215
2216    #[test]
2217    fn integers() {
2218        assert_eq!(lex("0"), vec![Token::Int(0)]);
2219        assert_eq!(lex("42"), vec![Token::Int(42)]);
2220        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
2221        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
2222    }
2223
2224    #[test]
2225    fn floats() {
2226        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
2227        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
2228        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
2229    }
2230
2231    #[test]
2232    fn strings() {
2233        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
2234        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
2235        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
2236        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
2237        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
2238    }
2239
2240    #[test]
2241    fn var_refs() {
2242        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
2243        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
2244        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
2245        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
2246    }
2247
2248    // ═══════════════════════════════════════════════════════════════════
2249    // Identifier tests
2250    // ═══════════════════════════════════════════════════════════════════
2251
2252    #[test]
2253    fn identifiers() {
2254        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
2255        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
2256        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
2257        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
2258        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
2259    }
2260
2261    #[test]
2262    fn keyword_prefix_identifiers() {
2263        // Identifiers that start with keywords but aren't keywords
2264        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
2265        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
2266        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
2267        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
2268        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
2269    }
2270
2271    // ═══════════════════════════════════════════════════════════════════
2272    // Statement tests
2273    // ═══════════════════════════════════════════════════════════════════
2274
2275    #[test]
2276    fn assignment() {
2277        assert_eq!(
2278            lex("set X = 5"),
2279            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2280        );
2281    }
2282
2283    #[test]
2284    fn command_simple() {
2285        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
2286        assert_eq!(
2287            lex(r#"echo "hello""#),
2288            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
2289        );
2290    }
2291
2292    #[test]
2293    fn command_with_args() {
2294        assert_eq!(
2295            lex("cmd arg1 arg2"),
2296            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
2297        );
2298    }
2299
2300    #[test]
2301    fn command_with_named_args() {
2302        assert_eq!(
2303            lex("cmd key=value"),
2304            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
2305        );
2306    }
2307
2308    #[test]
2309    fn pipeline() {
2310        assert_eq!(
2311            lex("a | b | c"),
2312            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
2313        );
2314    }
2315
2316    #[test]
2317    fn if_statement() {
2318        assert_eq!(
2319            lex("if true; then echo; fi"),
2320            vec![
2321                Token::If,
2322                Token::True,
2323                Token::Semi,
2324                Token::Then,
2325                Token::Ident("echo".to_string()),
2326                Token::Semi,
2327                Token::Fi
2328            ]
2329        );
2330    }
2331
2332    #[test]
2333    fn for_loop() {
2334        assert_eq!(
2335            lex("for X in items; do echo; done"),
2336            vec![
2337                Token::For,
2338                Token::Ident("X".to_string()),
2339                Token::In,
2340                Token::Ident("items".to_string()),
2341                Token::Semi,
2342                Token::Do,
2343                Token::Ident("echo".to_string()),
2344                Token::Semi,
2345                Token::Done
2346            ]
2347        );
2348    }
2349
2350    // ═══════════════════════════════════════════════════════════════════
2351    // Whitespace and newlines
2352    // ═══════════════════════════════════════════════════════════════════
2353
2354    #[test]
2355    fn whitespace_ignored() {
2356        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
2357    }
2358
2359    #[test]
2360    fn newlines_preserved() {
2361        let tokens = lex("a\nb");
2362        assert_eq!(
2363            tokens,
2364            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2365        );
2366    }
2367
2368    #[test]
2369    fn multiple_newlines() {
2370        let tokens = lex("a\n\n\nb");
2371        assert_eq!(
2372            tokens,
2373            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2374        );
2375    }
2376
2377    // ═══════════════════════════════════════════════════════════════════
2378    // Comments
2379    // ═══════════════════════════════════════════════════════════════════
2380
2381    #[test]
2382    fn comments_skipped() {
2383        assert_eq!(lex("# comment"), vec![]);
2384        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2385        assert_eq!(
2386            lex("a # comment\nb"),
2387            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2388        );
2389    }
2390
2391    #[test]
2392    fn comments_preserved_when_requested() {
2393        let tokens = tokenize_with_comments("a # comment")
2394            .expect("should succeed")
2395            .into_iter()
2396            .map(|s| s.token)
2397            .collect::<Vec<_>>();
2398        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2399    }
2400
2401    // ═══════════════════════════════════════════════════════════════════
2402    // String parsing
2403    // ═══════════════════════════════════════════════════════════════════
2404
2405    #[test]
2406    fn parse_simple_string() {
2407        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2408    }
2409
2410    #[test]
2411    fn parse_string_with_escapes() {
2412        assert_eq!(
2413            parse_string_literal(r#""hello\nworld""#).expect("ok"),
2414            "hello\nworld"
2415        );
2416        assert_eq!(
2417            parse_string_literal(r#""tab\there""#).expect("ok"),
2418            "tab\there"
2419        );
2420        assert_eq!(
2421            parse_string_literal(r#""quote\"here""#).expect("ok"),
2422            "quote\"here"
2423        );
2424    }
2425
2426    #[test]
2427    fn parse_string_with_unicode() {
2428        assert_eq!(
2429            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2430            "emoji ❤"
2431        );
2432    }
2433
2434    #[test]
2435    fn parse_string_with_escaped_dollar() {
2436        // \$ produces a marker that parse_interpolated_string will convert to $
2437        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
2438        assert_eq!(
2439            parse_string_literal(r#""\$VAR""#).expect("ok"),
2440            "__KAISH_ESCAPED_DOLLAR__VAR"
2441        );
2442        assert_eq!(
2443            parse_string_literal(r#""cost: \$100""#).expect("ok"),
2444            "cost: __KAISH_ESCAPED_DOLLAR__100"
2445        );
2446    }
2447
2448    // ═══════════════════════════════════════════════════════════════════
2449    // Variable reference parsing
2450    // ═══════════════════════════════════════════════════════════════════
2451
2452    #[test]
2453    fn parse_simple_var() {
2454        assert_eq!(
2455            parse_var_ref("${X}").expect("ok"),
2456            vec!["X"]
2457        );
2458    }
2459
2460    #[test]
2461    fn parse_var_with_field() {
2462        assert_eq!(
2463            parse_var_ref("${VAR.field}").expect("ok"),
2464            vec!["VAR", "field"]
2465        );
2466    }
2467
2468    #[test]
2469    fn parse_var_with_index() {
2470        assert_eq!(
2471            parse_var_ref("${VAR[0]}").expect("ok"),
2472            vec!["VAR", "[0]"]
2473        );
2474    }
2475
2476    #[test]
2477    fn parse_var_nested() {
2478        assert_eq!(
2479            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2480            vec!["VAR", "field", "[0]", "nested"]
2481        );
2482    }
2483
2484    #[test]
2485    fn parse_last_result() {
2486        assert_eq!(
2487            parse_var_ref("${?}").expect("ok"),
2488            vec!["?"]
2489        );
2490    }
2491
2492    // ═══════════════════════════════════════════════════════════════════
2493    // Number parsing
2494    // ═══════════════════════════════════════════════════════════════════
2495
2496    #[test]
2497    fn parse_integers() {
2498        assert_eq!(parse_int("0").expect("ok"), 0);
2499        assert_eq!(parse_int("42").expect("ok"), 42);
2500        assert_eq!(parse_int("-1").expect("ok"), -1);
2501    }
2502
2503    #[test]
2504    fn parse_floats() {
2505        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2506        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2507    }
2508
2509    // ═══════════════════════════════════════════════════════════════════
2510    // Edge cases and errors
2511    // ═══════════════════════════════════════════════════════════════════
2512
2513    #[test]
2514    fn empty_input() {
2515        assert_eq!(lex(""), vec![]);
2516    }
2517
2518    #[test]
2519    fn only_whitespace() {
2520        assert_eq!(lex("   \t\t   "), vec![]);
2521    }
2522
2523    #[test]
2524    fn json_array() {
2525        assert_eq!(
2526            lex(r#"[1, 2, 3]"#),
2527            vec![
2528                Token::LBracket,
2529                Token::Int(1),
2530                Token::Comma,
2531                Token::Int(2),
2532                Token::Comma,
2533                Token::Int(3),
2534                Token::RBracket
2535            ]
2536        );
2537    }
2538
2539    #[test]
2540    fn json_object() {
2541        assert_eq!(
2542            lex(r#"{"key": "value"}"#),
2543            vec![
2544                Token::LBrace,
2545                Token::String("key".to_string()),
2546                Token::Colon,
2547                Token::String("value".to_string()),
2548                Token::RBrace
2549            ]
2550        );
2551    }
2552
2553    #[test]
2554    fn redirect_operators() {
2555        assert_eq!(
2556            lex("cmd > file"),
2557            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2558        );
2559        assert_eq!(
2560            lex("cmd >> file"),
2561            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2562        );
2563        assert_eq!(
2564            lex("cmd 2> err"),
2565            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2566        );
2567        assert_eq!(
2568            lex("cmd &> all"),
2569            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2570        );
2571    }
2572
2573    #[test]
2574    fn background_job() {
2575        assert_eq!(
2576            lex("cmd &"),
2577            vec![Token::Ident("cmd".to_string()), Token::Amp]
2578        );
2579    }
2580
2581    #[test]
2582    fn command_substitution() {
2583        assert_eq!(
2584            lex("$(cmd)"),
2585            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2586        );
2587        assert_eq!(
2588            lex("$(cmd arg)"),
2589            vec![
2590                Token::CmdSubstStart,
2591                Token::Ident("cmd".to_string()),
2592                Token::Ident("arg".to_string()),
2593                Token::RParen
2594            ]
2595        );
2596        assert_eq!(
2597            lex("$(a | b)"),
2598            vec![
2599                Token::CmdSubstStart,
2600                Token::Ident("a".to_string()),
2601                Token::Pipe,
2602                Token::Ident("b".to_string()),
2603                Token::RParen
2604            ]
2605        );
2606    }
2607
2608    #[test]
2609    fn complex_pipeline() {
2610        assert_eq!(
2611            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2612            vec![
2613                Token::Ident("cat".to_string()),
2614                Token::Ident("file".to_string()),
2615                Token::Pipe,
2616                Token::Ident("grep".to_string()),
2617                Token::Ident("pattern".to_string()),
2618                Token::Eq,
2619                Token::String("foo".to_string()),
2620                Token::Pipe,
2621                Token::Ident("head".to_string()),
2622                Token::Ident("count".to_string()),
2623                Token::Eq,
2624                Token::Int(10),
2625            ]
2626        );
2627    }
2628
2629    // ═══════════════════════════════════════════════════════════════════
2630    // Flag tests
2631    // ═══════════════════════════════════════════════════════════════════
2632
2633    #[test]
2634    fn short_flag() {
2635        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2636        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2637        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2638    }
2639
2640    #[test]
2641    fn short_flag_combined() {
2642        // Combined short flags like -la
2643        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2644        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2645    }
2646
2647    #[test]
2648    fn job_spec_lexes_as_one_token() {
2649        // `%N` is the bash jobspec for wait/kill — used to be a lexer error.
2650        assert_eq!(lex("%1"), vec![Token::JobSpec("%1".to_string())]);
2651        assert_eq!(lex("%12"), vec![Token::JobSpec("%12".to_string())]);
2652        assert_eq!(
2653            lex("wait %1 %2"),
2654            vec![
2655                Token::Ident("wait".to_string()),
2656                Token::JobSpec("%1".to_string()),
2657                Token::JobSpec("%2".to_string()),
2658            ]
2659        );
2660    }
2661
2662    #[test]
2663    fn short_flag_with_internal_hyphens_is_one_token() {
2664        // A dash-word with internal hyphens is ONE shell word, not three
2665        // flags — `-not-a-flag` must not fragment into `-not` `-a` `-flag`.
2666        // (Whether it's a flag or a literal is the binding layer's call.)
2667        assert_eq!(
2668            lex("-not-a-flag"),
2669            vec![Token::ShortFlag("not-a-flag".to_string())]
2670        );
2671        // The two-char terminator `--` is still DoubleDash, and a lone `-`
2672        // is still MinusAlone — the second char must be a letter to start a
2673        // short flag.
2674        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2675        assert_eq!(lex("-"), vec![Token::MinusAlone]);
2676    }
2677
2678    #[test]
2679    fn long_flag() {
2680        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2681        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2682        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2683    }
2684
2685    #[test]
2686    fn double_dash() {
2687        // -- alone marks end of flags
2688        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2689    }
2690
2691    #[test]
2692    fn flags_vs_negative_numbers() {
2693        // -123 should be a negative integer, not a flag
2694        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2695        // -l should be a flag
2696        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2697        // -1a is ambiguous - should be Int(-1) then Ident(a)
2698        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2699        assert_eq!(
2700            lex("-1 a"),
2701            vec![Token::Int(-1), Token::Ident("a".to_string())]
2702        );
2703    }
2704
2705    #[test]
2706    fn command_with_flags() {
2707        assert_eq!(
2708            lex("ls -l"),
2709            vec![
2710                Token::Ident("ls".to_string()),
2711                Token::ShortFlag("l".to_string()),
2712            ]
2713        );
2714        assert_eq!(
2715            lex("git commit -m"),
2716            vec![
2717                Token::Ident("git".to_string()),
2718                Token::Ident("commit".to_string()),
2719                Token::ShortFlag("m".to_string()),
2720            ]
2721        );
2722        assert_eq!(
2723            lex("git push --force"),
2724            vec![
2725                Token::Ident("git".to_string()),
2726                Token::Ident("push".to_string()),
2727                Token::LongFlag("force".to_string()),
2728            ]
2729        );
2730    }
2731
2732    #[test]
2733    fn flag_with_value() {
2734        assert_eq!(
2735            lex(r#"git commit -m "message""#),
2736            vec![
2737                Token::Ident("git".to_string()),
2738                Token::Ident("commit".to_string()),
2739                Token::ShortFlag("m".to_string()),
2740                Token::String("message".to_string()),
2741            ]
2742        );
2743        assert_eq!(
2744            lex(r#"--message="hello""#),
2745            vec![
2746                Token::LongFlag("message".to_string()),
2747                Token::Eq,
2748                Token::String("hello".to_string()),
2749            ]
2750        );
2751    }
2752
2753    #[test]
2754    fn end_of_flags_marker() {
2755        assert_eq!(
2756            lex("git checkout -- file"),
2757            vec![
2758                Token::Ident("git".to_string()),
2759                Token::Ident("checkout".to_string()),
2760                Token::DoubleDash,
2761                Token::Ident("file".to_string()),
2762            ]
2763        );
2764    }
2765
2766    // ═══════════════════════════════════════════════════════════════════
2767    // Bash compatibility tokens
2768    // ═══════════════════════════════════════════════════════════════════
2769
2770    #[test]
2771    fn local_keyword() {
2772        assert_eq!(lex("local"), vec![Token::Local]);
2773        assert_eq!(
2774            lex("local X = 5"),
2775            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2776        );
2777    }
2778
2779    #[test]
2780    fn simple_var_ref() {
2781        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2782        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2783        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2784        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2785    }
2786
2787    #[test]
2788    fn simple_var_ref_in_command() {
2789        assert_eq!(
2790            lex("echo $NAME"),
2791            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2792        );
2793    }
2794
2795    #[test]
2796    fn single_quoted_strings() {
2797        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2798        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2799        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2800        // Single quotes don't process escapes or variables
2801        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2802        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2803    }
2804
2805    #[test]
2806    fn test_brackets() {
2807        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2808        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2809        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2810        assert_eq!(
2811            lex("[[ -f file ]]"),
2812            vec![
2813                Token::LBracket,
2814                Token::LBracket,
2815                Token::ShortFlag("f".to_string()),
2816                Token::Ident("file".to_string()),
2817                Token::RBracket,
2818                Token::RBracket
2819            ]
2820        );
2821    }
2822
2823    #[test]
2824    fn test_expression_syntax() {
2825        assert_eq!(
2826            lex(r#"[[ $X == "value" ]]"#),
2827            vec![
2828                Token::LBracket,
2829                Token::LBracket,
2830                Token::SimpleVarRef("X".to_string()),
2831                Token::EqEq,
2832                Token::String("value".to_string()),
2833                Token::RBracket,
2834                Token::RBracket
2835            ]
2836        );
2837    }
2838
2839    #[test]
2840    fn bash_style_assignment() {
2841        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2842        assert_eq!(
2843            lex(r#"NAME="value""#),
2844            vec![
2845                Token::Ident("NAME".to_string()),
2846                Token::Eq,
2847                Token::String("value".to_string())
2848            ]
2849        );
2850    }
2851
2852    #[test]
2853    fn positional_params() {
2854        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2855        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2856        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2857        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2858        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2859    }
2860
2861    #[test]
2862    fn positional_in_context() {
2863        assert_eq!(
2864            lex("echo $1 $2"),
2865            vec![
2866                Token::Ident("echo".to_string()),
2867                Token::Positional(1),
2868                Token::Positional(2),
2869            ]
2870        );
2871    }
2872
2873    #[test]
2874    fn var_length() {
2875        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2876        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2877        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2878    }
2879
2880    #[test]
2881    fn var_length_in_context() {
2882        assert_eq!(
2883            lex("echo ${#NAME}"),
2884            vec![
2885                Token::Ident("echo".to_string()),
2886                Token::VarLength("NAME".to_string()),
2887            ]
2888        );
2889    }
2890
2891    // ═══════════════════════════════════════════════════════════════════
2892    // Edge case tests: Flag ambiguities
2893    // ═══════════════════════════════════════════════════════════════════
2894
2895    #[test]
2896    fn plus_flag() {
2897        // Plus flags for set +e
2898        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2899        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2900        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2901    }
2902
2903    #[test]
2904    fn set_with_plus_flag() {
2905        assert_eq!(
2906            lex("set +e"),
2907            vec![
2908                Token::Set,
2909                Token::PlusFlag("e".to_string()),
2910            ]
2911        );
2912    }
2913
2914    #[test]
2915    fn set_with_multiple_flags() {
2916        assert_eq!(
2917            lex("set -e -u"),
2918            vec![
2919                Token::Set,
2920                Token::ShortFlag("e".to_string()),
2921                Token::ShortFlag("u".to_string()),
2922            ]
2923        );
2924    }
2925
2926    #[test]
2927    fn flags_vs_negative_numbers_edge_cases() {
2928        // -1a should be negative int followed by ident
2929        assert_eq!(
2930            lex("-1 a"),
2931            vec![Token::Int(-1), Token::Ident("a".to_string())]
2932        );
2933        // -l is a flag
2934        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2935        // -123 is negative number
2936        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2937    }
2938
2939    #[test]
2940    fn single_dash_is_minus_alone() {
2941        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2942        let result = tokenize("-").expect("should lex");
2943        assert_eq!(result.len(), 1);
2944        assert!(matches!(result[0].token, Token::MinusAlone));
2945    }
2946
2947    #[test]
2948    fn plus_bare_for_date_format() {
2949        // `date +%s` - the +%s should be PlusBare
2950        let result = tokenize("+%s").expect("should lex");
2951        assert_eq!(result.len(), 1);
2952        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2953
2954        // `date +%Y-%m-%d` - format string with dashes
2955        let result = tokenize("+%Y-%m-%d").expect("should lex");
2956        assert_eq!(result.len(), 1);
2957        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2958    }
2959
2960    #[test]
2961    fn plus_flag_still_works() {
2962        // `set +e` - should still be PlusFlag
2963        let result = tokenize("+e").expect("should lex");
2964        assert_eq!(result.len(), 1);
2965        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2966    }
2967
2968    #[test]
2969    fn while_keyword_vs_while_loop() {
2970        // 'while' as keyword in loop context
2971        assert_eq!(lex("while"), vec![Token::While]);
2972        // 'while' at start followed by condition
2973        assert_eq!(
2974            lex("while true"),
2975            vec![Token::While, Token::True]
2976        );
2977    }
2978
2979    #[test]
2980    fn control_flow_keywords() {
2981        assert_eq!(lex("break"), vec![Token::Break]);
2982        assert_eq!(lex("continue"), vec![Token::Continue]);
2983        assert_eq!(lex("return"), vec![Token::Return]);
2984        assert_eq!(lex("exit"), vec![Token::Exit]);
2985    }
2986
2987    #[test]
2988    fn control_flow_with_numbers() {
2989        assert_eq!(
2990            lex("break 2"),
2991            vec![Token::Break, Token::Int(2)]
2992        );
2993        assert_eq!(
2994            lex("continue 3"),
2995            vec![Token::Continue, Token::Int(3)]
2996        );
2997        assert_eq!(
2998            lex("exit 1"),
2999            vec![Token::Exit, Token::Int(1)]
3000        );
3001    }
3002
3003    // ═══════════════════════════════════════════════════════════════════
3004    // Here-doc tests
3005    // ═══════════════════════════════════════════════════════════════════
3006
3007    #[test]
3008    fn heredoc_simple() {
3009        let source = "cat <<EOF\nhello\nworld\nEOF";
3010        let tokens = lex(source);
3011        // body_start_offset = byte offset of 'h' in "hello", i.e. just after "cat <<EOF\n"
3012        assert_eq!(tokens, vec![
3013            Token::Ident("cat".to_string()),
3014            Token::HereDocStart,
3015            Token::HereDoc(HereDocData {
3016                content: "hello\nworld\n".to_string(),
3017                literal: false,
3018                strip_tabs: false,
3019                body_start_offset: 10,
3020            }),
3021            Token::Newline,
3022        ]);
3023    }
3024
3025    #[test]
3026    fn heredoc_empty() {
3027        let source = "cat <<EOF\nEOF";
3028        let tokens = lex(source);
3029        assert_eq!(tokens, vec![
3030            Token::Ident("cat".to_string()),
3031            Token::HereDocStart,
3032            Token::HereDoc(HereDocData {
3033                content: "".to_string(),
3034                literal: false,
3035                strip_tabs: false,
3036                body_start_offset: 10,
3037            }),
3038            Token::Newline,
3039        ]);
3040    }
3041
3042    #[test]
3043    fn heredoc_with_special_chars() {
3044        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
3045        let tokens = lex(source);
3046        assert_eq!(tokens, vec![
3047            Token::Ident("cat".to_string()),
3048            Token::HereDocStart,
3049            Token::HereDoc(HereDocData {
3050                content: "$VAR and \"quoted\" 'single'\n".to_string(),
3051                literal: false,
3052                strip_tabs: false,
3053                body_start_offset: 10,
3054            }),
3055            Token::Newline,
3056        ]);
3057    }
3058
3059    #[test]
3060    fn heredoc_multiline() {
3061        let source = "cat <<END\nline1\nline2\nline3\nEND";
3062        let tokens = lex(source);
3063        assert_eq!(tokens, vec![
3064            Token::Ident("cat".to_string()),
3065            Token::HereDocStart,
3066            Token::HereDoc(HereDocData {
3067                content: "line1\nline2\nline3\n".to_string(),
3068                literal: false,
3069                strip_tabs: false,
3070                body_start_offset: 10,
3071            }),
3072            Token::Newline,
3073        ]);
3074    }
3075
3076    #[test]
3077    fn heredoc_in_command() {
3078        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
3079        let tokens = lex(source);
3080        assert_eq!(tokens, vec![
3081            Token::Ident("cat".to_string()),
3082            Token::HereDocStart,
3083            Token::HereDoc(HereDocData {
3084                content: "hello\n".to_string(),
3085                literal: false,
3086                strip_tabs: false,
3087                body_start_offset: 10,
3088            }),
3089            Token::Newline,
3090            Token::Ident("echo".to_string()),
3091            Token::Ident("goodbye".to_string()),
3092        ]);
3093    }
3094
3095    #[test]
3096    fn heredoc_strip_tabs() {
3097        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
3098        let tokens = lex(source);
3099        // Content keeps tabs verbatim — strip_tabs is recorded on the token so
3100        // the interpreter can apply POSIX leading-tab stripping at materialization
3101        // without disturbing source byte offsets used for span tracking.
3102        assert_eq!(tokens, vec![
3103            Token::Ident("cat".to_string()),
3104            Token::HereDocStart,
3105            Token::HereDoc(HereDocData {
3106                content: "\thello\n\tworld\n".to_string(),
3107                literal: false,
3108                strip_tabs: true,
3109                body_start_offset: 11,
3110            }),
3111            Token::Newline,
3112        ]);
3113    }
3114
3115    // ═══════════════════════════════════════════════════════════════════
3116    // Arithmetic expression tests
3117    // ═══════════════════════════════════════════════════════════════════
3118
3119    #[test]
3120    fn arithmetic_simple() {
3121        let source = "$((1 + 2))";
3122        let tokens = lex(source);
3123        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
3124    }
3125
3126    #[test]
3127    fn arithmetic_in_assignment() {
3128        let source = "X=$((5 * 3))";
3129        let tokens = lex(source);
3130        assert_eq!(tokens, vec![
3131            Token::Ident("X".to_string()),
3132            Token::Eq,
3133            Token::Arithmetic("5 * 3".to_string()),
3134        ]);
3135    }
3136
3137    #[test]
3138    fn arithmetic_with_nested_parens() {
3139        let source = "$((2 * (3 + 4)))";
3140        let tokens = lex(source);
3141        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
3142    }
3143
3144    #[test]
3145    fn arithmetic_with_variable() {
3146        let source = "$((X + 1))";
3147        let tokens = lex(source);
3148        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
3149    }
3150
3151    #[test]
3152    fn arithmetic_command_subst_not_confused() {
3153        // $( should not be treated as arithmetic
3154        let source = "$(echo hello)";
3155        let tokens = lex(source);
3156        assert_eq!(tokens, vec![
3157            Token::CmdSubstStart,
3158            Token::Ident("echo".to_string()),
3159            Token::Ident("hello".to_string()),
3160            Token::RParen,
3161        ]);
3162    }
3163
3164    #[test]
3165    fn arithmetic_nesting_limit() {
3166        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
3167        let open_parens = "(".repeat(300);
3168        let close_parens = ")".repeat(300);
3169        let source = format!("$(({}1{}))", open_parens, close_parens);
3170        let result = tokenize(&source);
3171        assert!(result.is_err());
3172        let errors = result.unwrap_err();
3173        assert_eq!(errors.len(), 1);
3174        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
3175    }
3176
3177    #[test]
3178    fn arithmetic_nesting_within_limit() {
3179        // Nesting within limit should work
3180        let source = "$((((1 + 2) * 3)))";
3181        let tokens = lex(source);
3182        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
3183    }
3184
3185    // ═══════════════════════════════════════════════════════════════════
3186    // Arithmetic preprocessor + comment interaction
3187    //
3188    // The preprocessor used to walk raw characters tracking only quote
3189    // state. An apostrophe inside a `#` comment would open single-quote
3190    // mode and swallow real `$((..))` later in the file; `$((..))` *inside*
3191    // a comment would itself be preprocessed into a marker, misplacing
3192    // tokens. Surfaced from kaijutsu's seed scripts (see gotcha memory
3193    // `gotcha-kaish-comment-arithmetic`).
3194    // ═══════════════════════════════════════════════════════════════════
3195
3196    #[test]
3197    fn arithmetic_after_apostrophe_in_comment() {
3198        // The bare apostrophe in "doesn't" used to open single-quote mode
3199        // in the preprocessor and swallow the $((..)) below.
3200        let source = "# this doesn't work\necho $((1+2))";
3201        let tokens = lex(source);
3202        assert_eq!(tokens, vec![
3203            Token::Newline,
3204            Token::Ident("echo".to_string()),
3205            Token::Arithmetic("1+2".to_string()),
3206        ]);
3207    }
3208
3209    #[test]
3210    fn arithmetic_inside_comment_is_not_expanded() {
3211        // `$((y))` inside a `#` comment must stay comment text.
3212        let source = "# the $((y)) syntax explained\necho hello";
3213        let tokens = lex(source);
3214        assert_eq!(tokens, vec![
3215            Token::Newline,
3216            Token::Ident("echo".to_string()),
3217            Token::Ident("hello".to_string()),
3218        ]);
3219    }
3220
3221    #[test]
3222    fn backticked_arithmetic_in_comment_is_not_expanded() {
3223        // The original kaijutsu repro: `$((x))` inside a comment.
3224        // Backticks-in-comments used to leak the inner $((..)) to the
3225        // preprocessor; with comment-skip they stay inert.
3226        let source = "# the `$((x))` syntax explained\necho $((3+4))";
3227        let tokens = lex(source);
3228        assert_eq!(tokens, vec![
3229            Token::Newline,
3230            Token::Ident("echo".to_string()),
3231            Token::Arithmetic("3+4".to_string()),
3232        ]);
3233    }
3234
3235    #[test]
3236    fn arithmetic_still_works_outside_comments() {
3237        // Regression guard: comment-skip must not shrink the arithmetic
3238        // preprocessor's scope on normal `$((..))` usages.
3239        let source = "X=$((1+2)); Y=$((3*4))";
3240        let tokens = lex(source);
3241        assert_eq!(tokens, vec![
3242            Token::Ident("X".to_string()),
3243            Token::Eq,
3244            Token::Arithmetic("1+2".to_string()),
3245            Token::Semi,
3246            Token::Ident("Y".to_string()),
3247            Token::Eq,
3248            Token::Arithmetic("3*4".to_string()),
3249        ]);
3250    }
3251
3252    #[test]
3253    fn arithmetic_inside_double_quotes_still_expands() {
3254        // `#` inside a double-quoted string is a literal character, not a
3255        // comment introducer — arithmetic must still expand around it.
3256        let source = "echo \"# $((1+2))\"";
3257        let tokens = lex(source);
3258        // The string token contains the `#` and the arithmetic marker;
3259        // the exact post-processing happens at interpret time. What we
3260        // assert here is that lexing succeeds and produces a String token
3261        // (i.e. the comment skip didn't trigger inside the string).
3262        assert_eq!(tokens.len(), 2);
3263        assert!(matches!(tokens[0], Token::Ident(_)));
3264        assert!(matches!(tokens[1], Token::String(_)));
3265    }
3266
3267    // ═══════════════════════════════════════════════════════════════════
3268    // Backtick rejection
3269    //
3270    // Backticks are an explicitly dropped feature (see CLAUDE.md,
3271    // docs/LANGUAGE.md, help/limits.md, help/overview.md). We surface a
3272    // dedicated error rather than the generic `UnexpectedCharacter` so
3273    // users get a hint to use `$(cmd)`. Comments, single-quoted strings,
3274    // double-quoted strings, and heredoc bodies are all matched as single
3275    // tokens (or extracted before logos runs), so the rejection only
3276    // fires on bare backticks in source code.
3277    // ═══════════════════════════════════════════════════════════════════
3278
3279    #[test]
3280    fn backtick_in_source_is_rejected() {
3281        let result = tokenize("echo `date`");
3282        assert!(result.is_err());
3283        let errors = result.unwrap_err();
3284        assert!(errors.iter().any(|e| e.token == LexerError::BackticksNotSupported));
3285    }
3286
3287    #[test]
3288    fn backtick_in_comment_is_just_comment_text() {
3289        // Backticks are only rejected when they reach the top-level
3290        // lexer. Inside a comment they're part of the comment body.
3291        let source = "# use `date` here\necho hi";
3292        let tokens = lex(source);
3293        assert_eq!(tokens, vec![
3294            Token::Newline,
3295            Token::Ident("echo".to_string()),
3296            Token::Ident("hi".to_string()),
3297        ]);
3298    }
3299
3300    #[test]
3301    fn backtick_in_single_quoted_string_is_literal() {
3302        // Single-quoted strings are matched as one token by logos; the
3303        // backticks inside never reach the rejecting matcher.
3304        let source = "echo '`date`'";
3305        let tokens = lex(source);
3306        assert_eq!(tokens, vec![
3307            Token::Ident("echo".to_string()),
3308            Token::SingleString("`date`".to_string()),
3309        ]);
3310    }
3311
3312    #[test]
3313    fn backtick_in_double_quoted_string_is_literal() {
3314        // Kaish does not activate command substitution from backticks
3315        // inside double-quoted strings either — clear divergence from
3316        // POSIX but matches the "backticks don't exist" stance. The
3317        // double-quoted string token absorbs them as literal characters.
3318        let source = "echo \"`date`\"";
3319        let tokens = lex(source);
3320        assert_eq!(tokens.len(), 2);
3321        assert!(matches!(tokens[0], Token::Ident(_)));
3322        match &tokens[1] {
3323            Token::String(s) => assert!(s.contains('`')),
3324            other => panic!("expected Token::String, got {:?}", other),
3325        }
3326    }
3327
#[test]
fn backtick_in_heredoc_body_is_preserved() {
    // preprocess_heredocs lifts heredoc bodies out of the source before
    // logos runs, so a backtick in the body must survive as content.
    let tokens = lex("cat <<EOF\n`date`\nEOF\n");

    // Pick out the payload of the first HereDoc token, if there is one.
    let payload = tokens.iter().find_map(|tok| match tok {
        Token::HereDoc(data) => Some(data),
        _ => None,
    });

    match payload {
        Some(data) => assert!(data.content.contains('`')),
        None => panic!("expected a HereDoc token"),
    }
}
3340
3341    // ═══════════════════════════════════════════════════════════════════
3342    // Token category tests
3343    // ═══════════════════════════════════════════════════════════════════
3344
#[test]
fn token_categories() {
    // Checks that every token in `tokens` reports `expected` from
    // `Token::category`. The debug form is captured up front so a failure
    // names the offending token.
    fn assert_all(tokens: Vec<Token>, expected: TokenCategory) {
        for token in tokens {
            let label = format!("{token:?}");
            assert_eq!(token.category(), expected, "unexpected category for {label}");
        }
    }

    assert_all(
        vec![
            Token::If,
            Token::Then,
            Token::For,
            Token::Function,
            Token::True,
            Token::TypeString,
        ],
        TokenCategory::Keyword,
    );

    assert_all(
        vec![
            Token::Pipe,
            Token::And,
            Token::Or,
            Token::StderrToStdout,
            Token::GtGt,
        ],
        TokenCategory::Operator,
    );

    assert_all(
        vec![
            Token::String(String::from("test")),
            Token::SingleString(String::from("test")),
            Token::HereDoc(HereDocData {
                content: String::from("test"),
                literal: false,
                strip_tabs: false,
                body_start_offset: 0,
            }),
        ],
        TokenCategory::String,
    );

    assert_all(
        vec![
            Token::Int(42),
            Token::Float(3.14),
            Token::Arithmetic(String::from("1+2")),
        ],
        TokenCategory::Number,
    );

    assert_all(
        vec![
            Token::SimpleVarRef(String::from("X")),
            Token::VarRef(String::from("${X}")),
            Token::Positional(1),
            Token::AllArgs,
            Token::ArgCount,
            Token::LastExitCode,
            Token::CurrentPid,
        ],
        TokenCategory::Variable,
    );

    assert_all(
        vec![
            Token::ShortFlag(String::from("l")),
            Token::LongFlag(String::from("verbose")),
            Token::PlusFlag(String::from("e")),
            Token::DoubleDash,
        ],
        TokenCategory::Flag,
    );

    assert_all(
        vec![
            Token::Semi,
            Token::LParen,
            Token::LBracket,
            Token::Newline,
        ],
        TokenCategory::Punctuation,
    );

    assert_all(vec![Token::Comment], TokenCategory::Comment);

    assert_all(
        vec![Token::Path(String::from("/tmp/file"))],
        TokenCategory::Path,
    );

    assert_all(
        vec![
            Token::Ident(String::from("echo")),
            Token::NumberIdent(String::from("019dda1c")),
            Token::DottedIdent(String::from(".gitignore")),
        ],
        TokenCategory::Command,
    );

    assert_all(
        vec![
            Token::InvalidFloatNoLeading,
            Token::InvalidFloatNoTrailing,
        ],
        TokenCategory::Error,
    );
}
3416
#[test]
fn test_heredoc_piped_to_command() {
    // Bug 4: `cat <<EOF | jq` must tokenize as `cat <<heredoc | jq`,
    // not as `cat | jq <<heredoc`: the heredoc stays ahead of the pipe.
    let spanned = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();

    let heredoc_idx = spanned
        .iter()
        .position(|s| matches!(s.token, Token::HereDoc(_)))
        .expect("should have a heredoc token");
    let pipe_idx = spanned
        .iter()
        .position(|s| matches!(s.token, Token::Pipe))
        .expect("should have a pipe token");

    assert!(
        pipe_idx > heredoc_idx,
        "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
        heredoc_idx, pipe_idx, spanned,
    );
}
3432
#[test]
fn test_heredoc_standalone_still_works() {
    // Regression guard: a heredoc with no pipe after it must still yield
    // a HereDoc token, and no Pipe token may appear.
    let spanned = tokenize("cat <<EOF\nhello\nEOF").unwrap();

    let has_heredoc = spanned
        .iter()
        .any(|s| matches!(s.token, Token::HereDoc(_)));
    let has_pipe = spanned.iter().any(|s| matches!(s.token, Token::Pipe));

    assert!(has_heredoc);
    assert!(!has_pipe);
}
3440
#[test]
fn test_heredoc_preserves_leading_empty_lines() {
    // Bug B: a heredoc body that opens with a blank line must keep it.
    let spanned = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();

    let data = spanned
        .iter()
        .find_map(|s| match &s.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(
        data.content.starts_with('\n'),
        "leading empty line must be preserved, got: {:?}",
        data.content
    );
    assert_eq!(data.content, "\nhello\n");
}
3457
#[test]
fn test_heredoc_quoted_delimiter_sets_literal() {
    // Bug N: a quoted delimiter (<<'EOF') must mark the heredoc as
    // literal, and the body text must come through unchanged.
    let spanned = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();

    let data = spanned
        .iter()
        .find_map(|s| match &s.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(data.literal, "quoted delimiter should set literal=true");
    assert_eq!(data.content, "hello $HOME\n");
}
3474
#[test]
fn test_heredoc_unquoted_delimiter_not_literal() {
    // Bug N: an unquoted delimiter (<<EOF) must leave literal unset.
    let spanned = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();

    let data = spanned
        .iter()
        .find_map(|s| match &s.token {
            Token::HereDoc(data) => Some(data),
            _ => None,
        })
        .expect("should have a heredoc token");

    assert!(!data.literal, "unquoted delimiter should have literal=false");
}
3490
3491    // ═══════════════════════════════════════════════════════════════════
3492    // Colon merge tests
3493    // ═══════════════════════════════════════════════════════════════════
3494
#[test]
fn colon_double_in_word() {
    // `::` inside a word stays part of a single Ident.
    let expected = vec![Token::Ident(String::from("foo::bar"))];
    assert_eq!(lex("foo::bar"), expected);
}
3499
#[test]
fn colon_single_in_word() {
    // Single colons between word segments stay part of one Ident.
    let expected = vec![Token::Ident(String::from("a:b:c"))];
    assert_eq!(lex("a:b:c"), expected);
}
3504
#[test]
fn colon_with_port() {
    // A word followed by `:` and digits comes out as one Ident.
    let expected = vec![Token::Ident(String::from("host:8080"))];
    assert_eq!(lex("host:8080"), expected);
}
3509
#[test]
fn colon_standalone() {
    // A colon with nothing adjacent remains a plain Colon token.
    let expected = vec![Token::Colon];
    assert_eq!(lex(":"), expected);
}
3514
#[test]
fn colon_spaced_no_merge() {
    // Whitespace on both sides of the colon keeps the three tokens apart.
    let expected = vec![
        Token::Ident(String::from("foo")),
        Token::Colon,
        Token::Ident(String::from("bar")),
    ];
    assert_eq!(lex("foo : bar"), expected);
}
3526
#[test]
fn colon_in_command_arg() {
    // The merge also applies to an argument, not just the first word.
    let expected = vec![
        Token::Ident(String::from("echo")),
        Token::Ident(String::from("foo::bar")),
    ];
    assert_eq!(lex("echo foo::bar"), expected);
}
3537
#[test]
fn colon_trailing() {
    // A colon at the end of a word is absorbed into the preceding ident.
    let expected = vec![Token::Ident(String::from("foo:"))];
    assert_eq!(lex("foo:"), expected);
}
3543
#[test]
fn colon_leading() {
    // A colon at the start of a word is absorbed into the following ident.
    let expected = vec![Token::Ident(String::from(":foo"))];
    assert_eq!(lex(":foo"), expected);
}
3549
#[test]
fn colon_with_path() {
    // Input shaped as path + colon + integer is expected to come out as
    // a single Ident carrying the whole text.
    let expected = vec![Token::Ident(String::from("/usr/bin:8080"))];
    assert_eq!(lex("/usr/bin:8080"), expected);
}
3558
3559    // ═══════════════════════════════════════════════════════════════════
3560    // Token predicate coverage (is_keyword / starts_statement)
3561    // ═══════════════════════════════════════════════════════════════════
3562
#[test]
fn is_keyword_covers_control_flow() {
    // Each control-flow token must be reported as a keyword.
    let control_flow = [
        Token::While,
        Token::Return,
        Token::Break,
        Token::Continue,
        Token::Exit,
    ];

    control_flow
        .iter()
        .for_each(|token| assert!(token.is_keyword(), "{token:?} should be a keyword"));
}
3575
#[test]
fn starts_statement_covers_while() {
    // `while` must be recognized as a token that can begin a statement.
    let token = Token::While;
    assert!(token.starts_statement());
}
3580
#[test]
fn is_keyword_rejects_operators() {
    // Operator and punctuation tokens must not be reported as keywords.
    let non_keywords = [Token::Pipe, Token::Amp, Token::Eq, Token::LBrace];

    non_keywords
        .iter()
        .for_each(|token| assert!(!token.is_keyword(), "{token:?} should not be a keyword"));
}
3587}