// kaish_kernel/lexer.rs
//! Lexer for kaish source code.
//!
//! Converts source text into a stream of tokens using the logos lexer generator.
//! The lexer is designed to be unambiguous: every valid input produces exactly
//! one token sequence, and invalid input produces clear errors.
//!
//! # Token Categories
//!
//! - **Keywords**: `set`, `local`, `if`, `then`, `elif`, `else`, `fi`, `for`, `while`, `in`, `do`, `done`, `case`, `esac`, `function`, `break`, `continue`, `return`, `exit`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `|`, `&`, `>`, `>>`, `<`, `2>`, `&>`, `&&`, `||`
//! - **Punctuation**: `;`, `:`, `,`, `.`, `{`, `}`, `[`, `]`
//! - **Variable references**: `${...}` with nested path access
//! - **Identifiers**: command names, variable names, parameter names
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
/// Monotonically increasing within the process; combined with timestamp and
/// PID in `unique_marker_id` to make collisions with user text implausible.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
///
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// `correct_span` consumes a list of these to map lexer spans back to
/// the user's original source.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    let pid = std::process::id();
82    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token with its span in the source text.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    // The token (or other payload) itself.
    pub token: T,
    // Range of the token in the source text (logos span).
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs a token with the span it was lexed from.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Lexer error types.
///
/// `UnexpectedCharacter` is the `Default`, which logos reports when no
/// token pattern matches at all.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// No token pattern matched the input.
    #[default]
    UnexpectedCharacter,
    /// A string literal was opened but never closed.
    UnterminatedString,
    /// A variable reference was opened but never closed.
    UnterminatedVarRef,
    /// An invalid escape sequence.
    InvalidEscape,
    /// A numeric literal that failed to parse.
    InvalidNumber,
    /// Non-lowercase boolean spelling like `TRUE` (payload: the offending word).
    AmbiguousBoolean(String),
    /// `yes`/`no`-style word that must be written as a bool or quoted.
    AmbiguousBooleanLike(String),
    /// An identifier starting with digits, e.g. `123abc`.
    InvalidNumberIdent(String),
    /// A float with no digit before the dot, e.g. `.5`.
    InvalidFloatNoLeading,
    /// A float with no digit after the dot, e.g. `5.`.
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118        match self {
119            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120            LexerError::UnterminatedString => write!(f, "unterminated string"),
121            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123            LexerError::InvalidNumber => write!(f, "invalid number"),
124            LexerError::AmbiguousBoolean(s) => {
125                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126            }
127            LexerError::AmbiguousBooleanLike(s) => {
128                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130            }
131            LexerError::InvalidNumberIdent(s) => {
132                write!(f, "identifier cannot start with digit: {}", s)
133            }
134            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137        }
138    }
139}
140
/// Here-doc content data.
///
/// `literal` is true when the delimiter was quoted (<<'EOF' or <<"EOF"),
/// meaning no variable expansion should occur.
// NOTE(review): the doc comment previously attached here described the `Token`
// enum (variant ordering / logos priority); it belongs on `Token` below.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    // Body of the here-doc (without the delimiter lines).
    pub content: String,
    // True when the delimiter was quoted, suppressing variable expansion.
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162    // ═══════════════════════════════════════════════════════════════════
163    // Keywords (must come before Ident for priority)
164    // ═══════════════════════════════════════════════════════════════════
165    #[token("set")]
166    Set,
167
168    #[token("local")]
169    Local,
170
171    #[token("if")]
172    If,
173
174    #[token("then")]
175    Then,
176
177    #[token("else")]
178    Else,
179
180    #[token("elif")]
181    Elif,
182
183    #[token("fi")]
184    Fi,
185
186    #[token("for")]
187    For,
188
189    #[token("while")]
190    While,
191
192    #[token("in")]
193    In,
194
195    #[token("do")]
196    Do,
197
198    #[token("done")]
199    Done,
200
201    #[token("case")]
202    Case,
203
204    #[token("esac")]
205    Esac,
206
207    #[token("function")]
208    Function,
209
210    #[token("break")]
211    Break,
212
213    #[token("continue")]
214    Continue,
215
216    #[token("return")]
217    Return,
218
219    #[token("exit")]
220    Exit,
221
222    #[token("true")]
223    True,
224
225    #[token("false")]
226    False,
227
228    // ═══════════════════════════════════════════════════════════════════
229    // Type keywords (for tool parameters)
230    // ═══════════════════════════════════════════════════════════════════
231    #[token("string")]
232    TypeString,
233
234    #[token("int")]
235    TypeInt,
236
237    #[token("float")]
238    TypeFloat,
239
240    #[token("bool")]
241    TypeBool,
242
243    // ═══════════════════════════════════════════════════════════════════
244    // Multi-character operators (must come before single-char versions)
245    // ═══════════════════════════════════════════════════════════════════
246    #[token("&&")]
247    And,
248
249    #[token("||")]
250    Or,
251
252    #[token("==")]
253    EqEq,
254
255    #[token("!=")]
256    NotEq,
257
258    #[token("=~")]
259    Match,
260
261    #[token("!~")]
262    NotMatch,
263
264    #[token(">=")]
265    GtEq,
266
267    #[token("<=")]
268    LtEq,
269
270    #[token(">>")]
271    GtGt,
272
273    #[token("2>&1")]
274    StderrToStdout,
275
276    #[token("1>&2")]
277    StdoutToStderr,
278
279    #[token(">&2")]
280    StdoutToStderr2,
281
282    #[token("2>")]
283    Stderr,
284
285    #[token("&>")]
286    Both,
287
288    #[token("<<")]
289    HereDocStart,
290
291    #[token(";;")]
292    DoubleSemi,
293
294    // ═══════════════════════════════════════════════════════════════════
295    // Single-character operators and punctuation
296    // ═══════════════════════════════════════════════════════════════════
297    #[token("=")]
298    Eq,
299
300    #[token("|")]
301    Pipe,
302
303    #[token("&")]
304    Amp,
305
306    #[token(">")]
307    Gt,
308
309    #[token("<")]
310    Lt,
311
312    #[token(";")]
313    Semi,
314
315    #[token(":")]
316    Colon,
317
318    #[token(",")]
319    Comma,
320
321    #[token("..")]
322    DotDot,
323
324    #[token(".")]
325    Dot,
326
327    /// Tilde path: `~/foo`, `~user/bar` - value includes the full string
328    #[regex(r"~[a-zA-Z0-9_./+-]+", lex_tilde_path, priority = 3)]
329    TildePath(String),
330
331    /// Bare tilde: `~` alone (expands to $HOME)
332    #[token("~")]
333    Tilde,
334
335    /// Relative path starting with `../`: `../foo/bar`
336    #[regex(r"\.\./[a-zA-Z0-9_./-]+", lex_relative_path, priority = 3)]
337    RelativePath(String),
338
339    /// Dot-slash path: `./foo`, `./script.sh`
340    #[regex(r"\./[a-zA-Z0-9_./-]+", lex_dot_slash_path, priority = 3)]
341    DotSlashPath(String),
342
343    #[token("{")]
344    LBrace,
345
346    #[token("}")]
347    RBrace,
348
349    #[token("[")]
350    LBracket,
351
352    #[token("]")]
353    RBracket,
354
355    #[token("(")]
356    LParen,
357
358    #[token(")")]
359    RParen,
360
361    #[token("*")]
362    Star,
363
364    #[token("!")]
365    Bang,
366
367    #[token("?")]
368    Question,
369
370    // ═══════════════════════════════════════════════════════════════════
371    // Command substitution
372    // ═══════════════════════════════════════════════════════════════════
373
374    /// Arithmetic expression content: synthesized by preprocessing.
375    /// Contains the expression string between `$((` and `))`.
376    Arithmetic(String),
377
378    /// Command substitution start: `$(` - begins a command substitution
379    #[token("$(")]
380    CmdSubstStart,
381
382    // ═══════════════════════════════════════════════════════════════════
383    // Flags (must come before Int to win over negative numbers)
384    // ═══════════════════════════════════════════════════════════════════
385
386    /// Long flag: `--name` or `--foo-bar`
387    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
388    LongFlag(String),
389
390    /// Short flag: `-l` or `-la` (combined short flags)
391    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
392    ShortFlag(String),
393
394    /// Plus flag: `+e` or `+x` (for set +e to disable options)
395    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
396    PlusFlag(String),
397
398    /// Double dash: `--` alone marks end of flags
399    #[token("--")]
400    DoubleDash,
401
402    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
403    /// For date format strings and similar. Lower priority than PlusFlag.
404    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
405    PlusBare(String),
406
407    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
408    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
409    /// Excludes - after first - to avoid matching --name patterns.
410    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
411    MinusBare(String),
412
413    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
414    /// Only matches when followed by whitespace or end.
415    /// This is handled specially in the parser as a positional arg.
416    #[token("-")]
417    MinusAlone,
418
419    // ═══════════════════════════════════════════════════════════════════
420    // Literals (with values)
421    // ═══════════════════════════════════════════════════════════════════
422
423    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
424    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
425    String(String),
426
427    /// Single-quoted string: `'...'` - literal content, no escape processing
428    #[regex(r"'[^']*'", lex_single_string)]
429    SingleString(String),
430
431    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the raw inner content
432    #[regex(r"\$\{[^}]+\}", lex_varref)]
433    VarRef(String),
434
435    /// Simple variable reference: `$NAME` - just the identifier
436    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
437    SimpleVarRef(String),
438
439    /// Positional parameter: `$0` through `$9`
440    #[regex(r"\$[0-9]", lex_positional)]
441    Positional(usize),
442
443    /// All positional parameters: `$@`
444    #[token("$@")]
445    AllArgs,
446
447    /// Number of positional parameters: `$#`
448    #[token("$#")]
449    ArgCount,
450
451    /// Last exit code: `$?`
452    #[token("$?")]
453    LastExitCode,
454
455    /// Current shell PID: `$$`
456    #[token("$$")]
457    CurrentPid,
458
459    /// Variable string length: `${#VAR}`
460    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
461    VarLength(String),
462
463    /// Here-doc content: synthesized by preprocessing, not directly lexed.
464    /// Contains the full content of the here-doc (without the delimiter lines).
465    HereDoc(HereDocData),
466
467    /// Integer literal - value is the parsed i64
468    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
469    Int(i64),
470
471    /// Float literal - value is the parsed f64
472    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
473    Float(f64),
474
475    // ═══════════════════════════════════════════════════════════════════
476    // Invalid patterns (caught before valid tokens for better errors)
477    // ═══════════════════════════════════════════════════════════════════
478
479    /// Invalid: number followed by identifier characters (like 123abc)
480    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
481    InvalidNumberIdent,
482
483    /// Invalid: float without leading digit (like .5)
484    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
485    InvalidFloatNoLeading,
486
487    /// Invalid: float without trailing digit (like 5.)
488    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
489    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
490    InvalidFloatNoTrailing,
491
492    // ═══════════════════════════════════════════════════════════════════
493    // Paths (absolute paths starting with /)
494    // ═══════════════════════════════════════════════════════════════════
495
496    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
497    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
498    Path(String),
499
500    // ═══════════════════════════════════════════════════════════════════
501    // Identifiers (command names, variable names, etc.)
502    // ═══════════════════════════════════════════════════════════════════
503
504    /// Identifier - value is the identifier string
505    /// Allows dots for filenames like `script.kai`
506    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
507    Ident(String),
508
509    // ═══════════════════════════════════════════════════════════════════
510    // Structural tokens
511    // ═══════════════════════════════════════════════════════════════════
512
513    /// Comment: `# ...` to end of line
514    #[regex(r"#[^\n\r]*", allow_greedy = true)]
515    Comment,
516
517    /// Newline (significant in kaish - ends statements)
518    #[regex(r"\n|\r\n")]
519    Newline,
520
521    /// Line continuation: backslash at end of line
522    #[regex(r"\\[ \t]*(\n|\r\n)")]
523    LineContinuation,
524}
525
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
/// The mapping from `Token` to category lives in [`Token::category`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; , . ( ) { } [ ]
    Punctuation,
    /// Identifiers in command position
    Command,
    /// Absolute paths: /foo/bar
    Path,
    /// Flags: --long, -s, +x
    Flag,
    /// Invalid tokens
    Error,
}
555
impl Token {
    /// Returns the semantic category for syntax highlighting.
    ///
    /// The match is exhaustive (no `_` arm) so adding a `Token` variant
    /// forces a decision here at compile time.
    pub fn category(&self) -> TokenCategory {
        match self {
            // Keywords
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Operators and redirections
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            // Strings
            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Numbers (arithmetic expressions highlight as numeric)
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            // Variables
            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            // Flags
            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            // Punctuation (structural tokens included)
            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            // Comments
            Token::Comment => TokenCategory::Comment,

            // Paths
            Token::Path(_)
            | Token::TildePath(_)
            | Token::RelativePath(_)
            | Token::Tilde
            | Token::DotDot
            | Token::DotSlashPath(_) => TokenCategory::Path,

            // Commands/identifiers (and bare words)
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            // Errors (variants whose callbacks always fail; see note on Display)
            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
674
/// Lex a double-quoted string literal, processing escape sequences.
/// Delegates to `parse_string_literal` (defined elsewhere in this module);
/// the slice still includes the surrounding quotes.
fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
    parse_string_literal(lex.slice())
}
679
680/// Lex a single-quoted string literal (no escape processing).
681fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
682    let s = lex.slice();
683    // Strip the surrounding single quotes
684    s[1..s.len() - 1].to_string()
685}
686
687/// Lex a braced variable reference, extracting the inner content.
688fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
689    // Keep the full ${...} for later parsing of path segments
690    lex.slice().to_string()
691}
692
693/// Lex a simple variable reference: `$NAME` → `NAME`
694fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
695    // Strip the leading `$`
696    lex.slice()[1..].to_string()
697}
698
699/// Lex a positional parameter: `$1` → 1
700fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
701    // Strip the leading `$` and parse the digit
702    lex.slice()[1..].parse().unwrap_or(0)
703}
704
705/// Lex a variable length: `${#VAR}` → "VAR"
706fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
707    // Strip the leading `${#` and trailing `}`
708    let s = lex.slice();
709    s[3..s.len() - 1].to_string()
710}
711
712/// Lex an integer literal.
713fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
714    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
715}
716
717/// Lex a float literal.
718fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
719    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
720}
721
722/// Lex an invalid number-identifier pattern (like 123abc).
723/// Always returns Err to produce a lexer error instead of a token.
724fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
725    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
726}
727
728/// Lex an invalid float without leading digit (like .5).
729/// Always returns Err to produce a lexer error instead of a token.
730fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
731    Err(LexerError::InvalidFloatNoLeading)
732}
733
734/// Lex an invalid float without trailing digit (like 5.).
735/// Always returns Err to produce a lexer error instead of a token.
736fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
737    Err(LexerError::InvalidFloatNoTrailing)
738}
739
740/// Lex an identifier, rejecting ambiguous boolean-like values.
741fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
742    let s = lex.slice();
743
744    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
745    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
746    match s.to_lowercase().as_str() {
747        "true" | "false" if s != "true" && s != "false" => {
748            return Err(LexerError::AmbiguousBoolean(s.to_string()));
749        }
750        _ => {}
751    }
752
753    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
754    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
755        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
756    }
757
758    Ok(s.to_string())
759}
760
761/// Lex a long flag: `--name` → `name`
762fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
763    // Strip the leading `--`
764    lex.slice()[2..].to_string()
765}
766
767/// Lex a short flag: `-l` → `l`, `-la` → `la`
768fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
769    // Strip the leading `-`
770    lex.slice()[1..].to_string()
771}
772
773/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
774fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
775    // Strip the leading `+`
776    lex.slice()[1..].to_string()
777}
778
779/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
780fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
781    lex.slice().to_string()
782}
783
784/// Lex a minus bare word: `-%` → `-%` (keep the full string)
785fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
786    lex.slice().to_string()
787}
788
789/// Lex an absolute path: `/tmp/out` → `/tmp/out`
790fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
791    lex.slice().to_string()
792}
793
794/// Lex a tilde path: `~/foo` → `~/foo`
795fn lex_tilde_path(lex: &mut logos::Lexer<Token>) -> String {
796    lex.slice().to_string()
797}
798
799/// Lex a relative path: `../foo` → `../foo`
800fn lex_relative_path(lex: &mut logos::Lexer<Token>) -> String {
801    lex.slice().to_string()
802}
803
804/// Lex a dot-slash path: `./foo` → `./foo`
805fn lex_dot_slash_path(lex: &mut logos::Lexer<Token>) -> String {
806    lex.slice().to_string()
807}
808
impl fmt::Display for Token {
    /// Debug-friendly rendering: fixed tokens print their source spelling,
    /// value-carrying tokens print an UPPERCASE tag with the payload.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            Token::DotDot => write!(f, ".."),
            Token::Tilde => write!(f, "~"),
            Token::TildePath(s) => write!(f, "{}", s),
            Token::RelativePath(s) => write!(f, "{}", s),
            Token::DotSlashPath(s) => write!(f, "{}", s),
            // Braces are doubled to escape them in the format string.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            // These variants should never be produced - their callbacks always return errors
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
910
911impl Token {
912    /// Returns true if this token is a keyword.
913    pub fn is_keyword(&self) -> bool {
914        matches!(
915            self,
916            Token::Set
917                | Token::Local
918                | Token::If
919                | Token::Then
920                | Token::Else
921                | Token::Elif
922                | Token::Fi
923                | Token::For
924                | Token::In
925                | Token::Do
926                | Token::Done
927                | Token::Case
928                | Token::Esac
929                | Token::Function
930                | Token::True
931                | Token::False
932        )
933    }
934
935    /// Returns true if this token is a type keyword.
936    pub fn is_type(&self) -> bool {
937        matches!(
938            self,
939            Token::TypeString
940                | Token::TypeInt
941                | Token::TypeFloat
942                | Token::TypeBool
943        )
944    }
945
946    /// Returns true if this token starts a statement.
947    pub fn starts_statement(&self) -> bool {
948        matches!(
949            self,
950            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
951        )
952    }
953
954    /// Returns true if this token can appear in an expression.
955    pub fn is_value(&self) -> bool {
956        matches!(
957            self,
958            Token::String(_)
959                | Token::SingleString(_)
960                | Token::HereDoc(_)
961                | Token::Arithmetic(_)
962                | Token::Int(_)
963                | Token::Float(_)
964                | Token::True
965                | Token::False
966                | Token::VarRef(_)
967                | Token::SimpleVarRef(_)
968                | Token::CmdSubstStart
969                | Token::Path(_)
970                | Token::LastExitCode
971                | Token::CurrentPid
972        )
973    }
974}
975
/// Result of preprocessing arithmetic expressions.
///
/// Produced by `preprocess_arithmetic` and consumed by `tokenize`, which
/// later swaps each marker back for a `Token::Arithmetic` carrying the
/// recorded expression text and uses `replacements` to map token spans
/// back to original-source coordinates.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions.
    replacements: Vec<SpanReplacement>,
}
985
/// Copy a `$(...)` command substitution verbatim into `result`.
///
/// Tracks single quotes, double quotes, and backslash escapes inside the
/// substitution so that parentheses appearing within quoted strings never
/// affect the paren depth count.
///
/// Precondition: `chars[*i]` is the `$` of `$(`. Postcondition: `*i` sits
/// just past the matching `)` (or at end of input if unterminated).
fn skip_command_substitution(
    chars: &[char],
    i: &mut usize,
    source_pos: &mut usize,
    result: &mut String,
) {
    // Emit one character and advance both cursors (char index + byte offset).
    fn emit(c: char, i: &mut usize, source_pos: &mut usize, result: &mut String) {
        result.push(c);
        *source_pos += c.len_utf8();
        *i += 1;
    }

    // The leading "$(" is copied unconditionally.
    emit('$', i, source_pos, result);
    emit('(', i, source_pos, result);

    let mut depth: usize = 1;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    while *i < chars.len() && depth > 0 {
        let c = chars[*i];

        if in_single_quote {
            // Everything up to (and including) the closing quote is literal.
            in_single_quote = c != '\'';
            emit(c, i, source_pos, result);
            continue;
        }

        if in_double_quote {
            // Only \" \\ \$ \` are escape pairs inside double quotes.
            if c == '\\' && *i + 1 < chars.len() {
                let next = chars[*i + 1];
                if matches!(next, '"' | '\\' | '$' | '`') {
                    emit(c, i, source_pos, result);
                    emit(next, i, source_pos, result);
                    continue;
                }
            }
            if c == '"' {
                in_double_quote = false;
            }
            emit(c, i, source_pos, result);
            continue;
        }

        // Outside any quote context.
        match c {
            '\'' => {
                in_single_quote = true;
                emit(c, i, source_pos, result);
            }
            '"' => {
                in_double_quote = true;
                emit(c, i, source_pos, result);
            }
            '\\' if *i + 1 < chars.len() => {
                // Copy the escape pair verbatim.
                let next = chars[*i + 1];
                emit(c, i, source_pos, result);
                emit(next, i, source_pos, result);
            }
            '(' => {
                depth += 1;
                emit(c, i, source_pos, result);
            }
            ')' => {
                depth -= 1;
                emit(c, i, source_pos, result);
            }
            _ => emit(c, i, source_pos, result),
        }
    }
}
1083
/// Preprocess arithmetic expressions in source code.
///
/// Finds `$((expr))` patterns and replaces them with markers.
/// Returns the preprocessed source, arithmetic contents, and span replacement info.
///
/// Quote handling: single-quoted text is copied verbatim (no expansion
/// inside), double quotes merely toggle state (arithmetic is still expanded
/// inside them), and `$(...)` command substitutions are skipped whole so
/// any `$((...))` inside them is left for the subcommand's own lexer.
///
/// Example:
///   `X=$((1 + 2))`
/// Becomes:
///   `X=__KAISH_ARITH_{id}__`
/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
///
/// # Errors
/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into the ORIGINAL source; advanced by each char's UTF-8 length.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    // Index into `chars_vec` (char units, not bytes).
    let mut i = 0;

    // Whether we're currently inside double quotes. Single quotes inside
    // double quotes are literal characters, not quote delimiters.
    let mut in_double_quote = false;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Backslash escape outside quotes — skip both chars verbatim
        if !in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            result.push(ch);
            result.push(chars_vec[i + 1]);
            source_pos += ch.len_utf8() + chars_vec[i + 1].len_utf8();
            i += 2;
            continue;
        }

        // Single quote — only starts quote mode when NOT inside double quotes
        if ch == '\'' && !in_double_quote {
            result.push(ch);
            i += 1;
            source_pos += 1; // quote char is ASCII (1 byte)
            // Copy everything up to the closing quote verbatim.
            while i < chars_vec.len() && chars_vec[i] != '\'' {
                result.push(chars_vec[i]);
                source_pos += chars_vec[i].len_utf8();
                i += 1;
            }
            if i < chars_vec.len() {
                result.push(chars_vec[i]); // closing quote
                source_pos += 1;
                i += 1;
            }
            continue;
        }

        // Double quote — toggle state (arithmetic is still expanded inside)
        if ch == '"' {
            in_double_quote = !in_double_quote;
            result.push(ch);
            i += 1;
            source_pos += 1;
            continue;
        }

        // Backslash escape inside double quotes — only \" \\ \$ \` are special
        if in_double_quote && ch == '\\' && i + 1 < chars_vec.len() {
            let next = chars_vec[i + 1];
            if next == '"' || next == '\\' || next == '$' || next == '`' {
                result.push(ch);
                result.push(next);
                source_pos += ch.len_utf8() + next.len_utf8();
                i += 2;
                continue;
            }
        }

        // Skip $(...) command substitutions — inner arithmetic belongs to the subcommand.
        // The third-char check excludes `$((`, which is handled by the branch below.
        if ch == '$' && i + 1 < chars_vec.len() && chars_vec[i + 1] == '('
            && !(i + 2 < chars_vec.len() && chars_vec[i + 2] == '(')
        {
            skip_command_substitution(&chars_vec, &mut i, &mut source_pos, &mut result);
            continue;
        }

        // Look for $(( (potential arithmetic)
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Byte offset in the PREPROCESSED text where the marker will start.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip $(( (three ASCII bytes)
            i += 3;
            source_pos += 3;

            // Collect expression until matching ))
            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner grouping paren, not the $(( itself.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // Found closing ))
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Single ) inside - keep going
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Calculate original length: from $(( to ))
            let original_len = source_pos - original_start;

            // Create a unique marker for this arithmetic (collision-resistant)
            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            // Record the replacement for span correction
            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1248
/// Preprocess here-docs in source code.
///
/// Finds `<<WORD` patterns and collects content until the delimiter line.
/// Returns the preprocessed source and a vector of
/// (marker, content, quoted) triples; `quoted` records whether the delimiter
/// was quoted (stored into `HereDocData::literal` by `tokenize`).
///
/// Example:
///   `cat <<EOF\nhello\nworld\nEOF`
/// Becomes:
///   `cat <<__KAISH_HEREDOC_{id}__`
/// With heredocs[0] = ("__KAISH_HEREDOC_{id}__", "hello\nworld", false)
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        // Look for << (potential here-doc)
        if ch == '<' && chars.peek() == Some(&'<') {
            chars.next(); // consume second <

            // Check for optional - (strip leading tabs, i.e. `<<-`)
            let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip whitespace before delimiter
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Collect the delimiter word. A quoted delimiter (<<'EOF' or
            // <<"EOF") marks the here-doc as literal.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); // consume closing quote
                        break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            if delimiter.is_empty() {
                // Not a valid here-doc, output << literally
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
            // This must be emitted AFTER the heredoc marker, not before.
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Collect content until delimiter on its own line
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        // Check if this line is the delimiter
                        // (with `<<-`, leading tabs are ignored for the comparison)
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found end of here-doc
                            break;
                        }
                        // Add line to content (including empty lines)
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Handle \r\n (content is normalized to \n line endings)
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF - check if current line is the delimiter
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found delimiter at EOF
                            break;
                        }
                        // Not a delimiter - include remaining content
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Remove trailing newline from content (we'll add it when needed)
            let content = content.trim_end_matches('\n').to_string();

            // Create a unique marker for this here-doc (collision-resistant)
            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Output <<marker first, then any text that followed the delimiter
            // (e.g., " | jq") so the heredoc attaches to the correct command.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1413
1414/// Extract the text contribution of a token for colon-adjacent merging.
1415///
1416/// Returns `Some(text)` for token types that can participate in word-like
1417/// merging, `None` for everything else.
1418fn mergeable_text(token: &Token) -> Option<String> {
1419    match token {
1420        Token::Ident(s) => Some(s.clone()),
1421        Token::Colon => Some(":".to_string()),
1422        Token::Int(n) => Some(n.to_string()),
1423        Token::Path(p) => Some(p.clone()),
1424        Token::Float(f) => Some(f.to_string()),
1425        _ => None,
1426    }
1427}
1428
1429/// Merge span-adjacent token runs containing `Token::Colon` into single `Ident` tokens.
1430///
1431/// In bash, `:` is a regular character in unquoted words. kaish tokenizes it
1432/// separately, which breaks Rust paths (`foo::bar`), URLs (`host:8080`), etc.
1433///
1434/// This pass fuses span-adjacent mergeable tokens (Ident, Colon, Int, Path, Float)
1435/// into a single `Ident` when the run contains at least one `Colon`. Runs without
1436/// colons or standalone tokens pass through unchanged.
1437fn merge_colon_adjacent(tokens: Vec<Spanned<Token>>) -> Vec<Spanned<Token>> {
1438    if tokens.is_empty() {
1439        return tokens;
1440    }
1441
1442    let mut result = Vec::with_capacity(tokens.len());
1443    let mut run: Vec<&Spanned<Token>> = Vec::new();
1444
1445    for token in &tokens {
1446        if run.is_empty() {
1447            if mergeable_text(&token.token).is_some() {
1448                run.push(token);
1449            } else {
1450                result.push(token.clone());
1451            }
1452            continue;
1453        }
1454
1455        // Check span adjacency: previous run's last token ends where this one starts
1456        // Safety: run is non-empty (checked above)
1457        let Some(last) = run.last() else { unreachable!() };
1458        let adjacent = last.span.end == token.span.start;
1459
1460        if adjacent && mergeable_text(&token.token).is_some() {
1461            run.push(token);
1462        } else {
1463            flush_colon_run(&mut run, &mut result);
1464            if mergeable_text(&token.token).is_some() {
1465                run.push(token);
1466            } else {
1467                result.push(token.clone());
1468            }
1469        }
1470    }
1471
1472    flush_colon_run(&mut run, &mut result);
1473
1474    result
1475}
1476
1477/// Flush a run of mergeable tokens: merge if it contains a colon, otherwise emit individually.
1478fn flush_colon_run(run: &mut Vec<&Spanned<Token>>, result: &mut Vec<Spanned<Token>>) {
1479    if run.is_empty() {
1480        return;
1481    }
1482
1483    let has_colon = run.iter().any(|t| matches!(t.token, Token::Colon));
1484
1485    if run.len() >= 2 && has_colon {
1486        let text: String = run
1487            .iter()
1488            .filter_map(|t| mergeable_text(&t.token))
1489            .collect();
1490        // Safety: run.len() >= 2 so first/last exist
1491        let start = run.first().map(|t| t.span.start).unwrap_or(0);
1492        let end = run.last().map(|t| t.span.end).unwrap_or(0);
1493        result.push(Spanned::new(Token::Ident(text), start..end));
1494    } else {
1495        for t in run.iter() {
1496            result.push((*t).clone());
1497        }
1498    }
1499
1500    run.clear();
1501}
1502
/// Tokenize source code into a vector of spanned tokens.
///
/// Skips whitespace and comments (unless you need them for formatting).
/// Returns errors with their positions for nice error messages.
///
/// Pipeline:
/// 1. Replace `$((expr))` with unique markers (`preprocess_arithmetic`).
/// 2. Replace here-doc bodies with unique markers (`preprocess_heredocs`).
/// 3. Run the logos lexer over the preprocessed text, correcting spans
///    back to original coordinates as tokens are produced.
/// 4. Swap markers back into `Arithmetic` / `HereDoc` tokens.
/// 5. Merge span-adjacent colon runs into single `Ident`s.
///
/// Handles:
/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
/// - Colon merge: span-adjacent `foo::bar` becomes `Ident("foo::bar")`
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // Preprocess arithmetic first (before heredocs, because heredoc content
    // might contain `$((`).
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    // Then preprocess here-docs (heredoc span tracking is not implemented for simplicity)
    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    // Combine replacements for span correction (arithmetic only for now)
    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Correct the span from preprocessed coordinates to original coordinates
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                // Skip comments and line continuations - they're not needed for parsing
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Post-process: replace markers with actual token content
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__).
        // The lexer saw the marker as a plain identifier; swap in the
        // recorded expression as a Token::Arithmetic.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
                    i += 1;
                    continue;
                }

        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
        if matches!(tokens[i].token, Token::HereDocStart) {
            // Check if next token is a heredoc marker
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                        // Find the corresponding content
                        if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                            final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                            i += 2;
                            continue;
                        }
                    }
        }

        // Check for arithmetic markers inside string content
        // (a marker can end up embedded in a double-quoted string literal)
        let token = if let Token::String(ref s) = tokens[i].token {
            // Check if string contains any arithmetic markers
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    // Replace marker with the special format that parse_interpolated_string can detect
                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(merge_colon_adjacent(final_tokens))
}
1602
1603/// Tokenize source code, preserving comments.
1604///
1605/// Useful for pretty-printing or formatting tools that need to preserve comments.
1606pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1607    let lexer = Token::lexer(source);
1608    let mut tokens = Vec::new();
1609    let mut errors = Vec::new();
1610
1611    for (result, span) in lexer.spanned() {
1612        match result {
1613            Ok(token) => {
1614                tokens.push(Spanned::new(token, span));
1615            }
1616            Err(err) => {
1617                errors.push(Spanned::new(err, span));
1618            }
1619        }
1620    }
1621
1622    if errors.is_empty() {
1623        Ok(tokens)
1624    } else {
1625        Err(errors)
1626    }
1627}
1628
1629/// Extract the string content from a string token (removes quotes, processes escapes).
1630pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1631    // Remove surrounding quotes
1632    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1633        return Err(LexerError::UnterminatedString);
1634    }
1635
1636    let inner = &source[1..source.len() - 1];
1637    let mut result = String::with_capacity(inner.len());
1638    let mut chars = inner.chars().peekable();
1639
1640    while let Some(ch) = chars.next() {
1641        if ch == '\\' {
1642            match chars.next() {
1643                Some('n') => result.push('\n'),
1644                Some('t') => result.push('\t'),
1645                Some('r') => result.push('\r'),
1646                Some('\\') => result.push('\\'),
1647                Some('"') => result.push('"'),
1648                // Use a unique marker for escaped dollar that won't be re-interpreted
1649                // parse_interpolated_string will convert this back to $
1650                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1651                Some('u') => {
1652                    // Unicode escape: \uXXXX
1653                    let mut hex = String::with_capacity(4);
1654                    for _ in 0..4 {
1655                        match chars.next() {
1656                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1657                            _ => return Err(LexerError::InvalidEscape),
1658                        }
1659                    }
1660                    let codepoint = u32::from_str_radix(&hex, 16)
1661                        .map_err(|_| LexerError::InvalidEscape)?;
1662                    let ch = char::from_u32(codepoint)
1663                        .ok_or(LexerError::InvalidEscape)?;
1664                    result.push(ch);
1665                }
1666                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1667                Some(next) => {
1668                    result.push('\\');
1669                    result.push(next);
1670                }
1671                None => return Err(LexerError::InvalidEscape),
1672            }
1673        } else {
1674            result.push(ch);
1675        }
1676    }
1677
1678    Ok(result)
1679}
1680
1681/// Parse a variable reference, extracting the path segments.
1682/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1683pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1684    // Remove ${ and }
1685    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1686        return Err(LexerError::UnterminatedVarRef);
1687    }
1688
1689    let inner = &source[2..source.len() - 1];
1690
1691    // Special case: $? (last result)
1692    if inner == "?" {
1693        return Ok(vec!["?".to_string()]);
1694    }
1695
1696    let mut segments = Vec::new();
1697    let mut current = String::new();
1698    let mut chars = inner.chars().peekable();
1699
1700    while let Some(ch) = chars.next() {
1701        match ch {
1702            '.' => {
1703                if !current.is_empty() {
1704                    segments.push(current.clone());
1705                    current.clear();
1706                }
1707            }
1708            '[' => {
1709                if !current.is_empty() {
1710                    segments.push(current.clone());
1711                    current.clear();
1712                }
1713                // Collect the index
1714                let mut index = String::from("[");
1715                while let Some(&c) = chars.peek() {
1716                    if let Some(c) = chars.next() {
1717                        index.push(c);
1718                    }
1719                    if c == ']' {
1720                        break;
1721                    }
1722                }
1723                segments.push(index);
1724            }
1725            _ => {
1726                current.push(ch);
1727            }
1728        }
1729    }
1730
1731    if !current.is_empty() {
1732        segments.push(current);
1733    }
1734
1735    Ok(segments)
1736}
1737
1738/// Parse an integer literal.
1739pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1740    source.parse().map_err(|_| LexerError::InvalidNumber)
1741}
1742
1743/// Parse a float literal.
1744pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1745    source.parse().map_err(|_| LexerError::InvalidNumber)
1746}
1747
1748#[cfg(test)]
1749mod tests {
1750    use super::*;
1751
1752    fn lex(source: &str) -> Vec<Token> {
1753        tokenize(source)
1754            .expect("lexer should succeed")
1755            .into_iter()
1756            .map(|s| s.token)
1757            .collect()
1758    }
1759
1760    // ═══════════════════════════════════════════════════════════════════
1761    // Keyword tests
1762    // ═══════════════════════════════════════════════════════════════════
1763
1764    #[test]
1765    fn keywords() {
1766        assert_eq!(lex("set"), vec![Token::Set]);
1767        assert_eq!(lex("if"), vec![Token::If]);
1768        assert_eq!(lex("then"), vec![Token::Then]);
1769        assert_eq!(lex("else"), vec![Token::Else]);
1770        assert_eq!(lex("elif"), vec![Token::Elif]);
1771        assert_eq!(lex("fi"), vec![Token::Fi]);
1772        assert_eq!(lex("for"), vec![Token::For]);
1773        assert_eq!(lex("in"), vec![Token::In]);
1774        assert_eq!(lex("do"), vec![Token::Do]);
1775        assert_eq!(lex("done"), vec![Token::Done]);
1776        assert_eq!(lex("case"), vec![Token::Case]);
1777        assert_eq!(lex("esac"), vec![Token::Esac]);
1778        assert_eq!(lex("function"), vec![Token::Function]);
1779        assert_eq!(lex("true"), vec![Token::True]);
1780        assert_eq!(lex("false"), vec![Token::False]);
1781    }
1782
1783    #[test]
1784    fn double_semicolon() {
1785        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
1786        // In case pattern context
1787        assert_eq!(lex("echo \"hi\";;"), vec![
1788            Token::Ident("echo".to_string()),
1789            Token::String("hi".to_string()),
1790            Token::DoubleSemi,
1791        ]);
1792    }
1793
1794    #[test]
1795    fn type_keywords() {
1796        assert_eq!(lex("string"), vec![Token::TypeString]);
1797        assert_eq!(lex("int"), vec![Token::TypeInt]);
1798        assert_eq!(lex("float"), vec![Token::TypeFloat]);
1799        assert_eq!(lex("bool"), vec![Token::TypeBool]);
1800    }
1801
1802    // ═══════════════════════════════════════════════════════════════════
1803    // Operator tests
1804    // ═══════════════════════════════════════════════════════════════════
1805
1806    #[test]
1807    fn single_char_operators() {
1808        assert_eq!(lex("="), vec![Token::Eq]);
1809        assert_eq!(lex("|"), vec![Token::Pipe]);
1810        assert_eq!(lex("&"), vec![Token::Amp]);
1811        assert_eq!(lex(">"), vec![Token::Gt]);
1812        assert_eq!(lex("<"), vec![Token::Lt]);
1813        assert_eq!(lex(";"), vec![Token::Semi]);
1814        assert_eq!(lex(":"), vec![Token::Colon]);
1815        assert_eq!(lex(","), vec![Token::Comma]);
1816        assert_eq!(lex("."), vec![Token::Dot]);
1817    }
1818
1819    #[test]
1820    fn multi_char_operators() {
1821        assert_eq!(lex("&&"), vec![Token::And]);
1822        assert_eq!(lex("||"), vec![Token::Or]);
1823        assert_eq!(lex("=="), vec![Token::EqEq]);
1824        assert_eq!(lex("!="), vec![Token::NotEq]);
1825        assert_eq!(lex("=~"), vec![Token::Match]);
1826        assert_eq!(lex("!~"), vec![Token::NotMatch]);
1827        assert_eq!(lex(">="), vec![Token::GtEq]);
1828        assert_eq!(lex("<="), vec![Token::LtEq]);
1829        assert_eq!(lex(">>"), vec![Token::GtGt]);
1830        assert_eq!(lex("2>"), vec![Token::Stderr]);
1831        assert_eq!(lex("&>"), vec![Token::Both]);
1832    }
1833
1834    #[test]
1835    fn brackets() {
1836        assert_eq!(lex("{"), vec![Token::LBrace]);
1837        assert_eq!(lex("}"), vec![Token::RBrace]);
1838        assert_eq!(lex("["), vec![Token::LBracket]);
1839        assert_eq!(lex("]"), vec![Token::RBracket]);
1840        assert_eq!(lex("("), vec![Token::LParen]);
1841        assert_eq!(lex(")"), vec![Token::RParen]);
1842    }
1843
1844    // ═══════════════════════════════════════════════════════════════════
1845    // Literal tests
1846    // ═══════════════════════════════════════════════════════════════════
1847
1848    #[test]
1849    fn integers() {
1850        assert_eq!(lex("0"), vec![Token::Int(0)]);
1851        assert_eq!(lex("42"), vec![Token::Int(42)]);
1852        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
1853        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
1854    }
1855
1856    #[test]
1857    fn floats() {
1858        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
1859        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
1860        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
1861    }
1862
1863    #[test]
1864    fn strings() {
1865        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
1866        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
1867        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
1868        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
1869        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
1870    }
1871
1872    #[test]
1873    fn var_refs() {
1874        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
1875        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
1876        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
1877        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
1878        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
1879    }
1880
1881    // ═══════════════════════════════════════════════════════════════════
1882    // Identifier tests
1883    // ═══════════════════════════════════════════════════════════════════
1884
1885    #[test]
1886    fn identifiers() {
1887        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
1888        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
1889        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
1890        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
1891        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
1892    }
1893
1894    #[test]
1895    fn keyword_prefix_identifiers() {
1896        // Identifiers that start with keywords but aren't keywords
1897        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
1898        assert_eq!(lex("kaish-tools"), vec![Token::Ident("kaish-tools".to_string())]);
1899        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
1900        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
1901        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
1902    }
1903
1904    // ═══════════════════════════════════════════════════════════════════
1905    // Statement tests
1906    // ═══════════════════════════════════════════════════════════════════
1907
1908    #[test]
1909    fn assignment() {
1910        assert_eq!(
1911            lex("set X = 5"),
1912            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
1913        );
1914    }
1915
1916    #[test]
1917    fn command_simple() {
1918        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
1919        assert_eq!(
1920            lex(r#"echo "hello""#),
1921            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
1922        );
1923    }
1924
1925    #[test]
1926    fn command_with_args() {
1927        assert_eq!(
1928            lex("cmd arg1 arg2"),
1929            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
1930        );
1931    }
1932
1933    #[test]
1934    fn command_with_named_args() {
1935        assert_eq!(
1936            lex("cmd key=value"),
1937            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
1938        );
1939    }
1940
1941    #[test]
1942    fn pipeline() {
1943        assert_eq!(
1944            lex("a | b | c"),
1945            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
1946        );
1947    }
1948
1949    #[test]
1950    fn if_statement() {
1951        assert_eq!(
1952            lex("if true; then echo; fi"),
1953            vec![
1954                Token::If,
1955                Token::True,
1956                Token::Semi,
1957                Token::Then,
1958                Token::Ident("echo".to_string()),
1959                Token::Semi,
1960                Token::Fi
1961            ]
1962        );
1963    }
1964
1965    #[test]
1966    fn for_loop() {
1967        assert_eq!(
1968            lex("for X in items; do echo; done"),
1969            vec![
1970                Token::For,
1971                Token::Ident("X".to_string()),
1972                Token::In,
1973                Token::Ident("items".to_string()),
1974                Token::Semi,
1975                Token::Do,
1976                Token::Ident("echo".to_string()),
1977                Token::Semi,
1978                Token::Done
1979            ]
1980        );
1981    }
1982
1983    // ═══════════════════════════════════════════════════════════════════
1984    // Whitespace and newlines
1985    // ═══════════════════════════════════════════════════════════════════
1986
1987    #[test]
1988    fn whitespace_ignored() {
1989        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
1990    }
1991
1992    #[test]
1993    fn newlines_preserved() {
1994        let tokens = lex("a\nb");
1995        assert_eq!(
1996            tokens,
1997            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1998        );
1999    }
2000
2001    #[test]
2002    fn multiple_newlines() {
2003        let tokens = lex("a\n\n\nb");
2004        assert_eq!(
2005            tokens,
2006            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
2007        );
2008    }
2009
2010    // ═══════════════════════════════════════════════════════════════════
2011    // Comments
2012    // ═══════════════════════════════════════════════════════════════════
2013
2014    #[test]
2015    fn comments_skipped() {
2016        assert_eq!(lex("# comment"), vec![]);
2017        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
2018        assert_eq!(
2019            lex("a # comment\nb"),
2020            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
2021        );
2022    }
2023
2024    #[test]
2025    fn comments_preserved_when_requested() {
2026        let tokens = tokenize_with_comments("a # comment")
2027            .expect("should succeed")
2028            .into_iter()
2029            .map(|s| s.token)
2030            .collect::<Vec<_>>();
2031        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
2032    }
2033
2034    // ═══════════════════════════════════════════════════════════════════
2035    // String parsing
2036    // ═══════════════════════════════════════════════════════════════════
2037
2038    #[test]
2039    fn parse_simple_string() {
2040        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
2041    }
2042
2043    #[test]
2044    fn parse_string_with_escapes() {
2045        assert_eq!(
2046            parse_string_literal(r#""hello\nworld""#).expect("ok"),
2047            "hello\nworld"
2048        );
2049        assert_eq!(
2050            parse_string_literal(r#""tab\there""#).expect("ok"),
2051            "tab\there"
2052        );
2053        assert_eq!(
2054            parse_string_literal(r#""quote\"here""#).expect("ok"),
2055            "quote\"here"
2056        );
2057    }
2058
2059    #[test]
2060    fn parse_string_with_unicode() {
2061        assert_eq!(
2062            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
2063            "emoji ❤"
2064        );
2065    }
2066
2067    #[test]
2068    fn parse_string_with_escaped_dollar() {
2069        // \$ produces a marker that parse_interpolated_string will convert to $
2070        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
2071        assert_eq!(
2072            parse_string_literal(r#""\$VAR""#).expect("ok"),
2073            "__KAISH_ESCAPED_DOLLAR__VAR"
2074        );
2075        assert_eq!(
2076            parse_string_literal(r#""cost: \$100""#).expect("ok"),
2077            "cost: __KAISH_ESCAPED_DOLLAR__100"
2078        );
2079    }
2080
2081    // ═══════════════════════════════════════════════════════════════════
2082    // Variable reference parsing
2083    // ═══════════════════════════════════════════════════════════════════
2084
2085    #[test]
2086    fn parse_simple_var() {
2087        assert_eq!(
2088            parse_var_ref("${X}").expect("ok"),
2089            vec!["X"]
2090        );
2091    }
2092
2093    #[test]
2094    fn parse_var_with_field() {
2095        assert_eq!(
2096            parse_var_ref("${VAR.field}").expect("ok"),
2097            vec!["VAR", "field"]
2098        );
2099    }
2100
2101    #[test]
2102    fn parse_var_with_index() {
2103        assert_eq!(
2104            parse_var_ref("${VAR[0]}").expect("ok"),
2105            vec!["VAR", "[0]"]
2106        );
2107    }
2108
2109    #[test]
2110    fn parse_var_nested() {
2111        assert_eq!(
2112            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
2113            vec!["VAR", "field", "[0]", "nested"]
2114        );
2115    }
2116
2117    #[test]
2118    fn parse_last_result() {
2119        assert_eq!(
2120            parse_var_ref("${?}").expect("ok"),
2121            vec!["?"]
2122        );
2123        assert_eq!(
2124            parse_var_ref("${?.ok}").expect("ok"),
2125            vec!["?", "ok"]
2126        );
2127    }
2128
2129    // ═══════════════════════════════════════════════════════════════════
2130    // Number parsing
2131    // ═══════════════════════════════════════════════════════════════════
2132
2133    #[test]
2134    fn parse_integers() {
2135        assert_eq!(parse_int("0").expect("ok"), 0);
2136        assert_eq!(parse_int("42").expect("ok"), 42);
2137        assert_eq!(parse_int("-1").expect("ok"), -1);
2138    }
2139
2140    #[test]
2141    fn parse_floats() {
2142        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
2143        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
2144    }
2145
2146    // ═══════════════════════════════════════════════════════════════════
2147    // Edge cases and errors
2148    // ═══════════════════════════════════════════════════════════════════
2149
2150    #[test]
2151    fn empty_input() {
2152        assert_eq!(lex(""), vec![]);
2153    }
2154
2155    #[test]
2156    fn only_whitespace() {
2157        assert_eq!(lex("   \t\t   "), vec![]);
2158    }
2159
2160    #[test]
2161    fn json_array() {
2162        assert_eq!(
2163            lex(r#"[1, 2, 3]"#),
2164            vec![
2165                Token::LBracket,
2166                Token::Int(1),
2167                Token::Comma,
2168                Token::Int(2),
2169                Token::Comma,
2170                Token::Int(3),
2171                Token::RBracket
2172            ]
2173        );
2174    }
2175
2176    #[test]
2177    fn json_object() {
2178        assert_eq!(
2179            lex(r#"{"key": "value"}"#),
2180            vec![
2181                Token::LBrace,
2182                Token::String("key".to_string()),
2183                Token::Colon,
2184                Token::String("value".to_string()),
2185                Token::RBrace
2186            ]
2187        );
2188    }
2189
2190    #[test]
2191    fn redirect_operators() {
2192        assert_eq!(
2193            lex("cmd > file"),
2194            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
2195        );
2196        assert_eq!(
2197            lex("cmd >> file"),
2198            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
2199        );
2200        assert_eq!(
2201            lex("cmd 2> err"),
2202            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
2203        );
2204        assert_eq!(
2205            lex("cmd &> all"),
2206            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
2207        );
2208    }
2209
2210    #[test]
2211    fn background_job() {
2212        assert_eq!(
2213            lex("cmd &"),
2214            vec![Token::Ident("cmd".to_string()), Token::Amp]
2215        );
2216    }
2217
2218    #[test]
2219    fn command_substitution() {
2220        assert_eq!(
2221            lex("$(cmd)"),
2222            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
2223        );
2224        assert_eq!(
2225            lex("$(cmd arg)"),
2226            vec![
2227                Token::CmdSubstStart,
2228                Token::Ident("cmd".to_string()),
2229                Token::Ident("arg".to_string()),
2230                Token::RParen
2231            ]
2232        );
2233        assert_eq!(
2234            lex("$(a | b)"),
2235            vec![
2236                Token::CmdSubstStart,
2237                Token::Ident("a".to_string()),
2238                Token::Pipe,
2239                Token::Ident("b".to_string()),
2240                Token::RParen
2241            ]
2242        );
2243    }
2244
2245    #[test]
2246    fn complex_pipeline() {
2247        assert_eq!(
2248            lex(r#"cat file | grep pattern="foo" | head count=10"#),
2249            vec![
2250                Token::Ident("cat".to_string()),
2251                Token::Ident("file".to_string()),
2252                Token::Pipe,
2253                Token::Ident("grep".to_string()),
2254                Token::Ident("pattern".to_string()),
2255                Token::Eq,
2256                Token::String("foo".to_string()),
2257                Token::Pipe,
2258                Token::Ident("head".to_string()),
2259                Token::Ident("count".to_string()),
2260                Token::Eq,
2261                Token::Int(10),
2262            ]
2263        );
2264    }
2265
2266    // ═══════════════════════════════════════════════════════════════════
2267    // Flag tests
2268    // ═══════════════════════════════════════════════════════════════════
2269
2270    #[test]
2271    fn short_flag() {
2272        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2273        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
2274        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
2275    }
2276
2277    #[test]
2278    fn short_flag_combined() {
2279        // Combined short flags like -la
2280        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
2281        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
2282    }
2283
2284    #[test]
2285    fn long_flag() {
2286        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
2287        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
2288        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
2289    }
2290
2291    #[test]
2292    fn double_dash() {
2293        // -- alone marks end of flags
2294        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2295    }
2296
    #[test]
    fn flags_vs_negative_numbers() {
        // -123 should be a negative integer, not a flag
        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
        // -l should be a flag
        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
        // The short-flag regex -[a-zA-Z] can't match "-1a" (1 isn't a letter),
        // so there is no flag/number ambiguity there. Here we check the spaced
        // form "-1 a": a negative integer followed by an identifier.
        assert_eq!(
            lex("-1 a"),
            vec![Token::Int(-1), Token::Ident("a".to_string())]
        );
    }
2310
2311    #[test]
2312    fn command_with_flags() {
2313        assert_eq!(
2314            lex("ls -l"),
2315            vec![
2316                Token::Ident("ls".to_string()),
2317                Token::ShortFlag("l".to_string()),
2318            ]
2319        );
2320        assert_eq!(
2321            lex("git commit -m"),
2322            vec![
2323                Token::Ident("git".to_string()),
2324                Token::Ident("commit".to_string()),
2325                Token::ShortFlag("m".to_string()),
2326            ]
2327        );
2328        assert_eq!(
2329            lex("git push --force"),
2330            vec![
2331                Token::Ident("git".to_string()),
2332                Token::Ident("push".to_string()),
2333                Token::LongFlag("force".to_string()),
2334            ]
2335        );
2336    }
2337
2338    #[test]
2339    fn flag_with_value() {
2340        assert_eq!(
2341            lex(r#"git commit -m "message""#),
2342            vec![
2343                Token::Ident("git".to_string()),
2344                Token::Ident("commit".to_string()),
2345                Token::ShortFlag("m".to_string()),
2346                Token::String("message".to_string()),
2347            ]
2348        );
2349        assert_eq!(
2350            lex(r#"--message="hello""#),
2351            vec![
2352                Token::LongFlag("message".to_string()),
2353                Token::Eq,
2354                Token::String("hello".to_string()),
2355            ]
2356        );
2357    }
2358
2359    #[test]
2360    fn end_of_flags_marker() {
2361        assert_eq!(
2362            lex("git checkout -- file"),
2363            vec![
2364                Token::Ident("git".to_string()),
2365                Token::Ident("checkout".to_string()),
2366                Token::DoubleDash,
2367                Token::Ident("file".to_string()),
2368            ]
2369        );
2370    }
2371
2372    // ═══════════════════════════════════════════════════════════════════
2373    // Bash compatibility tokens
2374    // ═══════════════════════════════════════════════════════════════════
2375
2376    #[test]
2377    fn local_keyword() {
2378        assert_eq!(lex("local"), vec![Token::Local]);
2379        assert_eq!(
2380            lex("local X = 5"),
2381            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2382        );
2383    }
2384
2385    #[test]
2386    fn simple_var_ref() {
2387        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2388        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2389        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2390        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2391    }
2392
2393    #[test]
2394    fn simple_var_ref_in_command() {
2395        assert_eq!(
2396            lex("echo $NAME"),
2397            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2398        );
2399    }
2400
2401    #[test]
2402    fn single_quoted_strings() {
2403        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2404        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2405        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2406        // Single quotes don't process escapes or variables
2407        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2408        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2409    }
2410
2411    #[test]
2412    fn test_brackets() {
2413        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2414        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2415        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2416        assert_eq!(
2417            lex("[[ -f file ]]"),
2418            vec![
2419                Token::LBracket,
2420                Token::LBracket,
2421                Token::ShortFlag("f".to_string()),
2422                Token::Ident("file".to_string()),
2423                Token::RBracket,
2424                Token::RBracket
2425            ]
2426        );
2427    }
2428
2429    #[test]
2430    fn test_expression_syntax() {
2431        assert_eq!(
2432            lex(r#"[[ $X == "value" ]]"#),
2433            vec![
2434                Token::LBracket,
2435                Token::LBracket,
2436                Token::SimpleVarRef("X".to_string()),
2437                Token::EqEq,
2438                Token::String("value".to_string()),
2439                Token::RBracket,
2440                Token::RBracket
2441            ]
2442        );
2443    }
2444
2445    #[test]
2446    fn bash_style_assignment() {
2447        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2448        assert_eq!(
2449            lex(r#"NAME="value""#),
2450            vec![
2451                Token::Ident("NAME".to_string()),
2452                Token::Eq,
2453                Token::String("value".to_string())
2454            ]
2455        );
2456    }
2457
2458    #[test]
2459    fn positional_params() {
2460        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2461        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2462        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2463        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2464        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2465    }
2466
2467    #[test]
2468    fn positional_in_context() {
2469        assert_eq!(
2470            lex("echo $1 $2"),
2471            vec![
2472                Token::Ident("echo".to_string()),
2473                Token::Positional(1),
2474                Token::Positional(2),
2475            ]
2476        );
2477    }
2478
2479    #[test]
2480    fn var_length() {
2481        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2482        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2483        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2484    }
2485
2486    #[test]
2487    fn var_length_in_context() {
2488        assert_eq!(
2489            lex("echo ${#NAME}"),
2490            vec![
2491                Token::Ident("echo".to_string()),
2492                Token::VarLength("NAME".to_string()),
2493            ]
2494        );
2495    }
2496
2497    // ═══════════════════════════════════════════════════════════════════
2498    // Edge case tests: Flag ambiguities
2499    // ═══════════════════════════════════════════════════════════════════
2500
2501    #[test]
2502    fn plus_flag() {
2503        // Plus flags for set +e
2504        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2505        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2506        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2507    }
2508
2509    #[test]
2510    fn set_with_plus_flag() {
2511        assert_eq!(
2512            lex("set +e"),
2513            vec![
2514                Token::Set,
2515                Token::PlusFlag("e".to_string()),
2516            ]
2517        );
2518    }
2519
2520    #[test]
2521    fn set_with_multiple_flags() {
2522        assert_eq!(
2523            lex("set -e -u"),
2524            vec![
2525                Token::Set,
2526                Token::ShortFlag("e".to_string()),
2527                Token::ShortFlag("u".to_string()),
2528            ]
2529        );
2530    }
2531
2532    #[test]
2533    fn flags_vs_negative_numbers_edge_cases() {
2534        // -1a should be negative int followed by ident
2535        assert_eq!(
2536            lex("-1 a"),
2537            vec![Token::Int(-1), Token::Ident("a".to_string())]
2538        );
2539        // -l is a flag
2540        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2541        // -123 is negative number
2542        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2543    }
2544
2545    #[test]
2546    fn single_dash_is_minus_alone() {
2547        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2548        let result = tokenize("-").expect("should lex");
2549        assert_eq!(result.len(), 1);
2550        assert!(matches!(result[0].token, Token::MinusAlone));
2551    }
2552
2553    #[test]
2554    fn plus_bare_for_date_format() {
2555        // `date +%s` - the +%s should be PlusBare
2556        let result = tokenize("+%s").expect("should lex");
2557        assert_eq!(result.len(), 1);
2558        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2559
2560        // `date +%Y-%m-%d` - format string with dashes
2561        let result = tokenize("+%Y-%m-%d").expect("should lex");
2562        assert_eq!(result.len(), 1);
2563        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2564    }
2565
2566    #[test]
2567    fn plus_flag_still_works() {
2568        // `set +e` - should still be PlusFlag
2569        let result = tokenize("+e").expect("should lex");
2570        assert_eq!(result.len(), 1);
2571        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2572    }
2573
2574    #[test]
2575    fn while_keyword_vs_while_loop() {
2576        // 'while' as keyword in loop context
2577        assert_eq!(lex("while"), vec![Token::While]);
2578        // 'while' at start followed by condition
2579        assert_eq!(
2580            lex("while true"),
2581            vec![Token::While, Token::True]
2582        );
2583    }
2584
2585    #[test]
2586    fn control_flow_keywords() {
2587        assert_eq!(lex("break"), vec![Token::Break]);
2588        assert_eq!(lex("continue"), vec![Token::Continue]);
2589        assert_eq!(lex("return"), vec![Token::Return]);
2590        assert_eq!(lex("exit"), vec![Token::Exit]);
2591    }
2592
2593    #[test]
2594    fn control_flow_with_numbers() {
2595        assert_eq!(
2596            lex("break 2"),
2597            vec![Token::Break, Token::Int(2)]
2598        );
2599        assert_eq!(
2600            lex("continue 3"),
2601            vec![Token::Continue, Token::Int(3)]
2602        );
2603        assert_eq!(
2604            lex("exit 1"),
2605            vec![Token::Exit, Token::Int(1)]
2606        );
2607    }
2608
2609    // ═══════════════════════════════════════════════════════════════════
2610    // Here-doc tests
2611    // ═══════════════════════════════════════════════════════════════════
2612
2613    #[test]
2614    fn heredoc_simple() {
2615        let source = "cat <<EOF\nhello\nworld\nEOF";
2616        let tokens = lex(source);
2617        assert_eq!(tokens, vec![
2618            Token::Ident("cat".to_string()),
2619            Token::HereDocStart,
2620            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2621            Token::Newline,
2622        ]);
2623    }
2624
2625    #[test]
2626    fn heredoc_empty() {
2627        let source = "cat <<EOF\nEOF";
2628        let tokens = lex(source);
2629        assert_eq!(tokens, vec![
2630            Token::Ident("cat".to_string()),
2631            Token::HereDocStart,
2632            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2633            Token::Newline,
2634        ]);
2635    }
2636
2637    #[test]
2638    fn heredoc_with_special_chars() {
2639        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2640        let tokens = lex(source);
2641        assert_eq!(tokens, vec![
2642            Token::Ident("cat".to_string()),
2643            Token::HereDocStart,
2644            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2645            Token::Newline,
2646        ]);
2647    }
2648
2649    #[test]
2650    fn heredoc_multiline() {
2651        let source = "cat <<END\nline1\nline2\nline3\nEND";
2652        let tokens = lex(source);
2653        assert_eq!(tokens, vec![
2654            Token::Ident("cat".to_string()),
2655            Token::HereDocStart,
2656            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2657            Token::Newline,
2658        ]);
2659    }
2660
2661    #[test]
2662    fn heredoc_in_command() {
2663        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2664        let tokens = lex(source);
2665        assert_eq!(tokens, vec![
2666            Token::Ident("cat".to_string()),
2667            Token::HereDocStart,
2668            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2669            Token::Newline,
2670            Token::Ident("echo".to_string()),
2671            Token::Ident("goodbye".to_string()),
2672        ]);
2673    }
2674
2675    #[test]
2676    fn heredoc_strip_tabs() {
2677        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2678        let tokens = lex(source);
2679        // Content has tabs preserved, only delimiter matching strips tabs
2680        assert_eq!(tokens, vec![
2681            Token::Ident("cat".to_string()),
2682            Token::HereDocStart,
2683            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2684            Token::Newline,
2685        ]);
2686    }
2687
2688    // ═══════════════════════════════════════════════════════════════════
2689    // Arithmetic expression tests
2690    // ═══════════════════════════════════════════════════════════════════
2691
2692    #[test]
2693    fn arithmetic_simple() {
2694        let source = "$((1 + 2))";
2695        let tokens = lex(source);
2696        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2697    }
2698
2699    #[test]
2700    fn arithmetic_in_assignment() {
2701        let source = "X=$((5 * 3))";
2702        let tokens = lex(source);
2703        assert_eq!(tokens, vec![
2704            Token::Ident("X".to_string()),
2705            Token::Eq,
2706            Token::Arithmetic("5 * 3".to_string()),
2707        ]);
2708    }
2709
2710    #[test]
2711    fn arithmetic_with_nested_parens() {
2712        let source = "$((2 * (3 + 4)))";
2713        let tokens = lex(source);
2714        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2715    }
2716
2717    #[test]
2718    fn arithmetic_with_variable() {
2719        let source = "$((X + 1))";
2720        let tokens = lex(source);
2721        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2722    }
2723
2724    #[test]
2725    fn arithmetic_command_subst_not_confused() {
2726        // $( should not be treated as arithmetic
2727        let source = "$(echo hello)";
2728        let tokens = lex(source);
2729        assert_eq!(tokens, vec![
2730            Token::CmdSubstStart,
2731            Token::Ident("echo".to_string()),
2732            Token::Ident("hello".to_string()),
2733            Token::RParen,
2734        ]);
2735    }
2736
2737    #[test]
2738    fn arithmetic_nesting_limit() {
2739        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
2740        let open_parens = "(".repeat(300);
2741        let close_parens = ")".repeat(300);
2742        let source = format!("$(({}1{}))", open_parens, close_parens);
2743        let result = tokenize(&source);
2744        assert!(result.is_err());
2745        let errors = result.unwrap_err();
2746        assert_eq!(errors.len(), 1);
2747        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2748    }
2749
2750    #[test]
2751    fn arithmetic_nesting_within_limit() {
2752        // Nesting within limit should work
2753        let source = "$((((1 + 2) * 3)))";
2754        let tokens = lex(source);
2755        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2756    }
2757
2758    // ═══════════════════════════════════════════════════════════════════
2759    // Token category tests
2760    // ═══════════════════════════════════════════════════════════════════
2761
2762    #[test]
2763    fn token_categories() {
2764        // Keywords
2765        assert_eq!(Token::If.category(), TokenCategory::Keyword);
2766        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2767        assert_eq!(Token::For.category(), TokenCategory::Keyword);
2768        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2769        assert_eq!(Token::True.category(), TokenCategory::Keyword);
2770        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2771
2772        // Operators
2773        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2774        assert_eq!(Token::And.category(), TokenCategory::Operator);
2775        assert_eq!(Token::Or.category(), TokenCategory::Operator);
2776        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2777        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2778
2779        // Strings
2780        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2781        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2782        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2783
2784        // Numbers
2785        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2786        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2787        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2788
2789        // Variables
2790        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2791        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2792        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2793        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2794        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2795        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2796        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2797
2798        // Flags
2799        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2800        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2801        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2802        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2803
2804        // Punctuation
2805        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2806        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2807        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2808        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2809
2810        // Comments
2811        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2812
2813        // Paths
2814        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2815
2816        // Commands
2817        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2818
2819        // Errors
2820        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2821        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2822        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2823    }
2824
2825    #[test]
2826    fn test_heredoc_piped_to_command() {
2827        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
2828        // Not: cat | jq <<heredoc
2829        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2830        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2831        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2832        assert!(heredoc_pos.is_some(), "should have a heredoc token");
2833        assert!(pipe_pos.is_some(), "should have a pipe token");
2834        assert!(
2835            pipe_pos.unwrap() > heredoc_pos.unwrap(),
2836            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2837            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2838        );
2839    }
2840
2841    #[test]
2842    fn test_heredoc_standalone_still_works() {
2843        // Regression: standalone heredoc (no pipe) must still work
2844        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2845        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2846        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2847    }
2848
2849    #[test]
2850    fn test_heredoc_preserves_leading_empty_lines() {
2851        // Bug B: heredoc starting with a blank line must preserve it
2852        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2853        let heredoc = tokens.iter().find_map(|t| {
2854            if let Token::HereDoc(data) = &t.token {
2855                Some(data.clone())
2856            } else {
2857                None
2858            }
2859        });
2860        assert!(heredoc.is_some(), "should have a heredoc token");
2861        let data = heredoc.unwrap();
2862        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2863        assert_eq!(data.content, "\nhello");
2864    }
2865
2866    #[test]
2867    fn test_heredoc_quoted_delimiter_sets_literal() {
2868        // Bug N: quoted delimiter (<<'EOF') should set literal=true
2869        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2870        let heredoc = tokens.iter().find_map(|t| {
2871            if let Token::HereDoc(data) = &t.token {
2872                Some(data.clone())
2873            } else {
2874                None
2875            }
2876        });
2877        assert!(heredoc.is_some(), "should have a heredoc token");
2878        let data = heredoc.unwrap();
2879        assert!(data.literal, "quoted delimiter should set literal=true");
2880        assert_eq!(data.content, "hello $HOME");
2881    }
2882
2883    #[test]
2884    fn test_heredoc_unquoted_delimiter_not_literal() {
2885        // Bug N: unquoted delimiter (<<EOF) should have literal=false
2886        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2887        let heredoc = tokens.iter().find_map(|t| {
2888            if let Token::HereDoc(data) = &t.token {
2889                Some(data.clone())
2890            } else {
2891                None
2892            }
2893        });
2894        assert!(heredoc.is_some(), "should have a heredoc token");
2895        let data = heredoc.unwrap();
2896        assert!(!data.literal, "unquoted delimiter should have literal=false");
2897    }
2898
2899    // ═══════════════════════════════════════════════════════════════════
2900    // Colon merge tests
2901    // ═══════════════════════════════════════════════════════════════════
2902
2903    #[test]
2904    fn colon_double_in_word() {
2905        assert_eq!(lex("foo::bar"), vec![Token::Ident("foo::bar".into())]);
2906    }
2907
2908    #[test]
2909    fn colon_single_in_word() {
2910        assert_eq!(lex("a:b:c"), vec![Token::Ident("a:b:c".into())]);
2911    }
2912
2913    #[test]
2914    fn colon_with_port() {
2915        assert_eq!(lex("host:8080"), vec![Token::Ident("host:8080".into())]);
2916    }
2917
2918    #[test]
2919    fn colon_standalone() {
2920        assert_eq!(lex(":"), vec![Token::Colon]);
2921    }
2922
2923    #[test]
2924    fn colon_spaced_no_merge() {
2925        assert_eq!(
2926            lex("foo : bar"),
2927            vec![
2928                Token::Ident("foo".into()),
2929                Token::Colon,
2930                Token::Ident("bar".into()),
2931            ]
2932        );
2933    }
2934
2935    #[test]
2936    fn colon_in_command_arg() {
2937        assert_eq!(
2938            lex("echo foo::bar"),
2939            vec![
2940                Token::Ident("echo".into()),
2941                Token::Ident("foo::bar".into()),
2942            ]
2943        );
2944    }
2945
2946    #[test]
2947    fn colon_trailing() {
2948        // Trailing colon merges with preceding ident
2949        assert_eq!(lex("foo:"), vec![Token::Ident("foo:".into())]);
2950    }
2951
2952    #[test]
2953    fn colon_leading() {
2954        // Leading colon merges with following ident
2955        assert_eq!(lex(":foo"), vec![Token::Ident(":foo".into())]);
2956    }
2957
    #[test]
    fn colon_with_path() {
        // A path followed by ':' and a number merges into a single Ident word
        // (e.g. a path:port spec), rather than lexing as Path + Colon + Int.
        assert_eq!(
            lex("/usr/bin:8080"),
            vec![Token::Ident("/usr/bin:8080".into())]
        );
    }
2966}