// kaish_kernel/lexer.rs
1//! Lexer for kaish source code.
2//!
3//! Converts source text into a stream of tokens using the logos lexer generator.
4//! The lexer is designed to be unambiguous: every valid input produces exactly
5//! one token sequence, and invalid input produces clear errors.
6//!
7//! # Token Categories
8//!
9//! - **Keywords**: `set`, `tool`, `if`, `then`, `else`, `fi`, `for`, `in`, `do`, `done`
10//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
11//! - **Operators**: `=`, `|`, `&`, `>`, `>>`, `<`, `2>`, `&>`, `&&`, `||`
12//! - **Punctuation**: `;`, `:`, `,`, `.`, `{`, `}`, `[`, `]`
13//! - **Variable references**: `${...}` with nested path access
14//! - **Identifiers**: command names, variable names, parameter names
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
///
/// Combined with a nanosecond timestamp and the process ID in
/// `unique_marker_id`, so concurrent tokenize calls never collide.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
///
/// Positions/lengths are offsets as produced by `String::len` (byte counts).
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    let pid = std::process::id();
82    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token with its span in the source text.
///
/// `span` is a half-open range (`start..end`) into the source string, in the
/// coordinate system produced by the lexer (corrected back to original-source
/// offsets by `correct_span` when preprocessing rewrote the text).
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    /// The token (or error) carried by this span.
    pub token: T,
    /// Half-open range into the source text.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs a token with the source span it was lexed from.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Lexer error types.
///
/// `UnexpectedCharacter` is the `#[default]` variant because logos requires a
/// default error value for input that matches no token pattern.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// Input matched no token pattern (logos' fallback error).
    #[default]
    UnexpectedCharacter,
    /// A string literal was opened but never closed.
    UnterminatedString,
    /// A `${...}` variable reference was opened but never closed.
    UnterminatedVarRef,
    /// An unrecognized escape sequence in a double-quoted string.
    InvalidEscape,
    /// A numeric literal that failed to parse.
    InvalidNumber,
    /// Mixed-case boolean like `TRUE`/`True`; only lowercase is accepted.
    AmbiguousBoolean(String),
    /// `yes`/`no` (any casing), which read like booleans but are not.
    AmbiguousBooleanLike(String),
    /// An identifier starting with digits, like `123abc`.
    InvalidNumberIdent(String),
    /// A float without a leading digit, like `.5`.
    InvalidFloatNoLeading,
    /// A float without a trailing digit, like `5.`.
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118        match self {
119            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120            LexerError::UnterminatedString => write!(f, "unterminated string"),
121            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123            LexerError::InvalidNumber => write!(f, "invalid number"),
124            LexerError::AmbiguousBoolean(s) => {
125                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126            }
127            LexerError::AmbiguousBooleanLike(s) => {
128                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130            }
131            LexerError::InvalidNumberIdent(s) => {
132                write!(f, "identifier cannot start with digit: {}", s)
133            }
134            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137        }
138    }
139}
140
141/// Tokens produced by the kaish lexer.
142///
143/// The order of variants matters for logos priority. More specific patterns
144/// (like keywords) should come before more general ones (like identifiers).
145///
146/// Tokens that carry semantic values (strings, numbers, identifiers) include
147/// the parsed value directly. This ensures the parser has access to actual
148/// data, not just token types.
149#[derive(Logos, Debug, Clone, PartialEq)]
150#[logos(error = LexerError)]
151#[logos(skip r"[ \t]+")]
152pub enum Token {
153    // ═══════════════════════════════════════════════════════════════════
154    // Keywords (must come before Ident for priority)
155    // ═══════════════════════════════════════════════════════════════════
156    #[token("set")]
157    Set,
158
159    #[token("local")]
160    Local,
161
162    #[token("if")]
163    If,
164
165    #[token("then")]
166    Then,
167
168    #[token("else")]
169    Else,
170
171    #[token("elif")]
172    Elif,
173
174    #[token("fi")]
175    Fi,
176
177    #[token("for")]
178    For,
179
180    #[token("while")]
181    While,
182
183    #[token("in")]
184    In,
185
186    #[token("do")]
187    Do,
188
189    #[token("done")]
190    Done,
191
192    #[token("case")]
193    Case,
194
195    #[token("esac")]
196    Esac,
197
198    #[token("function")]
199    Function,
200
201    #[token("break")]
202    Break,
203
204    #[token("continue")]
205    Continue,
206
207    #[token("return")]
208    Return,
209
210    #[token("exit")]
211    Exit,
212
213    #[token("true")]
214    True,
215
216    #[token("false")]
217    False,
218
219    // ═══════════════════════════════════════════════════════════════════
220    // Type keywords (for tool parameters)
221    // ═══════════════════════════════════════════════════════════════════
222    #[token("string")]
223    TypeString,
224
225    #[token("int")]
226    TypeInt,
227
228    #[token("float")]
229    TypeFloat,
230
231    #[token("bool")]
232    TypeBool,
233
234    // ═══════════════════════════════════════════════════════════════════
235    // Multi-character operators (must come before single-char versions)
236    // ═══════════════════════════════════════════════════════════════════
237    #[token("&&")]
238    And,
239
240    #[token("||")]
241    Or,
242
243    #[token("==")]
244    EqEq,
245
246    #[token("!=")]
247    NotEq,
248
249    #[token("=~")]
250    Match,
251
252    #[token("!~")]
253    NotMatch,
254
255    #[token(">=")]
256    GtEq,
257
258    #[token("<=")]
259    LtEq,
260
261    #[token(">>")]
262    GtGt,
263
264    #[token("2>&1")]
265    StderrToStdout,
266
267    #[token("1>&2")]
268    StdoutToStderr,
269
270    #[token(">&2")]
271    StdoutToStderr2,
272
273    #[token("2>")]
274    Stderr,
275
276    #[token("&>")]
277    Both,
278
279    #[token("<<")]
280    HereDocStart,
281
282    #[token(";;")]
283    DoubleSemi,
284
285    // ═══════════════════════════════════════════════════════════════════
286    // Single-character operators and punctuation
287    // ═══════════════════════════════════════════════════════════════════
288    #[token("=")]
289    Eq,
290
291    #[token("|")]
292    Pipe,
293
294    #[token("&")]
295    Amp,
296
297    #[token(">")]
298    Gt,
299
300    #[token("<")]
301    Lt,
302
303    #[token(";")]
304    Semi,
305
306    #[token(":")]
307    Colon,
308
309    #[token(",")]
310    Comma,
311
312    #[token(".")]
313    Dot,
314
315    #[token("{")]
316    LBrace,
317
318    #[token("}")]
319    RBrace,
320
321    #[token("[")]
322    LBracket,
323
324    #[token("]")]
325    RBracket,
326
327    #[token("(")]
328    LParen,
329
330    #[token(")")]
331    RParen,
332
333    #[token("*")]
334    Star,
335
336    #[token("!")]
337    Bang,
338
339    #[token("?")]
340    Question,
341
342    // ═══════════════════════════════════════════════════════════════════
343    // Command substitution
344    // ═══════════════════════════════════════════════════════════════════
345
346    /// Arithmetic expression content: synthesized by preprocessing.
347    /// Contains the expression string between `$((` and `))`.
348    Arithmetic(String),
349
350    /// Command substitution start: `$(` - begins a command substitution
351    #[token("$(")]
352    CmdSubstStart,
353
354    // ═══════════════════════════════════════════════════════════════════
355    // Flags (must come before Int to win over negative numbers)
356    // ═══════════════════════════════════════════════════════════════════
357
358    /// Long flag: `--name` or `--foo-bar`
359    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
360    LongFlag(String),
361
362    /// Short flag: `-l` or `-la` (combined short flags)
363    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
364    ShortFlag(String),
365
366    /// Plus flag: `+e` or `+x` (for set +e to disable options)
367    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
368    PlusFlag(String),
369
370    /// Double dash: `--` alone marks end of flags
371    #[token("--")]
372    DoubleDash,
373
374    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
375    /// For date format strings and similar. Lower priority than PlusFlag.
376    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
377    PlusBare(String),
378
379    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
380    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
381    /// Excludes - after first - to avoid matching --name patterns.
382    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
383    MinusBare(String),
384
385    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
386    /// Only matches when followed by whitespace or end.
387    /// This is handled specially in the parser as a positional arg.
388    #[token("-")]
389    MinusAlone,
390
391    // ═══════════════════════════════════════════════════════════════════
392    // Literals (with values)
393    // ═══════════════════════════════════════════════════════════════════
394
395    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
396    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
397    String(String),
398
399    /// Single-quoted string: `'...'` - literal content, no escape processing
400    #[regex(r"'[^']*'", lex_single_string)]
401    SingleString(String),
402
403    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the raw inner content
404    #[regex(r"\$\{[^}]+\}", lex_varref)]
405    VarRef(String),
406
407    /// Simple variable reference: `$NAME` - just the identifier
408    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
409    SimpleVarRef(String),
410
411    /// Positional parameter: `$0` through `$9`
412    #[regex(r"\$[0-9]", lex_positional)]
413    Positional(usize),
414
415    /// All positional parameters: `$@`
416    #[token("$@")]
417    AllArgs,
418
419    /// Number of positional parameters: `$#`
420    #[token("$#")]
421    ArgCount,
422
423    /// Last exit code: `$?`
424    #[token("$?")]
425    LastExitCode,
426
427    /// Current shell PID: `$$`
428    #[token("$$")]
429    CurrentPid,
430
431    /// Variable string length: `${#VAR}`
432    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
433    VarLength(String),
434
435    /// Here-doc content: synthesized by preprocessing, not directly lexed.
436    /// Contains the full content of the here-doc (without the delimiter lines).
437    HereDoc(String),
438
439    /// Integer literal - value is the parsed i64
440    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
441    Int(i64),
442
443    /// Float literal - value is the parsed f64
444    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
445    Float(f64),
446
447    // ═══════════════════════════════════════════════════════════════════
448    // Invalid patterns (caught before valid tokens for better errors)
449    // ═══════════════════════════════════════════════════════════════════
450
451    /// Invalid: number followed by identifier characters (like 123abc)
452    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
453    InvalidNumberIdent,
454
455    /// Invalid: float without leading digit (like .5)
456    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
457    InvalidFloatNoLeading,
458
459    /// Invalid: float without trailing digit (like 5.)
460    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
461    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
462    InvalidFloatNoTrailing,
463
464    // ═══════════════════════════════════════════════════════════════════
465    // Paths (absolute paths starting with /)
466    // ═══════════════════════════════════════════════════════════════════
467
468    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
469    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
470    Path(String),
471
472    // ═══════════════════════════════════════════════════════════════════
473    // Identifiers (command names, variable names, etc.)
474    // ═══════════════════════════════════════════════════════════════════
475
476    /// Identifier - value is the identifier string
477    /// Allows dots for filenames like `script.kai`
478    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
479    Ident(String),
480
481    // ═══════════════════════════════════════════════════════════════════
482    // Structural tokens
483    // ═══════════════════════════════════════════════════════════════════
484
485    /// Comment: `# ...` to end of line
486    #[regex(r"#[^\n\r]*", allow_greedy = true)]
487    Comment,
488
489    /// Newline (significant in kaish - ends statements)
490    #[regex(r"\n|\r\n")]
491    Newline,
492
493    /// Line continuation: backslash at end of line
494    #[regex(r"\\[ \t]*(\n|\r\n)")]
495    LineContinuation,
496}
497
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
///
/// Note: structural tokens (newline, line continuation, `$(`) are grouped
/// under `Punctuation` by `Token::category`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; , . ( ) { } [ ]
    Punctuation,
    /// Identifiers in command position
    Command,
    /// Absolute paths: /foo/bar
    Path,
    /// Flags: --long, -s, +x
    Flag,
    /// Invalid tokens
    Error,
}
527
impl Token {
    /// Returns the semantic category for syntax highlighting.
    ///
    /// The match is deliberately exhaustive (no `_` arm) so adding a new
    /// `Token` variant forces a conscious category choice here.
    pub fn category(&self) -> TokenCategory {
        match self {
            // Keywords (includes booleans and type keywords)
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Operators and redirections
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            // Strings (all three literal forms)
            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Numbers (arithmetic expressions count as numeric content)
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            // Variables (every `$...` form)
            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            // Flags
            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            // Punctuation (structural tokens land here too)
            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            // Comments
            Token::Comment => TokenCategory::Comment,

            // Paths
            Token::Path(_) => TokenCategory::Path,

            // Commands/identifiers (and bare words)
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            // Errors (variants whose callbacks always fail; kept for typing)
            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
641
642/// Lex a double-quoted string literal, processing escape sequences.
643fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
644    parse_string_literal(lex.slice())
645}
646
647/// Lex a single-quoted string literal (no escape processing).
648fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
649    let s = lex.slice();
650    // Strip the surrounding single quotes
651    s[1..s.len() - 1].to_string()
652}
653
654/// Lex a braced variable reference, extracting the inner content.
655fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
656    // Keep the full ${...} for later parsing of path segments
657    lex.slice().to_string()
658}
659
660/// Lex a simple variable reference: `$NAME` → `NAME`
661fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
662    // Strip the leading `$`
663    lex.slice()[1..].to_string()
664}
665
666/// Lex a positional parameter: `$1` → 1
667fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
668    // Strip the leading `$` and parse the digit
669    lex.slice()[1..].parse().unwrap_or(0)
670}
671
672/// Lex a variable length: `${#VAR}` → "VAR"
673fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
674    // Strip the leading `${#` and trailing `}`
675    let s = lex.slice();
676    s[3..s.len() - 1].to_string()
677}
678
679/// Lex an integer literal.
680fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
681    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
682}
683
684/// Lex a float literal.
685fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
686    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
687}
688
689/// Lex an invalid number-identifier pattern (like 123abc).
690/// Always returns Err to produce a lexer error instead of a token.
691fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
692    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
693}
694
695/// Lex an invalid float without leading digit (like .5).
696/// Always returns Err to produce a lexer error instead of a token.
697fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
698    Err(LexerError::InvalidFloatNoLeading)
699}
700
701/// Lex an invalid float without trailing digit (like 5.).
702/// Always returns Err to produce a lexer error instead of a token.
703fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
704    Err(LexerError::InvalidFloatNoTrailing)
705}
706
707/// Lex an identifier, rejecting ambiguous boolean-like values.
708fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
709    let s = lex.slice();
710
711    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
712    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
713    match s.to_lowercase().as_str() {
714        "true" | "false" if s != "true" && s != "false" => {
715            return Err(LexerError::AmbiguousBoolean(s.to_string()));
716        }
717        _ => {}
718    }
719
720    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
721    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
722        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
723    }
724
725    Ok(s.to_string())
726}
727
728/// Lex a long flag: `--name` → `name`
729fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
730    // Strip the leading `--`
731    lex.slice()[2..].to_string()
732}
733
734/// Lex a short flag: `-l` → `l`, `-la` → `la`
735fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
736    // Strip the leading `-`
737    lex.slice()[1..].to_string()
738}
739
740/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
741fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
742    // Strip the leading `+`
743    lex.slice()[1..].to_string()
744}
745
746/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
747fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
748    lex.slice().to_string()
749}
750
751/// Lex a minus bare word: `-%` → `-%` (keep the full string)
752fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
753    lex.slice().to_string()
754}
755
756/// Lex an absolute path: `/tmp/out` → `/tmp/out`
757fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
758    lex.slice().to_string()
759}
760
/// Display for tokens: fixed tokens render as their exact source text;
/// value-carrying tokens render in an uppercase `KIND(value)` debug form.
impl fmt::Display for Token {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            // `{{`/`}}` are the write! escapes for literal braces.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            // {:?} shows the string with quotes and escapes, which keeps
            // whitespace and control characters visible in diagnostics.
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(s) => write!(f, "HEREDOC({:?})", s),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            // These variants should never be produced - their callbacks always return errors
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
857
858impl Token {
859    /// Returns true if this token is a keyword.
860    pub fn is_keyword(&self) -> bool {
861        matches!(
862            self,
863            Token::Set
864                | Token::Local
865                | Token::If
866                | Token::Then
867                | Token::Else
868                | Token::Elif
869                | Token::Fi
870                | Token::For
871                | Token::In
872                | Token::Do
873                | Token::Done
874                | Token::Case
875                | Token::Esac
876                | Token::Function
877                | Token::True
878                | Token::False
879        )
880    }
881
882    /// Returns true if this token is a type keyword.
883    pub fn is_type(&self) -> bool {
884        matches!(
885            self,
886            Token::TypeString
887                | Token::TypeInt
888                | Token::TypeFloat
889                | Token::TypeBool
890        )
891    }
892
893    /// Returns true if this token starts a statement.
894    pub fn starts_statement(&self) -> bool {
895        matches!(
896            self,
897            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
898        )
899    }
900
901    /// Returns true if this token can appear in an expression.
902    pub fn is_value(&self) -> bool {
903        matches!(
904            self,
905            Token::String(_)
906                | Token::SingleString(_)
907                | Token::HereDoc(_)
908                | Token::Arithmetic(_)
909                | Token::Int(_)
910                | Token::Float(_)
911                | Token::True
912                | Token::False
913                | Token::VarRef(_)
914                | Token::SimpleVarRef(_)
915                | Token::CmdSubstStart
916                | Token::Path(_)
917                | Token::LastExitCode
918                | Token::CurrentPid
919        )
920    }
921}
922
/// Result of preprocessing arithmetic expressions.
///
/// Produced by `preprocess_arithmetic` and consumed by `tokenize`, which
/// lexes `text` and later swaps each marker back for an `Arithmetic` token.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs, in source order.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions back to the
    /// original source's byte offsets.
    replacements: Vec<SpanReplacement>,
}
932
/// Preprocess arithmetic expressions in source code.
///
/// Finds `$((expr))` patterns and replaces them with markers.
/// Returns the preprocessed source, arithmetic contents, and span replacement info.
///
/// Example:
///   `X=$((1 + 2))`
/// Becomes:
///   `X=__KAISH_ARITH_{id}__`
/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
///
/// An unterminated `$((` (no closing `))` before EOF) is not an error: the
/// rest of the input is captured as the expression body.
///
/// # Errors
/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // Byte offset into `source` (spans are byte-based); `i` below indexes
    // into `chars_vec` by char, so the two advance at different rates for
    // multi-byte characters.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Look for $(( (potential arithmetic)
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            // Byte position in the preprocessed output where the marker will
            // land, and the byte position in the original where `$((` starts.
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip $(( — '$' and '(' are single-byte, so +3 bytes is exact.
            i += 3;
            source_pos += 3;

            // Collect expression until matching ))
            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes a nested `(` inside the expression.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // Found closing ))
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Single ) inside - keep going
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Calculate original length in bytes: from $(( through )) inclusive.
            let original_len = source_pos - original_start;

            // Create a unique marker for this arithmetic (collision-resistant)
            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            // Record the replacement for span correction
            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            // Ordinary character: copy through and advance by its byte width.
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1037
/// Preprocess here-docs in source code.
///
/// Finds `<<WORD` patterns and collects content until the delimiter line.
/// Returns the preprocessed source and a vector of (marker, content) pairs.
///
/// Example:
///   `cat <<EOF\nhello\nworld\nEOF`
/// Becomes:
///   `cat <<__HEREDOC_0__`
/// With heredocs[0] = ("__HEREDOC_0__", "hello\nworld")
///
/// Supports `<<-WORD` (leading tabs stripped when matching the delimiter)
/// and quoted delimiters (`<<'EOF'` / `<<"EOF"`). CRLF line endings in the
/// body are normalized to `\n`. An unterminated here-doc consumes the rest
/// of the input as content.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        // Look for << (potential here-doc)
        if ch == '<' && chars.peek() == Some(&'<') {
            chars.next(); // consume second <

            // Check for optional - (strip leading tabs)
            let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip whitespace before delimiter
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Collect the delimiter word (either up to the closing quote, or
            // up to the next whitespace when unquoted)
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    if Some(c) == quote_char {
                        chars.next(); // consume closing quote
                        break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            if delimiter.is_empty() {
                // Not a valid here-doc, output << literally
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Skip to newline.
            // NOTE(review): any other text between the delimiter and the
            // newline is pushed into `result` here, i.e. *before* the
            // `<<marker` emitted further below, so trailing same-line text
            // ends up in front of the redirect — confirm this is intended.
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    result.push(ch);
                }
            }

            // Collect content until delimiter on its own line
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        // Check if this line is the delimiter
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found end of here-doc
                            break;
                        }
                        // Add line to content
                        if !content.is_empty() || !current_line.is_empty() {
                            content.push_str(&current_line);
                            content.push('\n');
                        }
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Handle \r\n (normalized to a plain \n in `content`)
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        if !content.is_empty() || !current_line.is_empty() {
                            content.push_str(&current_line);
                            content.push('\n');
                        }
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF - check if current line is the delimiter
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found delimiter at EOF
                            break;
                        }
                        // Not a delimiter - include remaining content
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Remove trailing newline(s) from content (we'll add it when needed)
            let content = content.trim_end_matches('\n').to_string();

            // Create a unique marker for this here-doc (collision-resistant)
            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content));

            // Output << and marker
            result.push_str("<<");
            result.push_str(&marker);
            result.push('\n'); // Preserve newline after here-doc
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1202
/// Tokenize source code into a vector of spanned tokens.
///
/// Skips whitespace and comments (unless you need them for formatting).
/// Returns errors with their positions for nice error messages.
///
/// Handles:
/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
///
/// Spans are corrected back to original-source coordinates for arithmetic
/// replacements; here-doc replacements are not span-corrected (see below).
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
    // A preprocessing failure is reported as a single error spanning the whole input.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    // Then preprocess here-docs (heredoc span tracking is not implemented for simplicity)
    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    // Combine replacements for span correction (arithmetic only for now)
    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Correct the span from preprocessed coordinates to original coordinates
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                // Skip comments and line continuations - they're not needed for parsing
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Post-process: replace markers with actual token content
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__).
        // Let-chain: the Ident must look like a marker AND match one actually
        // recorded during preprocessing; otherwise it falls through untouched.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
                    i += 1;
                    continue;
                }

        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
        if matches!(tokens[i].token, Token::HereDocStart) {
            // Check if next token is a heredoc marker
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                        // Find the corresponding content; replaces the pair
                        // (HereDocStart, Ident-marker) with (HereDocStart, HereDoc)
                        if let Some((_, content)) = heredocs.iter().find(|(marker, _)| marker == name) {
                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                            final_tokens.push(Spanned::new(Token::HereDoc(content.clone()), tokens[i + 1].span.clone()));
                            i += 2;
                            continue;
                        }
                    }
        }

        // Check for arithmetic markers inside string content — a marker can
        // end up interpolated inside a double-quoted string literal.
        let token = if let Token::String(ref s) = tokens[i].token {
            // Check if string contains any arithmetic markers
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    // Replace marker with the special format that parse_interpolated_string can detect
                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(final_tokens)
}
1301
1302/// Tokenize source code, preserving comments.
1303///
1304/// Useful for pretty-printing or formatting tools that need to preserve comments.
1305pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1306    let lexer = Token::lexer(source);
1307    let mut tokens = Vec::new();
1308    let mut errors = Vec::new();
1309
1310    for (result, span) in lexer.spanned() {
1311        match result {
1312            Ok(token) => {
1313                tokens.push(Spanned::new(token, span));
1314            }
1315            Err(err) => {
1316                errors.push(Spanned::new(err, span));
1317            }
1318        }
1319    }
1320
1321    if errors.is_empty() {
1322        Ok(tokens)
1323    } else {
1324        Err(errors)
1325    }
1326}
1327
1328/// Extract the string content from a string token (removes quotes, processes escapes).
1329pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1330    // Remove surrounding quotes
1331    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1332        return Err(LexerError::UnterminatedString);
1333    }
1334
1335    let inner = &source[1..source.len() - 1];
1336    let mut result = String::with_capacity(inner.len());
1337    let mut chars = inner.chars().peekable();
1338
1339    while let Some(ch) = chars.next() {
1340        if ch == '\\' {
1341            match chars.next() {
1342                Some('n') => result.push('\n'),
1343                Some('t') => result.push('\t'),
1344                Some('r') => result.push('\r'),
1345                Some('\\') => result.push('\\'),
1346                Some('"') => result.push('"'),
1347                // Use a unique marker for escaped dollar that won't be re-interpreted
1348                // parse_interpolated_string will convert this back to $
1349                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1350                Some('u') => {
1351                    // Unicode escape: \uXXXX
1352                    let mut hex = String::with_capacity(4);
1353                    for _ in 0..4 {
1354                        match chars.next() {
1355                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1356                            _ => return Err(LexerError::InvalidEscape),
1357                        }
1358                    }
1359                    let codepoint = u32::from_str_radix(&hex, 16)
1360                        .map_err(|_| LexerError::InvalidEscape)?;
1361                    let ch = char::from_u32(codepoint)
1362                        .ok_or(LexerError::InvalidEscape)?;
1363                    result.push(ch);
1364                }
1365                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1366                Some(next) => {
1367                    result.push('\\');
1368                    result.push(next);
1369                }
1370                None => return Err(LexerError::InvalidEscape),
1371            }
1372        } else {
1373            result.push(ch);
1374        }
1375    }
1376
1377    Ok(result)
1378}
1379
1380/// Parse a variable reference, extracting the path segments.
1381/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1382pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1383    // Remove ${ and }
1384    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1385        return Err(LexerError::UnterminatedVarRef);
1386    }
1387
1388    let inner = &source[2..source.len() - 1];
1389
1390    // Special case: $? (last result)
1391    if inner == "?" {
1392        return Ok(vec!["?".to_string()]);
1393    }
1394
1395    let mut segments = Vec::new();
1396    let mut current = String::new();
1397    let mut chars = inner.chars().peekable();
1398
1399    while let Some(ch) = chars.next() {
1400        match ch {
1401            '.' => {
1402                if !current.is_empty() {
1403                    segments.push(current.clone());
1404                    current.clear();
1405                }
1406            }
1407            '[' => {
1408                if !current.is_empty() {
1409                    segments.push(current.clone());
1410                    current.clear();
1411                }
1412                // Collect the index
1413                let mut index = String::from("[");
1414                while let Some(&c) = chars.peek() {
1415                    if let Some(c) = chars.next() {
1416                        index.push(c);
1417                    }
1418                    if c == ']' {
1419                        break;
1420                    }
1421                }
1422                segments.push(index);
1423            }
1424            _ => {
1425                current.push(ch);
1426            }
1427        }
1428    }
1429
1430    if !current.is_empty() {
1431        segments.push(current);
1432    }
1433
1434    Ok(segments)
1435}
1436
1437/// Parse an integer literal.
1438pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1439    source.parse().map_err(|_| LexerError::InvalidNumber)
1440}
1441
1442/// Parse a float literal.
1443pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1444    source.parse().map_err(|_| LexerError::InvalidNumber)
1445}
1446
1447#[cfg(test)]
1448mod tests {
1449    use super::*;
1450
    /// Test helper: tokenize `source`, panic on lexer errors, and strip
    /// the spans so tests can compare plain token sequences.
    fn lex(source: &str) -> Vec<Token> {
        tokenize(source)
            .expect("lexer should succeed")
            .into_iter()
            .map(|s| s.token)
            .collect()
    }
1458
1459    // ═══════════════════════════════════════════════════════════════════
1460    // Keyword tests
1461    // ═══════════════════════════════════════════════════════════════════
1462
    // Every reserved word lexes to its dedicated keyword token.
    #[test]
    fn keywords() {
        assert_eq!(lex("set"), vec![Token::Set]);
        assert_eq!(lex("if"), vec![Token::If]);
        assert_eq!(lex("then"), vec![Token::Then]);
        assert_eq!(lex("else"), vec![Token::Else]);
        assert_eq!(lex("elif"), vec![Token::Elif]);
        assert_eq!(lex("fi"), vec![Token::Fi]);
        assert_eq!(lex("for"), vec![Token::For]);
        assert_eq!(lex("in"), vec![Token::In]);
        assert_eq!(lex("do"), vec![Token::Do]);
        assert_eq!(lex("done"), vec![Token::Done]);
        assert_eq!(lex("case"), vec![Token::Case]);
        assert_eq!(lex("esac"), vec![Token::Esac]);
        assert_eq!(lex("function"), vec![Token::Function]);
        assert_eq!(lex("true"), vec![Token::True]);
        assert_eq!(lex("false"), vec![Token::False]);
    }
1481
    // `;;` lexes as one DoubleSemi token (case-arm terminator), not two Semis.
    #[test]
    fn double_semicolon() {
        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
        // In case pattern context
        assert_eq!(lex("echo \"hi\";;"), vec![
            Token::Ident("echo".to_string()),
            Token::String("hi".to_string()),
            Token::DoubleSemi,
        ]);
    }
1492
    // Type names lex as dedicated type-keyword tokens, not plain identifiers.
    #[test]
    fn type_keywords() {
        assert_eq!(lex("string"), vec![Token::TypeString]);
        assert_eq!(lex("int"), vec![Token::TypeInt]);
        assert_eq!(lex("float"), vec![Token::TypeFloat]);
        assert_eq!(lex("bool"), vec![Token::TypeBool]);
    }
1500
1501    // ═══════════════════════════════════════════════════════════════════
1502    // Operator tests
1503    // ═══════════════════════════════════════════════════════════════════
1504
    // Each single-character operator/punctuation mark maps to its own token.
    #[test]
    fn single_char_operators() {
        assert_eq!(lex("="), vec![Token::Eq]);
        assert_eq!(lex("|"), vec![Token::Pipe]);
        assert_eq!(lex("&"), vec![Token::Amp]);
        assert_eq!(lex(">"), vec![Token::Gt]);
        assert_eq!(lex("<"), vec![Token::Lt]);
        assert_eq!(lex(";"), vec![Token::Semi]);
        assert_eq!(lex(":"), vec![Token::Colon]);
        assert_eq!(lex(","), vec![Token::Comma]);
        assert_eq!(lex("."), vec![Token::Dot]);
    }
1517
    // Multi-character operators lex as single tokens (longest match wins).
    #[test]
    fn multi_char_operators() {
        assert_eq!(lex("&&"), vec![Token::And]);
        assert_eq!(lex("||"), vec![Token::Or]);
        assert_eq!(lex("=="), vec![Token::EqEq]);
        assert_eq!(lex("!="), vec![Token::NotEq]);
        assert_eq!(lex("=~"), vec![Token::Match]);
        assert_eq!(lex("!~"), vec![Token::NotMatch]);
        assert_eq!(lex(">="), vec![Token::GtEq]);
        assert_eq!(lex("<="), vec![Token::LtEq]);
        assert_eq!(lex(">>"), vec![Token::GtGt]);
        assert_eq!(lex("2>"), vec![Token::Stderr]);
        assert_eq!(lex("&>"), vec![Token::Both]);
    }
1532
    // All three bracket pairs produce distinct open/close tokens.
    #[test]
    fn brackets() {
        assert_eq!(lex("{"), vec![Token::LBrace]);
        assert_eq!(lex("}"), vec![Token::RBrace]);
        assert_eq!(lex("["), vec![Token::LBracket]);
        assert_eq!(lex("]"), vec![Token::RBracket]);
        assert_eq!(lex("("), vec![Token::LParen]);
        assert_eq!(lex(")"), vec![Token::RParen]);
    }
1542
1543    // ═══════════════════════════════════════════════════════════════════
1544    // Literal tests
1545    // ═══════════════════════════════════════════════════════════════════
1546
    // Integer literals, including a leading minus, lex as a single Int token.
    #[test]
    fn integers() {
        assert_eq!(lex("0"), vec![Token::Int(0)]);
        assert_eq!(lex("42"), vec![Token::Int(42)]);
        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
    }
1554
    // Float literals (digits on both sides of the dot) lex as Float tokens.
    #[test]
    fn floats() {
        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
    }
1561
    // Double-quoted strings arrive with quotes removed and escapes processed.
    #[test]
    fn strings() {
        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
    }
1570
    // `${...}` references keep their full source text in the VarRef token;
    // path extraction happens later in parse_var_ref.
    #[test]
    fn var_refs() {
        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
    }
1579
1580    // ═══════════════════════════════════════════════════════════════════
1581    // Identifier tests
1582    // ═══════════════════════════════════════════════════════════════════
1583
    // Identifiers may contain underscores, hyphens, and trailing digits.
    #[test]
    fn identifiers() {
        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
    }
1592
    // Keyword recognition must not fire on mere prefixes of longer words.
    #[test]
    fn keyword_prefix_identifiers() {
        // Identifiers that start with keywords but aren't keywords
        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
        assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
    }
1602
1603    // ═══════════════════════════════════════════════════════════════════
1604    // Statement tests
1605    // ═══════════════════════════════════════════════════════════════════
1606
    // `set NAME = value` lexes to the expected four-token sequence.
    #[test]
    fn assignment() {
        assert_eq!(
            lex("set X = 5"),
            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }
1614
    // A command name is an Ident; a string argument follows as a String token.
    #[test]
    fn command_simple() {
        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
        assert_eq!(
            lex(r#"echo "hello""#),
            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
        );
    }
1623
    // Bare-word arguments lex as a flat sequence of Ident tokens.
    #[test]
    fn command_with_args() {
        assert_eq!(
            lex("cmd arg1 arg2"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
        );
    }
1631
    // `key=value` is lexed as three tokens; grouping into a named arg is the
    // parser's job.
    #[test]
    fn command_with_named_args() {
        assert_eq!(
            lex("cmd key=value"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
        );
    }
1639
    // Pipelines interleave command Idents with Pipe tokens.
    #[test]
    fn pipeline() {
        assert_eq!(
            lex("a | b | c"),
            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
        );
    }
1647
    // A full if/then/fi form lexes to the expected keyword/command sequence.
    #[test]
    fn if_statement() {
        assert_eq!(
            lex("if true; then echo; fi"),
            vec![
                Token::If,
                Token::True,
                Token::Semi,
                Token::Then,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Fi
            ]
        );
    }
1663
    // A full for/in/do/done form lexes to the expected keyword/ident sequence.
    #[test]
    fn for_loop() {
        assert_eq!(
            lex("for X in items; do echo; done"),
            vec![
                Token::For,
                Token::Ident("X".to_string()),
                Token::In,
                Token::Ident("items".to_string()),
                Token::Semi,
                Token::Do,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Done
            ]
        );
    }
1681
1682    // ═══════════════════════════════════════════════════════════════════
1683    // Whitespace and newlines
1684    // ═══════════════════════════════════════════════════════════════════
1685
    // Extra spaces never change the token stream.
    #[test]
    fn whitespace_ignored() {
        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
    }
1690
    // Unlike spaces, newlines are significant and survive as Newline tokens.
    #[test]
    fn newlines_preserved() {
        let tokens = lex("a\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }
1699
    // Consecutive newlines are not collapsed: one Newline token each.
    #[test]
    fn multiple_newlines() {
        let tokens = lex("a\n\n\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
        );
    }
1708
1709    // ═══════════════════════════════════════════════════════════════════
1710    // Comments
1711    // ═══════════════════════════════════════════════════════════════════
1712
    // tokenize() drops `#` comments, but the Newline that ends a comment
    // line still comes through.
    #[test]
    fn comments_skipped() {
        assert_eq!(lex("# comment"), vec![]);
        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
        assert_eq!(
            lex("a # comment\nb"),
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }
1722
1723    #[test]
1724    fn comments_preserved_when_requested() {
1725        let tokens = tokenize_with_comments("a # comment")
1726            .expect("should succeed")
1727            .into_iter()
1728            .map(|s| s.token)
1729            .collect::<Vec<_>>();
1730        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
1731    }
1732
1733    // ═══════════════════════════════════════════════════════════════════
1734    // String parsing
1735    // ═══════════════════════════════════════════════════════════════════
1736
1737    #[test]
1738    fn parse_simple_string() {
1739        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
1740    }
1741
1742    #[test]
1743    fn parse_string_with_escapes() {
1744        assert_eq!(
1745            parse_string_literal(r#""hello\nworld""#).expect("ok"),
1746            "hello\nworld"
1747        );
1748        assert_eq!(
1749            parse_string_literal(r#""tab\there""#).expect("ok"),
1750            "tab\there"
1751        );
1752        assert_eq!(
1753            parse_string_literal(r#""quote\"here""#).expect("ok"),
1754            "quote\"here"
1755        );
1756    }
1757
1758    #[test]
1759    fn parse_string_with_unicode() {
1760        assert_eq!(
1761            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
1762            "emoji ❤"
1763        );
1764    }
1765
1766    #[test]
1767    fn parse_string_with_escaped_dollar() {
1768        // \$ produces a marker that parse_interpolated_string will convert to $
1769        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
1770        assert_eq!(
1771            parse_string_literal(r#""\$VAR""#).expect("ok"),
1772            "__KAISH_ESCAPED_DOLLAR__VAR"
1773        );
1774        assert_eq!(
1775            parse_string_literal(r#""cost: \$100""#).expect("ok"),
1776            "cost: __KAISH_ESCAPED_DOLLAR__100"
1777        );
1778    }
1779
1780    // ═══════════════════════════════════════════════════════════════════
1781    // Variable reference parsing
1782    // ═══════════════════════════════════════════════════════════════════
1783
1784    #[test]
1785    fn parse_simple_var() {
1786        assert_eq!(
1787            parse_var_ref("${X}").expect("ok"),
1788            vec!["X"]
1789        );
1790    }
1791
1792    #[test]
1793    fn parse_var_with_field() {
1794        assert_eq!(
1795            parse_var_ref("${VAR.field}").expect("ok"),
1796            vec!["VAR", "field"]
1797        );
1798    }
1799
1800    #[test]
1801    fn parse_var_with_index() {
1802        assert_eq!(
1803            parse_var_ref("${VAR[0]}").expect("ok"),
1804            vec!["VAR", "[0]"]
1805        );
1806    }
1807
1808    #[test]
1809    fn parse_var_nested() {
1810        assert_eq!(
1811            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
1812            vec!["VAR", "field", "[0]", "nested"]
1813        );
1814    }
1815
1816    #[test]
1817    fn parse_last_result() {
1818        assert_eq!(
1819            parse_var_ref("${?}").expect("ok"),
1820            vec!["?"]
1821        );
1822        assert_eq!(
1823            parse_var_ref("${?.ok}").expect("ok"),
1824            vec!["?", "ok"]
1825        );
1826    }
1827
1828    // ═══════════════════════════════════════════════════════════════════
1829    // Number parsing
1830    // ═══════════════════════════════════════════════════════════════════
1831
1832    #[test]
1833    fn parse_integers() {
1834        assert_eq!(parse_int("0").expect("ok"), 0);
1835        assert_eq!(parse_int("42").expect("ok"), 42);
1836        assert_eq!(parse_int("-1").expect("ok"), -1);
1837    }
1838
1839    #[test]
1840    fn parse_floats() {
1841        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
1842        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
1843    }
1844
1845    // ═══════════════════════════════════════════════════════════════════
1846    // Edge cases and errors
1847    // ═══════════════════════════════════════════════════════════════════
1848
1849    #[test]
1850    fn empty_input() {
1851        assert_eq!(lex(""), vec![]);
1852    }
1853
1854    #[test]
1855    fn only_whitespace() {
1856        assert_eq!(lex("   \t\t   "), vec![]);
1857    }
1858
1859    #[test]
1860    fn json_array() {
1861        assert_eq!(
1862            lex(r#"[1, 2, 3]"#),
1863            vec![
1864                Token::LBracket,
1865                Token::Int(1),
1866                Token::Comma,
1867                Token::Int(2),
1868                Token::Comma,
1869                Token::Int(3),
1870                Token::RBracket
1871            ]
1872        );
1873    }
1874
1875    #[test]
1876    fn json_object() {
1877        assert_eq!(
1878            lex(r#"{"key": "value"}"#),
1879            vec![
1880                Token::LBrace,
1881                Token::String("key".to_string()),
1882                Token::Colon,
1883                Token::String("value".to_string()),
1884                Token::RBrace
1885            ]
1886        );
1887    }
1888
1889    #[test]
1890    fn redirect_operators() {
1891        assert_eq!(
1892            lex("cmd > file"),
1893            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
1894        );
1895        assert_eq!(
1896            lex("cmd >> file"),
1897            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
1898        );
1899        assert_eq!(
1900            lex("cmd 2> err"),
1901            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
1902        );
1903        assert_eq!(
1904            lex("cmd &> all"),
1905            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
1906        );
1907    }
1908
1909    #[test]
1910    fn background_job() {
1911        assert_eq!(
1912            lex("cmd &"),
1913            vec![Token::Ident("cmd".to_string()), Token::Amp]
1914        );
1915    }
1916
1917    #[test]
1918    fn command_substitution() {
1919        assert_eq!(
1920            lex("$(cmd)"),
1921            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
1922        );
1923        assert_eq!(
1924            lex("$(cmd arg)"),
1925            vec![
1926                Token::CmdSubstStart,
1927                Token::Ident("cmd".to_string()),
1928                Token::Ident("arg".to_string()),
1929                Token::RParen
1930            ]
1931        );
1932        assert_eq!(
1933            lex("$(a | b)"),
1934            vec![
1935                Token::CmdSubstStart,
1936                Token::Ident("a".to_string()),
1937                Token::Pipe,
1938                Token::Ident("b".to_string()),
1939                Token::RParen
1940            ]
1941        );
1942    }
1943
1944    #[test]
1945    fn complex_pipeline() {
1946        assert_eq!(
1947            lex(r#"cat file | grep pattern="foo" | head count=10"#),
1948            vec![
1949                Token::Ident("cat".to_string()),
1950                Token::Ident("file".to_string()),
1951                Token::Pipe,
1952                Token::Ident("grep".to_string()),
1953                Token::Ident("pattern".to_string()),
1954                Token::Eq,
1955                Token::String("foo".to_string()),
1956                Token::Pipe,
1957                Token::Ident("head".to_string()),
1958                Token::Ident("count".to_string()),
1959                Token::Eq,
1960                Token::Int(10),
1961            ]
1962        );
1963    }
1964
1965    // ═══════════════════════════════════════════════════════════════════
1966    // Flag tests
1967    // ═══════════════════════════════════════════════════════════════════
1968
1969    #[test]
1970    fn short_flag() {
1971        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
1972        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
1973        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
1974    }
1975
1976    #[test]
1977    fn short_flag_combined() {
1978        // Combined short flags like -la
1979        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
1980        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
1981    }
1982
1983    #[test]
1984    fn long_flag() {
1985        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
1986        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
1987        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
1988    }
1989
1990    #[test]
1991    fn double_dash() {
1992        // -- alone marks end of flags
1993        assert_eq!(lex("--"), vec![Token::DoubleDash]);
1994    }
1995
1996    #[test]
1997    fn flags_vs_negative_numbers() {
1998        // -123 should be a negative integer, not a flag
1999        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2000        // -l should be a flag
2001        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2002        // -1a is ambiguous - should be Int(-1) then Ident(a)
2003        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2004        assert_eq!(
2005            lex("-1 a"),
2006            vec![Token::Int(-1), Token::Ident("a".to_string())]
2007        );
2008    }
2009
2010    #[test]
2011    fn command_with_flags() {
2012        assert_eq!(
2013            lex("ls -l"),
2014            vec![
2015                Token::Ident("ls".to_string()),
2016                Token::ShortFlag("l".to_string()),
2017            ]
2018        );
2019        assert_eq!(
2020            lex("git commit -m"),
2021            vec![
2022                Token::Ident("git".to_string()),
2023                Token::Ident("commit".to_string()),
2024                Token::ShortFlag("m".to_string()),
2025            ]
2026        );
2027        assert_eq!(
2028            lex("git push --force"),
2029            vec![
2030                Token::Ident("git".to_string()),
2031                Token::Ident("push".to_string()),
2032                Token::LongFlag("force".to_string()),
2033            ]
2034        );
2035    }
2036
2037    #[test]
2038    fn flag_with_value() {
2039        assert_eq!(
2040            lex(r#"git commit -m "message""#),
2041            vec![
2042                Token::Ident("git".to_string()),
2043                Token::Ident("commit".to_string()),
2044                Token::ShortFlag("m".to_string()),
2045                Token::String("message".to_string()),
2046            ]
2047        );
2048        assert_eq!(
2049            lex(r#"--message="hello""#),
2050            vec![
2051                Token::LongFlag("message".to_string()),
2052                Token::Eq,
2053                Token::String("hello".to_string()),
2054            ]
2055        );
2056    }
2057
2058    #[test]
2059    fn end_of_flags_marker() {
2060        assert_eq!(
2061            lex("git checkout -- file"),
2062            vec![
2063                Token::Ident("git".to_string()),
2064                Token::Ident("checkout".to_string()),
2065                Token::DoubleDash,
2066                Token::Ident("file".to_string()),
2067            ]
2068        );
2069    }
2070
2071    // ═══════════════════════════════════════════════════════════════════
2072    // Bash compatibility tokens
2073    // ═══════════════════════════════════════════════════════════════════
2074
2075    #[test]
2076    fn local_keyword() {
2077        assert_eq!(lex("local"), vec![Token::Local]);
2078        assert_eq!(
2079            lex("local X = 5"),
2080            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2081        );
2082    }
2083
2084    #[test]
2085    fn simple_var_ref() {
2086        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2087        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2088        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2089        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2090    }
2091
2092    #[test]
2093    fn simple_var_ref_in_command() {
2094        assert_eq!(
2095            lex("echo $NAME"),
2096            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2097        );
2098    }
2099
2100    #[test]
2101    fn single_quoted_strings() {
2102        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2103        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2104        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2105        // Single quotes don't process escapes or variables
2106        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2107        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2108    }
2109
2110    #[test]
2111    fn test_brackets() {
2112        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2113        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2114        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2115        assert_eq!(
2116            lex("[[ -f file ]]"),
2117            vec![
2118                Token::LBracket,
2119                Token::LBracket,
2120                Token::ShortFlag("f".to_string()),
2121                Token::Ident("file".to_string()),
2122                Token::RBracket,
2123                Token::RBracket
2124            ]
2125        );
2126    }
2127
2128    #[test]
2129    fn test_expression_syntax() {
2130        assert_eq!(
2131            lex(r#"[[ $X == "value" ]]"#),
2132            vec![
2133                Token::LBracket,
2134                Token::LBracket,
2135                Token::SimpleVarRef("X".to_string()),
2136                Token::EqEq,
2137                Token::String("value".to_string()),
2138                Token::RBracket,
2139                Token::RBracket
2140            ]
2141        );
2142    }
2143
2144    #[test]
2145    fn bash_style_assignment() {
2146        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2147        assert_eq!(
2148            lex(r#"NAME="value""#),
2149            vec![
2150                Token::Ident("NAME".to_string()),
2151                Token::Eq,
2152                Token::String("value".to_string())
2153            ]
2154        );
2155    }
2156
2157    #[test]
2158    fn positional_params() {
2159        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2160        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2161        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2162        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2163        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2164    }
2165
2166    #[test]
2167    fn positional_in_context() {
2168        assert_eq!(
2169            lex("echo $1 $2"),
2170            vec![
2171                Token::Ident("echo".to_string()),
2172                Token::Positional(1),
2173                Token::Positional(2),
2174            ]
2175        );
2176    }
2177
2178    #[test]
2179    fn var_length() {
2180        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2181        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2182        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2183    }
2184
2185    #[test]
2186    fn var_length_in_context() {
2187        assert_eq!(
2188            lex("echo ${#NAME}"),
2189            vec![
2190                Token::Ident("echo".to_string()),
2191                Token::VarLength("NAME".to_string()),
2192            ]
2193        );
2194    }
2195
2196    // ═══════════════════════════════════════════════════════════════════
2197    // Edge case tests: Flag ambiguities
2198    // ═══════════════════════════════════════════════════════════════════
2199
2200    #[test]
2201    fn plus_flag() {
2202        // Plus flags for set +e
2203        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2204        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2205        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2206    }
2207
2208    #[test]
2209    fn set_with_plus_flag() {
2210        assert_eq!(
2211            lex("set +e"),
2212            vec![
2213                Token::Set,
2214                Token::PlusFlag("e".to_string()),
2215            ]
2216        );
2217    }
2218
2219    #[test]
2220    fn set_with_multiple_flags() {
2221        assert_eq!(
2222            lex("set -e -u"),
2223            vec![
2224                Token::Set,
2225                Token::ShortFlag("e".to_string()),
2226                Token::ShortFlag("u".to_string()),
2227            ]
2228        );
2229    }
2230
2231    #[test]
2232    fn flags_vs_negative_numbers_edge_cases() {
2233        // -1a should be negative int followed by ident
2234        assert_eq!(
2235            lex("-1 a"),
2236            vec![Token::Int(-1), Token::Ident("a".to_string())]
2237        );
2238        // -l is a flag
2239        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2240        // -123 is negative number
2241        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2242    }
2243
2244    #[test]
2245    fn single_dash_is_minus_alone() {
2246        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2247        let result = tokenize("-").expect("should lex");
2248        assert_eq!(result.len(), 1);
2249        assert!(matches!(result[0].token, Token::MinusAlone));
2250    }
2251
2252    #[test]
2253    fn plus_bare_for_date_format() {
2254        // `date +%s` - the +%s should be PlusBare
2255        let result = tokenize("+%s").expect("should lex");
2256        assert_eq!(result.len(), 1);
2257        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2258
2259        // `date +%Y-%m-%d` - format string with dashes
2260        let result = tokenize("+%Y-%m-%d").expect("should lex");
2261        assert_eq!(result.len(), 1);
2262        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2263    }
2264
2265    #[test]
2266    fn plus_flag_still_works() {
2267        // `set +e` - should still be PlusFlag
2268        let result = tokenize("+e").expect("should lex");
2269        assert_eq!(result.len(), 1);
2270        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2271    }
2272
2273    #[test]
2274    fn while_keyword_vs_while_loop() {
2275        // 'while' as keyword in loop context
2276        assert_eq!(lex("while"), vec![Token::While]);
2277        // 'while' at start followed by condition
2278        assert_eq!(
2279            lex("while true"),
2280            vec![Token::While, Token::True]
2281        );
2282    }
2283
2284    #[test]
2285    fn control_flow_keywords() {
2286        assert_eq!(lex("break"), vec![Token::Break]);
2287        assert_eq!(lex("continue"), vec![Token::Continue]);
2288        assert_eq!(lex("return"), vec![Token::Return]);
2289        assert_eq!(lex("exit"), vec![Token::Exit]);
2290    }
2291
2292    #[test]
2293    fn control_flow_with_numbers() {
2294        assert_eq!(
2295            lex("break 2"),
2296            vec![Token::Break, Token::Int(2)]
2297        );
2298        assert_eq!(
2299            lex("continue 3"),
2300            vec![Token::Continue, Token::Int(3)]
2301        );
2302        assert_eq!(
2303            lex("exit 1"),
2304            vec![Token::Exit, Token::Int(1)]
2305        );
2306    }
2307
2308    // ═══════════════════════════════════════════════════════════════════
2309    // Here-doc tests
2310    // ═══════════════════════════════════════════════════════════════════
2311
2312    #[test]
2313    fn heredoc_simple() {
2314        let source = "cat <<EOF\nhello\nworld\nEOF";
2315        let tokens = lex(source);
2316        assert_eq!(tokens, vec![
2317            Token::Ident("cat".to_string()),
2318            Token::HereDocStart,
2319            Token::HereDoc("hello\nworld".to_string()),
2320            Token::Newline,
2321        ]);
2322    }
2323
2324    #[test]
2325    fn heredoc_empty() {
2326        let source = "cat <<EOF\nEOF";
2327        let tokens = lex(source);
2328        assert_eq!(tokens, vec![
2329            Token::Ident("cat".to_string()),
2330            Token::HereDocStart,
2331            Token::HereDoc("".to_string()),
2332            Token::Newline,
2333        ]);
2334    }
2335
2336    #[test]
2337    fn heredoc_with_special_chars() {
2338        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2339        let tokens = lex(source);
2340        assert_eq!(tokens, vec![
2341            Token::Ident("cat".to_string()),
2342            Token::HereDocStart,
2343            Token::HereDoc("$VAR and \"quoted\" 'single'".to_string()),
2344            Token::Newline,
2345        ]);
2346    }
2347
2348    #[test]
2349    fn heredoc_multiline() {
2350        let source = "cat <<END\nline1\nline2\nline3\nEND";
2351        let tokens = lex(source);
2352        assert_eq!(tokens, vec![
2353            Token::Ident("cat".to_string()),
2354            Token::HereDocStart,
2355            Token::HereDoc("line1\nline2\nline3".to_string()),
2356            Token::Newline,
2357        ]);
2358    }
2359
2360    #[test]
2361    fn heredoc_in_command() {
2362        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2363        let tokens = lex(source);
2364        assert_eq!(tokens, vec![
2365            Token::Ident("cat".to_string()),
2366            Token::HereDocStart,
2367            Token::HereDoc("hello".to_string()),
2368            Token::Newline,
2369            Token::Ident("echo".to_string()),
2370            Token::Ident("goodbye".to_string()),
2371        ]);
2372    }
2373
2374    #[test]
2375    fn heredoc_strip_tabs() {
2376        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2377        let tokens = lex(source);
2378        // Content has tabs preserved, only delimiter matching strips tabs
2379        assert_eq!(tokens, vec![
2380            Token::Ident("cat".to_string()),
2381            Token::HereDocStart,
2382            Token::HereDoc("\thello\n\tworld".to_string()),
2383            Token::Newline,
2384        ]);
2385    }
2386
2387    // ═══════════════════════════════════════════════════════════════════
2388    // Arithmetic expression tests
2389    // ═══════════════════════════════════════════════════════════════════
2390
2391    #[test]
2392    fn arithmetic_simple() {
2393        let source = "$((1 + 2))";
2394        let tokens = lex(source);
2395        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2396    }
2397
2398    #[test]
2399    fn arithmetic_in_assignment() {
2400        let source = "X=$((5 * 3))";
2401        let tokens = lex(source);
2402        assert_eq!(tokens, vec![
2403            Token::Ident("X".to_string()),
2404            Token::Eq,
2405            Token::Arithmetic("5 * 3".to_string()),
2406        ]);
2407    }
2408
2409    #[test]
2410    fn arithmetic_with_nested_parens() {
2411        let source = "$((2 * (3 + 4)))";
2412        let tokens = lex(source);
2413        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2414    }
2415
2416    #[test]
2417    fn arithmetic_with_variable() {
2418        let source = "$((X + 1))";
2419        let tokens = lex(source);
2420        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2421    }
2422
2423    #[test]
2424    fn arithmetic_command_subst_not_confused() {
2425        // $( should not be treated as arithmetic
2426        let source = "$(echo hello)";
2427        let tokens = lex(source);
2428        assert_eq!(tokens, vec![
2429            Token::CmdSubstStart,
2430            Token::Ident("echo".to_string()),
2431            Token::Ident("hello".to_string()),
2432            Token::RParen,
2433        ]);
2434    }
2435
2436    #[test]
2437    fn arithmetic_nesting_limit() {
2438        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
2439        let open_parens = "(".repeat(300);
2440        let close_parens = ")".repeat(300);
2441        let source = format!("$(({}1{}))", open_parens, close_parens);
2442        let result = tokenize(&source);
2443        assert!(result.is_err());
2444        let errors = result.unwrap_err();
2445        assert_eq!(errors.len(), 1);
2446        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2447    }
2448
2449    #[test]
2450    fn arithmetic_nesting_within_limit() {
2451        // Nesting within limit should work
2452        let source = "$((((1 + 2) * 3)))";
2453        let tokens = lex(source);
2454        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2455    }
2456
2457    // ═══════════════════════════════════════════════════════════════════
2458    // Token category tests
2459    // ═══════════════════════════════════════════════════════════════════
2460
2461    #[test]
2462    fn token_categories() {
2463        // Keywords
2464        assert_eq!(Token::If.category(), TokenCategory::Keyword);
2465        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2466        assert_eq!(Token::For.category(), TokenCategory::Keyword);
2467        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2468        assert_eq!(Token::True.category(), TokenCategory::Keyword);
2469        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2470
2471        // Operators
2472        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2473        assert_eq!(Token::And.category(), TokenCategory::Operator);
2474        assert_eq!(Token::Or.category(), TokenCategory::Operator);
2475        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2476        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2477
2478        // Strings
2479        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2480        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2481        assert_eq!(Token::HereDoc("test".to_string()).category(), TokenCategory::String);
2482
2483        // Numbers
2484        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2485        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2486        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2487
2488        // Variables
2489        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2490        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2491        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2492        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2493        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2494        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2495        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2496
2497        // Flags
2498        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2499        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2500        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2501        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2502
2503        // Punctuation
2504        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2505        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2506        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2507        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2508
2509        // Comments
2510        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2511
2512        // Paths
2513        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2514
2515        // Commands
2516        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2517
2518        // Errors
2519        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2520        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2521        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2522    }
2523}