Skip to main content

kaish_kernel/
lexer.rs

//! Lexer for kaish source code.
//!
//! Converts source text into a stream of tokens using the logos lexer generator.
//! The lexer is designed to be unambiguous: every valid input produces exactly
//! one token sequence, and invalid input produces clear errors.
//!
//! # Token Categories
//!
//! - **Keywords**: `set`, `local`, `if`, `then`, `elif`, `else`, `fi`, `for`, `while`, `in`, `do`, `done`, `case`, `esac`, `function`
//! - **Literals**: strings, integers, floats, booleans (`true`/`false`)
//! - **Operators**: `=`, `|`, `&`, `>`, `>>`, `<`, `2>`, `&>`, `&&`, `||`
//! - **Punctuation**: `;`, `:`, `,`, `.`, `{`, `}`, `[`, `]`
//! - **Variable references**: `${...}` with nested path access
//! - **Identifiers**: command names, variable names, parameter names
15
16use logos::{Logos, Span};
17use std::fmt;
18use std::sync::atomic::{AtomicU64, Ordering};
19use std::time::{SystemTime, UNIX_EPOCH};
20
/// Global counter for generating unique markers across all tokenize calls.
/// Monotonically increasing; combined with a timestamp and the process ID
/// in `unique_marker_id` so markers cannot collide across calls.
static MARKER_COUNTER: AtomicU64 = AtomicU64::new(0);

/// Maximum nesting depth for parentheses in arithmetic expressions.
/// Prevents stack overflow from pathologically nested inputs like $((((((...
const MAX_PAREN_DEPTH: usize = 256;
27
/// Tracks a text replacement for span correction.
/// When preprocessing replaces text (like `$((1+2))` with a marker),
/// we need to adjust subsequent spans to account for the length change.
/// All offsets are byte positions in the *preprocessed* text.
#[derive(Debug, Clone)]
struct SpanReplacement {
    /// Position in the preprocessed text where the marker starts.
    preprocessed_pos: usize,
    /// Length of the marker in preprocessed text.
    marker_len: usize,
    /// Length of the original text that was replaced.
    original_len: usize,
}
40
41/// Corrects a span from preprocessed-text coordinates back to original-text coordinates.
42fn correct_span(span: Span, replacements: &[SpanReplacement]) -> Span {
43    let mut start_adjustment: isize = 0;
44    let mut end_adjustment: isize = 0;
45
46    for r in replacements {
47        // Calculate the length difference (positive = original was longer, negative = marker is longer)
48        let delta = r.original_len as isize - r.marker_len as isize;
49
50        // If the span starts after this replacement, adjust the start
51        if span.start > r.preprocessed_pos + r.marker_len {
52            start_adjustment += delta;
53        } else if span.start > r.preprocessed_pos {
54            // Span starts inside the marker - map to original position
55            // (this shouldn't happen often, but handle it gracefully)
56            start_adjustment += delta;
57        }
58
59        // If the span ends after this replacement, adjust the end
60        if span.end > r.preprocessed_pos + r.marker_len {
61            end_adjustment += delta;
62        } else if span.end > r.preprocessed_pos {
63            // Span ends inside the marker - map to end of original
64            end_adjustment += delta;
65        }
66    }
67
68    let new_start = (span.start as isize + start_adjustment).max(0) as usize;
69    let new_end = (span.end as isize + end_adjustment).max(new_start as isize) as usize;
70    new_start..new_end
71}
72
73/// Generate a unique marker ID that's extremely unlikely to collide with user code.
74/// Uses a combination of timestamp, counter, and process ID.
75fn unique_marker_id() -> String {
76    let timestamp = SystemTime::now()
77        .duration_since(UNIX_EPOCH)
78        .map(|d| d.as_nanos())
79        .unwrap_or(0);
80    let counter = MARKER_COUNTER.fetch_add(1, Ordering::Relaxed);
81    let pid = std::process::id();
82    format!("{:x}_{:x}_{:x}", timestamp, counter, pid)
83}
84
/// A token with its span in the source text.
#[derive(Debug, Clone, PartialEq)]
pub struct Spanned<T> {
    // The wrapped token (generic so non-token payloads can also be spanned).
    pub token: T,
    // Byte range of the token in the source text.
    pub span: Span,
}

impl<T> Spanned<T> {
    /// Pairs a token with the span it was lexed from.
    pub fn new(token: T, span: Span) -> Self {
        Self { token, span }
    }
}
97
/// Lexer error types.
///
/// `UnexpectedCharacter` is the `Default` variant, which logos uses as the
/// fallback error when no token pattern matches.
#[derive(Debug, Clone, PartialEq, Default)]
pub enum LexerError {
    /// No token pattern matched the input (logos fallback error).
    #[default]
    UnexpectedCharacter,
    /// A string literal was opened but never closed.
    UnterminatedString,
    /// A `${...}` variable reference was opened but never closed.
    UnterminatedVarRef,
    /// A backslash escape inside a string was not recognized.
    InvalidEscape,
    /// A numeric literal failed to parse.
    InvalidNumber,
    /// Non-lowercase boolean spelling (e.g. `TRUE`); carries the offending text.
    AmbiguousBoolean(String),
    /// `yes`/`no`-style word that reads like a boolean; carries the text.
    AmbiguousBooleanLike(String),
    /// Identifier starting with a digit (e.g. `123abc`); carries the text.
    InvalidNumberIdent(String),
    /// Float written without a leading digit (e.g. `.5`).
    InvalidFloatNoLeading,
    /// Float written without a trailing digit (e.g. `5.`).
    InvalidFloatNoTrailing,
    /// Nesting depth exceeded (too many nested parentheses in arithmetic).
    NestingTooDeep,
}
115
116impl fmt::Display for LexerError {
117    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
118        match self {
119            LexerError::UnexpectedCharacter => write!(f, "unexpected character"),
120            LexerError::UnterminatedString => write!(f, "unterminated string"),
121            LexerError::UnterminatedVarRef => write!(f, "unterminated variable reference"),
122            LexerError::InvalidEscape => write!(f, "invalid escape sequence"),
123            LexerError::InvalidNumber => write!(f, "invalid number"),
124            LexerError::AmbiguousBoolean(s) => {
125                write!(f, "ambiguous boolean, use lowercase '{}'", s.to_lowercase())
126            }
127            LexerError::AmbiguousBooleanLike(s) => {
128                let suggest = if s.eq_ignore_ascii_case("yes") { "true" } else { "false" };
129                write!(f, "ambiguous boolean-like '{}', use '{}' or '\"{}\"'", s, suggest, s)
130            }
131            LexerError::InvalidNumberIdent(s) => {
132                write!(f, "identifier cannot start with digit: {}", s)
133            }
134            LexerError::InvalidFloatNoLeading => write!(f, "float must have leading digit"),
135            LexerError::InvalidFloatNoTrailing => write!(f, "float must have trailing digit"),
136            LexerError::NestingTooDeep => write!(f, "nesting depth exceeded (max {})", MAX_PAREN_DEPTH),
137        }
138    }
139}
140
/// Here-doc content data.
/// `literal` is true when the delimiter was quoted (<<'EOF' or <<"EOF"),
/// meaning no variable expansion should occur.
///
/// NOTE(review): a stray doc block describing the `Token` enum was previously
/// attached to this struct; it has been removed here as it documented a
/// different item.
#[derive(Debug, Clone, PartialEq)]
pub struct HereDocData {
    // Full here-doc body, without the delimiter lines.
    pub content: String,
    // True when the delimiter was quoted, suppressing variable expansion.
    pub literal: bool,
}
157
158#[derive(Logos, Debug, Clone, PartialEq)]
159#[logos(error = LexerError)]
160#[logos(skip r"[ \t]+")]
161pub enum Token {
162    // ═══════════════════════════════════════════════════════════════════
163    // Keywords (must come before Ident for priority)
164    // ═══════════════════════════════════════════════════════════════════
165    #[token("set")]
166    Set,
167
168    #[token("local")]
169    Local,
170
171    #[token("if")]
172    If,
173
174    #[token("then")]
175    Then,
176
177    #[token("else")]
178    Else,
179
180    #[token("elif")]
181    Elif,
182
183    #[token("fi")]
184    Fi,
185
186    #[token("for")]
187    For,
188
189    #[token("while")]
190    While,
191
192    #[token("in")]
193    In,
194
195    #[token("do")]
196    Do,
197
198    #[token("done")]
199    Done,
200
201    #[token("case")]
202    Case,
203
204    #[token("esac")]
205    Esac,
206
207    #[token("function")]
208    Function,
209
210    #[token("break")]
211    Break,
212
213    #[token("continue")]
214    Continue,
215
216    #[token("return")]
217    Return,
218
219    #[token("exit")]
220    Exit,
221
222    #[token("true")]
223    True,
224
225    #[token("false")]
226    False,
227
228    // ═══════════════════════════════════════════════════════════════════
229    // Type keywords (for tool parameters)
230    // ═══════════════════════════════════════════════════════════════════
231    #[token("string")]
232    TypeString,
233
234    #[token("int")]
235    TypeInt,
236
237    #[token("float")]
238    TypeFloat,
239
240    #[token("bool")]
241    TypeBool,
242
243    // ═══════════════════════════════════════════════════════════════════
244    // Multi-character operators (must come before single-char versions)
245    // ═══════════════════════════════════════════════════════════════════
246    #[token("&&")]
247    And,
248
249    #[token("||")]
250    Or,
251
252    #[token("==")]
253    EqEq,
254
255    #[token("!=")]
256    NotEq,
257
258    #[token("=~")]
259    Match,
260
261    #[token("!~")]
262    NotMatch,
263
264    #[token(">=")]
265    GtEq,
266
267    #[token("<=")]
268    LtEq,
269
270    #[token(">>")]
271    GtGt,
272
273    #[token("2>&1")]
274    StderrToStdout,
275
276    #[token("1>&2")]
277    StdoutToStderr,
278
279    #[token(">&2")]
280    StdoutToStderr2,
281
282    #[token("2>")]
283    Stderr,
284
285    #[token("&>")]
286    Both,
287
288    #[token("<<")]
289    HereDocStart,
290
291    #[token(";;")]
292    DoubleSemi,
293
294    // ═══════════════════════════════════════════════════════════════════
295    // Single-character operators and punctuation
296    // ═══════════════════════════════════════════════════════════════════
297    #[token("=")]
298    Eq,
299
300    #[token("|")]
301    Pipe,
302
303    #[token("&")]
304    Amp,
305
306    #[token(">")]
307    Gt,
308
309    #[token("<")]
310    Lt,
311
312    #[token(";")]
313    Semi,
314
315    #[token(":")]
316    Colon,
317
318    #[token(",")]
319    Comma,
320
321    #[token(".")]
322    Dot,
323
324    #[token("{")]
325    LBrace,
326
327    #[token("}")]
328    RBrace,
329
330    #[token("[")]
331    LBracket,
332
333    #[token("]")]
334    RBracket,
335
336    #[token("(")]
337    LParen,
338
339    #[token(")")]
340    RParen,
341
342    #[token("*")]
343    Star,
344
345    #[token("!")]
346    Bang,
347
348    #[token("?")]
349    Question,
350
351    // ═══════════════════════════════════════════════════════════════════
352    // Command substitution
353    // ═══════════════════════════════════════════════════════════════════
354
355    /// Arithmetic expression content: synthesized by preprocessing.
356    /// Contains the expression string between `$((` and `))`.
357    Arithmetic(String),
358
359    /// Command substitution start: `$(` - begins a command substitution
360    #[token("$(")]
361    CmdSubstStart,
362
363    // ═══════════════════════════════════════════════════════════════════
364    // Flags (must come before Int to win over negative numbers)
365    // ═══════════════════════════════════════════════════════════════════
366
367    /// Long flag: `--name` or `--foo-bar`
368    #[regex(r"--[a-zA-Z][a-zA-Z0-9-]*", lex_long_flag, priority = 3)]
369    LongFlag(String),
370
371    /// Short flag: `-l` or `-la` (combined short flags)
372    #[regex(r"-[a-zA-Z][a-zA-Z0-9]*", lex_short_flag, priority = 3)]
373    ShortFlag(String),
374
375    /// Plus flag: `+e` or `+x` (for set +e to disable options)
376    #[regex(r"\+[a-zA-Z][a-zA-Z0-9]*", lex_plus_flag, priority = 3)]
377    PlusFlag(String),
378
379    /// Double dash: `--` alone marks end of flags
380    #[token("--")]
381    DoubleDash,
382
383    /// Bare word starting with + followed by non-letter: `+%s`, `+%Y-%m-%d`
384    /// For date format strings and similar. Lower priority than PlusFlag.
385    #[regex(r"\+[^a-zA-Z\s][^\s]*", lex_plus_bare, priority = 2)]
386    PlusBare(String),
387
388    /// Bare word starting with - followed by non-letter/digit/dash: `-%`, etc.
389    /// For rare cases. Lower priority than ShortFlag, Int, and DoubleDash.
390    /// Excludes - after first - to avoid matching --name patterns.
391    #[regex(r"-[^a-zA-Z0-9\s\-][^\s]*", lex_minus_bare, priority = 1)]
392    MinusBare(String),
393
394    /// Standalone - (stdin indicator for cat -, diff - -, etc.)
395    /// Only matches when followed by whitespace or end.
396    /// This is handled specially in the parser as a positional arg.
397    #[token("-")]
398    MinusAlone,
399
400    // ═══════════════════════════════════════════════════════════════════
401    // Literals (with values)
402    // ═══════════════════════════════════════════════════════════════════
403
404    /// Double-quoted string: `"..."` - value is the parsed content (quotes removed, escapes processed)
405    #[regex(r#""([^"\\]|\\.)*""#, lex_string)]
406    String(String),
407
408    /// Single-quoted string: `'...'` - literal content, no escape processing
409    #[regex(r"'[^']*'", lex_single_string)]
410    SingleString(String),
411
412    /// Braced variable reference: `${VAR}` or `${VAR.field}` - value is the raw inner content
413    #[regex(r"\$\{[^}]+\}", lex_varref)]
414    VarRef(String),
415
416    /// Simple variable reference: `$NAME` - just the identifier
417    #[regex(r"\$[a-zA-Z_][a-zA-Z0-9_]*", lex_simple_varref)]
418    SimpleVarRef(String),
419
420    /// Positional parameter: `$0` through `$9`
421    #[regex(r"\$[0-9]", lex_positional)]
422    Positional(usize),
423
424    /// All positional parameters: `$@`
425    #[token("$@")]
426    AllArgs,
427
428    /// Number of positional parameters: `$#`
429    #[token("$#")]
430    ArgCount,
431
432    /// Last exit code: `$?`
433    #[token("$?")]
434    LastExitCode,
435
436    /// Current shell PID: `$$`
437    #[token("$$")]
438    CurrentPid,
439
440    /// Variable string length: `${#VAR}`
441    #[regex(r"\$\{#[a-zA-Z_][a-zA-Z0-9_]*\}", lex_var_length)]
442    VarLength(String),
443
444    /// Here-doc content: synthesized by preprocessing, not directly lexed.
445    /// Contains the full content of the here-doc (without the delimiter lines).
446    HereDoc(HereDocData),
447
448    /// Integer literal - value is the parsed i64
449    #[regex(r"-?[0-9]+", lex_int, priority = 2)]
450    Int(i64),
451
452    /// Float literal - value is the parsed f64
453    #[regex(r"-?[0-9]+\.[0-9]+", lex_float)]
454    Float(f64),
455
456    // ═══════════════════════════════════════════════════════════════════
457    // Invalid patterns (caught before valid tokens for better errors)
458    // ═══════════════════════════════════════════════════════════════════
459
460    /// Invalid: number followed by identifier characters (like 123abc)
461    #[regex(r"[0-9]+[a-zA-Z_][a-zA-Z0-9_-]*", lex_invalid_number_ident, priority = 3)]
462    InvalidNumberIdent,
463
464    /// Invalid: float without leading digit (like .5)
465    #[regex(r"\.[0-9]+", lex_invalid_float_no_leading, priority = 3)]
466    InvalidFloatNoLeading,
467
468    /// Invalid: float without trailing digit (like 5.)
469    /// Logos uses longest-match, so valid floats like 5.5 will match Float pattern instead
470    #[regex(r"[0-9]+\.", lex_invalid_float_no_trailing, priority = 2)]
471    InvalidFloatNoTrailing,
472
473    // ═══════════════════════════════════════════════════════════════════
474    // Paths (absolute paths starting with /)
475    // ═══════════════════════════════════════════════════════════════════
476
477    /// Absolute path: `/tmp/out`, `/etc/hosts`, etc.
478    #[regex(r"/[a-zA-Z0-9_./+-]*", lex_path)]
479    Path(String),
480
481    // ═══════════════════════════════════════════════════════════════════
482    // Identifiers (command names, variable names, etc.)
483    // ═══════════════════════════════════════════════════════════════════
484
485    /// Identifier - value is the identifier string
486    /// Allows dots for filenames like `script.kai`
487    #[regex(r"[a-zA-Z_][a-zA-Z0-9_.-]*", lex_ident)]
488    Ident(String),
489
490    // ═══════════════════════════════════════════════════════════════════
491    // Structural tokens
492    // ═══════════════════════════════════════════════════════════════════
493
494    /// Comment: `# ...` to end of line
495    #[regex(r"#[^\n\r]*", allow_greedy = true)]
496    Comment,
497
498    /// Newline (significant in kaish - ends statements)
499    #[regex(r"\n|\r\n")]
500    Newline,
501
502    /// Line continuation: backslash at end of line
503    #[regex(r"\\[ \t]*(\n|\r\n)")]
504    LineContinuation,
505}
506
/// Semantic category for syntax highlighting.
///
/// Stable enum that groups tokens by purpose. Consumers match on categories
/// instead of individual tokens, insulating them from lexer evolution.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum TokenCategory {
    /// Keywords: if, then, else, for, while, function, return, etc.
    Keyword,
    /// Operators: |, &&, ||, >, >>, 2>&1, =, ==, etc.
    Operator,
    /// String literals: "...", '...', heredocs
    String,
    /// Numeric literals: 123, 3.14, arithmetic expressions
    Number,
    /// Variable references: $foo, ${bar}, $1, $@, $#, $?, $$
    Variable,
    /// Comments: # ...
    Comment,
    /// Punctuation: ; , . ( ) { } [ ]
    Punctuation,
    /// Identifiers in command position
    Command,
    /// Absolute paths: /foo/bar
    Path,
    /// Flags: --long, -s, +x
    Flag,
    /// Invalid tokens (variants whose callbacks always error)
    Error,
}
536
impl Token {
    /// Returns the semantic category for syntax highlighting.
    ///
    /// The match is exhaustive (no `_` arm), so adding a token variant forces
    /// a categorization decision at compile time.
    pub fn category(&self) -> TokenCategory {
        match self {
            // Keywords
            Token::If
            | Token::Then
            | Token::Else
            | Token::Elif
            | Token::Fi
            | Token::For
            | Token::In
            | Token::Do
            | Token::Done
            | Token::While
            | Token::Case
            | Token::Esac
            | Token::Function
            | Token::Return
            | Token::Break
            | Token::Continue
            | Token::Exit
            | Token::Set
            | Token::Local
            | Token::True
            | Token::False
            | Token::TypeString
            | Token::TypeInt
            | Token::TypeFloat
            | Token::TypeBool => TokenCategory::Keyword,

            // Operators and redirections
            Token::Pipe
            | Token::And
            | Token::Or
            | Token::Amp
            | Token::Eq
            | Token::EqEq
            | Token::NotEq
            | Token::Match
            | Token::NotMatch
            | Token::Lt
            | Token::Gt
            | Token::LtEq
            | Token::GtEq
            | Token::GtGt
            | Token::Stderr
            | Token::Both
            | Token::HereDocStart
            | Token::StderrToStdout
            | Token::StdoutToStderr
            | Token::StdoutToStderr2 => TokenCategory::Operator,

            // Strings
            Token::String(_) | Token::SingleString(_) | Token::HereDoc(_) => TokenCategory::String,

            // Numbers
            Token::Int(_) | Token::Float(_) | Token::Arithmetic(_) => TokenCategory::Number,

            // Variables
            Token::VarRef(_)
            | Token::SimpleVarRef(_)
            | Token::Positional(_)
            | Token::AllArgs
            | Token::ArgCount
            | Token::VarLength(_)
            | Token::LastExitCode
            | Token::CurrentPid => TokenCategory::Variable,

            // Flags
            Token::LongFlag(_)
            | Token::ShortFlag(_)
            | Token::PlusFlag(_)
            | Token::DoubleDash => TokenCategory::Flag,

            // Punctuation (structural tokens are grouped here too)
            Token::Semi
            | Token::DoubleSemi
            | Token::Colon
            | Token::Comma
            | Token::Dot
            | Token::LParen
            | Token::RParen
            | Token::LBrace
            | Token::RBrace
            | Token::LBracket
            | Token::RBracket
            | Token::Bang
            | Token::Question
            | Token::Star
            | Token::Newline
            | Token::LineContinuation
            | Token::CmdSubstStart => TokenCategory::Punctuation,

            // Comments
            Token::Comment => TokenCategory::Comment,

            // Paths
            Token::Path(_) => TokenCategory::Path,

            // Commands/identifiers (and bare words)
            Token::Ident(_)
            | Token::PlusBare(_)
            | Token::MinusBare(_)
            | Token::MinusAlone => TokenCategory::Command,

            // Errors (these variants' callbacks always return Err, so they
            // should never actually be constructed)
            Token::InvalidNumberIdent
            | Token::InvalidFloatNoLeading
            | Token::InvalidFloatNoTrailing => TokenCategory::Error,
        }
    }
}
650
/// Lex a double-quoted string literal, processing escape sequences.
/// The matched slice still includes the surrounding quotes; parsing is
/// delegated to `parse_string_literal` (presumably defined elsewhere in
/// this file — not visible in this chunk; confirm its contract there).
fn lex_string(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
    parse_string_literal(lex.slice())
}
655
656/// Lex a single-quoted string literal (no escape processing).
657fn lex_single_string(lex: &mut logos::Lexer<Token>) -> String {
658    let s = lex.slice();
659    // Strip the surrounding single quotes
660    s[1..s.len() - 1].to_string()
661}
662
663/// Lex a braced variable reference, extracting the inner content.
664fn lex_varref(lex: &mut logos::Lexer<Token>) -> String {
665    // Keep the full ${...} for later parsing of path segments
666    lex.slice().to_string()
667}
668
669/// Lex a simple variable reference: `$NAME` → `NAME`
670fn lex_simple_varref(lex: &mut logos::Lexer<Token>) -> String {
671    // Strip the leading `$`
672    lex.slice()[1..].to_string()
673}
674
675/// Lex a positional parameter: `$1` → 1
676fn lex_positional(lex: &mut logos::Lexer<Token>) -> usize {
677    // Strip the leading `$` and parse the digit
678    lex.slice()[1..].parse().unwrap_or(0)
679}
680
681/// Lex a variable length: `${#VAR}` → "VAR"
682fn lex_var_length(lex: &mut logos::Lexer<Token>) -> String {
683    // Strip the leading `${#` and trailing `}`
684    let s = lex.slice();
685    s[3..s.len() - 1].to_string()
686}
687
688/// Lex an integer literal.
689fn lex_int(lex: &mut logos::Lexer<Token>) -> Result<i64, LexerError> {
690    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
691}
692
693/// Lex a float literal.
694fn lex_float(lex: &mut logos::Lexer<Token>) -> Result<f64, LexerError> {
695    lex.slice().parse().map_err(|_| LexerError::InvalidNumber)
696}
697
/// Lex an invalid number-identifier pattern (like 123abc).
/// Always returns Err to produce a lexer error instead of a token, so the
/// `Token::InvalidNumberIdent` variant itself is never constructed.
fn lex_invalid_number_ident(lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidNumberIdent(lex.slice().to_string()))
}
703
/// Lex an invalid float without leading digit (like .5).
/// Always returns Err to produce a lexer error instead of a token, so the
/// `Token::InvalidFloatNoLeading` variant itself is never constructed.
fn lex_invalid_float_no_leading(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoLeading)
}
709
/// Lex an invalid float without trailing digit (like 5.).
/// Always returns Err to produce a lexer error instead of a token, so the
/// `Token::InvalidFloatNoTrailing` variant itself is never constructed.
fn lex_invalid_float_no_trailing(_lex: &mut logos::Lexer<Token>) -> Result<(), LexerError> {
    Err(LexerError::InvalidFloatNoTrailing)
}
715
716/// Lex an identifier, rejecting ambiguous boolean-like values.
717fn lex_ident(lex: &mut logos::Lexer<Token>) -> Result<String, LexerError> {
718    let s = lex.slice();
719
720    // Reject ambiguous boolean variants (TRUE, FALSE, True, etc.)
721    // Only lowercase 'true' and 'false' are valid booleans (handled by Token::True/False)
722    match s.to_lowercase().as_str() {
723        "true" | "false" if s != "true" && s != "false" => {
724            return Err(LexerError::AmbiguousBoolean(s.to_string()));
725        }
726        _ => {}
727    }
728
729    // Reject yes/no/YES/NO/Yes/No as ambiguous boolean-like values
730    if s.eq_ignore_ascii_case("yes") || s.eq_ignore_ascii_case("no") {
731        return Err(LexerError::AmbiguousBooleanLike(s.to_string()));
732    }
733
734    Ok(s.to_string())
735}
736
737/// Lex a long flag: `--name` → `name`
738fn lex_long_flag(lex: &mut logos::Lexer<Token>) -> String {
739    // Strip the leading `--`
740    lex.slice()[2..].to_string()
741}
742
743/// Lex a short flag: `-l` → `l`, `-la` → `la`
744fn lex_short_flag(lex: &mut logos::Lexer<Token>) -> String {
745    // Strip the leading `-`
746    lex.slice()[1..].to_string()
747}
748
749/// Lex a plus flag: `+e` → `e`, `+ex` → `ex`
750fn lex_plus_flag(lex: &mut logos::Lexer<Token>) -> String {
751    // Strip the leading `+`
752    lex.slice()[1..].to_string()
753}
754
755/// Lex a plus bare word: `+%s` → `+%s` (keep the full string)
756fn lex_plus_bare(lex: &mut logos::Lexer<Token>) -> String {
757    lex.slice().to_string()
758}
759
760/// Lex a minus bare word: `-%` → `-%` (keep the full string)
761fn lex_minus_bare(lex: &mut logos::Lexer<Token>) -> String {
762    lex.slice().to_string()
763}
764
765/// Lex an absolute path: `/tmp/out` → `/tmp/out`
766fn lex_path(lex: &mut logos::Lexer<Token>) -> String {
767    lex.slice().to_string()
768}
769
impl fmt::Display for Token {
    /// Human-readable rendering used in diagnostics and debugging.
    /// Fixed tokens print as their source text; value-carrying tokens print
    /// as `NAME(value)` so the payload stays visible.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Token::Set => write!(f, "set"),
            Token::Local => write!(f, "local"),
            Token::If => write!(f, "if"),
            Token::Then => write!(f, "then"),
            Token::Else => write!(f, "else"),
            Token::Elif => write!(f, "elif"),
            Token::Fi => write!(f, "fi"),
            Token::For => write!(f, "for"),
            Token::While => write!(f, "while"),
            Token::In => write!(f, "in"),
            Token::Do => write!(f, "do"),
            Token::Done => write!(f, "done"),
            Token::Case => write!(f, "case"),
            Token::Esac => write!(f, "esac"),
            Token::Function => write!(f, "function"),
            Token::Break => write!(f, "break"),
            Token::Continue => write!(f, "continue"),
            Token::Return => write!(f, "return"),
            Token::Exit => write!(f, "exit"),
            Token::True => write!(f, "true"),
            Token::False => write!(f, "false"),
            Token::TypeString => write!(f, "string"),
            Token::TypeInt => write!(f, "int"),
            Token::TypeFloat => write!(f, "float"),
            Token::TypeBool => write!(f, "bool"),
            Token::And => write!(f, "&&"),
            Token::Or => write!(f, "||"),
            Token::EqEq => write!(f, "=="),
            Token::NotEq => write!(f, "!="),
            Token::Match => write!(f, "=~"),
            Token::NotMatch => write!(f, "!~"),
            Token::GtEq => write!(f, ">="),
            Token::LtEq => write!(f, "<="),
            Token::GtGt => write!(f, ">>"),
            Token::StderrToStdout => write!(f, "2>&1"),
            Token::StdoutToStderr => write!(f, "1>&2"),
            Token::StdoutToStderr2 => write!(f, ">&2"),
            Token::Stderr => write!(f, "2>"),
            Token::Both => write!(f, "&>"),
            Token::HereDocStart => write!(f, "<<"),
            Token::DoubleSemi => write!(f, ";;"),
            Token::Eq => write!(f, "="),
            Token::Pipe => write!(f, "|"),
            Token::Amp => write!(f, "&"),
            Token::Gt => write!(f, ">"),
            Token::Lt => write!(f, "<"),
            Token::Semi => write!(f, ";"),
            Token::Colon => write!(f, ":"),
            Token::Comma => write!(f, ","),
            Token::Dot => write!(f, "."),
            // `{{`/`}}` are the format-string escapes for literal braces.
            Token::LBrace => write!(f, "{{"),
            Token::RBrace => write!(f, "}}"),
            Token::LBracket => write!(f, "["),
            Token::RBracket => write!(f, "]"),
            Token::LParen => write!(f, "("),
            Token::RParen => write!(f, ")"),
            Token::Star => write!(f, "*"),
            Token::Bang => write!(f, "!"),
            Token::Question => write!(f, "?"),
            Token::Arithmetic(s) => write!(f, "ARITHMETIC({})", s),
            Token::CmdSubstStart => write!(f, "$("),
            Token::LongFlag(s) => write!(f, "--{}", s),
            Token::ShortFlag(s) => write!(f, "-{}", s),
            Token::PlusFlag(s) => write!(f, "+{}", s),
            Token::DoubleDash => write!(f, "--"),
            Token::PlusBare(s) => write!(f, "{}", s),
            Token::MinusBare(s) => write!(f, "{}", s),
            Token::MinusAlone => write!(f, "-"),
            Token::String(s) => write!(f, "STRING({:?})", s),
            Token::SingleString(s) => write!(f, "SINGLESTRING({:?})", s),
            Token::HereDoc(d) => write!(f, "HEREDOC({:?}, literal={})", d.content, d.literal),
            Token::VarRef(v) => write!(f, "VARREF({})", v),
            Token::SimpleVarRef(v) => write!(f, "SIMPLEVARREF({})", v),
            Token::Positional(n) => write!(f, "${}", n),
            Token::AllArgs => write!(f, "$@"),
            Token::ArgCount => write!(f, "$#"),
            Token::LastExitCode => write!(f, "$?"),
            Token::CurrentPid => write!(f, "$$"),
            Token::VarLength(v) => write!(f, "${{#{}}}", v),
            Token::Int(n) => write!(f, "INT({})", n),
            Token::Float(n) => write!(f, "FLOAT({})", n),
            Token::Path(s) => write!(f, "PATH({})", s),
            Token::Ident(s) => write!(f, "IDENT({})", s),
            Token::Comment => write!(f, "COMMENT"),
            Token::Newline => write!(f, "NEWLINE"),
            Token::LineContinuation => write!(f, "LINECONT"),
            // These variants should never be produced - their callbacks always return errors
            Token::InvalidNumberIdent => write!(f, "INVALID_NUMBER_IDENT"),
            Token::InvalidFloatNoLeading => write!(f, "INVALID_FLOAT_NO_LEADING"),
            Token::InvalidFloatNoTrailing => write!(f, "INVALID_FLOAT_NO_TRAILING"),
        }
    }
}
866
867impl Token {
868    /// Returns true if this token is a keyword.
869    pub fn is_keyword(&self) -> bool {
870        matches!(
871            self,
872            Token::Set
873                | Token::Local
874                | Token::If
875                | Token::Then
876                | Token::Else
877                | Token::Elif
878                | Token::Fi
879                | Token::For
880                | Token::In
881                | Token::Do
882                | Token::Done
883                | Token::Case
884                | Token::Esac
885                | Token::Function
886                | Token::True
887                | Token::False
888        )
889    }
890
891    /// Returns true if this token is a type keyword.
892    pub fn is_type(&self) -> bool {
893        matches!(
894            self,
895            Token::TypeString
896                | Token::TypeInt
897                | Token::TypeFloat
898                | Token::TypeBool
899        )
900    }
901
902    /// Returns true if this token starts a statement.
903    pub fn starts_statement(&self) -> bool {
904        matches!(
905            self,
906            Token::Set | Token::Local | Token::Function | Token::If | Token::For | Token::Case | Token::Ident(_) | Token::LBracket
907        )
908    }
909
910    /// Returns true if this token can appear in an expression.
911    pub fn is_value(&self) -> bool {
912        matches!(
913            self,
914            Token::String(_)
915                | Token::SingleString(_)
916                | Token::HereDoc(_)
917                | Token::Arithmetic(_)
918                | Token::Int(_)
919                | Token::Float(_)
920                | Token::True
921                | Token::False
922                | Token::VarRef(_)
923                | Token::SimpleVarRef(_)
924                | Token::CmdSubstStart
925                | Token::Path(_)
926                | Token::LastExitCode
927                | Token::CurrentPid
928        )
929    }
930}
931
/// Result of preprocessing arithmetic expressions.
///
/// Produced by `preprocess_arithmetic` and consumed by `tokenize`, which
/// swaps each marker back for a `Token::Arithmetic` after lexing.
struct ArithmeticPreprocessResult {
    /// Preprocessed source with markers replacing $((expr)).
    text: String,
    /// Vector of (marker, expression_content) pairs.
    arithmetics: Vec<(String, String)>,
    /// Span replacements for correcting token positions.
    replacements: Vec<SpanReplacement>,
}
941
/// Preprocess arithmetic expressions in source code.
///
/// Finds `$((expr))` patterns and replaces them with markers.
/// Returns the preprocessed source, arithmetic contents, and span replacement info.
///
/// Example:
///   `X=$((1 + 2))`
/// Becomes:
///   `X=__KAISH_ARITH_{id}__`
/// With arithmetics[0] = ("__KAISH_ARITH_{id}__", "1 + 2")
///
/// # Errors
/// Returns `LexerError::NestingTooDeep` if parentheses are nested beyond MAX_PAREN_DEPTH.
fn preprocess_arithmetic(source: &str) -> Result<ArithmeticPreprocessResult, LexerError> {
    let mut result = String::with_capacity(source.len());
    let mut arithmetics: Vec<(String, String)> = Vec::new();
    let mut replacements: Vec<SpanReplacement> = Vec::new();
    // `source_pos` counts BYTES in the original source (matching logos span
    // coordinates), while `i` indexes `chars_vec` by CHARACTER. Both are
    // advanced in lockstep; only byte counts feed the span replacements.
    let mut source_pos: usize = 0;
    let chars_vec: Vec<char> = source.chars().collect();
    let mut i = 0;

    while i < chars_vec.len() {
        let ch = chars_vec[i];

        // Look for $(( (potential arithmetic)
        if ch == '$' && i + 2 < chars_vec.len() && chars_vec[i + 1] == '(' && chars_vec[i + 2] == '(' {
            let arith_start_pos = result.len();
            let original_start = source_pos;

            // Skip $(( (three ASCII chars, so three bytes)
            i += 3;
            source_pos += 3;

            // Collect expression until matching ))
            let mut expr = String::new();
            let mut paren_depth: usize = 0;

            // NOTE(review): if the input ends before a closing `))`, this loop
            // exits at EOF and the partial expression is still emitted as a
            // marker below — presumably the parser rejects it later; confirm.
            while i < chars_vec.len() {
                let c = chars_vec[i];
                match c {
                    '(' => {
                        paren_depth += 1;
                        if paren_depth > MAX_PAREN_DEPTH {
                            return Err(LexerError::NestingTooDeep);
                        }
                        expr.push('(');
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                    ')' => {
                        if paren_depth > 0 {
                            // Closes an inner `(` opened within the expression.
                            paren_depth -= 1;
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        } else if i + 1 < chars_vec.len() && chars_vec[i + 1] == ')' {
                            // Found closing ))
                            i += 2;
                            source_pos += 2;
                            break;
                        } else {
                            // Single ) inside - keep going
                            expr.push(')');
                            i += 1;
                            source_pos += 1;
                        }
                    }
                    _ => {
                        expr.push(c);
                        i += 1;
                        source_pos += c.len_utf8();
                    }
                }
            }

            // Calculate original length in bytes: from $(( through ))
            let original_len = source_pos - original_start;

            // Create a unique marker for this arithmetic (collision-resistant)
            let marker = format!("__KAISH_ARITH_{}__", unique_marker_id());
            let marker_len = marker.len();

            // Record the replacement for span correction
            replacements.push(SpanReplacement {
                preprocessed_pos: arith_start_pos,
                marker_len,
                original_len,
            });

            arithmetics.push((marker.clone(), expr));
            result.push_str(&marker);
        } else {
            result.push(ch);
            i += 1;
            source_pos += ch.len_utf8();
        }
    }

    Ok(ArithmeticPreprocessResult {
        text: result,
        arithmetics,
        replacements,
    })
}
1046
/// Preprocess here-docs in source code.
///
/// Finds `<<WORD` patterns and collects content until the delimiter line.
/// Returns the preprocessed source and a vector of (marker, content, quoted)
/// triples, where `quoted` records whether the delimiter was quoted (a quoted
/// delimiter makes the body literal — no interpolation downstream).
///
/// Example:
///   `cat <<EOF\nhello\nworld\nEOF`
/// Becomes:
///   `cat <<__HEREDOC_0__`
/// With heredocs[0] = ("__HEREDOC_0__", "hello\nworld")
///
/// NOTE(review): this scan has no quote-context tracking, so a `<<` inside a
/// double-quoted string would also be treated as a here-doc opener — confirm
/// whether that case can reach this function.
fn preprocess_heredocs(source: &str) -> (String, Vec<(String, String, bool)>) {
    let mut result = String::with_capacity(source.len());
    let mut heredocs: Vec<(String, String, bool)> = Vec::new();
    let mut chars = source.chars().peekable();

    while let Some(ch) = chars.next() {
        // Look for << (potential here-doc)
        if ch == '<' && chars.peek() == Some(&'<') {
            chars.next(); // consume second <

            // Check for optional - (strip leading tabs)
            let strip_tabs = chars.peek() == Some(&'-');
            if strip_tabs {
                chars.next();
            }

            // Skip whitespace before delimiter
            while let Some(&c) = chars.peek() {
                if c == ' ' || c == '\t' {
                    chars.next();
                } else {
                    break;
                }
            }

            // Collect the delimiter word. A leading quote (single or double)
            // marks the here-doc as literal; the quotes themselves are dropped.
            let mut delimiter = String::new();
            let quoted = chars.peek() == Some(&'\'') || chars.peek() == Some(&'"');
            let quote_char = if quoted { chars.next() } else { None };

            while let Some(&c) = chars.peek() {
                if quoted {
                    // Quoted delimiters end only at the matching quote.
                    if Some(c) == quote_char {
                        chars.next(); // consume closing quote
                        break;
                    }
                } else if c.is_whitespace() || c == '\n' || c == '\r' {
                    break;
                }
                if let Some(ch) = chars.next() {
                    delimiter.push(ch);
                }
            }

            if delimiter.is_empty() {
                // Not a valid here-doc, output << literally
                result.push_str("<<");
                if strip_tabs {
                    result.push('-');
                }
                continue;
            }

            // Buffer text after delimiter word (e.g., " | jq" in "cat <<EOF | jq")
            // This must be emitted AFTER the heredoc marker, not before.
            let mut after_delimiter = String::new();
            while let Some(&c) = chars.peek() {
                if c == '\n' {
                    chars.next();
                    break;
                } else if c == '\r' {
                    chars.next();
                    if chars.peek() == Some(&'\n') {
                        chars.next();
                    }
                    break;
                }
                if let Some(ch) = chars.next() {
                    after_delimiter.push(ch);
                }
            }

            // Collect content until delimiter on its own line.
            // With `<<-`, leading tabs are ignored when matching the delimiter.
            let mut content = String::new();
            let mut current_line = String::new();

            loop {
                match chars.next() {
                    Some('\n') => {
                        // Check if this line is the delimiter
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found end of here-doc
                            break;
                        }
                        // Add line to content (including empty lines)
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some('\r') => {
                        // Handle \r\n (normalized to \n in the collected content)
                        if chars.peek() == Some(&'\n') {
                            chars.next();
                        }
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            break;
                        }
                        content.push_str(&current_line);
                        content.push('\n');
                        current_line.clear();
                    }
                    Some(c) => {
                        current_line.push(c);
                    }
                    None => {
                        // EOF - check if current line is the delimiter
                        let trimmed = if strip_tabs {
                            current_line.trim_start_matches('\t')
                        } else {
                            &current_line
                        };
                        if trimmed == delimiter {
                            // Found delimiter at EOF
                            break;
                        }
                        // Not a delimiter - include remaining content
                        if !current_line.is_empty() {
                            content.push_str(&current_line);
                        }
                        break;
                    }
                }
            }

            // Remove trailing newline from content (we'll add it when needed)
            let content = content.trim_end_matches('\n').to_string();

            // Create a unique marker for this here-doc (collision-resistant)
            let marker = format!("__KAISH_HEREDOC_{}__", unique_marker_id());
            heredocs.push((marker.clone(), content, quoted));

            // Output <<marker first, then any text that followed the delimiter
            // (e.g., " | jq") so the heredoc attaches to the correct command.
            result.push_str("<<");
            result.push_str(&marker);
            result.push_str(&after_delimiter);
            result.push('\n');
        } else {
            result.push(ch);
        }
    }

    (result, heredocs)
}
1211
/// Tokenize source code into a vector of spanned tokens.
///
/// Skips whitespace and comments (unless you need them for formatting).
/// Returns errors with their positions for nice error messages.
///
/// Handles:
/// - Arithmetic: `$((expr))` becomes `Arithmetic("expr")`
/// - Here-docs: `<<EOF\nhello\nEOF` becomes `HereDocStart` + `HereDoc("hello")`
///
/// Pipeline: preprocess arithmetic and here-docs into unique markers, lex the
/// preprocessed text, correct spans back to original coordinates, then swap
/// each marker token for its real `Arithmetic`/`HereDoc` token.
pub fn tokenize(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
    // Preprocess arithmetic first (before heredocs because heredoc content might contain $((
    // A preprocessing failure has no precise location, so it spans the whole source.
    let arith_result = preprocess_arithmetic(source)
        .map_err(|e| vec![Spanned::new(e, 0..source.len())])?;

    // Then preprocess here-docs (heredoc span tracking is not implemented for simplicity)
    let (preprocessed, heredocs) = preprocess_heredocs(&arith_result.text);

    // Combine replacements for span correction (arithmetic only for now)
    let span_replacements = arith_result.replacements;

    let lexer = Token::lexer(&preprocessed);
    let mut tokens = Vec::new();
    let mut errors = Vec::new();

    for (result, span) in lexer.spanned() {
        // Correct the span from preprocessed coordinates to original coordinates
        let corrected_span = correct_span(span, &span_replacements);
        match result {
            Ok(token) => {
                // Skip comments and line continuations - they're not needed for parsing
                if !matches!(token, Token::Comment | Token::LineContinuation) {
                    tokens.push(Spanned::new(token, corrected_span));
                }
            }
            Err(err) => {
                errors.push(Spanned::new(err, corrected_span));
            }
        }
    }

    if !errors.is_empty() {
        return Err(errors);
    }

    // Post-process: replace markers with actual token content
    let mut final_tokens = Vec::with_capacity(tokens.len());
    let mut i = 0;

    while i < tokens.len() {
        // Check for arithmetic marker (unique format: __KAISH_ARITH_{id}__)
        // The marker lexes as a plain identifier; look it up and substitute.
        if let Token::Ident(ref name) = tokens[i].token
            && name.starts_with("__KAISH_ARITH_") && name.ends_with("__")
                && let Some((_, expr)) = arith_result.arithmetics.iter().find(|(marker, _)| marker == name) {
                    final_tokens.push(Spanned::new(Token::Arithmetic(expr.clone()), tokens[i].span.clone()));
                    i += 1;
                    continue;
                }

        // Check for heredoc (unique format: __KAISH_HEREDOC_{id}__)
        if matches!(tokens[i].token, Token::HereDocStart) {
            // Check if next token is a heredoc marker
            if i + 1 < tokens.len()
                && let Token::Ident(ref name) = tokens[i + 1].token
                    && name.starts_with("__KAISH_HEREDOC_") && name.ends_with("__") {
                        // Find the corresponding content
                        if let Some((_, content, literal)) = heredocs.iter().find(|(marker, _, _)| marker == name) {
                            final_tokens.push(Spanned::new(Token::HereDocStart, tokens[i].span.clone()));
                            final_tokens.push(Spanned::new(Token::HereDoc(HereDocData { content: content.clone(), literal: *literal }), tokens[i + 1].span.clone()));
                            i += 2;
                            continue;
                        }
                    }
        }

        // Check for arithmetic markers inside string content
        // (an arithmetic expression inside a double-quoted string lexes as
        // part of the String token, so the marker must be rewritten in place)
        let token = if let Token::String(ref s) = tokens[i].token {
            // Check if string contains any arithmetic markers
            let mut new_content = s.clone();
            for (marker, expr) in &arith_result.arithmetics {
                if new_content.contains(marker) {
                    // Replace marker with the special format that parse_interpolated_string can detect
                    // Use ${__ARITH:expr__} format so it gets parsed as StringPart::Arithmetic
                    new_content = new_content.replace(marker, &format!("${{__ARITH:{}__}}", expr));
                }
            }
            if new_content != *s {
                Spanned::new(Token::String(new_content), tokens[i].span.clone())
            } else {
                tokens[i].clone()
            }
        } else {
            tokens[i].clone()
        };
        final_tokens.push(token);
        i += 1;
    }

    Ok(final_tokens)
}
1310
1311/// Tokenize source code, preserving comments.
1312///
1313/// Useful for pretty-printing or formatting tools that need to preserve comments.
1314pub fn tokenize_with_comments(source: &str) -> Result<Vec<Spanned<Token>>, Vec<Spanned<LexerError>>> {
1315    let lexer = Token::lexer(source);
1316    let mut tokens = Vec::new();
1317    let mut errors = Vec::new();
1318
1319    for (result, span) in lexer.spanned() {
1320        match result {
1321            Ok(token) => {
1322                tokens.push(Spanned::new(token, span));
1323            }
1324            Err(err) => {
1325                errors.push(Spanned::new(err, span));
1326            }
1327        }
1328    }
1329
1330    if errors.is_empty() {
1331        Ok(tokens)
1332    } else {
1333        Err(errors)
1334    }
1335}
1336
1337/// Extract the string content from a string token (removes quotes, processes escapes).
1338pub fn parse_string_literal(source: &str) -> Result<String, LexerError> {
1339    // Remove surrounding quotes
1340    if source.len() < 2 || !source.starts_with('"') || !source.ends_with('"') {
1341        return Err(LexerError::UnterminatedString);
1342    }
1343
1344    let inner = &source[1..source.len() - 1];
1345    let mut result = String::with_capacity(inner.len());
1346    let mut chars = inner.chars().peekable();
1347
1348    while let Some(ch) = chars.next() {
1349        if ch == '\\' {
1350            match chars.next() {
1351                Some('n') => result.push('\n'),
1352                Some('t') => result.push('\t'),
1353                Some('r') => result.push('\r'),
1354                Some('\\') => result.push('\\'),
1355                Some('"') => result.push('"'),
1356                // Use a unique marker for escaped dollar that won't be re-interpreted
1357                // parse_interpolated_string will convert this back to $
1358                Some('$') => result.push_str("__KAISH_ESCAPED_DOLLAR__"),
1359                Some('u') => {
1360                    // Unicode escape: \uXXXX
1361                    let mut hex = String::with_capacity(4);
1362                    for _ in 0..4 {
1363                        match chars.next() {
1364                            Some(h) if h.is_ascii_hexdigit() => hex.push(h),
1365                            _ => return Err(LexerError::InvalidEscape),
1366                        }
1367                    }
1368                    let codepoint = u32::from_str_radix(&hex, 16)
1369                        .map_err(|_| LexerError::InvalidEscape)?;
1370                    let ch = char::from_u32(codepoint)
1371                        .ok_or(LexerError::InvalidEscape)?;
1372                    result.push(ch);
1373                }
1374                // Unknown escapes: preserve the backslash (for regex patterns like `\.`)
1375                Some(next) => {
1376                    result.push('\\');
1377                    result.push(next);
1378                }
1379                None => return Err(LexerError::InvalidEscape),
1380            }
1381        } else {
1382            result.push(ch);
1383        }
1384    }
1385
1386    Ok(result)
1387}
1388
1389/// Parse a variable reference, extracting the path segments.
1390/// Input: "${VAR.field[0].nested}" → ["VAR", "field", "[0]", "nested"]
1391pub fn parse_var_ref(source: &str) -> Result<Vec<String>, LexerError> {
1392    // Remove ${ and }
1393    if source.len() < 4 || !source.starts_with("${") || !source.ends_with('}') {
1394        return Err(LexerError::UnterminatedVarRef);
1395    }
1396
1397    let inner = &source[2..source.len() - 1];
1398
1399    // Special case: $? (last result)
1400    if inner == "?" {
1401        return Ok(vec!["?".to_string()]);
1402    }
1403
1404    let mut segments = Vec::new();
1405    let mut current = String::new();
1406    let mut chars = inner.chars().peekable();
1407
1408    while let Some(ch) = chars.next() {
1409        match ch {
1410            '.' => {
1411                if !current.is_empty() {
1412                    segments.push(current.clone());
1413                    current.clear();
1414                }
1415            }
1416            '[' => {
1417                if !current.is_empty() {
1418                    segments.push(current.clone());
1419                    current.clear();
1420                }
1421                // Collect the index
1422                let mut index = String::from("[");
1423                while let Some(&c) = chars.peek() {
1424                    if let Some(c) = chars.next() {
1425                        index.push(c);
1426                    }
1427                    if c == ']' {
1428                        break;
1429                    }
1430                }
1431                segments.push(index);
1432            }
1433            _ => {
1434                current.push(ch);
1435            }
1436        }
1437    }
1438
1439    if !current.is_empty() {
1440        segments.push(current);
1441    }
1442
1443    Ok(segments)
1444}
1445
1446/// Parse an integer literal.
1447pub fn parse_int(source: &str) -> Result<i64, LexerError> {
1448    source.parse().map_err(|_| LexerError::InvalidNumber)
1449}
1450
1451/// Parse a float literal.
1452pub fn parse_float(source: &str) -> Result<f64, LexerError> {
1453    source.parse().map_err(|_| LexerError::InvalidNumber)
1454}
1455
1456#[cfg(test)]
1457mod tests {
1458    use super::*;
1459
1460    fn lex(source: &str) -> Vec<Token> {
1461        tokenize(source)
1462            .expect("lexer should succeed")
1463            .into_iter()
1464            .map(|s| s.token)
1465            .collect()
1466    }
1467
1468    // ═══════════════════════════════════════════════════════════════════
1469    // Keyword tests
1470    // ═══════════════════════════════════════════════════════════════════
1471
    #[test]
    fn keywords() {
        // Each reserved word must lex to its dedicated token, never an Ident.
        assert_eq!(lex("set"), vec![Token::Set]);
        assert_eq!(lex("if"), vec![Token::If]);
        assert_eq!(lex("then"), vec![Token::Then]);
        assert_eq!(lex("else"), vec![Token::Else]);
        assert_eq!(lex("elif"), vec![Token::Elif]);
        assert_eq!(lex("fi"), vec![Token::Fi]);
        assert_eq!(lex("for"), vec![Token::For]);
        assert_eq!(lex("in"), vec![Token::In]);
        assert_eq!(lex("do"), vec![Token::Do]);
        assert_eq!(lex("done"), vec![Token::Done]);
        assert_eq!(lex("case"), vec![Token::Case]);
        assert_eq!(lex("esac"), vec![Token::Esac]);
        assert_eq!(lex("function"), vec![Token::Function]);
        assert_eq!(lex("true"), vec![Token::True]);
        assert_eq!(lex("false"), vec![Token::False]);
    }

    #[test]
    fn double_semicolon() {
        // `;;` must lex as one DoubleSemi, not two Semi tokens.
        assert_eq!(lex(";;"), vec![Token::DoubleSemi]);
        // In case pattern context
        assert_eq!(lex("echo \"hi\";;"), vec![
            Token::Ident("echo".to_string()),
            Token::String("hi".to_string()),
            Token::DoubleSemi,
        ]);
    }

    #[test]
    fn type_keywords() {
        // Built-in type names get dedicated tokens, separate from keywords.
        assert_eq!(lex("string"), vec![Token::TypeString]);
        assert_eq!(lex("int"), vec![Token::TypeInt]);
        assert_eq!(lex("float"), vec![Token::TypeFloat]);
        assert_eq!(lex("bool"), vec![Token::TypeBool]);
    }
1509
1510    // ═══════════════════════════════════════════════════════════════════
1511    // Operator tests
1512    // ═══════════════════════════════════════════════════════════════════
1513
    #[test]
    fn single_char_operators() {
        assert_eq!(lex("="), vec![Token::Eq]);
        assert_eq!(lex("|"), vec![Token::Pipe]);
        assert_eq!(lex("&"), vec![Token::Amp]);
        assert_eq!(lex(">"), vec![Token::Gt]);
        assert_eq!(lex("<"), vec![Token::Lt]);
        assert_eq!(lex(";"), vec![Token::Semi]);
        assert_eq!(lex(":"), vec![Token::Colon]);
        assert_eq!(lex(","), vec![Token::Comma]);
        assert_eq!(lex("."), vec![Token::Dot]);
    }

    #[test]
    fn multi_char_operators() {
        // Multi-char operators must win over their single-char prefixes
        // (e.g. `&&` is And, not two Amp tokens).
        assert_eq!(lex("&&"), vec![Token::And]);
        assert_eq!(lex("||"), vec![Token::Or]);
        assert_eq!(lex("=="), vec![Token::EqEq]);
        assert_eq!(lex("!="), vec![Token::NotEq]);
        assert_eq!(lex("=~"), vec![Token::Match]);
        assert_eq!(lex("!~"), vec![Token::NotMatch]);
        assert_eq!(lex(">="), vec![Token::GtEq]);
        assert_eq!(lex("<="), vec![Token::LtEq]);
        assert_eq!(lex(">>"), vec![Token::GtGt]);
        assert_eq!(lex("2>"), vec![Token::Stderr]);
        assert_eq!(lex("&>"), vec![Token::Both]);
    }

    #[test]
    fn brackets() {
        assert_eq!(lex("{"), vec![Token::LBrace]);
        assert_eq!(lex("}"), vec![Token::RBrace]);
        assert_eq!(lex("["), vec![Token::LBracket]);
        assert_eq!(lex("]"), vec![Token::RBracket]);
        assert_eq!(lex("("), vec![Token::LParen]);
        assert_eq!(lex(")"), vec![Token::RParen]);
    }
1551
1552    // ═══════════════════════════════════════════════════════════════════
1553    // Literal tests
1554    // ═══════════════════════════════════════════════════════════════════
1555
    #[test]
    fn integers() {
        // A leading minus is part of the literal, not a separate operator.
        assert_eq!(lex("0"), vec![Token::Int(0)]);
        assert_eq!(lex("42"), vec![Token::Int(42)]);
        assert_eq!(lex("-1"), vec![Token::Int(-1)]);
        assert_eq!(lex("999999"), vec![Token::Int(999999)]);
    }

    #[test]
    fn floats() {
        assert_eq!(lex("3.14"), vec![Token::Float(3.14)]);
        assert_eq!(lex("-0.5"), vec![Token::Float(-0.5)]);
        assert_eq!(lex("123.456"), vec![Token::Float(123.456)]);
    }

    #[test]
    fn strings() {
        // The String token carries the unquoted, escape-processed content.
        assert_eq!(lex(r#""hello""#), vec![Token::String("hello".to_string())]);
        assert_eq!(lex(r#""hello world""#), vec![Token::String("hello world".to_string())]);
        assert_eq!(lex(r#""""#), vec![Token::String("".to_string())]); // empty string
        assert_eq!(lex(r#""with \"quotes\"""#), vec![Token::String("with \"quotes\"".to_string())]);
        assert_eq!(lex(r#""with\nnewline""#), vec![Token::String("with\nnewline".to_string())]);
    }

    #[test]
    fn var_refs() {
        // VarRef keeps the raw `${...}` text; path parsing happens later
        // in parse_var_ref.
        assert_eq!(lex("${X}"), vec![Token::VarRef("${X}".to_string())]);
        assert_eq!(lex("${VAR}"), vec![Token::VarRef("${VAR}".to_string())]);
        assert_eq!(lex("${VAR.field}"), vec![Token::VarRef("${VAR.field}".to_string())]);
        assert_eq!(lex("${VAR[0]}"), vec![Token::VarRef("${VAR[0]}".to_string())]);
        assert_eq!(lex("${?.ok}"), vec![Token::VarRef("${?.ok}".to_string())]);
    }
1588
1589    // ═══════════════════════════════════════════════════════════════════
1590    // Identifier tests
1591    // ═══════════════════════════════════════════════════════════════════
1592
    #[test]
    fn identifiers() {
        // Identifiers allow underscores, hyphens, and trailing digits.
        assert_eq!(lex("foo"), vec![Token::Ident("foo".to_string())]);
        assert_eq!(lex("foo_bar"), vec![Token::Ident("foo_bar".to_string())]);
        assert_eq!(lex("foo-bar"), vec![Token::Ident("foo-bar".to_string())]);
        assert_eq!(lex("_private"), vec![Token::Ident("_private".to_string())]);
        assert_eq!(lex("cmd123"), vec![Token::Ident("cmd123".to_string())]);
    }

    #[test]
    fn keyword_prefix_identifiers() {
        // Identifiers that start with keywords but aren't keywords
        // (keyword matching must not be a prefix match).
        assert_eq!(lex("setup"), vec![Token::Ident("setup".to_string())]);
        assert_eq!(lex("tools"), vec![Token::Ident("tools".to_string())]);
        assert_eq!(lex("iffy"), vec![Token::Ident("iffy".to_string())]);
        assert_eq!(lex("forked"), vec![Token::Ident("forked".to_string())]);
        assert_eq!(lex("done-with-it"), vec![Token::Ident("done-with-it".to_string())]);
    }
1611
1612    // ═══════════════════════════════════════════════════════════════════
1613    // Statement tests
1614    // ═══════════════════════════════════════════════════════════════════
1615
    #[test]
    fn assignment() {
        assert_eq!(
            lex("set X = 5"),
            vec![Token::Set, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
        );
    }

    #[test]
    fn command_simple() {
        // A bare command name is just an Ident at the lexer level.
        assert_eq!(lex("echo"), vec![Token::Ident("echo".to_string())]);
        assert_eq!(
            lex(r#"echo "hello""#),
            vec![Token::Ident("echo".to_string()), Token::String("hello".to_string())]
        );
    }

    #[test]
    fn command_with_args() {
        assert_eq!(
            lex("cmd arg1 arg2"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("arg1".to_string()), Token::Ident("arg2".to_string())]
        );
    }

    #[test]
    fn command_with_named_args() {
        // `key=value` lexes as three tokens; the parser pairs them up.
        assert_eq!(
            lex("cmd key=value"),
            vec![Token::Ident("cmd".to_string()), Token::Ident("key".to_string()), Token::Eq, Token::Ident("value".to_string())]
        );
    }

    #[test]
    fn pipeline() {
        assert_eq!(
            lex("a | b | c"),
            vec![Token::Ident("a".to_string()), Token::Pipe, Token::Ident("b".to_string()), Token::Pipe, Token::Ident("c".to_string())]
        );
    }

    #[test]
    fn if_statement() {
        assert_eq!(
            lex("if true; then echo; fi"),
            vec![
                Token::If,
                Token::True,
                Token::Semi,
                Token::Then,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Fi
            ]
        );
    }

    #[test]
    fn for_loop() {
        assert_eq!(
            lex("for X in items; do echo; done"),
            vec![
                Token::For,
                Token::Ident("X".to_string()),
                Token::In,
                Token::Ident("items".to_string()),
                Token::Semi,
                Token::Do,
                Token::Ident("echo".to_string()),
                Token::Semi,
                Token::Done
            ]
        );
    }
1690
1691    // ═══════════════════════════════════════════════════════════════════
1692    // Whitespace and newlines
1693    // ═══════════════════════════════════════════════════════════════════
1694
    #[test]
    fn whitespace_ignored() {
        // Horizontal whitespace never affects the token stream.
        assert_eq!(lex("   set   X   =   5   "), lex("set X = 5"));
    }

    #[test]
    fn newlines_preserved() {
        // Unlike spaces, newlines are significant (statement separators).
        let tokens = lex("a\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
        );
    }

    #[test]
    fn multiple_newlines() {
        // Consecutive newlines are not collapsed; each is its own token.
        let tokens = lex("a\n\n\nb");
        assert_eq!(
            tokens,
            vec![Token::Ident("a".to_string()), Token::Newline, Token::Newline, Token::Newline, Token::Ident("b".to_string())]
        );
    }
1717
1718    // ═══════════════════════════════════════════════════════════════════
1719    // Comments
1720    // ═══════════════════════════════════════════════════════════════════
1721
1722    #[test]
1723    fn comments_skipped() {
1724        assert_eq!(lex("# comment"), vec![]);
1725        assert_eq!(lex("a # comment"), vec![Token::Ident("a".to_string())]);
1726        assert_eq!(
1727            lex("a # comment\nb"),
1728            vec![Token::Ident("a".to_string()), Token::Newline, Token::Ident("b".to_string())]
1729        );
1730    }
1731
1732    #[test]
1733    fn comments_preserved_when_requested() {
1734        let tokens = tokenize_with_comments("a # comment")
1735            .expect("should succeed")
1736            .into_iter()
1737            .map(|s| s.token)
1738            .collect::<Vec<_>>();
1739        assert_eq!(tokens, vec![Token::Ident("a".to_string()), Token::Comment]);
1740    }
1741
1742    // ═══════════════════════════════════════════════════════════════════
1743    // String parsing
1744    // ═══════════════════════════════════════════════════════════════════
1745
1746    #[test]
1747    fn parse_simple_string() {
1748        assert_eq!(parse_string_literal(r#""hello""#).expect("ok"), "hello");
1749    }
1750
1751    #[test]
1752    fn parse_string_with_escapes() {
1753        assert_eq!(
1754            parse_string_literal(r#""hello\nworld""#).expect("ok"),
1755            "hello\nworld"
1756        );
1757        assert_eq!(
1758            parse_string_literal(r#""tab\there""#).expect("ok"),
1759            "tab\there"
1760        );
1761        assert_eq!(
1762            parse_string_literal(r#""quote\"here""#).expect("ok"),
1763            "quote\"here"
1764        );
1765    }
1766
1767    #[test]
1768    fn parse_string_with_unicode() {
1769        assert_eq!(
1770            parse_string_literal(r#""emoji \u2764""#).expect("ok"),
1771            "emoji ❤"
1772        );
1773    }
1774
1775    #[test]
1776    fn parse_string_with_escaped_dollar() {
1777        // \$ produces a marker that parse_interpolated_string will convert to $
1778        // The marker __KAISH_ESCAPED_DOLLAR__ is used to prevent re-interpretation
1779        assert_eq!(
1780            parse_string_literal(r#""\$VAR""#).expect("ok"),
1781            "__KAISH_ESCAPED_DOLLAR__VAR"
1782        );
1783        assert_eq!(
1784            parse_string_literal(r#""cost: \$100""#).expect("ok"),
1785            "cost: __KAISH_ESCAPED_DOLLAR__100"
1786        );
1787    }
1788
1789    // ═══════════════════════════════════════════════════════════════════
1790    // Variable reference parsing
1791    // ═══════════════════════════════════════════════════════════════════
1792
1793    #[test]
1794    fn parse_simple_var() {
1795        assert_eq!(
1796            parse_var_ref("${X}").expect("ok"),
1797            vec!["X"]
1798        );
1799    }
1800
1801    #[test]
1802    fn parse_var_with_field() {
1803        assert_eq!(
1804            parse_var_ref("${VAR.field}").expect("ok"),
1805            vec!["VAR", "field"]
1806        );
1807    }
1808
1809    #[test]
1810    fn parse_var_with_index() {
1811        assert_eq!(
1812            parse_var_ref("${VAR[0]}").expect("ok"),
1813            vec!["VAR", "[0]"]
1814        );
1815    }
1816
1817    #[test]
1818    fn parse_var_nested() {
1819        assert_eq!(
1820            parse_var_ref("${VAR.field[0].nested}").expect("ok"),
1821            vec!["VAR", "field", "[0]", "nested"]
1822        );
1823    }
1824
1825    #[test]
1826    fn parse_last_result() {
1827        assert_eq!(
1828            parse_var_ref("${?}").expect("ok"),
1829            vec!["?"]
1830        );
1831        assert_eq!(
1832            parse_var_ref("${?.ok}").expect("ok"),
1833            vec!["?", "ok"]
1834        );
1835    }
1836
1837    // ═══════════════════════════════════════════════════════════════════
1838    // Number parsing
1839    // ═══════════════════════════════════════════════════════════════════
1840
1841    #[test]
1842    fn parse_integers() {
1843        assert_eq!(parse_int("0").expect("ok"), 0);
1844        assert_eq!(parse_int("42").expect("ok"), 42);
1845        assert_eq!(parse_int("-1").expect("ok"), -1);
1846    }
1847
1848    #[test]
1849    fn parse_floats() {
1850        assert!((parse_float("3.14").expect("ok") - 3.14).abs() < f64::EPSILON);
1851        assert!((parse_float("-0.5").expect("ok") - (-0.5)).abs() < f64::EPSILON);
1852    }
1853
1854    // ═══════════════════════════════════════════════════════════════════
1855    // Edge cases and errors
1856    // ═══════════════════════════════════════════════════════════════════
1857
1858    #[test]
1859    fn empty_input() {
1860        assert_eq!(lex(""), vec![]);
1861    }
1862
1863    #[test]
1864    fn only_whitespace() {
1865        assert_eq!(lex("   \t\t   "), vec![]);
1866    }
1867
1868    #[test]
1869    fn json_array() {
1870        assert_eq!(
1871            lex(r#"[1, 2, 3]"#),
1872            vec![
1873                Token::LBracket,
1874                Token::Int(1),
1875                Token::Comma,
1876                Token::Int(2),
1877                Token::Comma,
1878                Token::Int(3),
1879                Token::RBracket
1880            ]
1881        );
1882    }
1883
1884    #[test]
1885    fn json_object() {
1886        assert_eq!(
1887            lex(r#"{"key": "value"}"#),
1888            vec![
1889                Token::LBrace,
1890                Token::String("key".to_string()),
1891                Token::Colon,
1892                Token::String("value".to_string()),
1893                Token::RBrace
1894            ]
1895        );
1896    }
1897
1898    #[test]
1899    fn redirect_operators() {
1900        assert_eq!(
1901            lex("cmd > file"),
1902            vec![Token::Ident("cmd".to_string()), Token::Gt, Token::Ident("file".to_string())]
1903        );
1904        assert_eq!(
1905            lex("cmd >> file"),
1906            vec![Token::Ident("cmd".to_string()), Token::GtGt, Token::Ident("file".to_string())]
1907        );
1908        assert_eq!(
1909            lex("cmd 2> err"),
1910            vec![Token::Ident("cmd".to_string()), Token::Stderr, Token::Ident("err".to_string())]
1911        );
1912        assert_eq!(
1913            lex("cmd &> all"),
1914            vec![Token::Ident("cmd".to_string()), Token::Both, Token::Ident("all".to_string())]
1915        );
1916    }
1917
1918    #[test]
1919    fn background_job() {
1920        assert_eq!(
1921            lex("cmd &"),
1922            vec![Token::Ident("cmd".to_string()), Token::Amp]
1923        );
1924    }
1925
1926    #[test]
1927    fn command_substitution() {
1928        assert_eq!(
1929            lex("$(cmd)"),
1930            vec![Token::CmdSubstStart, Token::Ident("cmd".to_string()), Token::RParen]
1931        );
1932        assert_eq!(
1933            lex("$(cmd arg)"),
1934            vec![
1935                Token::CmdSubstStart,
1936                Token::Ident("cmd".to_string()),
1937                Token::Ident("arg".to_string()),
1938                Token::RParen
1939            ]
1940        );
1941        assert_eq!(
1942            lex("$(a | b)"),
1943            vec![
1944                Token::CmdSubstStart,
1945                Token::Ident("a".to_string()),
1946                Token::Pipe,
1947                Token::Ident("b".to_string()),
1948                Token::RParen
1949            ]
1950        );
1951    }
1952
1953    #[test]
1954    fn complex_pipeline() {
1955        assert_eq!(
1956            lex(r#"cat file | grep pattern="foo" | head count=10"#),
1957            vec![
1958                Token::Ident("cat".to_string()),
1959                Token::Ident("file".to_string()),
1960                Token::Pipe,
1961                Token::Ident("grep".to_string()),
1962                Token::Ident("pattern".to_string()),
1963                Token::Eq,
1964                Token::String("foo".to_string()),
1965                Token::Pipe,
1966                Token::Ident("head".to_string()),
1967                Token::Ident("count".to_string()),
1968                Token::Eq,
1969                Token::Int(10),
1970            ]
1971        );
1972    }
1973
1974    // ═══════════════════════════════════════════════════════════════════
1975    // Flag tests
1976    // ═══════════════════════════════════════════════════════════════════
1977
1978    #[test]
1979    fn short_flag() {
1980        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
1981        assert_eq!(lex("-a"), vec![Token::ShortFlag("a".to_string())]);
1982        assert_eq!(lex("-v"), vec![Token::ShortFlag("v".to_string())]);
1983    }
1984
1985    #[test]
1986    fn short_flag_combined() {
1987        // Combined short flags like -la
1988        assert_eq!(lex("-la"), vec![Token::ShortFlag("la".to_string())]);
1989        assert_eq!(lex("-vvv"), vec![Token::ShortFlag("vvv".to_string())]);
1990    }
1991
1992    #[test]
1993    fn long_flag() {
1994        assert_eq!(lex("--force"), vec![Token::LongFlag("force".to_string())]);
1995        assert_eq!(lex("--verbose"), vec![Token::LongFlag("verbose".to_string())]);
1996        assert_eq!(lex("--foo-bar"), vec![Token::LongFlag("foo-bar".to_string())]);
1997    }
1998
1999    #[test]
2000    fn double_dash() {
2001        // -- alone marks end of flags
2002        assert_eq!(lex("--"), vec![Token::DoubleDash]);
2003    }
2004
2005    #[test]
2006    fn flags_vs_negative_numbers() {
2007        // -123 should be a negative integer, not a flag
2008        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2009        // -l should be a flag
2010        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2011        // -1a is ambiguous - should be Int(-1) then Ident(a)
2012        // Actually the regex -[a-zA-Z] won't match -1a since 1 isn't a letter
2013        assert_eq!(
2014            lex("-1 a"),
2015            vec![Token::Int(-1), Token::Ident("a".to_string())]
2016        );
2017    }
2018
2019    #[test]
2020    fn command_with_flags() {
2021        assert_eq!(
2022            lex("ls -l"),
2023            vec![
2024                Token::Ident("ls".to_string()),
2025                Token::ShortFlag("l".to_string()),
2026            ]
2027        );
2028        assert_eq!(
2029            lex("git commit -m"),
2030            vec![
2031                Token::Ident("git".to_string()),
2032                Token::Ident("commit".to_string()),
2033                Token::ShortFlag("m".to_string()),
2034            ]
2035        );
2036        assert_eq!(
2037            lex("git push --force"),
2038            vec![
2039                Token::Ident("git".to_string()),
2040                Token::Ident("push".to_string()),
2041                Token::LongFlag("force".to_string()),
2042            ]
2043        );
2044    }
2045
2046    #[test]
2047    fn flag_with_value() {
2048        assert_eq!(
2049            lex(r#"git commit -m "message""#),
2050            vec![
2051                Token::Ident("git".to_string()),
2052                Token::Ident("commit".to_string()),
2053                Token::ShortFlag("m".to_string()),
2054                Token::String("message".to_string()),
2055            ]
2056        );
2057        assert_eq!(
2058            lex(r#"--message="hello""#),
2059            vec![
2060                Token::LongFlag("message".to_string()),
2061                Token::Eq,
2062                Token::String("hello".to_string()),
2063            ]
2064        );
2065    }
2066
2067    #[test]
2068    fn end_of_flags_marker() {
2069        assert_eq!(
2070            lex("git checkout -- file"),
2071            vec![
2072                Token::Ident("git".to_string()),
2073                Token::Ident("checkout".to_string()),
2074                Token::DoubleDash,
2075                Token::Ident("file".to_string()),
2076            ]
2077        );
2078    }
2079
2080    // ═══════════════════════════════════════════════════════════════════
2081    // Bash compatibility tokens
2082    // ═══════════════════════════════════════════════════════════════════
2083
2084    #[test]
2085    fn local_keyword() {
2086        assert_eq!(lex("local"), vec![Token::Local]);
2087        assert_eq!(
2088            lex("local X = 5"),
2089            vec![Token::Local, Token::Ident("X".to_string()), Token::Eq, Token::Int(5)]
2090        );
2091    }
2092
2093    #[test]
2094    fn simple_var_ref() {
2095        assert_eq!(lex("$X"), vec![Token::SimpleVarRef("X".to_string())]);
2096        assert_eq!(lex("$foo"), vec![Token::SimpleVarRef("foo".to_string())]);
2097        assert_eq!(lex("$foo_bar"), vec![Token::SimpleVarRef("foo_bar".to_string())]);
2098        assert_eq!(lex("$_private"), vec![Token::SimpleVarRef("_private".to_string())]);
2099    }
2100
2101    #[test]
2102    fn simple_var_ref_in_command() {
2103        assert_eq!(
2104            lex("echo $NAME"),
2105            vec![Token::Ident("echo".to_string()), Token::SimpleVarRef("NAME".to_string())]
2106        );
2107    }
2108
2109    #[test]
2110    fn single_quoted_strings() {
2111        assert_eq!(lex("'hello'"), vec![Token::SingleString("hello".to_string())]);
2112        assert_eq!(lex("'hello world'"), vec![Token::SingleString("hello world".to_string())]);
2113        assert_eq!(lex("''"), vec![Token::SingleString("".to_string())]);
2114        // Single quotes don't process escapes or variables
2115        assert_eq!(lex(r"'no $VAR here'"), vec![Token::SingleString("no $VAR here".to_string())]);
2116        assert_eq!(lex(r"'backslash \n stays'"), vec![Token::SingleString(r"backslash \n stays".to_string())]);
2117    }
2118
2119    #[test]
2120    fn test_brackets() {
2121        // [[ and ]] are now two separate bracket tokens to avoid conflicts with nested arrays
2122        assert_eq!(lex("[["), vec![Token::LBracket, Token::LBracket]);
2123        assert_eq!(lex("]]"), vec![Token::RBracket, Token::RBracket]);
2124        assert_eq!(
2125            lex("[[ -f file ]]"),
2126            vec![
2127                Token::LBracket,
2128                Token::LBracket,
2129                Token::ShortFlag("f".to_string()),
2130                Token::Ident("file".to_string()),
2131                Token::RBracket,
2132                Token::RBracket
2133            ]
2134        );
2135    }
2136
2137    #[test]
2138    fn test_expression_syntax() {
2139        assert_eq!(
2140            lex(r#"[[ $X == "value" ]]"#),
2141            vec![
2142                Token::LBracket,
2143                Token::LBracket,
2144                Token::SimpleVarRef("X".to_string()),
2145                Token::EqEq,
2146                Token::String("value".to_string()),
2147                Token::RBracket,
2148                Token::RBracket
2149            ]
2150        );
2151    }
2152
2153    #[test]
2154    fn bash_style_assignment() {
2155        // NAME="value" (no spaces) - lexer sees IDENT EQ STRING
2156        assert_eq!(
2157            lex(r#"NAME="value""#),
2158            vec![
2159                Token::Ident("NAME".to_string()),
2160                Token::Eq,
2161                Token::String("value".to_string())
2162            ]
2163        );
2164    }
2165
2166    #[test]
2167    fn positional_params() {
2168        assert_eq!(lex("$0"), vec![Token::Positional(0)]);
2169        assert_eq!(lex("$1"), vec![Token::Positional(1)]);
2170        assert_eq!(lex("$9"), vec![Token::Positional(9)]);
2171        assert_eq!(lex("$@"), vec![Token::AllArgs]);
2172        assert_eq!(lex("$#"), vec![Token::ArgCount]);
2173    }
2174
2175    #[test]
2176    fn positional_in_context() {
2177        assert_eq!(
2178            lex("echo $1 $2"),
2179            vec![
2180                Token::Ident("echo".to_string()),
2181                Token::Positional(1),
2182                Token::Positional(2),
2183            ]
2184        );
2185    }
2186
2187    #[test]
2188    fn var_length() {
2189        assert_eq!(lex("${#X}"), vec![Token::VarLength("X".to_string())]);
2190        assert_eq!(lex("${#NAME}"), vec![Token::VarLength("NAME".to_string())]);
2191        assert_eq!(lex("${#foo_bar}"), vec![Token::VarLength("foo_bar".to_string())]);
2192    }
2193
2194    #[test]
2195    fn var_length_in_context() {
2196        assert_eq!(
2197            lex("echo ${#NAME}"),
2198            vec![
2199                Token::Ident("echo".to_string()),
2200                Token::VarLength("NAME".to_string()),
2201            ]
2202        );
2203    }
2204
2205    // ═══════════════════════════════════════════════════════════════════
2206    // Edge case tests: Flag ambiguities
2207    // ═══════════════════════════════════════════════════════════════════
2208
2209    #[test]
2210    fn plus_flag() {
2211        // Plus flags for set +e
2212        assert_eq!(lex("+e"), vec![Token::PlusFlag("e".to_string())]);
2213        assert_eq!(lex("+x"), vec![Token::PlusFlag("x".to_string())]);
2214        assert_eq!(lex("+ex"), vec![Token::PlusFlag("ex".to_string())]);
2215    }
2216
2217    #[test]
2218    fn set_with_plus_flag() {
2219        assert_eq!(
2220            lex("set +e"),
2221            vec![
2222                Token::Set,
2223                Token::PlusFlag("e".to_string()),
2224            ]
2225        );
2226    }
2227
2228    #[test]
2229    fn set_with_multiple_flags() {
2230        assert_eq!(
2231            lex("set -e -u"),
2232            vec![
2233                Token::Set,
2234                Token::ShortFlag("e".to_string()),
2235                Token::ShortFlag("u".to_string()),
2236            ]
2237        );
2238    }
2239
2240    #[test]
2241    fn flags_vs_negative_numbers_edge_cases() {
2242        // -1a should be negative int followed by ident
2243        assert_eq!(
2244            lex("-1 a"),
2245            vec![Token::Int(-1), Token::Ident("a".to_string())]
2246        );
2247        // -l is a flag
2248        assert_eq!(lex("-l"), vec![Token::ShortFlag("l".to_string())]);
2249        // -123 is negative number
2250        assert_eq!(lex("-123"), vec![Token::Int(-123)]);
2251    }
2252
2253    #[test]
2254    fn single_dash_is_minus_alone() {
2255        // Single dash alone - now handled as MinusAlone for `cat -` stdin indicator
2256        let result = tokenize("-").expect("should lex");
2257        assert_eq!(result.len(), 1);
2258        assert!(matches!(result[0].token, Token::MinusAlone));
2259    }
2260
2261    #[test]
2262    fn plus_bare_for_date_format() {
2263        // `date +%s` - the +%s should be PlusBare
2264        let result = tokenize("+%s").expect("should lex");
2265        assert_eq!(result.len(), 1);
2266        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%s"));
2267
2268        // `date +%Y-%m-%d` - format string with dashes
2269        let result = tokenize("+%Y-%m-%d").expect("should lex");
2270        assert_eq!(result.len(), 1);
2271        assert!(matches!(result[0].token, Token::PlusBare(ref s) if s == "+%Y-%m-%d"));
2272    }
2273
2274    #[test]
2275    fn plus_flag_still_works() {
2276        // `set +e` - should still be PlusFlag
2277        let result = tokenize("+e").expect("should lex");
2278        assert_eq!(result.len(), 1);
2279        assert!(matches!(result[0].token, Token::PlusFlag(ref s) if s == "e"));
2280    }
2281
2282    #[test]
2283    fn while_keyword_vs_while_loop() {
2284        // 'while' as keyword in loop context
2285        assert_eq!(lex("while"), vec![Token::While]);
2286        // 'while' at start followed by condition
2287        assert_eq!(
2288            lex("while true"),
2289            vec![Token::While, Token::True]
2290        );
2291    }
2292
2293    #[test]
2294    fn control_flow_keywords() {
2295        assert_eq!(lex("break"), vec![Token::Break]);
2296        assert_eq!(lex("continue"), vec![Token::Continue]);
2297        assert_eq!(lex("return"), vec![Token::Return]);
2298        assert_eq!(lex("exit"), vec![Token::Exit]);
2299    }
2300
2301    #[test]
2302    fn control_flow_with_numbers() {
2303        assert_eq!(
2304            lex("break 2"),
2305            vec![Token::Break, Token::Int(2)]
2306        );
2307        assert_eq!(
2308            lex("continue 3"),
2309            vec![Token::Continue, Token::Int(3)]
2310        );
2311        assert_eq!(
2312            lex("exit 1"),
2313            vec![Token::Exit, Token::Int(1)]
2314        );
2315    }
2316
2317    // ═══════════════════════════════════════════════════════════════════
2318    // Here-doc tests
2319    // ═══════════════════════════════════════════════════════════════════
2320
2321    #[test]
2322    fn heredoc_simple() {
2323        let source = "cat <<EOF\nhello\nworld\nEOF";
2324        let tokens = lex(source);
2325        assert_eq!(tokens, vec![
2326            Token::Ident("cat".to_string()),
2327            Token::HereDocStart,
2328            Token::HereDoc(HereDocData { content: "hello\nworld".to_string(), literal: false }),
2329            Token::Newline,
2330        ]);
2331    }
2332
2333    #[test]
2334    fn heredoc_empty() {
2335        let source = "cat <<EOF\nEOF";
2336        let tokens = lex(source);
2337        assert_eq!(tokens, vec![
2338            Token::Ident("cat".to_string()),
2339            Token::HereDocStart,
2340            Token::HereDoc(HereDocData { content: "".to_string(), literal: false }),
2341            Token::Newline,
2342        ]);
2343    }
2344
2345    #[test]
2346    fn heredoc_with_special_chars() {
2347        let source = "cat <<EOF\n$VAR and \"quoted\" 'single'\nEOF";
2348        let tokens = lex(source);
2349        assert_eq!(tokens, vec![
2350            Token::Ident("cat".to_string()),
2351            Token::HereDocStart,
2352            Token::HereDoc(HereDocData { content: "$VAR and \"quoted\" 'single'".to_string(), literal: false }),
2353            Token::Newline,
2354        ]);
2355    }
2356
2357    #[test]
2358    fn heredoc_multiline() {
2359        let source = "cat <<END\nline1\nline2\nline3\nEND";
2360        let tokens = lex(source);
2361        assert_eq!(tokens, vec![
2362            Token::Ident("cat".to_string()),
2363            Token::HereDocStart,
2364            Token::HereDoc(HereDocData { content: "line1\nline2\nline3".to_string(), literal: false }),
2365            Token::Newline,
2366        ]);
2367    }
2368
2369    #[test]
2370    fn heredoc_in_command() {
2371        let source = "cat <<EOF\nhello\nEOF\necho goodbye";
2372        let tokens = lex(source);
2373        assert_eq!(tokens, vec![
2374            Token::Ident("cat".to_string()),
2375            Token::HereDocStart,
2376            Token::HereDoc(HereDocData { content: "hello".to_string(), literal: false }),
2377            Token::Newline,
2378            Token::Ident("echo".to_string()),
2379            Token::Ident("goodbye".to_string()),
2380        ]);
2381    }
2382
2383    #[test]
2384    fn heredoc_strip_tabs() {
2385        let source = "cat <<-EOF\n\thello\n\tworld\n\tEOF";
2386        let tokens = lex(source);
2387        // Content has tabs preserved, only delimiter matching strips tabs
2388        assert_eq!(tokens, vec![
2389            Token::Ident("cat".to_string()),
2390            Token::HereDocStart,
2391            Token::HereDoc(HereDocData { content: "\thello\n\tworld".to_string(), literal: false }),
2392            Token::Newline,
2393        ]);
2394    }
2395
2396    // ═══════════════════════════════════════════════════════════════════
2397    // Arithmetic expression tests
2398    // ═══════════════════════════════════════════════════════════════════
2399
2400    #[test]
2401    fn arithmetic_simple() {
2402        let source = "$((1 + 2))";
2403        let tokens = lex(source);
2404        assert_eq!(tokens, vec![Token::Arithmetic("1 + 2".to_string())]);
2405    }
2406
2407    #[test]
2408    fn arithmetic_in_assignment() {
2409        let source = "X=$((5 * 3))";
2410        let tokens = lex(source);
2411        assert_eq!(tokens, vec![
2412            Token::Ident("X".to_string()),
2413            Token::Eq,
2414            Token::Arithmetic("5 * 3".to_string()),
2415        ]);
2416    }
2417
2418    #[test]
2419    fn arithmetic_with_nested_parens() {
2420        let source = "$((2 * (3 + 4)))";
2421        let tokens = lex(source);
2422        assert_eq!(tokens, vec![Token::Arithmetic("2 * (3 + 4)".to_string())]);
2423    }
2424
2425    #[test]
2426    fn arithmetic_with_variable() {
2427        let source = "$((X + 1))";
2428        let tokens = lex(source);
2429        assert_eq!(tokens, vec![Token::Arithmetic("X + 1".to_string())]);
2430    }
2431
2432    #[test]
2433    fn arithmetic_command_subst_not_confused() {
2434        // $( should not be treated as arithmetic
2435        let source = "$(echo hello)";
2436        let tokens = lex(source);
2437        assert_eq!(tokens, vec![
2438            Token::CmdSubstStart,
2439            Token::Ident("echo".to_string()),
2440            Token::Ident("hello".to_string()),
2441            Token::RParen,
2442        ]);
2443    }
2444
2445    #[test]
2446    fn arithmetic_nesting_limit() {
2447        // Create deeply nested parens that exceed MAX_PAREN_DEPTH (256)
2448        let open_parens = "(".repeat(300);
2449        let close_parens = ")".repeat(300);
2450        let source = format!("$(({}1{}))", open_parens, close_parens);
2451        let result = tokenize(&source);
2452        assert!(result.is_err());
2453        let errors = result.unwrap_err();
2454        assert_eq!(errors.len(), 1);
2455        assert_eq!(errors[0].token, LexerError::NestingTooDeep);
2456    }
2457
2458    #[test]
2459    fn arithmetic_nesting_within_limit() {
2460        // Nesting within limit should work
2461        let source = "$((((1 + 2) * 3)))";
2462        let tokens = lex(source);
2463        assert_eq!(tokens, vec![Token::Arithmetic("((1 + 2) * 3)".to_string())]);
2464    }
2465
2466    // ═══════════════════════════════════════════════════════════════════
2467    // Token category tests
2468    // ═══════════════════════════════════════════════════════════════════
2469
2470    #[test]
2471    fn token_categories() {
2472        // Keywords
2473        assert_eq!(Token::If.category(), TokenCategory::Keyword);
2474        assert_eq!(Token::Then.category(), TokenCategory::Keyword);
2475        assert_eq!(Token::For.category(), TokenCategory::Keyword);
2476        assert_eq!(Token::Function.category(), TokenCategory::Keyword);
2477        assert_eq!(Token::True.category(), TokenCategory::Keyword);
2478        assert_eq!(Token::TypeString.category(), TokenCategory::Keyword);
2479
2480        // Operators
2481        assert_eq!(Token::Pipe.category(), TokenCategory::Operator);
2482        assert_eq!(Token::And.category(), TokenCategory::Operator);
2483        assert_eq!(Token::Or.category(), TokenCategory::Operator);
2484        assert_eq!(Token::StderrToStdout.category(), TokenCategory::Operator);
2485        assert_eq!(Token::GtGt.category(), TokenCategory::Operator);
2486
2487        // Strings
2488        assert_eq!(Token::String("test".to_string()).category(), TokenCategory::String);
2489        assert_eq!(Token::SingleString("test".to_string()).category(), TokenCategory::String);
2490        assert_eq!(Token::HereDoc(HereDocData { content: "test".to_string(), literal: false }).category(), TokenCategory::String);
2491
2492        // Numbers
2493        assert_eq!(Token::Int(42).category(), TokenCategory::Number);
2494        assert_eq!(Token::Float(3.14).category(), TokenCategory::Number);
2495        assert_eq!(Token::Arithmetic("1+2".to_string()).category(), TokenCategory::Number);
2496
2497        // Variables
2498        assert_eq!(Token::SimpleVarRef("X".to_string()).category(), TokenCategory::Variable);
2499        assert_eq!(Token::VarRef("${X}".to_string()).category(), TokenCategory::Variable);
2500        assert_eq!(Token::Positional(1).category(), TokenCategory::Variable);
2501        assert_eq!(Token::AllArgs.category(), TokenCategory::Variable);
2502        assert_eq!(Token::ArgCount.category(), TokenCategory::Variable);
2503        assert_eq!(Token::LastExitCode.category(), TokenCategory::Variable);
2504        assert_eq!(Token::CurrentPid.category(), TokenCategory::Variable);
2505
2506        // Flags
2507        assert_eq!(Token::ShortFlag("l".to_string()).category(), TokenCategory::Flag);
2508        assert_eq!(Token::LongFlag("verbose".to_string()).category(), TokenCategory::Flag);
2509        assert_eq!(Token::PlusFlag("e".to_string()).category(), TokenCategory::Flag);
2510        assert_eq!(Token::DoubleDash.category(), TokenCategory::Flag);
2511
2512        // Punctuation
2513        assert_eq!(Token::Semi.category(), TokenCategory::Punctuation);
2514        assert_eq!(Token::LParen.category(), TokenCategory::Punctuation);
2515        assert_eq!(Token::LBracket.category(), TokenCategory::Punctuation);
2516        assert_eq!(Token::Newline.category(), TokenCategory::Punctuation);
2517
2518        // Comments
2519        assert_eq!(Token::Comment.category(), TokenCategory::Comment);
2520
2521        // Paths
2522        assert_eq!(Token::Path("/tmp/file".to_string()).category(), TokenCategory::Path);
2523
2524        // Commands
2525        assert_eq!(Token::Ident("echo".to_string()).category(), TokenCategory::Command);
2526
2527        // Errors
2528        assert_eq!(Token::InvalidNumberIdent.category(), TokenCategory::Error);
2529        assert_eq!(Token::InvalidFloatNoLeading.category(), TokenCategory::Error);
2530        assert_eq!(Token::InvalidFloatNoTrailing.category(), TokenCategory::Error);
2531    }
2532
2533    #[test]
2534    fn test_heredoc_piped_to_command() {
2535        // Bug 4: "cat <<EOF | jq" should produce: cat <<heredoc | jq
2536        // Not: cat | jq <<heredoc
2537        let tokens = tokenize("cat <<EOF | jq\n{\"key\": \"val\"}\nEOF").unwrap();
2538        let heredoc_pos = tokens.iter().position(|t| matches!(t.token, Token::HereDoc(_)));
2539        let pipe_pos = tokens.iter().position(|t| matches!(t.token, Token::Pipe));
2540        assert!(heredoc_pos.is_some(), "should have a heredoc token");
2541        assert!(pipe_pos.is_some(), "should have a pipe token");
2542        assert!(
2543            pipe_pos.unwrap() > heredoc_pos.unwrap(),
2544            "Pipe must come after heredoc, got heredoc at {}, pipe at {}. Tokens: {:?}",
2545            heredoc_pos.unwrap(), pipe_pos.unwrap(), tokens,
2546        );
2547    }
2548
2549    #[test]
2550    fn test_heredoc_standalone_still_works() {
2551        // Regression: standalone heredoc (no pipe) must still work
2552        let tokens = tokenize("cat <<EOF\nhello\nEOF").unwrap();
2553        assert!(tokens.iter().any(|t| matches!(t.token, Token::HereDoc(_))));
2554        assert!(!tokens.iter().any(|t| matches!(t.token, Token::Pipe)));
2555    }
2556
2557    #[test]
2558    fn test_heredoc_preserves_leading_empty_lines() {
2559        // Bug B: heredoc starting with a blank line must preserve it
2560        let tokens = tokenize("cat <<EOF\n\nhello\nEOF").unwrap();
2561        let heredoc = tokens.iter().find_map(|t| {
2562            if let Token::HereDoc(data) = &t.token {
2563                Some(data.clone())
2564            } else {
2565                None
2566            }
2567        });
2568        assert!(heredoc.is_some(), "should have a heredoc token");
2569        let data = heredoc.unwrap();
2570        assert!(data.content.starts_with('\n'), "leading empty line must be preserved, got: {:?}", data.content);
2571        assert_eq!(data.content, "\nhello");
2572    }
2573
2574    #[test]
2575    fn test_heredoc_quoted_delimiter_sets_literal() {
2576        // Bug N: quoted delimiter (<<'EOF') should set literal=true
2577        let tokens = tokenize("cat <<'EOF'\nhello $HOME\nEOF").unwrap();
2578        let heredoc = tokens.iter().find_map(|t| {
2579            if let Token::HereDoc(data) = &t.token {
2580                Some(data.clone())
2581            } else {
2582                None
2583            }
2584        });
2585        assert!(heredoc.is_some(), "should have a heredoc token");
2586        let data = heredoc.unwrap();
2587        assert!(data.literal, "quoted delimiter should set literal=true");
2588        assert_eq!(data.content, "hello $HOME");
2589    }
2590
2591    #[test]
2592    fn test_heredoc_unquoted_delimiter_not_literal() {
2593        // Bug N: unquoted delimiter (<<EOF) should have literal=false
2594        let tokens = tokenize("cat <<EOF\nhello $HOME\nEOF").unwrap();
2595        let heredoc = tokens.iter().find_map(|t| {
2596            if let Token::HereDoc(data) = &t.token {
2597                Some(data.clone())
2598            } else {
2599                None
2600            }
2601        });
2602        assert!(heredoc.is_some(), "should have a heredoc token");
2603        let data = heredoc.unwrap();
2604        assert!(!data.literal, "unquoted delimiter should have literal=false");
2605    }
2606}