Skip to main content

zshrs_parse/
lexer.rs

//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
//!
//! This lexer tokenizes zsh shell input into a stream of tokens.
//! It handles all zsh-specific syntax including:
//! - Single/double/dollar quotes
//! - Command substitution `$(...)` and `` `...` ``
//! - Arithmetic `$((...))`
//! - Parameter expansion `${...}`
//! - Process substitution `<(...)` `>(...)`
//! - Here documents
//! - All redirection operators
//! - Comments
//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Behaviour switches for the lexer.
///
/// All flags are `false` in the derived `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct LexFlags {
    /// Lexing on behalf of ZLE (the line editor) for completion.
    pub zle: bool,
    /// Hand newlines back to the caller as tokens.
    pub newline: bool,
    /// Keep comments in the output.
    pub comments_keep: bool,
    /// Remove comments from the output.
    pub comments_strip: bool,
    /// Lexing is active (driven from bufferwords).
    pub active: bool,
}
32
/// Growable text buffer used while assembling a token.
///
/// `data` holds the text; `siz` is a logical capacity that starts at 256
/// and is doubled as the text grows. `siz` is kept separately from the
/// `String`'s own capacity so it can be saved and restored together with
/// the text.
#[derive(Debug, Clone)]
struct LexBuf {
    data: String,
    siz: usize,
}

impl LexBuf {
    /// Create an empty buffer with the initial 256-byte logical capacity.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Drop the buffered text. The logical capacity `siz` is left as-is.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Append one char, growing the logical capacity when it is reached.
    fn add(&mut self, c: char) {
        self.data.push(c);
        self.grow_to_fit();
    }

    /// Append a whole string, growing the logical capacity when needed.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
        self.grow_to_fit();
    }

    /// Length of the buffered text in bytes.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrow the buffered text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consume the buffer and return the text.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last char of the buffered text, if any.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        self.data.chars().last()
    }

    /// Remove and return the last char of the buffered text, if any.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }

    /// Double `siz` until it is strictly greater than the text length and
    /// reserve the difference.
    ///
    /// A single doubling is not enough when the text grew by more than one
    /// char at a time (bulk append) or when `siz` was set to a small value
    /// from outside: `siz - len` would then underflow. Looping keeps the
    /// subtraction safe; for ordinary char-by-char growth it still doubles
    /// exactly once.
    fn grow_to_fit(&mut self) {
        let len = self.data.len();
        if len < self.siz {
            return;
        }
        // `max(1)` guards against a zero `siz`, which doubling never leaves.
        let mut siz = self.siz.max(1);
        while siz <= len {
            siz = siz.saturating_mul(2);
        }
        self.siz = siz;
        self.data.reserve(siz - len);
    }
}
87
/// One here-document attached to a redirection.
#[derive(Debug, Clone)]
pub struct HereDoc {
    /// Word that ends the body.
    pub terminator: String,
    /// Whether tabs are stripped from the body.
    pub strip_tabs: bool,
    /// Body text.
    pub content: String,
    /// Set when the terminator was written quoted (`<<'EOF'`, `<<"EOF"`
    /// or `<<\EOF`). A quoted terminator turns off variable expansion,
    /// command substitution and arithmetic inside the body.
    pub quoted: bool,
    /// Set once `process_heredocs` has read the body. Kept as its own
    /// flag because an empty `content` is a valid body and so cannot
    /// signal "not read yet".
    pub processed: bool,
}
103
104/// The Zsh Lexer
105pub struct ZshLexer<'a> {
106    /// Input source
107    pub(crate) input: &'a str,
108    /// Current position in input
109    pub(crate) pos: usize,
110    /// Look-ahead buffer for ungotten characters
111    unget_buf: VecDeque<char>,
112    /// Current token string
113    pub tokstr: Option<String>,
114    /// Current token type
115    pub tok: LexTok,
116    /// File descriptor for redirections (e.g., 2> means fd=2)
117    pub tokfd: i32,
118    /// Line number at start of current token
119    pub toklineno: u64,
120    /// Current line number
121    pub lineno: u64,
122    /// Lexer has stopped (EOF or error)
123    pub lexstop: bool,
124    /// In command position (can accept reserved words)
125    pub incmdpos: bool,
126    /// In condition [[ ... ]]
127    pub incond: i32,
128    /// In pattern context (RHS of == != =~ in [[ ]])
129    pub incondpat: bool,
130    /// In case pattern
131    pub incasepat: i32,
132    /// In redirection
133    pub inredir: bool,
134    /// Saved `incmdpos` from before a redirop / for / foreach / select
135    /// — restored on the NEXT non-redir token. Mirrors `static int oldpos`
136    /// in C zsh's `ctxtlex` (lex.c:319). Required for cases like
137    /// `for x ( ... )` where `(` after the var name should tokenize as
138    /// INPAR — that depends on incmdpos being restored to 1 from before
139    /// FOR was lexed, which in turn depends on this saved value.
140    pub oldpos: bool,
141    /// After 'for' keyword
142    pub infor: i32,
143    /// After 'repeat' keyword
144    inrepeat: i32,
145    /// Parsing typeset arguments
146    pub intypeset: bool,
147    /// Inside (( ... )) arithmetic
148    dbparens: bool,
149    /// Disable alias expansion
150    pub noaliases: bool,
151    /// Disable spelling correction
152    pub nocorrect: i32,
153    /// Disable comment recognition
154    pub nocomments: bool,
155    /// Lexer flags
156    pub lexflags: LexFlags,
157    /// Whether this is the first line
158    pub isfirstln: bool,
159    /// Whether this is the first char of command
160    #[allow(dead_code)]
161    isfirstch: bool,
162    /// Pending here-documents
163    pub heredocs: Vec<HereDoc>,
164    /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
165    heredoc_pending: u8,
166    /// Token buffer
167    lexbuf: LexBuf,
168    /// After newline
169    pub isnewlin: i32,
170    /// Error message if any
171    pub error: Option<String>,
172    /// Global iteration counter for infinite loop detection
173    global_iterations: usize,
174    /// Recursion depth counter
175    recursion_depth: usize,
176    /// Raw-input capture flag — when nonzero, every char read through
177    /// `hgetc` is also appended to `tokstr_raw` via zshlex_raw_add.
178    /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
179    /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
180    /// command substitutions for re-execution / display.
181    pub lex_add_raw: i32,
182    /// Raw-input capture buffer. Direct mirror of lex.c:165
183    /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
184    /// `LexBuf` here since Rust's String tracks both the data and
185    /// length internally.
186    lexbuf_raw: LexBuf,
187}
188
/// Deepest recursion the lexer tolerates; `check_recursion` reports an
/// error once `recursion_depth` goes above this value.
const MAX_LEXER_RECURSION: usize = 200;
190
/// What `AliasResolver::lookup_alias` / `lookup_suffix_alias` hand back
/// for a matching alias. Carries the parts of zsh's `struct alias` that
/// lex.c:1914-1943 consults.
#[derive(Debug, Clone)]
pub struct AliasInfo {
    /// Replacement body.
    pub text: String,
    /// Recursion guard: the alias is currently being expanded.
    pub in_use: bool,
    /// Global alias, as opposed to one honoured only in command position.
    pub global: bool,
}
201
202/// Trait the lexer uses to look up aliases and reserved words during
203/// `exalias`. Implementors typically delegate to the executor's
204/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
205/// free of executor-specific types — same pattern zsh uses with the
206/// hashtable.h opaque-handle approach against aliastab/reswdtab/
207/// sufaliastab.
208pub trait AliasResolver {
209    /// Look up an alias by name. Returns `None` if not found, or the
210    /// alias body + flags otherwise.
211    fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
212    /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
213    fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
214    /// Resolve a reserved word. Returns the LexTok the word should
215    /// promote to (e.g. "if" → IF), or None if not a reswd.
216    fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
217    /// Mark an alias as in-use (recursion guard). Called when an
218    /// alias is about to be expanded; the matching unmark happens
219    /// when the alias text has been fully consumed by the lexer.
220    fn mark_in_use(&mut self, name: &str, in_use: bool);
221}
222
223/// Saved lexical state for nested-context handling. Direct port of
224/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
225/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
226/// (`lex_context_restore`). Used when entering command substitution,
227/// here-docs, or eval where the outer lexer state must be pushed and
228/// restored after the inner parse completes.
229#[derive(Debug, Clone)]
230pub struct LexStack {
231    pub dbparens: bool,
232    pub isfirstln: bool,
233    pub isfirstch: bool,
234    pub lexflags: LexFlags,
235    pub tok: LexTok,
236    pub tokstr: Option<String>,
237    pub lexbuf_data: String,
238    pub lexbuf_siz: usize,
239    pub lexstop: bool,
240    pub toklineno: u64,
241}
242
243impl Default for LexStack {
244    fn default() -> Self {
245        // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
246        // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
247        // ENDINPUT (the C source doesn't explicitly reset tok here but
248        // the natural baseline is ENDINPUT — same as lexinit).
249        LexStack {
250            dbparens: false,
251            isfirstln: false,
252            isfirstch: false,
253            lexflags: LexFlags::default(),
254            tok: LexTok::Endinput,
255            tokstr: None,
256            lexbuf_data: String::new(),
257            lexbuf_siz: 256,
258            lexstop: false,
259            toklineno: 0,
260        }
261    }
262}
263
264impl<'a> ZshLexer<'a> {
265    /// Create a new lexer for the given input
266    pub fn new(input: &'a str) -> Self {
267        ZshLexer {
268            input,
269            pos: 0,
270            unget_buf: VecDeque::new(),
271            tokstr: None,
272            tok: LexTok::Endinput,
273            tokfd: -1,
274            toklineno: 1,
275            lineno: 1,
276            lexstop: false,
277            incmdpos: true,
278            incond: 0,
279            incondpat: false,
280            incasepat: 0,
281            inredir: false,
282            oldpos: true,
283            infor: 0,
284            inrepeat: 0,
285            intypeset: false,
286            dbparens: false,
287            noaliases: false,
288            nocorrect: 0,
289            nocomments: false,
290            lexflags: LexFlags::default(),
291            isfirstln: true,
292            isfirstch: true,
293            heredocs: Vec::new(),
294            heredoc_pending: 0,
295            lexbuf: LexBuf::new(),
296            isnewlin: 0,
297            error: None,
298            global_iterations: 0,
299            recursion_depth: 0,
300            lex_add_raw: 0,
301            lexbuf_raw: LexBuf::new(),
302        }
303    }
304
305    /// Append a char to the raw-input capture buffer. Direct port of
306    /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
307    /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
308    /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
309    pub fn zshlex_raw_add(&mut self, c: char) {
310        // lex.c:2027-2028 — guard on lex_add_raw flag.
311        if self.lex_add_raw == 0 {
312            return;
313        }
314        // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
315        // explicit ptr/len/siz with hrealloc; Rust's String handles
316        // resize automatically.
317        self.lexbuf_raw.add(c);
318    }
319
320    /// Run alias / reserved-word expansion on the just-lexed token.
321    /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
322    /// if an alias was injected (the caller's loop should re-run
323    /// gettok to consume the injected text).
324    ///
325    /// C source flow:
326    ///   1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
327    ///   2. If tokstr is None: set lextext from `tokstrings[tok]` and
328    ///      checkalias against that (lex.c:1964-1969).
329    ///   3. Otherwise: untokenize tokstr into a working copy (lex.c:
330    ///      1971-1980).
331    ///   4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
332    ///      (lex.c:1982-1991).
333    ///   5. STRING tokens: try checkalias, then reservation lookup
334    ///      (lex.c:1993-2015).
335    ///   6. Clear inalmore (lex.c:2016).
336    ///
337    /// Takes an `AliasResolver` trait object so the lexer doesn't
338    /// hard-depend on the executor's alias-table types. zshrs callers
339    /// implement `AliasResolver` over their alias hash tables.
340    pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
341        // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
342        // history layer doesn't track per-word boundaries here; no-op.
343
344        // lex.c:1958-1962 — spell correction via spckword. zshrs
345        // doesn't implement spell correction yet; documented divergence.
346
347        // lex.c:1964-1969 — bare-token path (no tokstr).
348        if self.tokstr.is_none() {
349            // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
350            // like SEMI/AMPER/etc. the canonical text comes from a
351            // static table. zshrs's check_alias_for_text uses the
352            // resolver directly with the token's text representation.
353            if self.tok == LexTok::Newlin {
354                return false;
355            }
356            // Use punctuation-token text; unknown tokens skip alias.
357            let text = match self.tok {
358                LexTok::Semi => ";",
359                LexTok::Amper => "&",
360                LexTok::Bar => "|",
361                _ => return false,
362            };
363            return self.check_alias(resolver, text);
364        }
365
366        let tokstr = self.tokstr.clone().unwrap();
367        // lex.c:1973-1980 — untokenize: convert the lexer's internal
368        // tokenized form (Pound..ztokens shifts) into the literal
369        // shell text. Call the global helper.
370        let lextext = if has_token(&tokstr) {
371            untokenize(&tokstr)
372        } else {
373            tokstr.clone()
374        };
375
376        // lex.c:1982-1991 — ZLE word-tracking for completion.
377        if self.lexflags.zle {
378            let zp = self.lexflags;
379            self.gotword();
380            // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
381            // word has been reached; abort exalias so completion can
382            // capture the partial token unchanged.
383            if zp.zle && !self.lexflags.zle {
384                return false;
385            }
386        }
387
388        // lex.c:1993-2015 — STRING-token alias / reswd check.
389        if self.tok == LexTok::String {
390            // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
391            // here (zshrs doesn't have the option flag wired).
392            if self.check_alias(resolver, &lextext) {
393                return true;
394            }
395
396            // lex.c:2002-2009 — reserved-word lookup. Fires when in
397            // command position OR when the text is bare `}` and
398            // IGNOREBRACES is unset (so `}` ends a brace block).
399            if self.incmdpos || lextext == "}" {
400                if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
401                    self.tok = rwtok;
402                    if rwtok == LexTok::Repeat {
403                        self.inrepeat = 1;
404                    }
405                    if rwtok == LexTok::Dinbrack {
406                        self.incond = 1;
407                    }
408                }
409            } else if self.incond > 0 && lextext == "]]" {
410                // lex.c:2010-2012 — `]]` closes the cond expression.
411                self.tok = LexTok::Doutbrack;
412                self.incond = 0;
413            } else if self.incond == 1 && lextext == "!" {
414                // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
415                // negation, not a literal.
416                self.tok = LexTok::Bang;
417            }
418        }
419
420        // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
421        // any non-alias token.
422        // (zshrs's lexer doesn't have inalmore yet — added here would
423        // require gettok to track when an alias-pushed token has more
424        // text after it. Documented divergence.)
425
426        false
427    }
428
429    /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
430    /// `checkalias`. Returns true if the lookup matched (regular or
431    /// suffix alias) AND the alias text was successfully injected
432    /// back into the input stream for re-lexing.
433    fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
434        // lex.c:1906-1907 — guard on null lextext.
435        if lextext.is_empty() {
436            return false;
437        }
438
439        // lex.c:1909-1911 — guard: alias expansion is disabled, or
440        // POSIX aliases require the token to be a STRING and not a
441        // reserved word.
442        if self.noaliases {
443            return false;
444        }
445
446        // lex.c:1914-1933 — regular alias lookup.
447        if let Some(alias) = resolver.lookup_alias(lextext) {
448            if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
449                // lex.c:1918-1927 — if the next char isn't blank,
450                // insert a space so the alias body can't accidentally
451                // join the following word.
452                if !self.lexstop {
453                    if let Some(c) = self.peek() {
454                        if !Self::is_blank(c) {
455                            self.inject_alias_text(" ");
456                        }
457                    }
458                }
459                // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
460                self.inject_alias_text(&alias.text);
461                resolver.mark_in_use(lextext, true);
462                self.lexstop = false;
463                return true;
464            }
465        }
466
467        // lex.c:1934-1943 — suffix-alias lookup. The token must end
468        // with `.SUFFIX`, the suffix name must be a registered
469        // suffix-alias, AND the lexer must be in command position.
470        if self.incmdpos {
471            if let Some(dot_pos) = lextext.rfind('.') {
472                if dot_pos > 0 && dot_pos + 1 < lextext.len() {
473                    let suffix = &lextext[dot_pos + 1..];
474                    if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
475                        if !alias.in_use {
476                            // lex.c:1938-1940 — push three things in
477                            // reverse: the alias text, a space, then
478                            // the original word.
479                            self.inject_alias_text(&alias.text);
480                            self.inject_alias_text(" ");
481                            self.inject_alias_text(lextext);
482                            resolver.mark_in_use(suffix, true);
483                            self.lexstop = false;
484                            return true;
485                        }
486                    }
487                }
488            }
489        }
490
491        false
492    }
493
494    /// Push alias text back into the input stream so the lexer
495    /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
496    /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
497    /// (a VecDeque<char>) to inject chars in reverse order so the
498    /// next hgetc consumes them first.
499    fn inject_alias_text(&mut self, text: &str) {
500        // Insert at front in reverse so the first char of `text`
501        // comes out first.
502        for c in text.chars().rev() {
503            self.unget_buf.push_front(c);
504        }
505    }
506
507    /// Pop the last char from the raw-input capture buffer. Direct
508    /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
509    /// the lexer ungets a char that was just captured raw — the raw
510    /// buffer must mirror the live input so this undoes the last add.
511    pub fn zshlex_raw_back(&mut self) {
512        // lex.c:2045-2046 — guard.
513        if self.lex_add_raw == 0 {
514            return;
515        }
516        // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
517        self.lexbuf_raw.pop();
518    }
519
520    /// Mark the current raw-buffer offset (for restore later). Direct
521    /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
522    /// `len + offset` so callers can restore via `back_to_mark`.
523    pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
524        // lex.c:2055-2056 — guard.
525        if self.lex_add_raw == 0 {
526            return 0;
527        }
528        // lex.c:2057 — `return lexbuf_raw.len + offset;`
529        (self.lexbuf_raw.len() as i64) + offset
530    }
531
532    /// Restore raw-buffer offset to a previously-saved mark. Direct
533    /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
534    /// Truncates the raw buffer to `mark` bytes — undoes any captures
535    /// since the mark was taken (used when a speculative parse fails
536    /// and the lexer rolls back).
537    pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
538        // lex.c:2064-2065 — guard.
539        if self.lex_add_raw == 0 {
540            return;
541        }
542        // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
543        // lexbuf_raw.len = mark;` — Rust truncate handles both.
544        let m = mark.max(0) as usize;
545        self.lexbuf_raw.data.truncate(m);
546    }
547
548    /// Take the captured raw-input buffer, clearing it. Useful for
549    /// callers that need the literal command-sub body after lexing
550    /// (e.g. compile-time string capture for `$(...)`).
551    pub fn take_raw_buf(&mut self) -> String {
552        std::mem::take(&mut self.lexbuf_raw.data)
553    }
554
555    /// Save lexical context onto a `LexStack`. Direct port of
556    /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
557    /// is in a clean state suitable for parsing a nested input (command
558    /// substitution body, here-doc terminator, eval'd string).
559    pub fn lex_context_save(&mut self, ls: &mut LexStack) {
560        // lex.c:220-233 — copy live state into the stack.
561        ls.dbparens = self.dbparens;
562        ls.isfirstln = self.isfirstln;
563        ls.isfirstch = self.isfirstch;
564        ls.lexflags = self.lexflags;
565        ls.tok = self.tok;
566        ls.tokstr = self.tokstr.take();
567        ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
568        ls.lexbuf_siz = self.lexbuf.siz;
569        ls.lexstop = self.lexstop;
570        ls.toklineno = self.toklineno;
571
572        // lex.c:235-238 — reset live state to defaults so a nested
573        // parse starts from a clean slate. tokstr/lexbuf are zeroed,
574        // lexbuf.siz reset to 256 (the C-source initial alloc).
575        self.tokstr = None;
576        self.lexbuf.data.clear();
577        self.lexbuf.siz = 256;
578    }
579
580    /// Restore lexical context from a `LexStack`. Direct port of
581    /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
582    /// `lex_context_save`. Called after the nested parse completes.
583    pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
584        // lex.c:249-261 — copy stack state back into live fields.
585        self.dbparens = ls.dbparens;
586        self.isfirstln = ls.isfirstln;
587        self.isfirstch = ls.isfirstch;
588        self.lexflags = ls.lexflags;
589        self.tok = ls.tok;
590        self.tokstr = ls.tokstr.take();
591        self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
592        self.lexbuf.siz = ls.lexbuf_siz;
593        self.lexstop = ls.lexstop;
594        self.toklineno = ls.toklineno;
595    }
596
597    /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
598    /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
599    /// to ENDINPUT so the next gettok starts from a known baseline.
600    /// Note: the constructor `Self::new` already sets equivalent
601    /// defaults; this method exists for the rare case a caller wants
602    /// to recycle a `ZshLexer` across multiple input strings.
603    pub fn lexinit(&mut self) {
604        // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
605        self.nocorrect = 0;
606        self.dbparens = false;
607        self.lexstop = false;
608        // lex.c:444 — `tok = ENDINPUT;`
609        self.tok = LexTok::Endinput;
610    }
611
612    /// Check recursion depth; returns true if exceeded
613    #[inline]
614    fn check_recursion(&mut self) -> bool {
615        if self.recursion_depth > MAX_LEXER_RECURSION {
616            self.error = Some("lexer exceeded max recursion depth".to_string());
617            self.lexstop = true;
618            true
619        } else {
620            false
621        }
622    }
623
624    /// Check and increment global iteration counter; returns true if limit exceeded
625    /// Soft cap on `hgetc` invocations — an infinite-loop tripwire.
626    /// Real-world scripts: zinit.zsh ~5K lines / ~200KB, p10k's
627    /// internal/p10k.zsh ~10K lines / ~360KB, the user's daily-driver
628    /// `.zshrc` + zpwr stack collectively crosses 1M+ chars per shell
629    /// invocation. The previous 50K cap was tripped by p10k by line
630    /// 1277 (well below its actual 10K-line size). 100M chars handles
631    /// every reasonable script while still bailing out of a real
632    /// runaway lexer state machine.
633    const LEXER_HGETC_CAP: u64 = 100_000_000;
634
635    #[inline]
636    fn check_iterations(&mut self) -> bool {
637        self.global_iterations += 1;
638        if self.global_iterations as u64 > Self::LEXER_HGETC_CAP {
639            self.error = Some(format!(
640                "lexer exceeded {} hgetc iterations — possible infinite loop",
641                Self::LEXER_HGETC_CAP
642            ));
643            self.lexstop = true;
644            self.tok = LexTok::Lexerr;
645            true
646        } else {
647            false
648        }
649    }
650
651    /// Get next character from input
652    fn hgetc(&mut self) -> Option<char> {
653        if self.check_iterations() {
654            return None;
655        }
656
657        // Re-read from unget_buf: increment lineno on `\n` HERE
658        // too. hungetc() decremented lineno when the char was put
659        // back; without a matching increment on the way out, every
660        // `\n` that's ungetted-then-reread leaves lineno
661        // permanently one short. Symptom: $LINENO stuck at 1 in
662        // every script statement because the parser ungets the
663        // separating newline once between statements.
664        if let Some(c) = self.unget_buf.pop_front() {
665            if c == '\n' {
666                self.lineno += 1;
667            }
668            return Some(c);
669        }
670
671        let c = self.input[self.pos..].chars().next()?;
672        self.pos += c.len_utf8();
673
674        if c == '\n' {
675            self.lineno += 1;
676        }
677
678        Some(c)
679    }
680
681    /// Put character back into input
682    fn hungetc(&mut self, c: char) {
683        self.unget_buf.push_front(c);
684        if c == '\n' && self.lineno > 1 {
685            self.lineno -= 1;
686        }
687        self.lexstop = false;
688    }
689
690    /// Peek at next character without consuming
691    #[allow(dead_code)]
692    fn peek(&mut self) -> Option<char> {
693        if let Some(&c) = self.unget_buf.front() {
694            return Some(c);
695        }
696        self.input[self.pos..].chars().next()
697    }
698
699    /// Add character to token buffer
700    fn add(&mut self, c: char) {
701        self.lexbuf.add(c);
702    }
703
704    /// Check if character is blank (space or tab)
705    fn is_blank(c: char) -> bool {
706        c == ' ' || c == '\t'
707    }
708
709    /// Peek for a zsh numeric range glob shape after a `<`: returns the
710    /// captured `N*-M*>` (everything *after* the leading `<`) when the
711    /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
712    /// None and leaves the input untouched.
713    fn try_numeric_range_glob(&mut self) -> Option<String> {
714        let mut buf: Vec<char> = Vec::new();
715        // optional leading digits
716        loop {
717            match self.hgetc() {
718                Some(c) if c.is_ascii_digit() => buf.push(c),
719                Some(c) => {
720                    buf.push(c);
721                    break;
722                }
723                None => break,
724            }
725        }
726        // last char in buf must be '-' for the range form
727        if buf.last() != Some(&'-') {
728            for c in buf.iter().rev() {
729                self.hungetc(*c);
730            }
731            return None;
732        }
733        // optional trailing digits
734        loop {
735            match self.hgetc() {
736                Some(c) if c.is_ascii_digit() => buf.push(c),
737                Some(c) => {
738                    buf.push(c);
739                    break;
740                }
741                None => break,
742            }
743        }
744        if buf.last() != Some(&'>') {
745            for c in buf.iter().rev() {
746                self.hungetc(*c);
747            }
748            return None;
749        }
750        Some(buf.into_iter().collect())
751    }
752
753    /// Check if character is blank (including other whitespace except newline)
754    fn is_inblank(c: char) -> bool {
755        matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
756    }
757
758    /// Check if character is a digit
759    fn is_digit(c: char) -> bool {
760        c.is_ascii_digit()
761    }
762
763    /// Check if character is identifier start
764    #[allow(dead_code)]
765    fn is_ident_start(c: char) -> bool {
766        c.is_ascii_alphabetic() || c == '_'
767    }
768
769    /// Check if character is identifier continuation
770    fn is_ident(c: char) -> bool {
771        c.is_ascii_alphanumeric() || c == '_'
772    }
773
774    /// Main lexer entry point — fetch the next token. Direct port of
775    /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
776    /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
777    /// followed by here-doc draining (lex.c:278-306), newline tracking
778    /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
779    ///
780    /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
781    /// the loop. The C source iterates as long as exalias keeps
782    /// re-injecting alias text into the input buffer; zshrs's alias
783    /// expansion happens post-lex in exec.rs. The loop body therefore
784    /// runs once and breaks unconditionally — documented divergence.
785    pub fn zshlex(&mut self) {
786        // lex.c:268-269 — early-out on prior LEXERR.
787        if self.tok == LexTok::Lexerr {
788            return;
789        }
790
791        // Note: Do NOT reset global_iterations here - it must accumulate across all
792        // zshlex calls in a parse to prevent infinite loops in the parser
793
794        // lex.c:270-276 — gettok / exalias one-pass body. The C source
795        // wraps gettok in `do { ... } while (exalias())` so an alias
796        // re-injection re-enters the lex. Until exalias is wired we
797        // run the body exactly once, no loop scaffolding.
798        // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
799        // detection.
800        if self.inrepeat > 0 {
801            self.inrepeat += 1;
802        }
803        // lex.c:273-274 — at the third token after `repeat`,
804        // SHORTLOOPS / SHORTREPEAT options force back into cmd
805        // position so the loop body can start. zshrs unconditionally
806        // does this since the option-lookup lives in exec.rs.
807        if self.inrepeat == 3 {
808            self.incmdpos = true;
809        }
810
811        // lex.c:275 — `tok = gettok();`
812        self.tok = self.gettok();
813
814        // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
815        // so the persistent low bit survives but the per-word bit is
816        // dropped.
817        self.nocorrect &= 1;
818
819        // lex.c:278-306 — drain pending here-documents at the start
820        // of a new line. zshrs's process_heredocs reads the full body
821        // and stitches it onto the matching redir token.
822        if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
823            self.process_heredocs();
824        }
825
826        // lex.c:307-310 — track whether we just saw a newline.
827        // C uses `inbufct` to distinguish "newline at EOF" (=1)
828        // from "newline mid-input" (=-1); zshrs reads `pos < len`.
829        if self.tok != LexTok::Newlin {
830            self.isnewlin = 0;
831        } else {
832            self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
833        }
834
835        // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
836        // LEXFLAGS_NEWLINE is set to preserve newlines (used by
837        // ZLE for completion of partial lines).
838        if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
839            self.tok = LexTok::Seper;
840        }
841
842        // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
843        //   - `{` only promotes to INBRACE in command position
844        //   - `}` promotes to OUTBRACE either in cmdpos OR via the
845        //     special `closing-brace-special` rule (IGNOREBRACES unset
846        //     — assumed since zshrs doesn't expose that option yet)
847        //   - other reserved words: only when incmdpos (or `}` exception)
848        if self.tok == LexTok::String {
849            if let Some(ref s) = self.tokstr {
850                if s == "{" && self.incmdpos {
851                    self.tok = LexTok::Inbrace;
852                } else if s == "}" {
853                    self.tok = LexTok::Outbrace;
854                } else if self.incasepat == 0 {
855                    // Skip reserved word checking in case pattern context —
856                    // words like `time`, `end` should be patterns, not
857                    // keywords.
858                    self.check_reserved_word();
859                }
860            }
861        }
862
863        // If we were expecting a heredoc terminator, register it now
864        if self.heredoc_pending > 0 && self.tok == LexTok::String {
865            if let Some(ref terminator) = self.tokstr {
866                let strip_tabs = self.heredoc_pending == 2;
867                // Detect originally-quoted terminator (`<<'EOF'`,
868                // `<<"EOF"`). The lexer wraps single-quoted text in
869                // SNULL (`\u{9d}`) and double-quoted text in DNULL
870                // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
871                // heredocs disable variable / command-sub / arithmetic
872                // expansion in the body — see `compile_redir` for the
873                // expansion side.
874                // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
875                // disable expansion in the body. SNULL/DNULL mark
876                // single/double-quoted spans; BNULL (`\u{9f}`) marks
877                // any backslash-escaped char — its presence alone is
878                // enough to flag the terminator as quoted (zsh's
879                // `<<\EOF` shorthand for `<<'EOF'`).
880                let quoted = terminator.contains('\u{9d}')
881                    || terminator.contains('\u{9e}')
882                    || terminator.contains('\u{9f}')
883                    || terminator.starts_with('\'')
884                    || terminator.starts_with('"');
885                let term = terminator
886                    .chars()
887                    .filter(|c| {
888                        *c != '\''
889                            && *c != '"'
890                            && *c != '\u{9d}'
891                            && *c != '\u{9e}'
892                            && *c != '\u{9f}'
893                    })
894                    .collect::<String>();
895                self.heredocs.push(HereDoc {
896                    terminator: term,
897                    strip_tabs,
898                    content: String::new(),
899                    quoted,
900                    processed: false,
901                });
902            }
903            self.heredoc_pending = 0;
904        }
905
906        // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
907        if self.incond > 0 {
908            if let Some(ref s) = self.tokstr {
909                // Check if this token is a comparison operator
910                // Note: single = is also a comparison operator in [[ ]]
911                // The internal marker \u{8d} is used for =
912                if s == "="
913                    || s == "=="
914                    || s == "!="
915                    || s == "=~"
916                    || s == "\u{8d}"
917                    || s == "\u{8d}\u{8d}"
918                    || s == "!\u{8d}"
919                    || s == "\u{8d}~"
920                    || s == "\u{8d}\u{98}"
921                {
922                    self.incondpat = true;
923                } else if self.incondpat {
924                    // We were in pattern context, now we've consumed the pattern
925                    // Reset after the pattern token is consumed
926                    // But actually, pattern can span multiple tokens, so we should
927                    // stay in pattern mode until ]] or && or ||
928                }
929            }
930            // Reset pattern context on ]] or logical operators (&&, ||)
931            // and grouping parens. zsh par_cond_3 (cond.c) treats
932            // these as cond-pattern terminators — the next operand is
933            // a fresh primary, NOT a continuation of the prior pattern.
934            // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
935            // after `[[ a == a && (b == b ... ` was lexed as a literal
936            // glob char (incondpat=true → gettokstr) and the whole
937            // remainder collapsed into one String token.
938            match self.tok {
939                LexTok::Doutbrack
940                | LexTok::Damper
941                | LexTok::Dbar
942                | LexTok::Inpar
943                | LexTok::Outpar
944                | LexTok::Bang => {
945                    self.incondpat = false;
946                }
947                _ => {}
948            }
949        } else {
950            self.incondpat = false;
951        }
952
953        // Update command position for next token based on current token
954        // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
955        // so we don't set incmdpos after Bar in that context
956        match self.tok {
957            LexTok::Seper
958            | LexTok::Newlin
959            | LexTok::Semi
960            | LexTok::Dsemi
961            | LexTok::Semiamp
962            | LexTok::Semibar
963            | LexTok::Amper
964            | LexTok::Amperbang
965            | LexTok::Inpar
966            | LexTok::Inbrace
967            | LexTok::Dbar
968            | LexTok::Damper
969            | LexTok::Baramp
970            | LexTok::Inoutpar
971            | LexTok::Doloop
972            | LexTok::Then
973            | LexTok::Elif
974            | LexTok::Else
975            | LexTok::Doutbrack
976            | LexTok::Func => {
977                self.incmdpos = true;
978            }
979            LexTok::Bar
980                // In case patterns, | is a pattern separator - don't change incmdpos
981                if self.incasepat <= 0 => {
982                    self.incmdpos = true;
983                }
984            LexTok::String
985            | LexTok::Typeset
986            | LexTok::Envarray
987            | LexTok::Outpar
988            | LexTok::Case
989            | LexTok::Dinbrack => {
990                self.incmdpos = false;
991            }
992            _ => {}
993        }
994
995        // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
996        // When we see 'for', set infor=2 to expect the init and cond parts
997        // Each Dinpar (after semicolon in arithmetic) decrements it
998        if self.tok != LexTok::Dinpar {
999            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
1000        }
1001
1002
1003        // Handle redirection / for-loop context. Mirrors lex.c:359-368
1004        // ctxtlex `oldpos` save/restore. The saved value lives in
1005        // `self.oldpos` (struct field) so it survives across zshlex
1006        // calls — the previous local `let oldpos = self.incmdpos`
1007        // captured the JUST-updated value (always wrong) and lost the
1008        // pre-FOR incmdpos. With the field, FOR x → STRING x → INPAR
1009        // sequence correctly restores incmdpos=1 before the `(`.
1010        if self.tok.is_redirop()
1011            || self.tok == LexTok::For
1012            || self.tok == LexTok::Foreach
1013            || self.tok == LexTok::Select
1014        {
1015            self.inredir = true;
1016            self.oldpos = self.incmdpos;
1017            self.incmdpos = false;
1018        } else if self.inredir {
1019            self.incmdpos = self.oldpos;
1020            self.inredir = false;
1021        }
1022    }
1023
1024    /// Process pending here-documents. Walks each heredoc whose body
1025    /// hasn't been filled yet (content is empty AND terminator is set),
1026    /// reads lines from input until the terminator, and stuffs the body
1027    /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1028    /// parser can index into it after parse() finishes.
1029    fn process_heredocs(&mut self) {
1030        let n = self.heredocs.len();
1031        for i in 0..n {
1032            // Skip heredocs we've already processed AND those without
1033            // a terminator (early-error case). The `processed` bool
1034            // distinguishes "filled with empty body" from "not yet
1035            // visited" — both have empty `content`.
1036            if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1037                continue;
1038            }
1039            let strip_tabs = self.heredocs[i].strip_tabs;
1040            let terminator = self.heredocs[i].terminator.clone();
1041            let mut content = String::new();
1042            let mut line_count = 0;
1043
1044            loop {
1045                line_count += 1;
1046                if line_count > 10000 {
1047                    self.error = Some("heredoc exceeded 10000 lines".to_string());
1048                    self.tok = LexTok::Lexerr;
1049                    return;
1050                }
1051
1052                let line = self.read_line();
1053                if line.is_none() {
1054                    self.error = Some("here document too large or unterminated".to_string());
1055                    self.tok = LexTok::Lexerr;
1056                    return;
1057                }
1058
1059                let line = line.unwrap();
1060                let check_line = if strip_tabs {
1061                    line.trim_start_matches('\t')
1062                } else {
1063                    line.as_str()
1064                };
1065
1066                if check_line.trim_end_matches('\n') == terminator {
1067                    break;
1068                }
1069
1070                // `<<-` strips leading tabs from BODY lines too, not just
1071                // from terminator-match comparison. Without this, tabs in
1072                // here-doc content survive into stdin.
1073                if strip_tabs {
1074                    content.push_str(check_line);
1075                } else {
1076                    content.push_str(&line);
1077                }
1078            }
1079
1080            self.heredocs[i].content = content;
1081            self.heredocs[i].processed = true;
1082        }
1083    }
1084
1085    /// Read a line from input (returns partial line at EOF)
1086    fn read_line(&mut self) -> Option<String> {
1087        let mut line = String::new();
1088
1089        loop {
1090            match self.hgetc() {
1091                Some(c) => {
1092                    line.push(c);
1093                    if c == '\n' {
1094                        break;
1095                    }
1096                }
1097                None => {
1098                    // EOF - return partial line if any
1099                    if line.is_empty() {
1100                        return None;
1101                    }
1102                    break;
1103                }
1104            }
1105        }
1106
1107        Some(line)
1108    }
1109
1110    /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1111    /// `gettok`. Reads characters from the input via hgetc, dispatches
1112    /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1113    /// uses inline `match` in lex_initial / lex_inang / lex_outang
1114    /// since Rust pattern-matching subsumes the table dispatch).
1115    ///
1116    /// Structural divergence from C: the giant ~322-line C switch
1117    /// statement at lex.c:725-936 is split into helper methods in
1118    /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1119    /// lex_inang / lex_outang for the < and > arms). The flow is
1120    /// equivalent — same chars consumed, same tokens emitted — but
1121    /// the source-level layout differs. C's table-driven dispatch
1122    /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1123    /// which is what the helpers ultimately do.
1124    fn gettok(&mut self) -> LexTok {
1125        // lex.c:621 — `tokstr = NULL;` reset before each token.
1126        self.tokstr = None;
1127        // (zshrs-specific: tokfd reset lives here too — C does it
1128        // implicitly via the `peekfd = -1` local at lex.c:617 used
1129        // only when a digit-prefix redirection is detected.)
1130        self.tokfd = -1;
1131
1132        // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1133        // skip leading blanks (space/tab, NOT newline).
1134        let mut ws_iterations = 0;
1135        loop {
1136            ws_iterations += 1;
1137            if ws_iterations > 100_000 {
1138                self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1139                return LexTok::Lexerr;
1140            }
1141            let c = match self.hgetc() {
1142                Some(c) => c,
1143                None => {
1144                    // lex.c:624-625 — lexstop set, return ENDINPUT
1145                    // (or LEXERR if errflag is set elsewhere).
1146                    self.lexstop = true;
1147                    return if self.error.is_some() {
1148                        LexTok::Lexerr
1149                    } else {
1150                        LexTok::Endinput
1151                    };
1152                }
1153            };
1154
1155            if !Self::is_blank(c) {
1156                self.hungetc(c);
1157                break;
1158            }
1159        }
1160
1161        let c = match self.hgetc() {
1162            Some(c) => c,
1163            None => {
1164                self.lexstop = true;
1165                return LexTok::Endinput;
1166            }
1167        };
1168
1169        // lex.c:623 — `toklineno = lineno;`
1170        self.toklineno = self.lineno;
1171        // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1172        // blank.
1173        self.isfirstln = false;
1174
1175        // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1176        // call dquote_parse with `;` or `)` as the end-char and
1177        // either return DINPAR (continue for-loop arith) or DOUTPAR
1178        // (close the arith block) or LEXERR.
1179        if self.dbparens {
1180            return self.lex_arith(c);
1181        }
1182
1183        // lex.c:649-668 — digit prefix on a redirection: `2> file`
1184        // treats `2` as the fd to redirect, not a literal arg. Three
1185        // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1186        // anything else (push back, treat as literal digit).
1187        if Self::is_digit(c) {
1188            let d = self.hgetc();
1189            match d {
1190                Some('&') => {
1191                    let e = self.hgetc();
1192                    if e == Some('>') {
1193                        // lex.c:653-657 — `N&>` shape detected.
1194                        self.tokfd = (c as u8 - b'0') as i32;
1195                        self.hungetc('>');
1196                        return self.lex_initial('&');
1197                    }
1198                    // lex.c:658-661 — not `N&>`, push everything back.
1199                    if let Some(e) = e {
1200                        self.hungetc(e);
1201                    }
1202                    self.hungetc('&');
1203                }
1204                Some('>') | Some('<') => {
1205                    // lex.c:662-664 — `N>` or `N<` shape detected.
1206                    self.tokfd = (c as u8 - b'0') as i32;
1207                    return self.lex_initial(d.unwrap());
1208                }
1209                Some(d) => {
1210                    // lex.c:665-668 — not a redir prefix, push back.
1211                    self.hungetc(d);
1212                }
1213                None => {}
1214            }
1215            self.lexstop = false;
1216        }
1217
1218        // lex.c:670-936 — main dispatch on the leading char. zshrs
1219        // delegates to lex_initial which holds the equivalent of
1220        // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1221        // for LX1_OTHER.
1222        self.lex_initial(c)
1223    }
1224
1225    /// Lex (( ... )) arithmetic expression
1226    fn lex_arith(&mut self, c: char) -> LexTok {
1227        self.lexbuf.clear();
1228        self.hungetc(c);
1229
1230        let end_char = if self.infor > 0 { ';' } else { ')' };
1231        if self.dquote_parse(end_char, false).is_err() {
1232            return LexTok::Lexerr;
1233        }
1234
1235        self.tokstr = Some(self.lexbuf.as_str().to_string());
1236
1237        if !self.lexstop && self.infor > 0 {
1238            self.infor -= 1;
1239            return LexTok::Dinpar;
1240        }
1241
1242        // Check for closing ))
1243        match self.hgetc() {
1244            Some(')') => {
1245                self.dbparens = false;
1246                LexTok::Doutpar
1247            }
1248            c => {
1249                if let Some(c) = c {
1250                    self.hungetc(c);
1251                }
1252                LexTok::Lexerr
1253            }
1254        }
1255    }
1256
1257    /// Handle initial character of token
1258    fn lex_initial(&mut self, c: char) -> LexTok {
1259        // Handle comments
1260        if c == '#' && !self.nocomments {
1261            return self.lex_comment();
1262        }
1263
1264        match c {
1265            '\\' => {
1266                let d = self.hgetc();
1267                if d == Some('\n') {
1268                    // Line continuation - get next token
1269                    return self.gettok();
1270                }
1271                if let Some(d) = d {
1272                    self.hungetc(d);
1273                }
1274                self.lexstop = false;
1275                self.gettokstr(c, false)
1276            }
1277
1278            '\n' => LexTok::Newlin,
1279
1280            ';' => {
1281                let d = self.hgetc();
1282                match d {
1283                    Some(';') => LexTok::Dsemi,
1284                    Some('&') => LexTok::Semiamp,
1285                    Some('|') => LexTok::Semibar,
1286                    _ => {
1287                        if let Some(d) = d {
1288                            self.hungetc(d);
1289                        }
1290                        self.lexstop = false;
1291                        LexTok::Semi
1292                    }
1293                }
1294            }
1295
1296            '&' => {
1297                let d = self.hgetc();
1298                match d {
1299                    Some('&') => LexTok::Damper,
1300                    Some('!') | Some('|') => LexTok::Amperbang,
1301                    Some('>') => {
1302                        self.tokfd = self.tokfd.max(0);
1303                        let e = self.hgetc();
1304                        match e {
1305                            Some('!') | Some('|') => LexTok::Outangampbang,
1306                            Some('>') => {
1307                                let f = self.hgetc();
1308                                match f {
1309                                    Some('!') | Some('|') => LexTok::Doutangampbang,
1310                                    _ => {
1311                                        if let Some(f) = f {
1312                                            self.hungetc(f);
1313                                        }
1314                                        self.lexstop = false;
1315                                        LexTok::Doutangamp
1316                                    }
1317                                }
1318                            }
1319                            _ => {
1320                                if let Some(e) = e {
1321                                    self.hungetc(e);
1322                                }
1323                                self.lexstop = false;
1324                                LexTok::Ampoutang
1325                            }
1326                        }
1327                    }
1328                    _ => {
1329                        if let Some(d) = d {
1330                            self.hungetc(d);
1331                        }
1332                        self.lexstop = false;
1333                        LexTok::Amper
1334                    }
1335                }
1336            }
1337
1338            '|' => {
1339                let d = self.hgetc();
1340                match d {
1341                    Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1342                    Some('&') => LexTok::Baramp,
1343                    _ => {
1344                        if let Some(d) = d {
1345                            self.hungetc(d);
1346                        }
1347                        self.lexstop = false;
1348                        LexTok::Bar
1349                    }
1350                }
1351            }
1352
1353            '(' => {
1354                let d = self.hgetc();
1355                match d {
1356                    Some('(') => {
1357                        if self.infor > 0 {
1358                            self.dbparens = true;
1359                            return LexTok::Dinpar;
1360                        }
1361                        if self.incmdpos {
1362                            // Could be (( arithmetic )) or ( subshell )
1363                            self.lexbuf.clear();
1364                            match self.cmd_or_math() {
1365                                CmdOrMath::Math => {
1366                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
1367                                    return LexTok::Dinpar;
1368                                }
1369                                CmdOrMath::Cmd => {
1370                                    self.tokstr = None;
1371                                    return LexTok::Inpar;
1372                                }
1373                                CmdOrMath::Err => return LexTok::Lexerr,
1374                            }
1375                        }
1376                        self.hungetc('(');
1377                        self.lexstop = false;
1378                        self.gettokstr('(', false)
1379                    }
1380                    Some(')') => LexTok::Inoutpar,
1381                    _ => {
1382                        if let Some(d) = d {
1383                            self.hungetc(d);
1384                        }
1385                        self.lexstop = false;
1386                        // Per lex.c:822 LX1_INPAR — at word boundary `(`
1387                        // tokenizes as INPAR when SHGLOB || incond==1 ||
1388                        // incmdpos. Otherwise falls through to gettokstr
1389                        // (the `(` becomes start of a STRING — typical
1390                        // for unquoted glob args like `ls (^foo)*`).
1391                        // For `for x ( ... )` form, incmdpos is restored
1392                        // to 1 via the oldpos-save-after-FOR mechanism,
1393                        // so the next-token `(` correctly INPAR-izes.
1394                        if self.incond == 1 || self.incmdpos || self.incasepat >= 1 {
1395                            LexTok::Inpar
1396                        } else {
1397                            self.gettokstr('(', false)
1398                        }
1399                    }
1400                }
1401            }
1402
1403            ')' => LexTok::Outpar,
1404
1405            '{' => {
1406                // { is a command group only if followed by whitespace,
1407                // newline, or `}` (the empty-block form `{}`). zsh
1408                // treats `{}` as an empty compound — `foo() {}` is a
1409                // valid no-op function. Without `}` in this list,
1410                // `{}` got consumed as one literal token and ran as a
1411                // command, failing "command not found: {}".
1412                // The empty `{}` is also recognised AFTER a function
1413                // header `name()` even when `incmdpos` got cleared by
1414                // the preceding Outpar — peek for `}` regardless and
1415                // treat as Inbrace so `foo() {}` parses as a no-op
1416                // function body.
1417                let next = self.hgetc();
1418                let next_is_close = matches!(next, Some('}'));
1419                if self.incmdpos {
1420                    let is_brace_group = matches!(next, Some(' ' | '\t' | '\n' | '}') | None);
1421                    if let Some(ch) = next {
1422                        self.hungetc(ch);
1423                    }
1424                    if is_brace_group {
1425                        self.tokstr = Some("{".to_string());
1426                        LexTok::Inbrace
1427                    } else {
1428                        self.gettokstr(c, false)
1429                    }
1430                } else if next_is_close {
1431                    // `{}` empty block in non-cmd position (function
1432                    // body after `()`). Treat as Inbrace; the parser
1433                    // will follow with Outbrace.
1434                    if let Some(ch) = next {
1435                        self.hungetc(ch);
1436                    }
1437                    self.tokstr = Some("{".to_string());
1438                    LexTok::Inbrace
1439                } else {
1440                    if let Some(ch) = next {
1441                        self.hungetc(ch);
1442                    }
1443                    self.gettokstr(c, false)
1444                }
1445            }
1446
1447            '}' => {
1448                // } at start of token is always Outbrace (ends command group)
1449                // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1450                self.tokstr = Some("}".to_string());
1451                LexTok::Outbrace
1452            }
1453
1454            '[' => {
1455                // [[ is a conditional expression start
1456                // [ can also be a command (test builtin) or array subscript
1457                // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1458                if self.incasepat > 0 {
1459                    self.gettokstr(c, false)
1460                } else if self.incmdpos {
1461                    let next = self.hgetc();
1462                    if next == Some('[') {
1463                        // [[ - double bracket conditional
1464                        self.tokstr = Some("[[".to_string());
1465                        self.incond = 1;
1466                        return LexTok::Dinbrack;
1467                    }
1468                    // Single [ - either test command or start of glob pattern
1469                    if let Some(ch) = next {
1470                        self.hungetc(ch);
1471                    }
1472                    self.tokstr = Some("[".to_string());
1473                    LexTok::String
1474                } else {
1475                    self.gettokstr(c, false)
1476                }
1477            }
1478
1479            ']' => {
1480                // ]] ends a conditional expression started by [[
1481                if self.incond > 0 {
1482                    let next = self.hgetc();
1483                    if next == Some(']') {
1484                        self.tokstr = Some("]]".to_string());
1485                        self.incond = 0;
1486                        return LexTok::Doutbrack;
1487                    }
1488                    if let Some(ch) = next {
1489                        self.hungetc(ch);
1490                    }
1491                }
1492                self.gettokstr(c, false)
1493            }
1494
1495            '<' => {
1496                // In pattern context, < is literal (e.g., <-> in glob)
1497                if self.incondpat || self.incasepat > 0 {
1498                    self.gettokstr(c, false)
1499                } else {
1500                    self.lex_inang()
1501                }
1502            }
1503
1504            '>' => {
1505                // In pattern context, > is literal
1506                if self.incondpat || self.incasepat > 0 {
1507                    self.gettokstr(c, false)
1508                } else {
1509                    self.lex_outang()
1510                }
1511            }
1512
1513            _ => self.gettokstr(c, false),
1514        }
1515    }
1516
1517    /// Lex comment
1518    fn lex_comment(&mut self) -> LexTok {
1519        if self.lexflags.comments_keep {
1520            self.lexbuf.clear();
1521            self.add('#');
1522        }
1523
1524        loop {
1525            let c = self.hgetc();
1526            match c {
1527                Some('\n') | None => break,
1528                Some(c) => {
1529                    if self.lexflags.comments_keep {
1530                        self.add(c);
1531                    }
1532                }
1533            }
1534        }
1535
1536        if self.lexflags.comments_keep {
1537            self.tokstr = Some(self.lexbuf.as_str().to_string());
1538            if !self.lexstop {
1539                self.hungetc('\n');
1540            }
1541            return LexTok::String;
1542        }
1543
1544        if self.lexflags.comments_strip && self.lexstop {
1545            return LexTok::Endinput;
1546        }
1547
1548        LexTok::Newlin
1549    }
1550
1551    /// Lex < and variants
1552    fn lex_inang(&mut self) -> LexTok {
1553        let d = self.hgetc();
1554        match d {
1555            Some('(') => {
1556                // Process substitution <(...)
1557                self.hungetc('(');
1558                self.lexstop = false;
1559                self.gettokstr('<', false)
1560            }
1561            Some('>') => LexTok::Inoutang,
1562            Some('<') => {
1563                let e = self.hgetc();
1564                match e {
1565                    Some('(') => {
1566                        self.hungetc('(');
1567                        self.hungetc('<');
1568                        LexTok::Inang
1569                    }
1570                    Some('<') => LexTok::Trinang,
1571                    Some('-') => {
1572                        self.heredoc_pending = 2; // <<- expects terminator next
1573                        LexTok::Dinangdash
1574                    }
1575                    _ => {
1576                        if let Some(e) = e {
1577                            self.hungetc(e);
1578                        }
1579                        self.lexstop = false;
1580                        self.heredoc_pending = 1; // << expects terminator next
1581                        LexTok::Dinang
1582                    }
1583                }
1584            }
1585            Some('&') => LexTok::Inangamp,
1586            _ => {
1587                if let Some(d) = d {
1588                    self.hungetc(d);
1589                }
1590                self.lexstop = false;
1591                LexTok::Inang
1592            }
1593        }
1594    }
1595
1596    /// Lex > and variants
1597    fn lex_outang(&mut self) -> LexTok {
1598        let d = self.hgetc();
1599        match d {
1600            Some('(') => {
1601                // Process substitution >(...)
1602                self.hungetc('(');
1603                self.lexstop = false;
1604                self.gettokstr('>', false)
1605            }
1606            Some('&') => {
1607                let e = self.hgetc();
1608                match e {
1609                    Some('!') | Some('|') => LexTok::Outangampbang,
1610                    _ => {
1611                        if let Some(e) = e {
1612                            self.hungetc(e);
1613                        }
1614                        self.lexstop = false;
1615                        LexTok::Outangamp
1616                    }
1617                }
1618            }
1619            Some('!') | Some('|') => LexTok::Outangbang,
1620            Some('>') => {
1621                let e = self.hgetc();
1622                match e {
1623                    Some('&') => {
1624                        let f = self.hgetc();
1625                        match f {
1626                            Some('!') | Some('|') => LexTok::Doutangampbang,
1627                            _ => {
1628                                if let Some(f) = f {
1629                                    self.hungetc(f);
1630                                }
1631                                self.lexstop = false;
1632                                LexTok::Doutangamp
1633                            }
1634                        }
1635                    }
1636                    Some('!') | Some('|') => LexTok::Doutangbang,
1637                    Some('(') => {
1638                        self.hungetc('(');
1639                        self.hungetc('>');
1640                        LexTok::Outang
1641                    }
1642                    _ => {
1643                        if let Some(e) = e {
1644                            self.hungetc(e);
1645                        }
1646                        self.lexstop = false;
1647                        LexTok::Doutang
1648                    }
1649                }
1650            }
1651            _ => {
1652                if let Some(d) = d {
1653                    self.hungetc(d);
1654                }
1655                self.lexstop = false;
1656                LexTok::Outang
1657            }
1658        }
1659    }
1660
1661    /// Get rest of token string
1662    fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
1663        let mut bct = 0; // brace count
1664        let mut pct = 0; // parenthesis count
1665        let mut brct = 0; // bracket count
1666        let mut in_brace_param = 0;
1667        let mut peek = LexTok::String;
1668        let mut intpos = 1;
1669        let mut unmatched = '\0';
1670        let mut c = c;
1671        const MAX_ITERATIONS: usize = 100_000;
1672        let mut iterations = 0;
1673
1674        if !sub {
1675            self.lexbuf.clear();
1676        }
1677
1678        loop {
1679            iterations += 1;
1680            if iterations > MAX_ITERATIONS {
1681                self.error = Some("gettokstr exceeded maximum iterations".to_string());
1682                return LexTok::Lexerr;
1683            }
1684
1685            let inbl = Self::is_inblank(c);
1686
1687            if inbl && in_brace_param == 0 && pct == 0 {
1688                // Whitespace outside brace param ends token
1689                break;
1690            }
1691
1692            match c {
1693                // Whitespace is handled above for most cases
1694                ')' => {
1695                    if in_brace_param > 0 || sub {
1696                        self.add(char_tokens::OUTPAR);
1697                    } else if pct > 0 {
1698                        pct -= 1;
1699                        self.add(char_tokens::OUTPAR);
1700                    } else {
1701                        break;
1702                    }
1703                }
1704
1705                '|' => {
1706                    if pct == 0 && in_brace_param == 0 {
1707                        if sub {
1708                            self.add(c);
1709                        } else {
1710                            break;
1711                        }
1712                    } else {
1713                        self.add(char_tokens::BAR);
1714                    }
1715                }
1716
1717                '$' => {
1718                    let e = self.hgetc();
1719                    match e {
1720                        Some('\\') => {
1721                            let f = self.hgetc();
1722                            if f != Some('\n') {
1723                                if let Some(f) = f {
1724                                    self.hungetc(f);
1725                                }
1726                                self.hungetc('\\');
1727                                self.add(char_tokens::STRING);
1728                            } else {
1729                                // Line continuation after $
1730                                continue;
1731                            }
1732                        }
1733                        Some('[') => {
1734                            // $[...] arithmetic
1735                            self.add(char_tokens::STRING);
1736                            self.add(char_tokens::INBRACK);
1737                            if self.dquote_parse(']', sub).is_err() {
1738                                peek = LexTok::Lexerr;
1739                                break;
1740                            }
1741                            self.add(char_tokens::OUTBRACK);
1742                        }
1743                        Some('(') => {
1744                            // $(...) or $((...))
1745                            self.add(char_tokens::STRING);
1746                            match self.cmd_or_math_sub() {
1747                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
1748                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
1749                                CmdOrMath::Err => {
1750                                    peek = LexTok::Lexerr;
1751                                    break;
1752                                }
1753                            }
1754                        }
1755                        Some('{') => {
1756                            self.add(c);
1757                            self.add(char_tokens::INBRACE);
1758                            bct += 1;
1759                            if in_brace_param == 0 {
1760                                in_brace_param = bct;
1761                            }
1762                        }
1763                        Some('\'') => {
1764                            // $'...' ANSI-C escape syntax.
1765                            // Port of Src/lex.c:1284-1314 (LX2_QUOTE
1766                            // branch when prev char was `String`):
1767                            // only `\\` and `\'` emit a `Bnull`
1768                            // marker (so getkeystring later
1769                            // recognizes them as user-literal); any
1770                            // other `\X` emits a literal `\` + the
1771                            // following char so getkeystring's
1772                            // standard `\n`/`\x`/`\u`/... decoding
1773                            // can fire.
1774                            self.add(char_tokens::QSTRING);
1775                            self.add(char_tokens::SNULL);
1776                            loop {
1777                                let ch = self.hgetc();
1778                                match ch {
1779                                    Some('\'') => break,
1780                                    Some('\\') => {
1781                                        let next = self.hgetc();
1782                                        match next {
1783                                            Some(n) => {
1784                                                if n == '\\' || n == '\'' {
1785                                                    self.add(char_tokens::BNULL);
1786                                                } else {
1787                                                    self.add('\\');
1788                                                }
1789                                                self.add(n);
1790                                            }
1791                                            None => {
1792                                                self.lexstop = true;
1793                                                unmatched = '\'';
1794                                                peek = LexTok::Lexerr;
1795                                                break;
1796                                            }
1797                                        }
1798                                    }
1799                                    Some(ch) => self.add(ch),
1800                                    None => {
1801                                        self.lexstop = true;
1802                                        unmatched = '\'';
1803                                        peek = LexTok::Lexerr;
1804                                        break;
1805                                    }
1806                                }
1807                            }
1808                            if unmatched != '\0' {
1809                                break;
1810                            }
1811                            self.add(char_tokens::SNULL);
1812                        }
1813                        Some('"') => {
1814                            // $"..." localized string. Same shape as a
1815                            // plain "..." but flagged via QSTRING+DNULL
1816                            // so post-lex translation can substitute.
1817                            self.add(char_tokens::QSTRING);
1818                            self.add(char_tokens::DNULL);
1819                            if self.dquote_parse('"', sub).is_err() {
1820                                peek = LexTok::Lexerr;
1821                                break;
1822                            }
1823                            self.add(char_tokens::DNULL);
1824                        }
1825                        _ => {
1826                            if let Some(e) = e {
1827                                self.hungetc(e);
1828                            }
1829                            self.lexstop = false;
1830                            self.add(char_tokens::STRING);
1831                        }
1832                    }
1833                }
1834
1835                '[' => {
1836                    if in_brace_param == 0 {
1837                        brct += 1;
1838                    }
1839                    self.add(char_tokens::INBRACK);
1840                }
1841
1842                ']' => {
1843                    if in_brace_param == 0 && brct > 0 {
1844                        brct -= 1;
1845                    }
1846                    self.add(char_tokens::OUTBRACK);
1847                }
1848
1849                '(' => {
1850                    // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
1851                    // a STRING and is immediately followed by `)`, the
1852                    // string terminates at the `(`. The `()` is then
1853                    // re-lexed as a separate INOUTPAR token. This handles
1854                    // function definitions: `name()` lexes as STRING `name`
1855                    // + INOUTPAR `()`, not STRING `name()`.
1856                    //
1857                    // Also (lex.c:1109-1112): under SHGLOB, a `(` followed
1858                    // by whitespace at the start of a command-position word
1859                    // (no nested brackets/braces) is a ksh function
1860                    // definition signal — same break-out behavior.
1861                    if in_brace_param == 0 && !sub {
1862                        let e = self.hgetc();
1863                        if let Some(ch) = e {
1864                            self.hungetc(ch);
1865                        }
1866                        self.lexstop = false;
1867                        if e == Some(')') {
1868                            // `name()` — terminate STRING at `(` so the
1869                            // following `()` re-lexes as INOUTPAR. The
1870                            // loop's exit guard at line 2067 will
1871                            // `hungetc(c)` to push the `(` back; we only
1872                            // need to ensure `)` is also there. The
1873                            // hungetc(ch) above already pushed `)`, so
1874                            // breaking here yields unget_buf = [`(`, `)`]
1875                            // after the guard, which the outer dispatch
1876                            // reads as Inoutpar.
1877                            break;
1878                        }
1879                    }
1880                    if in_brace_param == 0 {
1881                        pct += 1;
1882                    }
1883                    self.add(char_tokens::INPAR);
1884                }
1885
1886                '{' => {
1887                    // Track braces for both ${...} param expansion and {...} brace expansion
1888                    bct += 1;
1889                    self.add(c);
1890                }
1891
1892                '}' => {
1893                    if in_brace_param > 0 {
1894                        if bct == in_brace_param {
1895                            in_brace_param = 0;
1896                        }
1897                        bct -= 1;
1898                        self.add(char_tokens::OUTBRACE);
1899                    } else if bct > 0 {
1900                        // Closing a brace expansion like {a,b}
1901                        bct -= 1;
1902                        self.add(c);
1903                    } else {
1904                        break;
1905                    }
1906                }
1907
1908                '>' => {
1909                    // In pattern context (incondpat), > is literal
1910                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1911                        self.add(c);
1912                    } else {
1913                        let e = self.hgetc();
1914                        if e != Some('(') {
1915                            if let Some(e) = e {
1916                                self.hungetc(e);
1917                            }
1918                            self.lexstop = false;
1919                            break;
1920                        }
1921                        // >(...)
1922                        self.add(char_tokens::OUTANGPROC);
1923                        if self.skip_command_sub().is_err() {
1924                            peek = LexTok::Lexerr;
1925                            break;
1926                        }
1927                        self.add(char_tokens::OUTPAR);
1928                    }
1929                }
1930
1931                '<' => {
1932                    // In pattern context (incondpat), < is literal
1933                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1934                        self.add(c);
1935                    } else if let Some(range_chars) = self.try_numeric_range_glob() {
1936                        // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
1937                        // `<-M>`. When `<` mid-word matches that exact
1938                        // shape, swallow it into the word instead of
1939                        // breaking out for redirection.
1940                        self.add(c);
1941                        for ch in range_chars.chars() {
1942                            self.add(ch);
1943                        }
1944                    } else {
1945                        let e = self.hgetc();
1946                        if e != Some('(') {
1947                            if let Some(e) = e {
1948                                self.hungetc(e);
1949                            }
1950                            self.lexstop = false;
1951                            break;
1952                        }
1953                        // <(...)
1954                        self.add(char_tokens::INANG);
1955                        if self.skip_command_sub().is_err() {
1956                            peek = LexTok::Lexerr;
1957                            break;
1958                        }
1959                        self.add(char_tokens::OUTPAR);
1960                    }
1961                }
1962
1963                '=' => {
1964                    if !sub {
1965                        if intpos > 0 {
1966                            // At start of token, check for =(...) process substitution
1967                            let e = self.hgetc();
1968                            if e == Some('(') {
1969                                self.add(char_tokens::EQUALS);
1970                                if self.skip_command_sub().is_err() {
1971                                    peek = LexTok::Lexerr;
1972                                    break;
1973                                }
1974                                self.add(char_tokens::OUTPAR);
1975                            } else {
1976                                if let Some(e) = e {
1977                                    self.hungetc(e);
1978                                }
1979                                self.lexstop = false;
1980                                self.add(char_tokens::EQUALS);
1981                            }
1982                        } else if peek != LexTok::Envstring
1983                            && (self.incmdpos || self.intypeset)
1984                            && bct == 0
1985                            && brct == 0
1986                            && self.incasepat == 0
1987                        {
1988                            // Check for VAR=value assignment (but not in case pattern context)
1989                            let tok_so_far = self.lexbuf.as_str().to_string();
1990                            if self.is_valid_assignment_target(&tok_so_far) {
1991                                let next = self.hgetc();
1992                                if next == Some('(') {
1993                                    // VAR=(...) array assignment. Per zsh
1994                                    // (lex.c emits ENVARRAY with tokstr =
1995                                    // just the variable name, NOT
1996                                    // including the `=`). The `=` and
1997                                    // `(` are consumed by the lexer; the
1998                                    // parser knows ENVARRAY means assign-
1999                                    // array and reads the body that
2000                                    // follows.
2001                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
2002                                    return LexTok::Envarray;
2003                                }
2004                                if let Some(next) = next {
2005                                    self.hungetc(next);
2006                                }
2007                                self.lexstop = false;
2008                                peek = LexTok::Envstring;
2009                                intpos = 2;
2010                                self.add(char_tokens::EQUALS);
2011                            } else {
2012                                self.add(char_tokens::EQUALS);
2013                            }
2014                        } else {
2015                            self.add(char_tokens::EQUALS);
2016                        }
2017                    } else {
2018                        self.add(char_tokens::EQUALS);
2019                    }
2020                }
2021
2022                '\\' => {
2023                    let next = self.hgetc();
2024                    if next == Some('\n') {
2025                        // Line continuation
2026                        let next = self.hgetc();
2027                        if let Some(next) = next {
2028                            c = next;
2029                            continue;
2030                        }
2031                        break;
2032                    } else {
2033                        self.add(char_tokens::BNULL);
2034                        if let Some(next) = next {
2035                            self.add(next);
2036                        }
2037                    }
2038                }
2039
2040                '\'' => {
2041                    // Single quoted string - everything literal until '
2042                    self.add(char_tokens::SNULL);
2043                    loop {
2044                        let ch = self.hgetc();
2045                        match ch {
2046                            Some('\'') => break,
2047                            Some(ch) => self.add(ch),
2048                            None => {
2049                                self.lexstop = true;
2050                                unmatched = '\'';
2051                                peek = LexTok::Lexerr;
2052                                break;
2053                            }
2054                        }
2055                    }
2056                    if unmatched != '\0' {
2057                        break;
2058                    }
2059                    self.add(char_tokens::SNULL);
2060                }
2061
2062                '"' => {
2063                    // Double quoted string
2064                    self.add(char_tokens::DNULL);
2065                    if self.dquote_parse('"', sub).is_err() {
2066                        unmatched = '"';
2067                        if !self.lexflags.active {
2068                            peek = LexTok::Lexerr;
2069                        }
2070                        break;
2071                    }
2072                    self.add(char_tokens::DNULL);
2073                }
2074
2075                '`' => {
2076                    // Backtick command substitution
2077                    self.add(char_tokens::TICK);
2078                    loop {
2079                        let ch = self.hgetc();
2080                        match ch {
2081                            Some('`') => break,
2082                            Some('\\') => {
2083                                let next = self.hgetc();
2084                                match next {
2085                                    Some('\n') => continue, // Line continuation
2086                                    Some(c) if c == '`' || c == '\\' || c == '$' => {
2087                                        self.add(char_tokens::BNULL);
2088                                        self.add(c);
2089                                    }
2090                                    Some(c) => {
2091                                        self.add('\\');
2092                                        self.add(c);
2093                                    }
2094                                    None => break,
2095                                }
2096                            }
2097                            Some(ch) => self.add(ch),
2098                            None => {
2099                                self.lexstop = true;
2100                                unmatched = '`';
2101                                peek = LexTok::Lexerr;
2102                                break;
2103                            }
2104                        }
2105                    }
2106                    if unmatched != '\0' {
2107                        break;
2108                    }
2109                    self.add(char_tokens::TICK);
2110                }
2111
2112                '~' => {
2113                    self.add(char_tokens::TILDE);
2114                }
2115
2116                '#' => {
2117                    self.add(char_tokens::POUND);
2118                }
2119
2120                '^' => {
2121                    self.add(char_tokens::HAT);
2122                }
2123
2124                '*' => {
2125                    self.add(char_tokens::STAR);
2126                }
2127
2128                '?' => {
2129                    self.add(char_tokens::QUEST);
2130                }
2131
2132                ',' if bct > in_brace_param => {
2133                    self.add(char_tokens::COMMA);
2134                }
2135
2136                '-' => {
2137                    self.add(char_tokens::DASH);
2138                }
2139
2140                '!' if brct > 0 => {
2141                    self.add(char_tokens::BANG);
2142                }
2143
2144                // Terminators — but only when we're at the top level of
2145                // the current word. Inside a brace parameter expansion
2146                // `${...}`, parenthesized flag block `(@s.;.)`, or
2147                // bracketed subscript `[...]`, `;` is just a delimiter
2148                // character (e.g. the field separator in `(@s.;.)`),
2149                // not a statement terminator. Real zsh handles this
2150                // via gettokstr's incmdpos / bct / pct accounting; we
2151                // gate on the same counters.
2152                '\n' | ';' | '&' if in_brace_param == 0 && pct == 0 && brct == 0 => {
2153                    break;
2154                }
2155                '\n' | ';' | '&' => {
2156                    self.add(c);
2157                }
2158
2159                _ => {
2160                    self.add(c);
2161                }
2162            }
2163
2164            c = match self.hgetc() {
2165                Some(c) => c,
2166                None => {
2167                    self.lexstop = true;
2168                    break;
2169                }
2170            };
2171
2172            if intpos > 0 {
2173                intpos -= 1;
2174            }
2175        }
2176
2177        // Put back the character that ended the token
2178        if !self.lexstop {
2179            self.hungetc(c);
2180        }
2181
2182        if unmatched != '\0' && !self.lexflags.active {
2183            self.error = Some(format!("unmatched {}", unmatched));
2184        }
2185
2186        if in_brace_param > 0 {
2187            self.error = Some("closing brace expected".to_string());
2188        }
2189
2190        self.tokstr = Some(self.lexbuf.as_str().to_string());
2191        peek
2192    }
2193
2194    /// Check if a string is a valid assignment target (identifier or array ref).
2195    ///
2196    /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2197    /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2198    /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2199    /// are caught by the subscript handler, not here). And the first char
2200    /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2201    /// the STRING token 0x85) is parameter substitution with the `=` flag,
2202    /// NOT an envstring assignment.
2203    fn is_valid_assignment_target(&self, s: &str) -> bool {
2204        let mut chars = s.chars().peekable();
2205
2206        // Reject leading token byte — `$VAR=` is parameter substitution,
2207        // not assignment. Same for `*=`, `?=`, etc.
2208        if let Some(&c) = chars.peek() {
2209            if char_tokens::is_token(c) {
2210                return false;
2211            }
2212        }
2213
2214        // Check for leading digit (invalid)
2215        if let Some(&c) = chars.peek() {
2216            if c.is_ascii_digit() {
2217                // Could be array index, check rest
2218                while let Some(&c) = chars.peek() {
2219                    if !c.is_ascii_digit() {
2220                        break;
2221                    }
2222                    chars.next();
2223                }
2224                return chars.peek().is_none();
2225            }
2226        }
2227
2228        // Check identifier
2229        let mut has_ident = false;
2230        while let Some(&c) = chars.peek() {
2231            if c == char_tokens::INBRACK || c == '[' {
2232                break;
2233            }
2234            if c == '+' {
2235                // foo+=value
2236                chars.next();
2237                return chars.peek().is_none() || chars.peek() == Some(&'=');
2238            }
2239            if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2240                return false;
2241            }
2242            has_ident = true;
2243            chars.next();
2244        }
2245
2246        has_ident
2247    }
2248
2249    /// Parse the body of a double-quoted string (or any context that
2250    /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2251    /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2252    /// chars until `endchar` is seen at depth 0, handling escapes,
2253    /// `${...}` parameter substitutions, `$(...)` and backtick command
2254    /// substitutions, `$((...))` arithmetic, and inner double-quoted
2255    /// strings. The `sub` flag toggles substitution-context tokens
2256    /// (lex.c:1487 `int sub` argument).
2257    ///
2258    /// zshrs port note: the recursion guard at the top is a Rust
2259    /// safety net; the C source relies on the runtime stack. Inner
2260    /// logic delegates to `dquote_parse_inner` which holds the actual
2261    /// per-char state machine matching lex.c:1495-1692.
2262    fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2263        self.recursion_depth += 1;
2264        if self.check_recursion() {
2265            self.recursion_depth -= 1;
2266            return Err(());
2267        }
2268
2269        let result = self.dquote_parse_inner(endchar, sub);
2270        self.recursion_depth -= 1;
2271        result
2272    }
2273
2274    fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2275        let mut pct = 0; // parenthesis count
2276        let mut brct = 0; // bracket count
2277        let mut bct = 0; // brace count (for ${...})
2278        let mut intick = false; // inside backtick
2279        let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
2280        const MAX_ITERATIONS: usize = 100_000;
2281        let mut iterations = 0;
2282
2283        loop {
2284            iterations += 1;
2285            if iterations > MAX_ITERATIONS {
2286                self.error = Some("dquote_parse exceeded maximum iterations".to_string());
2287                return Err(());
2288            }
2289            let c = self.hgetc();
2290            let c = match c {
2291                Some(c) if c == endchar && !intick && bct == 0 => {
2292                    if is_math && (pct > 0 || brct > 0) {
2293                        self.add(c);
2294                        if c == ')' {
2295                            pct -= 1;
2296                        } else if c == ']' {
2297                            brct -= 1;
2298                        }
2299                        continue;
2300                    }
2301                    return Ok(());
2302                }
2303                Some(c) => c,
2304                None => {
2305                    self.lexstop = true;
2306                    return Err(());
2307                }
2308            };
2309
2310            match c {
2311                '\\' => {
2312                    let next = self.hgetc();
2313                    match next {
2314                        Some('\n') if !sub => continue, // Line continuation
2315                        Some(c)
2316                            if c == '$'
2317                                || c == '\\'
2318                                || (c == '}' && !intick && bct > 0)
2319                                || c == endchar
2320                                || c == '`'
2321                                || (endchar == ']'
2322                                    && (c == '['
2323                                        || c == ']'
2324                                        || c == '('
2325                                        || c == ')'
2326                                        || c == '{'
2327                                        || c == '}'
2328                                        || (c == '"' && sub))) =>
2329                        {
2330                            self.add(char_tokens::BNULL);
2331                            self.add(c);
2332                        }
2333                        Some(c) => {
2334                            self.add('\\');
2335                            self.hungetc(c);
2336                            continue;
2337                        }
2338                        None => {
2339                            self.add('\\');
2340                        }
2341                    }
2342                }
2343
2344                '$' => {
2345                    if intick {
2346                        self.add(c);
2347                        continue;
2348                    }
2349                    let next = self.hgetc();
2350                    match next {
2351                        Some('(') => {
2352                            self.add(char_tokens::QSTRING);
2353                            match self.cmd_or_math_sub() {
2354                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
2355                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
2356                                CmdOrMath::Err => return Err(()),
2357                            }
2358                        }
2359                        Some('[') => {
2360                            self.add(char_tokens::STRING);
2361                            self.add(char_tokens::INBRACK);
2362                            self.dquote_parse(']', sub)?;
2363                            self.add(char_tokens::OUTBRACK);
2364                        }
2365                        Some('{') => {
2366                            self.add(char_tokens::QSTRING);
2367                            self.add(char_tokens::INBRACE);
2368                            bct += 1;
2369                        }
2370                        Some('$') => {
2371                            self.add(char_tokens::QSTRING);
2372                            self.add('$');
2373                        }
2374                        _ => {
2375                            if let Some(next) = next {
2376                                self.hungetc(next);
2377                            }
2378                            self.lexstop = false;
2379                            self.add(char_tokens::QSTRING);
2380                        }
2381                    }
2382                }
2383
2384                '}' => {
2385                    if intick || bct == 0 {
2386                        self.add(c);
2387                    } else {
2388                        self.add(char_tokens::OUTBRACE);
2389                        bct -= 1;
2390                    }
2391                }
2392
2393                '`' => {
2394                    self.add(char_tokens::QTICK);
2395                    intick = !intick;
2396                }
2397
2398                '(' => {
2399                    if !is_math || bct == 0 {
2400                        pct += 1;
2401                    }
2402                    self.add(c);
2403                }
2404
2405                ')' => {
2406                    if !is_math || bct == 0 {
2407                        if pct == 0 && is_math {
2408                            return Err(());
2409                        }
2410                        pct -= 1;
2411                    }
2412                    self.add(c);
2413                }
2414
2415                '[' => {
2416                    if !is_math || bct == 0 {
2417                        brct += 1;
2418                    }
2419                    self.add(c);
2420                }
2421
2422                ']' => {
2423                    if !is_math || bct == 0 {
2424                        if brct == 0 && is_math {
2425                            return Err(());
2426                        }
2427                        brct -= 1;
2428                    }
2429                    self.add(c);
2430                }
2431
2432                '"' => {
2433                    if intick || (endchar != '"' && bct == 0) {
2434                        self.add(c);
2435                    } else if bct > 0 {
2436                        self.add(char_tokens::DNULL);
2437                        self.dquote_parse('"', sub)?;
2438                        self.add(char_tokens::DNULL);
2439                    } else {
2440                        return Err(());
2441                    }
2442                }
2443
2444                _ => {
2445                    self.add(c);
2446                }
2447            }
2448        }
2449    }
2450
2451    /// Determine if (( is arithmetic or command
2452    /// Decide whether `( ... )` after a `$` is a math expression
2453    /// `$((...))` or a command substitution `$(...)`. Direct port of
2454    /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2455    /// if it succeeds AND the next char is `)` (closing the second
2456    /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2457    /// a command substitution.
2458    fn cmd_or_math(&mut self) -> CmdOrMath {
2459        let oldlen = self.lexbuf.len();
2460
2461        // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2462        // which fills lexbuf with ONLY the inner expression, then checks
2463        // for the closing `)`. The surrounding `((` / `))` are NOT added
2464        // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2465        // ')' after, polluting DINPAR's tokstr with the literal parens.
2466        // Removed to match C exactly.
2467        if self.dquote_parse(')', false).is_err() {
2468            // Back up and try as command
2469            while self.lexbuf.len() > oldlen {
2470                if let Some(c) = self.lexbuf.pop() {
2471                    self.hungetc(c);
2472                }
2473            }
2474            self.hungetc('(');
2475            self.lexstop = false;
2476            return if self.skip_command_sub().is_err() {
2477                CmdOrMath::Err
2478            } else {
2479                CmdOrMath::Cmd
2480            };
2481        }
2482
2483        // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2484        // means `((..))` was math. Don't add `)` to lexbuf.
2485        let c = self.hgetc();
2486        if c == Some(')') {
2487            return CmdOrMath::Math;
2488        }
2489
2490        // Not math, back up
2491        if let Some(c) = c {
2492            self.hungetc(c);
2493        }
2494        self.lexstop = false;
2495
2496        // Back up token
2497        while self.lexbuf.len() > oldlen {
2498            if let Some(c) = self.lexbuf.pop() {
2499                self.hungetc(c);
2500            }
2501        }
2502        self.hungetc('(');
2503
2504        if self.skip_command_sub().is_err() {
2505            CmdOrMath::Err
2506        } else {
2507            CmdOrMath::Cmd
2508        }
2509    }
2510
2511    /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2512    /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2513    /// the next char to discriminate: a leading `(` plus successful
2514    /// math parse via `cmd_or_math` → arithmetic substitution (with
2515    /// the open-paren retroactively rewritten to Inparmath); else
2516    /// command substitution via skip_command_sub.
2517    fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2518        const MAX_CONTINUATIONS: usize = 10_000;
2519        let mut continuations = 0;
2520
2521        loop {
2522            continuations += 1;
2523            if continuations > MAX_CONTINUATIONS {
2524                self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2525                return CmdOrMath::Err;
2526            }
2527
2528            let c = self.hgetc();
2529            if c == Some('\\') {
2530                let c2 = self.hgetc();
2531                if c2 != Some('\n') {
2532                    if let Some(c2) = c2 {
2533                        self.hungetc(c2);
2534                    }
2535                    self.hungetc('\\');
2536                    self.lexstop = false;
2537                    return if self.skip_command_sub().is_err() {
2538                        CmdOrMath::Err
2539                    } else {
2540                        CmdOrMath::Cmd
2541                    };
2542                }
2543                // Line continuation, try again (loop instead of recursion)
2544                continue;
2545            }
2546
2547            // Not a line continuation, process normally
2548            if c == Some('(') {
2549                // Might be $((...))
2550                let lexpos = self.lexbuf.len();
2551                self.add(char_tokens::INPAR);
2552                self.add('(');
2553
2554                if self.dquote_parse(')', false).is_ok() {
2555                    let c2 = self.hgetc();
2556                    if c2 == Some(')') {
2557                        self.add(')');
2558                        return CmdOrMath::Math;
2559                    }
2560                    if let Some(c2) = c2 {
2561                        self.hungetc(c2);
2562                    }
2563                }
2564
2565                // Not math, restore and parse as command
2566                while self.lexbuf.len() > lexpos {
2567                    if let Some(ch) = self.lexbuf.pop() {
2568                        self.hungetc(ch);
2569                    }
2570                }
2571                self.hungetc('(');
2572                self.lexstop = false;
2573            } else {
2574                if let Some(c) = c {
2575                    self.hungetc(c);
2576                }
2577                self.lexstop = false;
2578            }
2579
2580            return if self.skip_command_sub().is_err() {
2581                CmdOrMath::Err
2582            } else {
2583                CmdOrMath::Cmd
2584            };
2585        }
2586    }
2587
2588    /// Skip over `(...)` for command-style substitutions: `$(...)`,
2589    /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2590    /// `skipcomm`. Per the C source comment: "we'll parse the input
2591    /// until we find an unmatched closing parenthesis. However, we'll
2592    /// throw away the result of the parsing and just keep the string
2593    /// we've built up on the way."
2594    ///
2595    /// zshrs port note: the C source uses zcontext_save/restore +
2596    /// strinbeg/inpush to set up an isolated lex context for the
2597    /// throw-away parse. zshrs's standalone walker tracks paren
2598    /// depth directly without re-entering the parser. Same
2599    /// invariant: stops at the matching `)`.
2600    fn skip_command_sub(&mut self) -> Result<(), ()> {
2601        let mut pct = 1;
2602        let mut start = true;
2603        const MAX_ITERATIONS: usize = 100_000;
2604        let mut iterations = 0;
2605
2606        self.add(char_tokens::INPAR);
2607
2608        loop {
2609            iterations += 1;
2610            if iterations > MAX_ITERATIONS {
2611                self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2612                return Err(());
2613            }
2614
2615            let c = self.hgetc();
2616            let c = match c {
2617                Some(c) => c,
2618                None => {
2619                    self.lexstop = true;
2620                    return Err(());
2621                }
2622            };
2623
2624            let iswhite = Self::is_inblank(c);
2625
2626            match c {
2627                '(' => {
2628                    pct += 1;
2629                    self.add(c);
2630                }
2631                ')' => {
2632                    pct -= 1;
2633                    if pct == 0 {
2634                        return Ok(());
2635                    }
2636                    self.add(c);
2637                }
2638                '\\' => {
2639                    self.add(c);
2640                    if let Some(c) = self.hgetc() {
2641                        self.add(c);
2642                    }
2643                }
2644                '\'' => {
2645                    self.add(c);
2646                    loop {
2647                        let ch = self.hgetc();
2648                        match ch {
2649                            Some('\'') => {
2650                                self.add('\'');
2651                                break;
2652                            }
2653                            Some(ch) => self.add(ch),
2654                            None => {
2655                                self.lexstop = true;
2656                                return Err(());
2657                            }
2658                        }
2659                    }
2660                }
2661                '"' => {
2662                    self.add(c);
2663                    loop {
2664                        let ch = self.hgetc();
2665                        match ch {
2666                            Some('"') => {
2667                                self.add('"');
2668                                break;
2669                            }
2670                            Some('\\') => {
2671                                self.add('\\');
2672                                if let Some(ch) = self.hgetc() {
2673                                    self.add(ch);
2674                                }
2675                            }
2676                            Some(ch) => self.add(ch),
2677                            None => {
2678                                self.lexstop = true;
2679                                return Err(());
2680                            }
2681                        }
2682                    }
2683                }
2684                '`' => {
2685                    self.add(c);
2686                    loop {
2687                        let ch = self.hgetc();
2688                        match ch {
2689                            Some('`') => {
2690                                self.add('`');
2691                                break;
2692                            }
2693                            Some('\\') => {
2694                                self.add('\\');
2695                                if let Some(ch) = self.hgetc() {
2696                                    self.add(ch);
2697                                }
2698                            }
2699                            Some(ch) => self.add(ch),
2700                            None => {
2701                                self.lexstop = true;
2702                                return Err(());
2703                            }
2704                        }
2705                    }
2706                }
2707                '#' if start => {
2708                    self.add(c);
2709                    // Skip comment to end of line
2710                    loop {
2711                        let ch = self.hgetc();
2712                        match ch {
2713                            Some('\n') => {
2714                                self.add('\n');
2715                                break;
2716                            }
2717                            Some(ch) => self.add(ch),
2718                            None => break,
2719                        }
2720                    }
2721                }
2722                _ => {
2723                    self.add(c);
2724                }
2725            }
2726
2727            start = iswhite;
2728        }
2729    }
2730
2731    /// Lex next token AND update per-context flags. Direct port of
2732    /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2733    /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2734    /// list separators / pipes / control keywords reset to cmd-pos;
2735    /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2736    /// stash prior incmdpos and force the redir target to non-cmd-pos.
2737    pub fn ctxtlex(&mut self) {
2738        // lex.c:319 — static `oldpos` cache for redir-target restore
2739        // is captured per-call here as `oldpos` below (zshrs's parser
2740        // re-enters ctxtlex per token, no need for static persistence).
2741
2742        // lex.c:321 — `zshlex();` to advance to the next token.
2743        self.zshlex();
2744
2745        // lex.c:322-358 — post-token incmdpos switch.
2746        match self.tok {
2747            // lex.c:323-343 — separators / openers / conjunctions /
2748            // control keywords — back into cmd-pos so the next token
2749            // can be a fresh command.
2750            LexTok::Seper
2751            | LexTok::Newlin
2752            | LexTok::Semi
2753            | LexTok::Dsemi
2754            | LexTok::Semiamp
2755            | LexTok::Semibar
2756            | LexTok::Amper
2757            | LexTok::Amperbang
2758            | LexTok::Inpar
2759            | LexTok::Inbrace
2760            | LexTok::Dbar
2761            | LexTok::Damper
2762            | LexTok::Bar
2763            | LexTok::Baramp
2764            | LexTok::Inoutpar
2765            | LexTok::Doloop
2766            | LexTok::Then
2767            | LexTok::Elif
2768            | LexTok::Else
2769            | LexTok::Doutbrack => {
2770                self.incmdpos = true;
2771            }
2772            // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2773            // so subsequent tokens are arguments, not a fresh command.
2774            LexTok::String
2775            | LexTok::Typeset
2776            | LexTok::Envarray
2777            | LexTok::Outpar
2778            | LexTok::Case
2779            | LexTok::Dinbrack => {
2780                self.incmdpos = false;
2781            }
2782            _ => {}
2783        }
2784
2785        // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2786        // DINPAR can detect c-style for. After any non-DINPAR, decay
2787        // to 0 (or back to 2 if we just saw FOR again).
2788        if self.tok != LexTok::Dinpar {
2789            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2790        }
2791
2792        // lex.c:361-368 — redir-target context dance. After consuming
2793        // a redir operator, the following token (the file path) sees
2794        // incmdpos=0 even when its inherent shape would put it back
2795        // in cmd-pos. After the redir target, restore from oldpos
2796        // (struct field — must persist across zshlex calls).
2797        if self.tok.is_redirop()
2798            || self.tok == LexTok::For
2799            || self.tok == LexTok::Foreach
2800            || self.tok == LexTok::Select
2801        {
2802            self.inredir = true;
2803            self.oldpos = self.incmdpos;
2804            self.incmdpos = false;
2805        } else if self.inredir {
2806            self.incmdpos = self.oldpos;
2807            self.inredir = false;
2808        }
2809    }
2810
2811    /// Mark the current word as the one ZLE was looking for. Direct
2812    /// port of zsh/Src/lex.c:1881-1897 `gotword`. Only meaningful
2813    /// when the lexer was started with LEXFLAGS_ZLE for completion;
2814    /// after this call `lexflags` is cleared so subsequent tokens
2815    /// don't re-trigger word tracking.
2816    ///
2817    /// zshrs port note: zsh's gotword updates `wb`/`we` (word begin/
2818    /// end positions) based on `zlemetacs` (cursor pos), `zlemetall`
2819    /// (line length), `inbufct`, and `addedx` — all live in zsh's
2820    /// input.c globals which zshrs hasn't wired through the lexer.
2821    /// Only the `lexflags = 0` side-effect at lex.c:1895 is
2822    /// reproducible without that integration.
2823    pub fn gotword(&mut self) {
2824        // lex.c:1895 — `lexflags = 0;`
2825        self.lexflags = LexFlags::default();
2826    }
2827
2828    /// Register a heredoc to be processed at next newline
2829    pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2830        self.heredocs.push(HereDoc {
2831            terminator,
2832            strip_tabs,
2833            content: String::new(),
2834            quoted: false,
2835            processed: false,
2836        });
2837    }
2838
2839    /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2840    /// but reachable from the bare `zshlex` path (without an
2841    /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2842    ///   - incmdpos is set (or text is `}` ending a brace block)
2843    ///   - text is `]]` and we're inside `[[ ]]` (incond > 0)
2844    ///   - text is bare `!` and we're at the start of a cond (incond == 1)
2845    pub fn check_reserved_word(&mut self) -> bool {
2846        if let Some(ref tokstr) = self.tokstr {
2847            if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2848                if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2849                    self.tok = tok;
2850                    if tok == LexTok::Repeat {
2851                        self.inrepeat = 1;
2852                    }
2853                    if tok == LexTok::Dinbrack {
2854                        self.incond = 1;
2855                    }
2856                    return true;
2857                }
2858                if tokstr == "]]" && self.incond > 0 {
2859                    self.tok = LexTok::Doutbrack;
2860                    self.incond = 0;
2861                    return true;
2862                }
2863            }
2864            // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2865            // regardless of incmdpos.
2866            if self.incond > 0 && tokstr == "]]" {
2867                self.tok = LexTok::Doutbrack;
2868                self.incond = 0;
2869                return true;
2870            }
2871            if self.incond == 1 && tokstr == "!" {
2872                self.tok = LexTok::Bang;
2873                return true;
2874            }
2875        }
2876        false
2877    }
2878}
2879
/// Outcome of deciding whether a `((` / `$((` construct is arithmetic
/// or a command.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The text was scanned as a command (substitution).
    Cmd,
    /// The text parsed as an arithmetic expression closed by `))`.
    Math,
    /// Scanning failed; the caller should report a lex error.
    Err,
}
2886
2887// ============================================================================
2888// Additional parsing functions ported from lex.c
2889// ============================================================================
2890
/// Check whether `input`, starting at byte offset `pos`, continues a
/// numeric glob range: `<N-M>`, `<N->`, `<-M>` or `<->`. `pos` must
/// point just after the opening `<`.
///
/// Port of zsh/Src/lex.c:580-610 `isnumglob`. The C version reads
/// from the input stream and pushes everything back afterwards; this
/// version only inspects the slice, so nothing is consumed.
///
/// Accepted shape: any number of ASCII digits, exactly one `-`, any
/// number of ASCII digits, then `>`. Anything else — including running
/// out of input first — yields `false`.
///
/// A `pos` that lies past the end of `input` or inside a multi-byte
/// character yields `false` instead of panicking.
pub fn isnumglob(input: &str, pos: usize) -> bool {
    let rest = match input.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };

    let mut seen_dash = false;
    for c in rest.chars() {
        match c {
            '0'..='9' => {}
            '-' if !seen_dash => seen_dash = true,
            // `>` only closes the range once the `-` has been seen.
            '>' => return seen_dash,
            _ => return false,
        }
    }
    false
}
2921
2922/// Tokenize a string as if in double quotes (error-tolerant variant).
2923///
2924/// Direct port of zsh/Src/lex.c:1713-1733 `parsestrnoerr`. The C
2925/// source: zcontext_save → untokenize → inpush → strinbeg →
2926/// `lexbuf.ptr = tokstr = *s; lexbuf.siz = l + 1` →
2927/// `err = dquote_parse('\0', 1)` → strinend → inpop → zcontext_restore.
2928/// Returns the tokenized string on success, or the offending char as
2929/// an error code (zsh convention: `> 32 && < 127` → printable, else
2930/// generic).
2931///
2932/// zshrs port: the C version drives the lexer's dquote_parse method
2933/// against the input string. zshrs's standalone walker produces the
2934/// same BNULL/QSTRING/QTICK token markers without re-entering the
2935/// lexer — same output for typical bodies. Documented divergence:
2936/// nested cmd-sub `$(...)` and arith `$((...))` aren't lexed
2937/// recursively; the runtime handles them at expansion time.
2938pub fn parsestrnoerr(s: &str) -> Result<String, String> {
2939    parsestr_inner(s)
2940}
2941
2942/// Tokenize a string as if in double quotes (error-reporting variant).
2943///
2944/// Direct port of zsh/Src/lex.c:1693-1709 `parsestr`. C source:
2945/// `if ((err = parsestrnoerr(s))) { untokenize(*s); ... zerr("parse
2946/// error near `%c'", err); tok = LEXERR; }`. zshrs's wrapper
2947/// returns the same Result and lets the caller emit the diagnostic.
2948///
2949/// Both `parsestr` and `parsestrnoerr` share the inner walker; the
2950/// only difference in C is whether errors trigger `zerr`. zshrs
2951/// returns `Err(msg)` from both — the caller decides whether to
2952/// surface the diagnostic.
2953pub fn parsestr(s: &str) -> Result<String, String> {
2954    parsestr_inner(s)
2955}
2956
2957/// Shared body for parsestr / parsestrnoerr.
2958fn parsestr_inner(s: &str) -> Result<String, String> {
2959    let mut result = String::with_capacity(s.len());
2960    let chars: Vec<char> = s.chars().collect();
2961    let mut i = 0;
2962
2963    while i < chars.len() {
2964        let c = chars[i];
2965        match c {
2966            '\\' => {
2967                i += 1;
2968                if i < chars.len() {
2969                    let next = chars[i];
2970                    match next {
2971                        '$' | '\\' | '`' | '"' | '\n' => {
2972                            result.push(char_tokens::BNULL);
2973                            result.push(next);
2974                        }
2975                        _ => {
2976                            result.push('\\');
2977                            result.push(next);
2978                        }
2979                    }
2980                } else {
2981                    result.push('\\');
2982                }
2983            }
2984            '$' => {
2985                result.push(char_tokens::QSTRING);
2986                if i + 1 < chars.len() {
2987                    let next = chars[i + 1];
2988                    if next == '{' {
2989                        result.push(char_tokens::INBRACE);
2990                        i += 1;
2991                    } else if next == '(' {
2992                        result.push(char_tokens::INPAR);
2993                        i += 1;
2994                    }
2995                }
2996            }
2997            '`' => {
2998                result.push(char_tokens::QTICK);
2999            }
3000            _ => {
3001                result.push(c);
3002            }
3003        }
3004        i += 1;
3005    }
3006
3007    Ok(result)
3008}
3009
/// Find the end of a subscript in `s`.
///
/// `s` starts just after the opening bracket. Returns the index —
/// counted in `char`s, not bytes — of the `endchar` that closes the
/// subscript, or `None` if `s` is empty, starts with `endchar` (empty
/// subscript), or contains no unnested, unquoted `endchar`.
///
/// Counterpart of zsh/Src/lex.c:1742-1788 `parse_subscript`. The C
/// function lexes the subscript through `dquote_parse`; this port is a
/// bracket-balancing walker that does not re-enter the lexer and has
/// no `sub` flag.
///
/// Scanning rules:
/// - inside `'...'` nothing is special; inside `"..."` a backslash
///   skips the next character;
/// - outside quotes a backslash skips the next character, so an
///   escaped `endchar` never terminates the subscript;
/// - `[` and `(` open a nested level and `]` / `)` close one (the two
///   bracket kinds share one depth counter);
/// - a closer that pops a nested level is never the terminator, even
///   when it brings the depth back to zero. Previously `a[1]]` ended
///   at the inner `]`.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let mut depth = 0usize;
    let mut in_dquote = false;
    let mut in_squote = false;
    let mut chars = s.chars().enumerate();

    while let Some((i, c)) = chars.next() {
        if in_squote {
            in_squote = c != '\'';
            continue;
        }

        if in_dquote {
            match c {
                '"' => in_dquote = false,
                '\\' => {
                    // Skip the escaped character, if there is one.
                    chars.next();
                }
                _ => {}
            }
            continue;
        }

        match c {
            '\\' => {
                // Skip the escaped character; it cannot terminate.
                chars.next();
                continue;
            }
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            '[' | '(' => depth += 1,
            ']' | ')' if depth > 0 => {
                // Closes a nested group, so it is not our terminator.
                depth -= 1;
                continue;
            }
            _ => {}
        }

        if c == endchar && depth == 0 {
            return Some(i);
        }
    }

    None
}
3088
3089/// Tokenize a string as if it were a normal command-line argument
3090/// but it may contain separators. Used for ${...%...} substitutions.
3091///
3092/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
3093/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
3094/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
3095/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3096/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3097/// without re-entering the lexer.
3098///
3099/// zshrs port note: the C source returns int (0=ok, char value =
3100/// where it stopped on error); zshrs returns Result<String,String>
3101/// returning the tokenized text directly. Lossy for callers that
3102/// need to know the exact stop position, but nothing in zshrs's
3103/// expansion layer uses that yet.
3104pub fn parse_subst_string(s: &str) -> Result<String, String> {
3105    if s.is_empty() {
3106        return Ok(String::new());
3107    }
3108
3109    let mut result = String::with_capacity(s.len());
3110    let chars: Vec<char> = s.chars().collect();
3111    let mut i = 0;
3112
3113    while i < chars.len() {
3114        let c = chars[i];
3115        match c {
3116            '\\' => {
3117                result.push(char_tokens::BNULL);
3118                i += 1;
3119                if i < chars.len() {
3120                    result.push(chars[i]);
3121                }
3122            }
3123            '\'' => {
3124                result.push(char_tokens::SNULL);
3125                i += 1;
3126                while i < chars.len() && chars[i] != '\'' {
3127                    result.push(chars[i]);
3128                    i += 1;
3129                }
3130                result.push(char_tokens::SNULL);
3131            }
3132            '"' => {
3133                result.push(char_tokens::DNULL);
3134                i += 1;
3135                while i < chars.len() && chars[i] != '"' {
3136                    if chars[i] == '\\' && i + 1 < chars.len() {
3137                        result.push(char_tokens::BNULL);
3138                        i += 1;
3139                        result.push(chars[i]);
3140                    } else if chars[i] == '$' {
3141                        result.push(char_tokens::QSTRING);
3142                    } else {
3143                        result.push(chars[i]);
3144                    }
3145                    i += 1;
3146                }
3147                result.push(char_tokens::DNULL);
3148            }
3149            '$' => {
3150                result.push(char_tokens::STRING);
3151                if i + 1 < chars.len() {
3152                    match chars[i + 1] {
3153                        '{' => {
3154                            result.push(char_tokens::INBRACE);
3155                            i += 1;
3156                        }
3157                        '(' => {
3158                            result.push(char_tokens::INPAR);
3159                            i += 1;
3160                        }
3161                        _ => {}
3162                    }
3163                }
3164            }
3165            '*' => result.push(char_tokens::STAR),
3166            '?' => result.push(char_tokens::QUEST),
3167            '[' => result.push(char_tokens::INBRACK),
3168            ']' => result.push(char_tokens::OUTBRACK),
3169            '{' => result.push(char_tokens::INBRACE),
3170            '}' => result.push(char_tokens::OUTBRACE),
3171            '~' => result.push(char_tokens::TILDE),
3172            '#' => result.push(char_tokens::POUND),
3173            '^' => result.push(char_tokens::HAT),
3174            _ => result.push(c),
3175        }
3176        i += 1;
3177    }
3178
3179    Ok(result)
3180}
3181
3182/// Untokenize a string - convert tokenized chars back to original
3183///
3184/// Port of untokenize() from exec.c (but used by lexer too)
3185/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3186/// stripping them. Used by callers that need the source form including
3187/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3188pub fn untokenize_preserve_quotes(s: &str) -> String {
3189    let mut result = String::with_capacity(s.len() + 4);
3190    for c in s.chars() {
3191        let cu = c as u32;
3192        if (0x83..=0x9f).contains(&cu) {
3193            match c {
3194                c if c == char_tokens::POUND => result.push('#'),
3195                c if c == char_tokens::STRING => result.push('$'),
3196                c if c == char_tokens::HAT => result.push('^'),
3197                c if c == char_tokens::STAR => result.push('*'),
3198                c if c == char_tokens::INPAR => result.push('('),
3199                c if c == char_tokens::OUTPAR => result.push(')'),
3200                c if c == char_tokens::INPARMATH => result.push('('),
3201                c if c == char_tokens::OUTPARMATH => result.push(')'),
3202                c if c == char_tokens::QSTRING => result.push('$'),
3203                c if c == char_tokens::EQUALS => result.push('='),
3204                c if c == char_tokens::BAR => result.push('|'),
3205                c if c == char_tokens::INBRACE => result.push('{'),
3206                c if c == char_tokens::OUTBRACE => result.push('}'),
3207                c if c == char_tokens::INBRACK => result.push('['),
3208                c if c == char_tokens::OUTBRACK => result.push(']'),
3209                c if c == char_tokens::TICK => result.push('`'),
3210                c if c == char_tokens::INANG => result.push('<'),
3211                c if c == char_tokens::OUTANG => result.push('>'),
3212                c if c == char_tokens::OUTANGPROC => result.push('>'),
3213                c if c == char_tokens::QUEST => result.push('?'),
3214                c if c == char_tokens::TILDE => result.push('~'),
3215                c if c == char_tokens::QTICK => result.push('`'),
3216                c if c == char_tokens::COMMA => result.push(','),
3217                c if c == char_tokens::DASH => result.push('-'),
3218                c if c == char_tokens::BANG => result.push('!'),
3219                c if c == char_tokens::SNULL => result.push('\''),
3220                c if c == char_tokens::DNULL => result.push('"'),
3221                c if c == char_tokens::BNULL => result.push('\\'),
3222                _ => {
3223                    let idx = c as usize;
3224                    if idx < char_tokens::ZTOKENS.len() {
3225                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3226                    } else {
3227                        result.push(c);
3228                    }
3229                }
3230            }
3231        } else {
3232            result.push(c);
3233        }
3234    }
3235    result
3236}
3237
3238/// Decode `\X` escape sequences for `$'...'` content.
3239/// Port of `getkeystring()` from Src/utils.c:6915 with the
3240/// `GETKEYS_DOLLARS_QUOTE` flag — handles the `\n`/`\t`/`\r`/`\e`/
3241/// `\E`/`\a`/`\b`/`\f`/`\v`/`\xNN`/`\uNNNN`/`\UNNNNNNNN`/octal/`\\`/`\'`
3242/// arms the C source recognizes inside dollar-single-quoted
3243/// strings. Walks `chars[start..]` until `Snull` is hit, returns
3244/// `(decoded, end_idx)` where `end_idx` points at the terminating
3245/// `Snull`. `Bnull \\` and `Bnull '` are user-literal `\` / `'`
3246/// per Src/lex.c:1303.
3247fn getkeystring_dollar_quote(chars: &[char], start: usize) -> (String, usize) {
3248    let mut out = String::new();
3249    let mut i = start;
3250    while i < chars.len() {
3251        let c = chars[i];
3252        if c == char_tokens::SNULL {
3253            return (out, i);
3254        }
3255        if c == char_tokens::BNULL {
3256            // Bnull marks a user-literal `\\` or `\'` per
3257            // Src/lex.c:1303-1306. The next char is the literal.
3258            i += 1;
3259            if i < chars.len() {
3260                out.push(chars[i]);
3261                i += 1;
3262            }
3263            continue;
3264        }
3265        if c == '\\' && i + 1 < chars.len() {
3266            let nc = chars[i + 1];
3267            match nc {
3268                'a' => {
3269                    out.push('\x07');
3270                    i += 2;
3271                }
3272                'b' => {
3273                    out.push('\x08');
3274                    i += 2;
3275                }
3276                'e' | 'E' => {
3277                    out.push('\x1b');
3278                    i += 2;
3279                }
3280                'f' => {
3281                    out.push('\x0c');
3282                    i += 2;
3283                }
3284                'n' => {
3285                    out.push('\n');
3286                    i += 2;
3287                }
3288                'r' => {
3289                    out.push('\r');
3290                    i += 2;
3291                }
3292                't' => {
3293                    out.push('\t');
3294                    i += 2;
3295                }
3296                'v' => {
3297                    out.push('\x0b');
3298                    i += 2;
3299                }
3300                '\\' | '\'' | '"' => {
3301                    out.push(nc);
3302                    i += 2;
3303                }
3304                'x' => {
3305                    // \xNN — up to 2 hex digits per Src/utils.c:7156
3306                    let mut val: u32 = 0;
3307                    let mut consumed = 2; // \x
3308                    let mut got = 0;
3309                    while got < 2 && i + consumed < chars.len() {
3310                        let h = chars[i + consumed];
3311                        if let Some(d) = h.to_digit(16) {
3312                            val = val * 16 + d;
3313                            consumed += 1;
3314                            got += 1;
3315                        } else {
3316                            break;
3317                        }
3318                    }
3319                    if got == 0 {
3320                        // No hex digits — emit literal `\x` per
3321                        // Src/utils.c:7160-7163 fallthrough
3322                        out.push('\\');
3323                        out.push('x');
3324                    } else if let Some(ch) = char::from_u32(val) {
3325                        out.push(ch);
3326                    }
3327                    i += consumed;
3328                }
3329                'u' | 'U' => {
3330                    let n = if nc == 'u' { 4 } else { 8 };
3331                    let mut val: u32 = 0;
3332                    let mut consumed = 2; // \u or \U
3333                    let mut got = 0;
3334                    while got < n && i + consumed < chars.len() {
3335                        let h = chars[i + consumed];
3336                        if let Some(d) = h.to_digit(16) {
3337                            val = val * 16 + d;
3338                            consumed += 1;
3339                            got += 1;
3340                        } else {
3341                            break;
3342                        }
3343                    }
3344                    if let Some(ch) = char::from_u32(val) {
3345                        out.push(ch);
3346                    }
3347                    i += consumed;
3348                }
3349                '0'..='7' => {
3350                    // Octal — up to 3 digits per Src/utils.c:7156
3351                    let mut val: u32 = 0;
3352                    let mut consumed = 1; // skip backslash
3353                    let mut got = 0;
3354                    while got < 3 && i + consumed < chars.len() {
3355                        let h = chars[i + consumed];
3356                        if let Some(d) = h.to_digit(8) {
3357                            val = val * 8 + d;
3358                            consumed += 1;
3359                            got += 1;
3360                        } else {
3361                            break;
3362                        }
3363                    }
3364                    if let Some(ch) = char::from_u32(val) {
3365                        out.push(ch);
3366                    }
3367                    i += consumed;
3368                }
3369                _ => {
3370                    // Unknown escape — keep `\` per
3371                    // Src/utils.c:7180-7185 default branch
3372                    out.push('\\');
3373                    out.push(nc);
3374                    i += 2;
3375                }
3376            }
3377            continue;
3378        }
3379        out.push(c);
3380        i += 1;
3381    }
3382    (out, i)
3383}
3384
3385pub fn untokenize(s: &str) -> String {
3386    let mut result = String::with_capacity(s.len());
3387    let chars: Vec<char> = s.chars().collect();
3388    let mut i = 0;
3389
3390    while i < chars.len() {
3391        let c = chars[i];
3392        // Token chars live in zsh's META range (0x83 = META through 0x9f =
3393        // BNULL). Anything in that range needs un-mapping before display
3394        // or downstream consumption. The original `< 32` test was wrong —
3395        // none of zsh's tokens land in that range.
3396        let cu = c as u32;
3397        if (0x83..=0x9f).contains(&cu) {
3398            // `Qstring Snull` opens a `$'...'` ANSI-C-quoted region.
3399            // Per Src/subst.c:301-304, when `stringsubst()` hits an
3400            // `Snull` it calls `stringsubstquote()` (line 206) which
3401            // calls `getkeystring(s+2, ...)` over the content,
3402            // skipping the leading `Qstring Snull` and stopping at
3403            // the closing `Snull`. zshrs's pipeline runs untokenize
3404            // at points where C runs subst, so we apply the same
3405            // decoding inline here. Result: the entire `$'...'`
3406            // region is replaced by its decoded content with no
3407            // `$`/`'`/marker remnants.
3408            if c == char_tokens::QSTRING
3409                && i + 1 < chars.len()
3410                && chars[i + 1] == char_tokens::SNULL
3411            {
3412                let (decoded, end) = getkeystring_dollar_quote(&chars, i + 2);
3413                result.push_str(&decoded);
3414                // `end` points at the closing `Snull` (or end of
3415                // string if unterminated); skip past it.
3416                i = if end < chars.len() { end + 1 } else { end };
3417                continue;
3418            }
3419            // Convert token back to original character
3420            match c {
3421                c if c == char_tokens::POUND => result.push('#'),
3422                c if c == char_tokens::STRING => result.push('$'),
3423                c if c == char_tokens::HAT => result.push('^'),
3424                c if c == char_tokens::STAR => result.push('*'),
3425                c if c == char_tokens::INPAR => result.push('('),
3426                c if c == char_tokens::OUTPAR => result.push(')'),
3427                c if c == char_tokens::INPARMATH => result.push('('),
3428                c if c == char_tokens::OUTPARMATH => result.push(')'),
3429                c if c == char_tokens::QSTRING => result.push('$'),
3430                c if c == char_tokens::EQUALS => result.push('='),
3431                c if c == char_tokens::BAR => result.push('|'),
3432                c if c == char_tokens::INBRACE => result.push('{'),
3433                c if c == char_tokens::OUTBRACE => result.push('}'),
3434                c if c == char_tokens::INBRACK => result.push('['),
3435                c if c == char_tokens::OUTBRACK => result.push(']'),
3436                c if c == char_tokens::TICK => result.push('`'),
3437                c if c == char_tokens::INANG => result.push('<'),
3438                c if c == char_tokens::OUTANG => result.push('>'),
3439                c if c == char_tokens::OUTANGPROC => result.push('>'),
3440                c if c == char_tokens::QUEST => result.push('?'),
3441                c if c == char_tokens::TILDE => result.push('~'),
3442                c if c == char_tokens::QTICK => result.push('`'),
3443                c if c == char_tokens::COMMA => result.push(','),
3444                c if c == char_tokens::DASH => result.push('-'),
3445                c if c == char_tokens::BANG => result.push('!'),
3446                c if c == char_tokens::SNULL
3447                    || c == char_tokens::DNULL
3448                    || c == char_tokens::BNULL =>
3449                {
3450                    // Null markers - skip
3451                }
3452                _ => {
3453                    // Unknown token, try ztokens lookup
3454                    let idx = c as usize;
3455                    if idx < char_tokens::ZTOKENS.len() {
3456                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3457                    } else {
3458                        result.push(c);
3459                    }
3460                }
3461            }
3462        } else {
3463            result.push(c);
3464        }
3465        i += 1;
3466    }
3467
3468    result
3469}
3470
/// Check if a string contains any token characters.
///
/// Token characters occupy zsh's META range, 0x83 (META) through
/// 0x9f (BNULL) — the same range `untokenize` and
/// `untokenize_preserve_quotes` test for. Ordinary control characters
/// such as tab or newline (code points below 32) are not tokens and
/// do not make this return `true`.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| (0x83..=0x9f).contains(&(c as u32)))
}
3475
3476/// Convert token characters to their printable form for display
3477pub fn tokens_to_printable(s: &str) -> String {
3478    untokenize(s)
3479}
3480
#[cfg(test)]
mod tests {
    use super::*;

    // Calls `zshlex()` once per listed token and asserts that each call
    // leaves exactly that token in `tok`.
    macro_rules! expect_toks {
        ($lexer:ident, $($want:expr),+ $(,)?) => {{
            $(
                $lexer.zshlex();
                assert_eq!($lexer.tok, $want);
            )+
        }};
    }

    // Keeps lexing until `$want` (or end of input) shows up, then
    // asserts that the lexer stopped on `$want`.
    macro_rules! skip_to {
        ($lexer:ident, $want:expr) => {{
            loop {
                $lexer.zshlex();
                if $lexer.tok == $want || $lexer.tok == LexTok::Endinput {
                    break;
                }
            }
            assert_eq!($lexer.tok, $want);
        }};
    }

    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");

        expect_toks!(lexer, LexTok::String);
        assert_eq!(lexer.tokstr, Some(String::from("echo")));

        expect_toks!(lexer, LexTok::String);
        assert_eq!(lexer.tokstr, Some(String::from("hello")));

        expect_toks!(lexer, LexTok::Endinput);
    }

    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");
        expect_toks!(
            lexer,
            LexTok::String,
            LexTok::Bar,
            LexTok::String,
            LexTok::String,
        );
    }

    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");
        expect_toks!(lexer, LexTok::String, LexTok::Outang, LexTok::String);
    }

    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");
        expect_toks!(lexer, LexTok::String, LexTok::Dinang, LexTok::String);
    }

    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");
        expect_toks!(lexer, LexTok::String, LexTok::String);
        // The quoted word should carry Snull markers around its literal
        // content; only its presence is checked here.
        assert!(lexer.tokstr.is_some());
    }

    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Func,
            "expected Func, got {:?}",
            lexer.tok
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr, Some(String::from("foo")));

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");
        expect_toks!(lexer, LexTok::String, LexTok::String);
        // The quoted word should come back as tokenized content; only
        // its presence is checked here.
        assert!(lexer.tokstr.is_some());
    }

    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");
        expect_toks!(lexer, LexTok::String, LexTok::String);
    }

    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        expect_toks!(lexer, LexTok::String);
    }

    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;
        expect_toks!(lexer, LexTok::Envarray);
    }

    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");
        // `diff`, then `<(ls)` and `>(cat)`, each tokenized into a
        // single string word.
        expect_toks!(lexer, LexTok::String, LexTok::String, LexTok::String);
    }

    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");
        expect_toks!(lexer, LexTok::String, LexTok::String);
    }

    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        // The three case terminators must appear in source order.
        skip_to!(lexer, LexTok::Dsemi);
        skip_to!(lexer, LexTok::Semiamp);
        skip_to!(lexer, LexTok::Semibar);
    }
}