zshrs_parse/
lexer.rs

1//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
2//!
3//! This lexer tokenizes zsh shell input into a stream of tokens.
4//! It handles all zsh-specific syntax including:
5//! - Single/double/dollar quotes
6//! - Command substitution $(...)  and `...`
7//! - Arithmetic $((...))
8//! - Parameter expansion ${...}
9//! - Process substitution <(...) >(...)
10//! - Here documents
11//! - All redirection operators
12//! - Comments
13//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Lexer flags controlling behavior.
///
/// All flags are `false` in the derived `Default`. The struct is `Copy`, so
/// the lexer can snapshot it cheaply (e.g. before calling `gotword`).
#[derive(Debug, Clone, Copy, Default)]
pub struct LexFlags {
    /// Parsing for ZLE (line editor) completion. When set, `exalias` calls
    /// `gotword` and stops early if that call clears this flag.
    pub zle: bool,
    /// Return newlines as tokens. When unset, `zshlex` folds a newline token
    /// into the generic separator token.
    pub newline: bool,
    /// Preserve comments in output
    pub comments_keep: bool,
    /// Strip comments from output
    pub comments_strip: bool,
    /// Active lexing (from bufferwords)
    pub active: bool,
}
32
/// Growable text buffer used while a token is being assembled.
///
/// `data` holds the text. `siz` is a growth threshold carried over from the
/// C lexer's explicit buffer size; `String` owns the real allocation, so
/// `siz` only decides when `add` asks for more capacity.
#[derive(Debug, Clone)]
struct LexBuf {
    /// Accumulated text.
    data: String,
    /// Growth threshold in bytes; raised by `add` whenever `data` reaches it.
    siz: usize,
}

impl LexBuf {
    /// Creates an empty buffer with an initial capacity/threshold of 256 bytes.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Empties the buffer; the allocation and `siz` are left untouched.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Appends one character and, once the byte length reaches `siz`, doubles
    /// `siz` until it exceeds the length and reserves up to the new threshold.
    ///
    /// The threshold is grown in a loop (and from at least 1) because `data`
    /// can outrun `siz` without `add` noticing — `add_str` appends in bulk and
    /// `siz` can be overwritten from a saved context. A single doubling would
    /// then make `siz - len` underflow and panic.
    fn add(&mut self, c: char) {
        self.data.push(c);
        let len = self.data.len();
        if len >= self.siz {
            let mut grown = self.siz.max(1);
            while grown <= len {
                grown = grown.saturating_mul(2);
            }
            self.siz = grown;
            // `grown > len` holds here, so the subtraction cannot underflow.
            self.data.reserve(grown - len);
        }
    }

    /// Appends a whole string slice. Does not touch `siz`; `add` re-syncs the
    /// threshold the next time it runs.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
    }

    /// Length of the buffered text in bytes (not characters).
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrows the buffered text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consumes the buffer and returns the owned text.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last character of the buffer, or `None` when empty.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        self.data.chars().next_back()
    }

    /// Removes and returns the last character, or `None` when empty.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }
}
87
/// Here-document state: one entry per pending or already-read heredoc.
#[derive(Debug, Clone)]
pub struct HereDoc {
    /// Terminator word that ends the heredoc body.
    pub terminator: String,
    /// Whether leading tabs are stripped from the body
    /// (presumably set for the `<<-` form — confirm against the registration code).
    pub strip_tabs: bool,
    /// Body text of the heredoc; see `processed` for whether it has been read yet.
    pub content: String,
    /// True if the terminator was originally quoted (`<<'EOF'`,
    /// `<<"EOF"`, or `<<\EOF`). Disables variable expansion / command
    /// substitution / arithmetic in the body.
    pub quoted: bool,
    /// True once `process_heredocs` has read the body. Distinct from
    /// "content is empty" because an empty heredoc legitimately has
    /// empty content.
    pub processed: bool,
}
103
/// The Zsh Lexer.
///
/// Holds the borrowed input, the read cursor, a push-back queue, the most
/// recently produced token (`tok` / `tokstr`) and the context flags that
/// steer tokenization (command position, `[[ ]]`, case patterns, ...).
pub struct ZshLexer<'a> {
    /// Input source
    pub(crate) input: &'a str,
    /// Current position in input (byte offset; advanced by each char's UTF-8 length)
    pub(crate) pos: usize,
    /// Look-ahead buffer for ungotten characters. Both `hungetc` and alias
    /// injection push at the front; `hgetc` pops from the front before it
    /// touches `input`.
    unget_buf: VecDeque<char>,
    /// Current token string
    pub tokstr: Option<String>,
    /// Current token type
    pub tok: LexTok,
    /// File descriptor for redirections (e.g., 2> means fd=2)
    pub tokfd: i32,
    /// Line number at start of current token
    pub toklineno: u64,
    /// Current line number (incremented by `hgetc` on `\n`, decremented by `hungetc`)
    pub lineno: u64,
    /// Lexer has stopped (EOF or error)
    pub lexstop: bool,
    /// In command position (can accept reserved words)
    pub incmdpos: bool,
    /// In condition [[ ... ]]
    pub incond: i32,
    /// In pattern context (RHS of == != =~ in [[ ]])
    pub incondpat: bool,
    /// In case pattern
    pub incasepat: i32,
    /// In redirection
    pub inredir: bool,
    /// Saved `incmdpos` from before a redirop / for / foreach / select
    /// — restored on the NEXT non-redir token. Mirrors `static int oldpos`
    /// in C zsh's `ctxtlex` (lex.c:319). Required for cases like
    /// `for x ( ... )` where `(` after the var name should tokenize as
    /// INPAR — that depends on incmdpos being restored to 1 from before
    /// FOR was lexed, which in turn depends on this saved value.
    pub oldpos: bool,
    /// After 'for' keyword
    pub infor: i32,
    /// After 'repeat' keyword. Set to 1 when `repeat` is recognised; `zshlex`
    /// bumps it per token and forces command position when it reaches 3.
    inrepeat: i32,
    /// Parsing typeset arguments
    pub intypeset: bool,
    /// Inside (( ... )) arithmetic
    dbparens: bool,
    /// Disable alias expansion
    pub noaliases: bool,
    /// Disable spelling correction
    pub nocorrect: i32,
    /// Disable comment recognition
    pub nocomments: bool,
    /// Lexer flags
    pub lexflags: LexFlags,
    /// Whether this is the first line
    pub isfirstln: bool,
    /// Whether this is the first char of command
    #[allow(dead_code)]
    isfirstch: bool,
    /// Pending here-documents
    pub heredocs: Vec<HereDoc>,
    /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
    heredoc_pending: u8,
    /// Token buffer
    lexbuf: LexBuf,
    /// After newline. `zshlex` sets 0 when the token is not a newline, -1 for
    /// a newline with input remaining, 1 for a newline at end of input.
    pub isnewlin: i32,
    /// Error message if any
    pub error: Option<String>,
    /// Global iteration counter for infinite loop detection. Bumped by
    /// `check_iterations`, which `hgetc` calls on every read.
    global_iterations: usize,
    /// Recursion depth counter (compared against `MAX_LEXER_RECURSION`)
    recursion_depth: usize,
    /// Raw-input capture flag — when nonzero, every char read through
    /// `hgetc` is also appended to `tokstr_raw` via zshlex_raw_add.
    /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
    /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
    /// command substitutions for re-execution / display.
    /// NOTE(review): the Rust `hgetc` does not itself call `zshlex_raw_add`;
    /// confirm where the capture is hooked in.
    pub lex_add_raw: i32,
    /// Raw-input capture buffer. Direct mirror of lex.c:165
    /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
    /// `LexBuf` here since Rust's String tracks both the data and
    /// length internally.
    lexbuf_raw: LexBuf,
}
188
/// Upper bound on `ZshLexer::recursion_depth`; `check_recursion` reports an
/// error and stops the lexer once the depth exceeds this value.
const MAX_LEXER_RECURSION: usize = 200;
190
/// Per-alias info returned by `AliasResolver::lookup_alias` and
/// `lookup_suffix_alias`. Mirrors zsh's `struct alias` fields used
/// at lex.c:1914-1943: `text` (replacement body), `in_use` (the
/// recursion-guard flag), `global` (vs command-position-only).
#[derive(Debug, Clone)]
pub struct AliasInfo {
    /// Replacement body that the lexer pushes back into its input.
    pub text: String,
    /// Recursion guard: an alias flagged in-use is never expanded.
    pub in_use: bool,
    /// Global aliases expand in any position; non-global ones only when the
    /// lexer is in command position and the token is a plain string.
    pub global: bool,
}
201
/// Trait the lexer uses to look up aliases and reserved words during
/// `exalias`. Implementors typically delegate to the executor's
/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
/// free of executor-specific types — same pattern zsh uses with the
/// hashtable.h opaque-handle approach against aliastab/reswdtab/
/// sufaliastab.
pub trait AliasResolver {
    /// Look up an alias by name. Returns `None` if not found, or the
    /// alias body + flags otherwise.
    fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
    /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
    /// The lexer passes the text after the last `.`, without the dot.
    fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
    /// Resolve a reserved word. Returns the LexTok the word should
    /// promote to (e.g. "if" → IF), or None if not a reswd.
    fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
    /// Mark an alias as in-use (recursion guard). Called when an
    /// alias is about to be expanded; the matching unmark happens
    /// when the alias text has been fully consumed by the lexer.
    ///
    /// For suffix aliases the lexer passes the bare suffix as `name`.
    /// The alias-checking code only ever passes `in_use = true`.
    fn mark_in_use(&mut self, name: &str, in_use: bool);
}
222
/// Saved lexical state for nested-context handling. Direct port of
/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
/// (`lex_context_restore`). Used when entering command substitution,
/// here-docs, or eval where the outer lexer state must be pushed and
/// restored after the inner parse completes.
///
/// Each field is a snapshot of the `ZshLexer` field it is named after;
/// the token buffer is split into `lexbuf_data` / `lexbuf_siz`.
#[derive(Debug, Clone)]
pub struct LexStack {
    /// Saved `dbparens` (inside `(( ... ))`).
    pub dbparens: bool,
    /// Saved `isfirstln`.
    pub isfirstln: bool,
    /// Saved `isfirstch`.
    pub isfirstch: bool,
    /// Saved lexer flags.
    pub lexflags: LexFlags,
    /// Saved current token type.
    pub tok: LexTok,
    /// Saved current token string (moved out of the lexer on save).
    pub tokstr: Option<String>,
    /// Saved token-buffer contents (moved out of the lexer on save).
    pub lexbuf_data: String,
    /// Saved token-buffer growth threshold.
    pub lexbuf_siz: usize,
    /// Saved `lexstop`.
    pub lexstop: bool,
    /// Saved line number of the current token's start.
    pub toklineno: u64,
}
242
243impl Default for LexStack {
244    fn default() -> Self {
245        // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
246        // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
247        // ENDINPUT (the C source doesn't explicitly reset tok here but
248        // the natural baseline is ENDINPUT — same as lexinit).
249        LexStack {
250            dbparens: false,
251            isfirstln: false,
252            isfirstch: false,
253            lexflags: LexFlags::default(),
254            tok: LexTok::Endinput,
255            tokstr: None,
256            lexbuf_data: String::new(),
257            lexbuf_siz: 256,
258            lexstop: false,
259            toklineno: 0,
260        }
261    }
262}
263
264impl<'a> ZshLexer<'a> {
265    /// Create a new lexer for the given input
266    pub fn new(input: &'a str) -> Self {
267        ZshLexer {
268            input,
269            pos: 0,
270            unget_buf: VecDeque::new(),
271            tokstr: None,
272            tok: LexTok::Endinput,
273            tokfd: -1,
274            toklineno: 1,
275            lineno: 1,
276            lexstop: false,
277            incmdpos: true,
278            incond: 0,
279            incondpat: false,
280            incasepat: 0,
281            inredir: false,
282            oldpos: true,
283            infor: 0,
284            inrepeat: 0,
285            intypeset: false,
286            dbparens: false,
287            noaliases: false,
288            nocorrect: 0,
289            nocomments: false,
290            lexflags: LexFlags::default(),
291            isfirstln: true,
292            isfirstch: true,
293            heredocs: Vec::new(),
294            heredoc_pending: 0,
295            lexbuf: LexBuf::new(),
296            isnewlin: 0,
297            error: None,
298            global_iterations: 0,
299            recursion_depth: 0,
300            lex_add_raw: 0,
301            lexbuf_raw: LexBuf::new(),
302        }
303    }
304
305    /// Append a char to the raw-input capture buffer. Direct port of
306    /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
307    /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
308    /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
309    pub fn zshlex_raw_add(&mut self, c: char) {
310        // lex.c:2027-2028 — guard on lex_add_raw flag.
311        if self.lex_add_raw == 0 {
312            return;
313        }
314        // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
315        // explicit ptr/len/siz with hrealloc; Rust's String handles
316        // resize automatically.
317        self.lexbuf_raw.add(c);
318    }
319
320    /// Run alias / reserved-word expansion on the just-lexed token.
321    /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
322    /// if an alias was injected (the caller's loop should re-run
323    /// gettok to consume the injected text).
324    ///
325    /// C source flow:
326    ///   1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
327    ///   2. If tokstr is None: set lextext from `tokstrings[tok]` and
328    ///      checkalias against that (lex.c:1964-1969).
329    ///   3. Otherwise: untokenize tokstr into a working copy (lex.c:
330    ///      1971-1980).
331    ///   4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
332    ///      (lex.c:1982-1991).
333    ///   5. STRING tokens: try checkalias, then reservation lookup
334    ///      (lex.c:1993-2015).
335    ///   6. Clear inalmore (lex.c:2016).
336    ///
337    /// Takes an `AliasResolver` trait object so the lexer doesn't
338    /// hard-depend on the executor's alias-table types. zshrs callers
339    /// implement `AliasResolver` over their alias hash tables.
340    pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
341        // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
342        // history layer doesn't track per-word boundaries here; no-op.
343
344        // lex.c:1958-1962 — spell correction via spckword. zshrs
345        // doesn't implement spell correction yet; documented divergence.
346
347        // lex.c:1964-1969 — bare-token path (no tokstr).
348        if self.tokstr.is_none() {
349            // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
350            // like SEMI/AMPER/etc. the canonical text comes from a
351            // static table. zshrs's check_alias_for_text uses the
352            // resolver directly with the token's text representation.
353            if self.tok == LexTok::Newlin {
354                return false;
355            }
356            // Use punctuation-token text; unknown tokens skip alias.
357            let text = match self.tok {
358                LexTok::Semi => ";",
359                LexTok::Amper => "&",
360                LexTok::Bar => "|",
361                _ => return false,
362            };
363            return self.check_alias(resolver, text);
364        }
365
366        let tokstr = self.tokstr.clone().unwrap();
367        // lex.c:1973-1980 — untokenize: convert the lexer's internal
368        // tokenized form (Pound..ztokens shifts) into the literal
369        // shell text. Call the global helper.
370        let lextext = if has_token(&tokstr) {
371            untokenize(&tokstr)
372        } else {
373            tokstr.clone()
374        };
375
376        // lex.c:1982-1991 — ZLE word-tracking for completion.
377        if self.lexflags.zle {
378            let zp = self.lexflags;
379            self.gotword();
380            // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
381            // word has been reached; abort exalias so completion can
382            // capture the partial token unchanged.
383            if zp.zle && !self.lexflags.zle {
384                return false;
385            }
386        }
387
388        // lex.c:1993-2015 — STRING-token alias / reswd check.
389        if self.tok == LexTok::String {
390            // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
391            // here (zshrs doesn't have the option flag wired).
392            if self.check_alias(resolver, &lextext) {
393                return true;
394            }
395
396            // lex.c:2002-2009 — reserved-word lookup. Fires when in
397            // command position OR when the text is bare `}` and
398            // IGNOREBRACES is unset (so `}` ends a brace block).
399            if self.incmdpos || lextext == "}" {
400                if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
401                    self.tok = rwtok;
402                    if rwtok == LexTok::Repeat {
403                        self.inrepeat = 1;
404                    }
405                    if rwtok == LexTok::Dinbrack {
406                        self.incond = 1;
407                    }
408                }
409            } else if self.incond > 0 && lextext == "]]" {
410                // lex.c:2010-2012 — `]]` closes the cond expression.
411                self.tok = LexTok::Doutbrack;
412                self.incond = 0;
413            } else if self.incond == 1 && lextext == "!" {
414                // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
415                // negation, not a literal.
416                self.tok = LexTok::Bang;
417            }
418        }
419
420        // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
421        // any non-alias token.
422        // (zshrs's lexer doesn't have inalmore yet — added here would
423        // require gettok to track when an alias-pushed token has more
424        // text after it. Documented divergence.)
425
426        false
427    }
428
429    /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
430    /// `checkalias`. Returns true if the lookup matched (regular or
431    /// suffix alias) AND the alias text was successfully injected
432    /// back into the input stream for re-lexing.
433    fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
434        // lex.c:1906-1907 — guard on null lextext.
435        if lextext.is_empty() {
436            return false;
437        }
438
439        // lex.c:1909-1911 — guard: alias expansion is disabled, or
440        // POSIX aliases require the token to be a STRING and not a
441        // reserved word.
442        if self.noaliases {
443            return false;
444        }
445
446        // lex.c:1914-1933 — regular alias lookup.
447        if let Some(alias) = resolver.lookup_alias(lextext) {
448            if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
449                // lex.c:1918-1927 — if the next char isn't blank,
450                // insert a space so the alias body can't accidentally
451                // join the following word.
452                if !self.lexstop {
453                    if let Some(c) = self.peek() {
454                        if !Self::is_blank(c) {
455                            self.inject_alias_text(" ");
456                        }
457                    }
458                }
459                // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
460                self.inject_alias_text(&alias.text);
461                resolver.mark_in_use(lextext, true);
462                self.lexstop = false;
463                return true;
464            }
465        }
466
467        // lex.c:1934-1943 — suffix-alias lookup. The token must end
468        // with `.SUFFIX`, the suffix name must be a registered
469        // suffix-alias, AND the lexer must be in command position.
470        if self.incmdpos {
471            if let Some(dot_pos) = lextext.rfind('.') {
472                if dot_pos > 0 && dot_pos + 1 < lextext.len() {
473                    let suffix = &lextext[dot_pos + 1..];
474                    if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
475                        if !alias.in_use {
476                            // lex.c:1938-1940 — push three things in
477                            // reverse: the alias text, a space, then
478                            // the original word.
479                            self.inject_alias_text(&alias.text);
480                            self.inject_alias_text(" ");
481                            self.inject_alias_text(lextext);
482                            resolver.mark_in_use(suffix, true);
483                            self.lexstop = false;
484                            return true;
485                        }
486                    }
487                }
488            }
489        }
490
491        false
492    }
493
494    /// Push alias text back into the input stream so the lexer
495    /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
496    /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
497    /// (a VecDeque<char>) to inject chars in reverse order so the
498    /// next hgetc consumes them first.
499    fn inject_alias_text(&mut self, text: &str) {
500        // Insert at front in reverse so the first char of `text`
501        // comes out first.
502        for c in text.chars().rev() {
503            self.unget_buf.push_front(c);
504        }
505    }
506
507    /// Pop the last char from the raw-input capture buffer. Direct
508    /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
509    /// the lexer ungets a char that was just captured raw — the raw
510    /// buffer must mirror the live input so this undoes the last add.
511    pub fn zshlex_raw_back(&mut self) {
512        // lex.c:2045-2046 — guard.
513        if self.lex_add_raw == 0 {
514            return;
515        }
516        // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
517        self.lexbuf_raw.pop();
518    }
519
520    /// Mark the current raw-buffer offset (for restore later). Direct
521    /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
522    /// `len + offset` so callers can restore via `back_to_mark`.
523    pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
524        // lex.c:2055-2056 — guard.
525        if self.lex_add_raw == 0 {
526            return 0;
527        }
528        // lex.c:2057 — `return lexbuf_raw.len + offset;`
529        (self.lexbuf_raw.len() as i64) + offset
530    }
531
532    /// Restore raw-buffer offset to a previously-saved mark. Direct
533    /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
534    /// Truncates the raw buffer to `mark` bytes — undoes any captures
535    /// since the mark was taken (used when a speculative parse fails
536    /// and the lexer rolls back).
537    pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
538        // lex.c:2064-2065 — guard.
539        if self.lex_add_raw == 0 {
540            return;
541        }
542        // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
543        // lexbuf_raw.len = mark;` — Rust truncate handles both.
544        let m = mark.max(0) as usize;
545        self.lexbuf_raw.data.truncate(m);
546    }
547
548    /// Take the captured raw-input buffer, clearing it. Useful for
549    /// callers that need the literal command-sub body after lexing
550    /// (e.g. compile-time string capture for `$(...)`).
551    pub fn take_raw_buf(&mut self) -> String {
552        std::mem::take(&mut self.lexbuf_raw.data)
553    }
554
555    /// Save lexical context onto a `LexStack`. Direct port of
556    /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
557    /// is in a clean state suitable for parsing a nested input (command
558    /// substitution body, here-doc terminator, eval'd string).
559    pub fn lex_context_save(&mut self, ls: &mut LexStack) {
560        // lex.c:220-233 — copy live state into the stack.
561        ls.dbparens = self.dbparens;
562        ls.isfirstln = self.isfirstln;
563        ls.isfirstch = self.isfirstch;
564        ls.lexflags = self.lexflags;
565        ls.tok = self.tok;
566        ls.tokstr = self.tokstr.take();
567        ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
568        ls.lexbuf_siz = self.lexbuf.siz;
569        ls.lexstop = self.lexstop;
570        ls.toklineno = self.toklineno;
571
572        // lex.c:235-238 — reset live state to defaults so a nested
573        // parse starts from a clean slate. tokstr/lexbuf are zeroed,
574        // lexbuf.siz reset to 256 (the C-source initial alloc).
575        self.tokstr = None;
576        self.lexbuf.data.clear();
577        self.lexbuf.siz = 256;
578    }
579
580    /// Restore lexical context from a `LexStack`. Direct port of
581    /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
582    /// `lex_context_save`. Called after the nested parse completes.
583    pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
584        // lex.c:249-261 — copy stack state back into live fields.
585        self.dbparens = ls.dbparens;
586        self.isfirstln = ls.isfirstln;
587        self.isfirstch = ls.isfirstch;
588        self.lexflags = ls.lexflags;
589        self.tok = ls.tok;
590        self.tokstr = ls.tokstr.take();
591        self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
592        self.lexbuf.siz = ls.lexbuf_siz;
593        self.lexstop = ls.lexstop;
594        self.toklineno = ls.toklineno;
595    }
596
597    /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
598    /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
599    /// to ENDINPUT so the next gettok starts from a known baseline.
600    /// Note: the constructor `Self::new` already sets equivalent
601    /// defaults; this method exists for the rare case a caller wants
602    /// to recycle a `ZshLexer` across multiple input strings.
603    pub fn lexinit(&mut self) {
604        // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
605        self.nocorrect = 0;
606        self.dbparens = false;
607        self.lexstop = false;
608        // lex.c:444 — `tok = ENDINPUT;`
609        self.tok = LexTok::Endinput;
610    }
611
612    /// Check recursion depth; returns true if exceeded
613    #[inline]
614    fn check_recursion(&mut self) -> bool {
615        if self.recursion_depth > MAX_LEXER_RECURSION {
616            self.error = Some("lexer exceeded max recursion depth".to_string());
617            self.lexstop = true;
618            true
619        } else {
620            false
621        }
622    }
623
624    /// Check and increment global iteration counter; returns true if limit exceeded
625    #[inline]
626    fn check_iterations(&mut self) -> bool {
627        self.global_iterations += 1;
628        if self.global_iterations > 50_000 {
629            self.error = Some("lexer exceeded 50K iterations".to_string());
630            self.lexstop = true;
631            self.tok = LexTok::Lexerr;
632            true
633        } else {
634            false
635        }
636    }
637
638    /// Get next character from input
639    fn hgetc(&mut self) -> Option<char> {
640        if self.check_iterations() {
641            return None;
642        }
643
644        // Re-read from unget_buf: increment lineno on `\n` HERE
645        // too. hungetc() decremented lineno when the char was put
646        // back; without a matching increment on the way out, every
647        // `\n` that's ungetted-then-reread leaves lineno
648        // permanently one short. Symptom: $LINENO stuck at 1 in
649        // every script statement because the parser ungets the
650        // separating newline once between statements.
651        if let Some(c) = self.unget_buf.pop_front() {
652            if c == '\n' {
653                self.lineno += 1;
654            }
655            return Some(c);
656        }
657
658        let c = self.input[self.pos..].chars().next()?;
659        self.pos += c.len_utf8();
660
661        if c == '\n' {
662            self.lineno += 1;
663        }
664
665        Some(c)
666    }
667
668    /// Put character back into input
669    fn hungetc(&mut self, c: char) {
670        self.unget_buf.push_front(c);
671        if c == '\n' && self.lineno > 1 {
672            self.lineno -= 1;
673        }
674        self.lexstop = false;
675    }
676
677    /// Peek at next character without consuming
678    #[allow(dead_code)]
679    fn peek(&mut self) -> Option<char> {
680        if let Some(&c) = self.unget_buf.front() {
681            return Some(c);
682        }
683        self.input[self.pos..].chars().next()
684    }
685
    /// Appends `c` to the token buffer (`lexbuf`). Thin wrapper around
    /// `LexBuf::add`, which also handles growing the buffer.
    fn add(&mut self, c: char) {
        self.lexbuf.add(c);
    }
690
691    /// Check if character is blank (space or tab)
692    fn is_blank(c: char) -> bool {
693        c == ' ' || c == '\t'
694    }
695
696    /// Peek for a zsh numeric range glob shape after a `<`: returns the
697    /// captured `N*-M*>` (everything *after* the leading `<`) when the
698    /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
699    /// None and leaves the input untouched.
700    fn try_numeric_range_glob(&mut self) -> Option<String> {
701        let mut buf: Vec<char> = Vec::new();
702        // optional leading digits
703        loop {
704            match self.hgetc() {
705                Some(c) if c.is_ascii_digit() => buf.push(c),
706                Some(c) => {
707                    buf.push(c);
708                    break;
709                }
710                None => break,
711            }
712        }
713        // last char in buf must be '-' for the range form
714        if buf.last() != Some(&'-') {
715            for c in buf.iter().rev() {
716                self.hungetc(*c);
717            }
718            return None;
719        }
720        // optional trailing digits
721        loop {
722            match self.hgetc() {
723                Some(c) if c.is_ascii_digit() => buf.push(c),
724                Some(c) => {
725                    buf.push(c);
726                    break;
727                }
728                None => break,
729            }
730        }
731        if buf.last() != Some(&'>') {
732            for c in buf.iter().rev() {
733                self.hungetc(*c);
734            }
735            return None;
736        }
737        Some(buf.into_iter().collect())
738    }
739
740    /// Check if character is blank (including other whitespace except newline)
741    fn is_inblank(c: char) -> bool {
742        matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
743    }
744
745    /// Check if character is a digit
746    fn is_digit(c: char) -> bool {
747        c.is_ascii_digit()
748    }
749
750    /// Check if character is identifier start
751    #[allow(dead_code)]
752    fn is_ident_start(c: char) -> bool {
753        c.is_ascii_alphabetic() || c == '_'
754    }
755
756    /// Check if character is identifier continuation
757    fn is_ident(c: char) -> bool {
758        c.is_ascii_alphanumeric() || c == '_'
759    }
760
761    /// Main lexer entry point — fetch the next token. Direct port of
762    /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
763    /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
764    /// followed by here-doc draining (lex.c:278-306), newline tracking
765    /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
766    ///
767    /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
768    /// the loop. The C source iterates as long as exalias keeps
769    /// re-injecting alias text into the input buffer; zshrs's alias
770    /// expansion happens post-lex in exec.rs. The loop body therefore
771    /// runs once and breaks unconditionally — documented divergence.
772    pub fn zshlex(&mut self) {
773        // lex.c:268-269 — early-out on prior LEXERR.
774        if self.tok == LexTok::Lexerr {
775            return;
776        }
777
778        // Note: Do NOT reset global_iterations here - it must accumulate across all
779        // zshlex calls in a parse to prevent infinite loops in the parser
780
781        // lex.c:270-276 — gettok / exalias loop. Without exalias wired,
782        // the inner body runs once and we `break` unconditionally.
783        loop {
784            // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
785            // detection.
786            if self.inrepeat > 0 {
787                self.inrepeat += 1;
788            }
789            // lex.c:273-274 — at the third token after `repeat`,
790            // SHORTLOOPS / SHORTREPEAT options force back into cmd
791            // position so the loop body can start. zshrs unconditionally
792            // does this since the option-lookup lives in exec.rs.
793            if self.inrepeat == 3 {
794                self.incmdpos = true;
795            }
796
797            // lex.c:275 — `tok = gettok();`
798            self.tok = self.gettok();
799
800            // lex.c:276 — `while (tok != ENDINPUT && exalias())` —
801            // when exalias re-injects alias text it returns true and
802            // the loop iterates. Without exalias wired, we break.
803            break;
804        }
805
806        // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
807        // so the persistent low bit survives but the per-word bit is
808        // dropped.
809        self.nocorrect &= 1;
810
811        // lex.c:278-306 — drain pending here-documents at the start
812        // of a new line. zshrs's process_heredocs reads the full body
813        // and stitches it onto the matching redir token.
814        if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
815            self.process_heredocs();
816        }
817
818        // lex.c:307-310 — track whether we just saw a newline.
819        // C uses `inbufct` to distinguish "newline at EOF" (=1)
820        // from "newline mid-input" (=-1); zshrs reads `pos < len`.
821        if self.tok != LexTok::Newlin {
822            self.isnewlin = 0;
823        } else {
824            self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
825        }
826
827        // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
828        // LEXFLAGS_NEWLINE is set to preserve newlines (used by
829        // ZLE for completion of partial lines).
830        if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
831            self.tok = LexTok::Seper;
832        }
833
834        // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
835        //   - `{` only promotes to INBRACE in command position
836        //   - `}` promotes to OUTBRACE either in cmdpos OR via the
837        //     special `closing-brace-special` rule (IGNOREBRACES unset
838        //     — assumed since zshrs doesn't expose that option yet)
839        //   - other reserved words: only when incmdpos (or `}` exception)
840        if self.tok == LexTok::String {
841            if let Some(ref s) = self.tokstr {
842                if s == "{" && self.incmdpos {
843                    self.tok = LexTok::Inbrace;
844                } else if s == "}" {
845                    self.tok = LexTok::Outbrace;
846                } else if self.incasepat == 0 {
847                    // Skip reserved word checking in case pattern context —
848                    // words like `time`, `end` should be patterns, not
849                    // keywords.
850                    self.check_reserved_word();
851                }
852            }
853        }
854
855        // If we were expecting a heredoc terminator, register it now
856        if self.heredoc_pending > 0 && self.tok == LexTok::String {
857            if let Some(ref terminator) = self.tokstr {
858                let strip_tabs = self.heredoc_pending == 2;
859                // Detect originally-quoted terminator (`<<'EOF'`,
860                // `<<"EOF"`). The lexer wraps single-quoted text in
861                // SNULL (`\u{9d}`) and double-quoted text in DNULL
862                // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
863                // heredocs disable variable / command-sub / arithmetic
864                // expansion in the body — see `compile_redir` for the
865                // expansion side.
866                // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
867                // disable expansion in the body. SNULL/DNULL mark
868                // single/double-quoted spans; BNULL (`\u{9f}`) marks
869                // any backslash-escaped char — its presence alone is
870                // enough to flag the terminator as quoted (zsh's
871                // `<<\EOF` shorthand for `<<'EOF'`).
872                let quoted = terminator.contains('\u{9d}')
873                    || terminator.contains('\u{9e}')
874                    || terminator.contains('\u{9f}')
875                    || terminator.starts_with('\'')
876                    || terminator.starts_with('"');
877                let term = terminator
878                    .chars()
879                    .filter(|c| {
880                        *c != '\''
881                            && *c != '"'
882                            && *c != '\u{9d}'
883                            && *c != '\u{9e}'
884                            && *c != '\u{9f}'
885                    })
886                    .collect::<String>();
887                self.heredocs.push(HereDoc {
888                    terminator: term,
889                    strip_tabs,
890                    content: String::new(),
891                    quoted,
892                    processed: false,
893                });
894            }
895            self.heredoc_pending = 0;
896        }
897
898        // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
899        if self.incond > 0 {
900            if let Some(ref s) = self.tokstr {
901                // Check if this token is a comparison operator
902                // Note: single = is also a comparison operator in [[ ]]
903                // The internal marker \u{8d} is used for =
904                if s == "="
905                    || s == "=="
906                    || s == "!="
907                    || s == "=~"
908                    || s == "\u{8d}"
909                    || s == "\u{8d}\u{8d}"
910                    || s == "!\u{8d}"
911                    || s == "\u{8d}~"
912                    || s == "\u{8d}\u{98}"
913                {
914                    self.incondpat = true;
915                } else if self.incondpat {
916                    // We were in pattern context, now we've consumed the pattern
917                    // Reset after the pattern token is consumed
918                    // But actually, pattern can span multiple tokens, so we should
919                    // stay in pattern mode until ]] or && or ||
920                }
921            }
922            // Reset pattern context on ]] or logical operators (&&, ||)
923            // and grouping parens. zsh par_cond_3 (cond.c) treats
924            // these as cond-pattern terminators — the next operand is
925            // a fresh primary, NOT a continuation of the prior pattern.
926            // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
927            // after `[[ a == a && (b == b ... ` was lexed as a literal
928            // glob char (incondpat=true → gettokstr) and the whole
929            // remainder collapsed into one String token.
930            match self.tok {
931                LexTok::Doutbrack
932                | LexTok::Damper
933                | LexTok::Dbar
934                | LexTok::Inpar
935                | LexTok::Outpar
936                | LexTok::Bang => {
937                    self.incondpat = false;
938                }
939                _ => {}
940            }
941        } else {
942            self.incondpat = false;
943        }
944
945        // Update command position for next token based on current token
946        // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
947        // so we don't set incmdpos after Bar in that context
948        match self.tok {
949            LexTok::Seper
950            | LexTok::Newlin
951            | LexTok::Semi
952            | LexTok::Dsemi
953            | LexTok::Semiamp
954            | LexTok::Semibar
955            | LexTok::Amper
956            | LexTok::Amperbang
957            | LexTok::Inpar
958            | LexTok::Inbrace
959            | LexTok::Dbar
960            | LexTok::Damper
961            | LexTok::Baramp
962            | LexTok::Inoutpar
963            | LexTok::Doloop
964            | LexTok::Then
965            | LexTok::Elif
966            | LexTok::Else
967            | LexTok::Doutbrack
968            | LexTok::Func => {
969                self.incmdpos = true;
970            }
971            LexTok::Bar
972                // In case patterns, | is a pattern separator - don't change incmdpos
973                if self.incasepat <= 0 => {
974                    self.incmdpos = true;
975                }
976            LexTok::String
977            | LexTok::Typeset
978            | LexTok::Envarray
979            | LexTok::Outpar
980            | LexTok::Case
981            | LexTok::Dinbrack => {
982                self.incmdpos = false;
983            }
984            _ => {}
985        }
986
987        // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
988        // When we see 'for', set infor=2 to expect the init and cond parts
989        // Each Dinpar (after semicolon in arithmetic) decrements it
990        if self.tok != LexTok::Dinpar {
991            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
992        }
993
994
995        // Handle redirection / for-loop context. Mirrors lex.c:359-368
996        // ctxtlex `oldpos` save/restore. The saved value lives in
997        // `self.oldpos` (struct field) so it survives across zshlex
998        // calls — the previous local `let oldpos = self.incmdpos`
999        // captured the JUST-updated value (always wrong) and lost the
1000        // pre-FOR incmdpos. With the field, FOR x → STRING x → INPAR
1001        // sequence correctly restores incmdpos=1 before the `(`.
1002        if self.tok.is_redirop()
1003            || self.tok == LexTok::For
1004            || self.tok == LexTok::Foreach
1005            || self.tok == LexTok::Select
1006        {
1007            self.inredir = true;
1008            self.oldpos = self.incmdpos;
1009            self.incmdpos = false;
1010        } else if self.inredir {
1011            self.incmdpos = self.oldpos;
1012            self.inredir = false;
1013        }
1014    }
1015
1016    /// Process pending here-documents. Walks each heredoc whose body
1017    /// hasn't been filled yet (content is empty AND terminator is set),
1018    /// reads lines from input until the terminator, and stuffs the body
1019    /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1020    /// parser can index into it after parse() finishes.
1021    fn process_heredocs(&mut self) {
1022        let n = self.heredocs.len();
1023        for i in 0..n {
1024            // Skip heredocs we've already processed AND those without
1025            // a terminator (early-error case). The `processed` bool
1026            // distinguishes "filled with empty body" from "not yet
1027            // visited" — both have empty `content`.
1028            if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1029                continue;
1030            }
1031            let strip_tabs = self.heredocs[i].strip_tabs;
1032            let terminator = self.heredocs[i].terminator.clone();
1033            let mut content = String::new();
1034            let mut line_count = 0;
1035
1036            loop {
1037                line_count += 1;
1038                if line_count > 10000 {
1039                    self.error = Some("heredoc exceeded 10000 lines".to_string());
1040                    self.tok = LexTok::Lexerr;
1041                    return;
1042                }
1043
1044                let line = self.read_line();
1045                if line.is_none() {
1046                    self.error = Some("here document too large or unterminated".to_string());
1047                    self.tok = LexTok::Lexerr;
1048                    return;
1049                }
1050
1051                let line = line.unwrap();
1052                let check_line = if strip_tabs {
1053                    line.trim_start_matches('\t')
1054                } else {
1055                    line.as_str()
1056                };
1057
1058                if check_line.trim_end_matches('\n') == terminator {
1059                    break;
1060                }
1061
1062                // `<<-` strips leading tabs from BODY lines too, not just
1063                // from terminator-match comparison. Without this, tabs in
1064                // here-doc content survive into stdin.
1065                if strip_tabs {
1066                    content.push_str(check_line);
1067                } else {
1068                    content.push_str(&line);
1069                }
1070            }
1071
1072            self.heredocs[i].content = content;
1073            self.heredocs[i].processed = true;
1074        }
1075    }
1076
1077    /// Read a line from input (returns partial line at EOF)
1078    fn read_line(&mut self) -> Option<String> {
1079        let mut line = String::new();
1080
1081        loop {
1082            match self.hgetc() {
1083                Some(c) => {
1084                    line.push(c);
1085                    if c == '\n' {
1086                        break;
1087                    }
1088                }
1089                None => {
1090                    // EOF - return partial line if any
1091                    if line.is_empty() {
1092                        return None;
1093                    }
1094                    break;
1095                }
1096            }
1097        }
1098
1099        Some(line)
1100    }
1101
1102    /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1103    /// `gettok`. Reads characters from the input via hgetc, dispatches
1104    /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1105    /// uses inline `match` in lex_initial / lex_inang / lex_outang
1106    /// since Rust pattern-matching subsumes the table dispatch).
1107    ///
1108    /// Structural divergence from C: the giant ~322-line C switch
1109    /// statement at lex.c:725-936 is split into helper methods in
1110    /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1111    /// lex_inang / lex_outang for the < and > arms). The flow is
1112    /// equivalent — same chars consumed, same tokens emitted — but
1113    /// the source-level layout differs. C's table-driven dispatch
1114    /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1115    /// which is what the helpers ultimately do.
1116    fn gettok(&mut self) -> LexTok {
1117        // lex.c:621 — `tokstr = NULL;` reset before each token.
1118        self.tokstr = None;
1119        // (zshrs-specific: tokfd reset lives here too — C does it
1120        // implicitly via the `peekfd = -1` local at lex.c:617 used
1121        // only when a digit-prefix redirection is detected.)
1122        self.tokfd = -1;
1123
1124        // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1125        // skip leading blanks (space/tab, NOT newline).
1126        let mut ws_iterations = 0;
1127        loop {
1128            ws_iterations += 1;
1129            if ws_iterations > 100_000 {
1130                self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1131                return LexTok::Lexerr;
1132            }
1133            let c = match self.hgetc() {
1134                Some(c) => c,
1135                None => {
1136                    // lex.c:624-625 — lexstop set, return ENDINPUT
1137                    // (or LEXERR if errflag is set elsewhere).
1138                    self.lexstop = true;
1139                    return if self.error.is_some() {
1140                        LexTok::Lexerr
1141                    } else {
1142                        LexTok::Endinput
1143                    };
1144                }
1145            };
1146
1147            if !Self::is_blank(c) {
1148                self.hungetc(c);
1149                break;
1150            }
1151        }
1152
1153        let c = match self.hgetc() {
1154            Some(c) => c,
1155            None => {
1156                self.lexstop = true;
1157                return LexTok::Endinput;
1158            }
1159        };
1160
1161        // lex.c:623 — `toklineno = lineno;`
1162        self.toklineno = self.lineno;
1163        // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1164        // blank.
1165        self.isfirstln = false;
1166
1167        // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1168        // call dquote_parse with `;` or `)` as the end-char and
1169        // either return DINPAR (continue for-loop arith) or DOUTPAR
1170        // (close the arith block) or LEXERR.
1171        if self.dbparens {
1172            return self.lex_arith(c);
1173        }
1174
1175        // lex.c:649-668 — digit prefix on a redirection: `2> file`
1176        // treats `2` as the fd to redirect, not a literal arg. Three
1177        // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1178        // anything else (push back, treat as literal digit).
1179        if Self::is_digit(c) {
1180            let d = self.hgetc();
1181            match d {
1182                Some('&') => {
1183                    let e = self.hgetc();
1184                    if e == Some('>') {
1185                        // lex.c:653-657 — `N&>` shape detected.
1186                        self.tokfd = (c as u8 - b'0') as i32;
1187                        self.hungetc('>');
1188                        return self.lex_initial('&');
1189                    }
1190                    // lex.c:658-661 — not `N&>`, push everything back.
1191                    if let Some(e) = e {
1192                        self.hungetc(e);
1193                    }
1194                    self.hungetc('&');
1195                }
1196                Some('>') | Some('<') => {
1197                    // lex.c:662-664 — `N>` or `N<` shape detected.
1198                    self.tokfd = (c as u8 - b'0') as i32;
1199                    return self.lex_initial(d.unwrap());
1200                }
1201                Some(d) => {
1202                    // lex.c:665-668 — not a redir prefix, push back.
1203                    self.hungetc(d);
1204                }
1205                None => {}
1206            }
1207            self.lexstop = false;
1208        }
1209
1210        // lex.c:670-936 — main dispatch on the leading char. zshrs
1211        // delegates to lex_initial which holds the equivalent of
1212        // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1213        // for LX1_OTHER.
1214        self.lex_initial(c)
1215    }
1216
1217    /// Lex (( ... )) arithmetic expression
1218    fn lex_arith(&mut self, c: char) -> LexTok {
1219        self.lexbuf.clear();
1220        self.hungetc(c);
1221
1222        let end_char = if self.infor > 0 { ';' } else { ')' };
1223        if self.dquote_parse(end_char, false).is_err() {
1224            return LexTok::Lexerr;
1225        }
1226
1227        self.tokstr = Some(self.lexbuf.as_str().to_string());
1228
1229        if !self.lexstop && self.infor > 0 {
1230            self.infor -= 1;
1231            return LexTok::Dinpar;
1232        }
1233
1234        // Check for closing ))
1235        match self.hgetc() {
1236            Some(')') => {
1237                self.dbparens = false;
1238                LexTok::Doutpar
1239            }
1240            c => {
1241                if let Some(c) = c {
1242                    self.hungetc(c);
1243                }
1244                LexTok::Lexerr
1245            }
1246        }
1247    }
1248
1249    /// Handle initial character of token
1250    fn lex_initial(&mut self, c: char) -> LexTok {
1251        // Handle comments
1252        if c == '#' && !self.nocomments {
1253            return self.lex_comment();
1254        }
1255
1256        match c {
1257            '\\' => {
1258                let d = self.hgetc();
1259                if d == Some('\n') {
1260                    // Line continuation - get next token
1261                    return self.gettok();
1262                }
1263                if let Some(d) = d {
1264                    self.hungetc(d);
1265                }
1266                self.lexstop = false;
1267                self.gettokstr(c, false)
1268            }
1269
1270            '\n' => LexTok::Newlin,
1271
1272            ';' => {
1273                let d = self.hgetc();
1274                match d {
1275                    Some(';') => LexTok::Dsemi,
1276                    Some('&') => LexTok::Semiamp,
1277                    Some('|') => LexTok::Semibar,
1278                    _ => {
1279                        if let Some(d) = d {
1280                            self.hungetc(d);
1281                        }
1282                        self.lexstop = false;
1283                        LexTok::Semi
1284                    }
1285                }
1286            }
1287
1288            '&' => {
1289                let d = self.hgetc();
1290                match d {
1291                    Some('&') => LexTok::Damper,
1292                    Some('!') | Some('|') => LexTok::Amperbang,
1293                    Some('>') => {
1294                        self.tokfd = self.tokfd.max(0);
1295                        let e = self.hgetc();
1296                        match e {
1297                            Some('!') | Some('|') => LexTok::Outangampbang,
1298                            Some('>') => {
1299                                let f = self.hgetc();
1300                                match f {
1301                                    Some('!') | Some('|') => LexTok::Doutangampbang,
1302                                    _ => {
1303                                        if let Some(f) = f {
1304                                            self.hungetc(f);
1305                                        }
1306                                        self.lexstop = false;
1307                                        LexTok::Doutangamp
1308                                    }
1309                                }
1310                            }
1311                            _ => {
1312                                if let Some(e) = e {
1313                                    self.hungetc(e);
1314                                }
1315                                self.lexstop = false;
1316                                LexTok::Ampoutang
1317                            }
1318                        }
1319                    }
1320                    _ => {
1321                        if let Some(d) = d {
1322                            self.hungetc(d);
1323                        }
1324                        self.lexstop = false;
1325                        LexTok::Amper
1326                    }
1327                }
1328            }
1329
1330            '|' => {
1331                let d = self.hgetc();
1332                match d {
1333                    Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1334                    Some('&') => LexTok::Baramp,
1335                    _ => {
1336                        if let Some(d) = d {
1337                            self.hungetc(d);
1338                        }
1339                        self.lexstop = false;
1340                        LexTok::Bar
1341                    }
1342                }
1343            }
1344
1345            '(' => {
1346                let d = self.hgetc();
1347                match d {
1348                    Some('(') => {
1349                        if self.infor > 0 {
1350                            self.dbparens = true;
1351                            return LexTok::Dinpar;
1352                        }
1353                        if self.incmdpos {
1354                            // Could be (( arithmetic )) or ( subshell )
1355                            self.lexbuf.clear();
1356                            match self.cmd_or_math() {
1357                                CmdOrMath::Math => {
1358                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
1359                                    return LexTok::Dinpar;
1360                                }
1361                                CmdOrMath::Cmd => {
1362                                    self.tokstr = None;
1363                                    return LexTok::Inpar;
1364                                }
1365                                CmdOrMath::Err => return LexTok::Lexerr,
1366                            }
1367                        }
1368                        self.hungetc('(');
1369                        self.lexstop = false;
1370                        self.gettokstr('(', false)
1371                    }
1372                    Some(')') => LexTok::Inoutpar,
1373                    _ => {
1374                        if let Some(d) = d {
1375                            self.hungetc(d);
1376                        }
1377                        self.lexstop = false;
1378                        // Per lex.c:822 LX1_INPAR — at word boundary `(`
1379                        // tokenizes as INPAR when SHGLOB || incond==1 ||
1380                        // incmdpos. Otherwise falls through to gettokstr
1381                        // (the `(` becomes start of a STRING — typical
1382                        // for unquoted glob args like `ls (^foo)*`).
1383                        // For `for x ( ... )` form, incmdpos is restored
1384                        // to 1 via the oldpos-save-after-FOR mechanism,
1385                        // so the next-token `(` correctly INPAR-izes.
1386                        if self.incond == 1 || self.incmdpos || self.incasepat >= 1 {
1387                            LexTok::Inpar
1388                        } else {
1389                            self.gettokstr('(', false)
1390                        }
1391                    }
1392                }
1393            }
1394
1395            ')' => LexTok::Outpar,
1396
1397            '{' => {
1398                // { is a command group only if followed by whitespace,
1399                // newline, or `}` (the empty-block form `{}`). zsh
1400                // treats `{}` as an empty compound — `foo() {}` is a
1401                // valid no-op function. Without `}` in this list,
1402                // `{}` got consumed as one literal token and ran as a
1403                // command, failing "command not found: {}".
1404                // The empty `{}` is also recognised AFTER a function
1405                // header `name()` even when `incmdpos` got cleared by
1406                // the preceding Outpar — peek for `}` regardless and
1407                // treat as Inbrace so `foo() {}` parses as a no-op
1408                // function body.
1409                let next = self.hgetc();
1410                let next_is_close = matches!(next, Some('}'));
1411                if self.incmdpos {
1412                    let is_brace_group = match next {
1413                        Some(' ') | Some('\t') | Some('\n') | Some('}') | None => true,
1414                        _ => false,
1415                    };
1416                    if let Some(ch) = next {
1417                        self.hungetc(ch);
1418                    }
1419                    if is_brace_group {
1420                        self.tokstr = Some("{".to_string());
1421                        LexTok::Inbrace
1422                    } else {
1423                        self.gettokstr(c, false)
1424                    }
1425                } else if next_is_close {
1426                    // `{}` empty block in non-cmd position (function
1427                    // body after `()`). Treat as Inbrace; the parser
1428                    // will follow with Outbrace.
1429                    if let Some(ch) = next {
1430                        self.hungetc(ch);
1431                    }
1432                    self.tokstr = Some("{".to_string());
1433                    LexTok::Inbrace
1434                } else {
1435                    if let Some(ch) = next {
1436                        self.hungetc(ch);
1437                    }
1438                    self.gettokstr(c, false)
1439                }
1440            }
1441
1442            '}' => {
1443                // } at start of token is always Outbrace (ends command group)
1444                // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1445                self.tokstr = Some("}".to_string());
1446                LexTok::Outbrace
1447            }
1448
1449            '[' => {
1450                // [[ is a conditional expression start
1451                // [ can also be a command (test builtin) or array subscript
1452                // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1453                if self.incasepat > 0 {
1454                    self.gettokstr(c, false)
1455                } else if self.incmdpos {
1456                    let next = self.hgetc();
1457                    if next == Some('[') {
1458                        // [[ - double bracket conditional
1459                        self.tokstr = Some("[[".to_string());
1460                        self.incond = 1;
1461                        return LexTok::Dinbrack;
1462                    }
1463                    // Single [ - either test command or start of glob pattern
1464                    if let Some(ch) = next {
1465                        self.hungetc(ch);
1466                    }
1467                    self.tokstr = Some("[".to_string());
1468                    LexTok::String
1469                } else {
1470                    self.gettokstr(c, false)
1471                }
1472            }
1473
1474            ']' => {
1475                // ]] ends a conditional expression started by [[
1476                if self.incond > 0 {
1477                    let next = self.hgetc();
1478                    if next == Some(']') {
1479                        self.tokstr = Some("]]".to_string());
1480                        self.incond = 0;
1481                        return LexTok::Doutbrack;
1482                    }
1483                    if let Some(ch) = next {
1484                        self.hungetc(ch);
1485                    }
1486                }
1487                self.gettokstr(c, false)
1488            }
1489
1490            '<' => {
1491                // In pattern context, < is literal (e.g., <-> in glob)
1492                if self.incondpat || self.incasepat > 0 {
1493                    self.gettokstr(c, false)
1494                } else {
1495                    self.lex_inang()
1496                }
1497            }
1498
1499            '>' => {
1500                // In pattern context, > is literal
1501                if self.incondpat || self.incasepat > 0 {
1502                    self.gettokstr(c, false)
1503                } else {
1504                    self.lex_outang()
1505                }
1506            }
1507
1508            _ => self.gettokstr(c, false),
1509        }
1510    }
1511
1512    /// Lex comment
1513    fn lex_comment(&mut self) -> LexTok {
1514        if self.lexflags.comments_keep {
1515            self.lexbuf.clear();
1516            self.add('#');
1517        }
1518
1519        loop {
1520            let c = self.hgetc();
1521            match c {
1522                Some('\n') | None => break,
1523                Some(c) => {
1524                    if self.lexflags.comments_keep {
1525                        self.add(c);
1526                    }
1527                }
1528            }
1529        }
1530
1531        if self.lexflags.comments_keep {
1532            self.tokstr = Some(self.lexbuf.as_str().to_string());
1533            if !self.lexstop {
1534                self.hungetc('\n');
1535            }
1536            return LexTok::String;
1537        }
1538
1539        if self.lexflags.comments_strip && self.lexstop {
1540            return LexTok::Endinput;
1541        }
1542
1543        LexTok::Newlin
1544    }
1545
1546    /// Lex < and variants
1547    fn lex_inang(&mut self) -> LexTok {
1548        let d = self.hgetc();
1549        match d {
1550            Some('(') => {
1551                // Process substitution <(...)
1552                self.hungetc('(');
1553                self.lexstop = false;
1554                self.gettokstr('<', false)
1555            }
1556            Some('>') => LexTok::Inoutang,
1557            Some('<') => {
1558                let e = self.hgetc();
1559                match e {
1560                    Some('(') => {
1561                        self.hungetc('(');
1562                        self.hungetc('<');
1563                        LexTok::Inang
1564                    }
1565                    Some('<') => LexTok::Trinang,
1566                    Some('-') => {
1567                        self.heredoc_pending = 2; // <<- expects terminator next
1568                        LexTok::Dinangdash
1569                    }
1570                    _ => {
1571                        if let Some(e) = e {
1572                            self.hungetc(e);
1573                        }
1574                        self.lexstop = false;
1575                        self.heredoc_pending = 1; // << expects terminator next
1576                        LexTok::Dinang
1577                    }
1578                }
1579            }
1580            Some('&') => LexTok::Inangamp,
1581            _ => {
1582                if let Some(d) = d {
1583                    self.hungetc(d);
1584                }
1585                self.lexstop = false;
1586                LexTok::Inang
1587            }
1588        }
1589    }
1590
1591    /// Lex > and variants
1592    fn lex_outang(&mut self) -> LexTok {
1593        let d = self.hgetc();
1594        match d {
1595            Some('(') => {
1596                // Process substitution >(...)
1597                self.hungetc('(');
1598                self.lexstop = false;
1599                self.gettokstr('>', false)
1600            }
1601            Some('&') => {
1602                let e = self.hgetc();
1603                match e {
1604                    Some('!') | Some('|') => LexTok::Outangampbang,
1605                    _ => {
1606                        if let Some(e) = e {
1607                            self.hungetc(e);
1608                        }
1609                        self.lexstop = false;
1610                        LexTok::Outangamp
1611                    }
1612                }
1613            }
1614            Some('!') | Some('|') => LexTok::Outangbang,
1615            Some('>') => {
1616                let e = self.hgetc();
1617                match e {
1618                    Some('&') => {
1619                        let f = self.hgetc();
1620                        match f {
1621                            Some('!') | Some('|') => LexTok::Doutangampbang,
1622                            _ => {
1623                                if let Some(f) = f {
1624                                    self.hungetc(f);
1625                                }
1626                                self.lexstop = false;
1627                                LexTok::Doutangamp
1628                            }
1629                        }
1630                    }
1631                    Some('!') | Some('|') => LexTok::Doutangbang,
1632                    Some('(') => {
1633                        self.hungetc('(');
1634                        self.hungetc('>');
1635                        LexTok::Outang
1636                    }
1637                    _ => {
1638                        if let Some(e) = e {
1639                            self.hungetc(e);
1640                        }
1641                        self.lexstop = false;
1642                        LexTok::Doutang
1643                    }
1644                }
1645            }
1646            _ => {
1647                if let Some(d) = d {
1648                    self.hungetc(d);
1649                }
1650                self.lexstop = false;
1651                LexTok::Outang
1652            }
1653        }
1654    }
1655
1656    /// Get rest of token string
1657    fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
1658        let mut bct = 0; // brace count
1659        let mut pct = 0; // parenthesis count
1660        let mut brct = 0; // bracket count
1661        let mut in_brace_param = 0;
1662        let mut peek = LexTok::String;
1663        let mut intpos = 1;
1664        let mut unmatched = '\0';
1665        let mut c = c;
1666        const MAX_ITERATIONS: usize = 100_000;
1667        let mut iterations = 0;
1668
1669        if !sub {
1670            self.lexbuf.clear();
1671        }
1672
1673        loop {
1674            iterations += 1;
1675            if iterations > MAX_ITERATIONS {
1676                self.error = Some("gettokstr exceeded maximum iterations".to_string());
1677                return LexTok::Lexerr;
1678            }
1679
1680            let inbl = Self::is_inblank(c);
1681
1682            if inbl && in_brace_param == 0 && pct == 0 {
1683                // Whitespace outside brace param ends token
1684                break;
1685            }
1686
1687            match c {
1688                // Whitespace is handled above for most cases
1689                ')' => {
1690                    if in_brace_param > 0 || sub {
1691                        self.add(char_tokens::OUTPAR);
1692                    } else if pct > 0 {
1693                        pct -= 1;
1694                        self.add(char_tokens::OUTPAR);
1695                    } else {
1696                        break;
1697                    }
1698                }
1699
1700                '|' => {
1701                    if pct == 0 && in_brace_param == 0 {
1702                        if sub {
1703                            self.add(c);
1704                        } else {
1705                            break;
1706                        }
1707                    } else {
1708                        self.add(char_tokens::BAR);
1709                    }
1710                }
1711
1712                '$' => {
1713                    let e = self.hgetc();
1714                    match e {
1715                        Some('\\') => {
1716                            let f = self.hgetc();
1717                            if f != Some('\n') {
1718                                if let Some(f) = f {
1719                                    self.hungetc(f);
1720                                }
1721                                self.hungetc('\\');
1722                                self.add(char_tokens::STRING);
1723                            } else {
1724                                // Line continuation after $
1725                                continue;
1726                            }
1727                        }
1728                        Some('[') => {
1729                            // $[...] arithmetic
1730                            self.add(char_tokens::STRING);
1731                            self.add(char_tokens::INBRACK);
1732                            if self.dquote_parse(']', sub).is_err() {
1733                                peek = LexTok::Lexerr;
1734                                break;
1735                            }
1736                            self.add(char_tokens::OUTBRACK);
1737                        }
1738                        Some('(') => {
1739                            // $(...) or $((...))
1740                            self.add(char_tokens::STRING);
1741                            match self.cmd_or_math_sub() {
1742                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
1743                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
1744                                CmdOrMath::Err => {
1745                                    peek = LexTok::Lexerr;
1746                                    break;
1747                                }
1748                            }
1749                        }
1750                        Some('{') => {
1751                            self.add(c);
1752                            self.add(char_tokens::INBRACE);
1753                            bct += 1;
1754                            if in_brace_param == 0 {
1755                                in_brace_param = bct;
1756                            }
1757                        }
1758                        Some('\'') => {
1759                            // $'...' ANSI-C escape syntax. Inside, `\X`
1760                            // sequences are escapes (`\n`, `\t`, `\x1b`,
1761                            // `\'` for literal apostrophe, `\\` for
1762                            // backslash). Lexer captures the raw form
1763                            // wrapped in QSTRING/SNULL markers; later
1764                            // expansion decodes the escapes. zsh's
1765                            // analogue lives in lex.c gettokstr's
1766                            // LX2_QUOTE branch when prev char was `$`.
1767                            self.add(char_tokens::QSTRING);
1768                            self.add(char_tokens::SNULL);
1769                            loop {
1770                                let ch = self.hgetc();
1771                                match ch {
1772                                    Some('\'') => break,
1773                                    Some('\\') => {
1774                                        // `\X` — store both chars literally;
1775                                        // expansion handles the actual escape.
1776                                        self.add(char_tokens::BNULL);
1777                                        match self.hgetc() {
1778                                            Some(n) => self.add(n),
1779                                            None => {
1780                                                self.lexstop = true;
1781                                                unmatched = '\'';
1782                                                peek = LexTok::Lexerr;
1783                                                break;
1784                                            }
1785                                        }
1786                                    }
1787                                    Some(ch) => self.add(ch),
1788                                    None => {
1789                                        self.lexstop = true;
1790                                        unmatched = '\'';
1791                                        peek = LexTok::Lexerr;
1792                                        break;
1793                                    }
1794                                }
1795                            }
1796                            if unmatched != '\0' {
1797                                break;
1798                            }
1799                            self.add(char_tokens::SNULL);
1800                        }
1801                        Some('"') => {
1802                            // $"..." localized string. Same shape as a
1803                            // plain "..." but flagged via QSTRING+DNULL
1804                            // so post-lex translation can substitute.
1805                            self.add(char_tokens::QSTRING);
1806                            self.add(char_tokens::DNULL);
1807                            if self.dquote_parse('"', sub).is_err() {
1808                                peek = LexTok::Lexerr;
1809                                break;
1810                            }
1811                            self.add(char_tokens::DNULL);
1812                        }
1813                        _ => {
1814                            if let Some(e) = e {
1815                                self.hungetc(e);
1816                            }
1817                            self.lexstop = false;
1818                            self.add(char_tokens::STRING);
1819                        }
1820                    }
1821                }
1822
1823                '[' => {
1824                    if in_brace_param == 0 {
1825                        brct += 1;
1826                    }
1827                    self.add(char_tokens::INBRACK);
1828                }
1829
1830                ']' => {
1831                    if in_brace_param == 0 && brct > 0 {
1832                        brct -= 1;
1833                    }
1834                    self.add(char_tokens::OUTBRACK);
1835                }
1836
1837                '(' => {
1838                    // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
1839                    // a STRING and is immediately followed by `)`, the
1840                    // string terminates at the `(`. The `()` is then
1841                    // re-lexed as a separate INOUTPAR token. This handles
1842                    // function definitions: `name()` lexes as STRING `name`
1843                    // + INOUTPAR `()`, not STRING `name()`.
1844                    //
1845                    // Also (lex.c:1109-1112): under SHGLOB, a `(` followed
1846                    // by whitespace at the start of a command-position word
1847                    // (no nested brackets/braces) is a ksh function
1848                    // definition signal — same break-out behavior.
1849                    if in_brace_param == 0 && !sub {
1850                        let e = self.hgetc();
1851                        if let Some(ch) = e {
1852                            self.hungetc(ch);
1853                        }
1854                        self.lexstop = false;
1855                        if e == Some(')') {
1856                            // `name()` — terminate STRING at `(` so the
1857                            // following `()` re-lexes as INOUTPAR. The
1858                            // loop's exit guard at line 2067 will
1859                            // `hungetc(c)` to push the `(` back; we only
1860                            // need to ensure `)` is also there. The
1861                            // hungetc(ch) above already pushed `)`, so
1862                            // breaking here yields unget_buf = [`(`, `)`]
1863                            // after the guard, which the outer dispatch
1864                            // reads as Inoutpar.
1865                            break;
1866                        }
1867                    }
1868                    if in_brace_param == 0 {
1869                        pct += 1;
1870                    }
1871                    self.add(char_tokens::INPAR);
1872                }
1873
1874                '{' => {
1875                    // Track braces for both ${...} param expansion and {...} brace expansion
1876                    bct += 1;
1877                    self.add(c);
1878                }
1879
1880                '}' => {
1881                    if in_brace_param > 0 {
1882                        if bct == in_brace_param {
1883                            in_brace_param = 0;
1884                        }
1885                        bct -= 1;
1886                        self.add(char_tokens::OUTBRACE);
1887                    } else if bct > 0 {
1888                        // Closing a brace expansion like {a,b}
1889                        bct -= 1;
1890                        self.add(c);
1891                    } else {
1892                        break;
1893                    }
1894                }
1895
1896                '>' => {
1897                    // In pattern context (incondpat), > is literal
1898                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1899                        self.add(c);
1900                    } else {
1901                        let e = self.hgetc();
1902                        if e != Some('(') {
1903                            if let Some(e) = e {
1904                                self.hungetc(e);
1905                            }
1906                            self.lexstop = false;
1907                            break;
1908                        }
1909                        // >(...)
1910                        self.add(char_tokens::OUTANGPROC);
1911                        if self.skip_command_sub().is_err() {
1912                            peek = LexTok::Lexerr;
1913                            break;
1914                        }
1915                        self.add(char_tokens::OUTPAR);
1916                    }
1917                }
1918
1919                '<' => {
1920                    // In pattern context (incondpat), < is literal
1921                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1922                        self.add(c);
1923                    } else if let Some(range_chars) = self.try_numeric_range_glob() {
1924                        // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
1925                        // `<-M>`. When `<` mid-word matches that exact
1926                        // shape, swallow it into the word instead of
1927                        // breaking out for redirection.
1928                        self.add(c);
1929                        for ch in range_chars.chars() {
1930                            self.add(ch);
1931                        }
1932                    } else {
1933                        let e = self.hgetc();
1934                        if e != Some('(') {
1935                            if let Some(e) = e {
1936                                self.hungetc(e);
1937                            }
1938                            self.lexstop = false;
1939                            break;
1940                        }
1941                        // <(...)
1942                        self.add(char_tokens::INANG);
1943                        if self.skip_command_sub().is_err() {
1944                            peek = LexTok::Lexerr;
1945                            break;
1946                        }
1947                        self.add(char_tokens::OUTPAR);
1948                    }
1949                }
1950
1951                '=' => {
1952                    if !sub {
1953                        if intpos > 0 {
1954                            // At start of token, check for =(...) process substitution
1955                            let e = self.hgetc();
1956                            if e == Some('(') {
1957                                self.add(char_tokens::EQUALS);
1958                                if self.skip_command_sub().is_err() {
1959                                    peek = LexTok::Lexerr;
1960                                    break;
1961                                }
1962                                self.add(char_tokens::OUTPAR);
1963                            } else {
1964                                if let Some(e) = e {
1965                                    self.hungetc(e);
1966                                }
1967                                self.lexstop = false;
1968                                self.add(char_tokens::EQUALS);
1969                            }
1970                        } else if peek != LexTok::Envstring
1971                            && (self.incmdpos || self.intypeset)
1972                            && bct == 0
1973                            && brct == 0
1974                            && self.incasepat == 0
1975                        {
1976                            // Check for VAR=value assignment (but not in case pattern context)
1977                            let tok_so_far = self.lexbuf.as_str().to_string();
1978                            if self.is_valid_assignment_target(&tok_so_far) {
1979                                let next = self.hgetc();
1980                                if next == Some('(') {
1981                                    // VAR=(...) array assignment. Per zsh
1982                                    // (lex.c emits ENVARRAY with tokstr =
1983                                    // just the variable name, NOT
1984                                    // including the `=`). The `=` and
1985                                    // `(` are consumed by the lexer; the
1986                                    // parser knows ENVARRAY means assign-
1987                                    // array and reads the body that
1988                                    // follows.
1989                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
1990                                    return LexTok::Envarray;
1991                                }
1992                                if let Some(next) = next {
1993                                    self.hungetc(next);
1994                                }
1995                                self.lexstop = false;
1996                                peek = LexTok::Envstring;
1997                                intpos = 2;
1998                                self.add(char_tokens::EQUALS);
1999                            } else {
2000                                self.add(char_tokens::EQUALS);
2001                            }
2002                        } else {
2003                            self.add(char_tokens::EQUALS);
2004                        }
2005                    } else {
2006                        self.add(char_tokens::EQUALS);
2007                    }
2008                }
2009
2010                '\\' => {
2011                    let next = self.hgetc();
2012                    if next == Some('\n') {
2013                        // Line continuation
2014                        let next = self.hgetc();
2015                        if let Some(next) = next {
2016                            c = next;
2017                            continue;
2018                        }
2019                        break;
2020                    } else {
2021                        self.add(char_tokens::BNULL);
2022                        if let Some(next) = next {
2023                            self.add(next);
2024                        }
2025                    }
2026                }
2027
2028                '\'' => {
2029                    // Single quoted string - everything literal until '
2030                    self.add(char_tokens::SNULL);
2031                    loop {
2032                        let ch = self.hgetc();
2033                        match ch {
2034                            Some('\'') => break,
2035                            Some(ch) => self.add(ch),
2036                            None => {
2037                                self.lexstop = true;
2038                                unmatched = '\'';
2039                                peek = LexTok::Lexerr;
2040                                break;
2041                            }
2042                        }
2043                    }
2044                    if unmatched != '\0' {
2045                        break;
2046                    }
2047                    self.add(char_tokens::SNULL);
2048                }
2049
2050                '"' => {
2051                    // Double quoted string
2052                    self.add(char_tokens::DNULL);
2053                    if self.dquote_parse('"', sub).is_err() {
2054                        unmatched = '"';
2055                        if !self.lexflags.active {
2056                            peek = LexTok::Lexerr;
2057                        }
2058                        break;
2059                    }
2060                    self.add(char_tokens::DNULL);
2061                }
2062
2063                '`' => {
2064                    // Backtick command substitution
2065                    self.add(char_tokens::TICK);
2066                    loop {
2067                        let ch = self.hgetc();
2068                        match ch {
2069                            Some('`') => break,
2070                            Some('\\') => {
2071                                let next = self.hgetc();
2072                                match next {
2073                                    Some('\n') => continue, // Line continuation
2074                                    Some(c) if c == '`' || c == '\\' || c == '$' => {
2075                                        self.add(char_tokens::BNULL);
2076                                        self.add(c);
2077                                    }
2078                                    Some(c) => {
2079                                        self.add('\\');
2080                                        self.add(c);
2081                                    }
2082                                    None => break,
2083                                }
2084                            }
2085                            Some(ch) => self.add(ch),
2086                            None => {
2087                                self.lexstop = true;
2088                                unmatched = '`';
2089                                peek = LexTok::Lexerr;
2090                                break;
2091                            }
2092                        }
2093                    }
2094                    if unmatched != '\0' {
2095                        break;
2096                    }
2097                    self.add(char_tokens::TICK);
2098                }
2099
2100                '~' => {
2101                    self.add(char_tokens::TILDE);
2102                }
2103
2104                '#' => {
2105                    self.add(char_tokens::POUND);
2106                }
2107
2108                '^' => {
2109                    self.add(char_tokens::HAT);
2110                }
2111
2112                '*' => {
2113                    self.add(char_tokens::STAR);
2114                }
2115
2116                '?' => {
2117                    self.add(char_tokens::QUEST);
2118                }
2119
2120                ',' if bct > in_brace_param => {
2121                    self.add(char_tokens::COMMA);
2122                }
2123
2124                '-' => {
2125                    self.add(char_tokens::DASH);
2126                }
2127
2128                '!' if brct > 0 => {
2129                    self.add(char_tokens::BANG);
2130                }
2131
2132                // Terminators
2133                '\n' | ';' | '&' => {
2134                    break;
2135                }
2136
2137                _ => {
2138                    self.add(c);
2139                }
2140            }
2141
2142            c = match self.hgetc() {
2143                Some(c) => c,
2144                None => {
2145                    self.lexstop = true;
2146                    break;
2147                }
2148            };
2149
2150            if intpos > 0 {
2151                intpos -= 1;
2152            }
2153        }
2154
2155        // Put back the character that ended the token
2156        if !self.lexstop {
2157            self.hungetc(c);
2158        }
2159
2160        if unmatched != '\0' && !self.lexflags.active {
2161            self.error = Some(format!("unmatched {}", unmatched));
2162        }
2163
2164        if in_brace_param > 0 {
2165            self.error = Some("closing brace expected".to_string());
2166        }
2167
2168        self.tokstr = Some(self.lexbuf.as_str().to_string());
2169        peek
2170    }
2171
2172    /// Check if a string is a valid assignment target (identifier or array ref).
2173    ///
2174    /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2175    /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2176    /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2177    /// are caught by the subscript handler, not here). And the first char
2178    /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2179    /// the STRING token 0x85) is parameter substitution with the `=` flag,
2180    /// NOT an envstring assignment.
2181    fn is_valid_assignment_target(&self, s: &str) -> bool {
2182        let mut chars = s.chars().peekable();
2183
2184        // Reject leading token byte — `$VAR=` is parameter substitution,
2185        // not assignment. Same for `*=`, `?=`, etc.
2186        if let Some(&c) = chars.peek() {
2187            if char_tokens::is_token(c) {
2188                return false;
2189            }
2190        }
2191
2192        // Check for leading digit (invalid)
2193        if let Some(&c) = chars.peek() {
2194            if c.is_ascii_digit() {
2195                // Could be array index, check rest
2196                while let Some(&c) = chars.peek() {
2197                    if !c.is_ascii_digit() {
2198                        break;
2199                    }
2200                    chars.next();
2201                }
2202                return chars.peek().is_none();
2203            }
2204        }
2205
2206        // Check identifier
2207        let mut has_ident = false;
2208        while let Some(&c) = chars.peek() {
2209            if c == char_tokens::INBRACK || c == '[' {
2210                break;
2211            }
2212            if c == '+' {
2213                // foo+=value
2214                chars.next();
2215                return chars.peek().is_none() || chars.peek() == Some(&'=');
2216            }
2217            if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2218                return false;
2219            }
2220            has_ident = true;
2221            chars.next();
2222        }
2223
2224        has_ident
2225    }
2226
2227    /// Parse the body of a double-quoted string (or any context that
2228    /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2229    /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2230    /// chars until `endchar` is seen at depth 0, handling escapes,
2231    /// `${...}` parameter substitutions, `$(...)` and backtick command
2232    /// substitutions, `$((...))` arithmetic, and inner double-quoted
2233    /// strings. The `sub` flag toggles substitution-context tokens
2234    /// (lex.c:1487 `int sub` argument).
2235    ///
2236    /// zshrs port note: the recursion guard at the top is a Rust
2237    /// safety net; the C source relies on the runtime stack. Inner
2238    /// logic delegates to `dquote_parse_inner` which holds the actual
2239    /// per-char state machine matching lex.c:1495-1692.
2240    fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2241        self.recursion_depth += 1;
2242        if self.check_recursion() {
2243            self.recursion_depth -= 1;
2244            return Err(());
2245        }
2246
2247        let result = self.dquote_parse_inner(endchar, sub);
2248        self.recursion_depth -= 1;
2249        result
2250    }
2251
2252    fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2253        let mut pct = 0; // parenthesis count
2254        let mut brct = 0; // bracket count
2255        let mut bct = 0; // brace count (for ${...})
2256        let mut intick = false; // inside backtick
2257        let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
2258        const MAX_ITERATIONS: usize = 100_000;
2259        let mut iterations = 0;
2260
2261        loop {
2262            iterations += 1;
2263            if iterations > MAX_ITERATIONS {
2264                self.error = Some("dquote_parse exceeded maximum iterations".to_string());
2265                return Err(());
2266            }
2267            let c = self.hgetc();
2268            let c = match c {
2269                Some(c) if c == endchar && !intick && bct == 0 => {
2270                    if is_math && (pct > 0 || brct > 0) {
2271                        self.add(c);
2272                        if c == ')' {
2273                            pct -= 1;
2274                        } else if c == ']' {
2275                            brct -= 1;
2276                        }
2277                        continue;
2278                    }
2279                    return Ok(());
2280                }
2281                Some(c) => c,
2282                None => {
2283                    self.lexstop = true;
2284                    return Err(());
2285                }
2286            };
2287
2288            match c {
2289                '\\' => {
2290                    let next = self.hgetc();
2291                    match next {
2292                        Some('\n') if !sub => continue, // Line continuation
2293                        Some(c)
2294                            if c == '$'
2295                                || c == '\\'
2296                                || (c == '}' && !intick && bct > 0)
2297                                || c == endchar
2298                                || c == '`'
2299                                || (endchar == ']'
2300                                    && (c == '['
2301                                        || c == ']'
2302                                        || c == '('
2303                                        || c == ')'
2304                                        || c == '{'
2305                                        || c == '}'
2306                                        || (c == '"' && sub))) =>
2307                        {
2308                            self.add(char_tokens::BNULL);
2309                            self.add(c);
2310                        }
2311                        Some(c) => {
2312                            self.add('\\');
2313                            self.hungetc(c);
2314                            continue;
2315                        }
2316                        None => {
2317                            self.add('\\');
2318                        }
2319                    }
2320                }
2321
2322                '$' => {
2323                    if intick {
2324                        self.add(c);
2325                        continue;
2326                    }
2327                    let next = self.hgetc();
2328                    match next {
2329                        Some('(') => {
2330                            self.add(char_tokens::QSTRING);
2331                            match self.cmd_or_math_sub() {
2332                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
2333                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
2334                                CmdOrMath::Err => return Err(()),
2335                            }
2336                        }
2337                        Some('[') => {
2338                            self.add(char_tokens::STRING);
2339                            self.add(char_tokens::INBRACK);
2340                            self.dquote_parse(']', sub)?;
2341                            self.add(char_tokens::OUTBRACK);
2342                        }
2343                        Some('{') => {
2344                            self.add(char_tokens::QSTRING);
2345                            self.add(char_tokens::INBRACE);
2346                            bct += 1;
2347                        }
2348                        Some('$') => {
2349                            self.add(char_tokens::QSTRING);
2350                            self.add('$');
2351                        }
2352                        _ => {
2353                            if let Some(next) = next {
2354                                self.hungetc(next);
2355                            }
2356                            self.lexstop = false;
2357                            self.add(char_tokens::QSTRING);
2358                        }
2359                    }
2360                }
2361
2362                '}' => {
2363                    if intick || bct == 0 {
2364                        self.add(c);
2365                    } else {
2366                        self.add(char_tokens::OUTBRACE);
2367                        bct -= 1;
2368                    }
2369                }
2370
2371                '`' => {
2372                    self.add(char_tokens::QTICK);
2373                    intick = !intick;
2374                }
2375
2376                '(' => {
2377                    if !is_math || bct == 0 {
2378                        pct += 1;
2379                    }
2380                    self.add(c);
2381                }
2382
2383                ')' => {
2384                    if !is_math || bct == 0 {
2385                        if pct == 0 && is_math {
2386                            return Err(());
2387                        }
2388                        pct -= 1;
2389                    }
2390                    self.add(c);
2391                }
2392
2393                '[' => {
2394                    if !is_math || bct == 0 {
2395                        brct += 1;
2396                    }
2397                    self.add(c);
2398                }
2399
2400                ']' => {
2401                    if !is_math || bct == 0 {
2402                        if brct == 0 && is_math {
2403                            return Err(());
2404                        }
2405                        brct -= 1;
2406                    }
2407                    self.add(c);
2408                }
2409
2410                '"' => {
2411                    if intick || (endchar != '"' && bct == 0) {
2412                        self.add(c);
2413                    } else if bct > 0 {
2414                        self.add(char_tokens::DNULL);
2415                        self.dquote_parse('"', sub)?;
2416                        self.add(char_tokens::DNULL);
2417                    } else {
2418                        return Err(());
2419                    }
2420                }
2421
2422                _ => {
2423                    self.add(c);
2424                }
2425            }
2426        }
2427    }
2428
2429    /// Determine if (( is arithmetic or command
2430    /// Decide whether `( ... )` after a `$` is a math expression
2431    /// `$((...))` or a command substitution `$(...)`. Direct port of
2432    /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2433    /// if it succeeds AND the next char is `)` (closing the second
2434    /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2435    /// a command substitution.
2436    fn cmd_or_math(&mut self) -> CmdOrMath {
2437        let oldlen = self.lexbuf.len();
2438
2439        // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2440        // which fills lexbuf with ONLY the inner expression, then checks
2441        // for the closing `)`. The surrounding `((` / `))` are NOT added
2442        // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2443        // ')' after, polluting DINPAR's tokstr with the literal parens.
2444        // Removed to match C exactly.
2445        if self.dquote_parse(')', false).is_err() {
2446            // Back up and try as command
2447            while self.lexbuf.len() > oldlen {
2448                if let Some(c) = self.lexbuf.pop() {
2449                    self.hungetc(c);
2450                }
2451            }
2452            self.hungetc('(');
2453            self.lexstop = false;
2454            return if self.skip_command_sub().is_err() {
2455                CmdOrMath::Err
2456            } else {
2457                CmdOrMath::Cmd
2458            };
2459        }
2460
2461        // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2462        // means `((..))` was math. Don't add `)` to lexbuf.
2463        let c = self.hgetc();
2464        if c == Some(')') {
2465            return CmdOrMath::Math;
2466        }
2467
2468        // Not math, back up
2469        if let Some(c) = c {
2470            self.hungetc(c);
2471        }
2472        self.lexstop = false;
2473
2474        // Back up token
2475        while self.lexbuf.len() > oldlen {
2476            if let Some(c) = self.lexbuf.pop() {
2477                self.hungetc(c);
2478            }
2479        }
2480        self.hungetc('(');
2481
2482        if self.skip_command_sub().is_err() {
2483            CmdOrMath::Err
2484        } else {
2485            CmdOrMath::Cmd
2486        }
2487    }
2488
2489    /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2490    /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2491    /// the next char to discriminate: a leading `(` plus successful
2492    /// math parse via `cmd_or_math` → arithmetic substitution (with
2493    /// the open-paren retroactively rewritten to Inparmath); else
2494    /// command substitution via skip_command_sub.
2495    fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2496        const MAX_CONTINUATIONS: usize = 10_000;
2497        let mut continuations = 0;
2498
2499        loop {
2500            continuations += 1;
2501            if continuations > MAX_CONTINUATIONS {
2502                self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2503                return CmdOrMath::Err;
2504            }
2505
2506            let c = self.hgetc();
2507            if c == Some('\\') {
2508                let c2 = self.hgetc();
2509                if c2 != Some('\n') {
2510                    if let Some(c2) = c2 {
2511                        self.hungetc(c2);
2512                    }
2513                    self.hungetc('\\');
2514                    self.lexstop = false;
2515                    return if self.skip_command_sub().is_err() {
2516                        CmdOrMath::Err
2517                    } else {
2518                        CmdOrMath::Cmd
2519                    };
2520                }
2521                // Line continuation, try again (loop instead of recursion)
2522                continue;
2523            }
2524
2525            // Not a line continuation, process normally
2526            if c == Some('(') {
2527                // Might be $((...))
2528                let lexpos = self.lexbuf.len();
2529                self.add(char_tokens::INPAR);
2530                self.add('(');
2531
2532                if self.dquote_parse(')', false).is_ok() {
2533                    let c2 = self.hgetc();
2534                    if c2 == Some(')') {
2535                        self.add(')');
2536                        return CmdOrMath::Math;
2537                    }
2538                    if let Some(c2) = c2 {
2539                        self.hungetc(c2);
2540                    }
2541                }
2542
2543                // Not math, restore and parse as command
2544                while self.lexbuf.len() > lexpos {
2545                    if let Some(ch) = self.lexbuf.pop() {
2546                        self.hungetc(ch);
2547                    }
2548                }
2549                self.hungetc('(');
2550                self.lexstop = false;
2551            } else {
2552                if let Some(c) = c {
2553                    self.hungetc(c);
2554                }
2555                self.lexstop = false;
2556            }
2557
2558            return if self.skip_command_sub().is_err() {
2559                CmdOrMath::Err
2560            } else {
2561                CmdOrMath::Cmd
2562            };
2563        }
2564    }
2565
2566    /// Skip over `(...)` for command-style substitutions: `$(...)`,
2567    /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2568    /// `skipcomm`. Per the C source comment: "we'll parse the input
2569    /// until we find an unmatched closing parenthesis. However, we'll
2570    /// throw away the result of the parsing and just keep the string
2571    /// we've built up on the way."
2572    ///
2573    /// zshrs port note: the C source uses zcontext_save/restore +
2574    /// strinbeg/inpush to set up an isolated lex context for the
2575    /// throw-away parse. zshrs's standalone walker tracks paren
2576    /// depth directly without re-entering the parser. Same
2577    /// invariant: stops at the matching `)`.
2578    fn skip_command_sub(&mut self) -> Result<(), ()> {
2579        let mut pct = 1;
2580        let mut start = true;
2581        const MAX_ITERATIONS: usize = 100_000;
2582        let mut iterations = 0;
2583
2584        self.add(char_tokens::INPAR);
2585
2586        loop {
2587            iterations += 1;
2588            if iterations > MAX_ITERATIONS {
2589                self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2590                return Err(());
2591            }
2592
2593            let c = self.hgetc();
2594            let c = match c {
2595                Some(c) => c,
2596                None => {
2597                    self.lexstop = true;
2598                    return Err(());
2599                }
2600            };
2601
2602            let iswhite = Self::is_inblank(c);
2603
2604            match c {
2605                '(' => {
2606                    pct += 1;
2607                    self.add(c);
2608                }
2609                ')' => {
2610                    pct -= 1;
2611                    if pct == 0 {
2612                        return Ok(());
2613                    }
2614                    self.add(c);
2615                }
2616                '\\' => {
2617                    self.add(c);
2618                    if let Some(c) = self.hgetc() {
2619                        self.add(c);
2620                    }
2621                }
2622                '\'' => {
2623                    self.add(c);
2624                    loop {
2625                        let ch = self.hgetc();
2626                        match ch {
2627                            Some('\'') => {
2628                                self.add('\'');
2629                                break;
2630                            }
2631                            Some(ch) => self.add(ch),
2632                            None => {
2633                                self.lexstop = true;
2634                                return Err(());
2635                            }
2636                        }
2637                    }
2638                }
2639                '"' => {
2640                    self.add(c);
2641                    loop {
2642                        let ch = self.hgetc();
2643                        match ch {
2644                            Some('"') => {
2645                                self.add('"');
2646                                break;
2647                            }
2648                            Some('\\') => {
2649                                self.add('\\');
2650                                if let Some(ch) = self.hgetc() {
2651                                    self.add(ch);
2652                                }
2653                            }
2654                            Some(ch) => self.add(ch),
2655                            None => {
2656                                self.lexstop = true;
2657                                return Err(());
2658                            }
2659                        }
2660                    }
2661                }
2662                '`' => {
2663                    self.add(c);
2664                    loop {
2665                        let ch = self.hgetc();
2666                        match ch {
2667                            Some('`') => {
2668                                self.add('`');
2669                                break;
2670                            }
2671                            Some('\\') => {
2672                                self.add('\\');
2673                                if let Some(ch) = self.hgetc() {
2674                                    self.add(ch);
2675                                }
2676                            }
2677                            Some(ch) => self.add(ch),
2678                            None => {
2679                                self.lexstop = true;
2680                                return Err(());
2681                            }
2682                        }
2683                    }
2684                }
2685                '#' if start => {
2686                    self.add(c);
2687                    // Skip comment to end of line
2688                    loop {
2689                        let ch = self.hgetc();
2690                        match ch {
2691                            Some('\n') => {
2692                                self.add('\n');
2693                                break;
2694                            }
2695                            Some(ch) => self.add(ch),
2696                            None => break,
2697                        }
2698                    }
2699                }
2700                _ => {
2701                    self.add(c);
2702                }
2703            }
2704
2705            start = iswhite;
2706        }
2707    }
2708
2709    /// Lex next token AND update per-context flags. Direct port of
2710    /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2711    /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2712    /// list separators / pipes / control keywords reset to cmd-pos;
2713    /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2714    /// stash prior incmdpos and force the redir target to non-cmd-pos.
2715    pub fn ctxtlex(&mut self) {
2716        // lex.c:319 — static `oldpos` cache for redir-target restore
2717        // is captured per-call here as `oldpos` below (zshrs's parser
2718        // re-enters ctxtlex per token, no need for static persistence).
2719
2720        // lex.c:321 — `zshlex();` to advance to the next token.
2721        self.zshlex();
2722
2723        // lex.c:322-358 — post-token incmdpos switch.
2724        match self.tok {
2725            // lex.c:323-343 — separators / openers / conjunctions /
2726            // control keywords — back into cmd-pos so the next token
2727            // can be a fresh command.
2728            LexTok::Seper
2729            | LexTok::Newlin
2730            | LexTok::Semi
2731            | LexTok::Dsemi
2732            | LexTok::Semiamp
2733            | LexTok::Semibar
2734            | LexTok::Amper
2735            | LexTok::Amperbang
2736            | LexTok::Inpar
2737            | LexTok::Inbrace
2738            | LexTok::Dbar
2739            | LexTok::Damper
2740            | LexTok::Bar
2741            | LexTok::Baramp
2742            | LexTok::Inoutpar
2743            | LexTok::Doloop
2744            | LexTok::Then
2745            | LexTok::Elif
2746            | LexTok::Else
2747            | LexTok::Doutbrack => {
2748                self.incmdpos = true;
2749            }
2750            // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2751            // so subsequent tokens are arguments, not a fresh command.
2752            LexTok::String
2753            | LexTok::Typeset
2754            | LexTok::Envarray
2755            | LexTok::Outpar
2756            | LexTok::Case
2757            | LexTok::Dinbrack => {
2758                self.incmdpos = false;
2759            }
2760            _ => {}
2761        }
2762
2763        // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2764        // DINPAR can detect c-style for. After any non-DINPAR, decay
2765        // to 0 (or back to 2 if we just saw FOR again).
2766        if self.tok != LexTok::Dinpar {
2767            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2768        }
2769
2770        // lex.c:361-368 — redir-target context dance. After consuming
2771        // a redir operator, the following token (the file path) sees
2772        // incmdpos=0 even when its inherent shape would put it back
2773        // in cmd-pos. After the redir target, restore from oldpos
2774        // (struct field — must persist across zshlex calls).
2775        if self.tok.is_redirop()
2776            || self.tok == LexTok::For
2777            || self.tok == LexTok::Foreach
2778            || self.tok == LexTok::Select
2779        {
2780            self.inredir = true;
2781            self.oldpos = self.incmdpos;
2782            self.incmdpos = false;
2783        } else if self.inredir {
2784            self.incmdpos = self.oldpos;
2785            self.inredir = false;
2786        }
2787    }
2788
2789    /// Mark the current word as the one ZLE was looking for. Direct
2790    /// port of zsh/Src/lex.c:1881-1897 `gotword`. Only meaningful
2791    /// when the lexer was started with LEXFLAGS_ZLE for completion;
2792    /// after this call `lexflags` is cleared so subsequent tokens
2793    /// don't re-trigger word tracking.
2794    ///
2795    /// zshrs port note: zsh's gotword updates `wb`/`we` (word begin/
2796    /// end positions) based on `zlemetacs` (cursor pos), `zlemetall`
2797    /// (line length), `inbufct`, and `addedx` — all live in zsh's
2798    /// input.c globals which zshrs hasn't wired through the lexer.
2799    /// Only the `lexflags = 0` side-effect at lex.c:1895 is
2800    /// reproducible without that integration.
2801    pub fn gotword(&mut self) {
2802        // lex.c:1895 — `lexflags = 0;`
2803        self.lexflags = LexFlags::default();
2804    }
2805
2806    /// Register a heredoc to be processed at next newline
2807    pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2808        self.heredocs.push(HereDoc {
2809            terminator,
2810            strip_tabs,
2811            content: String::new(),
2812            quoted: false,
2813            processed: false,
2814        });
2815    }
2816
2817    /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2818    /// but reachable from the bare `zshlex` path (without an
2819    /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2820    ///   - incmdpos is set (or text is `}` ending a brace block)
2821    ///   - text is `]]` and we're inside `[[ ]]` (incond > 0)
2822    ///   - text is bare `!` and we're at the start of a cond (incond == 1)
2823    pub fn check_reserved_word(&mut self) -> bool {
2824        if let Some(ref tokstr) = self.tokstr {
2825            if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2826                if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2827                    self.tok = tok;
2828                    if tok == LexTok::Repeat {
2829                        self.inrepeat = 1;
2830                    }
2831                    if tok == LexTok::Dinbrack {
2832                        self.incond = 1;
2833                    }
2834                    return true;
2835                }
2836                if tokstr == "]]" && self.incond > 0 {
2837                    self.tok = LexTok::Doutbrack;
2838                    self.incond = 0;
2839                    return true;
2840                }
2841            }
2842            // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2843            // regardless of incmdpos.
2844            if self.incond > 0 && tokstr == "]]" {
2845                self.tok = LexTok::Doutbrack;
2846                self.incond = 0;
2847                return true;
2848            }
2849            if self.incond == 1 && tokstr == "!" {
2850                self.tok = LexTok::Bang;
2851                return true;
2852            }
2853        }
2854        false
2855    }
2856}
2857
/// Outcome of deciding whether a parenthesised construct is an arithmetic
/// expression or a command.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The text was consumed as a command substitution.
    Cmd,
    /// The text was consumed as an arithmetic expression.
    Math,
    /// Neither interpretation could be lexed to completion.
    Err,
}
2864
2865// ============================================================================
2866// Additional parsing functions ported from lex.c
2867// ============================================================================
2868
/// Report whether the text at byte offset `pos` of `input` completes a
/// numeric glob range: `<N-M>`, `<N->`, `<-M>` or `<->`.
///
/// `pos` must point just past the opening `<`. The accepted shape is
/// optional ASCII digits, exactly one `-`, optional ASCII digits, then `>`.
/// The input is only inspected, never consumed.
///
/// Returns `false` for anything else, including when `pos` lies beyond the
/// end of `input` or does not fall on a character boundary (such offsets
/// previously caused a panic).
///
/// Per the original note this is a port of `isnumglob` in zsh/Src/lex.c,
/// scanning a string slice instead of reading from the input stream.
pub fn isnumglob(input: &str, pos: usize) -> bool {
    let rest = match input.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };

    let mut seen_dash = false;
    for c in rest.chars() {
        match c {
            '0'..='9' => {}
            '-' if !seen_dash => seen_dash = true,
            // `>` only closes the range once the dash has been seen.
            '>' => return seen_dash,
            _ => return false,
        }
    }
    false
}
2899
2900/// Tokenize a string as if in double quotes (error-tolerant variant).
2901///
2902/// Direct port of zsh/Src/lex.c:1713-1733 `parsestrnoerr`. The C
2903/// source: zcontext_save → untokenize → inpush → strinbeg →
2904/// `lexbuf.ptr = tokstr = *s; lexbuf.siz = l + 1` →
2905/// `err = dquote_parse('\0', 1)` → strinend → inpop → zcontext_restore.
2906/// Returns the tokenized string on success, or the offending char as
2907/// an error code (zsh convention: `> 32 && < 127` → printable, else
2908/// generic).
2909///
2910/// zshrs port: the C version drives the lexer's dquote_parse method
2911/// against the input string. zshrs's standalone walker produces the
2912/// same BNULL/QSTRING/QTICK token markers without re-entering the
2913/// lexer — same output for typical bodies. Documented divergence:
2914/// nested cmd-sub `$(...)` and arith `$((...))` aren't lexed
2915/// recursively; the runtime handles them at expansion time.
2916pub fn parsestrnoerr(s: &str) -> Result<String, String> {
2917    parsestr_inner(s)
2918}
2919
2920/// Tokenize a string as if in double quotes (error-reporting variant).
2921///
2922/// Direct port of zsh/Src/lex.c:1693-1709 `parsestr`. C source:
2923/// `if ((err = parsestrnoerr(s))) { untokenize(*s); ... zerr("parse
2924/// error near `%c'", err); tok = LEXERR; }`. zshrs's wrapper
2925/// returns the same Result and lets the caller emit the diagnostic.
2926///
2927/// Both `parsestr` and `parsestrnoerr` share the inner walker; the
2928/// only difference in C is whether errors trigger `zerr`. zshrs
2929/// returns `Err(msg)` from both — the caller decides whether to
2930/// surface the diagnostic.
2931pub fn parsestr(s: &str) -> Result<String, String> {
2932    parsestr_inner(s)
2933}
2934
2935/// Shared body for parsestr / parsestrnoerr.
2936fn parsestr_inner(s: &str) -> Result<String, String> {
2937    let mut result = String::with_capacity(s.len());
2938    let chars: Vec<char> = s.chars().collect();
2939    let mut i = 0;
2940
2941    while i < chars.len() {
2942        let c = chars[i];
2943        match c {
2944            '\\' => {
2945                i += 1;
2946                if i < chars.len() {
2947                    let next = chars[i];
2948                    match next {
2949                        '$' | '\\' | '`' | '"' | '\n' => {
2950                            result.push(char_tokens::BNULL);
2951                            result.push(next);
2952                        }
2953                        _ => {
2954                            result.push('\\');
2955                            result.push(next);
2956                        }
2957                    }
2958                } else {
2959                    result.push('\\');
2960                }
2961            }
2962            '$' => {
2963                result.push(char_tokens::QSTRING);
2964                if i + 1 < chars.len() {
2965                    let next = chars[i + 1];
2966                    if next == '{' {
2967                        result.push(char_tokens::INBRACE);
2968                        i += 1;
2969                    } else if next == '(' {
2970                        result.push(char_tokens::INPAR);
2971                        i += 1;
2972                    }
2973                }
2974            }
2975            '`' => {
2976                result.push(char_tokens::QTICK);
2977            }
2978            _ => {
2979                result.push(c);
2980            }
2981        }
2982        i += 1;
2983    }
2984
2985    Ok(result)
2986}
2987
/// Find the end of a subscript in `s`.
///
/// Scans `s` for the first `endchar` that is not quoted, not preceded by a
/// backslash, and not inside a nested `[...]` or `(...)` group, and returns
/// its position counted in characters (not bytes) from the start of `s`.
///
/// Returns `None` when `s` is empty, when `s` begins with `endchar` (an
/// empty subscript), or when no terminating `endchar` is found.
///
/// Scanning rules:
/// - `'...'` hides everything up to the next `'`.
/// - `"..."` hides everything up to the next unescaped `"`.
/// - `\` hides the following character.
/// - `[` and `(` open a nested group; `]` and `)` close one. A closer that
///   ends a nested group is never taken as the terminator, so
///   `a[b]c]` ends at the final `]`.
///
/// Braces are not tracked. Per the original note this replaces the
/// lexer-driven `parse_subscript` of zsh/Src/lex.c with a focused walker and
/// does not expose that function's `sub` flag.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let chars: Vec<char> = s.chars().collect();
    let mut i = 0;
    let mut depth: usize = 0;
    let mut in_dquote = false;
    let mut in_squote = false;

    while i < chars.len() {
        let c = chars[i];

        if in_squote {
            if c == '\'' {
                in_squote = false;
            }
            i += 1;
            continue;
        }

        if in_dquote {
            if c == '"' {
                in_dquote = false;
            } else if c == '\\' && i + 1 < chars.len() {
                i += 1; // skip the escaped character
            }
            i += 1;
            continue;
        }

        match c {
            '\\' => i += 1, // skip the escaped character
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            '[' | '(' => depth += 1,
            ']' | ')' if depth > 0 => {
                // This closer belongs to a nested group. Move on without
                // running the terminator check below, which would otherwise
                // fire as soon as the depth drops back to zero.
                depth -= 1;
                i += 1;
                continue;
            }
            _ => {}
        }

        if c == endchar && depth == 0 {
            return Some(i);
        }

        i += 1;
    }

    None
}
3066
3067/// Tokenize a string as if it were a normal command-line argument
3068/// but it may contain separators. Used for ${...%...} substitutions.
3069///
3070/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
3071/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
3072/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
3073/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3074/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3075/// without re-entering the lexer.
3076///
3077/// zshrs port note: the C source returns int (0=ok, char value =
3078/// where it stopped on error); zshrs returns Result<String,String>
3079/// returning the tokenized text directly. Lossy for callers that
3080/// need to know the exact stop position, but nothing in zshrs's
3081/// expansion layer uses that yet.
3082pub fn parse_subst_string(s: &str) -> Result<String, String> {
3083    if s.is_empty() {
3084        return Ok(String::new());
3085    }
3086
3087    let mut result = String::with_capacity(s.len());
3088    let chars: Vec<char> = s.chars().collect();
3089    let mut i = 0;
3090
3091    while i < chars.len() {
3092        let c = chars[i];
3093        match c {
3094            '\\' => {
3095                result.push(char_tokens::BNULL);
3096                i += 1;
3097                if i < chars.len() {
3098                    result.push(chars[i]);
3099                }
3100            }
3101            '\'' => {
3102                result.push(char_tokens::SNULL);
3103                i += 1;
3104                while i < chars.len() && chars[i] != '\'' {
3105                    result.push(chars[i]);
3106                    i += 1;
3107                }
3108                result.push(char_tokens::SNULL);
3109            }
3110            '"' => {
3111                result.push(char_tokens::DNULL);
3112                i += 1;
3113                while i < chars.len() && chars[i] != '"' {
3114                    if chars[i] == '\\' && i + 1 < chars.len() {
3115                        result.push(char_tokens::BNULL);
3116                        i += 1;
3117                        result.push(chars[i]);
3118                    } else if chars[i] == '$' {
3119                        result.push(char_tokens::QSTRING);
3120                    } else {
3121                        result.push(chars[i]);
3122                    }
3123                    i += 1;
3124                }
3125                result.push(char_tokens::DNULL);
3126            }
3127            '$' => {
3128                result.push(char_tokens::STRING);
3129                if i + 1 < chars.len() {
3130                    match chars[i + 1] {
3131                        '{' => {
3132                            result.push(char_tokens::INBRACE);
3133                            i += 1;
3134                        }
3135                        '(' => {
3136                            result.push(char_tokens::INPAR);
3137                            i += 1;
3138                        }
3139                        _ => {}
3140                    }
3141                }
3142            }
3143            '*' => result.push(char_tokens::STAR),
3144            '?' => result.push(char_tokens::QUEST),
3145            '[' => result.push(char_tokens::INBRACK),
3146            ']' => result.push(char_tokens::OUTBRACK),
3147            '{' => result.push(char_tokens::INBRACE),
3148            '}' => result.push(char_tokens::OUTBRACE),
3149            '~' => result.push(char_tokens::TILDE),
3150            '#' => result.push(char_tokens::POUND),
3151            '^' => result.push(char_tokens::HAT),
3152            _ => result.push(c),
3153        }
3154        i += 1;
3155    }
3156
3157    Ok(result)
3158}
3159
3160/// Untokenize a string - convert tokenized chars back to original
3161///
3162/// Port of untokenize() from exec.c (but used by lexer too)
3163/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3164/// stripping them. Used by callers that need the source form including
3165/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3166pub fn untokenize_preserve_quotes(s: &str) -> String {
3167    let mut result = String::with_capacity(s.len() + 4);
3168    for c in s.chars() {
3169        let cu = c as u32;
3170        if (0x83..=0x9f).contains(&cu) {
3171            match c {
3172                c if c == char_tokens::POUND => result.push('#'),
3173                c if c == char_tokens::STRING => result.push('$'),
3174                c if c == char_tokens::HAT => result.push('^'),
3175                c if c == char_tokens::STAR => result.push('*'),
3176                c if c == char_tokens::INPAR => result.push('('),
3177                c if c == char_tokens::OUTPAR => result.push(')'),
3178                c if c == char_tokens::INPARMATH => result.push('('),
3179                c if c == char_tokens::OUTPARMATH => result.push(')'),
3180                c if c == char_tokens::QSTRING => result.push('$'),
3181                c if c == char_tokens::EQUALS => result.push('='),
3182                c if c == char_tokens::BAR => result.push('|'),
3183                c if c == char_tokens::INBRACE => result.push('{'),
3184                c if c == char_tokens::OUTBRACE => result.push('}'),
3185                c if c == char_tokens::INBRACK => result.push('['),
3186                c if c == char_tokens::OUTBRACK => result.push(']'),
3187                c if c == char_tokens::TICK => result.push('`'),
3188                c if c == char_tokens::INANG => result.push('<'),
3189                c if c == char_tokens::OUTANG => result.push('>'),
3190                c if c == char_tokens::OUTANGPROC => result.push('>'),
3191                c if c == char_tokens::QUEST => result.push('?'),
3192                c if c == char_tokens::TILDE => result.push('~'),
3193                c if c == char_tokens::QTICK => result.push('`'),
3194                c if c == char_tokens::COMMA => result.push(','),
3195                c if c == char_tokens::DASH => result.push('-'),
3196                c if c == char_tokens::BANG => result.push('!'),
3197                c if c == char_tokens::SNULL => result.push('\''),
3198                c if c == char_tokens::DNULL => result.push('"'),
3199                c if c == char_tokens::BNULL => result.push('\\'),
3200                _ => {
3201                    let idx = c as usize;
3202                    if idx < char_tokens::ZTOKENS.len() {
3203                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3204                    } else {
3205                        result.push(c);
3206                    }
3207                }
3208            }
3209        } else {
3210            result.push(c);
3211        }
3212    }
3213    result
3214}
3215
3216pub fn untokenize(s: &str) -> String {
3217    let mut result = String::with_capacity(s.len());
3218    let chars: Vec<char> = s.chars().collect();
3219    let mut i = 0;
3220
3221    while i < chars.len() {
3222        let c = chars[i];
3223        // Token chars live in zsh's META range (0x83 = META through 0x9f =
3224        // BNULL). Anything in that range needs un-mapping before display
3225        // or downstream consumption. The original `< 32` test was wrong —
3226        // none of zsh's tokens land in that range.
3227        let cu = c as u32;
3228        if (0x83..=0x9f).contains(&cu) {
3229            // Convert token back to original character
3230            match c {
3231                c if c == char_tokens::POUND => result.push('#'),
3232                c if c == char_tokens::STRING => result.push('$'),
3233                c if c == char_tokens::HAT => result.push('^'),
3234                c if c == char_tokens::STAR => result.push('*'),
3235                c if c == char_tokens::INPAR => result.push('('),
3236                c if c == char_tokens::OUTPAR => result.push(')'),
3237                c if c == char_tokens::INPARMATH => result.push('('),
3238                c if c == char_tokens::OUTPARMATH => result.push(')'),
3239                c if c == char_tokens::QSTRING => result.push('$'),
3240                c if c == char_tokens::EQUALS => result.push('='),
3241                c if c == char_tokens::BAR => result.push('|'),
3242                c if c == char_tokens::INBRACE => result.push('{'),
3243                c if c == char_tokens::OUTBRACE => result.push('}'),
3244                c if c == char_tokens::INBRACK => result.push('['),
3245                c if c == char_tokens::OUTBRACK => result.push(']'),
3246                c if c == char_tokens::TICK => result.push('`'),
3247                c if c == char_tokens::INANG => result.push('<'),
3248                c if c == char_tokens::OUTANG => result.push('>'),
3249                c if c == char_tokens::OUTANGPROC => result.push('>'),
3250                c if c == char_tokens::QUEST => result.push('?'),
3251                c if c == char_tokens::TILDE => result.push('~'),
3252                c if c == char_tokens::QTICK => result.push('`'),
3253                c if c == char_tokens::COMMA => result.push(','),
3254                c if c == char_tokens::DASH => result.push('-'),
3255                c if c == char_tokens::BANG => result.push('!'),
3256                c if c == char_tokens::SNULL
3257                    || c == char_tokens::DNULL
3258                    || c == char_tokens::BNULL =>
3259                {
3260                    // Null markers - skip
3261                }
3262                _ => {
3263                    // Unknown token, try ztokens lookup
3264                    let idx = c as usize;
3265                    if idx < char_tokens::ZTOKENS.len() {
3266                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3267                    } else {
3268                        result.push(c);
3269                    }
3270                }
3271            }
3272        } else {
3273            result.push(c);
3274        }
3275        i += 1;
3276    }
3277
3278    result
3279}
3280
/// Check if a string contains any token characters.
///
/// Token characters occupy 0x83..=0x9f, the same range the untokenize
/// functions in this file un-map. The earlier `< 32` test looked at ASCII
/// control characters instead, so it reported tabs and newlines as tokens
/// and missed every real token.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| (0x83..=0x9f).contains(&(c as u32)))
}
3285
3286/// Convert token characters to their printable form for display
3287pub fn tokens_to_printable(s: &str) -> String {
3288    untokenize(s)
3289}
3290
#[cfg(test)]
mod tests {
    use super::*;

    // Lex one more token and require it to be of the given kind.
    macro_rules! expect_next {
        ($lexer:ident, $tok:expr) => {{
            $lexer.zshlex();
            assert_eq!($lexer.tok, $tok);
        }};
    }

    // Keep lexing until the given kind or end of input turns up, then
    // require that it was the given kind that stopped the scan.
    macro_rules! seek_tok {
        ($lexer:ident, $tok:expr) => {{
            $lexer.zshlex();
            while $lexer.tok != $tok && $lexer.tok != LexTok::Endinput {
                $lexer.zshlex();
            }
            assert_eq!($lexer.tok, $tok);
        }};
    }

    /// Two plain words, then end of input.
    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");

        expect_next!(lexer, LexTok::String);
        assert_eq!(lexer.tokstr.as_deref(), Some("echo"));

        expect_next!(lexer, LexTok::String);
        assert_eq!(lexer.tokstr.as_deref(), Some("hello"));

        expect_next!(lexer, LexTok::Endinput);
    }

    /// `|` between commands lexes as Bar.
    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::Bar);
        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::String);
    }

    /// `>` lexes as Outang with the target as a following word.
    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::Outang);
        expect_next!(lexer, LexTok::String);
    }

    /// `<<` lexes as Dinang with the delimiter as a following word.
    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::Dinang);
        expect_next!(lexer, LexTok::String);
    }

    /// A single-quoted argument is one String token.
    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::String);
        // Should contain Snull markers around literal content
        assert!(lexer.tokstr.is_some());
    }

    /// `function foo { }` lexes as Func, String("foo"), Inbrace, Outbrace.
    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Func,
            "expected Func, got {:?}",
            lexer.tok
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr.as_deref(), Some("foo"));

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    /// A double-quoted argument with a parameter is one String token.
    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::String);
        // Should contain tokenized content
        assert!(lexer.tokstr.is_some());
    }

    /// `$(...)` stays inside a single String token.
    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::String);
    }

    /// `NAME=value` in command position lexes as Envstring.
    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        expect_next!(lexer, LexTok::String);
    }

    /// `name=(` in command position lexes as Envarray.
    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;

        expect_next!(lexer, LexTok::Envarray);
    }

    /// `<(...)` and `>(...)` each stay inside a String token.
    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");

        expect_next!(lexer, LexTok::String);
        // <(ls) is tokenized into the string
        expect_next!(lexer, LexTok::String);
        // >(cat) is tokenized
        expect_next!(lexer, LexTok::String);
    }

    /// `$((...))` stays inside a single String token.
    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");

        expect_next!(lexer, LexTok::String);
        expect_next!(lexer, LexTok::String);
    }

    /// The three case terminators `;;`, `;&` and `;|` lex distinctly.
    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        // Skip to first ;;
        seek_tok!(lexer, LexTok::Dsemi);

        // Find ;&
        seek_tok!(lexer, LexTok::Semiamp);

        // Find ;|
        seek_tok!(lexer, LexTok::Semibar);
    }
}
// zshrs_parse/lexer.rs