zshrs_parse/
lexer.rs

1//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
2//!
3//! This lexer tokenizes zsh shell input into a stream of tokens.
4//! It handles all zsh-specific syntax including:
5//! - Single/double/dollar quotes
6//! - Command substitution $(...)  and `...`
7//! - Arithmetic $((...))
8//! - Parameter expansion ${...}
9//! - Process substitution <(...) >(...)
10//! - Here documents
11//! - All redirection operators
12//! - Comments
13//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Lexer flags controlling behavior.
///
/// Every flag is `false` in the derived `Default`. The struct is `Copy`, so
/// it is saved and restored by value (see the `lexflags` field of `LexStack`).
#[derive(Debug, Clone, Copy, Default)]
pub struct LexFlags {
    /// Parsing for ZLE (line editor) completion
    pub zle: bool,
    /// Return newlines as tokens (when set, `zshlex` does not fold NEWLIN
    /// into SEPER)
    pub newline: bool,
    /// Preserve comments in output
    pub comments_keep: bool,
    /// Strip comments from output
    pub comments_strip: bool,
    /// Active lexing (from bufferwords)
    pub active: bool,
}
32
/// Buffer state for building tokens.
///
/// `data` holds the text accumulated so far. `siz` is a soft capacity
/// target carried over from the C lexer's explicit buffer bookkeeping: it
/// starts at 256 and doubles until it exceeds the stored byte length.
#[derive(Debug, Clone)]
struct LexBuf {
    data: String,
    siz: usize,
}

impl LexBuf {
    /// Create an empty buffer with the initial 256-byte allocation.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Drop the buffered text. The allocation and `siz` are left as they are.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Append one char, growing the capacity target if needed.
    fn add(&mut self, c: char) {
        self.data.push(c);
        self.grow_to_fit();
    }

    /// Append a whole string, growing the capacity target if needed.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
        self.grow_to_fit();
    }

    /// Keep `siz` strictly above the stored byte length and reserve the
    /// matching capacity.
    ///
    /// A single doubling is not always enough: a bulk append (or a `siz`
    /// written from outside) can leave the length far beyond `siz`, and
    /// `siz - len` would then underflow. Doubling in a loop keeps the
    /// subtraction below well-defined.
    fn grow_to_fit(&mut self) {
        if self.data.len() < self.siz {
            return;
        }
        while self.siz <= self.data.len() {
            // `max(1)` lets a zero `siz` recover instead of looping forever.
            self.siz = self.siz.max(1).saturating_mul(2);
        }
        self.data.reserve(self.siz - self.data.len());
    }

    /// Length of the buffered text in bytes.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrow the buffered text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consume the buffer and return the text it holds.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last char of the buffered text, if any.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        self.data.chars().last()
    }

    /// Remove and return the last char of the buffered text, if any.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }
}
87
/// Here-document state: one entry per pending or already-read here-document.
#[derive(Debug, Clone)]
pub struct HereDoc {
    /// Terminator word that ends the body.
    pub terminator: String,
    /// True for the `<<-` form of the redirection.
    pub strip_tabs: bool,
    /// Body text; filled in once the body has been read (see `processed`).
    pub content: String,
    /// True if the terminator was originally quoted (`<<'EOF'`,
    /// `<<"EOF"`, or `<<\EOF`). Disables variable expansion / command
    /// substitution / arithmetic in the body.
    pub quoted: bool,
    /// True once `process_heredocs` has read the body. Distinct from
    /// "content is empty" because an empty heredoc legitimately has
    /// empty content.
    pub processed: bool,
}
103
/// The Zsh Lexer.
///
/// Owns the read cursor over a borrowed input string, a push-back queue for
/// ungotten / injected characters, the most recently produced token
/// (`tok` / `tokstr`), and the context flags (`incmdpos`, `incond`, ...)
/// that are read and updated while tokenizing.
pub struct ZshLexer<'a> {
    /// Input source
    pub(crate) input: &'a str,
    /// Current position in input, as a byte offset (advanced by
    /// `len_utf8` of each char read)
    pub(crate) pos: usize,
    /// Look-ahead buffer for ungotten characters; `hgetc` drains this
    /// before reading from `input`
    unget_buf: VecDeque<char>,
    /// Current token string
    pub tokstr: Option<String>,
    /// Current token type
    pub tok: LexTok,
    /// File descriptor for redirections (e.g., 2> means fd=2);
    /// -1 in a freshly constructed lexer
    pub tokfd: i32,
    /// Line number at start of current token
    pub toklineno: u64,
    /// Current line number (starts at 1)
    pub lineno: u64,
    /// Lexer has stopped (EOF or error)
    pub lexstop: bool,
    /// In command position (can accept reserved words)
    pub incmdpos: bool,
    /// In condition [[ ... ]]
    pub incond: i32,
    /// In pattern context (RHS of == != =~ in [[ ]])
    pub incondpat: bool,
    /// In case pattern
    pub incasepat: i32,
    /// In redirection
    pub inredir: bool,
    /// After 'for' keyword
    pub infor: i32,
    /// After 'repeat' keyword; once nonzero, `zshlex` bumps it on every
    /// token fetch
    inrepeat: i32,
    /// Parsing typeset arguments
    pub intypeset: bool,
    /// Inside (( ... )) arithmetic
    dbparens: bool,
    /// Disable alias expansion
    pub noaliases: bool,
    /// Disable spelling correction
    pub nocorrect: i32,
    /// Disable comment recognition
    pub nocomments: bool,
    /// Lexer flags
    pub lexflags: LexFlags,
    /// Whether this is the first line
    pub isfirstln: bool,
    /// Whether this is the first char of command
    #[allow(dead_code)]
    isfirstch: bool,
    /// Pending here-documents
    pub heredocs: Vec<HereDoc>,
    /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
    heredoc_pending: u8,
    /// Token buffer
    lexbuf: LexBuf,
    /// After newline (0 = last token was not a newline, -1 = newline with
    /// more input following, 1 = newline at end of input)
    pub isnewlin: i32,
    /// Error message if any
    pub error: Option<String>,
    /// Global iteration counter for infinite loop detection; incremented
    /// on every `hgetc` call and never reset by `zshlex`
    global_iterations: usize,
    /// Recursion depth counter, checked against `MAX_LEXER_RECURSION`
    recursion_depth: usize,
    /// Raw-input capture flag — when nonzero, every char read through
    /// `hgetc` is also appended to `tokstr_raw` via zshlex_raw_add.
    /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
    /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
    /// command substitutions for re-execution / display.
    ///
    /// NOTE(review): the `hgetc` defined in this impl does not itself
    /// call `zshlex_raw_add` — confirm where the capture is wired in.
    pub lex_add_raw: i32,
    /// Raw-input capture buffer. Direct mirror of lex.c:165
    /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
    /// `LexBuf` here since Rust's String tracks both the data and
    /// length internally.
    lexbuf_raw: LexBuf,
}
181
/// Upper bound on `ZshLexer::recursion_depth`: `check_recursion` records an
/// error and stops the lexer once the depth is strictly greater than this.
const MAX_LEXER_RECURSION: usize = 200;
183
/// Per-alias info returned by `AliasResolver::lookup_alias` and
/// `lookup_suffix_alias`. Mirrors zsh's `struct alias` fields used
/// at lex.c:1914-1943: `text` (replacement body), `in_use` (the
/// recursion-guard flag), `global` (vs command-position-only).
#[derive(Debug, Clone)]
pub struct AliasInfo {
    /// Replacement body that is pushed back into the input on expansion.
    pub text: String,
    /// Recursion guard: an alias whose flag is set is not expanded again.
    pub in_use: bool,
    /// Global aliases expand in any position; others only for STRING
    /// tokens in command position.
    pub global: bool,
}
194
/// Trait the lexer uses to look up aliases and reserved words during
/// `exalias`. Implementors typically delegate to the executor's
/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
/// free of executor-specific types — same pattern zsh uses with the
/// hashtable.h opaque-handle approach against aliastab/reswdtab/
/// sufaliastab.
pub trait AliasResolver {
    /// Look up an alias by name. Returns `None` if not found, or the
    /// alias body + flags otherwise.
    fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
    /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
    /// The lexer passes the text after the last `.` of the word, without
    /// the dot itself.
    fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
    /// Resolve a reserved word. Returns the LexTok the word should
    /// promote to (e.g. "if" → IF), or None if not a reswd.
    fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
    /// Mark an alias as in-use (recursion guard). Called when an
    /// alias is about to be expanded; the matching unmark happens
    /// when the alias text has been fully consumed by the lexer.
    /// For suffix aliases the lexer passes the suffix as `name`.
    fn mark_in_use(&mut self, name: &str, in_use: bool);
}
215
/// Saved lexical state for nested-context handling. Direct port of
/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
/// (`lex_context_restore`). Used when entering command substitution,
/// here-docs, or eval where the outer lexer state must be pushed and
/// restored after the inner parse completes.
///
/// Each field is a snapshot of the `ZshLexer` field of the same name;
/// `lexbuf_data` / `lexbuf_siz` together snapshot the token buffer.
#[derive(Debug, Clone)]
pub struct LexStack {
    /// Saved `dbparens` (inside `(( ... ))`).
    pub dbparens: bool,
    /// Saved `isfirstln`.
    pub isfirstln: bool,
    /// Saved `isfirstch`.
    pub isfirstch: bool,
    /// Saved lexer flags (copied by value).
    pub lexflags: LexFlags,
    /// Saved current token type.
    pub tok: LexTok,
    /// Saved current token string (moved, not cloned, on save/restore).
    pub tokstr: Option<String>,
    /// Saved token-buffer text (moved, not cloned, on save/restore).
    pub lexbuf_data: String,
    /// Saved token-buffer capacity target.
    pub lexbuf_siz: usize,
    /// Saved `lexstop`.
    pub lexstop: bool,
    /// Saved line number at the start of the current token.
    pub toklineno: u64,
}
235
236impl Default for LexStack {
237    fn default() -> Self {
238        // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
239        // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
240        // ENDINPUT (the C source doesn't explicitly reset tok here but
241        // the natural baseline is ENDINPUT — same as lexinit).
242        LexStack {
243            dbparens: false,
244            isfirstln: false,
245            isfirstch: false,
246            lexflags: LexFlags::default(),
247            tok: LexTok::Endinput,
248            tokstr: None,
249            lexbuf_data: String::new(),
250            lexbuf_siz: 256,
251            lexstop: false,
252            toklineno: 0,
253        }
254    }
255}
256
257impl<'a> ZshLexer<'a> {
258    /// Create a new lexer for the given input
259    pub fn new(input: &'a str) -> Self {
260        ZshLexer {
261            input,
262            pos: 0,
263            unget_buf: VecDeque::new(),
264            tokstr: None,
265            tok: LexTok::Endinput,
266            tokfd: -1,
267            toklineno: 1,
268            lineno: 1,
269            lexstop: false,
270            incmdpos: true,
271            incond: 0,
272            incondpat: false,
273            incasepat: 0,
274            inredir: false,
275            infor: 0,
276            inrepeat: 0,
277            intypeset: false,
278            dbparens: false,
279            noaliases: false,
280            nocorrect: 0,
281            nocomments: false,
282            lexflags: LexFlags::default(),
283            isfirstln: true,
284            isfirstch: true,
285            heredocs: Vec::new(),
286            heredoc_pending: 0,
287            lexbuf: LexBuf::new(),
288            isnewlin: 0,
289            error: None,
290            global_iterations: 0,
291            recursion_depth: 0,
292            lex_add_raw: 0,
293            lexbuf_raw: LexBuf::new(),
294        }
295    }
296
297    /// Append a char to the raw-input capture buffer. Direct port of
298    /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
299    /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
300    /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
301    pub fn zshlex_raw_add(&mut self, c: char) {
302        // lex.c:2027-2028 — guard on lex_add_raw flag.
303        if self.lex_add_raw == 0 {
304            return;
305        }
306        // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
307        // explicit ptr/len/siz with hrealloc; Rust's String handles
308        // resize automatically.
309        self.lexbuf_raw.add(c);
310    }
311
312    /// Run alias / reserved-word expansion on the just-lexed token.
313    /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
314    /// if an alias was injected (the caller's loop should re-run
315    /// gettok to consume the injected text).
316    ///
317    /// C source flow:
318    ///   1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
319    ///   2. If tokstr is None: set lextext from `tokstrings[tok]` and
320    ///      checkalias against that (lex.c:1964-1969).
321    ///   3. Otherwise: untokenize tokstr into a working copy (lex.c:
322    ///      1971-1980).
323    ///   4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
324    ///      (lex.c:1982-1991).
325    ///   5. STRING tokens: try checkalias, then reservation lookup
326    ///      (lex.c:1993-2015).
327    ///   6. Clear inalmore (lex.c:2016).
328    ///
329    /// Takes an `AliasResolver` trait object so the lexer doesn't
330    /// hard-depend on the executor's alias-table types. zshrs callers
331    /// implement `AliasResolver` over their alias hash tables.
332    pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
333        // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
334        // history layer doesn't track per-word boundaries here; no-op.
335
336        // lex.c:1958-1962 — spell correction via spckword. zshrs
337        // doesn't implement spell correction yet; documented divergence.
338
339        // lex.c:1964-1969 — bare-token path (no tokstr).
340        if self.tokstr.is_none() {
341            // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
342            // like SEMI/AMPER/etc. the canonical text comes from a
343            // static table. zshrs's check_alias_for_text uses the
344            // resolver directly with the token's text representation.
345            if self.tok == LexTok::Newlin {
346                return false;
347            }
348            // Use punctuation-token text; unknown tokens skip alias.
349            let text = match self.tok {
350                LexTok::Semi => ";",
351                LexTok::Amper => "&",
352                LexTok::Bar => "|",
353                _ => return false,
354            };
355            return self.check_alias(resolver, text);
356        }
357
358        let tokstr = self.tokstr.clone().unwrap();
359        // lex.c:1973-1980 — untokenize: convert the lexer's internal
360        // tokenized form (Pound..ztokens shifts) into the literal
361        // shell text. Call the global helper.
362        let lextext = if has_token(&tokstr) {
363            untokenize(&tokstr)
364        } else {
365            tokstr.clone()
366        };
367
368        // lex.c:1982-1991 — ZLE word-tracking for completion.
369        if self.lexflags.zle {
370            let zp = self.lexflags;
371            self.gotword();
372            // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
373            // word has been reached; abort exalias so completion can
374            // capture the partial token unchanged.
375            if zp.zle && !self.lexflags.zle {
376                return false;
377            }
378        }
379
380        // lex.c:1993-2015 — STRING-token alias / reswd check.
381        if self.tok == LexTok::String {
382            // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
383            // here (zshrs doesn't have the option flag wired).
384            if self.check_alias(resolver, &lextext) {
385                return true;
386            }
387
388            // lex.c:2002-2009 — reserved-word lookup. Fires when in
389            // command position OR when the text is bare `}` and
390            // IGNOREBRACES is unset (so `}` ends a brace block).
391            if self.incmdpos || lextext == "}" {
392                if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
393                    self.tok = rwtok;
394                    if rwtok == LexTok::Repeat {
395                        self.inrepeat = 1;
396                    }
397                    if rwtok == LexTok::Dinbrack {
398                        self.incond = 1;
399                    }
400                }
401            } else if self.incond > 0 && lextext == "]]" {
402                // lex.c:2010-2012 — `]]` closes the cond expression.
403                self.tok = LexTok::Doutbrack;
404                self.incond = 0;
405            } else if self.incond == 1 && lextext == "!" {
406                // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
407                // negation, not a literal.
408                self.tok = LexTok::Bang;
409            }
410        }
411
412        // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
413        // any non-alias token.
414        // (zshrs's lexer doesn't have inalmore yet — added here would
415        // require gettok to track when an alias-pushed token has more
416        // text after it. Documented divergence.)
417
418        false
419    }
420
421    /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
422    /// `checkalias`. Returns true if the lookup matched (regular or
423    /// suffix alias) AND the alias text was successfully injected
424    /// back into the input stream for re-lexing.
425    fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
426        // lex.c:1906-1907 — guard on null lextext.
427        if lextext.is_empty() {
428            return false;
429        }
430
431        // lex.c:1909-1911 — guard: alias expansion is disabled, or
432        // POSIX aliases require the token to be a STRING and not a
433        // reserved word.
434        if self.noaliases {
435            return false;
436        }
437
438        // lex.c:1914-1933 — regular alias lookup.
439        if let Some(alias) = resolver.lookup_alias(lextext) {
440            if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
441                // lex.c:1918-1927 — if the next char isn't blank,
442                // insert a space so the alias body can't accidentally
443                // join the following word.
444                if !self.lexstop {
445                    if let Some(c) = self.peek() {
446                        if !Self::is_blank(c) {
447                            self.inject_alias_text(" ");
448                        }
449                    }
450                }
451                // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
452                self.inject_alias_text(&alias.text);
453                resolver.mark_in_use(lextext, true);
454                self.lexstop = false;
455                return true;
456            }
457        }
458
459        // lex.c:1934-1943 — suffix-alias lookup. The token must end
460        // with `.SUFFIX`, the suffix name must be a registered
461        // suffix-alias, AND the lexer must be in command position.
462        if self.incmdpos {
463            if let Some(dot_pos) = lextext.rfind('.') {
464                if dot_pos > 0 && dot_pos + 1 < lextext.len() {
465                    let suffix = &lextext[dot_pos + 1..];
466                    if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
467                        if !alias.in_use {
468                            // lex.c:1938-1940 — push three things in
469                            // reverse: the alias text, a space, then
470                            // the original word.
471                            self.inject_alias_text(&alias.text);
472                            self.inject_alias_text(" ");
473                            self.inject_alias_text(lextext);
474                            resolver.mark_in_use(suffix, true);
475                            self.lexstop = false;
476                            return true;
477                        }
478                    }
479                }
480            }
481        }
482
483        false
484    }
485
486    /// Push alias text back into the input stream so the lexer
487    /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
488    /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
489    /// (a VecDeque<char>) to inject chars in reverse order so the
490    /// next hgetc consumes them first.
491    fn inject_alias_text(&mut self, text: &str) {
492        // Insert at front in reverse so the first char of `text`
493        // comes out first.
494        for c in text.chars().rev() {
495            self.unget_buf.push_front(c);
496        }
497    }
498
499    /// Pop the last char from the raw-input capture buffer. Direct
500    /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
501    /// the lexer ungets a char that was just captured raw — the raw
502    /// buffer must mirror the live input so this undoes the last add.
503    pub fn zshlex_raw_back(&mut self) {
504        // lex.c:2045-2046 — guard.
505        if self.lex_add_raw == 0 {
506            return;
507        }
508        // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
509        self.lexbuf_raw.pop();
510    }
511
512    /// Mark the current raw-buffer offset (for restore later). Direct
513    /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
514    /// `len + offset` so callers can restore via `back_to_mark`.
515    pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
516        // lex.c:2055-2056 — guard.
517        if self.lex_add_raw == 0 {
518            return 0;
519        }
520        // lex.c:2057 — `return lexbuf_raw.len + offset;`
521        (self.lexbuf_raw.len() as i64) + offset
522    }
523
524    /// Restore raw-buffer offset to a previously-saved mark. Direct
525    /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
526    /// Truncates the raw buffer to `mark` bytes — undoes any captures
527    /// since the mark was taken (used when a speculative parse fails
528    /// and the lexer rolls back).
529    pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
530        // lex.c:2064-2065 — guard.
531        if self.lex_add_raw == 0 {
532            return;
533        }
534        // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
535        // lexbuf_raw.len = mark;` — Rust truncate handles both.
536        let m = mark.max(0) as usize;
537        self.lexbuf_raw.data.truncate(m);
538    }
539
540    /// Take the captured raw-input buffer, clearing it. Useful for
541    /// callers that need the literal command-sub body after lexing
542    /// (e.g. compile-time string capture for `$(...)`).
543    pub fn take_raw_buf(&mut self) -> String {
544        std::mem::take(&mut self.lexbuf_raw.data)
545    }
546
547    /// Save lexical context onto a `LexStack`. Direct port of
548    /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
549    /// is in a clean state suitable for parsing a nested input (command
550    /// substitution body, here-doc terminator, eval'd string).
551    pub fn lex_context_save(&mut self, ls: &mut LexStack) {
552        // lex.c:220-233 — copy live state into the stack.
553        ls.dbparens = self.dbparens;
554        ls.isfirstln = self.isfirstln;
555        ls.isfirstch = self.isfirstch;
556        ls.lexflags = self.lexflags;
557        ls.tok = self.tok;
558        ls.tokstr = self.tokstr.take();
559        ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
560        ls.lexbuf_siz = self.lexbuf.siz;
561        ls.lexstop = self.lexstop;
562        ls.toklineno = self.toklineno;
563
564        // lex.c:235-238 — reset live state to defaults so a nested
565        // parse starts from a clean slate. tokstr/lexbuf are zeroed,
566        // lexbuf.siz reset to 256 (the C-source initial alloc).
567        self.tokstr = None;
568        self.lexbuf.data.clear();
569        self.lexbuf.siz = 256;
570    }
571
572    /// Restore lexical context from a `LexStack`. Direct port of
573    /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
574    /// `lex_context_save`. Called after the nested parse completes.
575    pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
576        // lex.c:249-261 — copy stack state back into live fields.
577        self.dbparens = ls.dbparens;
578        self.isfirstln = ls.isfirstln;
579        self.isfirstch = ls.isfirstch;
580        self.lexflags = ls.lexflags;
581        self.tok = ls.tok;
582        self.tokstr = ls.tokstr.take();
583        self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
584        self.lexbuf.siz = ls.lexbuf_siz;
585        self.lexstop = ls.lexstop;
586        self.toklineno = ls.toklineno;
587    }
588
589    /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
590    /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
591    /// to ENDINPUT so the next gettok starts from a known baseline.
592    /// Note: the constructor `Self::new` already sets equivalent
593    /// defaults; this method exists for the rare case a caller wants
594    /// to recycle a `ZshLexer` across multiple input strings.
595    pub fn lexinit(&mut self) {
596        // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
597        self.nocorrect = 0;
598        self.dbparens = false;
599        self.lexstop = false;
600        // lex.c:444 — `tok = ENDINPUT;`
601        self.tok = LexTok::Endinput;
602    }
603
604    /// Check recursion depth; returns true if exceeded
605    #[inline]
606    fn check_recursion(&mut self) -> bool {
607        if self.recursion_depth > MAX_LEXER_RECURSION {
608            self.error = Some("lexer exceeded max recursion depth".to_string());
609            self.lexstop = true;
610            true
611        } else {
612            false
613        }
614    }
615
616    /// Check and increment global iteration counter; returns true if limit exceeded
617    #[inline]
618    fn check_iterations(&mut self) -> bool {
619        self.global_iterations += 1;
620        if self.global_iterations > 50_000 {
621            self.error = Some("lexer exceeded 50K iterations".to_string());
622            self.lexstop = true;
623            self.tok = LexTok::Lexerr;
624            true
625        } else {
626            false
627        }
628    }
629
630    /// Get next character from input
631    fn hgetc(&mut self) -> Option<char> {
632        if self.check_iterations() {
633            return None;
634        }
635
636        // Re-read from unget_buf: increment lineno on `\n` HERE
637        // too. hungetc() decremented lineno when the char was put
638        // back; without a matching increment on the way out, every
639        // `\n` that's ungetted-then-reread leaves lineno
640        // permanently one short. Symptom: $LINENO stuck at 1 in
641        // every script statement because the parser ungets the
642        // separating newline once between statements.
643        if let Some(c) = self.unget_buf.pop_front() {
644            if c == '\n' {
645                self.lineno += 1;
646            }
647            return Some(c);
648        }
649
650        let c = self.input[self.pos..].chars().next()?;
651        self.pos += c.len_utf8();
652
653        if c == '\n' {
654            self.lineno += 1;
655        }
656
657        Some(c)
658    }
659
660    /// Put character back into input
661    fn hungetc(&mut self, c: char) {
662        self.unget_buf.push_front(c);
663        if c == '\n' && self.lineno > 1 {
664            self.lineno -= 1;
665        }
666        self.lexstop = false;
667    }
668
669    /// Peek at next character without consuming
670    #[allow(dead_code)]
671    fn peek(&mut self) -> Option<char> {
672        if let Some(&c) = self.unget_buf.front() {
673            return Some(c);
674        }
675        self.input[self.pos..].chars().next()
676    }
677
678    /// Add character to token buffer
679    fn add(&mut self, c: char) {
680        self.lexbuf.add(c);
681    }
682
683    /// Check if character is blank (space or tab)
684    fn is_blank(c: char) -> bool {
685        c == ' ' || c == '\t'
686    }
687
688    /// Peek for a zsh numeric range glob shape after a `<`: returns the
689    /// captured `N*-M*>` (everything *after* the leading `<`) when the
690    /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
691    /// None and leaves the input untouched.
692    fn try_numeric_range_glob(&mut self) -> Option<String> {
693        let mut buf: Vec<char> = Vec::new();
694        // optional leading digits
695        loop {
696            match self.hgetc() {
697                Some(c) if c.is_ascii_digit() => buf.push(c),
698                Some(c) => {
699                    buf.push(c);
700                    break;
701                }
702                None => break,
703            }
704        }
705        // last char in buf must be '-' for the range form
706        if buf.last() != Some(&'-') {
707            for c in buf.iter().rev() {
708                self.hungetc(*c);
709            }
710            return None;
711        }
712        // optional trailing digits
713        loop {
714            match self.hgetc() {
715                Some(c) if c.is_ascii_digit() => buf.push(c),
716                Some(c) => {
717                    buf.push(c);
718                    break;
719                }
720                None => break,
721            }
722        }
723        if buf.last() != Some(&'>') {
724            for c in buf.iter().rev() {
725                self.hungetc(*c);
726            }
727            return None;
728        }
729        Some(buf.into_iter().collect())
730    }
731
732    /// Check if character is blank (including other whitespace except newline)
733    fn is_inblank(c: char) -> bool {
734        matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
735    }
736
737    /// Check if character is a digit
738    fn is_digit(c: char) -> bool {
739        c.is_ascii_digit()
740    }
741
742    /// Check if character is identifier start
743    #[allow(dead_code)]
744    fn is_ident_start(c: char) -> bool {
745        c.is_ascii_alphabetic() || c == '_'
746    }
747
748    /// Check if character is identifier continuation
749    fn is_ident(c: char) -> bool {
750        c.is_ascii_alphanumeric() || c == '_'
751    }
752
753    /// Main lexer entry point — fetch the next token. Direct port of
754    /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
755    /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
756    /// followed by here-doc draining (lex.c:278-306), newline tracking
757    /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
758    ///
759    /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
760    /// the loop. The C source iterates as long as exalias keeps
761    /// re-injecting alias text into the input buffer; zshrs's alias
762    /// expansion happens post-lex in exec.rs. The loop body therefore
763    /// runs once and breaks unconditionally — documented divergence.
764    pub fn zshlex(&mut self) {
765        // lex.c:268-269 — early-out on prior LEXERR.
766        if self.tok == LexTok::Lexerr {
767            return;
768        }
769
770        // Note: Do NOT reset global_iterations here - it must accumulate across all
771        // zshlex calls in a parse to prevent infinite loops in the parser
772
773        // lex.c:270-276 — gettok / exalias loop. Without exalias wired,
774        // the inner body runs once and we `break` unconditionally.
775        loop {
776            // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
777            // detection.
778            if self.inrepeat > 0 {
779                self.inrepeat += 1;
780            }
781            // lex.c:273-274 — at the third token after `repeat`,
782            // SHORTLOOPS / SHORTREPEAT options force back into cmd
783            // position so the loop body can start. zshrs unconditionally
784            // does this since the option-lookup lives in exec.rs.
785            if self.inrepeat == 3 {
786                self.incmdpos = true;
787            }
788
789            // lex.c:275 — `tok = gettok();`
790            self.tok = self.gettok();
791
792            // lex.c:276 — `while (tok != ENDINPUT && exalias())` —
793            // when exalias re-injects alias text it returns true and
794            // the loop iterates. Without exalias wired, we break.
795            break;
796        }
797
798        // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
799        // so the persistent low bit survives but the per-word bit is
800        // dropped.
801        self.nocorrect &= 1;
802
803        // lex.c:278-306 — drain pending here-documents at the start
804        // of a new line. zshrs's process_heredocs reads the full body
805        // and stitches it onto the matching redir token.
806        if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
807            self.process_heredocs();
808        }
809
810        // lex.c:307-310 — track whether we just saw a newline.
811        // C uses `inbufct` to distinguish "newline at EOF" (=1)
812        // from "newline mid-input" (=-1); zshrs reads `pos < len`.
813        if self.tok != LexTok::Newlin {
814            self.isnewlin = 0;
815        } else {
816            self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
817        }
818
819        // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
820        // LEXFLAGS_NEWLINE is set to preserve newlines (used by
821        // ZLE for completion of partial lines).
822        if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
823            self.tok = LexTok::Seper;
824        }
825
826        // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
827        //   - `{` only promotes to INBRACE in command position
828        //   - `}` promotes to OUTBRACE either in cmdpos OR via the
829        //     special `closing-brace-special` rule (IGNOREBRACES unset
830        //     — assumed since zshrs doesn't expose that option yet)
831        //   - other reserved words: only when incmdpos (or `}` exception)
832        if self.tok == LexTok::String {
833            if let Some(ref s) = self.tokstr {
834                if s == "{" && self.incmdpos {
835                    self.tok = LexTok::Inbrace;
836                } else if s == "}" {
837                    self.tok = LexTok::Outbrace;
838                } else if self.incasepat == 0 {
839                    // Skip reserved word checking in case pattern context —
840                    // words like `time`, `end` should be patterns, not
841                    // keywords.
842                    self.check_reserved_word();
843                }
844            }
845        }
846
847        // If we were expecting a heredoc terminator, register it now
848        if self.heredoc_pending > 0 && self.tok == LexTok::String {
849            if let Some(ref terminator) = self.tokstr {
850                let strip_tabs = self.heredoc_pending == 2;
851                // Detect originally-quoted terminator (`<<'EOF'`,
852                // `<<"EOF"`). The lexer wraps single-quoted text in
853                // SNULL (`\u{9d}`) and double-quoted text in DNULL
854                // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
855                // heredocs disable variable / command-sub / arithmetic
856                // expansion in the body — see `compile_redir` for the
857                // expansion side.
858                // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
859                // disable expansion in the body. SNULL/DNULL mark
860                // single/double-quoted spans; BNULL (`\u{9f}`) marks
861                // any backslash-escaped char — its presence alone is
862                // enough to flag the terminator as quoted (zsh's
863                // `<<\EOF` shorthand for `<<'EOF'`).
864                let quoted = terminator.contains('\u{9d}')
865                    || terminator.contains('\u{9e}')
866                    || terminator.contains('\u{9f}')
867                    || terminator.starts_with('\'')
868                    || terminator.starts_with('"');
869                let term = terminator
870                    .chars()
871                    .filter(|c| {
872                        *c != '\''
873                            && *c != '"'
874                            && *c != '\u{9d}'
875                            && *c != '\u{9e}'
876                            && *c != '\u{9f}'
877                    })
878                    .collect::<String>();
879                self.heredocs.push(HereDoc {
880                    terminator: term,
881                    strip_tabs,
882                    content: String::new(),
883                    quoted,
884                    processed: false,
885                });
886            }
887            self.heredoc_pending = 0;
888        }
889
890        // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
891        if self.incond > 0 {
892            if let Some(ref s) = self.tokstr {
893                // Check if this token is a comparison operator
894                // Note: single = is also a comparison operator in [[ ]]
895                // The internal marker \u{8d} is used for =
896                if s == "="
897                    || s == "=="
898                    || s == "!="
899                    || s == "=~"
900                    || s == "\u{8d}"
901                    || s == "\u{8d}\u{8d}"
902                    || s == "!\u{8d}"
903                    || s == "\u{8d}~"
904                    || s == "\u{8d}\u{98}"
905                {
906                    self.incondpat = true;
907                } else if self.incondpat {
908                    // We were in pattern context, now we've consumed the pattern
909                    // Reset after the pattern token is consumed
910                    // But actually, pattern can span multiple tokens, so we should
911                    // stay in pattern mode until ]] or && or ||
912                }
913            }
914            // Reset pattern context on ]] or logical operators (&&, ||)
915            // and grouping parens. zsh par_cond_3 (cond.c) treats
916            // these as cond-pattern terminators — the next operand is
917            // a fresh primary, NOT a continuation of the prior pattern.
918            // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
919            // after `[[ a == a && (b == b ... ` was lexed as a literal
920            // glob char (incondpat=true → gettokstr) and the whole
921            // remainder collapsed into one String token.
922            match self.tok {
923                LexTok::Doutbrack
924                | LexTok::Damper
925                | LexTok::Dbar
926                | LexTok::Inpar
927                | LexTok::Outpar
928                | LexTok::Bang => {
929                    self.incondpat = false;
930                }
931                _ => {}
932            }
933        } else {
934            self.incondpat = false;
935        }
936
937        // Update command position for next token based on current token
938        // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
939        // so we don't set incmdpos after Bar in that context
940        match self.tok {
941            LexTok::Seper
942            | LexTok::Newlin
943            | LexTok::Semi
944            | LexTok::Dsemi
945            | LexTok::Semiamp
946            | LexTok::Semibar
947            | LexTok::Amper
948            | LexTok::Amperbang
949            | LexTok::Inpar
950            | LexTok::Inbrace
951            | LexTok::Dbar
952            | LexTok::Damper
953            | LexTok::Baramp
954            | LexTok::Inoutpar
955            | LexTok::Doloop
956            | LexTok::Then
957            | LexTok::Elif
958            | LexTok::Else
959            | LexTok::Doutbrack
960            | LexTok::Func => {
961                self.incmdpos = true;
962            }
963            LexTok::Bar
964                // In case patterns, | is a pattern separator - don't change incmdpos
965                if self.incasepat <= 0 => {
966                    self.incmdpos = true;
967                }
968            LexTok::String
969            | LexTok::Typeset
970            | LexTok::Envarray
971            | LexTok::Outpar
972            | LexTok::Case
973            | LexTok::Dinbrack => {
974                self.incmdpos = false;
975            }
976            _ => {}
977        }
978
979        // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
980        // When we see 'for', set infor=2 to expect the init and cond parts
981        // Each Dinpar (after semicolon in arithmetic) decrements it
982        if self.tok != LexTok::Dinpar {
983            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
984        }
985
986        // Handle redirection context
987        let oldpos = self.incmdpos;
988        if self.tok.is_redirop()
989            || self.tok == LexTok::For
990            || self.tok == LexTok::Foreach
991            || self.tok == LexTok::Select
992        {
993            self.inredir = true;
994            self.incmdpos = false;
995        } else if self.inredir {
996            self.incmdpos = oldpos;
997            self.inredir = false;
998        }
999    }
1000
1001    /// Process pending here-documents. Walks each heredoc whose body
1002    /// hasn't been filled yet (content is empty AND terminator is set),
1003    /// reads lines from input until the terminator, and stuffs the body
1004    /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1005    /// parser can index into it after parse() finishes.
1006    fn process_heredocs(&mut self) {
1007        let n = self.heredocs.len();
1008        for i in 0..n {
1009            // Skip heredocs we've already processed AND those without
1010            // a terminator (early-error case). The `processed` bool
1011            // distinguishes "filled with empty body" from "not yet
1012            // visited" — both have empty `content`.
1013            if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1014                continue;
1015            }
1016            let strip_tabs = self.heredocs[i].strip_tabs;
1017            let terminator = self.heredocs[i].terminator.clone();
1018            let mut content = String::new();
1019            let mut line_count = 0;
1020
1021            loop {
1022                line_count += 1;
1023                if line_count > 10000 {
1024                    self.error = Some("heredoc exceeded 10000 lines".to_string());
1025                    self.tok = LexTok::Lexerr;
1026                    return;
1027                }
1028
1029                let line = self.read_line();
1030                if line.is_none() {
1031                    self.error = Some("here document too large or unterminated".to_string());
1032                    self.tok = LexTok::Lexerr;
1033                    return;
1034                }
1035
1036                let line = line.unwrap();
1037                let check_line = if strip_tabs {
1038                    line.trim_start_matches('\t')
1039                } else {
1040                    line.as_str()
1041                };
1042
1043                if check_line.trim_end_matches('\n') == terminator {
1044                    break;
1045                }
1046
1047                // `<<-` strips leading tabs from BODY lines too, not just
1048                // from terminator-match comparison. Without this, tabs in
1049                // here-doc content survive into stdin.
1050                if strip_tabs {
1051                    content.push_str(check_line);
1052                } else {
1053                    content.push_str(&line);
1054                }
1055            }
1056
1057            self.heredocs[i].content = content;
1058            self.heredocs[i].processed = true;
1059        }
1060    }
1061
1062    /// Read a line from input (returns partial line at EOF)
1063    fn read_line(&mut self) -> Option<String> {
1064        let mut line = String::new();
1065
1066        loop {
1067            match self.hgetc() {
1068                Some(c) => {
1069                    line.push(c);
1070                    if c == '\n' {
1071                        break;
1072                    }
1073                }
1074                None => {
1075                    // EOF - return partial line if any
1076                    if line.is_empty() {
1077                        return None;
1078                    }
1079                    break;
1080                }
1081            }
1082        }
1083
1084        Some(line)
1085    }
1086
1087    /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1088    /// `gettok`. Reads characters from the input via hgetc, dispatches
1089    /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1090    /// uses inline `match` in lex_initial / lex_inang / lex_outang
1091    /// since Rust pattern-matching subsumes the table dispatch).
1092    ///
1093    /// Structural divergence from C: the giant ~322-line C switch
1094    /// statement at lex.c:725-936 is split into helper methods in
1095    /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1096    /// lex_inang / lex_outang for the < and > arms). The flow is
1097    /// equivalent — same chars consumed, same tokens emitted — but
1098    /// the source-level layout differs. C's table-driven dispatch
1099    /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1100    /// which is what the helpers ultimately do.
1101    fn gettok(&mut self) -> LexTok {
1102        // lex.c:621 — `tokstr = NULL;` reset before each token.
1103        self.tokstr = None;
1104        // (zshrs-specific: tokfd reset lives here too — C does it
1105        // implicitly via the `peekfd = -1` local at lex.c:617 used
1106        // only when a digit-prefix redirection is detected.)
1107        self.tokfd = -1;
1108
1109        // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1110        // skip leading blanks (space/tab, NOT newline).
1111        let mut ws_iterations = 0;
1112        loop {
1113            ws_iterations += 1;
1114            if ws_iterations > 100_000 {
1115                self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1116                return LexTok::Lexerr;
1117            }
1118            let c = match self.hgetc() {
1119                Some(c) => c,
1120                None => {
1121                    // lex.c:624-625 — lexstop set, return ENDINPUT
1122                    // (or LEXERR if errflag is set elsewhere).
1123                    self.lexstop = true;
1124                    return if self.error.is_some() {
1125                        LexTok::Lexerr
1126                    } else {
1127                        LexTok::Endinput
1128                    };
1129                }
1130            };
1131
1132            if !Self::is_blank(c) {
1133                self.hungetc(c);
1134                break;
1135            }
1136        }
1137
1138        let c = match self.hgetc() {
1139            Some(c) => c,
1140            None => {
1141                self.lexstop = true;
1142                return LexTok::Endinput;
1143            }
1144        };
1145
1146        // lex.c:623 — `toklineno = lineno;`
1147        self.toklineno = self.lineno;
1148        // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1149        // blank.
1150        self.isfirstln = false;
1151
1152        // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1153        // call dquote_parse with `;` or `)` as the end-char and
1154        // either return DINPAR (continue for-loop arith) or DOUTPAR
1155        // (close the arith block) or LEXERR.
1156        if self.dbparens {
1157            return self.lex_arith(c);
1158        }
1159
1160        // lex.c:649-668 — digit prefix on a redirection: `2> file`
1161        // treats `2` as the fd to redirect, not a literal arg. Three
1162        // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1163        // anything else (push back, treat as literal digit).
1164        if Self::is_digit(c) {
1165            let d = self.hgetc();
1166            match d {
1167                Some('&') => {
1168                    let e = self.hgetc();
1169                    if e == Some('>') {
1170                        // lex.c:653-657 — `N&>` shape detected.
1171                        self.tokfd = (c as u8 - b'0') as i32;
1172                        self.hungetc('>');
1173                        return self.lex_initial('&');
1174                    }
1175                    // lex.c:658-661 — not `N&>`, push everything back.
1176                    if let Some(e) = e {
1177                        self.hungetc(e);
1178                    }
1179                    self.hungetc('&');
1180                }
1181                Some('>') | Some('<') => {
1182                    // lex.c:662-664 — `N>` or `N<` shape detected.
1183                    self.tokfd = (c as u8 - b'0') as i32;
1184                    return self.lex_initial(d.unwrap());
1185                }
1186                Some(d) => {
1187                    // lex.c:665-668 — not a redir prefix, push back.
1188                    self.hungetc(d);
1189                }
1190                None => {}
1191            }
1192            self.lexstop = false;
1193        }
1194
1195        // lex.c:670-936 — main dispatch on the leading char. zshrs
1196        // delegates to lex_initial which holds the equivalent of
1197        // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1198        // for LX1_OTHER.
1199        self.lex_initial(c)
1200    }
1201
1202    /// Lex (( ... )) arithmetic expression
1203    fn lex_arith(&mut self, c: char) -> LexTok {
1204        self.lexbuf.clear();
1205        self.hungetc(c);
1206
1207        let end_char = if self.infor > 0 { ';' } else { ')' };
1208        if self.dquote_parse(end_char, false).is_err() {
1209            return LexTok::Lexerr;
1210        }
1211
1212        self.tokstr = Some(self.lexbuf.as_str().to_string());
1213
1214        if !self.lexstop && self.infor > 0 {
1215            self.infor -= 1;
1216            return LexTok::Dinpar;
1217        }
1218
1219        // Check for closing ))
1220        match self.hgetc() {
1221            Some(')') => {
1222                self.dbparens = false;
1223                LexTok::Doutpar
1224            }
1225            c => {
1226                if let Some(c) = c {
1227                    self.hungetc(c);
1228                }
1229                LexTok::Lexerr
1230            }
1231        }
1232    }
1233
1234    /// Handle initial character of token
1235    fn lex_initial(&mut self, c: char) -> LexTok {
1236        // Handle comments
1237        if c == '#' && !self.nocomments {
1238            return self.lex_comment();
1239        }
1240
1241        match c {
1242            '\\' => {
1243                let d = self.hgetc();
1244                if d == Some('\n') {
1245                    // Line continuation - get next token
1246                    return self.gettok();
1247                }
1248                if let Some(d) = d {
1249                    self.hungetc(d);
1250                }
1251                self.lexstop = false;
1252                self.gettokstr(c, false)
1253            }
1254
1255            '\n' => LexTok::Newlin,
1256
1257            ';' => {
1258                let d = self.hgetc();
1259                match d {
1260                    Some(';') => LexTok::Dsemi,
1261                    Some('&') => LexTok::Semiamp,
1262                    Some('|') => LexTok::Semibar,
1263                    _ => {
1264                        if let Some(d) = d {
1265                            self.hungetc(d);
1266                        }
1267                        self.lexstop = false;
1268                        LexTok::Semi
1269                    }
1270                }
1271            }
1272
1273            '&' => {
1274                let d = self.hgetc();
1275                match d {
1276                    Some('&') => LexTok::Damper,
1277                    Some('!') | Some('|') => LexTok::Amperbang,
1278                    Some('>') => {
1279                        self.tokfd = self.tokfd.max(0);
1280                        let e = self.hgetc();
1281                        match e {
1282                            Some('!') | Some('|') => LexTok::Outangampbang,
1283                            Some('>') => {
1284                                let f = self.hgetc();
1285                                match f {
1286                                    Some('!') | Some('|') => LexTok::Doutangampbang,
1287                                    _ => {
1288                                        if let Some(f) = f {
1289                                            self.hungetc(f);
1290                                        }
1291                                        self.lexstop = false;
1292                                        LexTok::Doutangamp
1293                                    }
1294                                }
1295                            }
1296                            _ => {
1297                                if let Some(e) = e {
1298                                    self.hungetc(e);
1299                                }
1300                                self.lexstop = false;
1301                                LexTok::Ampoutang
1302                            }
1303                        }
1304                    }
1305                    _ => {
1306                        if let Some(d) = d {
1307                            self.hungetc(d);
1308                        }
1309                        self.lexstop = false;
1310                        LexTok::Amper
1311                    }
1312                }
1313            }
1314
1315            '|' => {
1316                let d = self.hgetc();
1317                match d {
1318                    Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1319                    Some('&') => LexTok::Baramp,
1320                    _ => {
1321                        if let Some(d) = d {
1322                            self.hungetc(d);
1323                        }
1324                        self.lexstop = false;
1325                        LexTok::Bar
1326                    }
1327                }
1328            }
1329
1330            '(' => {
1331                let d = self.hgetc();
1332                match d {
1333                    Some('(') => {
1334                        if self.infor > 0 {
1335                            self.dbparens = true;
1336                            return LexTok::Dinpar;
1337                        }
1338                        if self.incmdpos {
1339                            // Could be (( arithmetic )) or ( subshell )
1340                            self.lexbuf.clear();
1341                            match self.cmd_or_math() {
1342                                CmdOrMath::Math => {
1343                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
1344                                    return LexTok::Dinpar;
1345                                }
1346                                CmdOrMath::Cmd => {
1347                                    self.tokstr = None;
1348                                    return LexTok::Inpar;
1349                                }
1350                                CmdOrMath::Err => return LexTok::Lexerr,
1351                            }
1352                        }
1353                        self.hungetc('(');
1354                        self.lexstop = false;
1355                        self.gettokstr('(', false)
1356                    }
1357                    Some(')') => LexTok::Inoutpar,
1358                    _ => {
1359                        if let Some(d) = d {
1360                            self.hungetc(d);
1361                        }
1362                        self.lexstop = false;
1363                        // In pattern context (after == != =~ in [[ ]]), ( is part of pattern
1364                        // In case pattern context, ( at start is optional delimiter, not pattern
1365                        // incasepat == 1 means "at start of pattern", > 1 means "inside pattern"
1366                        if self.incondpat || self.incasepat > 1 {
1367                            self.gettokstr('(', false)
1368                        } else if self.incond == 1 || self.incmdpos || self.incasepat == 1 {
1369                            LexTok::Inpar
1370                        } else {
1371                            self.gettokstr('(', false)
1372                        }
1373                    }
1374                }
1375            }
1376
1377            ')' => LexTok::Outpar,
1378
1379            '{' => {
1380                // { is a command group only if followed by whitespace,
1381                // newline, or `}` (the empty-block form `{}`). zsh
1382                // treats `{}` as an empty compound — `foo() {}` is a
1383                // valid no-op function. Without `}` in this list,
1384                // `{}` got consumed as one literal token and ran as a
1385                // command, failing "command not found: {}".
1386                // The empty `{}` is also recognised AFTER a function
1387                // header `name()` even when `incmdpos` got cleared by
1388                // the preceding Outpar — peek for `}` regardless and
1389                // treat as Inbrace so `foo() {}` parses as a no-op
1390                // function body.
1391                let next = self.hgetc();
1392                let next_is_close = matches!(next, Some('}'));
1393                if self.incmdpos {
1394                    let is_brace_group = match next {
1395                        Some(' ') | Some('\t') | Some('\n') | Some('}') | None => true,
1396                        _ => false,
1397                    };
1398                    if let Some(ch) = next {
1399                        self.hungetc(ch);
1400                    }
1401                    if is_brace_group {
1402                        self.tokstr = Some("{".to_string());
1403                        LexTok::Inbrace
1404                    } else {
1405                        self.gettokstr(c, false)
1406                    }
1407                } else if next_is_close {
1408                    // `{}` empty block in non-cmd position (function
1409                    // body after `()`). Treat as Inbrace; the parser
1410                    // will follow with Outbrace.
1411                    if let Some(ch) = next {
1412                        self.hungetc(ch);
1413                    }
1414                    self.tokstr = Some("{".to_string());
1415                    LexTok::Inbrace
1416                } else {
1417                    if let Some(ch) = next {
1418                        self.hungetc(ch);
1419                    }
1420                    self.gettokstr(c, false)
1421                }
1422            }
1423
1424            '}' => {
1425                // } at start of token is always Outbrace (ends command group)
1426                // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1427                self.tokstr = Some("}".to_string());
1428                LexTok::Outbrace
1429            }
1430
1431            '[' => {
1432                // [[ is a conditional expression start
1433                // [ can also be a command (test builtin) or array subscript
1434                // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1435                if self.incasepat > 0 {
1436                    self.gettokstr(c, false)
1437                } else if self.incmdpos {
1438                    let next = self.hgetc();
1439                    if next == Some('[') {
1440                        // [[ - double bracket conditional
1441                        self.tokstr = Some("[[".to_string());
1442                        self.incond = 1;
1443                        return LexTok::Dinbrack;
1444                    }
1445                    // Single [ - either test command or start of glob pattern
1446                    if let Some(ch) = next {
1447                        self.hungetc(ch);
1448                    }
1449                    self.tokstr = Some("[".to_string());
1450                    LexTok::String
1451                } else {
1452                    self.gettokstr(c, false)
1453                }
1454            }
1455
1456            ']' => {
1457                // ]] ends a conditional expression started by [[
1458                if self.incond > 0 {
1459                    let next = self.hgetc();
1460                    if next == Some(']') {
1461                        self.tokstr = Some("]]".to_string());
1462                        self.incond = 0;
1463                        return LexTok::Doutbrack;
1464                    }
1465                    if let Some(ch) = next {
1466                        self.hungetc(ch);
1467                    }
1468                }
1469                self.gettokstr(c, false)
1470            }
1471
1472            '<' => {
1473                // In pattern context, < is literal (e.g., <-> in glob)
1474                if self.incondpat || self.incasepat > 0 {
1475                    self.gettokstr(c, false)
1476                } else {
1477                    self.lex_inang()
1478                }
1479            }
1480
1481            '>' => {
1482                // In pattern context, > is literal
1483                if self.incondpat || self.incasepat > 0 {
1484                    self.gettokstr(c, false)
1485                } else {
1486                    self.lex_outang()
1487                }
1488            }
1489
1490            _ => self.gettokstr(c, false),
1491        }
1492    }
1493
1494    /// Lex comment
1495    fn lex_comment(&mut self) -> LexTok {
1496        if self.lexflags.comments_keep {
1497            self.lexbuf.clear();
1498            self.add('#');
1499        }
1500
1501        loop {
1502            let c = self.hgetc();
1503            match c {
1504                Some('\n') | None => break,
1505                Some(c) => {
1506                    if self.lexflags.comments_keep {
1507                        self.add(c);
1508                    }
1509                }
1510            }
1511        }
1512
1513        if self.lexflags.comments_keep {
1514            self.tokstr = Some(self.lexbuf.as_str().to_string());
1515            if !self.lexstop {
1516                self.hungetc('\n');
1517            }
1518            return LexTok::String;
1519        }
1520
1521        if self.lexflags.comments_strip && self.lexstop {
1522            return LexTok::Endinput;
1523        }
1524
1525        LexTok::Newlin
1526    }
1527
1528    /// Lex < and variants
1529    fn lex_inang(&mut self) -> LexTok {
1530        let d = self.hgetc();
1531        match d {
1532            Some('(') => {
1533                // Process substitution <(...)
1534                self.hungetc('(');
1535                self.lexstop = false;
1536                self.gettokstr('<', false)
1537            }
1538            Some('>') => LexTok::Inoutang,
1539            Some('<') => {
1540                let e = self.hgetc();
1541                match e {
1542                    Some('(') => {
1543                        self.hungetc('(');
1544                        self.hungetc('<');
1545                        LexTok::Inang
1546                    }
1547                    Some('<') => LexTok::Trinang,
1548                    Some('-') => {
1549                        self.heredoc_pending = 2; // <<- expects terminator next
1550                        LexTok::Dinangdash
1551                    }
1552                    _ => {
1553                        if let Some(e) = e {
1554                            self.hungetc(e);
1555                        }
1556                        self.lexstop = false;
1557                        self.heredoc_pending = 1; // << expects terminator next
1558                        LexTok::Dinang
1559                    }
1560                }
1561            }
1562            Some('&') => LexTok::Inangamp,
1563            _ => {
1564                if let Some(d) = d {
1565                    self.hungetc(d);
1566                }
1567                self.lexstop = false;
1568                LexTok::Inang
1569            }
1570        }
1571    }
1572
1573    /// Lex > and variants
1574    fn lex_outang(&mut self) -> LexTok {
1575        let d = self.hgetc();
1576        match d {
1577            Some('(') => {
1578                // Process substitution >(...)
1579                self.hungetc('(');
1580                self.lexstop = false;
1581                self.gettokstr('>', false)
1582            }
1583            Some('&') => {
1584                let e = self.hgetc();
1585                match e {
1586                    Some('!') | Some('|') => LexTok::Outangampbang,
1587                    _ => {
1588                        if let Some(e) = e {
1589                            self.hungetc(e);
1590                        }
1591                        self.lexstop = false;
1592                        LexTok::Outangamp
1593                    }
1594                }
1595            }
1596            Some('!') | Some('|') => LexTok::Outangbang,
1597            Some('>') => {
1598                let e = self.hgetc();
1599                match e {
1600                    Some('&') => {
1601                        let f = self.hgetc();
1602                        match f {
1603                            Some('!') | Some('|') => LexTok::Doutangampbang,
1604                            _ => {
1605                                if let Some(f) = f {
1606                                    self.hungetc(f);
1607                                }
1608                                self.lexstop = false;
1609                                LexTok::Doutangamp
1610                            }
1611                        }
1612                    }
1613                    Some('!') | Some('|') => LexTok::Doutangbang,
1614                    Some('(') => {
1615                        self.hungetc('(');
1616                        self.hungetc('>');
1617                        LexTok::Outang
1618                    }
1619                    _ => {
1620                        if let Some(e) = e {
1621                            self.hungetc(e);
1622                        }
1623                        self.lexstop = false;
1624                        LexTok::Doutang
1625                    }
1626                }
1627            }
1628            _ => {
1629                if let Some(d) = d {
1630                    self.hungetc(d);
1631                }
1632                self.lexstop = false;
1633                LexTok::Outang
1634            }
1635        }
1636    }
1637
1638    /// Get rest of token string
1639    fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
1640        let mut bct = 0; // brace count
1641        let mut pct = 0; // parenthesis count
1642        let mut brct = 0; // bracket count
1643        let mut in_brace_param = 0;
1644        let mut peek = LexTok::String;
1645        let mut intpos = 1;
1646        let mut unmatched = '\0';
1647        let mut c = c;
1648        const MAX_ITERATIONS: usize = 100_000;
1649        let mut iterations = 0;
1650
1651        if !sub {
1652            self.lexbuf.clear();
1653        }
1654
1655        loop {
1656            iterations += 1;
1657            if iterations > MAX_ITERATIONS {
1658                self.error = Some("gettokstr exceeded maximum iterations".to_string());
1659                return LexTok::Lexerr;
1660            }
1661
1662            let inbl = Self::is_inblank(c);
1663
1664            if inbl && in_brace_param == 0 && pct == 0 {
1665                // Whitespace outside brace param ends token
1666                break;
1667            }
1668
1669            match c {
1670                // Whitespace is handled above for most cases
1671                ')' => {
1672                    if in_brace_param > 0 || sub {
1673                        self.add(char_tokens::OUTPAR);
1674                    } else if pct > 0 {
1675                        pct -= 1;
1676                        self.add(char_tokens::OUTPAR);
1677                    } else {
1678                        break;
1679                    }
1680                }
1681
1682                '|' => {
1683                    if pct == 0 && in_brace_param == 0 {
1684                        if sub {
1685                            self.add(c);
1686                        } else {
1687                            break;
1688                        }
1689                    } else {
1690                        self.add(char_tokens::BAR);
1691                    }
1692                }
1693
1694                '$' => {
1695                    let e = self.hgetc();
1696                    match e {
1697                        Some('\\') => {
1698                            let f = self.hgetc();
1699                            if f != Some('\n') {
1700                                if let Some(f) = f {
1701                                    self.hungetc(f);
1702                                }
1703                                self.hungetc('\\');
1704                                self.add(char_tokens::STRING);
1705                            } else {
1706                                // Line continuation after $
1707                                continue;
1708                            }
1709                        }
1710                        Some('[') => {
1711                            // $[...] arithmetic
1712                            self.add(char_tokens::STRING);
1713                            self.add(char_tokens::INBRACK);
1714                            if self.dquote_parse(']', sub).is_err() {
1715                                peek = LexTok::Lexerr;
1716                                break;
1717                            }
1718                            self.add(char_tokens::OUTBRACK);
1719                        }
1720                        Some('(') => {
1721                            // $(...) or $((...))
1722                            self.add(char_tokens::STRING);
1723                            match self.cmd_or_math_sub() {
1724                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
1725                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
1726                                CmdOrMath::Err => {
1727                                    peek = LexTok::Lexerr;
1728                                    break;
1729                                }
1730                            }
1731                        }
1732                        Some('{') => {
1733                            self.add(c);
1734                            self.add(char_tokens::INBRACE);
1735                            bct += 1;
1736                            if in_brace_param == 0 {
1737                                in_brace_param = bct;
1738                            }
1739                        }
1740                        _ => {
1741                            if let Some(e) = e {
1742                                self.hungetc(e);
1743                            }
1744                            self.lexstop = false;
1745                            self.add(char_tokens::STRING);
1746                        }
1747                    }
1748                }
1749
1750                '[' => {
1751                    if in_brace_param == 0 {
1752                        brct += 1;
1753                    }
1754                    self.add(char_tokens::INBRACK);
1755                }
1756
1757                ']' => {
1758                    if in_brace_param == 0 && brct > 0 {
1759                        brct -= 1;
1760                    }
1761                    self.add(char_tokens::OUTBRACK);
1762                }
1763
1764                '(' => {
1765                    // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
1766                    // a STRING and is immediately followed by `)`, the
1767                    // string terminates at the `(`. The `()` is then
1768                    // re-lexed as a separate INOUTPAR token. This handles
1769                    // function definitions: `name()` lexes as STRING `name`
1770                    // + INOUTPAR `()`, not STRING `name()`.
1771                    //
1772                    // Also (lex.c:1109-1112): under SHGLOB, a `(` followed
1773                    // by whitespace at the start of a command-position word
1774                    // (no nested brackets/braces) is a ksh function
1775                    // definition signal — same break-out behavior.
1776                    if in_brace_param == 0 && !sub {
1777                        let e = self.hgetc();
1778                        if let Some(ch) = e {
1779                            self.hungetc(ch);
1780                        }
1781                        self.lexstop = false;
1782                        if e == Some(')') {
1783                            // `name()` — terminate STRING at `(` so the
1784                            // following `()` re-lexes as INOUTPAR. The
1785                            // loop's exit guard at line 2067 will
1786                            // `hungetc(c)` to push the `(` back; we only
1787                            // need to ensure `)` is also there. The
1788                            // hungetc(ch) above already pushed `)`, so
1789                            // breaking here yields unget_buf = [`(`, `)`]
1790                            // after the guard, which the outer dispatch
1791                            // reads as Inoutpar.
1792                            break;
1793                        }
1794                    }
1795                    if in_brace_param == 0 {
1796                        pct += 1;
1797                    }
1798                    self.add(char_tokens::INPAR);
1799                }
1800
1801                '{' => {
1802                    // Track braces for both ${...} param expansion and {...} brace expansion
1803                    bct += 1;
1804                    self.add(c);
1805                }
1806
1807                '}' => {
1808                    if in_brace_param > 0 {
1809                        if bct == in_brace_param {
1810                            in_brace_param = 0;
1811                        }
1812                        bct -= 1;
1813                        self.add(char_tokens::OUTBRACE);
1814                    } else if bct > 0 {
1815                        // Closing a brace expansion like {a,b}
1816                        bct -= 1;
1817                        self.add(c);
1818                    } else {
1819                        break;
1820                    }
1821                }
1822
1823                '>' => {
1824                    // In pattern context (incondpat), > is literal
1825                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1826                        self.add(c);
1827                    } else {
1828                        let e = self.hgetc();
1829                        if e != Some('(') {
1830                            if let Some(e) = e {
1831                                self.hungetc(e);
1832                            }
1833                            self.lexstop = false;
1834                            break;
1835                        }
1836                        // >(...)
1837                        self.add(char_tokens::OUTANGPROC);
1838                        if self.skip_command_sub().is_err() {
1839                            peek = LexTok::Lexerr;
1840                            break;
1841                        }
1842                        self.add(char_tokens::OUTPAR);
1843                    }
1844                }
1845
1846                '<' => {
1847                    // In pattern context (incondpat), < is literal
1848                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1849                        self.add(c);
1850                    } else if let Some(range_chars) = self.try_numeric_range_glob() {
1851                        // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
1852                        // `<-M>`. When `<` mid-word matches that exact
1853                        // shape, swallow it into the word instead of
1854                        // breaking out for redirection.
1855                        self.add(c);
1856                        for ch in range_chars.chars() {
1857                            self.add(ch);
1858                        }
1859                    } else {
1860                        let e = self.hgetc();
1861                        if e != Some('(') {
1862                            if let Some(e) = e {
1863                                self.hungetc(e);
1864                            }
1865                            self.lexstop = false;
1866                            break;
1867                        }
1868                        // <(...)
1869                        self.add(char_tokens::INANG);
1870                        if self.skip_command_sub().is_err() {
1871                            peek = LexTok::Lexerr;
1872                            break;
1873                        }
1874                        self.add(char_tokens::OUTPAR);
1875                    }
1876                }
1877
1878                '=' => {
1879                    if !sub {
1880                        if intpos > 0 {
1881                            // At start of token, check for =(...) process substitution
1882                            let e = self.hgetc();
1883                            if e == Some('(') {
1884                                self.add(char_tokens::EQUALS);
1885                                if self.skip_command_sub().is_err() {
1886                                    peek = LexTok::Lexerr;
1887                                    break;
1888                                }
1889                                self.add(char_tokens::OUTPAR);
1890                            } else {
1891                                if let Some(e) = e {
1892                                    self.hungetc(e);
1893                                }
1894                                self.lexstop = false;
1895                                self.add(char_tokens::EQUALS);
1896                            }
1897                        } else if peek != LexTok::Envstring
1898                            && (self.incmdpos || self.intypeset)
1899                            && bct == 0
1900                            && brct == 0
1901                            && self.incasepat == 0
1902                        {
1903                            // Check for VAR=value assignment (but not in case pattern context)
1904                            let tok_so_far = self.lexbuf.as_str().to_string();
1905                            if self.is_valid_assignment_target(&tok_so_far) {
1906                                let next = self.hgetc();
1907                                if next == Some('(') {
1908                                    // VAR=(...) array assignment. Per zsh
1909                                    // (lex.c emits ENVARRAY with tokstr =
1910                                    // just the variable name, NOT
1911                                    // including the `=`). The `=` and
1912                                    // `(` are consumed by the lexer; the
1913                                    // parser knows ENVARRAY means assign-
1914                                    // array and reads the body that
1915                                    // follows.
1916                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
1917                                    return LexTok::Envarray;
1918                                }
1919                                if let Some(next) = next {
1920                                    self.hungetc(next);
1921                                }
1922                                self.lexstop = false;
1923                                peek = LexTok::Envstring;
1924                                intpos = 2;
1925                                self.add(char_tokens::EQUALS);
1926                            } else {
1927                                self.add(char_tokens::EQUALS);
1928                            }
1929                        } else {
1930                            self.add(char_tokens::EQUALS);
1931                        }
1932                    } else {
1933                        self.add(char_tokens::EQUALS);
1934                    }
1935                }
1936
1937                '\\' => {
1938                    let next = self.hgetc();
1939                    if next == Some('\n') {
1940                        // Line continuation
1941                        let next = self.hgetc();
1942                        if let Some(next) = next {
1943                            c = next;
1944                            continue;
1945                        }
1946                        break;
1947                    } else {
1948                        self.add(char_tokens::BNULL);
1949                        if let Some(next) = next {
1950                            self.add(next);
1951                        }
1952                    }
1953                }
1954
1955                '\'' => {
1956                    // Single quoted string - everything literal until '
1957                    self.add(char_tokens::SNULL);
1958                    loop {
1959                        let ch = self.hgetc();
1960                        match ch {
1961                            Some('\'') => break,
1962                            Some(ch) => self.add(ch),
1963                            None => {
1964                                self.lexstop = true;
1965                                unmatched = '\'';
1966                                peek = LexTok::Lexerr;
1967                                break;
1968                            }
1969                        }
1970                    }
1971                    if unmatched != '\0' {
1972                        break;
1973                    }
1974                    self.add(char_tokens::SNULL);
1975                }
1976
1977                '"' => {
1978                    // Double quoted string
1979                    self.add(char_tokens::DNULL);
1980                    if self.dquote_parse('"', sub).is_err() {
1981                        unmatched = '"';
1982                        if !self.lexflags.active {
1983                            peek = LexTok::Lexerr;
1984                        }
1985                        break;
1986                    }
1987                    self.add(char_tokens::DNULL);
1988                }
1989
1990                '`' => {
1991                    // Backtick command substitution
1992                    self.add(char_tokens::TICK);
1993                    loop {
1994                        let ch = self.hgetc();
1995                        match ch {
1996                            Some('`') => break,
1997                            Some('\\') => {
1998                                let next = self.hgetc();
1999                                match next {
2000                                    Some('\n') => continue, // Line continuation
2001                                    Some(c) if c == '`' || c == '\\' || c == '$' => {
2002                                        self.add(char_tokens::BNULL);
2003                                        self.add(c);
2004                                    }
2005                                    Some(c) => {
2006                                        self.add('\\');
2007                                        self.add(c);
2008                                    }
2009                                    None => break,
2010                                }
2011                            }
2012                            Some(ch) => self.add(ch),
2013                            None => {
2014                                self.lexstop = true;
2015                                unmatched = '`';
2016                                peek = LexTok::Lexerr;
2017                                break;
2018                            }
2019                        }
2020                    }
2021                    if unmatched != '\0' {
2022                        break;
2023                    }
2024                    self.add(char_tokens::TICK);
2025                }
2026
2027                '~' => {
2028                    self.add(char_tokens::TILDE);
2029                }
2030
2031                '#' => {
2032                    self.add(char_tokens::POUND);
2033                }
2034
2035                '^' => {
2036                    self.add(char_tokens::HAT);
2037                }
2038
2039                '*' => {
2040                    self.add(char_tokens::STAR);
2041                }
2042
2043                '?' => {
2044                    self.add(char_tokens::QUEST);
2045                }
2046
2047                ',' if bct > in_brace_param => {
2048                    self.add(char_tokens::COMMA);
2049                }
2050
2051                '-' => {
2052                    self.add(char_tokens::DASH);
2053                }
2054
2055                '!' if brct > 0 => {
2056                    self.add(char_tokens::BANG);
2057                }
2058
2059                // Terminators
2060                '\n' | ';' | '&' => {
2061                    break;
2062                }
2063
2064                _ => {
2065                    self.add(c);
2066                }
2067            }
2068
2069            c = match self.hgetc() {
2070                Some(c) => c,
2071                None => {
2072                    self.lexstop = true;
2073                    break;
2074                }
2075            };
2076
2077            if intpos > 0 {
2078                intpos -= 1;
2079            }
2080        }
2081
2082        // Put back the character that ended the token
2083        if !self.lexstop {
2084            self.hungetc(c);
2085        }
2086
2087        if unmatched != '\0' && !self.lexflags.active {
2088            self.error = Some(format!("unmatched {}", unmatched));
2089        }
2090
2091        if in_brace_param > 0 {
2092            self.error = Some("closing brace expected".to_string());
2093        }
2094
2095        self.tokstr = Some(self.lexbuf.as_str().to_string());
2096        peek
2097    }
2098
2099    /// Check if a string is a valid assignment target (identifier or array ref).
2100    ///
2101    /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2102    /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2103    /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2104    /// are caught by the subscript handler, not here). And the first char
2105    /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2106    /// the STRING token 0x85) is parameter substitution with the `=` flag,
2107    /// NOT an envstring assignment.
2108    fn is_valid_assignment_target(&self, s: &str) -> bool {
2109        let mut chars = s.chars().peekable();
2110
2111        // Reject leading token byte — `$VAR=` is parameter substitution,
2112        // not assignment. Same for `*=`, `?=`, etc.
2113        if let Some(&c) = chars.peek() {
2114            if char_tokens::is_token(c) {
2115                return false;
2116            }
2117        }
2118
2119        // Check for leading digit (invalid)
2120        if let Some(&c) = chars.peek() {
2121            if c.is_ascii_digit() {
2122                // Could be array index, check rest
2123                while let Some(&c) = chars.peek() {
2124                    if !c.is_ascii_digit() {
2125                        break;
2126                    }
2127                    chars.next();
2128                }
2129                return chars.peek().is_none();
2130            }
2131        }
2132
2133        // Check identifier
2134        let mut has_ident = false;
2135        while let Some(&c) = chars.peek() {
2136            if c == char_tokens::INBRACK || c == '[' {
2137                break;
2138            }
2139            if c == '+' {
2140                // foo+=value
2141                chars.next();
2142                return chars.peek().is_none() || chars.peek() == Some(&'=');
2143            }
2144            if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2145                return false;
2146            }
2147            has_ident = true;
2148            chars.next();
2149        }
2150
2151        has_ident
2152    }
2153
2154    /// Parse the body of a double-quoted string (or any context that
2155    /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2156    /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2157    /// chars until `endchar` is seen at depth 0, handling escapes,
2158    /// `${...}` parameter substitutions, `$(...)` and backtick command
2159    /// substitutions, `$((...))` arithmetic, and inner double-quoted
2160    /// strings. The `sub` flag toggles substitution-context tokens
2161    /// (lex.c:1487 `int sub` argument).
2162    ///
2163    /// zshrs port note: the recursion guard at the top is a Rust
2164    /// safety net; the C source relies on the runtime stack. Inner
2165    /// logic delegates to `dquote_parse_inner` which holds the actual
2166    /// per-char state machine matching lex.c:1495-1692.
2167    fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2168        self.recursion_depth += 1;
2169        if self.check_recursion() {
2170            self.recursion_depth -= 1;
2171            return Err(());
2172        }
2173
2174        let result = self.dquote_parse_inner(endchar, sub);
2175        self.recursion_depth -= 1;
2176        result
2177    }
2178
2179    fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2180        let mut pct = 0; // parenthesis count
2181        let mut brct = 0; // bracket count
2182        let mut bct = 0; // brace count (for ${...})
2183        let mut intick = false; // inside backtick
2184        let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
2185        const MAX_ITERATIONS: usize = 100_000;
2186        let mut iterations = 0;
2187
2188        loop {
2189            iterations += 1;
2190            if iterations > MAX_ITERATIONS {
2191                self.error = Some("dquote_parse exceeded maximum iterations".to_string());
2192                return Err(());
2193            }
2194            let c = self.hgetc();
2195            let c = match c {
2196                Some(c) if c == endchar && !intick && bct == 0 => {
2197                    if is_math && (pct > 0 || brct > 0) {
2198                        self.add(c);
2199                        if c == ')' {
2200                            pct -= 1;
2201                        } else if c == ']' {
2202                            brct -= 1;
2203                        }
2204                        continue;
2205                    }
2206                    return Ok(());
2207                }
2208                Some(c) => c,
2209                None => {
2210                    self.lexstop = true;
2211                    return Err(());
2212                }
2213            };
2214
2215            match c {
2216                '\\' => {
2217                    let next = self.hgetc();
2218                    match next {
2219                        Some('\n') if !sub => continue, // Line continuation
2220                        Some(c)
2221                            if c == '$'
2222                                || c == '\\'
2223                                || (c == '}' && !intick && bct > 0)
2224                                || c == endchar
2225                                || c == '`'
2226                                || (endchar == ']'
2227                                    && (c == '['
2228                                        || c == ']'
2229                                        || c == '('
2230                                        || c == ')'
2231                                        || c == '{'
2232                                        || c == '}'
2233                                        || (c == '"' && sub))) =>
2234                        {
2235                            self.add(char_tokens::BNULL);
2236                            self.add(c);
2237                        }
2238                        Some(c) => {
2239                            self.add('\\');
2240                            self.hungetc(c);
2241                            continue;
2242                        }
2243                        None => {
2244                            self.add('\\');
2245                        }
2246                    }
2247                }
2248
2249                '$' => {
2250                    if intick {
2251                        self.add(c);
2252                        continue;
2253                    }
2254                    let next = self.hgetc();
2255                    match next {
2256                        Some('(') => {
2257                            self.add(char_tokens::QSTRING);
2258                            match self.cmd_or_math_sub() {
2259                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
2260                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
2261                                CmdOrMath::Err => return Err(()),
2262                            }
2263                        }
2264                        Some('[') => {
2265                            self.add(char_tokens::STRING);
2266                            self.add(char_tokens::INBRACK);
2267                            self.dquote_parse(']', sub)?;
2268                            self.add(char_tokens::OUTBRACK);
2269                        }
2270                        Some('{') => {
2271                            self.add(char_tokens::QSTRING);
2272                            self.add(char_tokens::INBRACE);
2273                            bct += 1;
2274                        }
2275                        Some('$') => {
2276                            self.add(char_tokens::QSTRING);
2277                            self.add('$');
2278                        }
2279                        _ => {
2280                            if let Some(next) = next {
2281                                self.hungetc(next);
2282                            }
2283                            self.lexstop = false;
2284                            self.add(char_tokens::QSTRING);
2285                        }
2286                    }
2287                }
2288
2289                '}' => {
2290                    if intick || bct == 0 {
2291                        self.add(c);
2292                    } else {
2293                        self.add(char_tokens::OUTBRACE);
2294                        bct -= 1;
2295                    }
2296                }
2297
2298                '`' => {
2299                    self.add(char_tokens::QTICK);
2300                    intick = !intick;
2301                }
2302
2303                '(' => {
2304                    if !is_math || bct == 0 {
2305                        pct += 1;
2306                    }
2307                    self.add(c);
2308                }
2309
2310                ')' => {
2311                    if !is_math || bct == 0 {
2312                        if pct == 0 && is_math {
2313                            return Err(());
2314                        }
2315                        pct -= 1;
2316                    }
2317                    self.add(c);
2318                }
2319
2320                '[' => {
2321                    if !is_math || bct == 0 {
2322                        brct += 1;
2323                    }
2324                    self.add(c);
2325                }
2326
2327                ']' => {
2328                    if !is_math || bct == 0 {
2329                        if brct == 0 && is_math {
2330                            return Err(());
2331                        }
2332                        brct -= 1;
2333                    }
2334                    self.add(c);
2335                }
2336
2337                '"' => {
2338                    if intick || (endchar != '"' && bct == 0) {
2339                        self.add(c);
2340                    } else if bct > 0 {
2341                        self.add(char_tokens::DNULL);
2342                        self.dquote_parse('"', sub)?;
2343                        self.add(char_tokens::DNULL);
2344                    } else {
2345                        return Err(());
2346                    }
2347                }
2348
2349                _ => {
2350                    self.add(c);
2351                }
2352            }
2353        }
2354    }
2355
2356    /// Determine if (( is arithmetic or command
2357    /// Decide whether `( ... )` after a `$` is a math expression
2358    /// `$((...))` or a command substitution `$(...)`. Direct port of
2359    /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2360    /// if it succeeds AND the next char is `)` (closing the second
2361    /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2362    /// a command substitution.
2363    fn cmd_or_math(&mut self) -> CmdOrMath {
2364        let oldlen = self.lexbuf.len();
2365
2366        // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2367        // which fills lexbuf with ONLY the inner expression, then checks
2368        // for the closing `)`. The surrounding `((` / `))` are NOT added
2369        // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2370        // ')' after, polluting DINPAR's tokstr with the literal parens.
2371        // Removed to match C exactly.
2372        if self.dquote_parse(')', false).is_err() {
2373            // Back up and try as command
2374            while self.lexbuf.len() > oldlen {
2375                if let Some(c) = self.lexbuf.pop() {
2376                    self.hungetc(c);
2377                }
2378            }
2379            self.hungetc('(');
2380            self.lexstop = false;
2381            return if self.skip_command_sub().is_err() {
2382                CmdOrMath::Err
2383            } else {
2384                CmdOrMath::Cmd
2385            };
2386        }
2387
2388        // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2389        // means `((..))` was math. Don't add `)` to lexbuf.
2390        let c = self.hgetc();
2391        if c == Some(')') {
2392            return CmdOrMath::Math;
2393        }
2394
2395        // Not math, back up
2396        if let Some(c) = c {
2397            self.hungetc(c);
2398        }
2399        self.lexstop = false;
2400
2401        // Back up token
2402        while self.lexbuf.len() > oldlen {
2403            if let Some(c) = self.lexbuf.pop() {
2404                self.hungetc(c);
2405            }
2406        }
2407        self.hungetc('(');
2408
2409        if self.skip_command_sub().is_err() {
2410            CmdOrMath::Err
2411        } else {
2412            CmdOrMath::Cmd
2413        }
2414    }
2415
2416    /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2417    /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2418    /// the next char to discriminate: a leading `(` plus successful
2419    /// math parse via `cmd_or_math` → arithmetic substitution (with
2420    /// the open-paren retroactively rewritten to Inparmath); else
2421    /// command substitution via skip_command_sub.
2422    fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2423        const MAX_CONTINUATIONS: usize = 10_000;
2424        let mut continuations = 0;
2425
2426        loop {
2427            continuations += 1;
2428            if continuations > MAX_CONTINUATIONS {
2429                self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2430                return CmdOrMath::Err;
2431            }
2432
2433            let c = self.hgetc();
2434            if c == Some('\\') {
2435                let c2 = self.hgetc();
2436                if c2 != Some('\n') {
2437                    if let Some(c2) = c2 {
2438                        self.hungetc(c2);
2439                    }
2440                    self.hungetc('\\');
2441                    self.lexstop = false;
2442                    return if self.skip_command_sub().is_err() {
2443                        CmdOrMath::Err
2444                    } else {
2445                        CmdOrMath::Cmd
2446                    };
2447                }
2448                // Line continuation, try again (loop instead of recursion)
2449                continue;
2450            }
2451
2452            // Not a line continuation, process normally
2453            if c == Some('(') {
2454                // Might be $((...))
2455                let lexpos = self.lexbuf.len();
2456                self.add(char_tokens::INPAR);
2457                self.add('(');
2458
2459                if self.dquote_parse(')', false).is_ok() {
2460                    let c2 = self.hgetc();
2461                    if c2 == Some(')') {
2462                        self.add(')');
2463                        return CmdOrMath::Math;
2464                    }
2465                    if let Some(c2) = c2 {
2466                        self.hungetc(c2);
2467                    }
2468                }
2469
2470                // Not math, restore and parse as command
2471                while self.lexbuf.len() > lexpos {
2472                    if let Some(ch) = self.lexbuf.pop() {
2473                        self.hungetc(ch);
2474                    }
2475                }
2476                self.hungetc('(');
2477                self.lexstop = false;
2478            } else {
2479                if let Some(c) = c {
2480                    self.hungetc(c);
2481                }
2482                self.lexstop = false;
2483            }
2484
2485            return if self.skip_command_sub().is_err() {
2486                CmdOrMath::Err
2487            } else {
2488                CmdOrMath::Cmd
2489            };
2490        }
2491    }
2492
2493    /// Skip over `(...)` for command-style substitutions: `$(...)`,
2494    /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2495    /// `skipcomm`. Per the C source comment: "we'll parse the input
2496    /// until we find an unmatched closing parenthesis. However, we'll
2497    /// throw away the result of the parsing and just keep the string
2498    /// we've built up on the way."
2499    ///
2500    /// zshrs port note: the C source uses zcontext_save/restore +
2501    /// strinbeg/inpush to set up an isolated lex context for the
2502    /// throw-away parse. zshrs's standalone walker tracks paren
2503    /// depth directly without re-entering the parser. Same
2504    /// invariant: stops at the matching `)`.
2505    fn skip_command_sub(&mut self) -> Result<(), ()> {
2506        let mut pct = 1;
2507        let mut start = true;
2508        const MAX_ITERATIONS: usize = 100_000;
2509        let mut iterations = 0;
2510
2511        self.add(char_tokens::INPAR);
2512
2513        loop {
2514            iterations += 1;
2515            if iterations > MAX_ITERATIONS {
2516                self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2517                return Err(());
2518            }
2519
2520            let c = self.hgetc();
2521            let c = match c {
2522                Some(c) => c,
2523                None => {
2524                    self.lexstop = true;
2525                    return Err(());
2526                }
2527            };
2528
2529            let iswhite = Self::is_inblank(c);
2530
2531            match c {
2532                '(' => {
2533                    pct += 1;
2534                    self.add(c);
2535                }
2536                ')' => {
2537                    pct -= 1;
2538                    if pct == 0 {
2539                        return Ok(());
2540                    }
2541                    self.add(c);
2542                }
2543                '\\' => {
2544                    self.add(c);
2545                    if let Some(c) = self.hgetc() {
2546                        self.add(c);
2547                    }
2548                }
2549                '\'' => {
2550                    self.add(c);
2551                    loop {
2552                        let ch = self.hgetc();
2553                        match ch {
2554                            Some('\'') => {
2555                                self.add('\'');
2556                                break;
2557                            }
2558                            Some(ch) => self.add(ch),
2559                            None => {
2560                                self.lexstop = true;
2561                                return Err(());
2562                            }
2563                        }
2564                    }
2565                }
2566                '"' => {
2567                    self.add(c);
2568                    loop {
2569                        let ch = self.hgetc();
2570                        match ch {
2571                            Some('"') => {
2572                                self.add('"');
2573                                break;
2574                            }
2575                            Some('\\') => {
2576                                self.add('\\');
2577                                if let Some(ch) = self.hgetc() {
2578                                    self.add(ch);
2579                                }
2580                            }
2581                            Some(ch) => self.add(ch),
2582                            None => {
2583                                self.lexstop = true;
2584                                return Err(());
2585                            }
2586                        }
2587                    }
2588                }
2589                '`' => {
2590                    self.add(c);
2591                    loop {
2592                        let ch = self.hgetc();
2593                        match ch {
2594                            Some('`') => {
2595                                self.add('`');
2596                                break;
2597                            }
2598                            Some('\\') => {
2599                                self.add('\\');
2600                                if let Some(ch) = self.hgetc() {
2601                                    self.add(ch);
2602                                }
2603                            }
2604                            Some(ch) => self.add(ch),
2605                            None => {
2606                                self.lexstop = true;
2607                                return Err(());
2608                            }
2609                        }
2610                    }
2611                }
2612                '#' if start => {
2613                    self.add(c);
2614                    // Skip comment to end of line
2615                    loop {
2616                        let ch = self.hgetc();
2617                        match ch {
2618                            Some('\n') => {
2619                                self.add('\n');
2620                                break;
2621                            }
2622                            Some(ch) => self.add(ch),
2623                            None => break,
2624                        }
2625                    }
2626                }
2627                _ => {
2628                    self.add(c);
2629                }
2630            }
2631
2632            start = iswhite;
2633        }
2634    }
2635
2636    /// Lex next token AND update per-context flags. Direct port of
2637    /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2638    /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2639    /// list separators / pipes / control keywords reset to cmd-pos;
2640    /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2641    /// stash prior incmdpos and force the redir target to non-cmd-pos.
2642    pub fn ctxtlex(&mut self) {
2643        // lex.c:319 — static `oldpos` cache for redir-target restore
2644        // is captured per-call here as `oldpos` below (zshrs's parser
2645        // re-enters ctxtlex per token, no need for static persistence).
2646
2647        // lex.c:321 — `zshlex();` to advance to the next token.
2648        self.zshlex();
2649
2650        // lex.c:322-358 — post-token incmdpos switch.
2651        match self.tok {
2652            // lex.c:323-343 — separators / openers / conjunctions /
2653            // control keywords — back into cmd-pos so the next token
2654            // can be a fresh command.
2655            LexTok::Seper
2656            | LexTok::Newlin
2657            | LexTok::Semi
2658            | LexTok::Dsemi
2659            | LexTok::Semiamp
2660            | LexTok::Semibar
2661            | LexTok::Amper
2662            | LexTok::Amperbang
2663            | LexTok::Inpar
2664            | LexTok::Inbrace
2665            | LexTok::Dbar
2666            | LexTok::Damper
2667            | LexTok::Bar
2668            | LexTok::Baramp
2669            | LexTok::Inoutpar
2670            | LexTok::Doloop
2671            | LexTok::Then
2672            | LexTok::Elif
2673            | LexTok::Else
2674            | LexTok::Doutbrack => {
2675                self.incmdpos = true;
2676            }
2677            // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2678            // so subsequent tokens are arguments, not a fresh command.
2679            LexTok::String
2680            | LexTok::Typeset
2681            | LexTok::Envarray
2682            | LexTok::Outpar
2683            | LexTok::Case
2684            | LexTok::Dinbrack => {
2685                self.incmdpos = false;
2686            }
2687            _ => {}
2688        }
2689
2690        // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2691        // DINPAR can detect c-style for. After any non-DINPAR, decay
2692        // to 0 (or back to 2 if we just saw FOR again).
2693        if self.tok != LexTok::Dinpar {
2694            self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2695        }
2696
2697        // lex.c:361-368 — redir-target context dance. After consuming
2698        // a redir operator, the following token (the file path) sees
2699        // incmdpos=0 even when its inherent shape would put it back
2700        // in cmd-pos. After the redir target, restore `oldpos`.
2701        let oldpos = self.incmdpos;
2702        if self.tok.is_redirop()
2703            || self.tok == LexTok::For
2704            || self.tok == LexTok::Foreach
2705            || self.tok == LexTok::Select
2706        {
2707            self.inredir = true;
2708            self.incmdpos = false;
2709        } else if self.inredir {
2710            self.incmdpos = oldpos;
2711            self.inredir = false;
2712        }
2713    }
2714
    /// Mark the current word as the one ZLE was looking for. Port of
    /// zsh/Src/lex.c:1881-1897 `gotword`.
    ///
    /// In this port the only effect is to reset `lexflags` to
    /// `LexFlags::default()` (every flag `false`), so later tokens no
    /// longer run with ZLE word tracking enabled.
    ///
    /// zshrs port note: zsh's gotword also updates `wb`/`we` (word
    /// begin/end positions) from `zlemetacs`, `zlemetall`, `inbufct`
    /// and `addedx`. Those are input.c globals that are not wired
    /// through this lexer, so only the `lexflags = 0` side effect at
    /// lex.c:1895 is reproduced.
    pub fn gotword(&mut self) {
        // lex.c:1895 — `lexflags = 0;`
        self.lexflags = LexFlags::default();
    }
2731
2732    /// Register a heredoc to be processed at next newline
2733    pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2734        self.heredocs.push(HereDoc {
2735            terminator,
2736            strip_tabs,
2737            content: String::new(),
2738            quoted: false,
2739            processed: false,
2740        });
2741    }
2742
2743    /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2744    /// but reachable from the bare `zshlex` path (without an
2745    /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2746    ///   - incmdpos is set (or text is `}` ending a brace block)
2747    ///   - text is `]]` and we're inside `[[ ]]` (incond > 0)
2748    ///   - text is bare `!` and we're at the start of a cond (incond == 1)
2749    pub fn check_reserved_word(&mut self) -> bool {
2750        if let Some(ref tokstr) = self.tokstr {
2751            if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2752                if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2753                    self.tok = tok;
2754                    if tok == LexTok::Repeat {
2755                        self.inrepeat = 1;
2756                    }
2757                    if tok == LexTok::Dinbrack {
2758                        self.incond = 1;
2759                    }
2760                    return true;
2761                }
2762                if tokstr == "]]" && self.incond > 0 {
2763                    self.tok = LexTok::Doutbrack;
2764                    self.incond = 0;
2765                    return true;
2766                }
2767            }
2768            // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2769            // regardless of incmdpos.
2770            if self.incond > 0 && tokstr == "]]" {
2771                self.tok = LexTok::Doutbrack;
2772                self.incond = 0;
2773                return true;
2774            }
2775            if self.incond == 1 && tokstr == "!" {
2776                self.tok = LexTok::Bang;
2777                return true;
2778            }
2779        }
2780        false
2781    }
2782}
2783
/// Outcome of deciding whether a `((` / `$((` construct is arithmetic
/// or a command.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The text was consumed as a command / command substitution.
    Cmd,
    /// The text was a complete arithmetic expression.
    Math,
    /// Lexing failed.
    Err,
}
2790
2791// ============================================================================
2792// Additional parsing functions ported from lex.c
2793// ============================================================================
2794
/// Check whether the text at `pos` is valid numeric-glob syntax:
/// `<N-M>`, `<N->`, `<-M>` or `<->`. `pos` is the byte offset just
/// after the opening `<`.
///
/// The accepted shape is: optional digits, exactly one `-`, optional
/// digits, then `>`. Scanning stops at the first character that does
/// not fit.
///
/// Returns `false` (instead of panicking) when `pos` is past the end
/// of `input` or does not fall on a character boundary.
///
/// Port of zsh/Src/lex.c:580-610 `isnumglob`. The C source reads from
/// the input stream and pushes the characters back; this version
/// inspects a slice and consumes nothing.
pub fn isnumglob(input: &str, pos: usize) -> bool {
    let rest = match input.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };

    // Every accepted character is ASCII, so a byte scan is enough;
    // any non-ASCII byte simply ends the match.
    let mut seen_dash = false;
    for byte in rest.bytes() {
        match byte {
            b'0'..=b'9' => {}
            b'-' if !seen_dash => seen_dash = true,
            // `>` only closes the glob once the `-` has been seen.
            b'>' => return seen_dash,
            _ => return false,
        }
    }
    false
}
2825
/// Tokenize a string as if in double quotes (error-tolerant variant).
///
/// Counterpart of zsh/Src/lex.c:1713-1733 `parsestrnoerr`. The C
/// source: zcontext_save → untokenize → inpush → strinbeg →
/// `lexbuf.ptr = tokstr = *s; lexbuf.siz = l + 1` →
/// `err = dquote_parse('\0', 1)` → strinend → inpop → zcontext_restore,
/// returning 0 on success or the offending character on error.
///
/// zshrs port: this simply forwards to `parsestr_inner`, a standalone
/// walker that emits BNULL/QSTRING/QTICK (plus INBRACE/INPAR after
/// `$`) markers without re-entering the lexer. Documented divergence:
/// nested cmd-sub `$(...)` and arith `$((...))` aren't lexed
/// recursively; the runtime handles them at expansion time.
///
/// Returns the tokenized text. The `Err` side of the result carries a
/// message string rather than C's character code.
pub fn parsestrnoerr(s: &str) -> Result<String, String> {
    parsestr_inner(s)
}
2845
/// Tokenize a string as if in double quotes (error-reporting variant).
///
/// Counterpart of zsh/Src/lex.c:1693-1709 `parsestr`. C source:
/// `if ((err = parsestrnoerr(s))) { untokenize(*s); ... zerr("parse
/// error near `%c'", err); tok = LEXERR; }`.
///
/// In C the only difference from `parsestrnoerr` is that errors
/// trigger `zerr`. Here both functions forward to the same
/// `parsestr_inner` walker and return its result unchanged; no
/// diagnostic is emitted, so the caller decides whether to surface
/// an `Err(msg)`.
pub fn parsestr(s: &str) -> Result<String, String> {
    parsestr_inner(s)
}
2860
2861/// Shared body for parsestr / parsestrnoerr.
2862fn parsestr_inner(s: &str) -> Result<String, String> {
2863    let mut result = String::with_capacity(s.len());
2864    let chars: Vec<char> = s.chars().collect();
2865    let mut i = 0;
2866
2867    while i < chars.len() {
2868        let c = chars[i];
2869        match c {
2870            '\\' => {
2871                i += 1;
2872                if i < chars.len() {
2873                    let next = chars[i];
2874                    match next {
2875                        '$' | '\\' | '`' | '"' | '\n' => {
2876                            result.push(char_tokens::BNULL);
2877                            result.push(next);
2878                        }
2879                        _ => {
2880                            result.push('\\');
2881                            result.push(next);
2882                        }
2883                    }
2884                } else {
2885                    result.push('\\');
2886                }
2887            }
2888            '$' => {
2889                result.push(char_tokens::QSTRING);
2890                if i + 1 < chars.len() {
2891                    let next = chars[i + 1];
2892                    if next == '{' {
2893                        result.push(char_tokens::INBRACE);
2894                        i += 1;
2895                    } else if next == '(' {
2896                        result.push(char_tokens::INPAR);
2897                        i += 1;
2898                    }
2899                }
2900            }
2901            '`' => {
2902                result.push(char_tokens::QTICK);
2903            }
2904            _ => {
2905                result.push(c);
2906            }
2907        }
2908        i += 1;
2909    }
2910
2911    Ok(result)
2912}
2913
/// Find the end of a subscript in `s`.
///
/// Returns the index of the first `endchar` that is outside quotes,
/// not backslash-escaped, and not nested inside `[...]` / `(...)`.
/// Returns `None` when `s` is empty, starts with `endchar` (empty
/// subscript), or contains no such terminator.
///
/// The index counts characters, which equals the byte offset for
/// ASCII input.
/// NOTE(review): callers that slice `s` by bytes should confirm this
/// is what they expect for non-ASCII subscripts.
///
/// Nesting rules:
/// - `[` and `(` open a nested group, `]` and `)` close one; a closer
///   that ends a nested group is never the terminator, even when it
///   equals `endchar`;
/// - `'...'` is skipped verbatim; `"..."` is skipped with `\x` pairs
///   honoured;
/// - outside quotes, `\` hides the following character.
///
/// Counterpart of zsh/Src/lex.c:1742-1788 `parse_subscript`. The C
/// source lexes the subscript through `dquote_parse`; this is a
/// focused bracket-balancing walker. zsh's `sub` flag (whether `$`
/// and quotes are tokenized) is not exposed here.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let mut depth = 0usize;
    let mut in_squote = false;
    let mut in_dquote = false;
    let mut skip_next = false;

    for (idx, c) in s.chars().enumerate() {
        if skip_next {
            // Character hidden by a preceding backslash.
            skip_next = false;
            continue;
        }
        if in_squote {
            if c == '\'' {
                in_squote = false;
            }
            continue;
        }
        if in_dquote {
            match c {
                '"' => in_dquote = false,
                '\\' => skip_next = true,
                _ => {}
            }
            continue;
        }

        // Decide termination before touching `depth`: a closer that
        // takes a nested group back to depth 0 must not be mistaken
        // for the subscript's own terminator.
        if c == endchar && depth == 0 {
            return Some(idx);
        }

        match c {
            '\\' => skip_next = true,
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            '[' | '(' => depth += 1,
            // A stray closer at depth 0 is ignored.
            ']' | ')' => depth = depth.saturating_sub(1),
            _ => {}
        }
    }

    None
}
2992
2993/// Tokenize a string as if it were a normal command-line argument
2994/// but it may contain separators. Used for ${...%...} substitutions.
2995///
2996/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
2997/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
2998/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
2999/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3000/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3001/// without re-entering the lexer.
3002///
3003/// zshrs port note: the C source returns int (0=ok, char value =
3004/// where it stopped on error); zshrs returns Result<String,String>
3005/// returning the tokenized text directly. Lossy for callers that
3006/// need to know the exact stop position, but nothing in zshrs's
3007/// expansion layer uses that yet.
3008pub fn parse_subst_string(s: &str) -> Result<String, String> {
3009    if s.is_empty() {
3010        return Ok(String::new());
3011    }
3012
3013    let mut result = String::with_capacity(s.len());
3014    let chars: Vec<char> = s.chars().collect();
3015    let mut i = 0;
3016
3017    while i < chars.len() {
3018        let c = chars[i];
3019        match c {
3020            '\\' => {
3021                result.push(char_tokens::BNULL);
3022                i += 1;
3023                if i < chars.len() {
3024                    result.push(chars[i]);
3025                }
3026            }
3027            '\'' => {
3028                result.push(char_tokens::SNULL);
3029                i += 1;
3030                while i < chars.len() && chars[i] != '\'' {
3031                    result.push(chars[i]);
3032                    i += 1;
3033                }
3034                result.push(char_tokens::SNULL);
3035            }
3036            '"' => {
3037                result.push(char_tokens::DNULL);
3038                i += 1;
3039                while i < chars.len() && chars[i] != '"' {
3040                    if chars[i] == '\\' && i + 1 < chars.len() {
3041                        result.push(char_tokens::BNULL);
3042                        i += 1;
3043                        result.push(chars[i]);
3044                    } else if chars[i] == '$' {
3045                        result.push(char_tokens::QSTRING);
3046                    } else {
3047                        result.push(chars[i]);
3048                    }
3049                    i += 1;
3050                }
3051                result.push(char_tokens::DNULL);
3052            }
3053            '$' => {
3054                result.push(char_tokens::STRING);
3055                if i + 1 < chars.len() {
3056                    match chars[i + 1] {
3057                        '{' => {
3058                            result.push(char_tokens::INBRACE);
3059                            i += 1;
3060                        }
3061                        '(' => {
3062                            result.push(char_tokens::INPAR);
3063                            i += 1;
3064                        }
3065                        _ => {}
3066                    }
3067                }
3068            }
3069            '*' => result.push(char_tokens::STAR),
3070            '?' => result.push(char_tokens::QUEST),
3071            '[' => result.push(char_tokens::INBRACK),
3072            ']' => result.push(char_tokens::OUTBRACK),
3073            '{' => result.push(char_tokens::INBRACE),
3074            '}' => result.push(char_tokens::OUTBRACE),
3075            '~' => result.push(char_tokens::TILDE),
3076            '#' => result.push(char_tokens::POUND),
3077            '^' => result.push(char_tokens::HAT),
3078            _ => result.push(c),
3079        }
3080        i += 1;
3081    }
3082
3083    Ok(result)
3084}
3085
3086/// Untokenize a string - convert tokenized chars back to original
3087///
3088/// Port of untokenize() from exec.c (but used by lexer too)
3089/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3090/// stripping them. Used by callers that need the source form including
3091/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3092pub fn untokenize_preserve_quotes(s: &str) -> String {
3093    let mut result = String::with_capacity(s.len() + 4);
3094    for c in s.chars() {
3095        let cu = c as u32;
3096        if (0x83..=0x9f).contains(&cu) {
3097            match c {
3098                c if c == char_tokens::POUND => result.push('#'),
3099                c if c == char_tokens::STRING => result.push('$'),
3100                c if c == char_tokens::HAT => result.push('^'),
3101                c if c == char_tokens::STAR => result.push('*'),
3102                c if c == char_tokens::INPAR => result.push('('),
3103                c if c == char_tokens::OUTPAR => result.push(')'),
3104                c if c == char_tokens::INPARMATH => result.push('('),
3105                c if c == char_tokens::OUTPARMATH => result.push(')'),
3106                c if c == char_tokens::QSTRING => result.push('$'),
3107                c if c == char_tokens::EQUALS => result.push('='),
3108                c if c == char_tokens::BAR => result.push('|'),
3109                c if c == char_tokens::INBRACE => result.push('{'),
3110                c if c == char_tokens::OUTBRACE => result.push('}'),
3111                c if c == char_tokens::INBRACK => result.push('['),
3112                c if c == char_tokens::OUTBRACK => result.push(']'),
3113                c if c == char_tokens::TICK => result.push('`'),
3114                c if c == char_tokens::INANG => result.push('<'),
3115                c if c == char_tokens::OUTANG => result.push('>'),
3116                c if c == char_tokens::OUTANGPROC => result.push('>'),
3117                c if c == char_tokens::QUEST => result.push('?'),
3118                c if c == char_tokens::TILDE => result.push('~'),
3119                c if c == char_tokens::QTICK => result.push('`'),
3120                c if c == char_tokens::COMMA => result.push(','),
3121                c if c == char_tokens::DASH => result.push('-'),
3122                c if c == char_tokens::BANG => result.push('!'),
3123                c if c == char_tokens::SNULL => result.push('\''),
3124                c if c == char_tokens::DNULL => result.push('"'),
3125                c if c == char_tokens::BNULL => result.push('\\'),
3126                _ => {
3127                    let idx = c as usize;
3128                    if idx < char_tokens::ZTOKENS.len() {
3129                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3130                    } else {
3131                        result.push(c);
3132                    }
3133                }
3134            }
3135        } else {
3136            result.push(c);
3137        }
3138    }
3139    result
3140}
3141
3142pub fn untokenize(s: &str) -> String {
3143    let mut result = String::with_capacity(s.len());
3144    let chars: Vec<char> = s.chars().collect();
3145    let mut i = 0;
3146
3147    while i < chars.len() {
3148        let c = chars[i];
3149        // Token chars live in zsh's META range (0x83 = META through 0x9f =
3150        // BNULL). Anything in that range needs un-mapping before display
3151        // or downstream consumption. The original `< 32` test was wrong —
3152        // none of zsh's tokens land in that range.
3153        let cu = c as u32;
3154        if (0x83..=0x9f).contains(&cu) {
3155            // Convert token back to original character
3156            match c {
3157                c if c == char_tokens::POUND => result.push('#'),
3158                c if c == char_tokens::STRING => result.push('$'),
3159                c if c == char_tokens::HAT => result.push('^'),
3160                c if c == char_tokens::STAR => result.push('*'),
3161                c if c == char_tokens::INPAR => result.push('('),
3162                c if c == char_tokens::OUTPAR => result.push(')'),
3163                c if c == char_tokens::INPARMATH => result.push('('),
3164                c if c == char_tokens::OUTPARMATH => result.push(')'),
3165                c if c == char_tokens::QSTRING => result.push('$'),
3166                c if c == char_tokens::EQUALS => result.push('='),
3167                c if c == char_tokens::BAR => result.push('|'),
3168                c if c == char_tokens::INBRACE => result.push('{'),
3169                c if c == char_tokens::OUTBRACE => result.push('}'),
3170                c if c == char_tokens::INBRACK => result.push('['),
3171                c if c == char_tokens::OUTBRACK => result.push(']'),
3172                c if c == char_tokens::TICK => result.push('`'),
3173                c if c == char_tokens::INANG => result.push('<'),
3174                c if c == char_tokens::OUTANG => result.push('>'),
3175                c if c == char_tokens::OUTANGPROC => result.push('>'),
3176                c if c == char_tokens::QUEST => result.push('?'),
3177                c if c == char_tokens::TILDE => result.push('~'),
3178                c if c == char_tokens::QTICK => result.push('`'),
3179                c if c == char_tokens::COMMA => result.push(','),
3180                c if c == char_tokens::DASH => result.push('-'),
3181                c if c == char_tokens::BANG => result.push('!'),
3182                c if c == char_tokens::SNULL
3183                    || c == char_tokens::DNULL
3184                    || c == char_tokens::BNULL =>
3185                {
3186                    // Null markers - skip
3187                }
3188                _ => {
3189                    // Unknown token, try ztokens lookup
3190                    let idx = c as usize;
3191                    if idx < char_tokens::ZTOKENS.len() {
3192                        result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3193                    } else {
3194                        result.push(c);
3195                    }
3196                }
3197            }
3198        } else {
3199            result.push(c);
3200        }
3201        i += 1;
3202    }
3203
3204    result
3205}
3206
/// Check whether `s` contains at least one lexer token character.
///
/// A token character is any character whose code point lies in
/// `0x83..=0x9f`, the same band `untokenize` translates. Control characters
/// below 32 (tab, newline, ...) are ordinary input and are *not* tokens; the
/// previous `< 32` test misclassified them and never matched a real token.
///
/// Returns `false` for the empty string.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| (0x83..=0x9f).contains(&(c as u32)))
}
3211
3212/// Convert token characters to their printable form for display
3213pub fn tokens_to_printable(s: &str) -> String {
3214    untokenize(s)
3215}
3216
#[cfg(test)]
mod tests {
    use super::*;

    /// Two bare words lex as two `String` tokens carrying their text, then
    /// the lexer reports end of input.
    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");

        for word in ["echo", "hello"] {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
            assert_eq!(lexer.tokstr, Some(word.to_string()));
        }

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Endinput);
    }

    /// `|` between commands is reported as a `Bar` token.
    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");

        for expected in [
            LexTok::String,
            LexTok::Bar,
            LexTok::String,
            LexTok::String,
        ] {
            lexer.zshlex();
            assert_eq!(lexer.tok, expected);
        }
    }

    /// `>` is reported as an `Outang` token between two words.
    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");

        for expected in [LexTok::String, LexTok::Outang, LexTok::String] {
            lexer.zshlex();
            assert_eq!(lexer.tok, expected);
        }
    }

    /// `<<` is reported as a `Dinang` token, followed by the delimiter word.
    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");

        for expected in [LexTok::String, LexTok::Dinang, LexTok::String] {
            lexer.zshlex();
            assert_eq!(lexer.tok, expected);
        }
    }

    /// A single-quoted word is one `String` token with token text attached.
    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }

        // Only presence of the token text is checked here, not its contents.
        assert!(lexer.tokstr.is_some());
    }

    /// `function foo { }` lexes as Func, String("foo"), Inbrace, Outbrace.
    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        // `function` keyword.
        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Func, "expected Func, got {:?}", lexer.tok);

        // Function name.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr, Some("foo".to_string()));

        // Opening brace of the body.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        // Closing brace of the body.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    /// A double-quoted word with a parameter reference is one `String` token
    /// with token text attached.
    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }

        // Only presence of the token text is checked here, not its contents.
        assert!(lexer.tokstr.is_some());
    }

    /// `$(...)` stays inside a single `String` token.
    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// `NAME=value` in command position is an `Envstring`, followed by the
    /// command word.
    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::String);
    }

    /// `name=(` in command position is an `Envarray`.
    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Envarray);
    }

    /// The command word, `<(ls)` and `>(cat)` each come back as one `String`
    /// token.
    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");

        for _ in 0..3 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// `$((...))` stays inside a single `String` token.
    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// The three case terminators `;;`, `;&` and `;|` are each reported, in
    /// source order, as `Dsemi`, `Semiamp` and `Semibar`.
    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        for target in [LexTok::Dsemi, LexTok::Semiamp, LexTok::Semibar] {
            // Advance until the wanted terminator shows up; stop at end of
            // input so a missing terminator fails the assertion below
            // instead of looping forever.
            lexer.zshlex();
            while lexer.tok != target && lexer.tok != LexTok::Endinput {
                lexer.zshlex();
            }
            assert_eq!(lexer.tok, target);
        }
    }
}
zshrs_parse/lexer.rs

zshrs_parse/
lexer.rs