zshrs_parse/lexer.rs
1//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
2//!
3//! This lexer tokenizes zsh shell input into a stream of tokens.
4//! It handles all zsh-specific syntax including:
5//! - Single/double/dollar quotes
6//! - Command substitution $(...) and `...`
7//! - Arithmetic $((...))
8//! - Parameter expansion ${...}
9//! - Process substitution <(...) >(...)
10//! - Here documents
11//! - All redirection operators
12//! - Comments
13//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Lexer flags controlling behavior.
///
/// All flags are `false` in the derived `Default`, which is what
/// `ZshLexer::new` installs.
#[derive(Debug, Clone, Copy, Default)]
pub struct LexFlags {
    /// Parsing for ZLE (line editor) completion. When set, `exalias`
    /// calls `gotword` before attempting alias expansion.
    pub zle: bool,
    /// Return newlines as tokens. When set, `zshlex` leaves NEWLIN
    /// tokens alone instead of folding them into SEPER.
    pub newline: bool,
    /// Preserve comments in output
    pub comments_keep: bool,
    /// Strip comments from output
    pub comments_strip: bool,
    /// Active lexing (from bufferwords)
    pub active: bool,
}
32
/// Buffer state for building tokens.
///
/// `data` holds the accumulated text. `siz` is a growth threshold kept
/// for parity with the C lexer's explicit buffer size: whenever the
/// byte length of `data` reaches it, the threshold is doubled and the
/// backing `String` is asked to reserve the difference.
#[derive(Debug, Clone)]
struct LexBuf {
    data: String,
    siz: usize,
}

impl LexBuf {
    /// Create an empty buffer with the initial 256-byte threshold.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Drop the buffered text. The growth threshold is left as-is.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Append one character, growing the threshold if needed.
    fn add(&mut self, c: char) {
        self.data.push(c);
        self.grow_to_fit();
    }

    /// Append a whole string, growing the threshold if needed.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
        self.grow_to_fit();
    }

    /// Byte length of the buffered text.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrow the buffered text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consume the buffer and return the buffered text.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last buffered character, if any.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        // `Chars` is double-ended, so this decodes only the final
        // character instead of walking the whole string.
        self.data.chars().next_back()
    }

    /// Remove and return the last buffered character, if any.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }

    /// Keep `siz` strictly above the current byte length.
    ///
    /// A single doubling is not always enough: a bulk append, or a
    /// threshold that was externally set to a small value (even 0),
    /// can leave the length far past `2 * siz`. Doubling in a loop
    /// guarantees `siz > len`, so `siz - len` can never underflow.
    fn grow_to_fit(&mut self) {
        let len = self.data.len();
        if len < self.siz {
            return;
        }
        // `max(1)` so a zero threshold can still double its way up.
        let mut siz = self.siz.max(1);
        while siz <= len {
            siz = siz.saturating_mul(2);
        }
        self.siz = siz;
        self.data.reserve(siz - len);
    }
}
87
/// Here-document state.
///
/// One entry is pushed by `zshlex` when the word following a
/// here-document redirection has been lexed; the body is filled in
/// later.
#[derive(Debug, Clone)]
pub struct HereDoc {
    /// Terminator word with quote characters and the lexer's internal
    /// quote markers stripped.
    pub terminator: String,
    /// True for the `<<-` form.
    pub strip_tabs: bool,
    /// Body text; empty until the body has been read.
    pub content: String,
    /// True if the terminator was originally quoted (`<<'EOF'`,
    /// `<<"EOF"`, or `<<\EOF`). Disables variable expansion / command
    /// substitution / arithmetic in the body.
    pub quoted: bool,
    /// True once `process_heredocs` has read the body. Distinct from
    /// "content is empty" because an empty heredoc legitimately has
    /// empty content.
    pub processed: bool,
}
103
/// The Zsh Lexer.
///
/// Holds the input, the pushback buffer and all of the context flags
/// that `zshlex` updates after each token.
pub struct ZshLexer<'a> {
    /// Input source
    pub(crate) input: &'a str,
    /// Current position in input, as a byte offset (advanced by each
    /// character's UTF-8 length).
    pub(crate) pos: usize,
    /// Look-ahead buffer for ungotten characters. Also used to inject
    /// alias text; `hgetc` drains it from the front before touching
    /// `input`.
    unget_buf: VecDeque<char>,
    /// Current token string
    pub tokstr: Option<String>,
    /// Current token type
    pub tok: LexTok,
    /// File descriptor for redirections (e.g., 2> means fd=2)
    pub tokfd: i32,
    /// Line number at start of current token
    pub toklineno: u64,
    /// Current line number (1-based; bumped by `hgetc` on `\n`)
    pub lineno: u64,
    /// Lexer has stopped (EOF or error)
    pub lexstop: bool,
    /// In command position (can accept reserved words)
    pub incmdpos: bool,
    /// In condition [[ ... ]]
    pub incond: i32,
    /// In pattern context (RHS of == != =~ in [[ ]])
    pub incondpat: bool,
    /// In case pattern
    pub incasepat: i32,
    /// In redirection
    pub inredir: bool,
    /// Saved `incmdpos` from before a redirop / for / foreach / select
    /// — restored on the NEXT non-redir token. Mirrors `static int oldpos`
    /// in C zsh's `ctxtlex` (lex.c:319). Required for cases like
    /// `for x ( ... )` where `(` after the var name should tokenize as
    /// INPAR — that depends on incmdpos being restored to 1 from before
    /// FOR was lexed, which in turn depends on this saved value.
    pub oldpos: bool,
    /// After 'for' keyword (`zshlex` sets this to 2 on FOR and back to
    /// 0 on any other token except DINPAR)
    pub infor: i32,
    /// After 'repeat' keyword (counts tokens lexed since `repeat`)
    inrepeat: i32,
    /// Parsing typeset arguments
    pub intypeset: bool,
    /// Inside (( ... )) arithmetic
    dbparens: bool,
    /// Disable alias expansion
    pub noaliases: bool,
    /// Disable spelling correction
    pub nocorrect: i32,
    /// Disable comment recognition
    pub nocomments: bool,
    /// Lexer flags
    pub lexflags: LexFlags,
    /// Whether this is the first line
    pub isfirstln: bool,
    /// Whether this is the first char of command
    #[allow(dead_code)]
    isfirstch: bool,
    /// Pending here-documents
    pub heredocs: Vec<HereDoc>,
    /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
    heredoc_pending: u8,
    /// Token buffer
    lexbuf: LexBuf,
    /// After newline: 0 when the last token was not a newline, -1 for
    /// a newline with input remaining, 1 for a newline at end of input.
    pub isnewlin: i32,
    /// Error message if any
    pub error: Option<String>,
    /// Global iteration counter for infinite loop detection
    /// (incremented on every `hgetc`, never reset by `zshlex`)
    global_iterations: usize,
    /// Recursion depth counter
    recursion_depth: usize,
    /// Raw-input capture flag. While this is zero, `zshlex_raw_add`,
    /// `zshlex_raw_back`, `zshlex_raw_mark` and
    /// `zshlex_raw_back_to_mark` are all no-ops.
    /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
    /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
    /// command substitutions for re-execution / display.
    /// NOTE(review): in C every char read through `hgetc` is captured
    /// while this is set; confirm where the Rust port feeds the buffer.
    pub lex_add_raw: i32,
    /// Raw-input capture buffer. Direct mirror of lex.c:165
    /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
    /// `LexBuf` here since Rust's String tracks both the data and
    /// length internally.
    lexbuf_raw: LexBuf,
}
188
/// Upper bound on `ZshLexer::recursion_depth`. `check_recursion`
/// reports an error and stops the lexer once the depth is strictly
/// greater than this value.
const MAX_LEXER_RECURSION: usize = 200;
190
/// Per-alias info returned by `AliasResolver::lookup_alias` and
/// `lookup_suffix_alias`. Mirrors zsh's `struct alias` fields used
/// at lex.c:1914-1943: `text` (replacement body), `in_use` (the
/// recursion-guard flag), `global` (vs command-position-only).
#[derive(Debug, Clone)]
pub struct AliasInfo {
    /// Replacement text injected back into the input stream.
    pub text: String,
    /// Recursion guard: an alias that is already in use is not
    /// expanded again.
    pub in_use: bool,
    /// Global aliases expand anywhere; others only for a STRING token
    /// in command position.
    pub global: bool,
}
201
/// Trait the lexer uses to look up aliases and reserved words during
/// `exalias`. Implementors typically delegate to the executor's
/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
/// free of executor-specific types — same pattern zsh uses with the
/// hashtable.h opaque-handle approach against aliastab/reswdtab/
/// sufaliastab.
pub trait AliasResolver {
    /// Look up an alias by name. Returns `None` if not found, or the
    /// alias body + flags otherwise.
    fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
    /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
    /// The lexer passes the text after the last `.` of the word,
    /// without the dot.
    fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
    /// Resolve a reserved word. Returns the LexTok the word should
    /// promote to (e.g. "if" → IF), or None if not a reswd.
    fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
    /// Mark an alias as in-use (recursion guard). Called when an
    /// alias is about to be expanded; the matching unmark happens
    /// when the alias text has been fully consumed by the lexer.
    /// For suffix aliases the lexer passes the suffix as `name`.
    fn mark_in_use(&mut self, name: &str, in_use: bool);
}
222
/// Saved lexical state for nested-context handling. Direct port of
/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
/// (`lex_context_restore`). Used when entering command substitution,
/// here-docs, or eval where the outer lexer state must be pushed and
/// restored after the inner parse completes.
///
/// Each field is a snapshot of the `ZshLexer` field of the same name;
/// `lexbuf_data` / `lexbuf_siz` snapshot the token buffer's text and
/// growth threshold.
#[derive(Debug, Clone)]
pub struct LexStack {
    pub dbparens: bool,
    pub isfirstln: bool,
    pub isfirstch: bool,
    pub lexflags: LexFlags,
    pub tok: LexTok,
    pub tokstr: Option<String>,
    pub lexbuf_data: String,
    pub lexbuf_siz: usize,
    pub lexstop: bool,
    pub toklineno: u64,
}
242
243impl Default for LexStack {
244 fn default() -> Self {
245 // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
246 // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
247 // ENDINPUT (the C source doesn't explicitly reset tok here but
248 // the natural baseline is ENDINPUT — same as lexinit).
249 LexStack {
250 dbparens: false,
251 isfirstln: false,
252 isfirstch: false,
253 lexflags: LexFlags::default(),
254 tok: LexTok::Endinput,
255 tokstr: None,
256 lexbuf_data: String::new(),
257 lexbuf_siz: 256,
258 lexstop: false,
259 toklineno: 0,
260 }
261 }
262}
263
264impl<'a> ZshLexer<'a> {
265 /// Create a new lexer for the given input
266 pub fn new(input: &'a str) -> Self {
267 ZshLexer {
268 input,
269 pos: 0,
270 unget_buf: VecDeque::new(),
271 tokstr: None,
272 tok: LexTok::Endinput,
273 tokfd: -1,
274 toklineno: 1,
275 lineno: 1,
276 lexstop: false,
277 incmdpos: true,
278 incond: 0,
279 incondpat: false,
280 incasepat: 0,
281 inredir: false,
282 oldpos: true,
283 infor: 0,
284 inrepeat: 0,
285 intypeset: false,
286 dbparens: false,
287 noaliases: false,
288 nocorrect: 0,
289 nocomments: false,
290 lexflags: LexFlags::default(),
291 isfirstln: true,
292 isfirstch: true,
293 heredocs: Vec::new(),
294 heredoc_pending: 0,
295 lexbuf: LexBuf::new(),
296 isnewlin: 0,
297 error: None,
298 global_iterations: 0,
299 recursion_depth: 0,
300 lex_add_raw: 0,
301 lexbuf_raw: LexBuf::new(),
302 }
303 }
304
305 /// Append a char to the raw-input capture buffer. Direct port of
306 /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
307 /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
308 /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
309 pub fn zshlex_raw_add(&mut self, c: char) {
310 // lex.c:2027-2028 — guard on lex_add_raw flag.
311 if self.lex_add_raw == 0 {
312 return;
313 }
314 // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
315 // explicit ptr/len/siz with hrealloc; Rust's String handles
316 // resize automatically.
317 self.lexbuf_raw.add(c);
318 }
319
320 /// Run alias / reserved-word expansion on the just-lexed token.
321 /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
322 /// if an alias was injected (the caller's loop should re-run
323 /// gettok to consume the injected text).
324 ///
325 /// C source flow:
326 /// 1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
327 /// 2. If tokstr is None: set lextext from `tokstrings[tok]` and
328 /// checkalias against that (lex.c:1964-1969).
329 /// 3. Otherwise: untokenize tokstr into a working copy (lex.c:
330 /// 1971-1980).
331 /// 4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
332 /// (lex.c:1982-1991).
333 /// 5. STRING tokens: try checkalias, then reservation lookup
334 /// (lex.c:1993-2015).
335 /// 6. Clear inalmore (lex.c:2016).
336 ///
337 /// Takes an `AliasResolver` trait object so the lexer doesn't
338 /// hard-depend on the executor's alias-table types. zshrs callers
339 /// implement `AliasResolver` over their alias hash tables.
340 pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
341 // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
342 // history layer doesn't track per-word boundaries here; no-op.
343
344 // lex.c:1958-1962 — spell correction via spckword. zshrs
345 // doesn't implement spell correction yet; documented divergence.
346
347 // lex.c:1964-1969 — bare-token path (no tokstr).
348 if self.tokstr.is_none() {
349 // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
350 // like SEMI/AMPER/etc. the canonical text comes from a
351 // static table. zshrs's check_alias_for_text uses the
352 // resolver directly with the token's text representation.
353 if self.tok == LexTok::Newlin {
354 return false;
355 }
356 // Use punctuation-token text; unknown tokens skip alias.
357 let text = match self.tok {
358 LexTok::Semi => ";",
359 LexTok::Amper => "&",
360 LexTok::Bar => "|",
361 _ => return false,
362 };
363 return self.check_alias(resolver, text);
364 }
365
366 let tokstr = self.tokstr.clone().unwrap();
367 // lex.c:1973-1980 — untokenize: convert the lexer's internal
368 // tokenized form (Pound..ztokens shifts) into the literal
369 // shell text. Call the global helper.
370 let lextext = if has_token(&tokstr) {
371 untokenize(&tokstr)
372 } else {
373 tokstr.clone()
374 };
375
376 // lex.c:1982-1991 — ZLE word-tracking for completion.
377 if self.lexflags.zle {
378 let zp = self.lexflags;
379 self.gotword();
380 // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
381 // word has been reached; abort exalias so completion can
382 // capture the partial token unchanged.
383 if zp.zle && !self.lexflags.zle {
384 return false;
385 }
386 }
387
388 // lex.c:1993-2015 — STRING-token alias / reswd check.
389 if self.tok == LexTok::String {
390 // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
391 // here (zshrs doesn't have the option flag wired).
392 if self.check_alias(resolver, &lextext) {
393 return true;
394 }
395
396 // lex.c:2002-2009 — reserved-word lookup. Fires when in
397 // command position OR when the text is bare `}` and
398 // IGNOREBRACES is unset (so `}` ends a brace block).
399 if self.incmdpos || lextext == "}" {
400 if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
401 self.tok = rwtok;
402 if rwtok == LexTok::Repeat {
403 self.inrepeat = 1;
404 }
405 if rwtok == LexTok::Dinbrack {
406 self.incond = 1;
407 }
408 }
409 } else if self.incond > 0 && lextext == "]]" {
410 // lex.c:2010-2012 — `]]` closes the cond expression.
411 self.tok = LexTok::Doutbrack;
412 self.incond = 0;
413 } else if self.incond == 1 && lextext == "!" {
414 // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
415 // negation, not a literal.
416 self.tok = LexTok::Bang;
417 }
418 }
419
420 // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
421 // any non-alias token.
422 // (zshrs's lexer doesn't have inalmore yet — added here would
423 // require gettok to track when an alias-pushed token has more
424 // text after it. Documented divergence.)
425
426 false
427 }
428
429 /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
430 /// `checkalias`. Returns true if the lookup matched (regular or
431 /// suffix alias) AND the alias text was successfully injected
432 /// back into the input stream for re-lexing.
433 fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
434 // lex.c:1906-1907 — guard on null lextext.
435 if lextext.is_empty() {
436 return false;
437 }
438
439 // lex.c:1909-1911 — guard: alias expansion is disabled, or
440 // POSIX aliases require the token to be a STRING and not a
441 // reserved word.
442 if self.noaliases {
443 return false;
444 }
445
446 // lex.c:1914-1933 — regular alias lookup.
447 if let Some(alias) = resolver.lookup_alias(lextext) {
448 if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
449 // lex.c:1918-1927 — if the next char isn't blank,
450 // insert a space so the alias body can't accidentally
451 // join the following word.
452 if !self.lexstop {
453 if let Some(c) = self.peek() {
454 if !Self::is_blank(c) {
455 self.inject_alias_text(" ");
456 }
457 }
458 }
459 // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
460 self.inject_alias_text(&alias.text);
461 resolver.mark_in_use(lextext, true);
462 self.lexstop = false;
463 return true;
464 }
465 }
466
467 // lex.c:1934-1943 — suffix-alias lookup. The token must end
468 // with `.SUFFIX`, the suffix name must be a registered
469 // suffix-alias, AND the lexer must be in command position.
470 if self.incmdpos {
471 if let Some(dot_pos) = lextext.rfind('.') {
472 if dot_pos > 0 && dot_pos + 1 < lextext.len() {
473 let suffix = &lextext[dot_pos + 1..];
474 if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
475 if !alias.in_use {
476 // lex.c:1938-1940 — push three things in
477 // reverse: the alias text, a space, then
478 // the original word.
479 self.inject_alias_text(&alias.text);
480 self.inject_alias_text(" ");
481 self.inject_alias_text(lextext);
482 resolver.mark_in_use(suffix, true);
483 self.lexstop = false;
484 return true;
485 }
486 }
487 }
488 }
489 }
490
491 false
492 }
493
494 /// Push alias text back into the input stream so the lexer
495 /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
496 /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
497 /// (a VecDeque<char>) to inject chars in reverse order so the
498 /// next hgetc consumes them first.
499 fn inject_alias_text(&mut self, text: &str) {
500 // Insert at front in reverse so the first char of `text`
501 // comes out first.
502 for c in text.chars().rev() {
503 self.unget_buf.push_front(c);
504 }
505 }
506
507 /// Pop the last char from the raw-input capture buffer. Direct
508 /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
509 /// the lexer ungets a char that was just captured raw — the raw
510 /// buffer must mirror the live input so this undoes the last add.
511 pub fn zshlex_raw_back(&mut self) {
512 // lex.c:2045-2046 — guard.
513 if self.lex_add_raw == 0 {
514 return;
515 }
516 // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
517 self.lexbuf_raw.pop();
518 }
519
520 /// Mark the current raw-buffer offset (for restore later). Direct
521 /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
522 /// `len + offset` so callers can restore via `back_to_mark`.
523 pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
524 // lex.c:2055-2056 — guard.
525 if self.lex_add_raw == 0 {
526 return 0;
527 }
528 // lex.c:2057 — `return lexbuf_raw.len + offset;`
529 (self.lexbuf_raw.len() as i64) + offset
530 }
531
532 /// Restore raw-buffer offset to a previously-saved mark. Direct
533 /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
534 /// Truncates the raw buffer to `mark` bytes — undoes any captures
535 /// since the mark was taken (used when a speculative parse fails
536 /// and the lexer rolls back).
537 pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
538 // lex.c:2064-2065 — guard.
539 if self.lex_add_raw == 0 {
540 return;
541 }
542 // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
543 // lexbuf_raw.len = mark;` — Rust truncate handles both.
544 let m = mark.max(0) as usize;
545 self.lexbuf_raw.data.truncate(m);
546 }
547
548 /// Take the captured raw-input buffer, clearing it. Useful for
549 /// callers that need the literal command-sub body after lexing
550 /// (e.g. compile-time string capture for `$(...)`).
551 pub fn take_raw_buf(&mut self) -> String {
552 std::mem::take(&mut self.lexbuf_raw.data)
553 }
554
555 /// Save lexical context onto a `LexStack`. Direct port of
556 /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
557 /// is in a clean state suitable for parsing a nested input (command
558 /// substitution body, here-doc terminator, eval'd string).
559 pub fn lex_context_save(&mut self, ls: &mut LexStack) {
560 // lex.c:220-233 — copy live state into the stack.
561 ls.dbparens = self.dbparens;
562 ls.isfirstln = self.isfirstln;
563 ls.isfirstch = self.isfirstch;
564 ls.lexflags = self.lexflags;
565 ls.tok = self.tok;
566 ls.tokstr = self.tokstr.take();
567 ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
568 ls.lexbuf_siz = self.lexbuf.siz;
569 ls.lexstop = self.lexstop;
570 ls.toklineno = self.toklineno;
571
572 // lex.c:235-238 — reset live state to defaults so a nested
573 // parse starts from a clean slate. tokstr/lexbuf are zeroed,
574 // lexbuf.siz reset to 256 (the C-source initial alloc).
575 self.tokstr = None;
576 self.lexbuf.data.clear();
577 self.lexbuf.siz = 256;
578 }
579
580 /// Restore lexical context from a `LexStack`. Direct port of
581 /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
582 /// `lex_context_save`. Called after the nested parse completes.
583 pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
584 // lex.c:249-261 — copy stack state back into live fields.
585 self.dbparens = ls.dbparens;
586 self.isfirstln = ls.isfirstln;
587 self.isfirstch = ls.isfirstch;
588 self.lexflags = ls.lexflags;
589 self.tok = ls.tok;
590 self.tokstr = ls.tokstr.take();
591 self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
592 self.lexbuf.siz = ls.lexbuf_siz;
593 self.lexstop = ls.lexstop;
594 self.toklineno = ls.toklineno;
595 }
596
597 /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
598 /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
599 /// to ENDINPUT so the next gettok starts from a known baseline.
600 /// Note: the constructor `Self::new` already sets equivalent
601 /// defaults; this method exists for the rare case a caller wants
602 /// to recycle a `ZshLexer` across multiple input strings.
603 pub fn lexinit(&mut self) {
604 // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
605 self.nocorrect = 0;
606 self.dbparens = false;
607 self.lexstop = false;
608 // lex.c:444 — `tok = ENDINPUT;`
609 self.tok = LexTok::Endinput;
610 }
611
612 /// Check recursion depth; returns true if exceeded
613 #[inline]
614 fn check_recursion(&mut self) -> bool {
615 if self.recursion_depth > MAX_LEXER_RECURSION {
616 self.error = Some("lexer exceeded max recursion depth".to_string());
617 self.lexstop = true;
618 true
619 } else {
620 false
621 }
622 }
623
624 /// Check and increment global iteration counter; returns true if limit exceeded
625 #[inline]
626 fn check_iterations(&mut self) -> bool {
627 self.global_iterations += 1;
628 if self.global_iterations > 50_000 {
629 self.error = Some("lexer exceeded 50K iterations".to_string());
630 self.lexstop = true;
631 self.tok = LexTok::Lexerr;
632 true
633 } else {
634 false
635 }
636 }
637
638 /// Get next character from input
639 fn hgetc(&mut self) -> Option<char> {
640 if self.check_iterations() {
641 return None;
642 }
643
644 // Re-read from unget_buf: increment lineno on `\n` HERE
645 // too. hungetc() decremented lineno when the char was put
646 // back; without a matching increment on the way out, every
647 // `\n` that's ungetted-then-reread leaves lineno
648 // permanently one short. Symptom: $LINENO stuck at 1 in
649 // every script statement because the parser ungets the
650 // separating newline once between statements.
651 if let Some(c) = self.unget_buf.pop_front() {
652 if c == '\n' {
653 self.lineno += 1;
654 }
655 return Some(c);
656 }
657
658 let c = self.input[self.pos..].chars().next()?;
659 self.pos += c.len_utf8();
660
661 if c == '\n' {
662 self.lineno += 1;
663 }
664
665 Some(c)
666 }
667
668 /// Put character back into input
669 fn hungetc(&mut self, c: char) {
670 self.unget_buf.push_front(c);
671 if c == '\n' && self.lineno > 1 {
672 self.lineno -= 1;
673 }
674 self.lexstop = false;
675 }
676
677 /// Peek at next character without consuming
678 #[allow(dead_code)]
679 fn peek(&mut self) -> Option<char> {
680 if let Some(&c) = self.unget_buf.front() {
681 return Some(c);
682 }
683 self.input[self.pos..].chars().next()
684 }
685
686 /// Add character to token buffer
687 fn add(&mut self, c: char) {
688 self.lexbuf.add(c);
689 }
690
691 /// Check if character is blank (space or tab)
692 fn is_blank(c: char) -> bool {
693 c == ' ' || c == '\t'
694 }
695
696 /// Peek for a zsh numeric range glob shape after a `<`: returns the
697 /// captured `N*-M*>` (everything *after* the leading `<`) when the
698 /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
699 /// None and leaves the input untouched.
700 fn try_numeric_range_glob(&mut self) -> Option<String> {
701 let mut buf: Vec<char> = Vec::new();
702 // optional leading digits
703 loop {
704 match self.hgetc() {
705 Some(c) if c.is_ascii_digit() => buf.push(c),
706 Some(c) => {
707 buf.push(c);
708 break;
709 }
710 None => break,
711 }
712 }
713 // last char in buf must be '-' for the range form
714 if buf.last() != Some(&'-') {
715 for c in buf.iter().rev() {
716 self.hungetc(*c);
717 }
718 return None;
719 }
720 // optional trailing digits
721 loop {
722 match self.hgetc() {
723 Some(c) if c.is_ascii_digit() => buf.push(c),
724 Some(c) => {
725 buf.push(c);
726 break;
727 }
728 None => break,
729 }
730 }
731 if buf.last() != Some(&'>') {
732 for c in buf.iter().rev() {
733 self.hungetc(*c);
734 }
735 return None;
736 }
737 Some(buf.into_iter().collect())
738 }
739
740 /// Check if character is blank (including other whitespace except newline)
741 fn is_inblank(c: char) -> bool {
742 matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
743 }
744
745 /// Check if character is a digit
746 fn is_digit(c: char) -> bool {
747 c.is_ascii_digit()
748 }
749
750 /// Check if character is identifier start
751 #[allow(dead_code)]
752 fn is_ident_start(c: char) -> bool {
753 c.is_ascii_alphabetic() || c == '_'
754 }
755
756 /// Check if character is identifier continuation
757 fn is_ident(c: char) -> bool {
758 c.is_ascii_alphanumeric() || c == '_'
759 }
760
761 /// Main lexer entry point — fetch the next token. Direct port of
762 /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
763 /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
764 /// followed by here-doc draining (lex.c:278-306), newline tracking
765 /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
766 ///
767 /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
768 /// the loop. The C source iterates as long as exalias keeps
769 /// re-injecting alias text into the input buffer; zshrs's alias
770 /// expansion happens post-lex in exec.rs. The loop body therefore
771 /// runs once and breaks unconditionally — documented divergence.
772 pub fn zshlex(&mut self) {
773 // lex.c:268-269 — early-out on prior LEXERR.
774 if self.tok == LexTok::Lexerr {
775 return;
776 }
777
778 // Note: Do NOT reset global_iterations here - it must accumulate across all
779 // zshlex calls in a parse to prevent infinite loops in the parser
780
781 // lex.c:270-276 — gettok / exalias loop. Without exalias wired,
782 // the inner body runs once and we `break` unconditionally.
783 loop {
784 // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
785 // detection.
786 if self.inrepeat > 0 {
787 self.inrepeat += 1;
788 }
789 // lex.c:273-274 — at the third token after `repeat`,
790 // SHORTLOOPS / SHORTREPEAT options force back into cmd
791 // position so the loop body can start. zshrs unconditionally
792 // does this since the option-lookup lives in exec.rs.
793 if self.inrepeat == 3 {
794 self.incmdpos = true;
795 }
796
797 // lex.c:275 — `tok = gettok();`
798 self.tok = self.gettok();
799
800 // lex.c:276 — `while (tok != ENDINPUT && exalias())` —
801 // when exalias re-injects alias text it returns true and
802 // the loop iterates. Without exalias wired, we break.
803 break;
804 }
805
806 // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
807 // so the persistent low bit survives but the per-word bit is
808 // dropped.
809 self.nocorrect &= 1;
810
811 // lex.c:278-306 — drain pending here-documents at the start
812 // of a new line. zshrs's process_heredocs reads the full body
813 // and stitches it onto the matching redir token.
814 if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
815 self.process_heredocs();
816 }
817
818 // lex.c:307-310 — track whether we just saw a newline.
819 // C uses `inbufct` to distinguish "newline at EOF" (=1)
820 // from "newline mid-input" (=-1); zshrs reads `pos < len`.
821 if self.tok != LexTok::Newlin {
822 self.isnewlin = 0;
823 } else {
824 self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
825 }
826
827 // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
828 // LEXFLAGS_NEWLINE is set to preserve newlines (used by
829 // ZLE for completion of partial lines).
830 if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
831 self.tok = LexTok::Seper;
832 }
833
834 // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
835 // - `{` only promotes to INBRACE in command position
836 // - `}` promotes to OUTBRACE either in cmdpos OR via the
837 // special `closing-brace-special` rule (IGNOREBRACES unset
838 // — assumed since zshrs doesn't expose that option yet)
839 // - other reserved words: only when incmdpos (or `}` exception)
840 if self.tok == LexTok::String {
841 if let Some(ref s) = self.tokstr {
842 if s == "{" && self.incmdpos {
843 self.tok = LexTok::Inbrace;
844 } else if s == "}" {
845 self.tok = LexTok::Outbrace;
846 } else if self.incasepat == 0 {
847 // Skip reserved word checking in case pattern context —
848 // words like `time`, `end` should be patterns, not
849 // keywords.
850 self.check_reserved_word();
851 }
852 }
853 }
854
855 // If we were expecting a heredoc terminator, register it now
856 if self.heredoc_pending > 0 && self.tok == LexTok::String {
857 if let Some(ref terminator) = self.tokstr {
858 let strip_tabs = self.heredoc_pending == 2;
859 // Detect originally-quoted terminator (`<<'EOF'`,
860 // `<<"EOF"`). The lexer wraps single-quoted text in
861 // SNULL (`\u{9d}`) and double-quoted text in DNULL
862 // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
863 // heredocs disable variable / command-sub / arithmetic
864 // expansion in the body — see `compile_redir` for the
865 // expansion side.
866 // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
867 // disable expansion in the body. SNULL/DNULL mark
868 // single/double-quoted spans; BNULL (`\u{9f}`) marks
869 // any backslash-escaped char — its presence alone is
870 // enough to flag the terminator as quoted (zsh's
871 // `<<\EOF` shorthand for `<<'EOF'`).
872 let quoted = terminator.contains('\u{9d}')
873 || terminator.contains('\u{9e}')
874 || terminator.contains('\u{9f}')
875 || terminator.starts_with('\'')
876 || terminator.starts_with('"');
877 let term = terminator
878 .chars()
879 .filter(|c| {
880 *c != '\''
881 && *c != '"'
882 && *c != '\u{9d}'
883 && *c != '\u{9e}'
884 && *c != '\u{9f}'
885 })
886 .collect::<String>();
887 self.heredocs.push(HereDoc {
888 terminator: term,
889 strip_tabs,
890 content: String::new(),
891 quoted,
892 processed: false,
893 });
894 }
895 self.heredoc_pending = 0;
896 }
897
898 // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
899 if self.incond > 0 {
900 if let Some(ref s) = self.tokstr {
901 // Check if this token is a comparison operator
902 // Note: single = is also a comparison operator in [[ ]]
903 // The internal marker \u{8d} is used for =
904 if s == "="
905 || s == "=="
906 || s == "!="
907 || s == "=~"
908 || s == "\u{8d}"
909 || s == "\u{8d}\u{8d}"
910 || s == "!\u{8d}"
911 || s == "\u{8d}~"
912 || s == "\u{8d}\u{98}"
913 {
914 self.incondpat = true;
915 } else if self.incondpat {
916 // We were in pattern context, now we've consumed the pattern
917 // Reset after the pattern token is consumed
918 // But actually, pattern can span multiple tokens, so we should
919 // stay in pattern mode until ]] or && or ||
920 }
921 }
922 // Reset pattern context on ]] or logical operators (&&, ||)
923 // and grouping parens. zsh par_cond_3 (cond.c) treats
924 // these as cond-pattern terminators — the next operand is
925 // a fresh primary, NOT a continuation of the prior pattern.
926 // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
927 // after `[[ a == a && (b == b ... ` was lexed as a literal
928 // glob char (incondpat=true → gettokstr) and the whole
929 // remainder collapsed into one String token.
930 match self.tok {
931 LexTok::Doutbrack
932 | LexTok::Damper
933 | LexTok::Dbar
934 | LexTok::Inpar
935 | LexTok::Outpar
936 | LexTok::Bang => {
937 self.incondpat = false;
938 }
939 _ => {}
940 }
941 } else {
942 self.incondpat = false;
943 }
944
945 // Update command position for next token based on current token
946 // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
947 // so we don't set incmdpos after Bar in that context
948 match self.tok {
949 LexTok::Seper
950 | LexTok::Newlin
951 | LexTok::Semi
952 | LexTok::Dsemi
953 | LexTok::Semiamp
954 | LexTok::Semibar
955 | LexTok::Amper
956 | LexTok::Amperbang
957 | LexTok::Inpar
958 | LexTok::Inbrace
959 | LexTok::Dbar
960 | LexTok::Damper
961 | LexTok::Baramp
962 | LexTok::Inoutpar
963 | LexTok::Doloop
964 | LexTok::Then
965 | LexTok::Elif
966 | LexTok::Else
967 | LexTok::Doutbrack
968 | LexTok::Func => {
969 self.incmdpos = true;
970 }
971 LexTok::Bar
972 // In case patterns, | is a pattern separator - don't change incmdpos
973 if self.incasepat <= 0 => {
974 self.incmdpos = true;
975 }
976 LexTok::String
977 | LexTok::Typeset
978 | LexTok::Envarray
979 | LexTok::Outpar
980 | LexTok::Case
981 | LexTok::Dinbrack => {
982 self.incmdpos = false;
983 }
984 _ => {}
985 }
986
987 // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
988 // When we see 'for', set infor=2 to expect the init and cond parts
989 // Each Dinpar (after semicolon in arithmetic) decrements it
990 if self.tok != LexTok::Dinpar {
991 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
992 }
993
994
995 // Handle redirection / for-loop context. Mirrors lex.c:359-368
996 // ctxtlex `oldpos` save/restore. The saved value lives in
997 // `self.oldpos` (struct field) so it survives across zshlex
998 // calls — the previous local `let oldpos = self.incmdpos`
999 // captured the JUST-updated value (always wrong) and lost the
1000 // pre-FOR incmdpos. With the field, FOR x → STRING x → INPAR
1001 // sequence correctly restores incmdpos=1 before the `(`.
1002 if self.tok.is_redirop()
1003 || self.tok == LexTok::For
1004 || self.tok == LexTok::Foreach
1005 || self.tok == LexTok::Select
1006 {
1007 self.inredir = true;
1008 self.oldpos = self.incmdpos;
1009 self.incmdpos = false;
1010 } else if self.inredir {
1011 self.incmdpos = self.oldpos;
1012 self.inredir = false;
1013 }
1014 }
1015
1016 /// Process pending here-documents. Walks each heredoc whose body
1017 /// hasn't been filled yet (content is empty AND terminator is set),
1018 /// reads lines from input until the terminator, and stuffs the body
1019 /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1020 /// parser can index into it after parse() finishes.
1021 fn process_heredocs(&mut self) {
1022 let n = self.heredocs.len();
1023 for i in 0..n {
1024 // Skip heredocs we've already processed AND those without
1025 // a terminator (early-error case). The `processed` bool
1026 // distinguishes "filled with empty body" from "not yet
1027 // visited" — both have empty `content`.
1028 if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1029 continue;
1030 }
1031 let strip_tabs = self.heredocs[i].strip_tabs;
1032 let terminator = self.heredocs[i].terminator.clone();
1033 let mut content = String::new();
1034 let mut line_count = 0;
1035
1036 loop {
1037 line_count += 1;
1038 if line_count > 10000 {
1039 self.error = Some("heredoc exceeded 10000 lines".to_string());
1040 self.tok = LexTok::Lexerr;
1041 return;
1042 }
1043
1044 let line = self.read_line();
1045 if line.is_none() {
1046 self.error = Some("here document too large or unterminated".to_string());
1047 self.tok = LexTok::Lexerr;
1048 return;
1049 }
1050
1051 let line = line.unwrap();
1052 let check_line = if strip_tabs {
1053 line.trim_start_matches('\t')
1054 } else {
1055 line.as_str()
1056 };
1057
1058 if check_line.trim_end_matches('\n') == terminator {
1059 break;
1060 }
1061
1062 // `<<-` strips leading tabs from BODY lines too, not just
1063 // from terminator-match comparison. Without this, tabs in
1064 // here-doc content survive into stdin.
1065 if strip_tabs {
1066 content.push_str(check_line);
1067 } else {
1068 content.push_str(&line);
1069 }
1070 }
1071
1072 self.heredocs[i].content = content;
1073 self.heredocs[i].processed = true;
1074 }
1075 }
1076
1077 /// Read a line from input (returns partial line at EOF)
1078 fn read_line(&mut self) -> Option<String> {
1079 let mut line = String::new();
1080
1081 loop {
1082 match self.hgetc() {
1083 Some(c) => {
1084 line.push(c);
1085 if c == '\n' {
1086 break;
1087 }
1088 }
1089 None => {
1090 // EOF - return partial line if any
1091 if line.is_empty() {
1092 return None;
1093 }
1094 break;
1095 }
1096 }
1097 }
1098
1099 Some(line)
1100 }
1101
1102 /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1103 /// `gettok`. Reads characters from the input via hgetc, dispatches
1104 /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1105 /// uses inline `match` in lex_initial / lex_inang / lex_outang
1106 /// since Rust pattern-matching subsumes the table dispatch).
1107 ///
1108 /// Structural divergence from C: the giant ~322-line C switch
1109 /// statement at lex.c:725-936 is split into helper methods in
1110 /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1111 /// lex_inang / lex_outang for the < and > arms). The flow is
1112 /// equivalent — same chars consumed, same tokens emitted — but
1113 /// the source-level layout differs. C's table-driven dispatch
1114 /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1115 /// which is what the helpers ultimately do.
1116 fn gettok(&mut self) -> LexTok {
1117 // lex.c:621 — `tokstr = NULL;` reset before each token.
1118 self.tokstr = None;
1119 // (zshrs-specific: tokfd reset lives here too — C does it
1120 // implicitly via the `peekfd = -1` local at lex.c:617 used
1121 // only when a digit-prefix redirection is detected.)
1122 self.tokfd = -1;
1123
1124 // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1125 // skip leading blanks (space/tab, NOT newline).
1126 let mut ws_iterations = 0;
1127 loop {
1128 ws_iterations += 1;
1129 if ws_iterations > 100_000 {
1130 self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1131 return LexTok::Lexerr;
1132 }
1133 let c = match self.hgetc() {
1134 Some(c) => c,
1135 None => {
1136 // lex.c:624-625 — lexstop set, return ENDINPUT
1137 // (or LEXERR if errflag is set elsewhere).
1138 self.lexstop = true;
1139 return if self.error.is_some() {
1140 LexTok::Lexerr
1141 } else {
1142 LexTok::Endinput
1143 };
1144 }
1145 };
1146
1147 if !Self::is_blank(c) {
1148 self.hungetc(c);
1149 break;
1150 }
1151 }
1152
1153 let c = match self.hgetc() {
1154 Some(c) => c,
1155 None => {
1156 self.lexstop = true;
1157 return LexTok::Endinput;
1158 }
1159 };
1160
1161 // lex.c:623 — `toklineno = lineno;`
1162 self.toklineno = self.lineno;
1163 // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1164 // blank.
1165 self.isfirstln = false;
1166
1167 // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1168 // call dquote_parse with `;` or `)` as the end-char and
1169 // either return DINPAR (continue for-loop arith) or DOUTPAR
1170 // (close the arith block) or LEXERR.
1171 if self.dbparens {
1172 return self.lex_arith(c);
1173 }
1174
1175 // lex.c:649-668 — digit prefix on a redirection: `2> file`
1176 // treats `2` as the fd to redirect, not a literal arg. Three
1177 // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1178 // anything else (push back, treat as literal digit).
1179 if Self::is_digit(c) {
1180 let d = self.hgetc();
1181 match d {
1182 Some('&') => {
1183 let e = self.hgetc();
1184 if e == Some('>') {
1185 // lex.c:653-657 — `N&>` shape detected.
1186 self.tokfd = (c as u8 - b'0') as i32;
1187 self.hungetc('>');
1188 return self.lex_initial('&');
1189 }
1190 // lex.c:658-661 — not `N&>`, push everything back.
1191 if let Some(e) = e {
1192 self.hungetc(e);
1193 }
1194 self.hungetc('&');
1195 }
1196 Some('>') | Some('<') => {
1197 // lex.c:662-664 — `N>` or `N<` shape detected.
1198 self.tokfd = (c as u8 - b'0') as i32;
1199 return self.lex_initial(d.unwrap());
1200 }
1201 Some(d) => {
1202 // lex.c:665-668 — not a redir prefix, push back.
1203 self.hungetc(d);
1204 }
1205 None => {}
1206 }
1207 self.lexstop = false;
1208 }
1209
1210 // lex.c:670-936 — main dispatch on the leading char. zshrs
1211 // delegates to lex_initial which holds the equivalent of
1212 // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1213 // for LX1_OTHER.
1214 self.lex_initial(c)
1215 }
1216
1217 /// Lex (( ... )) arithmetic expression
1218 fn lex_arith(&mut self, c: char) -> LexTok {
1219 self.lexbuf.clear();
1220 self.hungetc(c);
1221
1222 let end_char = if self.infor > 0 { ';' } else { ')' };
1223 if self.dquote_parse(end_char, false).is_err() {
1224 return LexTok::Lexerr;
1225 }
1226
1227 self.tokstr = Some(self.lexbuf.as_str().to_string());
1228
1229 if !self.lexstop && self.infor > 0 {
1230 self.infor -= 1;
1231 return LexTok::Dinpar;
1232 }
1233
1234 // Check for closing ))
1235 match self.hgetc() {
1236 Some(')') => {
1237 self.dbparens = false;
1238 LexTok::Doutpar
1239 }
1240 c => {
1241 if let Some(c) = c {
1242 self.hungetc(c);
1243 }
1244 LexTok::Lexerr
1245 }
1246 }
1247 }
1248
1249 /// Handle initial character of token
1250 fn lex_initial(&mut self, c: char) -> LexTok {
1251 // Handle comments
1252 if c == '#' && !self.nocomments {
1253 return self.lex_comment();
1254 }
1255
1256 match c {
1257 '\\' => {
1258 let d = self.hgetc();
1259 if d == Some('\n') {
1260 // Line continuation - get next token
1261 return self.gettok();
1262 }
1263 if let Some(d) = d {
1264 self.hungetc(d);
1265 }
1266 self.lexstop = false;
1267 self.gettokstr(c, false)
1268 }
1269
1270 '\n' => LexTok::Newlin,
1271
1272 ';' => {
1273 let d = self.hgetc();
1274 match d {
1275 Some(';') => LexTok::Dsemi,
1276 Some('&') => LexTok::Semiamp,
1277 Some('|') => LexTok::Semibar,
1278 _ => {
1279 if let Some(d) = d {
1280 self.hungetc(d);
1281 }
1282 self.lexstop = false;
1283 LexTok::Semi
1284 }
1285 }
1286 }
1287
1288 '&' => {
1289 let d = self.hgetc();
1290 match d {
1291 Some('&') => LexTok::Damper,
1292 Some('!') | Some('|') => LexTok::Amperbang,
1293 Some('>') => {
1294 self.tokfd = self.tokfd.max(0);
1295 let e = self.hgetc();
1296 match e {
1297 Some('!') | Some('|') => LexTok::Outangampbang,
1298 Some('>') => {
1299 let f = self.hgetc();
1300 match f {
1301 Some('!') | Some('|') => LexTok::Doutangampbang,
1302 _ => {
1303 if let Some(f) = f {
1304 self.hungetc(f);
1305 }
1306 self.lexstop = false;
1307 LexTok::Doutangamp
1308 }
1309 }
1310 }
1311 _ => {
1312 if let Some(e) = e {
1313 self.hungetc(e);
1314 }
1315 self.lexstop = false;
1316 LexTok::Ampoutang
1317 }
1318 }
1319 }
1320 _ => {
1321 if let Some(d) = d {
1322 self.hungetc(d);
1323 }
1324 self.lexstop = false;
1325 LexTok::Amper
1326 }
1327 }
1328 }
1329
1330 '|' => {
1331 let d = self.hgetc();
1332 match d {
1333 Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1334 Some('&') => LexTok::Baramp,
1335 _ => {
1336 if let Some(d) = d {
1337 self.hungetc(d);
1338 }
1339 self.lexstop = false;
1340 LexTok::Bar
1341 }
1342 }
1343 }
1344
1345 '(' => {
1346 let d = self.hgetc();
1347 match d {
1348 Some('(') => {
1349 if self.infor > 0 {
1350 self.dbparens = true;
1351 return LexTok::Dinpar;
1352 }
1353 if self.incmdpos {
1354 // Could be (( arithmetic )) or ( subshell )
1355 self.lexbuf.clear();
1356 match self.cmd_or_math() {
1357 CmdOrMath::Math => {
1358 self.tokstr = Some(self.lexbuf.as_str().to_string());
1359 return LexTok::Dinpar;
1360 }
1361 CmdOrMath::Cmd => {
1362 self.tokstr = None;
1363 return LexTok::Inpar;
1364 }
1365 CmdOrMath::Err => return LexTok::Lexerr,
1366 }
1367 }
1368 self.hungetc('(');
1369 self.lexstop = false;
1370 self.gettokstr('(', false)
1371 }
1372 Some(')') => LexTok::Inoutpar,
1373 _ => {
1374 if let Some(d) = d {
1375 self.hungetc(d);
1376 }
1377 self.lexstop = false;
1378 // Per lex.c:822 LX1_INPAR — at word boundary `(`
1379 // tokenizes as INPAR when SHGLOB || incond==1 ||
1380 // incmdpos. Otherwise falls through to gettokstr
1381 // (the `(` becomes start of a STRING — typical
1382 // for unquoted glob args like `ls (^foo)*`).
1383 // For `for x ( ... )` form, incmdpos is restored
1384 // to 1 via the oldpos-save-after-FOR mechanism,
1385 // so the next-token `(` correctly INPAR-izes.
1386 if self.incond == 1 || self.incmdpos || self.incasepat >= 1 {
1387 LexTok::Inpar
1388 } else {
1389 self.gettokstr('(', false)
1390 }
1391 }
1392 }
1393 }
1394
1395 ')' => LexTok::Outpar,
1396
1397 '{' => {
1398 // { is a command group only if followed by whitespace,
1399 // newline, or `}` (the empty-block form `{}`). zsh
1400 // treats `{}` as an empty compound — `foo() {}` is a
1401 // valid no-op function. Without `}` in this list,
1402 // `{}` got consumed as one literal token and ran as a
1403 // command, failing "command not found: {}".
1404 // The empty `{}` is also recognised AFTER a function
1405 // header `name()` even when `incmdpos` got cleared by
1406 // the preceding Outpar — peek for `}` regardless and
1407 // treat as Inbrace so `foo() {}` parses as a no-op
1408 // function body.
1409 let next = self.hgetc();
1410 let next_is_close = matches!(next, Some('}'));
1411 if self.incmdpos {
1412 let is_brace_group = match next {
1413 Some(' ') | Some('\t') | Some('\n') | Some('}') | None => true,
1414 _ => false,
1415 };
1416 if let Some(ch) = next {
1417 self.hungetc(ch);
1418 }
1419 if is_brace_group {
1420 self.tokstr = Some("{".to_string());
1421 LexTok::Inbrace
1422 } else {
1423 self.gettokstr(c, false)
1424 }
1425 } else if next_is_close {
1426 // `{}` empty block in non-cmd position (function
1427 // body after `()`). Treat as Inbrace; the parser
1428 // will follow with Outbrace.
1429 if let Some(ch) = next {
1430 self.hungetc(ch);
1431 }
1432 self.tokstr = Some("{".to_string());
1433 LexTok::Inbrace
1434 } else {
1435 if let Some(ch) = next {
1436 self.hungetc(ch);
1437 }
1438 self.gettokstr(c, false)
1439 }
1440 }
1441
1442 '}' => {
1443 // } at start of token is always Outbrace (ends command group)
1444 // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1445 self.tokstr = Some("}".to_string());
1446 LexTok::Outbrace
1447 }
1448
1449 '[' => {
1450 // [[ is a conditional expression start
1451 // [ can also be a command (test builtin) or array subscript
1452 // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1453 if self.incasepat > 0 {
1454 self.gettokstr(c, false)
1455 } else if self.incmdpos {
1456 let next = self.hgetc();
1457 if next == Some('[') {
1458 // [[ - double bracket conditional
1459 self.tokstr = Some("[[".to_string());
1460 self.incond = 1;
1461 return LexTok::Dinbrack;
1462 }
1463 // Single [ - either test command or start of glob pattern
1464 if let Some(ch) = next {
1465 self.hungetc(ch);
1466 }
1467 self.tokstr = Some("[".to_string());
1468 LexTok::String
1469 } else {
1470 self.gettokstr(c, false)
1471 }
1472 }
1473
1474 ']' => {
1475 // ]] ends a conditional expression started by [[
1476 if self.incond > 0 {
1477 let next = self.hgetc();
1478 if next == Some(']') {
1479 self.tokstr = Some("]]".to_string());
1480 self.incond = 0;
1481 return LexTok::Doutbrack;
1482 }
1483 if let Some(ch) = next {
1484 self.hungetc(ch);
1485 }
1486 }
1487 self.gettokstr(c, false)
1488 }
1489
1490 '<' => {
1491 // In pattern context, < is literal (e.g., <-> in glob)
1492 if self.incondpat || self.incasepat > 0 {
1493 self.gettokstr(c, false)
1494 } else {
1495 self.lex_inang()
1496 }
1497 }
1498
1499 '>' => {
1500 // In pattern context, > is literal
1501 if self.incondpat || self.incasepat > 0 {
1502 self.gettokstr(c, false)
1503 } else {
1504 self.lex_outang()
1505 }
1506 }
1507
1508 _ => self.gettokstr(c, false),
1509 }
1510 }
1511
1512 /// Lex comment
1513 fn lex_comment(&mut self) -> LexTok {
1514 if self.lexflags.comments_keep {
1515 self.lexbuf.clear();
1516 self.add('#');
1517 }
1518
1519 loop {
1520 let c = self.hgetc();
1521 match c {
1522 Some('\n') | None => break,
1523 Some(c) => {
1524 if self.lexflags.comments_keep {
1525 self.add(c);
1526 }
1527 }
1528 }
1529 }
1530
1531 if self.lexflags.comments_keep {
1532 self.tokstr = Some(self.lexbuf.as_str().to_string());
1533 if !self.lexstop {
1534 self.hungetc('\n');
1535 }
1536 return LexTok::String;
1537 }
1538
1539 if self.lexflags.comments_strip && self.lexstop {
1540 return LexTok::Endinput;
1541 }
1542
1543 LexTok::Newlin
1544 }
1545
1546 /// Lex < and variants
1547 fn lex_inang(&mut self) -> LexTok {
1548 let d = self.hgetc();
1549 match d {
1550 Some('(') => {
1551 // Process substitution <(...)
1552 self.hungetc('(');
1553 self.lexstop = false;
1554 self.gettokstr('<', false)
1555 }
1556 Some('>') => LexTok::Inoutang,
1557 Some('<') => {
1558 let e = self.hgetc();
1559 match e {
1560 Some('(') => {
1561 self.hungetc('(');
1562 self.hungetc('<');
1563 LexTok::Inang
1564 }
1565 Some('<') => LexTok::Trinang,
1566 Some('-') => {
1567 self.heredoc_pending = 2; // <<- expects terminator next
1568 LexTok::Dinangdash
1569 }
1570 _ => {
1571 if let Some(e) = e {
1572 self.hungetc(e);
1573 }
1574 self.lexstop = false;
1575 self.heredoc_pending = 1; // << expects terminator next
1576 LexTok::Dinang
1577 }
1578 }
1579 }
1580 Some('&') => LexTok::Inangamp,
1581 _ => {
1582 if let Some(d) = d {
1583 self.hungetc(d);
1584 }
1585 self.lexstop = false;
1586 LexTok::Inang
1587 }
1588 }
1589 }
1590
1591 /// Lex > and variants
1592 fn lex_outang(&mut self) -> LexTok {
1593 let d = self.hgetc();
1594 match d {
1595 Some('(') => {
1596 // Process substitution >(...)
1597 self.hungetc('(');
1598 self.lexstop = false;
1599 self.gettokstr('>', false)
1600 }
1601 Some('&') => {
1602 let e = self.hgetc();
1603 match e {
1604 Some('!') | Some('|') => LexTok::Outangampbang,
1605 _ => {
1606 if let Some(e) = e {
1607 self.hungetc(e);
1608 }
1609 self.lexstop = false;
1610 LexTok::Outangamp
1611 }
1612 }
1613 }
1614 Some('!') | Some('|') => LexTok::Outangbang,
1615 Some('>') => {
1616 let e = self.hgetc();
1617 match e {
1618 Some('&') => {
1619 let f = self.hgetc();
1620 match f {
1621 Some('!') | Some('|') => LexTok::Doutangampbang,
1622 _ => {
1623 if let Some(f) = f {
1624 self.hungetc(f);
1625 }
1626 self.lexstop = false;
1627 LexTok::Doutangamp
1628 }
1629 }
1630 }
1631 Some('!') | Some('|') => LexTok::Doutangbang,
1632 Some('(') => {
1633 self.hungetc('(');
1634 self.hungetc('>');
1635 LexTok::Outang
1636 }
1637 _ => {
1638 if let Some(e) = e {
1639 self.hungetc(e);
1640 }
1641 self.lexstop = false;
1642 LexTok::Doutang
1643 }
1644 }
1645 }
1646 _ => {
1647 if let Some(d) = d {
1648 self.hungetc(d);
1649 }
1650 self.lexstop = false;
1651 LexTok::Outang
1652 }
1653 }
1654 }
1655
/// Lex the remainder of a word-like token whose first character is `c`.
///
/// Characters are read with `hgetc` and appended to `self.lexbuf`; shell
/// metacharacters are stored as their `char_tokens` markers. The loop ends
/// at the first character that cannot belong to the word: a blank outside
/// `${...}` and parentheses, an unbalanced `)` or `}`, a top-level `|`,
/// a newline, `;`, `&`, or a `<` / `>` that is not a process substitution
/// or numeric range glob. That character is pushed back unless `lexstop`
/// is set.
///
/// Parameters:
/// - `c`: first character of the token (already consumed by the caller).
/// - `sub`: when `false` the buffer is cleared first; when `true` the text
///   is appended to the existing buffer and `)`, `|`, `<`, `>` and `=` are
///   taken without their terminator / process-substitution / assignment
///   handling.
///
/// Returns:
/// - `LexTok::String` for an ordinary word,
/// - `LexTok::Envstring` once a `name=` assignment prefix was recognised,
/// - `LexTok::Envarray` (early return, `tokstr` = name only) for `name=(`,
/// - `LexTok::Lexerr` on an unterminated quote/substitution or when the
///   iteration guard trips.
///
/// On the normal exit path `tokstr` receives the buffer contents.
/// `self.error` is set for an unmatched quote (unless `lexflags.active`)
/// and for an unclosed `${`.
fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
    let mut bct = 0; // brace count
    let mut pct = 0; // parenthesis count
    let mut brct = 0; // bracket count
    // Brace depth at which the outermost `${` was opened; 0 = not inside one.
    let mut in_brace_param = 0;
    // Token type to return; upgraded to Envstring / Lexerr below.
    let mut peek = LexTok::String;
    // Non-zero while still at the start of the word (or, with the value 2,
    // just after an assignment `=`); decremented after each character.
    let mut intpos = 1;
    // Quote character left open at end of input, or '\0' if none.
    let mut unmatched = '\0';
    let mut c = c;
    // Safety net against a loop that never terminates.
    const MAX_ITERATIONS: usize = 100_000;
    let mut iterations = 0;

    if !sub {
        self.lexbuf.clear();
    }

    loop {
        iterations += 1;
        if iterations > MAX_ITERATIONS {
            self.error = Some("gettokstr exceeded maximum iterations".to_string());
            return LexTok::Lexerr;
        }

        let inbl = Self::is_inblank(c);

        if inbl && in_brace_param == 0 && pct == 0 {
            // Whitespace outside brace param ends token
            break;
        }

        match c {
            // Whitespace is handled above for most cases
            ')' => {
                // Inside `${...}` or in `sub` mode `)` is just data; otherwise
                // it closes an open `(` or, if none is open, ends the word.
                if in_brace_param > 0 || sub {
                    self.add(char_tokens::OUTPAR);
                } else if pct > 0 {
                    pct -= 1;
                    self.add(char_tokens::OUTPAR);
                } else {
                    break;
                }
            }

            '|' => {
                // A top-level `|` ends the word (kept literally in `sub`
                // mode); inside parentheses or `${...}` it is tokenized.
                if pct == 0 && in_brace_param == 0 {
                    if sub {
                        self.add(c);
                    } else {
                        break;
                    }
                } else {
                    self.add(char_tokens::BAR);
                }
            }

            '$' => {
                // Look at the character after `$` to pick the expansion kind.
                let e = self.hgetc();
                match e {
                    Some('\\') => {
                        let f = self.hgetc();
                        if f != Some('\n') {
                            if let Some(f) = f {
                                self.hungetc(f);
                            }
                            self.hungetc('\\');
                            self.add(char_tokens::STRING);
                        } else {
                            // Line continuation after $: `c` is still `$`,
                            // so the next iteration re-examines it against
                            // the character following the continuation.
                            continue;
                        }
                    }
                    Some('[') => {
                        // $[...] arithmetic
                        self.add(char_tokens::STRING);
                        self.add(char_tokens::INBRACK);
                        if self.dquote_parse(']', sub).is_err() {
                            peek = LexTok::Lexerr;
                            break;
                        }
                        self.add(char_tokens::OUTBRACK);
                    }
                    Some('(') => {
                        // $(...) or $((...))
                        self.add(char_tokens::STRING);
                        match self.cmd_or_math_sub() {
                            CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
                            CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
                            CmdOrMath::Err => {
                                peek = LexTok::Lexerr;
                                break;
                            }
                        }
                    }
                    Some('{') => {
                        // ${...}: note the `$` is stored literally here, not
                        // as the STRING marker used by the other branches.
                        self.add(c);
                        self.add(char_tokens::INBRACE);
                        bct += 1;
                        // Remember the depth of the outermost `${` only.
                        if in_brace_param == 0 {
                            in_brace_param = bct;
                        }
                    }
                    Some('\'') => {
                        // $'...' ANSI-C escape syntax. Inside, `\X`
                        // sequences are escapes (`\n`, `\t`, `\x1b`,
                        // `\'` for literal apostrophe, `\\` for
                        // backslash). Lexer captures the raw form
                        // wrapped in QSTRING/SNULL markers; later
                        // expansion decodes the escapes. zsh's
                        // analogue lives in lex.c gettokstr's
                        // LX2_QUOTE branch when prev char was `$`.
                        self.add(char_tokens::QSTRING);
                        self.add(char_tokens::SNULL);
                        loop {
                            let ch = self.hgetc();
                            match ch {
                                Some('\'') => break,
                                Some('\\') => {
                                    // `\X` — a BNULL marker followed by the
                                    // escaped character as-is; decoding is
                                    // left to a later stage.
                                    self.add(char_tokens::BNULL);
                                    match self.hgetc() {
                                        Some(n) => self.add(n),
                                        None => {
                                            self.lexstop = true;
                                            unmatched = '\'';
                                            peek = LexTok::Lexerr;
                                            break;
                                        }
                                    }
                                }
                                Some(ch) => self.add(ch),
                                None => {
                                    self.lexstop = true;
                                    unmatched = '\'';
                                    peek = LexTok::Lexerr;
                                    break;
                                }
                            }
                        }
                        // The inner loop can only signal failure through
                        // `unmatched`; leave the outer loop in that case.
                        if unmatched != '\0' {
                            break;
                        }
                        self.add(char_tokens::SNULL);
                    }
                    Some('"') => {
                        // $"..." localized string. Same shape as a
                        // plain "..." but flagged via QSTRING+DNULL
                        // so post-lex translation can substitute.
                        self.add(char_tokens::QSTRING);
                        self.add(char_tokens::DNULL);
                        if self.dquote_parse('"', sub).is_err() {
                            peek = LexTok::Lexerr;
                            break;
                        }
                        self.add(char_tokens::DNULL);
                    }
                    _ => {
                        // Plain `$name`-style reference (or `$` at end of
                        // input): restore the look-ahead and mark the `$`.
                        if let Some(e) = e {
                            self.hungetc(e);
                        }
                        self.lexstop = false;
                        self.add(char_tokens::STRING);
                    }
                }
            }

            '[' => {
                // Bracket depth is only tracked outside `${...}`.
                if in_brace_param == 0 {
                    brct += 1;
                }
                self.add(char_tokens::INBRACK);
            }

            ']' => {
                if in_brace_param == 0 && brct > 0 {
                    brct -= 1;
                }
                self.add(char_tokens::OUTBRACK);
            }

            '(' => {
                // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
                // a STRING and is immediately followed by `)`, the
                // string terminates at the `(`. The `()` is then
                // re-lexed as a separate INOUTPAR token. This handles
                // function definitions: `name()` lexes as STRING `name`
                // + INOUTPAR `()`, not STRING `name()`.
                //
                // NOTE(review): lex.c:1109-1112 additionally breaks out
                // under SHGLOB when `(` is followed by whitespace (ksh
                // function definition); no such check is visible in this
                // branch — confirm whether that is intentional.
                if in_brace_param == 0 && !sub {
                    // One-character look-ahead; it is always pushed back.
                    let e = self.hgetc();
                    if let Some(ch) = e {
                        self.hungetc(ch);
                    }
                    self.lexstop = false;
                    if e == Some(')') {
                        // `name()` — terminate STRING at `(` so the
                        // following `()` re-lexes as INOUTPAR. The `)`
                        // was pushed back just above, and the push-back
                        // after the main loop (`if !self.lexstop`)
                        // re-queues the `(` in front of it, so the next
                        // token is read from `()`.
                        break;
                    }
                }
                if in_brace_param == 0 {
                    pct += 1;
                }
                self.add(char_tokens::INPAR);
            }

            '{' => {
                // Track braces for both ${...} param expansion and {...} brace expansion
                bct += 1;
                self.add(c);
            }

            '}' => {
                if in_brace_param > 0 {
                    // Closing the outermost `${` leaves parameter mode.
                    if bct == in_brace_param {
                        in_brace_param = 0;
                    }
                    bct -= 1;
                    self.add(char_tokens::OUTBRACE);
                } else if bct > 0 {
                    // Closing a brace expansion like {a,b}
                    bct -= 1;
                    self.add(c);
                } else {
                    // Unbalanced `}` ends the word.
                    break;
                }
            }

            '>' => {
                // In pattern context (incondpat), > is literal
                if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
                    self.add(c);
                } else {
                    // Mid-word `>` is either `>(...)` or the start of a
                    // redirection, which ends the word.
                    let e = self.hgetc();
                    if e != Some('(') {
                        if let Some(e) = e {
                            self.hungetc(e);
                        }
                        self.lexstop = false;
                        break;
                    }
                    // >(...)
                    self.add(char_tokens::OUTANGPROC);
                    if self.skip_command_sub().is_err() {
                        peek = LexTok::Lexerr;
                        break;
                    }
                    self.add(char_tokens::OUTPAR);
                }
            }

            '<' => {
                // In pattern context (incondpat), < is literal
                if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
                    self.add(c);
                } else if let Some(range_chars) = self.try_numeric_range_glob() {
                    // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
                    // `<-M>`. When `<` mid-word matches that exact
                    // shape, swallow it into the word instead of
                    // breaking out for redirection.
                    self.add(c);
                    for ch in range_chars.chars() {
                        self.add(ch);
                    }
                } else {
                    // Either `<(...)` or the start of a redirection, which
                    // ends the word.
                    let e = self.hgetc();
                    if e != Some('(') {
                        if let Some(e) = e {
                            self.hungetc(e);
                        }
                        self.lexstop = false;
                        break;
                    }
                    // <(...)
                    self.add(char_tokens::INANG);
                    if self.skip_command_sub().is_err() {
                        peek = LexTok::Lexerr;
                        break;
                    }
                    self.add(char_tokens::OUTPAR);
                }
            }

            '=' => {
                if !sub {
                    if intpos > 0 {
                        // At start of token, check for =(...) process substitution
                        let e = self.hgetc();
                        if e == Some('(') {
                            self.add(char_tokens::EQUALS);
                            if self.skip_command_sub().is_err() {
                                peek = LexTok::Lexerr;
                                break;
                            }
                            self.add(char_tokens::OUTPAR);
                        } else {
                            if let Some(e) = e {
                                self.hungetc(e);
                            }
                            self.lexstop = false;
                            self.add(char_tokens::EQUALS);
                        }
                    } else if peek != LexTok::Envstring
                        && (self.incmdpos || self.intypeset)
                        && bct == 0
                        && brct == 0
                        && self.incasepat == 0
                    {
                        // Check for VAR=value assignment (but not in case pattern context)
                        let tok_so_far = self.lexbuf.as_str().to_string();
                        if self.is_valid_assignment_target(&tok_so_far) {
                            let next = self.hgetc();
                            if next == Some('(') {
                                // VAR=(...) array assignment. Per zsh
                                // (lex.c emits ENVARRAY with tokstr =
                                // just the variable name, NOT
                                // including the `=`). The `=` and
                                // `(` are consumed by the lexer; the
                                // parser knows ENVARRAY means assign-
                                // array and reads the body that
                                // follows.
                                self.tokstr = Some(self.lexbuf.as_str().to_string());
                                return LexTok::Envarray;
                            }
                            if let Some(next) = next {
                                self.hungetc(next);
                            }
                            self.lexstop = false;
                            // Scalar assignment: the value starts right
                            // after this `=`. `intpos = 2` keeps the
                            // "start of word" state alive for the first
                            // value character after the decrement below.
                            peek = LexTok::Envstring;
                            intpos = 2;
                            self.add(char_tokens::EQUALS);
                        } else {
                            self.add(char_tokens::EQUALS);
                        }
                    } else {
                        self.add(char_tokens::EQUALS);
                    }
                } else {
                    self.add(char_tokens::EQUALS);
                }
            }

            '\\' => {
                let next = self.hgetc();
                if next == Some('\n') {
                    // Line continuation: drop both characters and carry
                    // on with whatever follows (end of input ends the word).
                    let next = self.hgetc();
                    if let Some(next) = next {
                        c = next;
                        continue;
                    }
                    break;
                } else {
                    // Escaped character: BNULL marker plus the character.
                    // A backslash at end of input stores only the marker.
                    self.add(char_tokens::BNULL);
                    if let Some(next) = next {
                        self.add(next);
                    }
                }
            }

            '\'' => {
                // Single quoted string - everything literal until '
                self.add(char_tokens::SNULL);
                loop {
                    let ch = self.hgetc();
                    match ch {
                        Some('\'') => break,
                        Some(ch) => self.add(ch),
                        None => {
                            self.lexstop = true;
                            unmatched = '\'';
                            peek = LexTok::Lexerr;
                            break;
                        }
                    }
                }
                if unmatched != '\0' {
                    break;
                }
                self.add(char_tokens::SNULL);
            }

            '"' => {
                // Double quoted string
                self.add(char_tokens::DNULL);
                if self.dquote_parse('"', sub).is_err() {
                    unmatched = '"';
                    // In `active` mode an unterminated string is tolerated:
                    // the token type is left unchanged.
                    if !self.lexflags.active {
                        peek = LexTok::Lexerr;
                    }
                    break;
                }
                self.add(char_tokens::DNULL);
            }

            '`' => {
                // Backtick command substitution
                self.add(char_tokens::TICK);
                loop {
                    let ch = self.hgetc();
                    match ch {
                        Some('`') => break,
                        Some('\\') => {
                            let next = self.hgetc();
                            match next {
                                Some('\n') => continue, // Line continuation
                                // Only `, \ and $ are escapable here; they
                                // are stored behind a BNULL marker.
                                Some(c) if c == '`' || c == '\\' || c == '$' => {
                                    self.add(char_tokens::BNULL);
                                    self.add(c);
                                }
                                // Any other `\X` keeps its backslash.
                                Some(c) => {
                                    self.add('\\');
                                    self.add(c);
                                }
                                // NOTE(review): end of input right after a
                                // backslash leaves the loop without setting
                                // `unmatched`, so the closing TICK is still
                                // appended below — confirm this is intended.
                                None => break,
                            }
                        }
                        Some(ch) => self.add(ch),
                        None => {
                            self.lexstop = true;
                            unmatched = '`';
                            peek = LexTok::Lexerr;
                            break;
                        }
                    }
                }
                if unmatched != '\0' {
                    break;
                }
                self.add(char_tokens::TICK);
            }

            // Glob / expansion metacharacters are stored as markers.
            '~' => {
                self.add(char_tokens::TILDE);
            }

            '#' => {
                self.add(char_tokens::POUND);
            }

            '^' => {
                self.add(char_tokens::HAT);
            }

            '*' => {
                self.add(char_tokens::STAR);
            }

            '?' => {
                self.add(char_tokens::QUEST);
            }

            // A comma is tokenized only when more braces are open than the
            // depth recorded for the enclosing `${`.
            ',' if bct > in_brace_param => {
                self.add(char_tokens::COMMA);
            }

            '-' => {
                self.add(char_tokens::DASH);
            }

            // `!` is tokenized only inside a bracket expression.
            '!' if brct > 0 => {
                self.add(char_tokens::BANG);
            }

            // Terminators
            '\n' | ';' | '&' => {
                break;
            }

            _ => {
                self.add(c);
            }
        }

        // Advance to the next character; end of input ends the word.
        c = match self.hgetc() {
            Some(c) => c,
            None => {
                self.lexstop = true;
                break;
            }
        };

        if intpos > 0 {
            intpos -= 1;
        }
    }

    // Put back the character that ended the token
    if !self.lexstop {
        self.hungetc(c);
    }

    if unmatched != '\0' && !self.lexflags.active {
        self.error = Some(format!("unmatched {}", unmatched));
    }

    // A `${` that was never closed; this overwrites any earlier error text.
    if in_brace_param > 0 {
        self.error = Some("closing brace expected".to_string());
    }

    self.tokstr = Some(self.lexbuf.as_str().to_string());
    peek
}
2171
2172 /// Check if a string is a valid assignment target (identifier or array ref).
2173 ///
2174 /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2175 /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2176 /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2177 /// are caught by the subscript handler, not here). And the first char
2178 /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2179 /// the STRING token 0x85) is parameter substitution with the `=` flag,
2180 /// NOT an envstring assignment.
2181 fn is_valid_assignment_target(&self, s: &str) -> bool {
2182 let mut chars = s.chars().peekable();
2183
2184 // Reject leading token byte — `$VAR=` is parameter substitution,
2185 // not assignment. Same for `*=`, `?=`, etc.
2186 if let Some(&c) = chars.peek() {
2187 if char_tokens::is_token(c) {
2188 return false;
2189 }
2190 }
2191
2192 // Check for leading digit (invalid)
2193 if let Some(&c) = chars.peek() {
2194 if c.is_ascii_digit() {
2195 // Could be array index, check rest
2196 while let Some(&c) = chars.peek() {
2197 if !c.is_ascii_digit() {
2198 break;
2199 }
2200 chars.next();
2201 }
2202 return chars.peek().is_none();
2203 }
2204 }
2205
2206 // Check identifier
2207 let mut has_ident = false;
2208 while let Some(&c) = chars.peek() {
2209 if c == char_tokens::INBRACK || c == '[' {
2210 break;
2211 }
2212 if c == '+' {
2213 // foo+=value
2214 chars.next();
2215 return chars.peek().is_none() || chars.peek() == Some(&'=');
2216 }
2217 if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2218 return false;
2219 }
2220 has_ident = true;
2221 chars.next();
2222 }
2223
2224 has_ident
2225 }
2226
2227 /// Parse the body of a double-quoted string (or any context that
2228 /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2229 /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2230 /// chars until `endchar` is seen at depth 0, handling escapes,
2231 /// `${...}` parameter substitutions, `$(...)` and backtick command
2232 /// substitutions, `$((...))` arithmetic, and inner double-quoted
2233 /// strings. The `sub` flag toggles substitution-context tokens
2234 /// (lex.c:1487 `int sub` argument).
2235 ///
2236 /// zshrs port note: the recursion guard at the top is a Rust
2237 /// safety net; the C source relies on the runtime stack. Inner
2238 /// logic delegates to `dquote_parse_inner` which holds the actual
2239 /// per-char state machine matching lex.c:1495-1692.
2240 fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2241 self.recursion_depth += 1;
2242 if self.check_recursion() {
2243 self.recursion_depth -= 1;
2244 return Err(());
2245 }
2246
2247 let result = self.dquote_parse_inner(endchar, sub);
2248 self.recursion_depth -= 1;
2249 result
2250 }
2251
2252 fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2253 let mut pct = 0; // parenthesis count
2254 let mut brct = 0; // bracket count
2255 let mut bct = 0; // brace count (for ${...})
2256 let mut intick = false; // inside backtick
2257 let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
2258 const MAX_ITERATIONS: usize = 100_000;
2259 let mut iterations = 0;
2260
2261 loop {
2262 iterations += 1;
2263 if iterations > MAX_ITERATIONS {
2264 self.error = Some("dquote_parse exceeded maximum iterations".to_string());
2265 return Err(());
2266 }
2267 let c = self.hgetc();
2268 let c = match c {
2269 Some(c) if c == endchar && !intick && bct == 0 => {
2270 if is_math && (pct > 0 || brct > 0) {
2271 self.add(c);
2272 if c == ')' {
2273 pct -= 1;
2274 } else if c == ']' {
2275 brct -= 1;
2276 }
2277 continue;
2278 }
2279 return Ok(());
2280 }
2281 Some(c) => c,
2282 None => {
2283 self.lexstop = true;
2284 return Err(());
2285 }
2286 };
2287
2288 match c {
2289 '\\' => {
2290 let next = self.hgetc();
2291 match next {
2292 Some('\n') if !sub => continue, // Line continuation
2293 Some(c)
2294 if c == '$'
2295 || c == '\\'
2296 || (c == '}' && !intick && bct > 0)
2297 || c == endchar
2298 || c == '`'
2299 || (endchar == ']'
2300 && (c == '['
2301 || c == ']'
2302 || c == '('
2303 || c == ')'
2304 || c == '{'
2305 || c == '}'
2306 || (c == '"' && sub))) =>
2307 {
2308 self.add(char_tokens::BNULL);
2309 self.add(c);
2310 }
2311 Some(c) => {
2312 self.add('\\');
2313 self.hungetc(c);
2314 continue;
2315 }
2316 None => {
2317 self.add('\\');
2318 }
2319 }
2320 }
2321
2322 '$' => {
2323 if intick {
2324 self.add(c);
2325 continue;
2326 }
2327 let next = self.hgetc();
2328 match next {
2329 Some('(') => {
2330 self.add(char_tokens::QSTRING);
2331 match self.cmd_or_math_sub() {
2332 CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
2333 CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
2334 CmdOrMath::Err => return Err(()),
2335 }
2336 }
2337 Some('[') => {
2338 self.add(char_tokens::STRING);
2339 self.add(char_tokens::INBRACK);
2340 self.dquote_parse(']', sub)?;
2341 self.add(char_tokens::OUTBRACK);
2342 }
2343 Some('{') => {
2344 self.add(char_tokens::QSTRING);
2345 self.add(char_tokens::INBRACE);
2346 bct += 1;
2347 }
2348 Some('$') => {
2349 self.add(char_tokens::QSTRING);
2350 self.add('$');
2351 }
2352 _ => {
2353 if let Some(next) = next {
2354 self.hungetc(next);
2355 }
2356 self.lexstop = false;
2357 self.add(char_tokens::QSTRING);
2358 }
2359 }
2360 }
2361
2362 '}' => {
2363 if intick || bct == 0 {
2364 self.add(c);
2365 } else {
2366 self.add(char_tokens::OUTBRACE);
2367 bct -= 1;
2368 }
2369 }
2370
2371 '`' => {
2372 self.add(char_tokens::QTICK);
2373 intick = !intick;
2374 }
2375
2376 '(' => {
2377 if !is_math || bct == 0 {
2378 pct += 1;
2379 }
2380 self.add(c);
2381 }
2382
2383 ')' => {
2384 if !is_math || bct == 0 {
2385 if pct == 0 && is_math {
2386 return Err(());
2387 }
2388 pct -= 1;
2389 }
2390 self.add(c);
2391 }
2392
2393 '[' => {
2394 if !is_math || bct == 0 {
2395 brct += 1;
2396 }
2397 self.add(c);
2398 }
2399
2400 ']' => {
2401 if !is_math || bct == 0 {
2402 if brct == 0 && is_math {
2403 return Err(());
2404 }
2405 brct -= 1;
2406 }
2407 self.add(c);
2408 }
2409
2410 '"' => {
2411 if intick || (endchar != '"' && bct == 0) {
2412 self.add(c);
2413 } else if bct > 0 {
2414 self.add(char_tokens::DNULL);
2415 self.dquote_parse('"', sub)?;
2416 self.add(char_tokens::DNULL);
2417 } else {
2418 return Err(());
2419 }
2420 }
2421
2422 _ => {
2423 self.add(c);
2424 }
2425 }
2426 }
2427 }
2428
2429 /// Determine if (( is arithmetic or command
2430 /// Decide whether `( ... )` after a `$` is a math expression
2431 /// `$((...))` or a command substitution `$(...)`. Direct port of
2432 /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2433 /// if it succeeds AND the next char is `)` (closing the second
2434 /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2435 /// a command substitution.
2436 fn cmd_or_math(&mut self) -> CmdOrMath {
2437 let oldlen = self.lexbuf.len();
2438
2439 // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2440 // which fills lexbuf with ONLY the inner expression, then checks
2441 // for the closing `)`. The surrounding `((` / `))` are NOT added
2442 // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2443 // ')' after, polluting DINPAR's tokstr with the literal parens.
2444 // Removed to match C exactly.
2445 if self.dquote_parse(')', false).is_err() {
2446 // Back up and try as command
2447 while self.lexbuf.len() > oldlen {
2448 if let Some(c) = self.lexbuf.pop() {
2449 self.hungetc(c);
2450 }
2451 }
2452 self.hungetc('(');
2453 self.lexstop = false;
2454 return if self.skip_command_sub().is_err() {
2455 CmdOrMath::Err
2456 } else {
2457 CmdOrMath::Cmd
2458 };
2459 }
2460
2461 // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2462 // means `((..))` was math. Don't add `)` to lexbuf.
2463 let c = self.hgetc();
2464 if c == Some(')') {
2465 return CmdOrMath::Math;
2466 }
2467
2468 // Not math, back up
2469 if let Some(c) = c {
2470 self.hungetc(c);
2471 }
2472 self.lexstop = false;
2473
2474 // Back up token
2475 while self.lexbuf.len() > oldlen {
2476 if let Some(c) = self.lexbuf.pop() {
2477 self.hungetc(c);
2478 }
2479 }
2480 self.hungetc('(');
2481
2482 if self.skip_command_sub().is_err() {
2483 CmdOrMath::Err
2484 } else {
2485 CmdOrMath::Cmd
2486 }
2487 }
2488
2489 /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2490 /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2491 /// the next char to discriminate: a leading `(` plus successful
2492 /// math parse via `cmd_or_math` → arithmetic substitution (with
2493 /// the open-paren retroactively rewritten to Inparmath); else
2494 /// command substitution via skip_command_sub.
2495 fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2496 const MAX_CONTINUATIONS: usize = 10_000;
2497 let mut continuations = 0;
2498
2499 loop {
2500 continuations += 1;
2501 if continuations > MAX_CONTINUATIONS {
2502 self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2503 return CmdOrMath::Err;
2504 }
2505
2506 let c = self.hgetc();
2507 if c == Some('\\') {
2508 let c2 = self.hgetc();
2509 if c2 != Some('\n') {
2510 if let Some(c2) = c2 {
2511 self.hungetc(c2);
2512 }
2513 self.hungetc('\\');
2514 self.lexstop = false;
2515 return if self.skip_command_sub().is_err() {
2516 CmdOrMath::Err
2517 } else {
2518 CmdOrMath::Cmd
2519 };
2520 }
2521 // Line continuation, try again (loop instead of recursion)
2522 continue;
2523 }
2524
2525 // Not a line continuation, process normally
2526 if c == Some('(') {
2527 // Might be $((...))
2528 let lexpos = self.lexbuf.len();
2529 self.add(char_tokens::INPAR);
2530 self.add('(');
2531
2532 if self.dquote_parse(')', false).is_ok() {
2533 let c2 = self.hgetc();
2534 if c2 == Some(')') {
2535 self.add(')');
2536 return CmdOrMath::Math;
2537 }
2538 if let Some(c2) = c2 {
2539 self.hungetc(c2);
2540 }
2541 }
2542
2543 // Not math, restore and parse as command
2544 while self.lexbuf.len() > lexpos {
2545 if let Some(ch) = self.lexbuf.pop() {
2546 self.hungetc(ch);
2547 }
2548 }
2549 self.hungetc('(');
2550 self.lexstop = false;
2551 } else {
2552 if let Some(c) = c {
2553 self.hungetc(c);
2554 }
2555 self.lexstop = false;
2556 }
2557
2558 return if self.skip_command_sub().is_err() {
2559 CmdOrMath::Err
2560 } else {
2561 CmdOrMath::Cmd
2562 };
2563 }
2564 }
2565
2566 /// Skip over `(...)` for command-style substitutions: `$(...)`,
2567 /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2568 /// `skipcomm`. Per the C source comment: "we'll parse the input
2569 /// until we find an unmatched closing parenthesis. However, we'll
2570 /// throw away the result of the parsing and just keep the string
2571 /// we've built up on the way."
2572 ///
2573 /// zshrs port note: the C source uses zcontext_save/restore +
2574 /// strinbeg/inpush to set up an isolated lex context for the
2575 /// throw-away parse. zshrs's standalone walker tracks paren
2576 /// depth directly without re-entering the parser. Same
2577 /// invariant: stops at the matching `)`.
2578 fn skip_command_sub(&mut self) -> Result<(), ()> {
2579 let mut pct = 1;
2580 let mut start = true;
2581 const MAX_ITERATIONS: usize = 100_000;
2582 let mut iterations = 0;
2583
2584 self.add(char_tokens::INPAR);
2585
2586 loop {
2587 iterations += 1;
2588 if iterations > MAX_ITERATIONS {
2589 self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2590 return Err(());
2591 }
2592
2593 let c = self.hgetc();
2594 let c = match c {
2595 Some(c) => c,
2596 None => {
2597 self.lexstop = true;
2598 return Err(());
2599 }
2600 };
2601
2602 let iswhite = Self::is_inblank(c);
2603
2604 match c {
2605 '(' => {
2606 pct += 1;
2607 self.add(c);
2608 }
2609 ')' => {
2610 pct -= 1;
2611 if pct == 0 {
2612 return Ok(());
2613 }
2614 self.add(c);
2615 }
2616 '\\' => {
2617 self.add(c);
2618 if let Some(c) = self.hgetc() {
2619 self.add(c);
2620 }
2621 }
2622 '\'' => {
2623 self.add(c);
2624 loop {
2625 let ch = self.hgetc();
2626 match ch {
2627 Some('\'') => {
2628 self.add('\'');
2629 break;
2630 }
2631 Some(ch) => self.add(ch),
2632 None => {
2633 self.lexstop = true;
2634 return Err(());
2635 }
2636 }
2637 }
2638 }
2639 '"' => {
2640 self.add(c);
2641 loop {
2642 let ch = self.hgetc();
2643 match ch {
2644 Some('"') => {
2645 self.add('"');
2646 break;
2647 }
2648 Some('\\') => {
2649 self.add('\\');
2650 if let Some(ch) = self.hgetc() {
2651 self.add(ch);
2652 }
2653 }
2654 Some(ch) => self.add(ch),
2655 None => {
2656 self.lexstop = true;
2657 return Err(());
2658 }
2659 }
2660 }
2661 }
2662 '`' => {
2663 self.add(c);
2664 loop {
2665 let ch = self.hgetc();
2666 match ch {
2667 Some('`') => {
2668 self.add('`');
2669 break;
2670 }
2671 Some('\\') => {
2672 self.add('\\');
2673 if let Some(ch) = self.hgetc() {
2674 self.add(ch);
2675 }
2676 }
2677 Some(ch) => self.add(ch),
2678 None => {
2679 self.lexstop = true;
2680 return Err(());
2681 }
2682 }
2683 }
2684 }
2685 '#' if start => {
2686 self.add(c);
2687 // Skip comment to end of line
2688 loop {
2689 let ch = self.hgetc();
2690 match ch {
2691 Some('\n') => {
2692 self.add('\n');
2693 break;
2694 }
2695 Some(ch) => self.add(ch),
2696 None => break,
2697 }
2698 }
2699 }
2700 _ => {
2701 self.add(c);
2702 }
2703 }
2704
2705 start = iswhite;
2706 }
2707 }
2708
2709 /// Lex next token AND update per-context flags. Direct port of
2710 /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2711 /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2712 /// list separators / pipes / control keywords reset to cmd-pos;
2713 /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2714 /// stash prior incmdpos and force the redir target to non-cmd-pos.
2715 pub fn ctxtlex(&mut self) {
2716 // lex.c:319 — static `oldpos` cache for redir-target restore
2717 // is captured per-call here as `oldpos` below (zshrs's parser
2718 // re-enters ctxtlex per token, no need for static persistence).
2719
2720 // lex.c:321 — `zshlex();` to advance to the next token.
2721 self.zshlex();
2722
2723 // lex.c:322-358 — post-token incmdpos switch.
2724 match self.tok {
2725 // lex.c:323-343 — separators / openers / conjunctions /
2726 // control keywords — back into cmd-pos so the next token
2727 // can be a fresh command.
2728 LexTok::Seper
2729 | LexTok::Newlin
2730 | LexTok::Semi
2731 | LexTok::Dsemi
2732 | LexTok::Semiamp
2733 | LexTok::Semibar
2734 | LexTok::Amper
2735 | LexTok::Amperbang
2736 | LexTok::Inpar
2737 | LexTok::Inbrace
2738 | LexTok::Dbar
2739 | LexTok::Damper
2740 | LexTok::Bar
2741 | LexTok::Baramp
2742 | LexTok::Inoutpar
2743 | LexTok::Doloop
2744 | LexTok::Then
2745 | LexTok::Elif
2746 | LexTok::Else
2747 | LexTok::Doutbrack => {
2748 self.incmdpos = true;
2749 }
2750 // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2751 // so subsequent tokens are arguments, not a fresh command.
2752 LexTok::String
2753 | LexTok::Typeset
2754 | LexTok::Envarray
2755 | LexTok::Outpar
2756 | LexTok::Case
2757 | LexTok::Dinbrack => {
2758 self.incmdpos = false;
2759 }
2760 _ => {}
2761 }
2762
2763 // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2764 // DINPAR can detect c-style for. After any non-DINPAR, decay
2765 // to 0 (or back to 2 if we just saw FOR again).
2766 if self.tok != LexTok::Dinpar {
2767 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2768 }
2769
2770 // lex.c:361-368 — redir-target context dance. After consuming
2771 // a redir operator, the following token (the file path) sees
2772 // incmdpos=0 even when its inherent shape would put it back
2773 // in cmd-pos. After the redir target, restore from oldpos
2774 // (struct field — must persist across zshlex calls).
2775 if self.tok.is_redirop()
2776 || self.tok == LexTok::For
2777 || self.tok == LexTok::Foreach
2778 || self.tok == LexTok::Select
2779 {
2780 self.inredir = true;
2781 self.oldpos = self.incmdpos;
2782 self.incmdpos = false;
2783 } else if self.inredir {
2784 self.incmdpos = self.oldpos;
2785 self.inredir = false;
2786 }
2787 }
2788
2789 /// Mark the current word as the one ZLE was looking for. Direct
2790 /// port of zsh/Src/lex.c:1881-1897 `gotword`. Only meaningful
2791 /// when the lexer was started with LEXFLAGS_ZLE for completion;
2792 /// after this call `lexflags` is cleared so subsequent tokens
2793 /// don't re-trigger word tracking.
2794 ///
2795 /// zshrs port note: zsh's gotword updates `wb`/`we` (word begin/
2796 /// end positions) based on `zlemetacs` (cursor pos), `zlemetall`
2797 /// (line length), `inbufct`, and `addedx` — all live in zsh's
2798 /// input.c globals which zshrs hasn't wired through the lexer.
2799 /// Only the `lexflags = 0` side-effect at lex.c:1895 is
2800 /// reproducible without that integration.
2801 pub fn gotword(&mut self) {
2802 // lex.c:1895 — `lexflags = 0;`
2803 self.lexflags = LexFlags::default();
2804 }
2805
2806 /// Register a heredoc to be processed at next newline
2807 pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2808 self.heredocs.push(HereDoc {
2809 terminator,
2810 strip_tabs,
2811 content: String::new(),
2812 quoted: false,
2813 processed: false,
2814 });
2815 }
2816
2817 /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2818 /// but reachable from the bare `zshlex` path (without an
2819 /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2820 /// - incmdpos is set (or text is `}` ending a brace block)
2821 /// - text is `]]` and we're inside `[[ ]]` (incond > 0)
2822 /// - text is bare `!` and we're at the start of a cond (incond == 1)
2823 pub fn check_reserved_word(&mut self) -> bool {
2824 if let Some(ref tokstr) = self.tokstr {
2825 if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2826 if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2827 self.tok = tok;
2828 if tok == LexTok::Repeat {
2829 self.inrepeat = 1;
2830 }
2831 if tok == LexTok::Dinbrack {
2832 self.incond = 1;
2833 }
2834 return true;
2835 }
2836 if tokstr == "]]" && self.incond > 0 {
2837 self.tok = LexTok::Doutbrack;
2838 self.incond = 0;
2839 return true;
2840 }
2841 }
2842 // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2843 // regardless of incmdpos.
2844 if self.incond > 0 && tokstr == "]]" {
2845 self.tok = LexTok::Doutbrack;
2846 self.incond = 0;
2847 return true;
2848 }
2849 if self.incond == 1 && tokstr == "!" {
2850 self.tok = LexTok::Bang;
2851 return true;
2852 }
2853 }
2854 false
2855 }
2856}
2857
/// Outcome of classifying a `((` / `$(` construct as arithmetic or as a
/// command.
///
/// The derives let the value be logged with `{:?}` and compared in
/// assertions; the type stays a plain copyable tag.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The construct was scanned as a command (`(...)` / `$(...)`).
    Cmd,
    /// The construct was an arithmetic expression (`((...))` / `$((...))`).
    Math,
    /// The construct could not be lexed.
    Err,
}
2864
2865// ============================================================================
2866// Additional parsing functions ported from lex.c
2867// ============================================================================
2868
/// Check whether `input`, starting at byte offset `pos`, continues a
/// numeric glob range: `<N-M>`, `<N->`, `<-M>` or `<->`.
///
/// `pos` must point just after the opening `<`. The accepted shape is
/// zero or more ASCII digits, exactly one `-`, zero or more ASCII
/// digits, then `>`. Nothing is consumed; the function only inspects
/// the text.
///
/// Counterpart of `isnumglob` in zsh's `Src/lex.c`, which reads from the
/// input stream and pushes the characters back afterwards; this version
/// works on a string slice instead.
///
/// Returns `false` (instead of panicking) when `pos` is past the end of
/// `input` or does not fall on a character boundary.
pub fn isnumglob(input: &str, pos: usize) -> bool {
    let rest = match input.get(pos..) {
        Some(tail) => tail,
        None => return false,
    };

    let mut seen_dash = false;
    for c in rest.chars() {
        match c {
            '0'..='9' => {}
            '-' if !seen_dash => seen_dash = true,
            // `>` closes the range only once the dash has been seen.
            '>' => return seen_dash,
            // Anything else, including a second `-`, ends the candidate.
            _ => return false,
        }
    }

    // Ran out of input before the closing `>`.
    false
}
2899
2900/// Tokenize a string as if in double quotes (error-tolerant variant).
2901///
2902/// Direct port of zsh/Src/lex.c:1713-1733 `parsestrnoerr`. The C
2903/// source: zcontext_save → untokenize → inpush → strinbeg →
2904/// `lexbuf.ptr = tokstr = *s; lexbuf.siz = l + 1` →
2905/// `err = dquote_parse('\0', 1)` → strinend → inpop → zcontext_restore.
2906/// Returns the tokenized string on success, or the offending char as
2907/// an error code (zsh convention: `> 32 && < 127` → printable, else
2908/// generic).
2909///
2910/// zshrs port: the C version drives the lexer's dquote_parse method
2911/// against the input string. zshrs's standalone walker produces the
2912/// same BNULL/QSTRING/QTICK token markers without re-entering the
2913/// lexer — same output for typical bodies. Documented divergence:
2914/// nested cmd-sub `$(...)` and arith `$((...))` aren't lexed
2915/// recursively; the runtime handles them at expansion time.
2916pub fn parsestrnoerr(s: &str) -> Result<String, String> {
2917 parsestr_inner(s)
2918}
2919
2920/// Tokenize a string as if in double quotes (error-reporting variant).
2921///
2922/// Direct port of zsh/Src/lex.c:1693-1709 `parsestr`. C source:
2923/// `if ((err = parsestrnoerr(s))) { untokenize(*s); ... zerr("parse
2924/// error near `%c'", err); tok = LEXERR; }`. zshrs's wrapper
2925/// returns the same Result and lets the caller emit the diagnostic.
2926///
2927/// Both `parsestr` and `parsestrnoerr` share the inner walker; the
2928/// only difference in C is whether errors trigger `zerr`. zshrs
2929/// returns `Err(msg)` from both — the caller decides whether to
2930/// surface the diagnostic.
2931pub fn parsestr(s: &str) -> Result<String, String> {
2932 parsestr_inner(s)
2933}
2934
2935/// Shared body for parsestr / parsestrnoerr.
2936fn parsestr_inner(s: &str) -> Result<String, String> {
2937 let mut result = String::with_capacity(s.len());
2938 let chars: Vec<char> = s.chars().collect();
2939 let mut i = 0;
2940
2941 while i < chars.len() {
2942 let c = chars[i];
2943 match c {
2944 '\\' => {
2945 i += 1;
2946 if i < chars.len() {
2947 let next = chars[i];
2948 match next {
2949 '$' | '\\' | '`' | '"' | '\n' => {
2950 result.push(char_tokens::BNULL);
2951 result.push(next);
2952 }
2953 _ => {
2954 result.push('\\');
2955 result.push(next);
2956 }
2957 }
2958 } else {
2959 result.push('\\');
2960 }
2961 }
2962 '$' => {
2963 result.push(char_tokens::QSTRING);
2964 if i + 1 < chars.len() {
2965 let next = chars[i + 1];
2966 if next == '{' {
2967 result.push(char_tokens::INBRACE);
2968 i += 1;
2969 } else if next == '(' {
2970 result.push(char_tokens::INPAR);
2971 i += 1;
2972 }
2973 }
2974 }
2975 '`' => {
2976 result.push(char_tokens::QTICK);
2977 }
2978 _ => {
2979 result.push(c);
2980 }
2981 }
2982 i += 1;
2983 }
2984
2985 Ok(result)
2986}
2987
/// Find the end of a subscript in `s`.
///
/// `s` must start just after the opening bracket. The scan looks for the
/// first unquoted, unescaped `endchar` that is not inside a nested
/// `[...]` or `(...)` group.
///
/// Counterpart of `parse_subscript` in zsh's `Src/lex.c`. Upstream runs
/// the subscript through the lexer's double-quote parser; this port is a
/// bracket-balancing walker and does not tokenize anything (there is no
/// `sub` flag).
///
/// Scanning rules:
/// - inside `'...'` nothing is special except the closing quote;
/// - inside `"..."` a backslash protects the next character;
/// - outside quotes a backslash skips the next character;
/// - `[` and `(` open a nested group, `]` and `)` close one. Both kinds
///   share a single depth counter, so they are not checked against each
///   other.
///
/// Returns the index of the terminating `endchar`, counted in
/// characters (not bytes) from the start of `s`. Returns `None` if `s` is
/// empty, starts with `endchar` (empty subscript), or contains no
/// terminator.
///
/// A bracket that closes a nested group is never treated as the
/// terminator, even when it equals `endchar`: for `a[1]]` with `endchar`
/// `]` the result is `Some(4)`, not `Some(3)`.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let chars: Vec<char> = s.chars().collect();
    let mut i = 0;
    let mut depth = 0usize;
    let mut in_dquote = false;
    let mut in_squote = false;

    while i < chars.len() {
        let c = chars[i];

        if in_squote {
            if c == '\'' {
                in_squote = false;
            }
            i += 1;
            continue;
        }

        if in_dquote {
            if c == '"' {
                in_dquote = false;
            } else if c == '\\' && i + 1 < chars.len() {
                i += 1; // skip the escaped character
            }
            i += 1;
            continue;
        }

        match c {
            '\\' => i += 1, // skip the escaped character
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            '[' | '(' => depth += 1,
            ']' | ')' if depth > 0 => {
                // Closes a nested group: it cannot also be the terminator.
                depth -= 1;
                i += 1;
                continue;
            }
            _ => {}
        }

        if c == endchar && depth == 0 {
            return Some(i);
        }

        i += 1;
    }

    None
}
3066
3067/// Tokenize a string as if it were a normal command-line argument
3068/// but it may contain separators. Used for ${...%...} substitutions.
3069///
3070/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
3071/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
3072/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
3073/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3074/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3075/// without re-entering the lexer.
3076///
3077/// zshrs port note: the C source returns int (0=ok, char value =
3078/// where it stopped on error); zshrs returns Result<String,String>
3079/// returning the tokenized text directly. Lossy for callers that
3080/// need to know the exact stop position, but nothing in zshrs's
3081/// expansion layer uses that yet.
3082pub fn parse_subst_string(s: &str) -> Result<String, String> {
3083 if s.is_empty() {
3084 return Ok(String::new());
3085 }
3086
3087 let mut result = String::with_capacity(s.len());
3088 let chars: Vec<char> = s.chars().collect();
3089 let mut i = 0;
3090
3091 while i < chars.len() {
3092 let c = chars[i];
3093 match c {
3094 '\\' => {
3095 result.push(char_tokens::BNULL);
3096 i += 1;
3097 if i < chars.len() {
3098 result.push(chars[i]);
3099 }
3100 }
3101 '\'' => {
3102 result.push(char_tokens::SNULL);
3103 i += 1;
3104 while i < chars.len() && chars[i] != '\'' {
3105 result.push(chars[i]);
3106 i += 1;
3107 }
3108 result.push(char_tokens::SNULL);
3109 }
3110 '"' => {
3111 result.push(char_tokens::DNULL);
3112 i += 1;
3113 while i < chars.len() && chars[i] != '"' {
3114 if chars[i] == '\\' && i + 1 < chars.len() {
3115 result.push(char_tokens::BNULL);
3116 i += 1;
3117 result.push(chars[i]);
3118 } else if chars[i] == '$' {
3119 result.push(char_tokens::QSTRING);
3120 } else {
3121 result.push(chars[i]);
3122 }
3123 i += 1;
3124 }
3125 result.push(char_tokens::DNULL);
3126 }
3127 '$' => {
3128 result.push(char_tokens::STRING);
3129 if i + 1 < chars.len() {
3130 match chars[i + 1] {
3131 '{' => {
3132 result.push(char_tokens::INBRACE);
3133 i += 1;
3134 }
3135 '(' => {
3136 result.push(char_tokens::INPAR);
3137 i += 1;
3138 }
3139 _ => {}
3140 }
3141 }
3142 }
3143 '*' => result.push(char_tokens::STAR),
3144 '?' => result.push(char_tokens::QUEST),
3145 '[' => result.push(char_tokens::INBRACK),
3146 ']' => result.push(char_tokens::OUTBRACK),
3147 '{' => result.push(char_tokens::INBRACE),
3148 '}' => result.push(char_tokens::OUTBRACE),
3149 '~' => result.push(char_tokens::TILDE),
3150 '#' => result.push(char_tokens::POUND),
3151 '^' => result.push(char_tokens::HAT),
3152 _ => result.push(c),
3153 }
3154 i += 1;
3155 }
3156
3157 Ok(result)
3158}
3159
3160/// Untokenize a string - convert tokenized chars back to original
3161///
3162/// Port of untokenize() from exec.c (but used by lexer too)
3163/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3164/// stripping them. Used by callers that need the source form including
3165/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3166pub fn untokenize_preserve_quotes(s: &str) -> String {
3167 let mut result = String::with_capacity(s.len() + 4);
3168 for c in s.chars() {
3169 let cu = c as u32;
3170 if (0x83..=0x9f).contains(&cu) {
3171 match c {
3172 c if c == char_tokens::POUND => result.push('#'),
3173 c if c == char_tokens::STRING => result.push('$'),
3174 c if c == char_tokens::HAT => result.push('^'),
3175 c if c == char_tokens::STAR => result.push('*'),
3176 c if c == char_tokens::INPAR => result.push('('),
3177 c if c == char_tokens::OUTPAR => result.push(')'),
3178 c if c == char_tokens::INPARMATH => result.push('('),
3179 c if c == char_tokens::OUTPARMATH => result.push(')'),
3180 c if c == char_tokens::QSTRING => result.push('$'),
3181 c if c == char_tokens::EQUALS => result.push('='),
3182 c if c == char_tokens::BAR => result.push('|'),
3183 c if c == char_tokens::INBRACE => result.push('{'),
3184 c if c == char_tokens::OUTBRACE => result.push('}'),
3185 c if c == char_tokens::INBRACK => result.push('['),
3186 c if c == char_tokens::OUTBRACK => result.push(']'),
3187 c if c == char_tokens::TICK => result.push('`'),
3188 c if c == char_tokens::INANG => result.push('<'),
3189 c if c == char_tokens::OUTANG => result.push('>'),
3190 c if c == char_tokens::OUTANGPROC => result.push('>'),
3191 c if c == char_tokens::QUEST => result.push('?'),
3192 c if c == char_tokens::TILDE => result.push('~'),
3193 c if c == char_tokens::QTICK => result.push('`'),
3194 c if c == char_tokens::COMMA => result.push(','),
3195 c if c == char_tokens::DASH => result.push('-'),
3196 c if c == char_tokens::BANG => result.push('!'),
3197 c if c == char_tokens::SNULL => result.push('\''),
3198 c if c == char_tokens::DNULL => result.push('"'),
3199 c if c == char_tokens::BNULL => result.push('\\'),
3200 _ => {
3201 let idx = c as usize;
3202 if idx < char_tokens::ZTOKENS.len() {
3203 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3204 } else {
3205 result.push(c);
3206 }
3207 }
3208 }
3209 } else {
3210 result.push(c);
3211 }
3212 }
3213 result
3214}
3215
3216pub fn untokenize(s: &str) -> String {
3217 let mut result = String::with_capacity(s.len());
3218 let chars: Vec<char> = s.chars().collect();
3219 let mut i = 0;
3220
3221 while i < chars.len() {
3222 let c = chars[i];
3223 // Token chars live in zsh's META range (0x83 = META through 0x9f =
3224 // BNULL). Anything in that range needs un-mapping before display
3225 // or downstream consumption. The original `< 32` test was wrong —
3226 // none of zsh's tokens land in that range.
3227 let cu = c as u32;
3228 if (0x83..=0x9f).contains(&cu) {
3229 // Convert token back to original character
3230 match c {
3231 c if c == char_tokens::POUND => result.push('#'),
3232 c if c == char_tokens::STRING => result.push('$'),
3233 c if c == char_tokens::HAT => result.push('^'),
3234 c if c == char_tokens::STAR => result.push('*'),
3235 c if c == char_tokens::INPAR => result.push('('),
3236 c if c == char_tokens::OUTPAR => result.push(')'),
3237 c if c == char_tokens::INPARMATH => result.push('('),
3238 c if c == char_tokens::OUTPARMATH => result.push(')'),
3239 c if c == char_tokens::QSTRING => result.push('$'),
3240 c if c == char_tokens::EQUALS => result.push('='),
3241 c if c == char_tokens::BAR => result.push('|'),
3242 c if c == char_tokens::INBRACE => result.push('{'),
3243 c if c == char_tokens::OUTBRACE => result.push('}'),
3244 c if c == char_tokens::INBRACK => result.push('['),
3245 c if c == char_tokens::OUTBRACK => result.push(']'),
3246 c if c == char_tokens::TICK => result.push('`'),
3247 c if c == char_tokens::INANG => result.push('<'),
3248 c if c == char_tokens::OUTANG => result.push('>'),
3249 c if c == char_tokens::OUTANGPROC => result.push('>'),
3250 c if c == char_tokens::QUEST => result.push('?'),
3251 c if c == char_tokens::TILDE => result.push('~'),
3252 c if c == char_tokens::QTICK => result.push('`'),
3253 c if c == char_tokens::COMMA => result.push(','),
3254 c if c == char_tokens::DASH => result.push('-'),
3255 c if c == char_tokens::BANG => result.push('!'),
3256 c if c == char_tokens::SNULL
3257 || c == char_tokens::DNULL
3258 || c == char_tokens::BNULL =>
3259 {
3260 // Null markers - skip
3261 }
3262 _ => {
3263 // Unknown token, try ztokens lookup
3264 let idx = c as usize;
3265 if idx < char_tokens::ZTOKENS.len() {
3266 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3267 } else {
3268 result.push(c);
3269 }
3270 }
3271 }
3272 } else {
3273 result.push(c);
3274 }
3275 i += 1;
3276 }
3277
3278 result
3279}
3280
/// Report whether `s` contains at least one lexer token character.
///
/// Token characters are those with a code point in `0x83..=0x9f`, the
/// same range `untokenize` maps back to plain characters. ASCII control
/// characters such as tab or newline are ordinary text and do not count.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| (0x83..=0x9f).contains(&(c as u32)))
}
3285
3286/// Convert token characters to their printable form for display
3287pub fn tokens_to_printable(s: &str) -> String {
3288 untokenize(s)
3289}
3290
#[cfg(test)]
mod tests {
    use super::*;

    /// Two plain words come back as `String` tokens carrying their text,
    /// after which the lexer reports end of input.
    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");

        for word in ["echo", "hello"] {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
            assert_eq!(lexer.tokstr.as_deref(), Some(word));
        }

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Endinput);
    }

    /// `|` between commands is reported as a `Bar` token.
    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");
        let expected = [LexTok::String, LexTok::Bar, LexTok::String, LexTok::String];

        for want in expected {
            lexer.zshlex();
            assert_eq!(lexer.tok, want);
        }
    }

    /// `>` is reported as an `Outang` token between the command and its target.
    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");
        let expected = [LexTok::String, LexTok::Outang, LexTok::String];

        for want in expected {
            lexer.zshlex();
            assert_eq!(lexer.tok, want);
        }
    }

    /// `<<` is reported as a `Dinang` token, followed by the terminator word.
    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");
        let expected = [LexTok::String, LexTok::Dinang, LexTok::String];

        for want in expected {
            lexer.zshlex();
            assert_eq!(lexer.tok, want);
        }
    }

    /// A single-quoted argument is lexed as one `String` token with text attached.
    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");

        // First the command word, then the quoted argument.
        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }

        // The quoted token should carry Snull markers around the literal
        // content; only the presence of a token string is checked here.
        assert!(lexer.tokstr.is_some());
    }

    /// `function foo { }` yields `Func`, the name, then the brace pair.
    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        // Reserved word `function`.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Func,
            "expected Func, got {:?}",
            lexer.tok
        );

        // Function name.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr.as_deref(), Some("foo"));

        // Opening brace of the body.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        // Closing brace of the body.
        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    /// A double-quoted argument containing `$name` is lexed as one `String` token.
    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");

        // First the command word, then the quoted argument.
        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }

        // The quoted token should hold tokenized content; only the presence
        // of a token string is checked here.
        assert!(lexer.tokstr.is_some());
    }

    /// `$(...)` stays inside the surrounding word rather than becoming its own token type.
    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// With `incmdpos` set, `NAME=value` is reported as `Envstring`.
    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::String);
    }

    /// With `incmdpos` set, `name=(` is reported as `Envarray`.
    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Envarray);
    }

    /// `<(...)` and `>(...)` are each folded into a `String` token.
    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");

        // In order: `diff`, `<(ls)` tokenized into the string, `>(cat)` likewise.
        for _ in 0..3 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// `$((...))` stays inside the surrounding word as a `String` token.
    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");

        for _ in 0..2 {
            lexer.zshlex();
            assert_eq!(lexer.tok, LexTok::String);
        }
    }

    /// The three case terminators `;;`, `;&` and `;|` each get a distinct token,
    /// and they appear in that order in the input.
    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        for terminator in [LexTok::Dsemi, LexTok::Semiamp, LexTok::Semibar] {
            // Skip ahead until the wanted terminator shows up; also stop at end
            // of input so a missing terminator fails the assertion below
            // instead of looping forever.
            loop {
                lexer.zshlex();
                let keep_scanning = lexer.tok != terminator && lexer.tok != LexTok::Endinput;
                if !keep_scanning {
                    break;
                }
            }
            assert_eq!(lexer.tok, terminator);
        }
    }
}