zshrs_parse/lexer.rs
1//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
2//!
3//! This lexer tokenizes zsh shell input into a stream of tokens.
4//! It handles all zsh-specific syntax including:
5//! - Single/double/dollar quotes
6//! - Command substitution $(...) and `...`
7//! - Arithmetic $((...))
8//! - Parameter expansion ${...}
9//! - Process substitution <(...) >(...)
10//! - Here documents
11//! - All redirection operators
12//! - Comments
13//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Switches that adjust how the lexer tokenizes its input.
///
/// Every switch is off in the `Default` value.
#[derive(Clone, Copy, Debug, Default)]
pub struct LexFlags {
    /// Lexing on behalf of ZLE (the line editor), e.g. for completion.
    pub zle: bool,
    /// Hand newlines back as tokens of their own.
    pub newline: bool,
    /// Keep comments in the output.
    pub comments_keep: bool,
    /// Remove comments from the output.
    pub comments_strip: bool,
    /// Active lexing (requested by bufferwords).
    pub active: bool,
}
32
/// Growable text buffer used while a token is being assembled.
///
/// `siz` tracks the logical allocation size of the C `lexbuf` this mirrors;
/// after every `add` it is strictly greater than `data.len()`.
#[derive(Debug, Clone)]
struct LexBuf {
    /// Characters collected so far.
    data: String,
    /// Logical capacity; doubled by `add` whenever the data catches up.
    siz: usize,
}

impl LexBuf {
    /// Create an empty buffer with the initial 256-byte allocation.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Drop the contents; `siz` is left unchanged.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Append one character, growing `siz` by doubling when needed.
    ///
    /// `siz` is doubled until it exceeds the byte length. A single doubling
    /// is not enough after a bulk `add_str` (or when `siz` was restored as
    /// 0), and the subsequent `siz - len` would underflow.
    fn add(&mut self, c: char) {
        self.data.push(c);
        if self.data.len() >= self.siz {
            // max(1) keeps the doubling loop from spinning on a zero size.
            let mut grown = self.siz.max(1);
            while self.data.len() >= grown {
                grown = grown.saturating_mul(2);
            }
            self.siz = grown;
            self.data.reserve(self.siz - self.data.len());
        }
    }

    /// Append a whole string slice. Does not touch `siz`; the next `add`
    /// brings it back above the length.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
    }

    /// Length of the buffered text in bytes.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrow the buffered text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consume the buffer and return the owned text.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last buffered character, if any.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        self.data.chars().next_back()
    }

    /// Remove and return the last buffered character.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }
}
87
/// One here-document attached to a redirection.
#[derive(Clone, Debug)]
pub struct HereDoc {
    /// Word that ends the body, with quoting removed.
    pub terminator: String,
    /// Set for the `<<-` form.
    pub strip_tabs: bool,
    /// Body text; empty until the body has been read.
    pub content: String,
    /// The terminator was written quoted (`<<'EOF'`, `<<"EOF"` or
    /// `<<\EOF`), which turns off variable expansion, command
    /// substitution and arithmetic in the body.
    pub quoted: bool,
    /// Set once `process_heredocs` has read the body. Kept separate from
    /// `content.is_empty()` because a here-document may legitimately have
    /// an empty body.
    pub processed: bool,
}
103
104/// The Zsh Lexer
105pub struct ZshLexer<'a> {
106 /// Input source
107 pub(crate) input: &'a str,
108 /// Current position in input
109 pub(crate) pos: usize,
110 /// Look-ahead buffer for ungotten characters
111 unget_buf: VecDeque<char>,
112 /// Current token string
113 pub tokstr: Option<String>,
114 /// Current token type
115 pub tok: LexTok,
116 /// File descriptor for redirections (e.g., 2> means fd=2)
117 pub tokfd: i32,
118 /// Line number at start of current token
119 pub toklineno: u64,
120 /// Current line number
121 pub lineno: u64,
122 /// Lexer has stopped (EOF or error)
123 pub lexstop: bool,
124 /// In command position (can accept reserved words)
125 pub incmdpos: bool,
126 /// In condition [[ ... ]]
127 pub incond: i32,
128 /// In pattern context (RHS of == != =~ in [[ ]])
129 pub incondpat: bool,
130 /// In case pattern
131 pub incasepat: i32,
132 /// In redirection
133 pub inredir: bool,
134 /// After 'for' keyword
135 pub infor: i32,
136 /// After 'repeat' keyword
137 inrepeat: i32,
138 /// Parsing typeset arguments
139 pub intypeset: bool,
140 /// Inside (( ... )) arithmetic
141 dbparens: bool,
142 /// Disable alias expansion
143 pub noaliases: bool,
144 /// Disable spelling correction
145 pub nocorrect: i32,
146 /// Disable comment recognition
147 pub nocomments: bool,
148 /// Lexer flags
149 pub lexflags: LexFlags,
150 /// Whether this is the first line
151 pub isfirstln: bool,
152 /// Whether this is the first char of command
153 #[allow(dead_code)]
154 isfirstch: bool,
155 /// Pending here-documents
156 pub heredocs: Vec<HereDoc>,
157 /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
158 heredoc_pending: u8,
159 /// Token buffer
160 lexbuf: LexBuf,
161 /// After newline
162 pub isnewlin: i32,
163 /// Error message if any
164 pub error: Option<String>,
165 /// Global iteration counter for infinite loop detection
166 global_iterations: usize,
167 /// Recursion depth counter
168 recursion_depth: usize,
169 /// Raw-input capture flag — when nonzero, every char read through
170 /// `hgetc` is also appended to `tokstr_raw` via zshlex_raw_add.
171 /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
172 /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
173 /// command substitutions for re-execution / display.
174 pub lex_add_raw: i32,
175 /// Raw-input capture buffer. Direct mirror of lex.c:165
176 /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
177 /// `LexBuf` here since Rust's String tracks both the data and
178 /// length internally.
179 lexbuf_raw: LexBuf,
180}
181
/// Deepest recursion `check_recursion` tolerates before reporting an error.
const MAX_LEXER_RECURSION: usize = 200;
183
/// What `AliasResolver::lookup_alias` / `lookup_suffix_alias` report about
/// one alias. Carries the `struct alias` fields zsh consults at
/// lex.c:1914-1943.
#[derive(Clone, Debug)]
pub struct AliasInfo {
    /// Replacement body.
    pub text: String,
    /// Recursion guard: the alias is currently being expanded.
    pub in_use: bool,
    /// Global alias (as opposed to command-position-only).
    pub global: bool,
}
194
195/// Trait the lexer uses to look up aliases and reserved words during
196/// `exalias`. Implementors typically delegate to the executor's
197/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
198/// free of executor-specific types — same pattern zsh uses with the
199/// hashtable.h opaque-handle approach against aliastab/reswdtab/
200/// sufaliastab.
201pub trait AliasResolver {
202 /// Look up an alias by name. Returns `None` if not found, or the
203 /// alias body + flags otherwise.
204 fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
205 /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
206 fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
207 /// Resolve a reserved word. Returns the LexTok the word should
208 /// promote to (e.g. "if" → IF), or None if not a reswd.
209 fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
210 /// Mark an alias as in-use (recursion guard). Called when an
211 /// alias is about to be expanded; the matching unmark happens
212 /// when the alias text has been fully consumed by the lexer.
213 fn mark_in_use(&mut self, name: &str, in_use: bool);
214}
215
216/// Saved lexical state for nested-context handling. Direct port of
217/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
218/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
219/// (`lex_context_restore`). Used when entering command substitution,
220/// here-docs, or eval where the outer lexer state must be pushed and
221/// restored after the inner parse completes.
222#[derive(Debug, Clone)]
223pub struct LexStack {
224 pub dbparens: bool,
225 pub isfirstln: bool,
226 pub isfirstch: bool,
227 pub lexflags: LexFlags,
228 pub tok: LexTok,
229 pub tokstr: Option<String>,
230 pub lexbuf_data: String,
231 pub lexbuf_siz: usize,
232 pub lexstop: bool,
233 pub toklineno: u64,
234}
235
236impl Default for LexStack {
237 fn default() -> Self {
238 // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
239 // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
240 // ENDINPUT (the C source doesn't explicitly reset tok here but
241 // the natural baseline is ENDINPUT — same as lexinit).
242 LexStack {
243 dbparens: false,
244 isfirstln: false,
245 isfirstch: false,
246 lexflags: LexFlags::default(),
247 tok: LexTok::Endinput,
248 tokstr: None,
249 lexbuf_data: String::new(),
250 lexbuf_siz: 256,
251 lexstop: false,
252 toklineno: 0,
253 }
254 }
255}
256
257impl<'a> ZshLexer<'a> {
258 /// Create a new lexer for the given input
259 pub fn new(input: &'a str) -> Self {
260 ZshLexer {
261 input,
262 pos: 0,
263 unget_buf: VecDeque::new(),
264 tokstr: None,
265 tok: LexTok::Endinput,
266 tokfd: -1,
267 toklineno: 1,
268 lineno: 1,
269 lexstop: false,
270 incmdpos: true,
271 incond: 0,
272 incondpat: false,
273 incasepat: 0,
274 inredir: false,
275 infor: 0,
276 inrepeat: 0,
277 intypeset: false,
278 dbparens: false,
279 noaliases: false,
280 nocorrect: 0,
281 nocomments: false,
282 lexflags: LexFlags::default(),
283 isfirstln: true,
284 isfirstch: true,
285 heredocs: Vec::new(),
286 heredoc_pending: 0,
287 lexbuf: LexBuf::new(),
288 isnewlin: 0,
289 error: None,
290 global_iterations: 0,
291 recursion_depth: 0,
292 lex_add_raw: 0,
293 lexbuf_raw: LexBuf::new(),
294 }
295 }
296
297 /// Append a char to the raw-input capture buffer. Direct port of
298 /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
299 /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
300 /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
301 pub fn zshlex_raw_add(&mut self, c: char) {
302 // lex.c:2027-2028 — guard on lex_add_raw flag.
303 if self.lex_add_raw == 0 {
304 return;
305 }
306 // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
307 // explicit ptr/len/siz with hrealloc; Rust's String handles
308 // resize automatically.
309 self.lexbuf_raw.add(c);
310 }
311
312 /// Run alias / reserved-word expansion on the just-lexed token.
313 /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
314 /// if an alias was injected (the caller's loop should re-run
315 /// gettok to consume the injected text).
316 ///
317 /// C source flow:
318 /// 1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
319 /// 2. If tokstr is None: set lextext from `tokstrings[tok]` and
320 /// checkalias against that (lex.c:1964-1969).
321 /// 3. Otherwise: untokenize tokstr into a working copy (lex.c:
322 /// 1971-1980).
323 /// 4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
324 /// (lex.c:1982-1991).
325 /// 5. STRING tokens: try checkalias, then reservation lookup
326 /// (lex.c:1993-2015).
327 /// 6. Clear inalmore (lex.c:2016).
328 ///
329 /// Takes an `AliasResolver` trait object so the lexer doesn't
330 /// hard-depend on the executor's alias-table types. zshrs callers
331 /// implement `AliasResolver` over their alias hash tables.
332 pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
333 // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
334 // history layer doesn't track per-word boundaries here; no-op.
335
336 // lex.c:1958-1962 — spell correction via spckword. zshrs
337 // doesn't implement spell correction yet; documented divergence.
338
339 // lex.c:1964-1969 — bare-token path (no tokstr).
340 if self.tokstr.is_none() {
341 // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
342 // like SEMI/AMPER/etc. the canonical text comes from a
343 // static table. zshrs's check_alias_for_text uses the
344 // resolver directly with the token's text representation.
345 if self.tok == LexTok::Newlin {
346 return false;
347 }
348 // Use punctuation-token text; unknown tokens skip alias.
349 let text = match self.tok {
350 LexTok::Semi => ";",
351 LexTok::Amper => "&",
352 LexTok::Bar => "|",
353 _ => return false,
354 };
355 return self.check_alias(resolver, text);
356 }
357
358 let tokstr = self.tokstr.clone().unwrap();
359 // lex.c:1973-1980 — untokenize: convert the lexer's internal
360 // tokenized form (Pound..ztokens shifts) into the literal
361 // shell text. Call the global helper.
362 let lextext = if has_token(&tokstr) {
363 untokenize(&tokstr)
364 } else {
365 tokstr.clone()
366 };
367
368 // lex.c:1982-1991 — ZLE word-tracking for completion.
369 if self.lexflags.zle {
370 let zp = self.lexflags;
371 self.gotword();
372 // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
373 // word has been reached; abort exalias so completion can
374 // capture the partial token unchanged.
375 if zp.zle && !self.lexflags.zle {
376 return false;
377 }
378 }
379
380 // lex.c:1993-2015 — STRING-token alias / reswd check.
381 if self.tok == LexTok::String {
382 // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
383 // here (zshrs doesn't have the option flag wired).
384 if self.check_alias(resolver, &lextext) {
385 return true;
386 }
387
388 // lex.c:2002-2009 — reserved-word lookup. Fires when in
389 // command position OR when the text is bare `}` and
390 // IGNOREBRACES is unset (so `}` ends a brace block).
391 if self.incmdpos || lextext == "}" {
392 if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
393 self.tok = rwtok;
394 if rwtok == LexTok::Repeat {
395 self.inrepeat = 1;
396 }
397 if rwtok == LexTok::Dinbrack {
398 self.incond = 1;
399 }
400 }
401 } else if self.incond > 0 && lextext == "]]" {
402 // lex.c:2010-2012 — `]]` closes the cond expression.
403 self.tok = LexTok::Doutbrack;
404 self.incond = 0;
405 } else if self.incond == 1 && lextext == "!" {
406 // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
407 // negation, not a literal.
408 self.tok = LexTok::Bang;
409 }
410 }
411
412 // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
413 // any non-alias token.
414 // (zshrs's lexer doesn't have inalmore yet — added here would
415 // require gettok to track when an alias-pushed token has more
416 // text after it. Documented divergence.)
417
418 false
419 }
420
421 /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
422 /// `checkalias`. Returns true if the lookup matched (regular or
423 /// suffix alias) AND the alias text was successfully injected
424 /// back into the input stream for re-lexing.
425 fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
426 // lex.c:1906-1907 — guard on null lextext.
427 if lextext.is_empty() {
428 return false;
429 }
430
431 // lex.c:1909-1911 — guard: alias expansion is disabled, or
432 // POSIX aliases require the token to be a STRING and not a
433 // reserved word.
434 if self.noaliases {
435 return false;
436 }
437
438 // lex.c:1914-1933 — regular alias lookup.
439 if let Some(alias) = resolver.lookup_alias(lextext) {
440 if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
441 // lex.c:1918-1927 — if the next char isn't blank,
442 // insert a space so the alias body can't accidentally
443 // join the following word.
444 if !self.lexstop {
445 if let Some(c) = self.peek() {
446 if !Self::is_blank(c) {
447 self.inject_alias_text(" ");
448 }
449 }
450 }
451 // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
452 self.inject_alias_text(&alias.text);
453 resolver.mark_in_use(lextext, true);
454 self.lexstop = false;
455 return true;
456 }
457 }
458
459 // lex.c:1934-1943 — suffix-alias lookup. The token must end
460 // with `.SUFFIX`, the suffix name must be a registered
461 // suffix-alias, AND the lexer must be in command position.
462 if self.incmdpos {
463 if let Some(dot_pos) = lextext.rfind('.') {
464 if dot_pos > 0 && dot_pos + 1 < lextext.len() {
465 let suffix = &lextext[dot_pos + 1..];
466 if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
467 if !alias.in_use {
468 // lex.c:1938-1940 — push three things in
469 // reverse: the alias text, a space, then
470 // the original word.
471 self.inject_alias_text(&alias.text);
472 self.inject_alias_text(" ");
473 self.inject_alias_text(lextext);
474 resolver.mark_in_use(suffix, true);
475 self.lexstop = false;
476 return true;
477 }
478 }
479 }
480 }
481 }
482
483 false
484 }
485
486 /// Push alias text back into the input stream so the lexer
487 /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
488 /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
489 /// (a VecDeque<char>) to inject chars in reverse order so the
490 /// next hgetc consumes them first.
491 fn inject_alias_text(&mut self, text: &str) {
492 // Insert at front in reverse so the first char of `text`
493 // comes out first.
494 for c in text.chars().rev() {
495 self.unget_buf.push_front(c);
496 }
497 }
498
499 /// Pop the last char from the raw-input capture buffer. Direct
500 /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
501 /// the lexer ungets a char that was just captured raw — the raw
502 /// buffer must mirror the live input so this undoes the last add.
503 pub fn zshlex_raw_back(&mut self) {
504 // lex.c:2045-2046 — guard.
505 if self.lex_add_raw == 0 {
506 return;
507 }
508 // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
509 self.lexbuf_raw.pop();
510 }
511
512 /// Mark the current raw-buffer offset (for restore later). Direct
513 /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
514 /// `len + offset` so callers can restore via `back_to_mark`.
515 pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
516 // lex.c:2055-2056 — guard.
517 if self.lex_add_raw == 0 {
518 return 0;
519 }
520 // lex.c:2057 — `return lexbuf_raw.len + offset;`
521 (self.lexbuf_raw.len() as i64) + offset
522 }
523
524 /// Restore raw-buffer offset to a previously-saved mark. Direct
525 /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
526 /// Truncates the raw buffer to `mark` bytes — undoes any captures
527 /// since the mark was taken (used when a speculative parse fails
528 /// and the lexer rolls back).
529 pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
530 // lex.c:2064-2065 — guard.
531 if self.lex_add_raw == 0 {
532 return;
533 }
534 // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
535 // lexbuf_raw.len = mark;` — Rust truncate handles both.
536 let m = mark.max(0) as usize;
537 self.lexbuf_raw.data.truncate(m);
538 }
539
540 /// Take the captured raw-input buffer, clearing it. Useful for
541 /// callers that need the literal command-sub body after lexing
542 /// (e.g. compile-time string capture for `$(...)`).
543 pub fn take_raw_buf(&mut self) -> String {
544 std::mem::take(&mut self.lexbuf_raw.data)
545 }
546
547 /// Save lexical context onto a `LexStack`. Direct port of
548 /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
549 /// is in a clean state suitable for parsing a nested input (command
550 /// substitution body, here-doc terminator, eval'd string).
551 pub fn lex_context_save(&mut self, ls: &mut LexStack) {
552 // lex.c:220-233 — copy live state into the stack.
553 ls.dbparens = self.dbparens;
554 ls.isfirstln = self.isfirstln;
555 ls.isfirstch = self.isfirstch;
556 ls.lexflags = self.lexflags;
557 ls.tok = self.tok;
558 ls.tokstr = self.tokstr.take();
559 ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
560 ls.lexbuf_siz = self.lexbuf.siz;
561 ls.lexstop = self.lexstop;
562 ls.toklineno = self.toklineno;
563
564 // lex.c:235-238 — reset live state to defaults so a nested
565 // parse starts from a clean slate. tokstr/lexbuf are zeroed,
566 // lexbuf.siz reset to 256 (the C-source initial alloc).
567 self.tokstr = None;
568 self.lexbuf.data.clear();
569 self.lexbuf.siz = 256;
570 }
571
572 /// Restore lexical context from a `LexStack`. Direct port of
573 /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
574 /// `lex_context_save`. Called after the nested parse completes.
575 pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
576 // lex.c:249-261 — copy stack state back into live fields.
577 self.dbparens = ls.dbparens;
578 self.isfirstln = ls.isfirstln;
579 self.isfirstch = ls.isfirstch;
580 self.lexflags = ls.lexflags;
581 self.tok = ls.tok;
582 self.tokstr = ls.tokstr.take();
583 self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
584 self.lexbuf.siz = ls.lexbuf_siz;
585 self.lexstop = ls.lexstop;
586 self.toklineno = ls.toklineno;
587 }
588
589 /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
590 /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
591 /// to ENDINPUT so the next gettok starts from a known baseline.
592 /// Note: the constructor `Self::new` already sets equivalent
593 /// defaults; this method exists for the rare case a caller wants
594 /// to recycle a `ZshLexer` across multiple input strings.
595 pub fn lexinit(&mut self) {
596 // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
597 self.nocorrect = 0;
598 self.dbparens = false;
599 self.lexstop = false;
600 // lex.c:444 — `tok = ENDINPUT;`
601 self.tok = LexTok::Endinput;
602 }
603
604 /// Check recursion depth; returns true if exceeded
605 #[inline]
606 fn check_recursion(&mut self) -> bool {
607 if self.recursion_depth > MAX_LEXER_RECURSION {
608 self.error = Some("lexer exceeded max recursion depth".to_string());
609 self.lexstop = true;
610 true
611 } else {
612 false
613 }
614 }
615
616 /// Check and increment global iteration counter; returns true if limit exceeded
617 #[inline]
618 fn check_iterations(&mut self) -> bool {
619 self.global_iterations += 1;
620 if self.global_iterations > 50_000 {
621 self.error = Some("lexer exceeded 50K iterations".to_string());
622 self.lexstop = true;
623 self.tok = LexTok::Lexerr;
624 true
625 } else {
626 false
627 }
628 }
629
630 /// Get next character from input
631 fn hgetc(&mut self) -> Option<char> {
632 if self.check_iterations() {
633 return None;
634 }
635
636 // Re-read from unget_buf: increment lineno on `\n` HERE
637 // too. hungetc() decremented lineno when the char was put
638 // back; without a matching increment on the way out, every
639 // `\n` that's ungetted-then-reread leaves lineno
640 // permanently one short. Symptom: $LINENO stuck at 1 in
641 // every script statement because the parser ungets the
642 // separating newline once between statements.
643 if let Some(c) = self.unget_buf.pop_front() {
644 if c == '\n' {
645 self.lineno += 1;
646 }
647 return Some(c);
648 }
649
650 let c = self.input[self.pos..].chars().next()?;
651 self.pos += c.len_utf8();
652
653 if c == '\n' {
654 self.lineno += 1;
655 }
656
657 Some(c)
658 }
659
660 /// Put character back into input
661 fn hungetc(&mut self, c: char) {
662 self.unget_buf.push_front(c);
663 if c == '\n' && self.lineno > 1 {
664 self.lineno -= 1;
665 }
666 self.lexstop = false;
667 }
668
669 /// Peek at next character without consuming
670 #[allow(dead_code)]
671 fn peek(&mut self) -> Option<char> {
672 if let Some(&c) = self.unget_buf.front() {
673 return Some(c);
674 }
675 self.input[self.pos..].chars().next()
676 }
677
678 /// Add character to token buffer
679 fn add(&mut self, c: char) {
680 self.lexbuf.add(c);
681 }
682
683 /// Check if character is blank (space or tab)
684 fn is_blank(c: char) -> bool {
685 c == ' ' || c == '\t'
686 }
687
688 /// Peek for a zsh numeric range glob shape after a `<`: returns the
689 /// captured `N*-M*>` (everything *after* the leading `<`) when the
690 /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
691 /// None and leaves the input untouched.
692 fn try_numeric_range_glob(&mut self) -> Option<String> {
693 let mut buf: Vec<char> = Vec::new();
694 // optional leading digits
695 loop {
696 match self.hgetc() {
697 Some(c) if c.is_ascii_digit() => buf.push(c),
698 Some(c) => {
699 buf.push(c);
700 break;
701 }
702 None => break,
703 }
704 }
705 // last char in buf must be '-' for the range form
706 if buf.last() != Some(&'-') {
707 for c in buf.iter().rev() {
708 self.hungetc(*c);
709 }
710 return None;
711 }
712 // optional trailing digits
713 loop {
714 match self.hgetc() {
715 Some(c) if c.is_ascii_digit() => buf.push(c),
716 Some(c) => {
717 buf.push(c);
718 break;
719 }
720 None => break,
721 }
722 }
723 if buf.last() != Some(&'>') {
724 for c in buf.iter().rev() {
725 self.hungetc(*c);
726 }
727 return None;
728 }
729 Some(buf.into_iter().collect())
730 }
731
732 /// Check if character is blank (including other whitespace except newline)
733 fn is_inblank(c: char) -> bool {
734 matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
735 }
736
737 /// Check if character is a digit
738 fn is_digit(c: char) -> bool {
739 c.is_ascii_digit()
740 }
741
742 /// Check if character is identifier start
743 #[allow(dead_code)]
744 fn is_ident_start(c: char) -> bool {
745 c.is_ascii_alphabetic() || c == '_'
746 }
747
748 /// Check if character is identifier continuation
749 fn is_ident(c: char) -> bool {
750 c.is_ascii_alphanumeric() || c == '_'
751 }
752
753 /// Main lexer entry point — fetch the next token. Direct port of
754 /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
755 /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
756 /// followed by here-doc draining (lex.c:278-306), newline tracking
757 /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
758 ///
759 /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
760 /// the loop. The C source iterates as long as exalias keeps
761 /// re-injecting alias text into the input buffer; zshrs's alias
762 /// expansion happens post-lex in exec.rs. The loop body therefore
763 /// runs once and breaks unconditionally — documented divergence.
764 pub fn zshlex(&mut self) {
765 // lex.c:268-269 — early-out on prior LEXERR.
766 if self.tok == LexTok::Lexerr {
767 return;
768 }
769
770 // Note: Do NOT reset global_iterations here - it must accumulate across all
771 // zshlex calls in a parse to prevent infinite loops in the parser
772
773 // lex.c:270-276 — gettok / exalias loop. Without exalias wired,
774 // the inner body runs once and we `break` unconditionally.
775 loop {
776 // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
777 // detection.
778 if self.inrepeat > 0 {
779 self.inrepeat += 1;
780 }
781 // lex.c:273-274 — at the third token after `repeat`,
782 // SHORTLOOPS / SHORTREPEAT options force back into cmd
783 // position so the loop body can start. zshrs unconditionally
784 // does this since the option-lookup lives in exec.rs.
785 if self.inrepeat == 3 {
786 self.incmdpos = true;
787 }
788
789 // lex.c:275 — `tok = gettok();`
790 self.tok = self.gettok();
791
792 // lex.c:276 — `while (tok != ENDINPUT && exalias())` —
793 // when exalias re-injects alias text it returns true and
794 // the loop iterates. Without exalias wired, we break.
795 break;
796 }
797
798 // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
799 // so the persistent low bit survives but the per-word bit is
800 // dropped.
801 self.nocorrect &= 1;
802
803 // lex.c:278-306 — drain pending here-documents at the start
804 // of a new line. zshrs's process_heredocs reads the full body
805 // and stitches it onto the matching redir token.
806 if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
807 self.process_heredocs();
808 }
809
810 // lex.c:307-310 — track whether we just saw a newline.
811 // C uses `inbufct` to distinguish "newline at EOF" (=1)
812 // from "newline mid-input" (=-1); zshrs reads `pos < len`.
813 if self.tok != LexTok::Newlin {
814 self.isnewlin = 0;
815 } else {
816 self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
817 }
818
819 // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
820 // LEXFLAGS_NEWLINE is set to preserve newlines (used by
821 // ZLE for completion of partial lines).
822 if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
823 self.tok = LexTok::Seper;
824 }
825
826 // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
827 // - `{` only promotes to INBRACE in command position
828 // - `}` promotes to OUTBRACE either in cmdpos OR via the
829 // special `closing-brace-special` rule (IGNOREBRACES unset
830 // — assumed since zshrs doesn't expose that option yet)
831 // - other reserved words: only when incmdpos (or `}` exception)
832 if self.tok == LexTok::String {
833 if let Some(ref s) = self.tokstr {
834 if s == "{" && self.incmdpos {
835 self.tok = LexTok::Inbrace;
836 } else if s == "}" {
837 self.tok = LexTok::Outbrace;
838 } else if self.incasepat == 0 {
839 // Skip reserved word checking in case pattern context —
840 // words like `time`, `end` should be patterns, not
841 // keywords.
842 self.check_reserved_word();
843 }
844 }
845 }
846
847 // If we were expecting a heredoc terminator, register it now
848 if self.heredoc_pending > 0 && self.tok == LexTok::String {
849 if let Some(ref terminator) = self.tokstr {
850 let strip_tabs = self.heredoc_pending == 2;
851 // Detect originally-quoted terminator (`<<'EOF'`,
852 // `<<"EOF"`). The lexer wraps single-quoted text in
853 // SNULL (`\u{9d}`) and double-quoted text in DNULL
854 // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
855 // heredocs disable variable / command-sub / arithmetic
856 // expansion in the body — see `compile_redir` for the
857 // expansion side.
858 // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
859 // disable expansion in the body. SNULL/DNULL mark
860 // single/double-quoted spans; BNULL (`\u{9f}`) marks
861 // any backslash-escaped char — its presence alone is
862 // enough to flag the terminator as quoted (zsh's
863 // `<<\EOF` shorthand for `<<'EOF'`).
864 let quoted = terminator.contains('\u{9d}')
865 || terminator.contains('\u{9e}')
866 || terminator.contains('\u{9f}')
867 || terminator.starts_with('\'')
868 || terminator.starts_with('"');
869 let term = terminator
870 .chars()
871 .filter(|c| {
872 *c != '\''
873 && *c != '"'
874 && *c != '\u{9d}'
875 && *c != '\u{9e}'
876 && *c != '\u{9f}'
877 })
878 .collect::<String>();
879 self.heredocs.push(HereDoc {
880 terminator: term,
881 strip_tabs,
882 content: String::new(),
883 quoted,
884 processed: false,
885 });
886 }
887 self.heredoc_pending = 0;
888 }
889
890 // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
891 if self.incond > 0 {
892 if let Some(ref s) = self.tokstr {
893 // Check if this token is a comparison operator
894 // Note: single = is also a comparison operator in [[ ]]
895 // The internal marker \u{8d} is used for =
896 if s == "="
897 || s == "=="
898 || s == "!="
899 || s == "=~"
900 || s == "\u{8d}"
901 || s == "\u{8d}\u{8d}"
902 || s == "!\u{8d}"
903 || s == "\u{8d}~"
904 || s == "\u{8d}\u{98}"
905 {
906 self.incondpat = true;
907 } else if self.incondpat {
908 // We were in pattern context, now we've consumed the pattern
909 // Reset after the pattern token is consumed
910 // But actually, pattern can span multiple tokens, so we should
911 // stay in pattern mode until ]] or && or ||
912 }
913 }
914 // Reset pattern context on ]] or logical operators (&&, ||)
915 // and grouping parens. zsh par_cond_3 (cond.c) treats
916 // these as cond-pattern terminators — the next operand is
917 // a fresh primary, NOT a continuation of the prior pattern.
918 // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
919 // after `[[ a == a && (b == b ... ` was lexed as a literal
920 // glob char (incondpat=true → gettokstr) and the whole
921 // remainder collapsed into one String token.
922 match self.tok {
923 LexTok::Doutbrack
924 | LexTok::Damper
925 | LexTok::Dbar
926 | LexTok::Inpar
927 | LexTok::Outpar
928 | LexTok::Bang => {
929 self.incondpat = false;
930 }
931 _ => {}
932 }
933 } else {
934 self.incondpat = false;
935 }
936
937 // Update command position for next token based on current token
938 // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
939 // so we don't set incmdpos after Bar in that context
940 match self.tok {
941 LexTok::Seper
942 | LexTok::Newlin
943 | LexTok::Semi
944 | LexTok::Dsemi
945 | LexTok::Semiamp
946 | LexTok::Semibar
947 | LexTok::Amper
948 | LexTok::Amperbang
949 | LexTok::Inpar
950 | LexTok::Inbrace
951 | LexTok::Dbar
952 | LexTok::Damper
953 | LexTok::Baramp
954 | LexTok::Inoutpar
955 | LexTok::Doloop
956 | LexTok::Then
957 | LexTok::Elif
958 | LexTok::Else
959 | LexTok::Doutbrack
960 | LexTok::Func => {
961 self.incmdpos = true;
962 }
963 LexTok::Bar
964 // In case patterns, | is a pattern separator - don't change incmdpos
965 if self.incasepat <= 0 => {
966 self.incmdpos = true;
967 }
968 LexTok::String
969 | LexTok::Typeset
970 | LexTok::Envarray
971 | LexTok::Outpar
972 | LexTok::Case
973 | LexTok::Dinbrack => {
974 self.incmdpos = false;
975 }
976 _ => {}
977 }
978
979 // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
980 // When we see 'for', set infor=2 to expect the init and cond parts
981 // Each Dinpar (after semicolon in arithmetic) decrements it
982 if self.tok != LexTok::Dinpar {
983 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
984 }
985
986 // Handle redirection context
987 let oldpos = self.incmdpos;
988 if self.tok.is_redirop()
989 || self.tok == LexTok::For
990 || self.tok == LexTok::Foreach
991 || self.tok == LexTok::Select
992 {
993 self.inredir = true;
994 self.incmdpos = false;
995 } else if self.inredir {
996 self.incmdpos = oldpos;
997 self.inredir = false;
998 }
999 }
1000
1001 /// Process pending here-documents. Walks each heredoc whose body
1002 /// hasn't been filled yet (content is empty AND terminator is set),
1003 /// reads lines from input until the terminator, and stuffs the body
1004 /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1005 /// parser can index into it after parse() finishes.
1006 fn process_heredocs(&mut self) {
1007 let n = self.heredocs.len();
1008 for i in 0..n {
1009 // Skip heredocs we've already processed AND those without
1010 // a terminator (early-error case). The `processed` bool
1011 // distinguishes "filled with empty body" from "not yet
1012 // visited" — both have empty `content`.
1013 if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1014 continue;
1015 }
1016 let strip_tabs = self.heredocs[i].strip_tabs;
1017 let terminator = self.heredocs[i].terminator.clone();
1018 let mut content = String::new();
1019 let mut line_count = 0;
1020
1021 loop {
1022 line_count += 1;
1023 if line_count > 10000 {
1024 self.error = Some("heredoc exceeded 10000 lines".to_string());
1025 self.tok = LexTok::Lexerr;
1026 return;
1027 }
1028
1029 let line = self.read_line();
1030 if line.is_none() {
1031 self.error = Some("here document too large or unterminated".to_string());
1032 self.tok = LexTok::Lexerr;
1033 return;
1034 }
1035
1036 let line = line.unwrap();
1037 let check_line = if strip_tabs {
1038 line.trim_start_matches('\t')
1039 } else {
1040 line.as_str()
1041 };
1042
1043 if check_line.trim_end_matches('\n') == terminator {
1044 break;
1045 }
1046
1047 // `<<-` strips leading tabs from BODY lines too, not just
1048 // from terminator-match comparison. Without this, tabs in
1049 // here-doc content survive into stdin.
1050 if strip_tabs {
1051 content.push_str(check_line);
1052 } else {
1053 content.push_str(&line);
1054 }
1055 }
1056
1057 self.heredocs[i].content = content;
1058 self.heredocs[i].processed = true;
1059 }
1060 }
1061
1062 /// Read a line from input (returns partial line at EOF)
1063 fn read_line(&mut self) -> Option<String> {
1064 let mut line = String::new();
1065
1066 loop {
1067 match self.hgetc() {
1068 Some(c) => {
1069 line.push(c);
1070 if c == '\n' {
1071 break;
1072 }
1073 }
1074 None => {
1075 // EOF - return partial line if any
1076 if line.is_empty() {
1077 return None;
1078 }
1079 break;
1080 }
1081 }
1082 }
1083
1084 Some(line)
1085 }
1086
1087 /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1088 /// `gettok`. Reads characters from the input via hgetc, dispatches
1089 /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1090 /// uses inline `match` in lex_initial / lex_inang / lex_outang
1091 /// since Rust pattern-matching subsumes the table dispatch).
1092 ///
1093 /// Structural divergence from C: the giant ~322-line C switch
1094 /// statement at lex.c:725-936 is split into helper methods in
1095 /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1096 /// lex_inang / lex_outang for the < and > arms). The flow is
1097 /// equivalent — same chars consumed, same tokens emitted — but
1098 /// the source-level layout differs. C's table-driven dispatch
1099 /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1100 /// which is what the helpers ultimately do.
1101 fn gettok(&mut self) -> LexTok {
1102 // lex.c:621 — `tokstr = NULL;` reset before each token.
1103 self.tokstr = None;
1104 // (zshrs-specific: tokfd reset lives here too — C does it
1105 // implicitly via the `peekfd = -1` local at lex.c:617 used
1106 // only when a digit-prefix redirection is detected.)
1107 self.tokfd = -1;
1108
1109 // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1110 // skip leading blanks (space/tab, NOT newline).
1111 let mut ws_iterations = 0;
1112 loop {
1113 ws_iterations += 1;
1114 if ws_iterations > 100_000 {
1115 self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1116 return LexTok::Lexerr;
1117 }
1118 let c = match self.hgetc() {
1119 Some(c) => c,
1120 None => {
1121 // lex.c:624-625 — lexstop set, return ENDINPUT
1122 // (or LEXERR if errflag is set elsewhere).
1123 self.lexstop = true;
1124 return if self.error.is_some() {
1125 LexTok::Lexerr
1126 } else {
1127 LexTok::Endinput
1128 };
1129 }
1130 };
1131
1132 if !Self::is_blank(c) {
1133 self.hungetc(c);
1134 break;
1135 }
1136 }
1137
1138 let c = match self.hgetc() {
1139 Some(c) => c,
1140 None => {
1141 self.lexstop = true;
1142 return LexTok::Endinput;
1143 }
1144 };
1145
1146 // lex.c:623 — `toklineno = lineno;`
1147 self.toklineno = self.lineno;
1148 // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1149 // blank.
1150 self.isfirstln = false;
1151
1152 // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1153 // call dquote_parse with `;` or `)` as the end-char and
1154 // either return DINPAR (continue for-loop arith) or DOUTPAR
1155 // (close the arith block) or LEXERR.
1156 if self.dbparens {
1157 return self.lex_arith(c);
1158 }
1159
1160 // lex.c:649-668 — digit prefix on a redirection: `2> file`
1161 // treats `2` as the fd to redirect, not a literal arg. Three
1162 // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1163 // anything else (push back, treat as literal digit).
1164 if Self::is_digit(c) {
1165 let d = self.hgetc();
1166 match d {
1167 Some('&') => {
1168 let e = self.hgetc();
1169 if e == Some('>') {
1170 // lex.c:653-657 — `N&>` shape detected.
1171 self.tokfd = (c as u8 - b'0') as i32;
1172 self.hungetc('>');
1173 return self.lex_initial('&');
1174 }
1175 // lex.c:658-661 — not `N&>`, push everything back.
1176 if let Some(e) = e {
1177 self.hungetc(e);
1178 }
1179 self.hungetc('&');
1180 }
1181 Some('>') | Some('<') => {
1182 // lex.c:662-664 — `N>` or `N<` shape detected.
1183 self.tokfd = (c as u8 - b'0') as i32;
1184 return self.lex_initial(d.unwrap());
1185 }
1186 Some(d) => {
1187 // lex.c:665-668 — not a redir prefix, push back.
1188 self.hungetc(d);
1189 }
1190 None => {}
1191 }
1192 self.lexstop = false;
1193 }
1194
1195 // lex.c:670-936 — main dispatch on the leading char. zshrs
1196 // delegates to lex_initial which holds the equivalent of
1197 // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1198 // for LX1_OTHER.
1199 self.lex_initial(c)
1200 }
1201
1202 /// Lex (( ... )) arithmetic expression
1203 fn lex_arith(&mut self, c: char) -> LexTok {
1204 self.lexbuf.clear();
1205 self.hungetc(c);
1206
1207 let end_char = if self.infor > 0 { ';' } else { ')' };
1208 if self.dquote_parse(end_char, false).is_err() {
1209 return LexTok::Lexerr;
1210 }
1211
1212 self.tokstr = Some(self.lexbuf.as_str().to_string());
1213
1214 if !self.lexstop && self.infor > 0 {
1215 self.infor -= 1;
1216 return LexTok::Dinpar;
1217 }
1218
1219 // Check for closing ))
1220 match self.hgetc() {
1221 Some(')') => {
1222 self.dbparens = false;
1223 LexTok::Doutpar
1224 }
1225 c => {
1226 if let Some(c) = c {
1227 self.hungetc(c);
1228 }
1229 LexTok::Lexerr
1230 }
1231 }
1232 }
1233
1234 /// Handle initial character of token
1235 fn lex_initial(&mut self, c: char) -> LexTok {
1236 // Handle comments
1237 if c == '#' && !self.nocomments {
1238 return self.lex_comment();
1239 }
1240
1241 match c {
1242 '\\' => {
1243 let d = self.hgetc();
1244 if d == Some('\n') {
1245 // Line continuation - get next token
1246 return self.gettok();
1247 }
1248 if let Some(d) = d {
1249 self.hungetc(d);
1250 }
1251 self.lexstop = false;
1252 self.gettokstr(c, false)
1253 }
1254
1255 '\n' => LexTok::Newlin,
1256
1257 ';' => {
1258 let d = self.hgetc();
1259 match d {
1260 Some(';') => LexTok::Dsemi,
1261 Some('&') => LexTok::Semiamp,
1262 Some('|') => LexTok::Semibar,
1263 _ => {
1264 if let Some(d) = d {
1265 self.hungetc(d);
1266 }
1267 self.lexstop = false;
1268 LexTok::Semi
1269 }
1270 }
1271 }
1272
1273 '&' => {
1274 let d = self.hgetc();
1275 match d {
1276 Some('&') => LexTok::Damper,
1277 Some('!') | Some('|') => LexTok::Amperbang,
1278 Some('>') => {
1279 self.tokfd = self.tokfd.max(0);
1280 let e = self.hgetc();
1281 match e {
1282 Some('!') | Some('|') => LexTok::Outangampbang,
1283 Some('>') => {
1284 let f = self.hgetc();
1285 match f {
1286 Some('!') | Some('|') => LexTok::Doutangampbang,
1287 _ => {
1288 if let Some(f) = f {
1289 self.hungetc(f);
1290 }
1291 self.lexstop = false;
1292 LexTok::Doutangamp
1293 }
1294 }
1295 }
1296 _ => {
1297 if let Some(e) = e {
1298 self.hungetc(e);
1299 }
1300 self.lexstop = false;
1301 LexTok::Ampoutang
1302 }
1303 }
1304 }
1305 _ => {
1306 if let Some(d) = d {
1307 self.hungetc(d);
1308 }
1309 self.lexstop = false;
1310 LexTok::Amper
1311 }
1312 }
1313 }
1314
1315 '|' => {
1316 let d = self.hgetc();
1317 match d {
1318 Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1319 Some('&') => LexTok::Baramp,
1320 _ => {
1321 if let Some(d) = d {
1322 self.hungetc(d);
1323 }
1324 self.lexstop = false;
1325 LexTok::Bar
1326 }
1327 }
1328 }
1329
1330 '(' => {
1331 let d = self.hgetc();
1332 match d {
1333 Some('(') => {
1334 if self.infor > 0 {
1335 self.dbparens = true;
1336 return LexTok::Dinpar;
1337 }
1338 if self.incmdpos {
1339 // Could be (( arithmetic )) or ( subshell )
1340 self.lexbuf.clear();
1341 match self.cmd_or_math() {
1342 CmdOrMath::Math => {
1343 self.tokstr = Some(self.lexbuf.as_str().to_string());
1344 return LexTok::Dinpar;
1345 }
1346 CmdOrMath::Cmd => {
1347 self.tokstr = None;
1348 return LexTok::Inpar;
1349 }
1350 CmdOrMath::Err => return LexTok::Lexerr,
1351 }
1352 }
1353 self.hungetc('(');
1354 self.lexstop = false;
1355 self.gettokstr('(', false)
1356 }
1357 Some(')') => LexTok::Inoutpar,
1358 _ => {
1359 if let Some(d) = d {
1360 self.hungetc(d);
1361 }
1362 self.lexstop = false;
1363 // In pattern context (after == != =~ in [[ ]]), ( is part of pattern
1364 // In case pattern context, ( at start is optional delimiter, not pattern
1365 // incasepat == 1 means "at start of pattern", > 1 means "inside pattern"
1366 if self.incondpat || self.incasepat > 1 {
1367 self.gettokstr('(', false)
1368 } else if self.incond == 1 || self.incmdpos || self.incasepat == 1 {
1369 LexTok::Inpar
1370 } else {
1371 self.gettokstr('(', false)
1372 }
1373 }
1374 }
1375 }
1376
1377 ')' => LexTok::Outpar,
1378
1379 '{' => {
1380 // { is a command group only if followed by whitespace,
1381 // newline, or `}` (the empty-block form `{}`). zsh
1382 // treats `{}` as an empty compound — `foo() {}` is a
1383 // valid no-op function. Without `}` in this list,
1384 // `{}` got consumed as one literal token and ran as a
1385 // command, failing "command not found: {}".
1386 // The empty `{}` is also recognised AFTER a function
1387 // header `name()` even when `incmdpos` got cleared by
1388 // the preceding Outpar — peek for `}` regardless and
1389 // treat as Inbrace so `foo() {}` parses as a no-op
1390 // function body.
1391 let next = self.hgetc();
1392 let next_is_close = matches!(next, Some('}'));
1393 if self.incmdpos {
1394 let is_brace_group = match next {
1395 Some(' ') | Some('\t') | Some('\n') | Some('}') | None => true,
1396 _ => false,
1397 };
1398 if let Some(ch) = next {
1399 self.hungetc(ch);
1400 }
1401 if is_brace_group {
1402 self.tokstr = Some("{".to_string());
1403 LexTok::Inbrace
1404 } else {
1405 self.gettokstr(c, false)
1406 }
1407 } else if next_is_close {
1408 // `{}` empty block in non-cmd position (function
1409 // body after `()`). Treat as Inbrace; the parser
1410 // will follow with Outbrace.
1411 if let Some(ch) = next {
1412 self.hungetc(ch);
1413 }
1414 self.tokstr = Some("{".to_string());
1415 LexTok::Inbrace
1416 } else {
1417 if let Some(ch) = next {
1418 self.hungetc(ch);
1419 }
1420 self.gettokstr(c, false)
1421 }
1422 }
1423
1424 '}' => {
1425 // } at start of token is always Outbrace (ends command group)
1426 // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1427 self.tokstr = Some("}".to_string());
1428 LexTok::Outbrace
1429 }
1430
1431 '[' => {
1432 // [[ is a conditional expression start
1433 // [ can also be a command (test builtin) or array subscript
1434 // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1435 if self.incasepat > 0 {
1436 self.gettokstr(c, false)
1437 } else if self.incmdpos {
1438 let next = self.hgetc();
1439 if next == Some('[') {
1440 // [[ - double bracket conditional
1441 self.tokstr = Some("[[".to_string());
1442 self.incond = 1;
1443 return LexTok::Dinbrack;
1444 }
1445 // Single [ - either test command or start of glob pattern
1446 if let Some(ch) = next {
1447 self.hungetc(ch);
1448 }
1449 self.tokstr = Some("[".to_string());
1450 LexTok::String
1451 } else {
1452 self.gettokstr(c, false)
1453 }
1454 }
1455
1456 ']' => {
1457 // ]] ends a conditional expression started by [[
1458 if self.incond > 0 {
1459 let next = self.hgetc();
1460 if next == Some(']') {
1461 self.tokstr = Some("]]".to_string());
1462 self.incond = 0;
1463 return LexTok::Doutbrack;
1464 }
1465 if let Some(ch) = next {
1466 self.hungetc(ch);
1467 }
1468 }
1469 self.gettokstr(c, false)
1470 }
1471
1472 '<' => {
1473 // In pattern context, < is literal (e.g., <-> in glob)
1474 if self.incondpat || self.incasepat > 0 {
1475 self.gettokstr(c, false)
1476 } else {
1477 self.lex_inang()
1478 }
1479 }
1480
1481 '>' => {
1482 // In pattern context, > is literal
1483 if self.incondpat || self.incasepat > 0 {
1484 self.gettokstr(c, false)
1485 } else {
1486 self.lex_outang()
1487 }
1488 }
1489
1490 _ => self.gettokstr(c, false),
1491 }
1492 }
1493
1494 /// Lex comment
1495 fn lex_comment(&mut self) -> LexTok {
1496 if self.lexflags.comments_keep {
1497 self.lexbuf.clear();
1498 self.add('#');
1499 }
1500
1501 loop {
1502 let c = self.hgetc();
1503 match c {
1504 Some('\n') | None => break,
1505 Some(c) => {
1506 if self.lexflags.comments_keep {
1507 self.add(c);
1508 }
1509 }
1510 }
1511 }
1512
1513 if self.lexflags.comments_keep {
1514 self.tokstr = Some(self.lexbuf.as_str().to_string());
1515 if !self.lexstop {
1516 self.hungetc('\n');
1517 }
1518 return LexTok::String;
1519 }
1520
1521 if self.lexflags.comments_strip && self.lexstop {
1522 return LexTok::Endinput;
1523 }
1524
1525 LexTok::Newlin
1526 }
1527
1528 /// Lex < and variants
1529 fn lex_inang(&mut self) -> LexTok {
1530 let d = self.hgetc();
1531 match d {
1532 Some('(') => {
1533 // Process substitution <(...)
1534 self.hungetc('(');
1535 self.lexstop = false;
1536 self.gettokstr('<', false)
1537 }
1538 Some('>') => LexTok::Inoutang,
1539 Some('<') => {
1540 let e = self.hgetc();
1541 match e {
1542 Some('(') => {
1543 self.hungetc('(');
1544 self.hungetc('<');
1545 LexTok::Inang
1546 }
1547 Some('<') => LexTok::Trinang,
1548 Some('-') => {
1549 self.heredoc_pending = 2; // <<- expects terminator next
1550 LexTok::Dinangdash
1551 }
1552 _ => {
1553 if let Some(e) = e {
1554 self.hungetc(e);
1555 }
1556 self.lexstop = false;
1557 self.heredoc_pending = 1; // << expects terminator next
1558 LexTok::Dinang
1559 }
1560 }
1561 }
1562 Some('&') => LexTok::Inangamp,
1563 _ => {
1564 if let Some(d) = d {
1565 self.hungetc(d);
1566 }
1567 self.lexstop = false;
1568 LexTok::Inang
1569 }
1570 }
1571 }
1572
1573 /// Lex > and variants
1574 fn lex_outang(&mut self) -> LexTok {
1575 let d = self.hgetc();
1576 match d {
1577 Some('(') => {
1578 // Process substitution >(...)
1579 self.hungetc('(');
1580 self.lexstop = false;
1581 self.gettokstr('>', false)
1582 }
1583 Some('&') => {
1584 let e = self.hgetc();
1585 match e {
1586 Some('!') | Some('|') => LexTok::Outangampbang,
1587 _ => {
1588 if let Some(e) = e {
1589 self.hungetc(e);
1590 }
1591 self.lexstop = false;
1592 LexTok::Outangamp
1593 }
1594 }
1595 }
1596 Some('!') | Some('|') => LexTok::Outangbang,
1597 Some('>') => {
1598 let e = self.hgetc();
1599 match e {
1600 Some('&') => {
1601 let f = self.hgetc();
1602 match f {
1603 Some('!') | Some('|') => LexTok::Doutangampbang,
1604 _ => {
1605 if let Some(f) = f {
1606 self.hungetc(f);
1607 }
1608 self.lexstop = false;
1609 LexTok::Doutangamp
1610 }
1611 }
1612 }
1613 Some('!') | Some('|') => LexTok::Doutangbang,
1614 Some('(') => {
1615 self.hungetc('(');
1616 self.hungetc('>');
1617 LexTok::Outang
1618 }
1619 _ => {
1620 if let Some(e) = e {
1621 self.hungetc(e);
1622 }
1623 self.lexstop = false;
1624 LexTok::Doutang
1625 }
1626 }
1627 }
1628 _ => {
1629 if let Some(d) = d {
1630 self.hungetc(d);
1631 }
1632 self.lexstop = false;
1633 LexTok::Outang
1634 }
1635 }
1636 }
1637
    /// Get rest of token string.
    ///
    /// Accumulates a word into `self.lexbuf`, starting with `c` (already
    /// consumed from the input) and reading further characters with
    /// `hgetc` until a delimiter ends the word. Shell metacharacters are
    /// stored as their `char_tokens` marker values rather than as the
    /// literal character.
    ///
    /// * `c` — first character of the word.
    /// * `sub` — when false, `lexbuf` is cleared first. When true, text is
    ///   appended to the existing buffer and `)`, `|`, `<`, `>` and `=` do
    ///   not terminate or specially classify the word.
    ///
    /// Returns `LexTok::String` for an ordinary word, `LexTok::Envstring`
    /// once a `NAME=` prefix has been recognised, `LexTok::Envarray` (early
    /// return, `tokstr` = text before the `=`) for `NAME=(`, or
    /// `LexTok::Lexerr` on an unterminated quote/substitution or when the
    /// iteration guard trips. In every non-early-return path `tokstr` is
    /// set to the buffer contents and the character that ended the word is
    /// pushed back unless `lexstop` is set.
    fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
        let mut bct = 0; // brace count
        let mut pct = 0; // parenthesis count
        let mut brct = 0; // bracket count
        // Value of `bct` at which the outermost `${` was opened; 0 while
        // not inside a `${...}`.
        let mut in_brace_param = 0;
        // Token type to return; upgraded to Envstring / Lexerr below.
        let mut peek = LexTok::String;
        // Non-zero while `c` is the first character of the word (or, after
        // being set to 2, the character right after an assignment `=`).
        let mut intpos = 1;
        // Quote character left unterminated at end of input, or '\0'.
        let mut unmatched = '\0';
        let mut c = c;
        // Safety net against a scan that never terminates.
        const MAX_ITERATIONS: usize = 100_000;
        let mut iterations = 0;

        if !sub {
            self.lexbuf.clear();
        }

        loop {
            iterations += 1;
            if iterations > MAX_ITERATIONS {
                self.error = Some("gettokstr exceeded maximum iterations".to_string());
                return LexTok::Lexerr;
            }

            let inbl = Self::is_inblank(c);

            if inbl && in_brace_param == 0 && pct == 0 {
                // Whitespace outside brace param ends token
                break;
            }

            match c {
                // Whitespace is handled above for most cases
                ')' => {
                    // Inside `${...}` or in `sub` mode `)` is word text;
                    // otherwise it closes an open `(` or ends the word.
                    if in_brace_param > 0 || sub {
                        self.add(char_tokens::OUTPAR);
                    } else if pct > 0 {
                        pct -= 1;
                        self.add(char_tokens::OUTPAR);
                    } else {
                        break;
                    }
                }

                '|' => {
                    // A top-level `|` ends the word (unless `sub`); inside
                    // parentheses or `${...}` it is stored as the BAR token.
                    if pct == 0 && in_brace_param == 0 {
                        if sub {
                            self.add(c);
                        } else {
                            break;
                        }
                    } else {
                        self.add(char_tokens::BAR);
                    }
                }

                '$' => {
                    let e = self.hgetc();
                    match e {
                        Some('\\') => {
                            let f = self.hgetc();
                            if f != Some('\n') {
                                if let Some(f) = f {
                                    self.hungetc(f);
                                }
                                self.hungetc('\\');
                                self.add(char_tokens::STRING);
                            } else {
                                // Line continuation after $: `c` is still
                                // `$`, so the next iteration re-examines it
                                // against the character after the newline.
                                continue;
                            }
                        }
                        Some('[') => {
                            // $[...] arithmetic
                            self.add(char_tokens::STRING);
                            self.add(char_tokens::INBRACK);
                            if self.dquote_parse(']', sub).is_err() {
                                peek = LexTok::Lexerr;
                                break;
                            }
                            self.add(char_tokens::OUTBRACK);
                        }
                        Some('(') => {
                            // $(...) or $((...))
                            self.add(char_tokens::STRING);
                            match self.cmd_or_math_sub() {
                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
                                CmdOrMath::Err => {
                                    peek = LexTok::Lexerr;
                                    break;
                                }
                            }
                        }
                        Some('{') => {
                            // `${`: remember the brace depth so the matching
                            // `}` can be recognised.
                            self.add(c);
                            self.add(char_tokens::INBRACE);
                            bct += 1;
                            if in_brace_param == 0 {
                                in_brace_param = bct;
                            }
                        }
                        _ => {
                            // Plain `$name` (or `$` at end of input).
                            if let Some(e) = e {
                                self.hungetc(e);
                            }
                            self.lexstop = false;
                            self.add(char_tokens::STRING);
                        }
                    }
                }

                '[' => {
                    if in_brace_param == 0 {
                        brct += 1;
                    }
                    self.add(char_tokens::INBRACK);
                }

                ']' => {
                    if in_brace_param == 0 && brct > 0 {
                        brct -= 1;
                    }
                    self.add(char_tokens::OUTBRACK);
                }

                '(' => {
                    // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
                    // a STRING and is immediately followed by `)`, the
                    // string terminates at the `(`. The `()` is then
                    // re-lexed as a separate INOUTPAR token. This handles
                    // function definitions: `name()` lexes as STRING `name`
                    // + INOUTPAR `()`, not STRING `name()`.
                    //
                    // Also (lex.c:1109-1112): under SHGLOB, a `(` followed
                    // by whitespace at the start of a command-position word
                    // (no nested brackets/braces) is a ksh function
                    // definition signal — same break-out behavior.
                    // NOTE(review): only the `()` case is implemented in
                    // this arm; no SHGLOB check is visible here.
                    if in_brace_param == 0 && !sub {
                        let e = self.hgetc();
                        if let Some(ch) = e {
                            self.hungetc(ch);
                        }
                        self.lexstop = false;
                        if e == Some(')') {
                            // `name()` — terminate STRING at `(` so the
                            // following `()` re-lexes as INOUTPAR. The
                            // push-back after the loop (`hungetc(c)` when
                            // `lexstop` is unset) returns the `(` to the
                            // input; the `)` was already pushed back by
                            // the hungetc(ch) above, so the next token
                            // read sees `()`.
                            break;
                        }
                    }
                    if in_brace_param == 0 {
                        pct += 1;
                    }
                    self.add(char_tokens::INPAR);
                }

                '{' => {
                    // Track braces for both ${...} param expansion and {...} brace expansion
                    bct += 1;
                    self.add(c);
                }

                '}' => {
                    if in_brace_param > 0 {
                        // Closing brace inside `${...}`; leaving the
                        // expansion when it matches the opening depth.
                        if bct == in_brace_param {
                            in_brace_param = 0;
                        }
                        bct -= 1;
                        self.add(char_tokens::OUTBRACE);
                    } else if bct > 0 {
                        // Closing a brace expansion like {a,b}
                        bct -= 1;
                        self.add(c);
                    } else {
                        // Unbalanced `}` ends the word.
                        break;
                    }
                }

                '>' => {
                    // In pattern context (incondpat), > is literal
                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
                        self.add(c);
                    } else {
                        let e = self.hgetc();
                        if e != Some('(') {
                            // A redirection operator: the word ends here.
                            if let Some(e) = e {
                                self.hungetc(e);
                            }
                            self.lexstop = false;
                            break;
                        }
                        // >(...)
                        self.add(char_tokens::OUTANGPROC);
                        if self.skip_command_sub().is_err() {
                            peek = LexTok::Lexerr;
                            break;
                        }
                        self.add(char_tokens::OUTPAR);
                    }
                }

                '<' => {
                    // In pattern context (incondpat), < is literal
                    if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
                        self.add(c);
                    } else if let Some(range_chars) = self.try_numeric_range_glob() {
                        // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
                        // `<-M>`. When `<` mid-word matches that exact
                        // shape, swallow it into the word instead of
                        // breaking out for redirection.
                        self.add(c);
                        for ch in range_chars.chars() {
                            self.add(ch);
                        }
                    } else {
                        let e = self.hgetc();
                        if e != Some('(') {
                            // A redirection operator: the word ends here.
                            if let Some(e) = e {
                                self.hungetc(e);
                            }
                            self.lexstop = false;
                            break;
                        }
                        // <(...)
                        self.add(char_tokens::INANG);
                        if self.skip_command_sub().is_err() {
                            peek = LexTok::Lexerr;
                            break;
                        }
                        self.add(char_tokens::OUTPAR);
                    }
                }

                '=' => {
                    if !sub {
                        if intpos > 0 {
                            // At start of token, check for =(...) process substitution
                            let e = self.hgetc();
                            if e == Some('(') {
                                self.add(char_tokens::EQUALS);
                                if self.skip_command_sub().is_err() {
                                    peek = LexTok::Lexerr;
                                    break;
                                }
                                self.add(char_tokens::OUTPAR);
                            } else {
                                if let Some(e) = e {
                                    self.hungetc(e);
                                }
                                self.lexstop = false;
                                self.add(char_tokens::EQUALS);
                            }
                        } else if peek != LexTok::Envstring
                            && (self.incmdpos || self.intypeset)
                            && bct == 0
                            && brct == 0
                            && self.incasepat == 0
                        {
                            // Check for VAR=value assignment (but not in case pattern context)
                            let tok_so_far = self.lexbuf.as_str().to_string();
                            if self.is_valid_assignment_target(&tok_so_far) {
                                let next = self.hgetc();
                                if next == Some('(') {
                                    // VAR=(...) array assignment. Per zsh
                                    // (lex.c emits ENVARRAY with tokstr =
                                    // just the variable name, NOT
                                    // including the `=`). The `=` and
                                    // `(` are consumed by the lexer; the
                                    // parser knows ENVARRAY means assign-
                                    // array and reads the body that
                                    // follows.
                                    self.tokstr = Some(self.lexbuf.as_str().to_string());
                                    return LexTok::Envarray;
                                }
                                if let Some(next) = next {
                                    self.hungetc(next);
                                }
                                self.lexstop = false;
                                // Scalar assignment: the value starts right
                                // after this `=`.
                                peek = LexTok::Envstring;
                                intpos = 2;
                                self.add(char_tokens::EQUALS);
                            } else {
                                self.add(char_tokens::EQUALS);
                            }
                        } else {
                            self.add(char_tokens::EQUALS);
                        }
                    } else {
                        self.add(char_tokens::EQUALS);
                    }
                }

                '\\' => {
                    let next = self.hgetc();
                    if next == Some('\n') {
                        // Line continuation: drop both characters and carry
                        // on with whatever follows (or stop at end of input).
                        let next = self.hgetc();
                        if let Some(next) = next {
                            c = next;
                            continue;
                        }
                        break;
                    } else {
                        // Escaped character: BNULL marker plus the
                        // character itself, if any.
                        self.add(char_tokens::BNULL);
                        if let Some(next) = next {
                            self.add(next);
                        }
                    }
                }

                '\'' => {
                    // Single quoted string - everything literal until '
                    self.add(char_tokens::SNULL);
                    loop {
                        let ch = self.hgetc();
                        match ch {
                            Some('\'') => break,
                            Some(ch) => self.add(ch),
                            None => {
                                // End of input inside the quotes.
                                self.lexstop = true;
                                unmatched = '\'';
                                peek = LexTok::Lexerr;
                                break;
                            }
                        }
                    }
                    if unmatched != '\0' {
                        break;
                    }
                    self.add(char_tokens::SNULL);
                }

                '"' => {
                    // Double quoted string
                    self.add(char_tokens::DNULL);
                    if self.dquote_parse('"', sub).is_err() {
                        unmatched = '"';
                        // With `lexflags.active` set the failure is not
                        // turned into a Lexerr token.
                        if !self.lexflags.active {
                            peek = LexTok::Lexerr;
                        }
                        break;
                    }
                    self.add(char_tokens::DNULL);
                }

                '`' => {
                    // Backtick command substitution
                    self.add(char_tokens::TICK);
                    loop {
                        let ch = self.hgetc();
                        match ch {
                            Some('`') => break,
                            Some('\\') => {
                                let next = self.hgetc();
                                match next {
                                    Some('\n') => continue, // Line continuation
                                    Some(c) if c == '`' || c == '\\' || c == '$' => {
                                        // Only these three are escapable
                                        // inside backticks.
                                        self.add(char_tokens::BNULL);
                                        self.add(c);
                                    }
                                    Some(c) => {
                                        // Backslash stays literal.
                                        self.add('\\');
                                        self.add(c);
                                    }
                                    None => break,
                                }
                            }
                            Some(ch) => self.add(ch),
                            None => {
                                // End of input inside the backticks.
                                self.lexstop = true;
                                unmatched = '`';
                                peek = LexTok::Lexerr;
                                break;
                            }
                        }
                    }
                    if unmatched != '\0' {
                        break;
                    }
                    self.add(char_tokens::TICK);
                }

                // Glob / expansion metacharacters are stored as their
                // token markers.
                '~' => {
                    self.add(char_tokens::TILDE);
                }

                '#' => {
                    self.add(char_tokens::POUND);
                }

                '^' => {
                    self.add(char_tokens::HAT);
                }

                '*' => {
                    self.add(char_tokens::STAR);
                }

                '?' => {
                    self.add(char_tokens::QUEST);
                }

                // A comma is tokenised only inside a brace that is not the
                // `${...}` one; otherwise it falls through to the default arm.
                ',' if bct > in_brace_param => {
                    self.add(char_tokens::COMMA);
                }

                '-' => {
                    self.add(char_tokens::DASH);
                }

                // `!` is tokenised only inside `[...]`.
                '!' if brct > 0 => {
                    self.add(char_tokens::BANG);
                }

                // Terminators
                '\n' | ';' | '&' => {
                    break;
                }

                _ => {
                    self.add(c);
                }
            }

            // Advance to the next character; end of input ends the word.
            c = match self.hgetc() {
                Some(c) => c,
                None => {
                    self.lexstop = true;
                    break;
                }
            };

            if intpos > 0 {
                intpos -= 1;
            }
        }

        // Put back the character that ended the token
        if !self.lexstop {
            self.hungetc(c);
        }

        if unmatched != '\0' && !self.lexflags.active {
            self.error = Some(format!("unmatched {}", unmatched));
        }

        // Still inside `${...}` when the word ended. This overwrites any
        // "unmatched" message recorded just above.
        if in_brace_param > 0 {
            self.error = Some("closing brace expected".to_string());
        }

        self.tokstr = Some(self.lexbuf.as_str().to_string());
        peek
    }
2098
2099 /// Check if a string is a valid assignment target (identifier or array ref).
2100 ///
2101 /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2102 /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2103 /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2104 /// are caught by the subscript handler, not here). And the first char
2105 /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2106 /// the STRING token 0x85) is parameter substitution with the `=` flag,
2107 /// NOT an envstring assignment.
2108 fn is_valid_assignment_target(&self, s: &str) -> bool {
2109 let mut chars = s.chars().peekable();
2110
2111 // Reject leading token byte — `$VAR=` is parameter substitution,
2112 // not assignment. Same for `*=`, `?=`, etc.
2113 if let Some(&c) = chars.peek() {
2114 if char_tokens::is_token(c) {
2115 return false;
2116 }
2117 }
2118
2119 // Check for leading digit (invalid)
2120 if let Some(&c) = chars.peek() {
2121 if c.is_ascii_digit() {
2122 // Could be array index, check rest
2123 while let Some(&c) = chars.peek() {
2124 if !c.is_ascii_digit() {
2125 break;
2126 }
2127 chars.next();
2128 }
2129 return chars.peek().is_none();
2130 }
2131 }
2132
2133 // Check identifier
2134 let mut has_ident = false;
2135 while let Some(&c) = chars.peek() {
2136 if c == char_tokens::INBRACK || c == '[' {
2137 break;
2138 }
2139 if c == '+' {
2140 // foo+=value
2141 chars.next();
2142 return chars.peek().is_none() || chars.peek() == Some(&'=');
2143 }
2144 if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2145 return false;
2146 }
2147 has_ident = true;
2148 chars.next();
2149 }
2150
2151 has_ident
2152 }
2153
2154 /// Parse the body of a double-quoted string (or any context that
2155 /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2156 /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2157 /// chars until `endchar` is seen at depth 0, handling escapes,
2158 /// `${...}` parameter substitutions, `$(...)` and backtick command
2159 /// substitutions, `$((...))` arithmetic, and inner double-quoted
2160 /// strings. The `sub` flag toggles substitution-context tokens
2161 /// (lex.c:1487 `int sub` argument).
2162 ///
2163 /// zshrs port note: the recursion guard at the top is a Rust
2164 /// safety net; the C source relies on the runtime stack. Inner
2165 /// logic delegates to `dquote_parse_inner` which holds the actual
2166 /// per-char state machine matching lex.c:1495-1692.
2167 fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2168 self.recursion_depth += 1;
2169 if self.check_recursion() {
2170 self.recursion_depth -= 1;
2171 return Err(());
2172 }
2173
2174 let result = self.dquote_parse_inner(endchar, sub);
2175 self.recursion_depth -= 1;
2176 result
2177 }
2178
2179 fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2180 let mut pct = 0; // parenthesis count
2181 let mut brct = 0; // bracket count
2182 let mut bct = 0; // brace count (for ${...})
2183 let mut intick = false; // inside backtick
2184 let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
2185 const MAX_ITERATIONS: usize = 100_000;
2186 let mut iterations = 0;
2187
2188 loop {
2189 iterations += 1;
2190 if iterations > MAX_ITERATIONS {
2191 self.error = Some("dquote_parse exceeded maximum iterations".to_string());
2192 return Err(());
2193 }
2194 let c = self.hgetc();
2195 let c = match c {
2196 Some(c) if c == endchar && !intick && bct == 0 => {
2197 if is_math && (pct > 0 || brct > 0) {
2198 self.add(c);
2199 if c == ')' {
2200 pct -= 1;
2201 } else if c == ']' {
2202 brct -= 1;
2203 }
2204 continue;
2205 }
2206 return Ok(());
2207 }
2208 Some(c) => c,
2209 None => {
2210 self.lexstop = true;
2211 return Err(());
2212 }
2213 };
2214
2215 match c {
2216 '\\' => {
2217 let next = self.hgetc();
2218 match next {
2219 Some('\n') if !sub => continue, // Line continuation
2220 Some(c)
2221 if c == '$'
2222 || c == '\\'
2223 || (c == '}' && !intick && bct > 0)
2224 || c == endchar
2225 || c == '`'
2226 || (endchar == ']'
2227 && (c == '['
2228 || c == ']'
2229 || c == '('
2230 || c == ')'
2231 || c == '{'
2232 || c == '}'
2233 || (c == '"' && sub))) =>
2234 {
2235 self.add(char_tokens::BNULL);
2236 self.add(c);
2237 }
2238 Some(c) => {
2239 self.add('\\');
2240 self.hungetc(c);
2241 continue;
2242 }
2243 None => {
2244 self.add('\\');
2245 }
2246 }
2247 }
2248
2249 '$' => {
2250 if intick {
2251 self.add(c);
2252 continue;
2253 }
2254 let next = self.hgetc();
2255 match next {
2256 Some('(') => {
2257 self.add(char_tokens::QSTRING);
2258 match self.cmd_or_math_sub() {
2259 CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
2260 CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
2261 CmdOrMath::Err => return Err(()),
2262 }
2263 }
2264 Some('[') => {
2265 self.add(char_tokens::STRING);
2266 self.add(char_tokens::INBRACK);
2267 self.dquote_parse(']', sub)?;
2268 self.add(char_tokens::OUTBRACK);
2269 }
2270 Some('{') => {
2271 self.add(char_tokens::QSTRING);
2272 self.add(char_tokens::INBRACE);
2273 bct += 1;
2274 }
2275 Some('$') => {
2276 self.add(char_tokens::QSTRING);
2277 self.add('$');
2278 }
2279 _ => {
2280 if let Some(next) = next {
2281 self.hungetc(next);
2282 }
2283 self.lexstop = false;
2284 self.add(char_tokens::QSTRING);
2285 }
2286 }
2287 }
2288
2289 '}' => {
2290 if intick || bct == 0 {
2291 self.add(c);
2292 } else {
2293 self.add(char_tokens::OUTBRACE);
2294 bct -= 1;
2295 }
2296 }
2297
2298 '`' => {
2299 self.add(char_tokens::QTICK);
2300 intick = !intick;
2301 }
2302
2303 '(' => {
2304 if !is_math || bct == 0 {
2305 pct += 1;
2306 }
2307 self.add(c);
2308 }
2309
2310 ')' => {
2311 if !is_math || bct == 0 {
2312 if pct == 0 && is_math {
2313 return Err(());
2314 }
2315 pct -= 1;
2316 }
2317 self.add(c);
2318 }
2319
2320 '[' => {
2321 if !is_math || bct == 0 {
2322 brct += 1;
2323 }
2324 self.add(c);
2325 }
2326
2327 ']' => {
2328 if !is_math || bct == 0 {
2329 if brct == 0 && is_math {
2330 return Err(());
2331 }
2332 brct -= 1;
2333 }
2334 self.add(c);
2335 }
2336
2337 '"' => {
2338 if intick || (endchar != '"' && bct == 0) {
2339 self.add(c);
2340 } else if bct > 0 {
2341 self.add(char_tokens::DNULL);
2342 self.dquote_parse('"', sub)?;
2343 self.add(char_tokens::DNULL);
2344 } else {
2345 return Err(());
2346 }
2347 }
2348
2349 _ => {
2350 self.add(c);
2351 }
2352 }
2353 }
2354 }
2355
2356 /// Determine if (( is arithmetic or command
2357 /// Decide whether `( ... )` after a `$` is a math expression
2358 /// `$((...))` or a command substitution `$(...)`. Direct port of
2359 /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2360 /// if it succeeds AND the next char is `)` (closing the second
2361 /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2362 /// a command substitution.
2363 fn cmd_or_math(&mut self) -> CmdOrMath {
2364 let oldlen = self.lexbuf.len();
2365
2366 // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2367 // which fills lexbuf with ONLY the inner expression, then checks
2368 // for the closing `)`. The surrounding `((` / `))` are NOT added
2369 // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2370 // ')' after, polluting DINPAR's tokstr with the literal parens.
2371 // Removed to match C exactly.
2372 if self.dquote_parse(')', false).is_err() {
2373 // Back up and try as command
2374 while self.lexbuf.len() > oldlen {
2375 if let Some(c) = self.lexbuf.pop() {
2376 self.hungetc(c);
2377 }
2378 }
2379 self.hungetc('(');
2380 self.lexstop = false;
2381 return if self.skip_command_sub().is_err() {
2382 CmdOrMath::Err
2383 } else {
2384 CmdOrMath::Cmd
2385 };
2386 }
2387
2388 // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2389 // means `((..))` was math. Don't add `)` to lexbuf.
2390 let c = self.hgetc();
2391 if c == Some(')') {
2392 return CmdOrMath::Math;
2393 }
2394
2395 // Not math, back up
2396 if let Some(c) = c {
2397 self.hungetc(c);
2398 }
2399 self.lexstop = false;
2400
2401 // Back up token
2402 while self.lexbuf.len() > oldlen {
2403 if let Some(c) = self.lexbuf.pop() {
2404 self.hungetc(c);
2405 }
2406 }
2407 self.hungetc('(');
2408
2409 if self.skip_command_sub().is_err() {
2410 CmdOrMath::Err
2411 } else {
2412 CmdOrMath::Cmd
2413 }
2414 }
2415
2416 /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2417 /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2418 /// the next char to discriminate: a leading `(` plus successful
2419 /// math parse via `cmd_or_math` → arithmetic substitution (with
2420 /// the open-paren retroactively rewritten to Inparmath); else
2421 /// command substitution via skip_command_sub.
2422 fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2423 const MAX_CONTINUATIONS: usize = 10_000;
2424 let mut continuations = 0;
2425
2426 loop {
2427 continuations += 1;
2428 if continuations > MAX_CONTINUATIONS {
2429 self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2430 return CmdOrMath::Err;
2431 }
2432
2433 let c = self.hgetc();
2434 if c == Some('\\') {
2435 let c2 = self.hgetc();
2436 if c2 != Some('\n') {
2437 if let Some(c2) = c2 {
2438 self.hungetc(c2);
2439 }
2440 self.hungetc('\\');
2441 self.lexstop = false;
2442 return if self.skip_command_sub().is_err() {
2443 CmdOrMath::Err
2444 } else {
2445 CmdOrMath::Cmd
2446 };
2447 }
2448 // Line continuation, try again (loop instead of recursion)
2449 continue;
2450 }
2451
2452 // Not a line continuation, process normally
2453 if c == Some('(') {
2454 // Might be $((...))
2455 let lexpos = self.lexbuf.len();
2456 self.add(char_tokens::INPAR);
2457 self.add('(');
2458
2459 if self.dquote_parse(')', false).is_ok() {
2460 let c2 = self.hgetc();
2461 if c2 == Some(')') {
2462 self.add(')');
2463 return CmdOrMath::Math;
2464 }
2465 if let Some(c2) = c2 {
2466 self.hungetc(c2);
2467 }
2468 }
2469
2470 // Not math, restore and parse as command
2471 while self.lexbuf.len() > lexpos {
2472 if let Some(ch) = self.lexbuf.pop() {
2473 self.hungetc(ch);
2474 }
2475 }
2476 self.hungetc('(');
2477 self.lexstop = false;
2478 } else {
2479 if let Some(c) = c {
2480 self.hungetc(c);
2481 }
2482 self.lexstop = false;
2483 }
2484
2485 return if self.skip_command_sub().is_err() {
2486 CmdOrMath::Err
2487 } else {
2488 CmdOrMath::Cmd
2489 };
2490 }
2491 }
2492
2493 /// Skip over `(...)` for command-style substitutions: `$(...)`,
2494 /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2495 /// `skipcomm`. Per the C source comment: "we'll parse the input
2496 /// until we find an unmatched closing parenthesis. However, we'll
2497 /// throw away the result of the parsing and just keep the string
2498 /// we've built up on the way."
2499 ///
2500 /// zshrs port note: the C source uses zcontext_save/restore +
2501 /// strinbeg/inpush to set up an isolated lex context for the
2502 /// throw-away parse. zshrs's standalone walker tracks paren
2503 /// depth directly without re-entering the parser. Same
2504 /// invariant: stops at the matching `)`.
2505 fn skip_command_sub(&mut self) -> Result<(), ()> {
2506 let mut pct = 1;
2507 let mut start = true;
2508 const MAX_ITERATIONS: usize = 100_000;
2509 let mut iterations = 0;
2510
2511 self.add(char_tokens::INPAR);
2512
2513 loop {
2514 iterations += 1;
2515 if iterations > MAX_ITERATIONS {
2516 self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2517 return Err(());
2518 }
2519
2520 let c = self.hgetc();
2521 let c = match c {
2522 Some(c) => c,
2523 None => {
2524 self.lexstop = true;
2525 return Err(());
2526 }
2527 };
2528
2529 let iswhite = Self::is_inblank(c);
2530
2531 match c {
2532 '(' => {
2533 pct += 1;
2534 self.add(c);
2535 }
2536 ')' => {
2537 pct -= 1;
2538 if pct == 0 {
2539 return Ok(());
2540 }
2541 self.add(c);
2542 }
2543 '\\' => {
2544 self.add(c);
2545 if let Some(c) = self.hgetc() {
2546 self.add(c);
2547 }
2548 }
2549 '\'' => {
2550 self.add(c);
2551 loop {
2552 let ch = self.hgetc();
2553 match ch {
2554 Some('\'') => {
2555 self.add('\'');
2556 break;
2557 }
2558 Some(ch) => self.add(ch),
2559 None => {
2560 self.lexstop = true;
2561 return Err(());
2562 }
2563 }
2564 }
2565 }
2566 '"' => {
2567 self.add(c);
2568 loop {
2569 let ch = self.hgetc();
2570 match ch {
2571 Some('"') => {
2572 self.add('"');
2573 break;
2574 }
2575 Some('\\') => {
2576 self.add('\\');
2577 if let Some(ch) = self.hgetc() {
2578 self.add(ch);
2579 }
2580 }
2581 Some(ch) => self.add(ch),
2582 None => {
2583 self.lexstop = true;
2584 return Err(());
2585 }
2586 }
2587 }
2588 }
2589 '`' => {
2590 self.add(c);
2591 loop {
2592 let ch = self.hgetc();
2593 match ch {
2594 Some('`') => {
2595 self.add('`');
2596 break;
2597 }
2598 Some('\\') => {
2599 self.add('\\');
2600 if let Some(ch) = self.hgetc() {
2601 self.add(ch);
2602 }
2603 }
2604 Some(ch) => self.add(ch),
2605 None => {
2606 self.lexstop = true;
2607 return Err(());
2608 }
2609 }
2610 }
2611 }
2612 '#' if start => {
2613 self.add(c);
2614 // Skip comment to end of line
2615 loop {
2616 let ch = self.hgetc();
2617 match ch {
2618 Some('\n') => {
2619 self.add('\n');
2620 break;
2621 }
2622 Some(ch) => self.add(ch),
2623 None => break,
2624 }
2625 }
2626 }
2627 _ => {
2628 self.add(c);
2629 }
2630 }
2631
2632 start = iswhite;
2633 }
2634 }
2635
2636 /// Lex next token AND update per-context flags. Direct port of
2637 /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2638 /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2639 /// list separators / pipes / control keywords reset to cmd-pos;
2640 /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2641 /// stash prior incmdpos and force the redir target to non-cmd-pos.
2642 pub fn ctxtlex(&mut self) {
2643 // lex.c:319 — static `oldpos` cache for redir-target restore
2644 // is captured per-call here as `oldpos` below (zshrs's parser
2645 // re-enters ctxtlex per token, no need for static persistence).
2646
2647 // lex.c:321 — `zshlex();` to advance to the next token.
2648 self.zshlex();
2649
2650 // lex.c:322-358 — post-token incmdpos switch.
2651 match self.tok {
2652 // lex.c:323-343 — separators / openers / conjunctions /
2653 // control keywords — back into cmd-pos so the next token
2654 // can be a fresh command.
2655 LexTok::Seper
2656 | LexTok::Newlin
2657 | LexTok::Semi
2658 | LexTok::Dsemi
2659 | LexTok::Semiamp
2660 | LexTok::Semibar
2661 | LexTok::Amper
2662 | LexTok::Amperbang
2663 | LexTok::Inpar
2664 | LexTok::Inbrace
2665 | LexTok::Dbar
2666 | LexTok::Damper
2667 | LexTok::Bar
2668 | LexTok::Baramp
2669 | LexTok::Inoutpar
2670 | LexTok::Doloop
2671 | LexTok::Then
2672 | LexTok::Elif
2673 | LexTok::Else
2674 | LexTok::Doutbrack => {
2675 self.incmdpos = true;
2676 }
2677 // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2678 // so subsequent tokens are arguments, not a fresh command.
2679 LexTok::String
2680 | LexTok::Typeset
2681 | LexTok::Envarray
2682 | LexTok::Outpar
2683 | LexTok::Case
2684 | LexTok::Dinbrack => {
2685 self.incmdpos = false;
2686 }
2687 _ => {}
2688 }
2689
2690 // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2691 // DINPAR can detect c-style for. After any non-DINPAR, decay
2692 // to 0 (or back to 2 if we just saw FOR again).
2693 if self.tok != LexTok::Dinpar {
2694 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2695 }
2696
2697 // lex.c:361-368 — redir-target context dance. After consuming
2698 // a redir operator, the following token (the file path) sees
2699 // incmdpos=0 even when its inherent shape would put it back
2700 // in cmd-pos. After the redir target, restore `oldpos`.
2701 let oldpos = self.incmdpos;
2702 if self.tok.is_redirop()
2703 || self.tok == LexTok::For
2704 || self.tok == LexTok::Foreach
2705 || self.tok == LexTok::Select
2706 {
2707 self.inredir = true;
2708 self.incmdpos = false;
2709 } else if self.inredir {
2710 self.incmdpos = oldpos;
2711 self.inredir = false;
2712 }
2713 }
2714
    /// Mark the current word as the one ZLE was looking for.
    ///
    /// Port of `gotword` (zsh/Src/lex.c:1881-1897). It is only
    /// meaningful when the lexer was started with LEXFLAGS_ZLE for
    /// completion. After this call `lexflags` is back at its default,
    /// so later tokens do not re-trigger word tracking.
    ///
    /// zshrs port note: zsh's `gotword` also updates `wb`/`we` (word
    /// begin/end positions) from `zlemetacs` (cursor position),
    /// `zlemetall` (line length), `inbufct` and `addedx`. Those live in
    /// zsh's input.c globals, which zshrs has not wired through the
    /// lexer. Only the `lexflags = 0` side effect at lex.c:1895 is
    /// reproduced here.
    pub fn gotword(&mut self) {
        // lex.c:1895 — `lexflags = 0;` (every flag back to its default).
        self.lexflags = LexFlags::default();
    }
2731
2732 /// Register a heredoc to be processed at next newline
2733 pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2734 self.heredocs.push(HereDoc {
2735 terminator,
2736 strip_tabs,
2737 content: String::new(),
2738 quoted: false,
2739 processed: false,
2740 });
2741 }
2742
2743 /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2744 /// but reachable from the bare `zshlex` path (without an
2745 /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2746 /// - incmdpos is set (or text is `}` ending a brace block)
2747 /// - text is `]]` and we're inside `[[ ]]` (incond > 0)
2748 /// - text is bare `!` and we're at the start of a cond (incond == 1)
2749 pub fn check_reserved_word(&mut self) -> bool {
2750 if let Some(ref tokstr) = self.tokstr {
2751 if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2752 if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2753 self.tok = tok;
2754 if tok == LexTok::Repeat {
2755 self.inrepeat = 1;
2756 }
2757 if tok == LexTok::Dinbrack {
2758 self.incond = 1;
2759 }
2760 return true;
2761 }
2762 if tokstr == "]]" && self.incond > 0 {
2763 self.tok = LexTok::Doutbrack;
2764 self.incond = 0;
2765 return true;
2766 }
2767 }
2768 // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2769 // regardless of incmdpos.
2770 if self.incond > 0 && tokstr == "]]" {
2771 self.tok = LexTok::Doutbrack;
2772 self.incond = 0;
2773 return true;
2774 }
2775 if self.incond == 1 && tokstr == "!" {
2776 self.tok = LexTok::Bang;
2777 return true;
2778 }
2779 }
2780 false
2781 }
2782}
2783
/// Outcome of deciding whether a `(`-introduced construct is an
/// arithmetic expression or a command.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The text was read as a command-style substitution.
    Cmd,
    /// The text was an arithmetic expression closed by `))`.
    Math,
    /// Neither reading succeeded; lexing should stop with an error.
    Err,
}
2790
2791// ============================================================================
2792// Additional parsing functions ported from lex.c
2793// ============================================================================
2794
/// Check whether the text at `pos` is valid numeric globbing syntax:
/// `<N-M>`, `<N->`, `<-M>` or `<->`.
///
/// `pos` is a byte offset into `input` pointing just after the opening
/// `<`. Nothing is consumed; the function only inspects the text.
///
/// Port of `isnumglob` (zsh/Src/lex.c:580-610). The C source reads
/// with hgetc/hungetc and remembers consumed characters in a temporary
/// buffer; this version scans a string slice instead.
///
/// Returns `true` when the text is digits, exactly one `-`, more
/// digits, then `>` (either digit run may be empty). Returns `false`
/// otherwise, including when `pos` is past the end of `input` or does
/// not fall on a character boundary.
pub fn isnumglob(input: &str, pos: usize) -> bool {
    // `get` instead of indexing: a bad offset is "no match", not a panic.
    let rest = match input.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };

    let mut seen_dash = false;
    for c in rest.chars() {
        match c {
            '0'..='9' => {}
            '-' if !seen_dash => seen_dash = true,
            // `>` only closes the range once the `-` has been seen.
            '>' if seen_dash => return true,
            _ => return false,
        }
    }
    false
}
2825
/// Tokenize a string as if in double quotes (error-tolerant variant).
///
/// Counterpart of `parsestrnoerr` (zsh/Src/lex.c:1713-1733). The C
/// source saves the lexer context, pushes the string as input and runs
/// `dquote_parse('\0', 1)` over it. This port delegates to
/// `parsestr_inner`, a standalone walker that emits BNULL / QSTRING /
/// QTICK markers without re-entering the lexer.
///
/// Documented divergence: nested command substitution `$(...)` and
/// arithmetic `$((...))` are not lexed recursively; the runtime
/// handles them at expansion time.
///
/// Returns the tokenized string. The C version reports the offending
/// character on failure; the shared walker as written has no failing
/// path, so this currently always returns `Ok`.
pub fn parsestrnoerr(s: &str) -> Result<String, String> {
    parsestr_inner(s)
}
2845
/// Tokenize a string as if in double quotes (error-reporting variant).
///
/// Counterpart of `parsestr` (zsh/Src/lex.c:1693-1709). In C this
/// wraps `parsestrnoerr` and, on failure, untokenizes the string and
/// reports "parse error near `%c'" through `zerr`. Here both variants
/// delegate to the same walker, `parsestr_inner`, and return its
/// `Result` unchanged. No diagnostic is printed; surfacing one is left
/// to the caller.
pub fn parsestr(s: &str) -> Result<String, String> {
    parsestr_inner(s)
}
2860
2861/// Shared body for parsestr / parsestrnoerr.
2862fn parsestr_inner(s: &str) -> Result<String, String> {
2863 let mut result = String::with_capacity(s.len());
2864 let chars: Vec<char> = s.chars().collect();
2865 let mut i = 0;
2866
2867 while i < chars.len() {
2868 let c = chars[i];
2869 match c {
2870 '\\' => {
2871 i += 1;
2872 if i < chars.len() {
2873 let next = chars[i];
2874 match next {
2875 '$' | '\\' | '`' | '"' | '\n' => {
2876 result.push(char_tokens::BNULL);
2877 result.push(next);
2878 }
2879 _ => {
2880 result.push('\\');
2881 result.push(next);
2882 }
2883 }
2884 } else {
2885 result.push('\\');
2886 }
2887 }
2888 '$' => {
2889 result.push(char_tokens::QSTRING);
2890 if i + 1 < chars.len() {
2891 let next = chars[i + 1];
2892 if next == '{' {
2893 result.push(char_tokens::INBRACE);
2894 i += 1;
2895 } else if next == '(' {
2896 result.push(char_tokens::INPAR);
2897 i += 1;
2898 }
2899 }
2900 }
2901 '`' => {
2902 result.push(char_tokens::QTICK);
2903 }
2904 _ => {
2905 result.push(c);
2906 }
2907 }
2908 i += 1;
2909 }
2910
2911 Ok(result)
2912}
2913
/// Find the end of a subscript in `s`.
///
/// Counterpart of `parse_subscript` (zsh/Src/lex.c:1742-1788). The C
/// source lexes the subscript through `dquote_parse`; this port is a
/// bracket-balancing walker that does not re-enter the lexer.
///
/// * `s` — the text starting just after the opening delimiter.
/// * `endchar` — the closing delimiter to look for (typically `]`).
///
/// Returns the index of the closing `endchar`, counted in characters
/// (not bytes) from the start of `s`. Returns `None` when `s` is empty,
/// when it starts with `endchar`, or when no unnested, unquoted
/// `endchar` is found.
///
/// Nesting rules: `[` and `(` open a nested group and `]` and `)` close
/// one. A closer that ends a nested group is never the terminator, even
/// when it equals `endchar`. Text inside single or double quotes, and
/// the character after a backslash, is skipped.
///
/// zshrs port note: zsh's version also takes a `sub` flag controlling
/// whether `$` and quotes are tokenized; that flag is not exposed here.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let mut depth = 0usize;
    let mut in_dquote = false;
    let mut in_squote = false;
    let mut skip_next = false; // previous character was an escaping `\`

    for (i, c) in s.chars().enumerate() {
        if skip_next {
            skip_next = false;
            continue;
        }

        if in_squote {
            // No escapes inside single quotes.
            if c == '\'' {
                in_squote = false;
            }
            continue;
        }

        if in_dquote {
            match c {
                '"' => in_dquote = false,
                '\\' => skip_next = true,
                _ => {}
            }
            continue;
        }

        match c {
            '[' | '(' => depth += 1,
            // The terminator is tested before any depth bookkeeping, so
            // the closer of a nested group cannot be mistaken for it.
            _ if c == endchar && depth == 0 => return Some(i),
            '\\' => skip_next = true,
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            // Stray closers at depth 0 are ignored.
            ']' | ')' => depth = depth.saturating_sub(1),
            _ => {}
        }
    }

    None
}
2992
2993/// Tokenize a string as if it were a normal command-line argument
2994/// but it may contain separators. Used for ${...%...} substitutions.
2995///
2996/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
2997/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
2998/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
2999/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3000/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3001/// without re-entering the lexer.
3002///
3003/// zshrs port note: the C source returns int (0=ok, char value =
3004/// where it stopped on error); zshrs returns Result<String,String>
3005/// returning the tokenized text directly. Lossy for callers that
3006/// need to know the exact stop position, but nothing in zshrs's
3007/// expansion layer uses that yet.
3008pub fn parse_subst_string(s: &str) -> Result<String, String> {
3009 if s.is_empty() {
3010 return Ok(String::new());
3011 }
3012
3013 let mut result = String::with_capacity(s.len());
3014 let chars: Vec<char> = s.chars().collect();
3015 let mut i = 0;
3016
3017 while i < chars.len() {
3018 let c = chars[i];
3019 match c {
3020 '\\' => {
3021 result.push(char_tokens::BNULL);
3022 i += 1;
3023 if i < chars.len() {
3024 result.push(chars[i]);
3025 }
3026 }
3027 '\'' => {
3028 result.push(char_tokens::SNULL);
3029 i += 1;
3030 while i < chars.len() && chars[i] != '\'' {
3031 result.push(chars[i]);
3032 i += 1;
3033 }
3034 result.push(char_tokens::SNULL);
3035 }
3036 '"' => {
3037 result.push(char_tokens::DNULL);
3038 i += 1;
3039 while i < chars.len() && chars[i] != '"' {
3040 if chars[i] == '\\' && i + 1 < chars.len() {
3041 result.push(char_tokens::BNULL);
3042 i += 1;
3043 result.push(chars[i]);
3044 } else if chars[i] == '$' {
3045 result.push(char_tokens::QSTRING);
3046 } else {
3047 result.push(chars[i]);
3048 }
3049 i += 1;
3050 }
3051 result.push(char_tokens::DNULL);
3052 }
3053 '$' => {
3054 result.push(char_tokens::STRING);
3055 if i + 1 < chars.len() {
3056 match chars[i + 1] {
3057 '{' => {
3058 result.push(char_tokens::INBRACE);
3059 i += 1;
3060 }
3061 '(' => {
3062 result.push(char_tokens::INPAR);
3063 i += 1;
3064 }
3065 _ => {}
3066 }
3067 }
3068 }
3069 '*' => result.push(char_tokens::STAR),
3070 '?' => result.push(char_tokens::QUEST),
3071 '[' => result.push(char_tokens::INBRACK),
3072 ']' => result.push(char_tokens::OUTBRACK),
3073 '{' => result.push(char_tokens::INBRACE),
3074 '}' => result.push(char_tokens::OUTBRACE),
3075 '~' => result.push(char_tokens::TILDE),
3076 '#' => result.push(char_tokens::POUND),
3077 '^' => result.push(char_tokens::HAT),
3078 _ => result.push(c),
3079 }
3080 i += 1;
3081 }
3082
3083 Ok(result)
3084}
3085
3086/// Untokenize a string - convert tokenized chars back to original
3087///
3088/// Port of untokenize() from exec.c (but used by lexer too)
3089/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3090/// stripping them. Used by callers that need the source form including
3091/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3092pub fn untokenize_preserve_quotes(s: &str) -> String {
3093 let mut result = String::with_capacity(s.len() + 4);
3094 for c in s.chars() {
3095 let cu = c as u32;
3096 if (0x83..=0x9f).contains(&cu) {
3097 match c {
3098 c if c == char_tokens::POUND => result.push('#'),
3099 c if c == char_tokens::STRING => result.push('$'),
3100 c if c == char_tokens::HAT => result.push('^'),
3101 c if c == char_tokens::STAR => result.push('*'),
3102 c if c == char_tokens::INPAR => result.push('('),
3103 c if c == char_tokens::OUTPAR => result.push(')'),
3104 c if c == char_tokens::INPARMATH => result.push('('),
3105 c if c == char_tokens::OUTPARMATH => result.push(')'),
3106 c if c == char_tokens::QSTRING => result.push('$'),
3107 c if c == char_tokens::EQUALS => result.push('='),
3108 c if c == char_tokens::BAR => result.push('|'),
3109 c if c == char_tokens::INBRACE => result.push('{'),
3110 c if c == char_tokens::OUTBRACE => result.push('}'),
3111 c if c == char_tokens::INBRACK => result.push('['),
3112 c if c == char_tokens::OUTBRACK => result.push(']'),
3113 c if c == char_tokens::TICK => result.push('`'),
3114 c if c == char_tokens::INANG => result.push('<'),
3115 c if c == char_tokens::OUTANG => result.push('>'),
3116 c if c == char_tokens::OUTANGPROC => result.push('>'),
3117 c if c == char_tokens::QUEST => result.push('?'),
3118 c if c == char_tokens::TILDE => result.push('~'),
3119 c if c == char_tokens::QTICK => result.push('`'),
3120 c if c == char_tokens::COMMA => result.push(','),
3121 c if c == char_tokens::DASH => result.push('-'),
3122 c if c == char_tokens::BANG => result.push('!'),
3123 c if c == char_tokens::SNULL => result.push('\''),
3124 c if c == char_tokens::DNULL => result.push('"'),
3125 c if c == char_tokens::BNULL => result.push('\\'),
3126 _ => {
3127 let idx = c as usize;
3128 if idx < char_tokens::ZTOKENS.len() {
3129 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3130 } else {
3131 result.push(c);
3132 }
3133 }
3134 }
3135 } else {
3136 result.push(c);
3137 }
3138 }
3139 result
3140}
3141
3142pub fn untokenize(s: &str) -> String {
3143 let mut result = String::with_capacity(s.len());
3144 let chars: Vec<char> = s.chars().collect();
3145 let mut i = 0;
3146
3147 while i < chars.len() {
3148 let c = chars[i];
3149 // Token chars live in zsh's META range (0x83 = META through 0x9f =
3150 // BNULL). Anything in that range needs un-mapping before display
3151 // or downstream consumption. The original `< 32` test was wrong —
3152 // none of zsh's tokens land in that range.
3153 let cu = c as u32;
3154 if (0x83..=0x9f).contains(&cu) {
3155 // Convert token back to original character
3156 match c {
3157 c if c == char_tokens::POUND => result.push('#'),
3158 c if c == char_tokens::STRING => result.push('$'),
3159 c if c == char_tokens::HAT => result.push('^'),
3160 c if c == char_tokens::STAR => result.push('*'),
3161 c if c == char_tokens::INPAR => result.push('('),
3162 c if c == char_tokens::OUTPAR => result.push(')'),
3163 c if c == char_tokens::INPARMATH => result.push('('),
3164 c if c == char_tokens::OUTPARMATH => result.push(')'),
3165 c if c == char_tokens::QSTRING => result.push('$'),
3166 c if c == char_tokens::EQUALS => result.push('='),
3167 c if c == char_tokens::BAR => result.push('|'),
3168 c if c == char_tokens::INBRACE => result.push('{'),
3169 c if c == char_tokens::OUTBRACE => result.push('}'),
3170 c if c == char_tokens::INBRACK => result.push('['),
3171 c if c == char_tokens::OUTBRACK => result.push(']'),
3172 c if c == char_tokens::TICK => result.push('`'),
3173 c if c == char_tokens::INANG => result.push('<'),
3174 c if c == char_tokens::OUTANG => result.push('>'),
3175 c if c == char_tokens::OUTANGPROC => result.push('>'),
3176 c if c == char_tokens::QUEST => result.push('?'),
3177 c if c == char_tokens::TILDE => result.push('~'),
3178 c if c == char_tokens::QTICK => result.push('`'),
3179 c if c == char_tokens::COMMA => result.push(','),
3180 c if c == char_tokens::DASH => result.push('-'),
3181 c if c == char_tokens::BANG => result.push('!'),
3182 c if c == char_tokens::SNULL
3183 || c == char_tokens::DNULL
3184 || c == char_tokens::BNULL =>
3185 {
3186 // Null markers - skip
3187 }
3188 _ => {
3189 // Unknown token, try ztokens lookup
3190 let idx = c as usize;
3191 if idx < char_tokens::ZTOKENS.len() {
3192 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3193 } else {
3194 result.push(c);
3195 }
3196 }
3197 }
3198 } else {
3199 result.push(c);
3200 }
3201 i += 1;
3202 }
3203
3204 result
3205}
3206
/// Check whether a string contains any token characters.
///
/// Token characters occupy 0x83..=0x9f, the same range `untokenize`
/// un-maps. Ordinary control characters such as tab and newline are
/// not tokens and do not count.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| (0x83..=0x9f).contains(&(c as u32)))
}
3211
/// Convert token characters to their printable form for display.
///
/// Thin alias for `untokenize`: token characters are mapped back to
/// their source characters and the SNULL/DNULL/BNULL markers are
/// dropped.
pub fn tokens_to_printable(s: &str) -> String {
    untokenize(s)
}
3216
#[cfg(test)]
mod tests {
    use super::*;

    /// Advance the lexer by one token and return the token kind.
    fn next_tok(lexer: &mut ZshLexer) -> LexTok {
        lexer.zshlex();
        lexer.tok
    }

    /// Assert that the next tokens are exactly `expected`, in order.
    fn expect_tokens(lexer: &mut ZshLexer, expected: &[LexTok]) {
        for want in expected {
            assert_eq!(next_tok(lexer), *want);
        }
    }

    /// Lex forward until `target` or end of input, then require `target`.
    fn skip_to(lexer: &mut ZshLexer, target: LexTok) {
        loop {
            lexer.zshlex();
            if lexer.tok == target || lexer.tok == LexTok::Endinput {
                break;
            }
        }
        assert_eq!(lexer.tok, target);
    }

    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");
        for word in ["echo", "hello"] {
            assert_eq!(next_tok(&mut lexer), LexTok::String);
            assert_eq!(lexer.tokstr, Some(word.to_string()));
        }
        assert_eq!(next_tok(&mut lexer), LexTok::Endinput);
    }

    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");
        expect_tokens(
            &mut lexer,
            &[LexTok::String, LexTok::Bar, LexTok::String, LexTok::String],
        );
    }

    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");
        expect_tokens(
            &mut lexer,
            &[LexTok::String, LexTok::Outang, LexTok::String],
        );
    }

    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");
        expect_tokens(
            &mut lexer,
            &[LexTok::String, LexTok::Dinang, LexTok::String],
        );
    }

    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");
        expect_tokens(&mut lexer, &[LexTok::String, LexTok::String]);
        // The quoted word is expected to carry Snull markers around
        // its literal content; here only its presence is checked.
        assert!(lexer.tokstr.is_some());
    }

    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Func,
            "expected Func, got {:?}",
            lexer.tok
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr, Some("foo".to_string()));

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");
        expect_tokens(&mut lexer, &[LexTok::String, LexTok::String]);
        // The quoted word should hold tokenized content.
        assert!(lexer.tokstr.is_some());
    }

    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");
        expect_tokens(&mut lexer, &[LexTok::String, LexTok::String]);
    }

    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        assert_eq!(next_tok(&mut lexer), LexTok::String);
    }

    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;
        assert_eq!(next_tok(&mut lexer), LexTok::Envarray);
    }

    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");
        // `diff`, then `<(ls)` and `>(cat)`, each lexed as one String.
        expect_tokens(
            &mut lexer,
            &[LexTok::String, LexTok::String, LexTok::String],
        );
    }

    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");
        expect_tokens(&mut lexer, &[LexTok::String, LexTok::String]);
    }

    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        // The three case terminators must appear in source order.
        skip_to(&mut lexer, LexTok::Dsemi);
        skip_to(&mut lexer, LexTok::Semiamp);
        skip_to(&mut lexer, LexTok::Semibar);
    }
}