zshrs_parse/lexer.rs
1//! Zsh lexical analyzer - Direct port from zsh/Src/lex.c
2//!
3//! This lexer tokenizes zsh shell input into a stream of tokens.
4//! It handles all zsh-specific syntax including:
5//! - Single/double/dollar quotes
6//! - Command substitution $(...) and `...`
7//! - Arithmetic $((...))
8//! - Parameter expansion ${...}
9//! - Process substitution <(...) >(...)
10//! - Here documents
11//! - All redirection operators
12//! - Comments
13//! - Continuation lines
14
15use crate::tokens::{char_tokens, LexTok};
16use std::collections::VecDeque;
17
/// Switches that alter how the lexer behaves.
///
/// Every flag is `false` in the `Default` value.
#[derive(Clone, Copy, Debug, Default)]
pub struct LexFlags {
    /// Lexing is being done on behalf of ZLE (the line editor), e.g. for
    /// completion.
    pub zle: bool,
    /// Hand newlines back as their own tokens instead of folding them
    /// into a generic separator.
    pub newline: bool,
    /// Keep comments in the output.
    pub comments_keep: bool,
    /// Remove comments from the output.
    pub comments_strip: bool,
    /// Active lexing (requested from bufferwords).
    pub active: bool,
}
32
/// Growable text buffer used while a token is being assembled.
///
/// `data` holds the accumulated text. `siz` is a logical allocation size
/// that starts at 256 and doubles as the text grows; it is carried around
/// with the buffer (e.g. when lexer context is saved and restored), so it
/// is kept even though `String` tracks its own capacity.
#[derive(Debug, Clone)]
struct LexBuf {
    data: String,
    siz: usize,
}

impl LexBuf {
    /// Create an empty buffer with the initial 256-byte allocation.
    fn new() -> Self {
        LexBuf {
            data: String::with_capacity(256),
            siz: 256,
        }
    }

    /// Drop the accumulated text. `siz` is left untouched.
    fn clear(&mut self) {
        self.data.clear();
    }

    /// Append one character, growing the logical size when it is reached.
    ///
    /// `siz` is doubled until it is strictly larger than the current
    /// length. A single doubling is not always enough: `add_str` can push
    /// the length far beyond `siz`, and a restored `siz` could even be 0.
    /// With one doubling, `siz - len` would underflow and panic.
    fn add(&mut self, c: char) {
        self.data.push(c);
        let len = self.data.len();
        if len >= self.siz {
            while self.siz <= len {
                // `.max(1)` keeps the loop finite if `siz` was ever 0.
                self.siz = self.siz.saturating_mul(2).max(1);
            }
            self.data.reserve(self.siz - len);
        }
    }

    /// Append a whole string slice. Does not update `siz`; the next
    /// `add` catches up.
    #[allow(dead_code)]
    fn add_str(&mut self, s: &str) {
        self.data.push_str(s);
    }

    /// Length of the accumulated text in bytes.
    fn len(&self) -> usize {
        self.data.len()
    }

    /// Borrow the accumulated text.
    fn as_str(&self) -> &str {
        &self.data
    }

    /// Consume the buffer and return the accumulated text.
    #[allow(dead_code)]
    fn into_string(self) -> String {
        self.data
    }

    /// Last character of the accumulated text, if any.
    #[allow(dead_code)]
    fn last_char(&self) -> Option<char> {
        self.data.chars().next_back()
    }

    /// Remove and return the last character, if any.
    fn pop(&mut self) -> Option<char> {
        self.data.pop()
    }
}
87
/// One here-document attached to a `<<` / `<<-` redirection.
#[derive(Clone, Debug)]
pub struct HereDoc {
    /// Terminator word with quote characters and quote markers removed.
    pub terminator: String,
    /// Set for the `<<-` form.
    pub strip_tabs: bool,
    /// Body text; empty until the body has been read.
    pub content: String,
    /// Set when the terminator was written quoted (`<<'EOF'`, `<<"EOF"`
    /// or `<<\EOF`). A quoted terminator turns off variable expansion,
    /// command substitution and arithmetic in the body.
    pub quoted: bool,
    /// Set once `process_heredocs` has read the body. Kept separate from
    /// "content is empty" because an empty here-document is legitimate.
    pub processed: bool,
}
103
/// The Zsh lexer.
///
/// Holds the input text, a push-back buffer, the most recently produced
/// token (`tok` / `tokstr`) and the context flags that decide how the
/// next token is classified.
pub struct ZshLexer<'a> {
    /// Input source
    pub(crate) input: &'a str,
    /// Current position in input, as a byte offset (advanced by each
    /// character's UTF-8 length).
    pub(crate) pos: usize,
    /// Look-ahead buffer for ungotten characters. Characters are pushed
    /// and popped at the front; injected alias text goes through the
    /// same buffer.
    unget_buf: VecDeque<char>,
    /// Current token string
    pub tokstr: Option<String>,
    /// Current token type
    pub tok: LexTok,
    /// File descriptor for redirections (e.g., 2> means fd=2).
    /// Initialised to -1.
    pub tokfd: i32,
    /// Line number at start of current token
    pub toklineno: u64,
    /// Current line number (starts at 1)
    pub lineno: u64,
    /// Lexer has stopped (EOF or error)
    pub lexstop: bool,
    /// In command position (can accept reserved words)
    pub incmdpos: bool,
    /// In condition [[ ... ]]; nonzero while inside.
    pub incond: i32,
    /// In pattern context (RHS of == != =~ in [[ ]])
    pub incondpat: bool,
    /// In case pattern
    pub incasepat: i32,
    /// In redirection
    pub inredir: bool,
    /// Saved `incmdpos` from before a redirop / for / foreach / select
    /// — restored on the NEXT non-redir token. Mirrors `static int oldpos`
    /// in C zsh's `ctxtlex` (lex.c:319). Required for cases like
    /// `for x ( ... )` where `(` after the var name should tokenize as
    /// INPAR — that depends on incmdpos being restored to 1 from before
    /// FOR was lexed, which in turn depends on this saved value.
    pub oldpos: bool,
    /// After 'for' keyword
    pub infor: i32,
    /// After 'repeat' keyword. Set to 1 when `repeat` is recognised and
    /// bumped on each subsequent `zshlex` call while nonzero.
    inrepeat: i32,
    /// Parsing typeset arguments
    pub intypeset: bool,
    /// Inside (( ... )) arithmetic
    dbparens: bool,
    /// Disable alias expansion
    pub noaliases: bool,
    /// Disable spelling correction
    pub nocorrect: i32,
    /// Disable comment recognition
    pub nocomments: bool,
    /// Lexer flags
    pub lexflags: LexFlags,
    /// Whether this is the first line
    pub isfirstln: bool,
    /// Whether this is the first char of command
    #[allow(dead_code)]
    isfirstch: bool,
    /// Pending here-documents
    pub heredocs: Vec<HereDoc>,
    /// Expecting heredoc terminator (0 = no, 1 = <<, 2 = <<-)
    heredoc_pending: u8,
    /// Token buffer
    lexbuf: LexBuf,
    /// After newline: 0 when the last token was not a newline, -1 for a
    /// newline with more input remaining, 1 for a newline at end of input.
    pub isnewlin: i32,
    /// Error message if any
    pub error: Option<String>,
    /// Global iteration counter for infinite loop detection; bumped once
    /// per `hgetc` call.
    global_iterations: usize,
    /// Recursion depth counter, compared against `MAX_LEXER_RECURSION`.
    recursion_depth: usize,
    /// Raw-input capture flag — when nonzero, every char read through
    /// `hgetc` is also appended to `tokstr_raw` via zshlex_raw_add.
    /// Direct mirror of zsh/Src/lex.c:161 `lex_add_raw`. Used by
    /// skipcomm (lex.c:2082) to preserve the literal text of `$(...)`
    /// command substitutions for re-execution / display.
    /// NOTE(review): the `hgetc` in this file does not visibly call
    /// `zshlex_raw_add` — confirm where the capture is wired in.
    pub lex_add_raw: i32,
    /// Raw-input capture buffer. Direct mirror of lex.c:165
    /// `tokstr_raw` / lex.c:166 `lexbuf_raw`. Combined into one
    /// `LexBuf` here since Rust's String tracks both the data and
    /// length internally.
    lexbuf_raw: LexBuf,
}
188
/// Upper bound on the lexer's recursion depth. Once `recursion_depth`
/// exceeds this, `check_recursion` records an error and stops the lexer.
const MAX_LEXER_RECURSION: usize = 200;
190
/// What `AliasResolver::lookup_alias` and `lookup_suffix_alias` report
/// about a single alias. Carries the `struct alias` fields zsh consults
/// at lex.c:1914-1943.
#[derive(Clone, Debug)]
pub struct AliasInfo {
    /// Replacement body that is fed back into the lexer.
    pub text: String,
    /// Recursion guard: an alias that is currently being expanded is not
    /// expanded again.
    pub in_use: bool,
    /// Global alias (expanded anywhere) as opposed to one that is only
    /// expanded in command position.
    pub global: bool,
}
201
/// Trait the lexer uses to look up aliases and reserved words during
/// `exalias`. Implementors typically delegate to the executor's
/// alias/reswd hash tables. Defining the trait here keeps lexer.rs
/// free of executor-specific types — same pattern zsh uses with the
/// hashtable.h opaque-handle approach against aliastab/reswdtab/
/// sufaliastab.
pub trait AliasResolver {
    /// Look up an alias by name. Returns `None` if not found, or the
    /// alias body + flags otherwise.
    fn lookup_alias(&self, name: &str) -> Option<AliasInfo>;
    /// Look up a suffix alias (e.g. `.txt → less`) by suffix only.
    /// The lexer passes the text after the last `.` of the word, without
    /// the dot itself.
    fn lookup_suffix_alias(&self, suffix: &str) -> Option<AliasInfo>;
    /// Resolve a reserved word. Returns the LexTok the word should
    /// promote to (e.g. "if" → IF), or None if not a reswd.
    fn lookup_reswd(&self, name: &str) -> Option<LexTok>;
    /// Mark an alias as in-use (recursion guard). Called when an
    /// alias is about to be expanded; the matching unmark happens
    /// when the alias text has been fully consumed by the lexer.
    /// For suffix aliases the lexer passes the suffix as `name`.
    /// NOTE(review): no call with `in_use == false` is visible in this
    /// part of the file — confirm where the unmark happens.
    fn mark_in_use(&mut self, name: &str, in_use: bool);
}
222
223/// Saved lexical state for nested-context handling. Direct port of
224/// `struct lex_stack` declared in zsh/Src/zsh.h and used by
225/// zsh/Src/lex.c:215-239 (`lex_context_save`) and lex.c:244-262
226/// (`lex_context_restore`). Used when entering command substitution,
227/// here-docs, or eval where the outer lexer state must be pushed and
228/// restored after the inner parse completes.
229#[derive(Debug, Clone)]
230pub struct LexStack {
231 pub dbparens: bool,
232 pub isfirstln: bool,
233 pub isfirstch: bool,
234 pub lexflags: LexFlags,
235 pub tok: LexTok,
236 pub tokstr: Option<String>,
237 pub lexbuf_data: String,
238 pub lexbuf_siz: usize,
239 pub lexstop: bool,
240 pub toklineno: u64,
241}
242
243impl Default for LexStack {
244 fn default() -> Self {
245 // Mirrors lex.c:235-238 reset state after a save: tokstr / lexbuf
246 // zeroed, lexbuf.siz back to the initial 256 alloc, tok to
247 // ENDINPUT (the C source doesn't explicitly reset tok here but
248 // the natural baseline is ENDINPUT — same as lexinit).
249 LexStack {
250 dbparens: false,
251 isfirstln: false,
252 isfirstch: false,
253 lexflags: LexFlags::default(),
254 tok: LexTok::Endinput,
255 tokstr: None,
256 lexbuf_data: String::new(),
257 lexbuf_siz: 256,
258 lexstop: false,
259 toklineno: 0,
260 }
261 }
262}
263
264impl<'a> ZshLexer<'a> {
265 /// Create a new lexer for the given input
266 pub fn new(input: &'a str) -> Self {
267 ZshLexer {
268 input,
269 pos: 0,
270 unget_buf: VecDeque::new(),
271 tokstr: None,
272 tok: LexTok::Endinput,
273 tokfd: -1,
274 toklineno: 1,
275 lineno: 1,
276 lexstop: false,
277 incmdpos: true,
278 incond: 0,
279 incondpat: false,
280 incasepat: 0,
281 inredir: false,
282 oldpos: true,
283 infor: 0,
284 inrepeat: 0,
285 intypeset: false,
286 dbparens: false,
287 noaliases: false,
288 nocorrect: 0,
289 nocomments: false,
290 lexflags: LexFlags::default(),
291 isfirstln: true,
292 isfirstch: true,
293 heredocs: Vec::new(),
294 heredoc_pending: 0,
295 lexbuf: LexBuf::new(),
296 isnewlin: 0,
297 error: None,
298 global_iterations: 0,
299 recursion_depth: 0,
300 lex_add_raw: 0,
301 lexbuf_raw: LexBuf::new(),
302 }
303 }
304
305 /// Append a char to the raw-input capture buffer. Direct port of
306 /// zsh/Src/lex.c:2024-2039 `zshlex_raw_add`. Called from hgetc
307 /// when `lex_add_raw` is nonzero so cmd-sub bodies (`$(...)`,
308 /// `<(...)`, `>(...)`) can be replayed verbatim without re-lexing.
309 pub fn zshlex_raw_add(&mut self, c: char) {
310 // lex.c:2027-2028 — guard on lex_add_raw flag.
311 if self.lex_add_raw == 0 {
312 return;
313 }
314 // lex.c:2030-2038 — append to lexbuf_raw. The C source manages
315 // explicit ptr/len/siz with hrealloc; Rust's String handles
316 // resize automatically.
317 self.lexbuf_raw.add(c);
318 }
319
320 /// Run alias / reserved-word expansion on the just-lexed token.
321 /// Direct port of zsh/Src/lex.c:1949-2021 `exalias`. Returns true
322 /// if an alias was injected (the caller's loop should re-run
323 /// gettok to consume the injected text).
324 ///
325 /// C source flow:
326 /// 1. Spell-correct (lex.c:1958-1962) — disabled in zshrs.
327 /// 2. If tokstr is None: set lextext from `tokstrings[tok]` and
328 /// checkalias against that (lex.c:1964-1969).
329 /// 3. Otherwise: untokenize tokstr into a working copy (lex.c:
330 /// 1971-1980).
331 /// 4. ZLE word-tracking: call gotword() if LEXFLAGS_ZLE
332 /// (lex.c:1982-1991).
333 /// 5. STRING tokens: try checkalias, then reservation lookup
334 /// (lex.c:1993-2015).
335 /// 6. Clear inalmore (lex.c:2016).
336 ///
337 /// Takes an `AliasResolver` trait object so the lexer doesn't
338 /// hard-depend on the executor's alias-table types. zshrs callers
339 /// implement `AliasResolver` over their alias hash tables.
340 pub fn exalias<R: AliasResolver>(&mut self, resolver: &mut R) -> bool {
341 // lex.c:1957 — `hwend()` ends the history-word region. zshrs's
342 // history layer doesn't track per-word boundaries here; no-op.
343
344 // lex.c:1958-1962 — spell correction via spckword. zshrs
345 // doesn't implement spell correction yet; documented divergence.
346
347 // lex.c:1964-1969 — bare-token path (no tokstr).
348 if self.tokstr.is_none() {
349 // lex.c:1965 — `zshlextext = tokstrings[tok];` — for tokens
350 // like SEMI/AMPER/etc. the canonical text comes from a
351 // static table. zshrs's check_alias_for_text uses the
352 // resolver directly with the token's text representation.
353 if self.tok == LexTok::Newlin {
354 return false;
355 }
356 // Use punctuation-token text; unknown tokens skip alias.
357 let text = match self.tok {
358 LexTok::Semi => ";",
359 LexTok::Amper => "&",
360 LexTok::Bar => "|",
361 _ => return false,
362 };
363 return self.check_alias(resolver, text);
364 }
365
366 let tokstr = self.tokstr.clone().unwrap();
367 // lex.c:1973-1980 — untokenize: convert the lexer's internal
368 // tokenized form (Pound..ztokens shifts) into the literal
369 // shell text. Call the global helper.
370 let lextext = if has_token(&tokstr) {
371 untokenize(&tokstr)
372 } else {
373 tokstr.clone()
374 };
375
376 // lex.c:1982-1991 — ZLE word-tracking for completion.
377 if self.lexflags.zle {
378 let zp = self.lexflags;
379 self.gotword();
380 // lex.c:1986-1990 — if gotword cleared lexflags, the cursor
381 // word has been reached; abort exalias so completion can
382 // capture the partial token unchanged.
383 if zp.zle && !self.lexflags.zle {
384 return false;
385 }
386 }
387
388 // lex.c:1993-2015 — STRING-token alias / reswd check.
389 if self.tok == LexTok::String {
390 // lex.c:1995 — `checkalias()`. POSIX-aliases gate skipped
391 // here (zshrs doesn't have the option flag wired).
392 if self.check_alias(resolver, &lextext) {
393 return true;
394 }
395
396 // lex.c:2002-2009 — reserved-word lookup. Fires when in
397 // command position OR when the text is bare `}` and
398 // IGNOREBRACES is unset (so `}` ends a brace block).
399 if self.incmdpos || lextext == "}" {
400 if let Some(rwtok) = resolver.lookup_reswd(&lextext) {
401 self.tok = rwtok;
402 if rwtok == LexTok::Repeat {
403 self.inrepeat = 1;
404 }
405 if rwtok == LexTok::Dinbrack {
406 self.incond = 1;
407 }
408 }
409 } else if self.incond > 0 && lextext == "]]" {
410 // lex.c:2010-2012 — `]]` closes the cond expression.
411 self.tok = LexTok::Doutbrack;
412 self.incond = 0;
413 } else if self.incond == 1 && lextext == "!" {
414 // lex.c:2013-2014 — `!` inside `[[ ]]` is the BANG
415 // negation, not a literal.
416 self.tok = LexTok::Bang;
417 }
418 }
419
420 // lex.c:2016 — `inalmore = 0;` — alias-more flag clears after
421 // any non-alias token.
422 // (zshrs's lexer doesn't have inalmore yet — added here would
423 // require gettok to track when an alias-pushed token has more
424 // text after it. Documented divergence.)
425
426 false
427 }
428
429 /// Helper for `exalias`. Direct port of zsh/Src/lex.c:1899-1947
430 /// `checkalias`. Returns true if the lookup matched (regular or
431 /// suffix alias) AND the alias text was successfully injected
432 /// back into the input stream for re-lexing.
433 fn check_alias<R: AliasResolver>(&mut self, resolver: &mut R, lextext: &str) -> bool {
434 // lex.c:1906-1907 — guard on null lextext.
435 if lextext.is_empty() {
436 return false;
437 }
438
439 // lex.c:1909-1911 — guard: alias expansion is disabled, or
440 // POSIX aliases require the token to be a STRING and not a
441 // reserved word.
442 if self.noaliases {
443 return false;
444 }
445
446 // lex.c:1914-1933 — regular alias lookup.
447 if let Some(alias) = resolver.lookup_alias(lextext) {
448 if !alias.in_use && (alias.global || (self.incmdpos && self.tok == LexTok::String)) {
449 // lex.c:1918-1927 — if the next char isn't blank,
450 // insert a space so the alias body can't accidentally
451 // join the following word.
452 if !self.lexstop {
453 if let Some(c) = self.peek() {
454 if !Self::is_blank(c) {
455 self.inject_alias_text(" ");
456 }
457 }
458 }
459 // lex.c:1928 — `inpush(an->text, INP_ALIAS, an);`
460 self.inject_alias_text(&alias.text);
461 resolver.mark_in_use(lextext, true);
462 self.lexstop = false;
463 return true;
464 }
465 }
466
467 // lex.c:1934-1943 — suffix-alias lookup. The token must end
468 // with `.SUFFIX`, the suffix name must be a registered
469 // suffix-alias, AND the lexer must be in command position.
470 if self.incmdpos {
471 if let Some(dot_pos) = lextext.rfind('.') {
472 if dot_pos > 0 && dot_pos + 1 < lextext.len() {
473 let suffix = &lextext[dot_pos + 1..];
474 if let Some(alias) = resolver.lookup_suffix_alias(suffix) {
475 if !alias.in_use {
476 // lex.c:1938-1940 — push three things in
477 // reverse: the alias text, a space, then
478 // the original word.
479 self.inject_alias_text(&alias.text);
480 self.inject_alias_text(" ");
481 self.inject_alias_text(lextext);
482 resolver.mark_in_use(suffix, true);
483 self.lexstop = false;
484 return true;
485 }
486 }
487 }
488 }
489 }
490
491 false
492 }
493
494 /// Push alias text back into the input stream so the lexer
495 /// re-reads it. Equivalent to zsh's `inpush(text, INP_ALIAS, an)`
496 /// at lex.c:1928,1938,1940. zshrs uses the existing `unget_buf`
497 /// (a VecDeque<char>) to inject chars in reverse order so the
498 /// next hgetc consumes them first.
499 fn inject_alias_text(&mut self, text: &str) {
500 // Insert at front in reverse so the first char of `text`
501 // comes out first.
502 for c in text.chars().rev() {
503 self.unget_buf.push_front(c);
504 }
505 }
506
507 /// Pop the last char from the raw-input capture buffer. Direct
508 /// port of zsh/Src/lex.c:2042-2049 `zshlex_raw_back`. Called when
509 /// the lexer ungets a char that was just captured raw — the raw
510 /// buffer must mirror the live input so this undoes the last add.
511 pub fn zshlex_raw_back(&mut self) {
512 // lex.c:2045-2046 — guard.
513 if self.lex_add_raw == 0 {
514 return;
515 }
516 // lex.c:2047-2048 — `lexbuf_raw.ptr--; lexbuf_raw.len--;`
517 self.lexbuf_raw.pop();
518 }
519
520 /// Mark the current raw-buffer offset (for restore later). Direct
521 /// port of zsh/Src/lex.c:2052-2058 `zshlex_raw_mark`. Returns
522 /// `len + offset` so callers can restore via `back_to_mark`.
523 pub fn zshlex_raw_mark(&self, offset: i64) -> i64 {
524 // lex.c:2055-2056 — guard.
525 if self.lex_add_raw == 0 {
526 return 0;
527 }
528 // lex.c:2057 — `return lexbuf_raw.len + offset;`
529 (self.lexbuf_raw.len() as i64) + offset
530 }
531
532 /// Restore raw-buffer offset to a previously-saved mark. Direct
533 /// port of zsh/Src/lex.c:2061-2068 `zshlex_raw_back_to_mark`.
534 /// Truncates the raw buffer to `mark` bytes — undoes any captures
535 /// since the mark was taken (used when a speculative parse fails
536 /// and the lexer rolls back).
537 pub fn zshlex_raw_back_to_mark(&mut self, mark: i64) {
538 // lex.c:2064-2065 — guard.
539 if self.lex_add_raw == 0 {
540 return;
541 }
542 // lex.c:2066-2067 — `lexbuf_raw.ptr = tokstr_raw + mark;
543 // lexbuf_raw.len = mark;` — Rust truncate handles both.
544 let m = mark.max(0) as usize;
545 self.lexbuf_raw.data.truncate(m);
546 }
547
548 /// Take the captured raw-input buffer, clearing it. Useful for
549 /// callers that need the literal command-sub body after lexing
550 /// (e.g. compile-time string capture for `$(...)`).
551 pub fn take_raw_buf(&mut self) -> String {
552 std::mem::take(&mut self.lexbuf_raw.data)
553 }
554
555 /// Save lexical context onto a `LexStack`. Direct port of
556 /// zsh/Src/lex.c:215-239 `lex_context_save`. After save, the lexer
557 /// is in a clean state suitable for parsing a nested input (command
558 /// substitution body, here-doc terminator, eval'd string).
559 pub fn lex_context_save(&mut self, ls: &mut LexStack) {
560 // lex.c:220-233 — copy live state into the stack.
561 ls.dbparens = self.dbparens;
562 ls.isfirstln = self.isfirstln;
563 ls.isfirstch = self.isfirstch;
564 ls.lexflags = self.lexflags;
565 ls.tok = self.tok;
566 ls.tokstr = self.tokstr.take();
567 ls.lexbuf_data = std::mem::take(&mut self.lexbuf.data);
568 ls.lexbuf_siz = self.lexbuf.siz;
569 ls.lexstop = self.lexstop;
570 ls.toklineno = self.toklineno;
571
572 // lex.c:235-238 — reset live state to defaults so a nested
573 // parse starts from a clean slate. tokstr/lexbuf are zeroed,
574 // lexbuf.siz reset to 256 (the C-source initial alloc).
575 self.tokstr = None;
576 self.lexbuf.data.clear();
577 self.lexbuf.siz = 256;
578 }
579
580 /// Restore lexical context from a `LexStack`. Direct port of
581 /// zsh/Src/lex.c:244-262 `lex_context_restore`. Inverse of
582 /// `lex_context_save`. Called after the nested parse completes.
583 pub fn lex_context_restore(&mut self, ls: &mut LexStack) {
584 // lex.c:249-261 — copy stack state back into live fields.
585 self.dbparens = ls.dbparens;
586 self.isfirstln = ls.isfirstln;
587 self.isfirstch = ls.isfirstch;
588 self.lexflags = ls.lexflags;
589 self.tok = ls.tok;
590 self.tokstr = ls.tokstr.take();
591 self.lexbuf.data = std::mem::take(&mut ls.lexbuf_data);
592 self.lexbuf.siz = ls.lexbuf_siz;
593 self.lexstop = ls.lexstop;
594 self.toklineno = ls.toklineno;
595 }
596
597 /// Initialize lexical state. Direct port of zsh/Src/lex.c:440-445
598 /// `lexinit`. Resets dbparens / nocorrect / lexstop and sets `tok`
599 /// to ENDINPUT so the next gettok starts from a known baseline.
600 /// Note: the constructor `Self::new` already sets equivalent
601 /// defaults; this method exists for the rare case a caller wants
602 /// to recycle a `ZshLexer` across multiple input strings.
603 pub fn lexinit(&mut self) {
604 // lex.c:443 — `nocorrect = dbparens = lexstop = 0;`
605 self.nocorrect = 0;
606 self.dbparens = false;
607 self.lexstop = false;
608 // lex.c:444 — `tok = ENDINPUT;`
609 self.tok = LexTok::Endinput;
610 }
611
612 /// Check recursion depth; returns true if exceeded
613 #[inline]
614 fn check_recursion(&mut self) -> bool {
615 if self.recursion_depth > MAX_LEXER_RECURSION {
616 self.error = Some("lexer exceeded max recursion depth".to_string());
617 self.lexstop = true;
618 true
619 } else {
620 false
621 }
622 }
623
624 /// Check and increment global iteration counter; returns true if limit exceeded
625 /// Soft cap on `hgetc` invocations — an infinite-loop tripwire.
626 /// Real-world scripts: zinit.zsh ~5K lines / ~200KB, p10k's
627 /// internal/p10k.zsh ~10K lines / ~360KB, the user's daily-driver
628 /// `.zshrc` + zpwr stack collectively crosses 1M+ chars per shell
629 /// invocation. The previous 50K cap was tripped by p10k by line
630 /// 1277 (well below its actual 10K-line size). 100M chars handles
631 /// every reasonable script while still bailing out of a real
632 /// runaway lexer state machine.
633 const LEXER_HGETC_CAP: u64 = 100_000_000;
634
635 #[inline]
636 fn check_iterations(&mut self) -> bool {
637 self.global_iterations += 1;
638 if self.global_iterations as u64 > Self::LEXER_HGETC_CAP {
639 self.error = Some(format!(
640 "lexer exceeded {} hgetc iterations — possible infinite loop",
641 Self::LEXER_HGETC_CAP
642 ));
643 self.lexstop = true;
644 self.tok = LexTok::Lexerr;
645 true
646 } else {
647 false
648 }
649 }
650
651 /// Get next character from input
652 fn hgetc(&mut self) -> Option<char> {
653 if self.check_iterations() {
654 return None;
655 }
656
657 // Re-read from unget_buf: increment lineno on `\n` HERE
658 // too. hungetc() decremented lineno when the char was put
659 // back; without a matching increment on the way out, every
660 // `\n` that's ungetted-then-reread leaves lineno
661 // permanently one short. Symptom: $LINENO stuck at 1 in
662 // every script statement because the parser ungets the
663 // separating newline once between statements.
664 if let Some(c) = self.unget_buf.pop_front() {
665 if c == '\n' {
666 self.lineno += 1;
667 }
668 return Some(c);
669 }
670
671 let c = self.input[self.pos..].chars().next()?;
672 self.pos += c.len_utf8();
673
674 if c == '\n' {
675 self.lineno += 1;
676 }
677
678 Some(c)
679 }
680
681 /// Put character back into input
682 fn hungetc(&mut self, c: char) {
683 self.unget_buf.push_front(c);
684 if c == '\n' && self.lineno > 1 {
685 self.lineno -= 1;
686 }
687 self.lexstop = false;
688 }
689
690 /// Peek at next character without consuming
691 #[allow(dead_code)]
692 fn peek(&mut self) -> Option<char> {
693 if let Some(&c) = self.unget_buf.front() {
694 return Some(c);
695 }
696 self.input[self.pos..].chars().next()
697 }
698
    /// Add character to token buffer.
    ///
    /// Thin wrapper that forwards `c` to the token buffer's own `add`.
    fn add(&mut self, c: char) {
        self.lexbuf.add(c);
    }
703
704 /// Check if character is blank (space or tab)
705 fn is_blank(c: char) -> bool {
706 c == ' ' || c == '\t'
707 }
708
709 /// Peek for a zsh numeric range glob shape after a `<`: returns the
710 /// captured `N*-M*>` (everything *after* the leading `<`) when the
711 /// upcoming chars match `[0-9]*-[0-9]*>` exactly. Otherwise returns
712 /// None and leaves the input untouched.
713 fn try_numeric_range_glob(&mut self) -> Option<String> {
714 let mut buf: Vec<char> = Vec::new();
715 // optional leading digits
716 loop {
717 match self.hgetc() {
718 Some(c) if c.is_ascii_digit() => buf.push(c),
719 Some(c) => {
720 buf.push(c);
721 break;
722 }
723 None => break,
724 }
725 }
726 // last char in buf must be '-' for the range form
727 if buf.last() != Some(&'-') {
728 for c in buf.iter().rev() {
729 self.hungetc(*c);
730 }
731 return None;
732 }
733 // optional trailing digits
734 loop {
735 match self.hgetc() {
736 Some(c) if c.is_ascii_digit() => buf.push(c),
737 Some(c) => {
738 buf.push(c);
739 break;
740 }
741 None => break,
742 }
743 }
744 if buf.last() != Some(&'>') {
745 for c in buf.iter().rev() {
746 self.hungetc(*c);
747 }
748 return None;
749 }
750 Some(buf.into_iter().collect())
751 }
752
753 /// Check if character is blank (including other whitespace except newline)
754 fn is_inblank(c: char) -> bool {
755 matches!(c, ' ' | '\t' | '\x0b' | '\x0c' | '\r')
756 }
757
758 /// Check if character is a digit
759 fn is_digit(c: char) -> bool {
760 c.is_ascii_digit()
761 }
762
763 /// Check if character is identifier start
764 #[allow(dead_code)]
765 fn is_ident_start(c: char) -> bool {
766 c.is_ascii_alphabetic() || c == '_'
767 }
768
769 /// Check if character is identifier continuation
770 fn is_ident(c: char) -> bool {
771 c.is_ascii_alphanumeric() || c == '_'
772 }
773
774 /// Main lexer entry point — fetch the next token. Direct port of
775 /// zsh/Src/lex.c:265-313 `zshlex`. Loop body matches the C source
776 /// `do { ... } while (tok != ENDINPUT && exalias())` at lex.c:270-276,
777 /// followed by here-doc draining (lex.c:278-306), newline tracking
778 /// (lex.c:307-310), and SEMI/NEWLIN→SEPER folding (lex.c:311-312).
779 ///
780 /// zshrs port note: `exalias()` (lex.c:1953) is not yet wired into
781 /// the loop. The C source iterates as long as exalias keeps
782 /// re-injecting alias text into the input buffer; zshrs's alias
783 /// expansion happens post-lex in exec.rs. The loop body therefore
784 /// runs once and breaks unconditionally — documented divergence.
785 pub fn zshlex(&mut self) {
786 // lex.c:268-269 — early-out on prior LEXERR.
787 if self.tok == LexTok::Lexerr {
788 return;
789 }
790
791 // Note: Do NOT reset global_iterations here - it must accumulate across all
792 // zshlex calls in a parse to prevent infinite loops in the parser
793
794 // lex.c:270-276 — gettok / exalias one-pass body. The C source
795 // wraps gettok in `do { ... } while (exalias())` so an alias
796 // re-injection re-enters the lex. Until exalias is wired we
797 // run the body exactly once, no loop scaffolding.
798 // lex.c:271-272 — bump inrepeat counter for `repeat N {}`
799 // detection.
800 if self.inrepeat > 0 {
801 self.inrepeat += 1;
802 }
803 // lex.c:273-274 — at the third token after `repeat`,
804 // SHORTLOOPS / SHORTREPEAT options force back into cmd
805 // position so the loop body can start. zshrs unconditionally
806 // does this since the option-lookup lives in exec.rs.
807 if self.inrepeat == 3 {
808 self.incmdpos = true;
809 }
810
811 // lex.c:275 — `tok = gettok();`
812 self.tok = self.gettok();
813
814 // lex.c:277 — `nocorrect &= 1;` — clear bit 1 (lookahead-only)
815 // so the persistent low bit survives but the per-word bit is
816 // dropped.
817 self.nocorrect &= 1;
818
819 // lex.c:278-306 — drain pending here-documents at the start
820 // of a new line. zshrs's process_heredocs reads the full body
821 // and stitches it onto the matching redir token.
822 if self.tok == LexTok::Newlin || self.tok == LexTok::Endinput {
823 self.process_heredocs();
824 }
825
826 // lex.c:307-310 — track whether we just saw a newline.
827 // C uses `inbufct` to distinguish "newline at EOF" (=1)
828 // from "newline mid-input" (=-1); zshrs reads `pos < len`.
829 if self.tok != LexTok::Newlin {
830 self.isnewlin = 0;
831 } else {
832 self.isnewlin = if self.pos < self.input.len() { -1 } else { 1 };
833 }
834
835 // lex.c:311-312 — fold SEMI / NEWLIN into SEPER unless
836 // LEXFLAGS_NEWLINE is set to preserve newlines (used by
837 // ZLE for completion of partial lines).
838 if self.tok == LexTok::Semi || (self.tok == LexTok::Newlin && !self.lexflags.newline) {
839 self.tok = LexTok::Seper;
840 }
841
842 // Reserved-word promotion. Per lex.c:2002-2005 in `exalias`:
843 // - `{` only promotes to INBRACE in command position
844 // - `}` promotes to OUTBRACE either in cmdpos OR via the
845 // special `closing-brace-special` rule (IGNOREBRACES unset
846 // — assumed since zshrs doesn't expose that option yet)
847 // - other reserved words: only when incmdpos (or `}` exception)
848 if self.tok == LexTok::String {
849 if let Some(ref s) = self.tokstr {
850 if s == "{" && self.incmdpos {
851 self.tok = LexTok::Inbrace;
852 } else if s == "}" {
853 self.tok = LexTok::Outbrace;
854 } else if self.incasepat == 0 {
855 // Skip reserved word checking in case pattern context —
856 // words like `time`, `end` should be patterns, not
857 // keywords.
858 self.check_reserved_word();
859 }
860 }
861 }
862
863 // If we were expecting a heredoc terminator, register it now
864 if self.heredoc_pending > 0 && self.tok == LexTok::String {
865 if let Some(ref terminator) = self.tokstr {
866 let strip_tabs = self.heredoc_pending == 2;
867 // Detect originally-quoted terminator (`<<'EOF'`,
868 // `<<"EOF"`). The lexer wraps single-quoted text in
869 // SNULL (`\u{9d}`) and double-quoted text in DNULL
870 // (`\u{9e}`); plain `EOF` has neither. Quoted-terminator
871 // heredocs disable variable / command-sub / arithmetic
872 // expansion in the body — see `compile_redir` for the
873 // expansion side.
874 // Quoted terminators (`<<'EOF'`, `<<"EOF"`, `<<\EOF`)
875 // disable expansion in the body. SNULL/DNULL mark
876 // single/double-quoted spans; BNULL (`\u{9f}`) marks
877 // any backslash-escaped char — its presence alone is
878 // enough to flag the terminator as quoted (zsh's
879 // `<<\EOF` shorthand for `<<'EOF'`).
880 let quoted = terminator.contains('\u{9d}')
881 || terminator.contains('\u{9e}')
882 || terminator.contains('\u{9f}')
883 || terminator.starts_with('\'')
884 || terminator.starts_with('"');
885 let term = terminator
886 .chars()
887 .filter(|c| {
888 *c != '\''
889 && *c != '"'
890 && *c != '\u{9d}'
891 && *c != '\u{9e}'
892 && *c != '\u{9f}'
893 })
894 .collect::<String>();
895 self.heredocs.push(HereDoc {
896 terminator: term,
897 strip_tabs,
898 content: String::new(),
899 quoted,
900 processed: false,
901 });
902 }
903 self.heredoc_pending = 0;
904 }
905
906 // Track pattern context inside [[ ... ]] - after = == != =~ the RHS is a pattern
907 if self.incond > 0 {
908 if let Some(ref s) = self.tokstr {
909 // Check if this token is a comparison operator
910 // Note: single = is also a comparison operator in [[ ]]
911 // The internal marker \u{8d} is used for =
912 if s == "="
913 || s == "=="
914 || s == "!="
915 || s == "=~"
916 || s == "\u{8d}"
917 || s == "\u{8d}\u{8d}"
918 || s == "!\u{8d}"
919 || s == "\u{8d}~"
920 || s == "\u{8d}\u{98}"
921 {
922 self.incondpat = true;
923 } else if self.incondpat {
924 // We were in pattern context, now we've consumed the pattern
925 // Reset after the pattern token is consumed
926 // But actually, pattern can span multiple tokens, so we should
927 // stay in pattern mode until ]] or && or ||
928 }
929 }
930 // Reset pattern context on ]] or logical operators (&&, ||)
931 // and grouping parens. zsh par_cond_3 (cond.c) treats
932 // these as cond-pattern terminators — the next operand is
933 // a fresh primary, NOT a continuation of the prior pattern.
934 // Without resetting on Damper/Dbar/Inpar/Outpar, the `(`
935 // after `[[ a == a && (b == b ... ` was lexed as a literal
936 // glob char (incondpat=true → gettokstr) and the whole
937 // remainder collapsed into one String token.
938 match self.tok {
939 LexTok::Doutbrack
940 | LexTok::Damper
941 | LexTok::Dbar
942 | LexTok::Inpar
943 | LexTok::Outpar
944 | LexTok::Bang => {
945 self.incondpat = false;
946 }
947 _ => {}
948 }
949 } else {
950 self.incondpat = false;
951 }
952
953 // Update command position for next token based on current token
954 // Note: In case patterns (incasepat > 0), | is a pattern separator, not pipeline,
955 // so we don't set incmdpos after Bar in that context
956 match self.tok {
957 LexTok::Seper
958 | LexTok::Newlin
959 | LexTok::Semi
960 | LexTok::Dsemi
961 | LexTok::Semiamp
962 | LexTok::Semibar
963 | LexTok::Amper
964 | LexTok::Amperbang
965 | LexTok::Inpar
966 | LexTok::Inbrace
967 | LexTok::Dbar
968 | LexTok::Damper
969 | LexTok::Baramp
970 | LexTok::Inoutpar
971 | LexTok::Doloop
972 | LexTok::Then
973 | LexTok::Elif
974 | LexTok::Else
975 | LexTok::Doutbrack
976 | LexTok::Func => {
977 self.incmdpos = true;
978 }
979 LexTok::Bar
980 // In case patterns, | is a pattern separator - don't change incmdpos
981 if self.incasepat <= 0 => {
982 self.incmdpos = true;
983 }
984 LexTok::String
985 | LexTok::Typeset
986 | LexTok::Envarray
987 | LexTok::Outpar
988 | LexTok::Case
989 | LexTok::Dinbrack => {
990 self.incmdpos = false;
991 }
992 _ => {}
993 }
994
995 // Track 'for' keyword for C-style for loop: for (( init; cond; step ))
996 // When we see 'for', set infor=2 to expect the init and cond parts
997 // Each Dinpar (after semicolon in arithmetic) decrements it
998 if self.tok != LexTok::Dinpar {
999 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
1000 }
1001
1002
1003 // Handle redirection / for-loop context. Mirrors lex.c:359-368
1004 // ctxtlex `oldpos` save/restore. The saved value lives in
1005 // `self.oldpos` (struct field) so it survives across zshlex
1006 // calls — the previous local `let oldpos = self.incmdpos`
1007 // captured the JUST-updated value (always wrong) and lost the
1008 // pre-FOR incmdpos. With the field, FOR x → STRING x → INPAR
1009 // sequence correctly restores incmdpos=1 before the `(`.
1010 if self.tok.is_redirop()
1011 || self.tok == LexTok::For
1012 || self.tok == LexTok::Foreach
1013 || self.tok == LexTok::Select
1014 {
1015 self.inredir = true;
1016 self.oldpos = self.incmdpos;
1017 self.incmdpos = false;
1018 } else if self.inredir {
1019 self.incmdpos = self.oldpos;
1020 self.inredir = false;
1021 }
1022 }
1023
1024 /// Process pending here-documents. Walks each heredoc whose body
1025 /// hasn't been filled yet (content is empty AND terminator is set),
1026 /// reads lines from input until the terminator, and stuffs the body
1027 /// into `hdoc.content` IN PLACE. The list itself is preserved so the
1028 /// parser can index into it after parse() finishes.
1029 fn process_heredocs(&mut self) {
1030 let n = self.heredocs.len();
1031 for i in 0..n {
1032 // Skip heredocs we've already processed AND those without
1033 // a terminator (early-error case). The `processed` bool
1034 // distinguishes "filled with empty body" from "not yet
1035 // visited" — both have empty `content`.
1036 if self.heredocs[i].processed || self.heredocs[i].terminator.is_empty() {
1037 continue;
1038 }
1039 let strip_tabs = self.heredocs[i].strip_tabs;
1040 let terminator = self.heredocs[i].terminator.clone();
1041 let mut content = String::new();
1042 let mut line_count = 0;
1043
1044 loop {
1045 line_count += 1;
1046 if line_count > 10000 {
1047 self.error = Some("heredoc exceeded 10000 lines".to_string());
1048 self.tok = LexTok::Lexerr;
1049 return;
1050 }
1051
1052 let line = self.read_line();
1053 if line.is_none() {
1054 self.error = Some("here document too large or unterminated".to_string());
1055 self.tok = LexTok::Lexerr;
1056 return;
1057 }
1058
1059 let line = line.unwrap();
1060 let check_line = if strip_tabs {
1061 line.trim_start_matches('\t')
1062 } else {
1063 line.as_str()
1064 };
1065
1066 if check_line.trim_end_matches('\n') == terminator {
1067 break;
1068 }
1069
1070 // `<<-` strips leading tabs from BODY lines too, not just
1071 // from terminator-match comparison. Without this, tabs in
1072 // here-doc content survive into stdin.
1073 if strip_tabs {
1074 content.push_str(check_line);
1075 } else {
1076 content.push_str(&line);
1077 }
1078 }
1079
1080 self.heredocs[i].content = content;
1081 self.heredocs[i].processed = true;
1082 }
1083 }
1084
1085 /// Read a line from input (returns partial line at EOF)
1086 fn read_line(&mut self) -> Option<String> {
1087 let mut line = String::new();
1088
1089 loop {
1090 match self.hgetc() {
1091 Some(c) => {
1092 line.push(c);
1093 if c == '\n' {
1094 break;
1095 }
1096 }
1097 None => {
1098 // EOF - return partial line if any
1099 if line.is_empty() {
1100 return None;
1101 }
1102 break;
1103 }
1104 }
1105 }
1106
1107 Some(line)
1108 }
1109
1110 /// Get the next token. Direct port of zsh/Src/lex.c:613-936
1111 /// `gettok`. Reads characters from the input via hgetc, dispatches
1112 /// on the leading char through lexact1[]/lexact2[] tables (zshrs
1113 /// uses inline `match` in lex_initial / lex_inang / lex_outang
1114 /// since Rust pattern-matching subsumes the table dispatch).
1115 ///
1116 /// Structural divergence from C: the giant ~322-line C switch
1117 /// statement at lex.c:725-936 is split into helper methods in
1118 /// Rust (lex_initial = LX1_OTHER plus the punctuation cases,
1119 /// lex_inang / lex_outang for the < and > arms). The flow is
1120 /// equivalent — same chars consumed, same tokens emitted — but
1121 /// the source-level layout differs. C's table-driven dispatch
1122 /// would Rust-port as `match c { '\\' => ..., '\n' => ..., ... }`
1123 /// which is what the helpers ultimately do.
1124 fn gettok(&mut self) -> LexTok {
1125 // lex.c:621 — `tokstr = NULL;` reset before each token.
1126 self.tokstr = None;
1127 // (zshrs-specific: tokfd reset lives here too — C does it
1128 // implicitly via the `peekfd = -1` local at lex.c:617 used
1129 // only when a digit-prefix redirection is detected.)
1130 self.tokfd = -1;
1131
1132 // lex.c:622 — `while (iblank(c = hgetc()) && !lexstop);` —
1133 // skip leading blanks (space/tab, NOT newline).
1134 let mut ws_iterations = 0;
1135 loop {
1136 ws_iterations += 1;
1137 if ws_iterations > 100_000 {
1138 self.error = Some("gettok: infinite loop in whitespace skip".to_string());
1139 return LexTok::Lexerr;
1140 }
1141 let c = match self.hgetc() {
1142 Some(c) => c,
1143 None => {
1144 // lex.c:624-625 — lexstop set, return ENDINPUT
1145 // (or LEXERR if errflag is set elsewhere).
1146 self.lexstop = true;
1147 return if self.error.is_some() {
1148 LexTok::Lexerr
1149 } else {
1150 LexTok::Endinput
1151 };
1152 }
1153 };
1154
1155 if !Self::is_blank(c) {
1156 self.hungetc(c);
1157 break;
1158 }
1159 }
1160
1161 let c = match self.hgetc() {
1162 Some(c) => c,
1163 None => {
1164 self.lexstop = true;
1165 return LexTok::Endinput;
1166 }
1167 };
1168
1169 // lex.c:623 — `toklineno = lineno;`
1170 self.toklineno = self.lineno;
1171 // lex.c:626 — `isfirstln = 0;` once we've consumed any non-
1172 // blank.
1173 self.isfirstln = false;
1174
1175 // lex.c:631-648 — dbparens (inside `(( … ))`) special path:
1176 // call dquote_parse with `;` or `)` as the end-char and
1177 // either return DINPAR (continue for-loop arith) or DOUTPAR
1178 // (close the arith block) or LEXERR.
1179 if self.dbparens {
1180 return self.lex_arith(c);
1181 }
1182
1183 // lex.c:649-668 — digit prefix on a redirection: `2> file`
1184 // treats `2` as the fd to redirect, not a literal arg. Three
1185 // shapes: `N>`/`N<` (single redir), `N&>` (errwrite), or
1186 // anything else (push back, treat as literal digit).
1187 if Self::is_digit(c) {
1188 let d = self.hgetc();
1189 match d {
1190 Some('&') => {
1191 let e = self.hgetc();
1192 if e == Some('>') {
1193 // lex.c:653-657 — `N&>` shape detected.
1194 self.tokfd = (c as u8 - b'0') as i32;
1195 self.hungetc('>');
1196 return self.lex_initial('&');
1197 }
1198 // lex.c:658-661 — not `N&>`, push everything back.
1199 if let Some(e) = e {
1200 self.hungetc(e);
1201 }
1202 self.hungetc('&');
1203 }
1204 Some('>') | Some('<') => {
1205 // lex.c:662-664 — `N>` or `N<` shape detected.
1206 self.tokfd = (c as u8 - b'0') as i32;
1207 return self.lex_initial(d.unwrap());
1208 }
1209 Some(d) => {
1210 // lex.c:665-668 — not a redir prefix, push back.
1211 self.hungetc(d);
1212 }
1213 None => {}
1214 }
1215 self.lexstop = false;
1216 }
1217
1218 // lex.c:670-936 — main dispatch on the leading char. zshrs
1219 // delegates to lex_initial which holds the equivalent of
1220 // lex.c's `switch (lexact1[c])` plus the gettokstr fallback
1221 // for LX1_OTHER.
1222 self.lex_initial(c)
1223 }
1224
1225 /// Lex (( ... )) arithmetic expression
1226 fn lex_arith(&mut self, c: char) -> LexTok {
1227 self.lexbuf.clear();
1228 self.hungetc(c);
1229
1230 let end_char = if self.infor > 0 { ';' } else { ')' };
1231 if self.dquote_parse(end_char, false).is_err() {
1232 return LexTok::Lexerr;
1233 }
1234
1235 self.tokstr = Some(self.lexbuf.as_str().to_string());
1236
1237 if !self.lexstop && self.infor > 0 {
1238 self.infor -= 1;
1239 return LexTok::Dinpar;
1240 }
1241
1242 // Check for closing ))
1243 match self.hgetc() {
1244 Some(')') => {
1245 self.dbparens = false;
1246 LexTok::Doutpar
1247 }
1248 c => {
1249 if let Some(c) = c {
1250 self.hungetc(c);
1251 }
1252 LexTok::Lexerr
1253 }
1254 }
1255 }
1256
1257 /// Handle initial character of token
1258 fn lex_initial(&mut self, c: char) -> LexTok {
1259 // Handle comments
1260 if c == '#' && !self.nocomments {
1261 return self.lex_comment();
1262 }
1263
1264 match c {
1265 '\\' => {
1266 let d = self.hgetc();
1267 if d == Some('\n') {
1268 // Line continuation - get next token
1269 return self.gettok();
1270 }
1271 if let Some(d) = d {
1272 self.hungetc(d);
1273 }
1274 self.lexstop = false;
1275 self.gettokstr(c, false)
1276 }
1277
1278 '\n' => LexTok::Newlin,
1279
1280 ';' => {
1281 let d = self.hgetc();
1282 match d {
1283 Some(';') => LexTok::Dsemi,
1284 Some('&') => LexTok::Semiamp,
1285 Some('|') => LexTok::Semibar,
1286 _ => {
1287 if let Some(d) = d {
1288 self.hungetc(d);
1289 }
1290 self.lexstop = false;
1291 LexTok::Semi
1292 }
1293 }
1294 }
1295
1296 '&' => {
1297 let d = self.hgetc();
1298 match d {
1299 Some('&') => LexTok::Damper,
1300 Some('!') | Some('|') => LexTok::Amperbang,
1301 Some('>') => {
1302 self.tokfd = self.tokfd.max(0);
1303 let e = self.hgetc();
1304 match e {
1305 Some('!') | Some('|') => LexTok::Outangampbang,
1306 Some('>') => {
1307 let f = self.hgetc();
1308 match f {
1309 Some('!') | Some('|') => LexTok::Doutangampbang,
1310 _ => {
1311 if let Some(f) = f {
1312 self.hungetc(f);
1313 }
1314 self.lexstop = false;
1315 LexTok::Doutangamp
1316 }
1317 }
1318 }
1319 _ => {
1320 if let Some(e) = e {
1321 self.hungetc(e);
1322 }
1323 self.lexstop = false;
1324 LexTok::Ampoutang
1325 }
1326 }
1327 }
1328 _ => {
1329 if let Some(d) = d {
1330 self.hungetc(d);
1331 }
1332 self.lexstop = false;
1333 LexTok::Amper
1334 }
1335 }
1336 }
1337
1338 '|' => {
1339 let d = self.hgetc();
1340 match d {
1341 Some('|') if self.incasepat <= 0 => LexTok::Dbar,
1342 Some('&') => LexTok::Baramp,
1343 _ => {
1344 if let Some(d) = d {
1345 self.hungetc(d);
1346 }
1347 self.lexstop = false;
1348 LexTok::Bar
1349 }
1350 }
1351 }
1352
1353 '(' => {
1354 let d = self.hgetc();
1355 match d {
1356 Some('(') => {
1357 if self.infor > 0 {
1358 self.dbparens = true;
1359 return LexTok::Dinpar;
1360 }
1361 if self.incmdpos {
1362 // Could be (( arithmetic )) or ( subshell )
1363 self.lexbuf.clear();
1364 match self.cmd_or_math() {
1365 CmdOrMath::Math => {
1366 self.tokstr = Some(self.lexbuf.as_str().to_string());
1367 return LexTok::Dinpar;
1368 }
1369 CmdOrMath::Cmd => {
1370 self.tokstr = None;
1371 return LexTok::Inpar;
1372 }
1373 CmdOrMath::Err => return LexTok::Lexerr,
1374 }
1375 }
1376 self.hungetc('(');
1377 self.lexstop = false;
1378 self.gettokstr('(', false)
1379 }
1380 Some(')') => LexTok::Inoutpar,
1381 _ => {
1382 if let Some(d) = d {
1383 self.hungetc(d);
1384 }
1385 self.lexstop = false;
1386 // Per lex.c:822 LX1_INPAR — at word boundary `(`
1387 // tokenizes as INPAR when SHGLOB || incond==1 ||
1388 // incmdpos. Otherwise falls through to gettokstr
1389 // (the `(` becomes start of a STRING — typical
1390 // for unquoted glob args like `ls (^foo)*`).
1391 // For `for x ( ... )` form, incmdpos is restored
1392 // to 1 via the oldpos-save-after-FOR mechanism,
1393 // so the next-token `(` correctly INPAR-izes.
1394 if self.incond == 1 || self.incmdpos || self.incasepat >= 1 {
1395 LexTok::Inpar
1396 } else {
1397 self.gettokstr('(', false)
1398 }
1399 }
1400 }
1401 }
1402
1403 ')' => LexTok::Outpar,
1404
1405 '{' => {
1406 // { is a command group only if followed by whitespace,
1407 // newline, or `}` (the empty-block form `{}`). zsh
1408 // treats `{}` as an empty compound — `foo() {}` is a
1409 // valid no-op function. Without `}` in this list,
1410 // `{}` got consumed as one literal token and ran as a
1411 // command, failing "command not found: {}".
1412 // The empty `{}` is also recognised AFTER a function
1413 // header `name()` even when `incmdpos` got cleared by
1414 // the preceding Outpar — peek for `}` regardless and
1415 // treat as Inbrace so `foo() {}` parses as a no-op
1416 // function body.
1417 let next = self.hgetc();
1418 let next_is_close = matches!(next, Some('}'));
1419 if self.incmdpos {
1420 let is_brace_group = matches!(next, Some(' ' | '\t' | '\n' | '}') | None);
1421 if let Some(ch) = next {
1422 self.hungetc(ch);
1423 }
1424 if is_brace_group {
1425 self.tokstr = Some("{".to_string());
1426 LexTok::Inbrace
1427 } else {
1428 self.gettokstr(c, false)
1429 }
1430 } else if next_is_close {
1431 // `{}` empty block in non-cmd position (function
1432 // body after `()`). Treat as Inbrace; the parser
1433 // will follow with Outbrace.
1434 if let Some(ch) = next {
1435 self.hungetc(ch);
1436 }
1437 self.tokstr = Some("{".to_string());
1438 LexTok::Inbrace
1439 } else {
1440 if let Some(ch) = next {
1441 self.hungetc(ch);
1442 }
1443 self.gettokstr(c, false)
1444 }
1445 }
1446
1447 '}' => {
1448 // } at start of token is always Outbrace (ends command group)
1449 // Inside a word, } would be handled by gettokstr but we never reach here mid-word
1450 self.tokstr = Some("}".to_string());
1451 LexTok::Outbrace
1452 }
1453
1454 '[' => {
1455 // [[ is a conditional expression start
1456 // [ can also be a command (test builtin) or array subscript
1457 // In case patterns (incasepat > 0), [ is part of glob pattern like [yY]
1458 if self.incasepat > 0 {
1459 self.gettokstr(c, false)
1460 } else if self.incmdpos {
1461 let next = self.hgetc();
1462 if next == Some('[') {
1463 // [[ - double bracket conditional
1464 self.tokstr = Some("[[".to_string());
1465 self.incond = 1;
1466 return LexTok::Dinbrack;
1467 }
1468 // Single [ - either test command or start of glob pattern
1469 if let Some(ch) = next {
1470 self.hungetc(ch);
1471 }
1472 self.tokstr = Some("[".to_string());
1473 LexTok::String
1474 } else {
1475 self.gettokstr(c, false)
1476 }
1477 }
1478
1479 ']' => {
1480 // ]] ends a conditional expression started by [[
1481 if self.incond > 0 {
1482 let next = self.hgetc();
1483 if next == Some(']') {
1484 self.tokstr = Some("]]".to_string());
1485 self.incond = 0;
1486 return LexTok::Doutbrack;
1487 }
1488 if let Some(ch) = next {
1489 self.hungetc(ch);
1490 }
1491 }
1492 self.gettokstr(c, false)
1493 }
1494
1495 '<' => {
1496 // In pattern context, < is literal (e.g., <-> in glob)
1497 if self.incondpat || self.incasepat > 0 {
1498 self.gettokstr(c, false)
1499 } else {
1500 self.lex_inang()
1501 }
1502 }
1503
1504 '>' => {
1505 // In pattern context, > is literal
1506 if self.incondpat || self.incasepat > 0 {
1507 self.gettokstr(c, false)
1508 } else {
1509 self.lex_outang()
1510 }
1511 }
1512
1513 _ => self.gettokstr(c, false),
1514 }
1515 }
1516
1517 /// Lex comment
1518 fn lex_comment(&mut self) -> LexTok {
1519 if self.lexflags.comments_keep {
1520 self.lexbuf.clear();
1521 self.add('#');
1522 }
1523
1524 loop {
1525 let c = self.hgetc();
1526 match c {
1527 Some('\n') | None => break,
1528 Some(c) => {
1529 if self.lexflags.comments_keep {
1530 self.add(c);
1531 }
1532 }
1533 }
1534 }
1535
1536 if self.lexflags.comments_keep {
1537 self.tokstr = Some(self.lexbuf.as_str().to_string());
1538 if !self.lexstop {
1539 self.hungetc('\n');
1540 }
1541 return LexTok::String;
1542 }
1543
1544 if self.lexflags.comments_strip && self.lexstop {
1545 return LexTok::Endinput;
1546 }
1547
1548 LexTok::Newlin
1549 }
1550
1551 /// Lex < and variants
1552 fn lex_inang(&mut self) -> LexTok {
1553 let d = self.hgetc();
1554 match d {
1555 Some('(') => {
1556 // Process substitution <(...)
1557 self.hungetc('(');
1558 self.lexstop = false;
1559 self.gettokstr('<', false)
1560 }
1561 Some('>') => LexTok::Inoutang,
1562 Some('<') => {
1563 let e = self.hgetc();
1564 match e {
1565 Some('(') => {
1566 self.hungetc('(');
1567 self.hungetc('<');
1568 LexTok::Inang
1569 }
1570 Some('<') => LexTok::Trinang,
1571 Some('-') => {
1572 self.heredoc_pending = 2; // <<- expects terminator next
1573 LexTok::Dinangdash
1574 }
1575 _ => {
1576 if let Some(e) = e {
1577 self.hungetc(e);
1578 }
1579 self.lexstop = false;
1580 self.heredoc_pending = 1; // << expects terminator next
1581 LexTok::Dinang
1582 }
1583 }
1584 }
1585 Some('&') => LexTok::Inangamp,
1586 _ => {
1587 if let Some(d) = d {
1588 self.hungetc(d);
1589 }
1590 self.lexstop = false;
1591 LexTok::Inang
1592 }
1593 }
1594 }
1595
1596 /// Lex > and variants
1597 fn lex_outang(&mut self) -> LexTok {
1598 let d = self.hgetc();
1599 match d {
1600 Some('(') => {
1601 // Process substitution >(...)
1602 self.hungetc('(');
1603 self.lexstop = false;
1604 self.gettokstr('>', false)
1605 }
1606 Some('&') => {
1607 let e = self.hgetc();
1608 match e {
1609 Some('!') | Some('|') => LexTok::Outangampbang,
1610 _ => {
1611 if let Some(e) = e {
1612 self.hungetc(e);
1613 }
1614 self.lexstop = false;
1615 LexTok::Outangamp
1616 }
1617 }
1618 }
1619 Some('!') | Some('|') => LexTok::Outangbang,
1620 Some('>') => {
1621 let e = self.hgetc();
1622 match e {
1623 Some('&') => {
1624 let f = self.hgetc();
1625 match f {
1626 Some('!') | Some('|') => LexTok::Doutangampbang,
1627 _ => {
1628 if let Some(f) = f {
1629 self.hungetc(f);
1630 }
1631 self.lexstop = false;
1632 LexTok::Doutangamp
1633 }
1634 }
1635 }
1636 Some('!') | Some('|') => LexTok::Doutangbang,
1637 Some('(') => {
1638 self.hungetc('(');
1639 self.hungetc('>');
1640 LexTok::Outang
1641 }
1642 _ => {
1643 if let Some(e) = e {
1644 self.hungetc(e);
1645 }
1646 self.lexstop = false;
1647 LexTok::Doutang
1648 }
1649 }
1650 }
1651 _ => {
1652 if let Some(d) = d {
1653 self.hungetc(d);
1654 }
1655 self.lexstop = false;
1656 LexTok::Outang
1657 }
1658 }
1659 }
1660
1661 /// Get rest of token string
1662 fn gettokstr(&mut self, c: char, sub: bool) -> LexTok {
1663 let mut bct = 0; // brace count
1664 let mut pct = 0; // parenthesis count
1665 let mut brct = 0; // bracket count
1666 let mut in_brace_param = 0;
1667 let mut peek = LexTok::String;
1668 let mut intpos = 1;
1669 let mut unmatched = '\0';
1670 let mut c = c;
1671 const MAX_ITERATIONS: usize = 100_000;
1672 let mut iterations = 0;
1673
1674 if !sub {
1675 self.lexbuf.clear();
1676 }
1677
1678 loop {
1679 iterations += 1;
1680 if iterations > MAX_ITERATIONS {
1681 self.error = Some("gettokstr exceeded maximum iterations".to_string());
1682 return LexTok::Lexerr;
1683 }
1684
1685 let inbl = Self::is_inblank(c);
1686
1687 if inbl && in_brace_param == 0 && pct == 0 {
1688 // Whitespace outside brace param ends token
1689 break;
1690 }
1691
1692 match c {
1693 // Whitespace is handled above for most cases
1694 ')' => {
1695 if in_brace_param > 0 || sub {
1696 self.add(char_tokens::OUTPAR);
1697 } else if pct > 0 {
1698 pct -= 1;
1699 self.add(char_tokens::OUTPAR);
1700 } else {
1701 break;
1702 }
1703 }
1704
1705 '|' => {
1706 if pct == 0 && in_brace_param == 0 {
1707 if sub {
1708 self.add(c);
1709 } else {
1710 break;
1711 }
1712 } else {
1713 self.add(char_tokens::BAR);
1714 }
1715 }
1716
1717 '$' => {
1718 let e = self.hgetc();
1719 match e {
1720 Some('\\') => {
1721 let f = self.hgetc();
1722 if f != Some('\n') {
1723 if let Some(f) = f {
1724 self.hungetc(f);
1725 }
1726 self.hungetc('\\');
1727 self.add(char_tokens::STRING);
1728 } else {
1729 // Line continuation after $
1730 continue;
1731 }
1732 }
1733 Some('[') => {
1734 // $[...] arithmetic
1735 self.add(char_tokens::STRING);
1736 self.add(char_tokens::INBRACK);
1737 if self.dquote_parse(']', sub).is_err() {
1738 peek = LexTok::Lexerr;
1739 break;
1740 }
1741 self.add(char_tokens::OUTBRACK);
1742 }
1743 Some('(') => {
1744 // $(...) or $((...))
1745 self.add(char_tokens::STRING);
1746 match self.cmd_or_math_sub() {
1747 CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
1748 CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
1749 CmdOrMath::Err => {
1750 peek = LexTok::Lexerr;
1751 break;
1752 }
1753 }
1754 }
1755 Some('{') => {
1756 self.add(c);
1757 self.add(char_tokens::INBRACE);
1758 bct += 1;
1759 if in_brace_param == 0 {
1760 in_brace_param = bct;
1761 }
1762 }
1763 Some('\'') => {
1764 // $'...' ANSI-C escape syntax.
1765 // Port of Src/lex.c:1284-1314 (LX2_QUOTE
1766 // branch when prev char was `String`):
1767 // only `\\` and `\'` emit a `Bnull`
1768 // marker (so getkeystring later
1769 // recognizes them as user-literal); any
1770 // other `\X` emits a literal `\` + the
1771 // following char so getkeystring's
1772 // standard `\n`/`\x`/`\u`/... decoding
1773 // can fire.
1774 self.add(char_tokens::QSTRING);
1775 self.add(char_tokens::SNULL);
1776 loop {
1777 let ch = self.hgetc();
1778 match ch {
1779 Some('\'') => break,
1780 Some('\\') => {
1781 let next = self.hgetc();
1782 match next {
1783 Some(n) => {
1784 if n == '\\' || n == '\'' {
1785 self.add(char_tokens::BNULL);
1786 } else {
1787 self.add('\\');
1788 }
1789 self.add(n);
1790 }
1791 None => {
1792 self.lexstop = true;
1793 unmatched = '\'';
1794 peek = LexTok::Lexerr;
1795 break;
1796 }
1797 }
1798 }
1799 Some(ch) => self.add(ch),
1800 None => {
1801 self.lexstop = true;
1802 unmatched = '\'';
1803 peek = LexTok::Lexerr;
1804 break;
1805 }
1806 }
1807 }
1808 if unmatched != '\0' {
1809 break;
1810 }
1811 self.add(char_tokens::SNULL);
1812 }
1813 Some('"') => {
1814 // $"..." localized string. Same shape as a
1815 // plain "..." but flagged via QSTRING+DNULL
1816 // so post-lex translation can substitute.
1817 self.add(char_tokens::QSTRING);
1818 self.add(char_tokens::DNULL);
1819 if self.dquote_parse('"', sub).is_err() {
1820 peek = LexTok::Lexerr;
1821 break;
1822 }
1823 self.add(char_tokens::DNULL);
1824 }
1825 _ => {
1826 if let Some(e) = e {
1827 self.hungetc(e);
1828 }
1829 self.lexstop = false;
1830 self.add(char_tokens::STRING);
1831 }
1832 }
1833 }
1834
1835 '[' => {
1836 if in_brace_param == 0 {
1837 brct += 1;
1838 }
1839 self.add(char_tokens::INBRACK);
1840 }
1841
1842 ']' => {
1843 if in_brace_param == 0 && brct > 0 {
1844 brct -= 1;
1845 }
1846 self.add(char_tokens::OUTBRACK);
1847 }
1848
1849 '(' => {
1850 // lex.c:1078-1135 LX2_INPAR — when `(` appears inside
1851 // a STRING and is immediately followed by `)`, the
1852 // string terminates at the `(`. The `()` is then
1853 // re-lexed as a separate INOUTPAR token. This handles
1854 // function definitions: `name()` lexes as STRING `name`
1855 // + INOUTPAR `()`, not STRING `name()`.
1856 //
1857 // Also (lex.c:1109-1112): under SHGLOB, a `(` followed
1858 // by whitespace at the start of a command-position word
1859 // (no nested brackets/braces) is a ksh function
1860 // definition signal — same break-out behavior.
1861 if in_brace_param == 0 && !sub {
1862 let e = self.hgetc();
1863 if let Some(ch) = e {
1864 self.hungetc(ch);
1865 }
1866 self.lexstop = false;
1867 if e == Some(')') {
1868 // `name()` — terminate STRING at `(` so the
1869 // following `()` re-lexes as INOUTPAR. The
1870 // loop's exit guard at line 2067 will
1871 // `hungetc(c)` to push the `(` back; we only
1872 // need to ensure `)` is also there. The
1873 // hungetc(ch) above already pushed `)`, so
1874 // breaking here yields unget_buf = [`(`, `)`]
1875 // after the guard, which the outer dispatch
1876 // reads as Inoutpar.
1877 break;
1878 }
1879 }
1880 if in_brace_param == 0 {
1881 pct += 1;
1882 }
1883 self.add(char_tokens::INPAR);
1884 }
1885
1886 '{' => {
1887 // Track braces for both ${...} param expansion and {...} brace expansion
1888 bct += 1;
1889 self.add(c);
1890 }
1891
1892 '}' => {
1893 if in_brace_param > 0 {
1894 if bct == in_brace_param {
1895 in_brace_param = 0;
1896 }
1897 bct -= 1;
1898 self.add(char_tokens::OUTBRACE);
1899 } else if bct > 0 {
1900 // Closing a brace expansion like {a,b}
1901 bct -= 1;
1902 self.add(c);
1903 } else {
1904 break;
1905 }
1906 }
1907
1908 '>' => {
1909 // In pattern context (incondpat), > is literal
1910 if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1911 self.add(c);
1912 } else {
1913 let e = self.hgetc();
1914 if e != Some('(') {
1915 if let Some(e) = e {
1916 self.hungetc(e);
1917 }
1918 self.lexstop = false;
1919 break;
1920 }
1921 // >(...)
1922 self.add(char_tokens::OUTANGPROC);
1923 if self.skip_command_sub().is_err() {
1924 peek = LexTok::Lexerr;
1925 break;
1926 }
1927 self.add(char_tokens::OUTPAR);
1928 }
1929 }
1930
1931 '<' => {
1932 // In pattern context (incondpat), < is literal
1933 if in_brace_param > 0 || sub || self.incondpat || self.incasepat > 0 {
1934 self.add(c);
1935 } else if let Some(range_chars) = self.try_numeric_range_glob() {
1936 // zsh numeric range glob `<N-M>`, `<->`, `<N->`,
1937 // `<-M>`. When `<` mid-word matches that exact
1938 // shape, swallow it into the word instead of
1939 // breaking out for redirection.
1940 self.add(c);
1941 for ch in range_chars.chars() {
1942 self.add(ch);
1943 }
1944 } else {
1945 let e = self.hgetc();
1946 if e != Some('(') {
1947 if let Some(e) = e {
1948 self.hungetc(e);
1949 }
1950 self.lexstop = false;
1951 break;
1952 }
1953 // <(...)
1954 self.add(char_tokens::INANG);
1955 if self.skip_command_sub().is_err() {
1956 peek = LexTok::Lexerr;
1957 break;
1958 }
1959 self.add(char_tokens::OUTPAR);
1960 }
1961 }
1962
1963 '=' => {
1964 if !sub {
1965 if intpos > 0 {
1966 // At start of token, check for =(...) process substitution
1967 let e = self.hgetc();
1968 if e == Some('(') {
1969 self.add(char_tokens::EQUALS);
1970 if self.skip_command_sub().is_err() {
1971 peek = LexTok::Lexerr;
1972 break;
1973 }
1974 self.add(char_tokens::OUTPAR);
1975 } else {
1976 if let Some(e) = e {
1977 self.hungetc(e);
1978 }
1979 self.lexstop = false;
1980 self.add(char_tokens::EQUALS);
1981 }
1982 } else if peek != LexTok::Envstring
1983 && (self.incmdpos || self.intypeset)
1984 && bct == 0
1985 && brct == 0
1986 && self.incasepat == 0
1987 {
1988 // Check for VAR=value assignment (but not in case pattern context)
1989 let tok_so_far = self.lexbuf.as_str().to_string();
1990 if self.is_valid_assignment_target(&tok_so_far) {
1991 let next = self.hgetc();
1992 if next == Some('(') {
1993 // VAR=(...) array assignment. Per zsh
1994 // (lex.c emits ENVARRAY with tokstr =
1995 // just the variable name, NOT
1996 // including the `=`). The `=` and
1997 // `(` are consumed by the lexer; the
1998 // parser knows ENVARRAY means assign-
1999 // array and reads the body that
2000 // follows.
2001 self.tokstr = Some(self.lexbuf.as_str().to_string());
2002 return LexTok::Envarray;
2003 }
2004 if let Some(next) = next {
2005 self.hungetc(next);
2006 }
2007 self.lexstop = false;
2008 peek = LexTok::Envstring;
2009 intpos = 2;
2010 self.add(char_tokens::EQUALS);
2011 } else {
2012 self.add(char_tokens::EQUALS);
2013 }
2014 } else {
2015 self.add(char_tokens::EQUALS);
2016 }
2017 } else {
2018 self.add(char_tokens::EQUALS);
2019 }
2020 }
2021
2022 '\\' => {
2023 let next = self.hgetc();
2024 if next == Some('\n') {
2025 // Line continuation
2026 let next = self.hgetc();
2027 if let Some(next) = next {
2028 c = next;
2029 continue;
2030 }
2031 break;
2032 } else {
2033 self.add(char_tokens::BNULL);
2034 if let Some(next) = next {
2035 self.add(next);
2036 }
2037 }
2038 }
2039
2040 '\'' => {
2041 // Single quoted string - everything literal until '
2042 self.add(char_tokens::SNULL);
2043 loop {
2044 let ch = self.hgetc();
2045 match ch {
2046 Some('\'') => break,
2047 Some(ch) => self.add(ch),
2048 None => {
2049 self.lexstop = true;
2050 unmatched = '\'';
2051 peek = LexTok::Lexerr;
2052 break;
2053 }
2054 }
2055 }
2056 if unmatched != '\0' {
2057 break;
2058 }
2059 self.add(char_tokens::SNULL);
2060 }
2061
2062 '"' => {
2063 // Double quoted string
2064 self.add(char_tokens::DNULL);
2065 if self.dquote_parse('"', sub).is_err() {
2066 unmatched = '"';
2067 if !self.lexflags.active {
2068 peek = LexTok::Lexerr;
2069 }
2070 break;
2071 }
2072 self.add(char_tokens::DNULL);
2073 }
2074
2075 '`' => {
2076 // Backtick command substitution
2077 self.add(char_tokens::TICK);
2078 loop {
2079 let ch = self.hgetc();
2080 match ch {
2081 Some('`') => break,
2082 Some('\\') => {
2083 let next = self.hgetc();
2084 match next {
2085 Some('\n') => continue, // Line continuation
2086 Some(c) if c == '`' || c == '\\' || c == '$' => {
2087 self.add(char_tokens::BNULL);
2088 self.add(c);
2089 }
2090 Some(c) => {
2091 self.add('\\');
2092 self.add(c);
2093 }
2094 None => break,
2095 }
2096 }
2097 Some(ch) => self.add(ch),
2098 None => {
2099 self.lexstop = true;
2100 unmatched = '`';
2101 peek = LexTok::Lexerr;
2102 break;
2103 }
2104 }
2105 }
2106 if unmatched != '\0' {
2107 break;
2108 }
2109 self.add(char_tokens::TICK);
2110 }
2111
2112 '~' => {
2113 self.add(char_tokens::TILDE);
2114 }
2115
2116 '#' => {
2117 self.add(char_tokens::POUND);
2118 }
2119
2120 '^' => {
2121 self.add(char_tokens::HAT);
2122 }
2123
2124 '*' => {
2125 self.add(char_tokens::STAR);
2126 }
2127
2128 '?' => {
2129 self.add(char_tokens::QUEST);
2130 }
2131
2132 ',' if bct > in_brace_param => {
2133 self.add(char_tokens::COMMA);
2134 }
2135
2136 '-' => {
2137 self.add(char_tokens::DASH);
2138 }
2139
2140 '!' if brct > 0 => {
2141 self.add(char_tokens::BANG);
2142 }
2143
2144 // Terminators — but only when we're at the top level of
2145 // the current word. Inside a brace parameter expansion
2146 // `${...}`, parenthesized flag block `(@s.;.)`, or
2147 // bracketed subscript `[...]`, `;` is just a delimiter
2148 // character (e.g. the field separator in `(@s.;.)`),
2149 // not a statement terminator. Real zsh handles this
2150 // via gettokstr's incmdpos / bct / pct accounting; we
2151 // gate on the same counters.
2152 '\n' | ';' | '&' if in_brace_param == 0 && pct == 0 && brct == 0 => {
2153 break;
2154 }
2155 '\n' | ';' | '&' => {
2156 self.add(c);
2157 }
2158
2159 _ => {
2160 self.add(c);
2161 }
2162 }
2163
2164 c = match self.hgetc() {
2165 Some(c) => c,
2166 None => {
2167 self.lexstop = true;
2168 break;
2169 }
2170 };
2171
2172 if intpos > 0 {
2173 intpos -= 1;
2174 }
2175 }
2176
2177 // Put back the character that ended the token
2178 if !self.lexstop {
2179 self.hungetc(c);
2180 }
2181
2182 if unmatched != '\0' && !self.lexflags.active {
2183 self.error = Some(format!("unmatched {}", unmatched));
2184 }
2185
2186 if in_brace_param > 0 {
2187 self.error = Some("closing brace expected".to_string());
2188 }
2189
2190 self.tokstr = Some(self.lexbuf.as_str().to_string());
2191 peek
2192 }
2193
2194 /// Check if a string is a valid assignment target (identifier or array ref).
2195 ///
2196 /// zsh accepts identifier (`[A-Za-z_][A-Za-z0-9_]*`) optionally followed by
2197 /// a `[...]` subscript. Bare digits are NOT a valid lvalue (rejected at
2198 /// `if c.is_ascii_digit()` below — array index expressions like `arr[2]`
2199 /// are caught by the subscript handler, not here). And the first char
2200 /// must NOT be a zsh internal token byte — `$=foo` (where `$` becomes
2201 /// the STRING token 0x85) is parameter substitution with the `=` flag,
2202 /// NOT an envstring assignment.
2203 fn is_valid_assignment_target(&self, s: &str) -> bool {
2204 let mut chars = s.chars().peekable();
2205
2206 // Reject leading token byte — `$VAR=` is parameter substitution,
2207 // not assignment. Same for `*=`, `?=`, etc.
2208 if let Some(&c) = chars.peek() {
2209 if char_tokens::is_token(c) {
2210 return false;
2211 }
2212 }
2213
2214 // Check for leading digit (invalid)
2215 if let Some(&c) = chars.peek() {
2216 if c.is_ascii_digit() {
2217 // Could be array index, check rest
2218 while let Some(&c) = chars.peek() {
2219 if !c.is_ascii_digit() {
2220 break;
2221 }
2222 chars.next();
2223 }
2224 return chars.peek().is_none();
2225 }
2226 }
2227
2228 // Check identifier
2229 let mut has_ident = false;
2230 while let Some(&c) = chars.peek() {
2231 if c == char_tokens::INBRACK || c == '[' {
2232 break;
2233 }
2234 if c == '+' {
2235 // foo+=value
2236 chars.next();
2237 return chars.peek().is_none() || chars.peek() == Some(&'=');
2238 }
2239 if !Self::is_ident(c) && c != char_tokens::STRING && !char_tokens::is_token(c) {
2240 return false;
2241 }
2242 has_ident = true;
2243 chars.next();
2244 }
2245
2246 has_ident
2247 }
2248
2249 /// Parse the body of a double-quoted string (or any context that
2250 /// uses double-quote tokenization — `(( ))`, `${...}`, `$( ( ) )`).
2251 /// Direct port of zsh/Src/lex.c:1486-1693 `dquote_parse`. Reads
2252 /// chars until `endchar` is seen at depth 0, handling escapes,
2253 /// `${...}` parameter substitutions, `$(...)` and backtick command
2254 /// substitutions, `$((...))` arithmetic, and inner double-quoted
2255 /// strings. The `sub` flag toggles substitution-context tokens
2256 /// (lex.c:1487 `int sub` argument).
2257 ///
2258 /// zshrs port note: the recursion guard at the top is a Rust
2259 /// safety net; the C source relies on the runtime stack. Inner
2260 /// logic delegates to `dquote_parse_inner` which holds the actual
2261 /// per-char state machine matching lex.c:1495-1692.
2262 fn dquote_parse(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
2263 self.recursion_depth += 1;
2264 if self.check_recursion() {
2265 self.recursion_depth -= 1;
2266 return Err(());
2267 }
2268
2269 let result = self.dquote_parse_inner(endchar, sub);
2270 self.recursion_depth -= 1;
2271 result
2272 }
2273
    /// Per-character state machine behind `dquote_parse`.
    ///
    /// Reads characters with `hgetc()` and appends them (or their token
    /// equivalents) to the token buffer via `add()` until `endchar` is
    /// read while not inside backticks and with no `${` still open. The
    /// terminating `endchar` is consumed but NOT added to the buffer.
    ///
    /// "Math mode" is active when `endchar` is `)` or `]`, or when
    /// `self.infor > 0`. In math mode parentheses and brackets are
    /// counted (outside `${...}`), an `endchar` seen while some are
    /// still open is treated as an ordinary closer, and an unbalanced
    /// `)` or `]` is an error.
    ///
    /// * `endchar` — terminator for this region.
    /// * `sub` — substitution context flag; it changes the treatment of
    ///   backslash-newline and of `\"` when `endchar` is `]`, and is
    ///   passed on to nested `dquote_parse` calls.
    ///
    /// Returns `Ok(())` once the terminator is found. Returns `Err(())`
    /// on end of input (after setting `lexstop`), when the iteration cap
    /// is exceeded (after setting `self.error`), on an unbalanced `)` /
    /// `]` in math mode, or when a nested substitution fails.
    fn dquote_parse_inner(&mut self, endchar: char, sub: bool) -> Result<(), ()> {
        let mut pct = 0; // parenthesis count
        let mut brct = 0; // bracket count
        let mut bct = 0; // brace count (for ${...})
        let mut intick = false; // inside backtick
        let is_math = endchar == ')' || endchar == ']' || self.infor > 0;
        // Hard cap on loop iterations so a malformed input cannot spin forever.
        const MAX_ITERATIONS: usize = 100_000;
        let mut iterations = 0;

        loop {
            iterations += 1;
            if iterations > MAX_ITERATIONS {
                self.error = Some("dquote_parse exceeded maximum iterations".to_string());
                return Err(());
            }
            let c = self.hgetc();
            let c = match c {
                // Candidate terminator: only honoured outside backticks
                // and outside any open `${...}`.
                Some(c) if c == endchar && !intick && bct == 0 => {
                    // In math mode an `endchar` that still has an open
                    // `(` / `[` pending closes that group instead of
                    // ending the region.
                    if is_math && (pct > 0 || brct > 0) {
                        self.add(c);
                        if c == ')' {
                            pct -= 1;
                        } else if c == ']' {
                            brct -= 1;
                        }
                        continue;
                    }
                    return Ok(());
                }
                Some(c) => c,
                None => {
                    // End of input before the terminator.
                    self.lexstop = true;
                    return Err(());
                }
            };

            match c {
                '\\' => {
                    let next = self.hgetc();
                    match next {
                        Some('\n') if !sub => continue, // Line continuation
                        // Characters a backslash actually quotes here:
                        // emitted as `Bnull` followed by the character.
                        Some(c)
                            if c == '$'
                                || c == '\\'
                                || (c == '}' && !intick && bct > 0)
                                || c == endchar
                                || c == '`'
                                || (endchar == ']'
                                    && (c == '['
                                        || c == ']'
                                        || c == '('
                                        || c == ')'
                                        || c == '{'
                                        || c == '}'
                                        || (c == '"' && sub))) =>
                        {
                            self.add(char_tokens::BNULL);
                            self.add(c);
                        }
                        // Anything else: keep the backslash literally and
                        // let the following character be re-read normally.
                        Some(c) => {
                            self.add('\\');
                            self.hungetc(c);
                            continue;
                        }
                        // Backslash was the last character of the input.
                        None => {
                            self.add('\\');
                        }
                    }
                }

                '$' => {
                    // Inside backticks `$` is copied untouched.
                    if intick {
                        self.add(c);
                        continue;
                    }
                    let next = self.hgetc();
                    match next {
                        // `$(` — command or arithmetic substitution; the
                        // closing marker depends on which one it was.
                        Some('(') => {
                            self.add(char_tokens::QSTRING);
                            match self.cmd_or_math_sub() {
                                CmdOrMath::Cmd => self.add(char_tokens::OUTPAR),
                                CmdOrMath::Math => self.add(char_tokens::OUTPARMATH),
                                CmdOrMath::Err => return Err(()),
                            }
                        }
                        // `$[` — bracketed region parsed recursively up to `]`.
                        Some('[') => {
                            self.add(char_tokens::STRING);
                            self.add(char_tokens::INBRACK);
                            self.dquote_parse(']', sub)?;
                            self.add(char_tokens::OUTBRACK);
                        }
                        // `${` — opens a brace region tracked by `bct`.
                        Some('{') => {
                            self.add(char_tokens::QSTRING);
                            self.add(char_tokens::INBRACE);
                            bct += 1;
                        }
                        // `$$` — both characters are consumed as a unit.
                        Some('$') => {
                            self.add(char_tokens::QSTRING);
                            self.add('$');
                        }
                        // Plain `$`: put the lookahead back (if any),
                        // clear the stop flag and emit only the marker.
                        _ => {
                            if let Some(next) = next {
                                self.hungetc(next);
                            }
                            self.lexstop = false;
                            self.add(char_tokens::QSTRING);
                        }
                    }
                }

                '}' => {
                    // Only a `}` matching an open `${` becomes `Outbrace`.
                    if intick || bct == 0 {
                        self.add(c);
                    } else {
                        self.add(char_tokens::OUTBRACE);
                        bct -= 1;
                    }
                }

                '`' => {
                    // Backticks toggle the in-backtick state.
                    self.add(char_tokens::QTICK);
                    intick = !intick;
                }

                '(' => {
                    // Counted unless we are in math mode inside `${...}`.
                    if !is_math || bct == 0 {
                        pct += 1;
                    }
                    self.add(c);
                }

                ')' => {
                    if !is_math || bct == 0 {
                        // Unbalanced `)` is an error in math mode only.
                        if pct == 0 && is_math {
                            return Err(());
                        }
                        pct -= 1;
                    }
                    self.add(c);
                }

                '[' => {
                    if !is_math || bct == 0 {
                        brct += 1;
                    }
                    self.add(c);
                }

                ']' => {
                    if !is_math || bct == 0 {
                        // Unbalanced `]` is an error in math mode only.
                        if brct == 0 && is_math {
                            return Err(());
                        }
                        brct -= 1;
                    }
                    self.add(c);
                }

                '"' => {
                    if intick || (endchar != '"' && bct == 0) {
                        // Literal quote: inside backticks, or in a region
                        // not delimited by `"` with no `${` open.
                        self.add(c);
                    } else if bct > 0 {
                        // Nested double-quoted string inside `${...}`.
                        self.add(char_tokens::DNULL);
                        self.dquote_parse('"', sub)?;
                        self.add(char_tokens::DNULL);
                    } else {
                        return Err(());
                    }
                }

                _ => {
                    self.add(c);
                }
            }
        }
    }
2450
2451 /// Determine if (( is arithmetic or command
2452 /// Decide whether `( ... )` after a `$` is a math expression
2453 /// `$((...))` or a command substitution `$(...)`. Direct port of
2454 /// zsh/Src/lex.c:495-532 `cmd_or_math`. Tries dquote_parse first;
2455 /// if it succeeds AND the next char is `)` (closing the second
2456 /// paren of `(( ))`), it's math. Otherwise rewinds and treats as
2457 /// a command substitution.
2458 fn cmd_or_math(&mut self) -> CmdOrMath {
2459 let oldlen = self.lexbuf.len();
2460
2461 // Per lex.c:498-518 — `cmd_or_math` calls `dquote_parse(')')`
2462 // which fills lexbuf with ONLY the inner expression, then checks
2463 // for the closing `)`. The surrounding `((` / `))` are NOT added
2464 // to lexbuf. zshrs previously added INPAR + '(' before dquote and
2465 // ')' after, polluting DINPAR's tokstr with the literal parens.
2466 // Removed to match C exactly.
2467 if self.dquote_parse(')', false).is_err() {
2468 // Back up and try as command
2469 while self.lexbuf.len() > oldlen {
2470 if let Some(c) = self.lexbuf.pop() {
2471 self.hungetc(c);
2472 }
2473 }
2474 self.hungetc('(');
2475 self.lexstop = false;
2476 return if self.skip_command_sub().is_err() {
2477 CmdOrMath::Err
2478 } else {
2479 CmdOrMath::Cmd
2480 };
2481 }
2482
2483 // Check for closing ) — matches C lex.c:511-512: success-with-`)`
2484 // means `((..))` was math. Don't add `)` to lexbuf.
2485 let c = self.hgetc();
2486 if c == Some(')') {
2487 return CmdOrMath::Math;
2488 }
2489
2490 // Not math, back up
2491 if let Some(c) = c {
2492 self.hungetc(c);
2493 }
2494 self.lexstop = false;
2495
2496 // Back up token
2497 while self.lexbuf.len() > oldlen {
2498 if let Some(c) = self.lexbuf.pop() {
2499 self.hungetc(c);
2500 }
2501 }
2502 self.hungetc('(');
2503
2504 if self.skip_command_sub().is_err() {
2505 CmdOrMath::Err
2506 } else {
2507 CmdOrMath::Cmd
2508 }
2509 }
2510
2511 /// Parse `$(...)` or `$((...))` after the `$` has been consumed.
2512 /// Direct port of zsh/Src/lex.c:540-573 `cmd_or_math_sub`. Reads
2513 /// the next char to discriminate: a leading `(` plus successful
2514 /// math parse via `cmd_or_math` → arithmetic substitution (with
2515 /// the open-paren retroactively rewritten to Inparmath); else
2516 /// command substitution via skip_command_sub.
2517 fn cmd_or_math_sub(&mut self) -> CmdOrMath {
2518 const MAX_CONTINUATIONS: usize = 10_000;
2519 let mut continuations = 0;
2520
2521 loop {
2522 continuations += 1;
2523 if continuations > MAX_CONTINUATIONS {
2524 self.error = Some("cmd_or_math_sub: too many line continuations".to_string());
2525 return CmdOrMath::Err;
2526 }
2527
2528 let c = self.hgetc();
2529 if c == Some('\\') {
2530 let c2 = self.hgetc();
2531 if c2 != Some('\n') {
2532 if let Some(c2) = c2 {
2533 self.hungetc(c2);
2534 }
2535 self.hungetc('\\');
2536 self.lexstop = false;
2537 return if self.skip_command_sub().is_err() {
2538 CmdOrMath::Err
2539 } else {
2540 CmdOrMath::Cmd
2541 };
2542 }
2543 // Line continuation, try again (loop instead of recursion)
2544 continue;
2545 }
2546
2547 // Not a line continuation, process normally
2548 if c == Some('(') {
2549 // Might be $((...))
2550 let lexpos = self.lexbuf.len();
2551 self.add(char_tokens::INPAR);
2552 self.add('(');
2553
2554 if self.dquote_parse(')', false).is_ok() {
2555 let c2 = self.hgetc();
2556 if c2 == Some(')') {
2557 self.add(')');
2558 return CmdOrMath::Math;
2559 }
2560 if let Some(c2) = c2 {
2561 self.hungetc(c2);
2562 }
2563 }
2564
2565 // Not math, restore and parse as command
2566 while self.lexbuf.len() > lexpos {
2567 if let Some(ch) = self.lexbuf.pop() {
2568 self.hungetc(ch);
2569 }
2570 }
2571 self.hungetc('(');
2572 self.lexstop = false;
2573 } else {
2574 if let Some(c) = c {
2575 self.hungetc(c);
2576 }
2577 self.lexstop = false;
2578 }
2579
2580 return if self.skip_command_sub().is_err() {
2581 CmdOrMath::Err
2582 } else {
2583 CmdOrMath::Cmd
2584 };
2585 }
2586 }
2587
2588 /// Skip over `(...)` for command-style substitutions: `$(...)`,
2589 /// `<(...)`, `>(...)`. Direct port of zsh/Src/lex.c:2080-end
2590 /// `skipcomm`. Per the C source comment: "we'll parse the input
2591 /// until we find an unmatched closing parenthesis. However, we'll
2592 /// throw away the result of the parsing and just keep the string
2593 /// we've built up on the way."
2594 ///
2595 /// zshrs port note: the C source uses zcontext_save/restore +
2596 /// strinbeg/inpush to set up an isolated lex context for the
2597 /// throw-away parse. zshrs's standalone walker tracks paren
2598 /// depth directly without re-entering the parser. Same
2599 /// invariant: stops at the matching `)`.
2600 fn skip_command_sub(&mut self) -> Result<(), ()> {
2601 let mut pct = 1;
2602 let mut start = true;
2603 const MAX_ITERATIONS: usize = 100_000;
2604 let mut iterations = 0;
2605
2606 self.add(char_tokens::INPAR);
2607
2608 loop {
2609 iterations += 1;
2610 if iterations > MAX_ITERATIONS {
2611 self.error = Some("skip_command_sub exceeded maximum iterations".to_string());
2612 return Err(());
2613 }
2614
2615 let c = self.hgetc();
2616 let c = match c {
2617 Some(c) => c,
2618 None => {
2619 self.lexstop = true;
2620 return Err(());
2621 }
2622 };
2623
2624 let iswhite = Self::is_inblank(c);
2625
2626 match c {
2627 '(' => {
2628 pct += 1;
2629 self.add(c);
2630 }
2631 ')' => {
2632 pct -= 1;
2633 if pct == 0 {
2634 return Ok(());
2635 }
2636 self.add(c);
2637 }
2638 '\\' => {
2639 self.add(c);
2640 if let Some(c) = self.hgetc() {
2641 self.add(c);
2642 }
2643 }
2644 '\'' => {
2645 self.add(c);
2646 loop {
2647 let ch = self.hgetc();
2648 match ch {
2649 Some('\'') => {
2650 self.add('\'');
2651 break;
2652 }
2653 Some(ch) => self.add(ch),
2654 None => {
2655 self.lexstop = true;
2656 return Err(());
2657 }
2658 }
2659 }
2660 }
2661 '"' => {
2662 self.add(c);
2663 loop {
2664 let ch = self.hgetc();
2665 match ch {
2666 Some('"') => {
2667 self.add('"');
2668 break;
2669 }
2670 Some('\\') => {
2671 self.add('\\');
2672 if let Some(ch) = self.hgetc() {
2673 self.add(ch);
2674 }
2675 }
2676 Some(ch) => self.add(ch),
2677 None => {
2678 self.lexstop = true;
2679 return Err(());
2680 }
2681 }
2682 }
2683 }
2684 '`' => {
2685 self.add(c);
2686 loop {
2687 let ch = self.hgetc();
2688 match ch {
2689 Some('`') => {
2690 self.add('`');
2691 break;
2692 }
2693 Some('\\') => {
2694 self.add('\\');
2695 if let Some(ch) = self.hgetc() {
2696 self.add(ch);
2697 }
2698 }
2699 Some(ch) => self.add(ch),
2700 None => {
2701 self.lexstop = true;
2702 return Err(());
2703 }
2704 }
2705 }
2706 }
2707 '#' if start => {
2708 self.add(c);
2709 // Skip comment to end of line
2710 loop {
2711 let ch = self.hgetc();
2712 match ch {
2713 Some('\n') => {
2714 self.add('\n');
2715 break;
2716 }
2717 Some(ch) => self.add(ch),
2718 None => break,
2719 }
2720 }
2721 }
2722 _ => {
2723 self.add(c);
2724 }
2725 }
2726
2727 start = iswhite;
2728 }
2729 }
2730
2731 /// Lex next token AND update per-context flags. Direct port of
2732 /// zsh/Src/lex.c:316-369 `ctxtlex`. The post-token state machine
2733 /// at lex.c:322-358 sets `incmdpos` based on the token shape:
2734 /// list separators / pipes / control keywords reset to cmd-pos;
2735 /// word-shaped tokens leave cmd-pos. Redirections (lex.c:361-368)
2736 /// stash prior incmdpos and force the redir target to non-cmd-pos.
2737 pub fn ctxtlex(&mut self) {
2738 // lex.c:319 — static `oldpos` cache for redir-target restore
2739 // is captured per-call here as `oldpos` below (zshrs's parser
2740 // re-enters ctxtlex per token, no need for static persistence).
2741
2742 // lex.c:321 — `zshlex();` to advance to the next token.
2743 self.zshlex();
2744
2745 // lex.c:322-358 — post-token incmdpos switch.
2746 match self.tok {
2747 // lex.c:323-343 — separators / openers / conjunctions /
2748 // control keywords — back into cmd-pos so the next token
2749 // can be a fresh command.
2750 LexTok::Seper
2751 | LexTok::Newlin
2752 | LexTok::Semi
2753 | LexTok::Dsemi
2754 | LexTok::Semiamp
2755 | LexTok::Semibar
2756 | LexTok::Amper
2757 | LexTok::Amperbang
2758 | LexTok::Inpar
2759 | LexTok::Inbrace
2760 | LexTok::Dbar
2761 | LexTok::Damper
2762 | LexTok::Bar
2763 | LexTok::Baramp
2764 | LexTok::Inoutpar
2765 | LexTok::Doloop
2766 | LexTok::Then
2767 | LexTok::Elif
2768 | LexTok::Else
2769 | LexTok::Doutbrack => {
2770 self.incmdpos = true;
2771 }
2772 // lex.c:345-353 — word/value-shaped tokens leave cmd-pos
2773 // so subsequent tokens are arguments, not a fresh command.
2774 LexTok::String
2775 | LexTok::Typeset
2776 | LexTok::Envarray
2777 | LexTok::Outpar
2778 | LexTok::Case
2779 | LexTok::Dinbrack => {
2780 self.incmdpos = false;
2781 }
2782 _ => {}
2783 }
2784
2785 // lex.c:359-360 — `infor` decay. FOR sets infor=2 so the next
2786 // DINPAR can detect c-style for. After any non-DINPAR, decay
2787 // to 0 (or back to 2 if we just saw FOR again).
2788 if self.tok != LexTok::Dinpar {
2789 self.infor = if self.tok == LexTok::For { 2 } else { 0 };
2790 }
2791
2792 // lex.c:361-368 — redir-target context dance. After consuming
2793 // a redir operator, the following token (the file path) sees
2794 // incmdpos=0 even when its inherent shape would put it back
2795 // in cmd-pos. After the redir target, restore from oldpos
2796 // (struct field — must persist across zshlex calls).
2797 if self.tok.is_redirop()
2798 || self.tok == LexTok::For
2799 || self.tok == LexTok::Foreach
2800 || self.tok == LexTok::Select
2801 {
2802 self.inredir = true;
2803 self.oldpos = self.incmdpos;
2804 self.incmdpos = false;
2805 } else if self.inredir {
2806 self.incmdpos = self.oldpos;
2807 self.inredir = false;
2808 }
2809 }
2810
2811 /// Mark the current word as the one ZLE was looking for. Direct
2812 /// port of zsh/Src/lex.c:1881-1897 `gotword`. Only meaningful
2813 /// when the lexer was started with LEXFLAGS_ZLE for completion;
2814 /// after this call `lexflags` is cleared so subsequent tokens
2815 /// don't re-trigger word tracking.
2816 ///
2817 /// zshrs port note: zsh's gotword updates `wb`/`we` (word begin/
2818 /// end positions) based on `zlemetacs` (cursor pos), `zlemetall`
2819 /// (line length), `inbufct`, and `addedx` — all live in zsh's
2820 /// input.c globals which zshrs hasn't wired through the lexer.
2821 /// Only the `lexflags = 0` side-effect at lex.c:1895 is
2822 /// reproducible without that integration.
2823 pub fn gotword(&mut self) {
2824 // lex.c:1895 — `lexflags = 0;`
2825 self.lexflags = LexFlags::default();
2826 }
2827
2828 /// Register a heredoc to be processed at next newline
2829 pub fn register_heredoc(&mut self, terminator: String, strip_tabs: bool) {
2830 self.heredocs.push(HereDoc {
2831 terminator,
2832 strip_tabs,
2833 content: String::new(),
2834 quoted: false,
2835 processed: false,
2836 });
2837 }
2838
2839 /// Check for reserved word — mirrors lex.c:2002-2015 in `exalias`,
2840 /// but reachable from the bare `zshlex` path (without an
2841 /// AliasResolver). Promotes STRING tokens to keyword tokens when:
2842 /// - incmdpos is set (or text is `}` ending a brace block)
2843 /// - text is `]]` and we're inside `[[ ]]` (incond > 0)
2844 /// - text is bare `!` and we're at the start of a cond (incond == 1)
2845 pub fn check_reserved_word(&mut self) -> bool {
2846 if let Some(ref tokstr) = self.tokstr {
2847 if self.incmdpos || (tokstr == "}" && self.tok == LexTok::String) {
2848 if let Some(tok) = crate::tokens::lookup_reserved_word(tokstr) {
2849 self.tok = tok;
2850 if tok == LexTok::Repeat {
2851 self.inrepeat = 1;
2852 }
2853 if tok == LexTok::Dinbrack {
2854 self.incond = 1;
2855 }
2856 return true;
2857 }
2858 if tokstr == "]]" && self.incond > 0 {
2859 self.tok = LexTok::Doutbrack;
2860 self.incond = 0;
2861 return true;
2862 }
2863 }
2864 // lex.c:2010-2014 — `]]` and `!` are recognized inside `[[`
2865 // regardless of incmdpos.
2866 if self.incond > 0 && tokstr == "]]" {
2867 self.tok = LexTok::Doutbrack;
2868 self.incond = 0;
2869 return true;
2870 }
2871 if self.incond == 1 && tokstr == "!" {
2872 self.tok = LexTok::Bang;
2873 return true;
2874 }
2875 }
2876 false
2877 }
2878}
2879
/// Outcome of deciding whether a `((`-style construct is arithmetic or
/// a command.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CmdOrMath {
    /// The text was scanned as a command / command substitution.
    Cmd,
    /// The text parsed as an arithmetic expression closed by `))`.
    Math,
    /// Neither interpretation could be completed.
    Err,
}
2886
2887// ============================================================================
2888// Additional parsing functions ported from lex.c
2889// ============================================================================
2890
/// Check whether `input`, starting at byte offset `pos`, continues a
/// numeric glob range: `N-M>`, `N->`, `-M>` or `->`.
///
/// `pos` is expected to point just past the opening `<`. The accepted
/// shape is: zero or more ASCII digits, exactly one `-`, zero or more
/// ASCII digits, then `>`. Nothing is consumed; this is a pure
/// predicate over the slice.
///
/// Returns `false` for any other shape, and also when `pos` lies beyond
/// the end of `input` or does not fall on a character boundary (rather
/// than panicking on the slice).
pub fn isnumglob(input: &str, pos: usize) -> bool {
    let rest = match input.get(pos..) {
        Some(rest) => rest,
        None => return false,
    };

    let mut seen_dash = false;
    for c in rest.chars() {
        match c {
            _ if c.is_ascii_digit() => {}
            '-' if !seen_dash => seen_dash = true,
            '>' if seen_dash => return true,
            _ => return false,
        }
    }
    false
}
2921
2922/// Tokenize a string as if in double quotes (error-tolerant variant).
2923///
2924/// Direct port of zsh/Src/lex.c:1713-1733 `parsestrnoerr`. The C
2925/// source: zcontext_save → untokenize → inpush → strinbeg →
2926/// `lexbuf.ptr = tokstr = *s; lexbuf.siz = l + 1` →
2927/// `err = dquote_parse('\0', 1)` → strinend → inpop → zcontext_restore.
2928/// Returns the tokenized string on success, or the offending char as
2929/// an error code (zsh convention: `> 32 && < 127` → printable, else
2930/// generic).
2931///
2932/// zshrs port: the C version drives the lexer's dquote_parse method
2933/// against the input string. zshrs's standalone walker produces the
2934/// same BNULL/QSTRING/QTICK token markers without re-entering the
2935/// lexer — same output for typical bodies. Documented divergence:
2936/// nested cmd-sub `$(...)` and arith `$((...))` aren't lexed
2937/// recursively; the runtime handles them at expansion time.
2938pub fn parsestrnoerr(s: &str) -> Result<String, String> {
2939 parsestr_inner(s)
2940}
2941
2942/// Tokenize a string as if in double quotes (error-reporting variant).
2943///
2944/// Direct port of zsh/Src/lex.c:1693-1709 `parsestr`. C source:
2945/// `if ((err = parsestrnoerr(s))) { untokenize(*s); ... zerr("parse
2946/// error near `%c'", err); tok = LEXERR; }`. zshrs's wrapper
2947/// returns the same Result and lets the caller emit the diagnostic.
2948///
2949/// Both `parsestr` and `parsestrnoerr` share the inner walker; the
2950/// only difference in C is whether errors trigger `zerr`. zshrs
2951/// returns `Err(msg)` from both — the caller decides whether to
2952/// surface the diagnostic.
2953pub fn parsestr(s: &str) -> Result<String, String> {
2954 parsestr_inner(s)
2955}
2956
2957/// Shared body for parsestr / parsestrnoerr.
2958fn parsestr_inner(s: &str) -> Result<String, String> {
2959 let mut result = String::with_capacity(s.len());
2960 let chars: Vec<char> = s.chars().collect();
2961 let mut i = 0;
2962
2963 while i < chars.len() {
2964 let c = chars[i];
2965 match c {
2966 '\\' => {
2967 i += 1;
2968 if i < chars.len() {
2969 let next = chars[i];
2970 match next {
2971 '$' | '\\' | '`' | '"' | '\n' => {
2972 result.push(char_tokens::BNULL);
2973 result.push(next);
2974 }
2975 _ => {
2976 result.push('\\');
2977 result.push(next);
2978 }
2979 }
2980 } else {
2981 result.push('\\');
2982 }
2983 }
2984 '$' => {
2985 result.push(char_tokens::QSTRING);
2986 if i + 1 < chars.len() {
2987 let next = chars[i + 1];
2988 if next == '{' {
2989 result.push(char_tokens::INBRACE);
2990 i += 1;
2991 } else if next == '(' {
2992 result.push(char_tokens::INPAR);
2993 i += 1;
2994 }
2995 }
2996 }
2997 '`' => {
2998 result.push(char_tokens::QTICK);
2999 }
3000 _ => {
3001 result.push(c);
3002 }
3003 }
3004 i += 1;
3005 }
3006
3007 Ok(result)
3008}
3009
/// Locate the end of a subscript in `s`.
///
/// `s` is the text right after the opening bracket and `endchar` is
/// the closing character to look for. Returns the index of the first
/// `endchar` that is
/// - not inside single or double quotes,
/// - not preceded by a backslash, and
/// - not inside a nested `[...]` / `(...)` group,
///
/// or `None` when no such character exists. `None` is also returned
/// when `s` is empty or starts with `endchar` (an empty subscript).
///
/// The returned index counts characters, not bytes, and refers to the
/// closing character itself. The two only differ from byte offsets
/// when `s` contains non-ASCII text.
///
/// Nesting is tracked with a single depth counter shared by `[`/`(`
/// and `]`/`)`; braces are not tracked. A closer that ends a nested
/// group is never treated as the terminator, so `a[1]]` ends at the
/// second `]`, not the first.
pub fn parse_subscript(s: &str, endchar: char) -> Option<usize> {
    if s.is_empty() || s.starts_with(endchar) {
        return None;
    }

    let mut depth = 0usize;
    let mut in_squote = false;
    let mut in_dquote = false;
    let mut chars = s.chars().enumerate();

    while let Some((idx, c)) = chars.next() {
        // Inside single quotes only the closing quote matters.
        if in_squote {
            if c == '\'' {
                in_squote = false;
            }
            continue;
        }

        // Inside double quotes a backslash protects the next character.
        if in_dquote {
            match c {
                '"' => in_dquote = false,
                '\\' => {
                    chars.next();
                }
                _ => {}
            }
            continue;
        }

        match c {
            '\\' => {
                // Skip the escaped character entirely.
                chars.next();
                continue;
            }
            '\'' => in_squote = true,
            '"' => in_dquote = true,
            '[' | '(' => depth += 1,
            ']' | ')' if depth > 0 => {
                // Closes a nested group; it cannot also be the terminator.
                depth -= 1;
                continue;
            }
            _ => {}
        }

        if c == endchar && depth == 0 {
            return Some(idx);
        }
    }

    None
}
3088
3089/// Tokenize a string as if it were a normal command-line argument
3090/// but it may contain separators. Used for ${...%...} substitutions.
3091///
3092/// Direct port of zsh/Src/lex.c:1796-1880 `parse_subst_string`.
3093/// zsh's version sets `noaliases = 1` + `lexflags = 0` + uses
3094/// zcontext_save/inpush/strinbeg → dquote_parse('\0', 1) →
3095/// strinend/inpop/zcontext_restore. zshrs's standalone walker
3096/// produces the same BNULL/SNULL/DNULL/INPAR/INBRACK markers
3097/// without re-entering the lexer.
3098///
3099/// zshrs port note: the C source returns int (0=ok, char value =
3100/// where it stopped on error); zshrs returns Result<String,String>
3101/// returning the tokenized text directly. Lossy for callers that
3102/// need to know the exact stop position, but nothing in zshrs's
3103/// expansion layer uses that yet.
3104pub fn parse_subst_string(s: &str) -> Result<String, String> {
3105 if s.is_empty() {
3106 return Ok(String::new());
3107 }
3108
3109 let mut result = String::with_capacity(s.len());
3110 let chars: Vec<char> = s.chars().collect();
3111 let mut i = 0;
3112
3113 while i < chars.len() {
3114 let c = chars[i];
3115 match c {
3116 '\\' => {
3117 result.push(char_tokens::BNULL);
3118 i += 1;
3119 if i < chars.len() {
3120 result.push(chars[i]);
3121 }
3122 }
3123 '\'' => {
3124 result.push(char_tokens::SNULL);
3125 i += 1;
3126 while i < chars.len() && chars[i] != '\'' {
3127 result.push(chars[i]);
3128 i += 1;
3129 }
3130 result.push(char_tokens::SNULL);
3131 }
3132 '"' => {
3133 result.push(char_tokens::DNULL);
3134 i += 1;
3135 while i < chars.len() && chars[i] != '"' {
3136 if chars[i] == '\\' && i + 1 < chars.len() {
3137 result.push(char_tokens::BNULL);
3138 i += 1;
3139 result.push(chars[i]);
3140 } else if chars[i] == '$' {
3141 result.push(char_tokens::QSTRING);
3142 } else {
3143 result.push(chars[i]);
3144 }
3145 i += 1;
3146 }
3147 result.push(char_tokens::DNULL);
3148 }
3149 '$' => {
3150 result.push(char_tokens::STRING);
3151 if i + 1 < chars.len() {
3152 match chars[i + 1] {
3153 '{' => {
3154 result.push(char_tokens::INBRACE);
3155 i += 1;
3156 }
3157 '(' => {
3158 result.push(char_tokens::INPAR);
3159 i += 1;
3160 }
3161 _ => {}
3162 }
3163 }
3164 }
3165 '*' => result.push(char_tokens::STAR),
3166 '?' => result.push(char_tokens::QUEST),
3167 '[' => result.push(char_tokens::INBRACK),
3168 ']' => result.push(char_tokens::OUTBRACK),
3169 '{' => result.push(char_tokens::INBRACE),
3170 '}' => result.push(char_tokens::OUTBRACE),
3171 '~' => result.push(char_tokens::TILDE),
3172 '#' => result.push(char_tokens::POUND),
3173 '^' => result.push(char_tokens::HAT),
3174 _ => result.push(c),
3175 }
3176 i += 1;
3177 }
3178
3179 Ok(result)
3180}
3181
3182/// Untokenize a string - convert tokenized chars back to original
3183///
3184/// Port of untokenize() from exec.c (but used by lexer too)
3185/// Like `untokenize`, but maps SNULL → `'` and DNULL → `"` instead of
3186/// stripping them. Used by callers that need the source form including
3187/// quoting (e.g. arithmetic-substitution detection in compile_zsh).
3188pub fn untokenize_preserve_quotes(s: &str) -> String {
3189 let mut result = String::with_capacity(s.len() + 4);
3190 for c in s.chars() {
3191 let cu = c as u32;
3192 if (0x83..=0x9f).contains(&cu) {
3193 match c {
3194 c if c == char_tokens::POUND => result.push('#'),
3195 c if c == char_tokens::STRING => result.push('$'),
3196 c if c == char_tokens::HAT => result.push('^'),
3197 c if c == char_tokens::STAR => result.push('*'),
3198 c if c == char_tokens::INPAR => result.push('('),
3199 c if c == char_tokens::OUTPAR => result.push(')'),
3200 c if c == char_tokens::INPARMATH => result.push('('),
3201 c if c == char_tokens::OUTPARMATH => result.push(')'),
3202 c if c == char_tokens::QSTRING => result.push('$'),
3203 c if c == char_tokens::EQUALS => result.push('='),
3204 c if c == char_tokens::BAR => result.push('|'),
3205 c if c == char_tokens::INBRACE => result.push('{'),
3206 c if c == char_tokens::OUTBRACE => result.push('}'),
3207 c if c == char_tokens::INBRACK => result.push('['),
3208 c if c == char_tokens::OUTBRACK => result.push(']'),
3209 c if c == char_tokens::TICK => result.push('`'),
3210 c if c == char_tokens::INANG => result.push('<'),
3211 c if c == char_tokens::OUTANG => result.push('>'),
3212 c if c == char_tokens::OUTANGPROC => result.push('>'),
3213 c if c == char_tokens::QUEST => result.push('?'),
3214 c if c == char_tokens::TILDE => result.push('~'),
3215 c if c == char_tokens::QTICK => result.push('`'),
3216 c if c == char_tokens::COMMA => result.push(','),
3217 c if c == char_tokens::DASH => result.push('-'),
3218 c if c == char_tokens::BANG => result.push('!'),
3219 c if c == char_tokens::SNULL => result.push('\''),
3220 c if c == char_tokens::DNULL => result.push('"'),
3221 c if c == char_tokens::BNULL => result.push('\\'),
3222 _ => {
3223 let idx = c as usize;
3224 if idx < char_tokens::ZTOKENS.len() {
3225 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3226 } else {
3227 result.push(c);
3228 }
3229 }
3230 }
3231 } else {
3232 result.push(c);
3233 }
3234 }
3235 result
3236}
3237
3238/// Decode `\X` escape sequences for `$'...'` content.
3239/// Port of `getkeystring()` from Src/utils.c:6915 with the
3240/// `GETKEYS_DOLLARS_QUOTE` flag — handles the `\n`/`\t`/`\r`/`\e`/
3241/// `\E`/`\a`/`\b`/`\f`/`\v`/`\xNN`/`\uNNNN`/`\UNNNNNNNN`/octal/`\\`/`\'`
3242/// arms the C source recognizes inside dollar-single-quoted
3243/// strings. Walks `chars[start..]` until `Snull` is hit, returns
3244/// `(decoded, end_idx)` where `end_idx` points at the terminating
3245/// `Snull`. `Bnull \\` and `Bnull '` are user-literal `\` / `'`
3246/// per Src/lex.c:1303.
3247fn getkeystring_dollar_quote(chars: &[char], start: usize) -> (String, usize) {
3248 let mut out = String::new();
3249 let mut i = start;
3250 while i < chars.len() {
3251 let c = chars[i];
3252 if c == char_tokens::SNULL {
3253 return (out, i);
3254 }
3255 if c == char_tokens::BNULL {
3256 // Bnull marks a user-literal `\\` or `\'` per
3257 // Src/lex.c:1303-1306. The next char is the literal.
3258 i += 1;
3259 if i < chars.len() {
3260 out.push(chars[i]);
3261 i += 1;
3262 }
3263 continue;
3264 }
3265 if c == '\\' && i + 1 < chars.len() {
3266 let nc = chars[i + 1];
3267 match nc {
3268 'a' => {
3269 out.push('\x07');
3270 i += 2;
3271 }
3272 'b' => {
3273 out.push('\x08');
3274 i += 2;
3275 }
3276 'e' | 'E' => {
3277 out.push('\x1b');
3278 i += 2;
3279 }
3280 'f' => {
3281 out.push('\x0c');
3282 i += 2;
3283 }
3284 'n' => {
3285 out.push('\n');
3286 i += 2;
3287 }
3288 'r' => {
3289 out.push('\r');
3290 i += 2;
3291 }
3292 't' => {
3293 out.push('\t');
3294 i += 2;
3295 }
3296 'v' => {
3297 out.push('\x0b');
3298 i += 2;
3299 }
3300 '\\' | '\'' | '"' => {
3301 out.push(nc);
3302 i += 2;
3303 }
3304 'x' => {
3305 // \xNN — up to 2 hex digits per Src/utils.c:7156
3306 let mut val: u32 = 0;
3307 let mut consumed = 2; // \x
3308 let mut got = 0;
3309 while got < 2 && i + consumed < chars.len() {
3310 let h = chars[i + consumed];
3311 if let Some(d) = h.to_digit(16) {
3312 val = val * 16 + d;
3313 consumed += 1;
3314 got += 1;
3315 } else {
3316 break;
3317 }
3318 }
3319 if got == 0 {
3320 // No hex digits — emit literal `\x` per
3321 // Src/utils.c:7160-7163 fallthrough
3322 out.push('\\');
3323 out.push('x');
3324 } else if let Some(ch) = char::from_u32(val) {
3325 out.push(ch);
3326 }
3327 i += consumed;
3328 }
3329 'u' | 'U' => {
3330 let n = if nc == 'u' { 4 } else { 8 };
3331 let mut val: u32 = 0;
3332 let mut consumed = 2; // \u or \U
3333 let mut got = 0;
3334 while got < n && i + consumed < chars.len() {
3335 let h = chars[i + consumed];
3336 if let Some(d) = h.to_digit(16) {
3337 val = val * 16 + d;
3338 consumed += 1;
3339 got += 1;
3340 } else {
3341 break;
3342 }
3343 }
3344 if let Some(ch) = char::from_u32(val) {
3345 out.push(ch);
3346 }
3347 i += consumed;
3348 }
3349 '0'..='7' => {
3350 // Octal — up to 3 digits per Src/utils.c:7156
3351 let mut val: u32 = 0;
3352 let mut consumed = 1; // skip backslash
3353 let mut got = 0;
3354 while got < 3 && i + consumed < chars.len() {
3355 let h = chars[i + consumed];
3356 if let Some(d) = h.to_digit(8) {
3357 val = val * 8 + d;
3358 consumed += 1;
3359 got += 1;
3360 } else {
3361 break;
3362 }
3363 }
3364 if let Some(ch) = char::from_u32(val) {
3365 out.push(ch);
3366 }
3367 i += consumed;
3368 }
3369 _ => {
3370 // Unknown escape — keep `\` per
3371 // Src/utils.c:7180-7185 default branch
3372 out.push('\\');
3373 out.push(nc);
3374 i += 2;
3375 }
3376 }
3377 continue;
3378 }
3379 out.push(c);
3380 i += 1;
3381 }
3382 (out, i)
3383}
3384
3385pub fn untokenize(s: &str) -> String {
3386 let mut result = String::with_capacity(s.len());
3387 let chars: Vec<char> = s.chars().collect();
3388 let mut i = 0;
3389
3390 while i < chars.len() {
3391 let c = chars[i];
3392 // Token chars live in zsh's META range (0x83 = META through 0x9f =
3393 // BNULL). Anything in that range needs un-mapping before display
3394 // or downstream consumption. The original `< 32` test was wrong —
3395 // none of zsh's tokens land in that range.
3396 let cu = c as u32;
3397 if (0x83..=0x9f).contains(&cu) {
3398 // `Qstring Snull` opens a `$'...'` ANSI-C-quoted region.
3399 // Per Src/subst.c:301-304, when `stringsubst()` hits an
3400 // `Snull` it calls `stringsubstquote()` (line 206) which
3401 // calls `getkeystring(s+2, ...)` over the content,
3402 // skipping the leading `Qstring Snull` and stopping at
3403 // the closing `Snull`. zshrs's pipeline runs untokenize
3404 // at points where C runs subst, so we apply the same
3405 // decoding inline here. Result: the entire `$'...'`
3406 // region is replaced by its decoded content with no
3407 // `$`/`'`/marker remnants.
3408 if c == char_tokens::QSTRING
3409 && i + 1 < chars.len()
3410 && chars[i + 1] == char_tokens::SNULL
3411 {
3412 let (decoded, end) = getkeystring_dollar_quote(&chars, i + 2);
3413 result.push_str(&decoded);
3414 // `end` points at the closing `Snull` (or end of
3415 // string if unterminated); skip past it.
3416 i = if end < chars.len() { end + 1 } else { end };
3417 continue;
3418 }
3419 // Convert token back to original character
3420 match c {
3421 c if c == char_tokens::POUND => result.push('#'),
3422 c if c == char_tokens::STRING => result.push('$'),
3423 c if c == char_tokens::HAT => result.push('^'),
3424 c if c == char_tokens::STAR => result.push('*'),
3425 c if c == char_tokens::INPAR => result.push('('),
3426 c if c == char_tokens::OUTPAR => result.push(')'),
3427 c if c == char_tokens::INPARMATH => result.push('('),
3428 c if c == char_tokens::OUTPARMATH => result.push(')'),
3429 c if c == char_tokens::QSTRING => result.push('$'),
3430 c if c == char_tokens::EQUALS => result.push('='),
3431 c if c == char_tokens::BAR => result.push('|'),
3432 c if c == char_tokens::INBRACE => result.push('{'),
3433 c if c == char_tokens::OUTBRACE => result.push('}'),
3434 c if c == char_tokens::INBRACK => result.push('['),
3435 c if c == char_tokens::OUTBRACK => result.push(']'),
3436 c if c == char_tokens::TICK => result.push('`'),
3437 c if c == char_tokens::INANG => result.push('<'),
3438 c if c == char_tokens::OUTANG => result.push('>'),
3439 c if c == char_tokens::OUTANGPROC => result.push('>'),
3440 c if c == char_tokens::QUEST => result.push('?'),
3441 c if c == char_tokens::TILDE => result.push('~'),
3442 c if c == char_tokens::QTICK => result.push('`'),
3443 c if c == char_tokens::COMMA => result.push(','),
3444 c if c == char_tokens::DASH => result.push('-'),
3445 c if c == char_tokens::BANG => result.push('!'),
3446 c if c == char_tokens::SNULL
3447 || c == char_tokens::DNULL
3448 || c == char_tokens::BNULL =>
3449 {
3450 // Null markers - skip
3451 }
3452 _ => {
3453 // Unknown token, try ztokens lookup
3454 let idx = c as usize;
3455 if idx < char_tokens::ZTOKENS.len() {
3456 result.push(char_tokens::ZTOKENS.chars().nth(idx).unwrap_or(c));
3457 } else {
3458 result.push(c);
3459 }
3460 }
3461 }
3462 } else {
3463 result.push(c);
3464 }
3465 i += 1;
3466 }
3467
3468 result
3469}
3470
/// Check if a string contains any token characters.
///
/// Token characters are the lexer's internal markers, which occupy the code
/// point range `0x83..=0x9f` — the same range `untokenize` treats as tokens.
/// Ordinary ASCII control characters (tab, newline, ...) are plain input and
/// are not reported; the earlier `< 32` test matched those and missed every
/// real token.
///
/// Returns `false` for an empty string.
pub fn has_token(s: &str) -> bool {
    s.chars().any(|c| matches!(c, '\u{83}'..='\u{9f}'))
}
3475
/// Convert token characters to their printable form for display.
///
/// This is a thin alias: it returns exactly what `untokenize` returns for
/// `s`, with no additional processing.
pub fn tokens_to_printable(s: &str) -> String {
    untokenize(s)
}
3480
#[cfg(test)]
mod tests {
    use super::*;

    /// Advances `$lexer` once per listed token kind, asserting after every
    /// step that the lexer's current token equals the expected kind.
    macro_rules! expect_tokens {
        ($lexer:ident => $($kind:expr),+ $(,)?) => {{
            $(
                $lexer.zshlex();
                assert_eq!($lexer.tok, $kind);
            )+
        }};
    }

    /// Keeps lexing until `$lexer` reports `$kind` or reaches end of input,
    /// then asserts that it stopped on `$kind`.
    macro_rules! seek_token {
        ($lexer:ident, $kind:expr) => {{
            loop {
                $lexer.zshlex();
                if $lexer.tok == $kind || $lexer.tok == LexTok::Endinput {
                    break;
                }
            }
            assert_eq!($lexer.tok, $kind);
        }};
    }

    /// Two plain words lex as two strings followed by end of input.
    #[test]
    fn test_simple_command() {
        let mut lexer = ZshLexer::new("echo hello");

        expect_tokens!(lexer => LexTok::String);
        assert_eq!(lexer.tokstr, Some(String::from("echo")));

        expect_tokens!(lexer => LexTok::String);
        assert_eq!(lexer.tokstr, Some(String::from("hello")));

        expect_tokens!(lexer => LexTok::Endinput);
    }

    /// `|` between commands is reported as its own `Bar` token.
    #[test]
    fn test_pipeline() {
        let mut lexer = ZshLexer::new("ls | grep foo");
        expect_tokens!(lexer =>
            LexTok::String,
            LexTok::Bar,
            LexTok::String,
            LexTok::String,
        );
    }

    /// `>` is reported as `Outang`, with the target as a following string.
    #[test]
    fn test_redirections() {
        let mut lexer = ZshLexer::new("echo > file");
        expect_tokens!(lexer => LexTok::String, LexTok::Outang, LexTok::String);
    }

    /// `<<` is reported as `Dinang`, with the terminator as a string.
    #[test]
    fn test_heredoc() {
        let mut lexer = ZshLexer::new("cat << EOF");
        expect_tokens!(lexer => LexTok::String, LexTok::Dinang, LexTok::String);
    }

    /// A single-quoted word is one string token that carries text.
    #[test]
    fn test_single_quotes() {
        let mut lexer = ZshLexer::new("echo 'hello world'");
        expect_tokens!(lexer => LexTok::String, LexTok::String);
        // Should contain Snull markers around literal content
        assert!(lexer.tokstr.is_some());
    }

    /// `function foo { }` lexes as Func, String("foo"), Inbrace, Outbrace.
    #[test]
    fn test_function_tokens() {
        let mut lexer = ZshLexer::new("function foo { }");

        lexer.zshlex();
        assert_eq!(lexer.tok, LexTok::Func, "expected Func, got {:?}", lexer.tok);

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::String,
            "expected String for 'foo', got {:?}",
            lexer.tok
        );
        assert_eq!(lexer.tokstr, Some(String::from("foo")));

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Inbrace,
            "expected Inbrace, got {:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Outbrace,
            "expected Outbrace, got {:?} tokstr={:?} incmdpos={}",
            lexer.tok,
            lexer.tokstr,
            lexer.incmdpos
        );
    }

    /// A double-quoted word with an expansion is one string token with text.
    #[test]
    fn test_double_quotes() {
        let mut lexer = ZshLexer::new("echo \"hello $name\"");
        expect_tokens!(lexer => LexTok::String, LexTok::String);
        // Should contain tokenized content
        assert!(lexer.tokstr.is_some());
    }

    /// `$(...)` stays inside a single string token.
    #[test]
    fn test_command_substitution() {
        let mut lexer = ZshLexer::new("echo $(pwd)");
        expect_tokens!(lexer => LexTok::String, LexTok::String);
    }

    /// In command position, `NAME=value` is reported as `Envstring`.
    #[test]
    fn test_env_assignment() {
        let mut lexer = ZshLexer::new("FOO=bar echo");
        lexer.incmdpos = true;

        lexer.zshlex();
        assert_eq!(
            lexer.tok,
            LexTok::Envstring,
            "tok={:?} tokstr={:?}",
            lexer.tok,
            lexer.tokstr
        );

        expect_tokens!(lexer => LexTok::String);
    }

    /// In command position, `NAME=(` is reported as `Envarray`.
    #[test]
    fn test_array_assignment() {
        let mut lexer = ZshLexer::new("arr=(a b c)");
        lexer.incmdpos = true;
        expect_tokens!(lexer => LexTok::Envarray);
    }

    /// `<(...)` and `>(...)` are each tokenized into a string token.
    #[test]
    fn test_process_substitution() {
        let mut lexer = ZshLexer::new("diff <(ls) >(cat)");
        // diff, then <(ls), then >(cat)
        expect_tokens!(lexer => LexTok::String, LexTok::String, LexTok::String);
    }

    /// `$((...))` stays inside a single string token.
    #[test]
    fn test_arithmetic() {
        let mut lexer = ZshLexer::new("echo $((1+2))");
        expect_tokens!(lexer => LexTok::String, LexTok::String);
    }

    /// The three case terminators `;;`, `;&` and `;|` get distinct tokens
    /// and appear in source order.
    #[test]
    fn test_semicolon_variants() {
        let mut lexer = ZshLexer::new("case x in a) cmd;; b) cmd;& c) cmd;| esac");

        // Skip to first ;;
        seek_token!(lexer, LexTok::Dsemi);

        // Find ;&
        seek_token!(lexer, LexTok::Semiamp);

        // Find ;|
        seek_token!(lexer, LexTok::Semibar);
    }
}