Skip to main content

shuck_parser/parser/lexer/
mod.rs

//! Lexer for bash scripts.
//!
//! Tokenizes input into a stream of tokens with source position tracking.
4
5use std::{collections::VecDeque, ops::Range, sync::Arc};
6
7use memchr::{memchr, memchr_iter, memrchr};
8use shuck_ast::{Position, Span, TokenKind};
9use smallvec::SmallVec;
10
11use super::{ShellDialect, ShellProfile, ZshOptionState, ZshOptionTimeline};
12
/// Compact bit set carried by every lexed token.
///
/// The default value has no bits set.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub(crate) struct TokenFlags(u8);

impl TokenFlags {
    /// Bit reported by [`TokenFlags::has_cooked_text`].
    const COOKED_TEXT: u8 = 1 << 0;
    /// Bit reported by [`TokenFlags::is_synthetic`].
    const SYNTHETIC: u8 = 1 << 1;

    /// Returns a flag set with no bits set.
    const fn empty() -> Self {
        Self(0)
    }

    /// Returns a flag set with only the cooked-text bit set.
    const fn cooked_text() -> Self {
        Self(Self::COOKED_TEXT)
    }

    /// Returns a copy of `self` with the synthetic bit additionally set.
    pub(crate) const fn with_synthetic(self) -> Self {
        Self(self.0 | Self::SYNTHETIC)
    }

    /// Returns `true` when the cooked-text bit is set.
    pub(crate) const fn has_cooked_text(self) -> bool {
        self.0 & Self::COOKED_TEXT != 0
    }

    /// Returns `true` when the synthetic bit is set.
    pub(crate) const fn is_synthetic(self) -> bool {
        self.0 & Self::SYNTHETIC != 0
    }
}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub(crate) enum TokenText<'a> {
43    Borrowed(&'a str),
44    Shared {
45        source: Arc<str>,
46        range: Range<usize>,
47    },
48    Owned(String),
49}
50
51impl TokenText<'_> {
52    pub(crate) fn as_str(&self) -> &str {
53        match self {
54            Self::Borrowed(text) => text,
55            Self::Shared { source, range } => &source[range.clone()],
56            Self::Owned(text) => text,
57        }
58    }
59
60    fn into_owned<'a>(self) -> TokenText<'a> {
61        match self {
62            Self::Borrowed(text) => TokenText::Owned(text.to_string()),
63            Self::Shared { source, range } => TokenText::Shared { source, range },
64            Self::Owned(text) => TokenText::Owned(text),
65        }
66    }
67
68    fn into_shared<'a>(self, source: &Arc<str>, span: Option<Span>) -> TokenText<'a> {
69        match self {
70            Self::Borrowed(text) => span
71                .filter(|span| span.end.offset <= source.len())
72                .map_or_else(
73                    || TokenText::Owned(text.to_string()),
74                    |span| TokenText::Shared {
75                        source: Arc::clone(source),
76                        range: span.start.offset..span.end.offset,
77                    },
78                ),
79            Self::Shared { source, range } => TokenText::Shared { source, range },
80            Self::Owned(text) => TokenText::Owned(text),
81        }
82    }
83}
84
/// Classification of one segment inside a lexed shell word.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LexedWordSegmentKind {
    /// Unquoted or otherwise plain text.
    Plain,
    /// Text from a single-quoted string (`'...'`).
    SingleQuoted,
    /// Text from a `$'...'` string.
    DollarSingleQuoted,
    /// Text from a double-quoted string (`"..."`).
    DoubleQuoted,
    /// Text from a `$"..."` string.
    DollarDoubleQuoted,
    /// Text composed from multiple lexical forms.
    Composite,
}
101
102/// One segment of a lexed shell word, optionally backed by source text.
103#[derive(Debug, Clone, PartialEq, Eq)]
104pub(crate) struct LexedWordSegment<'a> {
105    kind: LexedWordSegmentKind,
106    text: TokenText<'a>,
107    span: Option<Span>,
108    wrapper_span: Option<Span>,
109}
110
111impl<'a> LexedWordSegment<'a> {
112    fn borrowed(kind: LexedWordSegmentKind, text: &'a str, span: Option<Span>) -> Self {
113        Self {
114            kind,
115            text: TokenText::Borrowed(text),
116            span,
117            wrapper_span: span,
118        }
119    }
120
121    fn borrowed_with_spans(
122        kind: LexedWordSegmentKind,
123        text: &'a str,
124        span: Option<Span>,
125        wrapper_span: Option<Span>,
126    ) -> Self {
127        Self {
128            kind,
129            text: TokenText::Borrowed(text),
130            span,
131            wrapper_span,
132        }
133    }
134
135    fn owned(kind: LexedWordSegmentKind, text: String) -> Self {
136        Self {
137            kind,
138            text: TokenText::Owned(text),
139            span: None,
140            wrapper_span: None,
141        }
142    }
143
144    fn owned_with_spans(
145        kind: LexedWordSegmentKind,
146        text: String,
147        span: Option<Span>,
148        wrapper_span: Option<Span>,
149    ) -> Self {
150        Self {
151            kind,
152            text: TokenText::Owned(text),
153            span,
154            wrapper_span,
155        }
156    }
157
158    /// Borrow this segment's cooked text.
159    pub(crate) fn as_str(&self) -> &str {
160        self.text.as_str()
161    }
162
163    pub(crate) const fn text_is_source_backed(&self) -> bool {
164        matches!(self.text, TokenText::Borrowed(_) | TokenText::Shared { .. })
165    }
166
167    /// Return the lexical classification of this segment.
168    pub(crate) const fn kind(&self) -> LexedWordSegmentKind {
169        self.kind
170    }
171
172    /// Return the span of the inner text, if it is tracked.
173    pub(crate) const fn span(&self) -> Option<Span> {
174        self.span
175    }
176
177    /// Return the span including surrounding quoting syntax when available.
178    pub(crate) fn wrapper_span(&self) -> Option<Span> {
179        self.wrapper_span.or(self.span)
180    }
181
182    fn rebased(mut self, base: Position) -> Self {
183        self.span = self.span.map(|span| span.rebased(base));
184        self.wrapper_span = self.wrapper_span.map(|span| span.rebased(base));
185        self
186    }
187
188    fn into_owned<'b>(self) -> LexedWordSegment<'b> {
189        LexedWordSegment {
190            kind: self.kind,
191            text: self.text.into_owned(),
192            span: self.span,
193            wrapper_span: self.wrapper_span,
194        }
195    }
196
197    fn into_shared<'b>(self, source: &Arc<str>) -> LexedWordSegment<'b> {
198        LexedWordSegment {
199            kind: self.kind,
200            text: self.text.into_shared(source, self.span),
201            span: self.span,
202            wrapper_span: self.wrapper_span,
203        }
204    }
205}
206
207/// Source-backed representation of a shell word produced by the lexer.
208#[derive(Debug, Clone, PartialEq, Eq)]
209pub(crate) struct LexedWord<'a> {
210    primary_segment: LexedWordSegment<'a>,
211    trailing_segments: Vec<LexedWordSegment<'a>>,
212}
213
214impl<'a> LexedWord<'a> {
215    fn from_segment(primary_segment: LexedWordSegment<'a>) -> Self {
216        Self {
217            primary_segment,
218            trailing_segments: Vec::new(),
219        }
220    }
221
222    fn borrowed(kind: LexedWordSegmentKind, text: &'a str, span: Option<Span>) -> Self {
223        Self::from_segment(LexedWordSegment::borrowed(kind, text, span))
224    }
225
226    fn owned(kind: LexedWordSegmentKind, text: String) -> Self {
227        Self::from_segment(LexedWordSegment::owned(kind, text))
228    }
229
230    fn push_segment(&mut self, segment: LexedWordSegment<'a>) {
231        self.trailing_segments.push(segment);
232    }
233
234    /// Iterate over the segments that make up this word.
235    pub(crate) fn segments(&self) -> impl Iterator<Item = &LexedWordSegment<'a>> {
236        std::iter::once(&self.primary_segment).chain(self.trailing_segments.iter())
237    }
238
239    /// Return the word text when it is represented by a single segment.
240    pub(crate) fn text(&self) -> Option<&str> {
241        self.single_segment().map(LexedWordSegment::as_str)
242    }
243
244    /// Join all segments into an owned string.
245    pub(crate) fn joined_text(&self) -> String {
246        let mut text = String::new();
247        for segment in self.segments() {
248            text.push_str(segment.as_str());
249        }
250        text
251    }
252
253    /// Return the only segment when this word is not segmented.
254    pub(crate) fn single_segment(&self) -> Option<&LexedWordSegment<'a>> {
255        self.trailing_segments
256            .is_empty()
257            .then_some(&self.primary_segment)
258    }
259
260    fn has_cooked_text(&self) -> bool {
261        self.segments()
262            .any(|segment| matches!(segment.text, TokenText::Owned(_)))
263    }
264
265    fn rebased(mut self, base: Position) -> Self {
266        self.primary_segment = self.primary_segment.rebased(base);
267        self.trailing_segments = self
268            .trailing_segments
269            .into_iter()
270            .map(|segment| segment.rebased(base))
271            .collect();
272        self
273    }
274
275    fn into_owned<'b>(self) -> LexedWord<'b> {
276        LexedWord {
277            primary_segment: self.primary_segment.into_owned(),
278            trailing_segments: self
279                .trailing_segments
280                .into_iter()
281                .map(LexedWordSegment::into_owned)
282                .collect(),
283        }
284    }
285
286    fn into_shared<'b>(self, source: &Arc<str>) -> LexedWord<'b> {
287        LexedWord {
288            primary_segment: self.primary_segment.into_shared(source),
289            trailing_segments: self
290                .trailing_segments
291                .into_iter()
292                .map(|segment| segment.into_shared(source))
293                .collect(),
294        }
295    }
296}
297
/// Kinds of lexer error payloads attached to `TokenKind::Error`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LexerErrorKind {
    /// Unterminated `$()` command substitution.
    CommandSubstitution,
    /// Unterminated backtick command substitution.
    BacktickSubstitution,
    /// Unterminated single-quoted string.
    SingleQuote,
    /// Unterminated double-quoted string.
    DoubleQuote,
}

impl LexerErrorKind {
    /// Human-readable message for this lexer error kind.
    ///
    /// The match is exhaustive so adding a variant forces a message to be
    /// written for it.
    pub(crate) const fn message(self) -> &'static str {
        match self {
            Self::CommandSubstitution => "unterminated command substitution",
            Self::BacktickSubstitution => "unterminated backtick substitution",
            Self::SingleQuote => "unterminated single quote",
            Self::DoubleQuote => "unterminated double quote",
        }
    }
}
322
323#[derive(Debug, Clone, PartialEq, Eq)]
324pub(crate) enum TokenPayload<'a> {
325    None,
326    Word(LexedWord<'a>),
327    Fd(i32),
328    FdPair(i32, i32),
329    Error(LexerErrorKind),
330}
331
332/// Token produced by the shell lexer.
333///
334/// Public consumers can inspect the token kind and source span. Word payloads,
335/// descriptor payloads, and lexer recovery details are currently parser-internal
336/// so the lexer can evolve without expanding the public API.
337#[derive(Debug, Clone, PartialEq, Eq)]
338pub struct LexedToken<'a> {
339    /// Token kind used by the parser.
340    pub kind: TokenKind,
341    /// Source span covered by the token.
342    pub span: Span,
343    pub(crate) flags: TokenFlags,
344    payload: TokenPayload<'a>,
345}
346
347impl<'a> LexedToken<'a> {
348    fn word_segment_kind(kind: TokenKind) -> LexedWordSegmentKind {
349        match kind {
350            TokenKind::Word => LexedWordSegmentKind::Plain,
351            TokenKind::LiteralWord => LexedWordSegmentKind::SingleQuoted,
352            TokenKind::QuotedWord => LexedWordSegmentKind::DoubleQuoted,
353            _ => LexedWordSegmentKind::Composite,
354        }
355    }
356
357    pub(crate) fn punctuation(kind: TokenKind) -> Self {
358        Self {
359            kind,
360            span: Span::new(),
361            flags: TokenFlags::empty(),
362            payload: TokenPayload::None,
363        }
364    }
365
366    fn with_word_payload(kind: TokenKind, word: LexedWord<'a>) -> Self {
367        let flags = if word.has_cooked_text() {
368            TokenFlags::cooked_text()
369        } else {
370            TokenFlags::empty()
371        };
372
373        Self {
374            kind,
375            span: Span::new(),
376            flags,
377            payload: TokenPayload::Word(word),
378        }
379    }
380
381    fn borrowed_word(kind: TokenKind, text: &'a str, text_span: Option<Span>) -> Self {
382        Self::with_word_payload(
383            kind,
384            LexedWord::borrowed(Self::word_segment_kind(kind), text, text_span),
385        )
386    }
387
388    fn owned_word(kind: TokenKind, text: String) -> Self {
389        Self::with_word_payload(kind, LexedWord::owned(Self::word_segment_kind(kind), text))
390    }
391
392    fn comment() -> Self {
393        Self {
394            kind: TokenKind::Comment,
395            span: Span::new(),
396            flags: TokenFlags::empty(),
397            payload: TokenPayload::None,
398        }
399    }
400
401    fn fd(kind: TokenKind, fd: i32) -> Self {
402        Self {
403            kind,
404            span: Span::new(),
405            flags: TokenFlags::empty(),
406            payload: TokenPayload::Fd(fd),
407        }
408    }
409
410    fn fd_pair(kind: TokenKind, src_fd: i32, dst_fd: i32) -> Self {
411        Self {
412            kind,
413            span: Span::new(),
414            flags: TokenFlags::empty(),
415            payload: TokenPayload::FdPair(src_fd, dst_fd),
416        }
417    }
418
419    fn error(kind: LexerErrorKind) -> Self {
420        Self {
421            kind: TokenKind::Error,
422            span: Span::new(),
423            flags: TokenFlags::empty(),
424            payload: TokenPayload::Error(kind),
425        }
426    }
427
428    pub(crate) fn with_span(mut self, span: Span) -> Self {
429        self.span = span;
430        self
431    }
432
433    pub(crate) fn rebased(mut self, base: Position) -> Self {
434        self.span = self.span.rebased(base);
435        self.payload = match self.payload {
436            TokenPayload::Word(word) => TokenPayload::Word(word.rebased(base)),
437            payload => payload,
438        };
439        self
440    }
441
442    pub(crate) fn with_synthetic_flag(mut self) -> Self {
443        self.flags = self.flags.with_synthetic();
444        self
445    }
446
447    pub(crate) fn into_owned<'b>(self) -> LexedToken<'b> {
448        let payload = match self.payload {
449            TokenPayload::None => TokenPayload::None,
450            TokenPayload::Word(word) => TokenPayload::Word(word.into_owned()),
451            TokenPayload::Fd(fd) => TokenPayload::Fd(fd),
452            TokenPayload::FdPair(src_fd, dst_fd) => TokenPayload::FdPair(src_fd, dst_fd),
453            TokenPayload::Error(kind) => TokenPayload::Error(kind),
454        };
455
456        LexedToken {
457            kind: self.kind,
458            span: self.span,
459            flags: self.flags,
460            payload,
461        }
462    }
463
464    pub(crate) fn into_shared<'b>(self, source: &Arc<str>) -> LexedToken<'b> {
465        let payload = match self.payload {
466            TokenPayload::None => TokenPayload::None,
467            TokenPayload::Word(word) => TokenPayload::Word(word.into_shared(source)),
468            TokenPayload::Fd(fd) => TokenPayload::Fd(fd),
469            TokenPayload::FdPair(src_fd, dst_fd) => TokenPayload::FdPair(src_fd, dst_fd),
470            TokenPayload::Error(kind) => TokenPayload::Error(kind),
471        };
472
473        LexedToken {
474            kind: self.kind,
475            span: self.span,
476            flags: self.flags,
477            payload,
478        }
479    }
480
481    /// Borrow the token text when it is a single-segment word token.
482    pub(crate) fn word_text(&self) -> Option<&str> {
483        self.kind
484            .is_word_like()
485            .then_some(())
486            .and_then(|_| match &self.payload {
487                TokenPayload::Word(word) => word.text(),
488                _ => None,
489            })
490    }
491
492    /// Return an owned string containing the token's word text.
493    pub(crate) fn word_string(&self) -> Option<String> {
494        self.kind
495            .is_word_like()
496            .then_some(())
497            .and_then(|_| match &self.payload {
498                TokenPayload::Word(word) => Some(word.joined_text()),
499                _ => None,
500            })
501    }
502
503    /// Borrow the structured word payload for word-like tokens.
504    pub(crate) fn word(&self) -> Option<&LexedWord<'a>> {
505        match &self.payload {
506            TokenPayload::Word(word) => Some(word),
507            _ => None,
508        }
509    }
510
511    /// Borrow the original source slice when the token is source-backed and uncooked.
512    pub(crate) fn source_slice<'b>(&self, source: &'b str) -> Option<&'b str> {
513        if !self.kind.is_word_like() || self.flags.has_cooked_text() || self.flags.is_synthetic() {
514            return None;
515        }
516
517        (self.span.start.offset <= self.span.end.offset && self.span.end.offset <= source.len())
518            .then(|| &source[self.span.start.offset..self.span.end.offset])
519    }
520
521    /// Return the file-descriptor payload for redirection tokens that carry one.
522    pub(crate) fn fd_value(&self) -> Option<i32> {
523        match self.payload {
524            TokenPayload::Fd(fd) => Some(fd),
525            _ => None,
526        }
527    }
528
529    /// Return the `(source_fd, target_fd)` payload for descriptor-pair redirections.
530    pub(crate) fn fd_pair_value(&self) -> Option<(i32, i32)> {
531        match self.payload {
532            TokenPayload::FdPair(src_fd, dst_fd) => Some((src_fd, dst_fd)),
533            _ => None,
534        }
535    }
536
537    /// Return the lexer error payload when this token represents `TokenKind::Error`.
538    pub(crate) fn error_kind(&self) -> Option<LexerErrorKind> {
539        match self.payload {
540            TokenPayload::Error(kind) => Some(kind),
541            _ => None,
542        }
543    }
544}
545
546/// Result of reading a heredoc body from the source.
547#[derive(Debug, Clone, PartialEq)]
548pub(crate) struct HeredocRead {
549    /// Decoded heredoc content.
550    pub content: String,
551    /// Source span covering the heredoc body content.
552    pub content_span: Span,
553}
554
/// Maximum nesting depth for command substitution in the lexer.
/// Prevents stack overflow from deeply nested `$()` patterns.
const DEFAULT_MAX_SUBST_DEPTH: usize = 50;
/// Depth limit used when scanning parameter expansions.
// NOTE(review): the consumer of this constant is outside this view; confirm
// exactly what is bounded before relying on the wording above.
const MAX_PARAMETER_EXPANSION_SCAN_DEPTH: usize = 4;
559
560#[derive(Clone, Debug)]
561struct Cursor<'a> {
562    rest: &'a str,
563}
564
565impl<'a> Cursor<'a> {
566    fn new(source: &'a str) -> Self {
567        Self { rest: source }
568    }
569
570    fn first(&self) -> Option<char> {
571        self.rest.chars().next()
572    }
573
574    fn second(&self) -> Option<char> {
575        let mut chars = self.rest.chars();
576        chars.next()?;
577        chars.next()
578    }
579
580    fn third(&self) -> Option<char> {
581        let mut chars = self.rest.chars();
582        chars.next()?;
583        chars.next()?;
584        chars.next()
585    }
586
587    fn bump(&mut self) -> Option<char> {
588        let ch = self.first()?;
589        self.rest = &self.rest[ch.len_utf8()..];
590        Some(ch)
591    }
592
593    fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
594        let start = self.rest;
595        let mut end = 0;
596
597        for ch in start.chars() {
598            if !predicate(ch) {
599                break;
600            }
601            end += ch.len_utf8();
602        }
603
604        self.rest = &start[end..];
605        &start[..end]
606    }
607
608    fn rest(&self) -> &'a str {
609        self.rest
610    }
611
612    fn skip_bytes(&mut self, count: usize) {
613        self.rest = &self.rest[count..];
614    }
615
616    fn find_byte(&self, byte: u8) -> Option<usize> {
617        memchr(byte, self.rest.as_bytes())
618    }
619}
620
621#[derive(Clone, Debug)]
622struct PositionMap<'a> {
623    source: &'a str,
624    line_starts: Arc<[usize]>,
625    cached: Position,
626}
627
/// Counters collected by the lexer when the `benchmarking` feature is enabled.
#[cfg(feature = "benchmarking")]
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct LexerBenchmarkCounters {
    /// Count of current-position calls.
    // NOTE(review): the code that increments this is outside this view;
    // confirm which call site it tracks.
    pub(crate) current_position_calls: u64,
}
633
634impl<'a> PositionMap<'a> {
635    fn new(source: &'a str) -> Self {
636        let mut line_starts =
637            Vec::with_capacity(source.bytes().filter(|byte| *byte == b'\n').count() + 1);
638        line_starts.push(0);
639        line_starts.extend(
640            source
641                .bytes()
642                .enumerate()
643                .filter_map(|(index, byte)| (byte == b'\n').then_some(index + 1)),
644        );
645
646        Self {
647            source,
648            line_starts: line_starts.into(),
649            cached: Position::new(),
650        }
651    }
652
653    fn position(&mut self, offset: usize) -> Position {
654        if offset == self.cached.offset {
655            return self.cached;
656        }
657
658        let position = if offset > self.cached.offset && offset <= self.source.len() {
659            Self::advance_from(self.cached, &self.source[self.cached.offset..offset])
660        } else {
661            self.position_uncached(offset)
662        };
663        self.cached = position;
664        position
665    }
666
667    fn position_uncached(&self, offset: usize) -> Position {
668        let offset = offset.min(self.source.len());
669        let line_index = self
670            .line_starts
671            .partition_point(|start| *start <= offset)
672            .saturating_sub(1);
673        let line_start = self.line_starts[line_index];
674        let line_text = &self.source[line_start..offset];
675        let column = if line_text.is_ascii() {
676            line_text.len() + 1
677        } else {
678            line_text.chars().count() + 1
679        };
680
681        Position {
682            line: line_index + 1,
683            column,
684            offset,
685        }
686    }
687
688    fn advance_from(mut position: Position, text: &str) -> Position {
689        position.offset += text.len();
690        let newline_count = memchr_iter(b'\n', text.as_bytes()).count();
691        if newline_count == 0 {
692            position.column += if text.is_ascii() {
693                text.len()
694            } else {
695                text.chars().count()
696            };
697            return position;
698        }
699
700        position.line += newline_count;
701        let tail_start = memrchr(b'\n', text.as_bytes())
702            .map(|index| index + 1)
703            .unwrap_or_default();
704        let tail = &text[tail_start..];
705        position.column = if tail.is_ascii() {
706            tail.len() + 1
707        } else {
708            tail.chars().count() + 1
709        };
710        position
711    }
712}
713
714/// Source-backed lexer for shell scripts.
715///
716/// The public lexer surface is intended for lower-level tooling and
717/// benchmarks. It tokenizes using the default bash profile; use the parser
718/// constructors when dialect or zsh option state matters.
719#[derive(Clone)]
720pub struct Lexer<'a> {
721    input: &'a str,
722    /// Current byte offset in the input/reinjected stream.
723    offset: usize,
724    cursor: Cursor<'a>,
725    position_map: PositionMap<'a>,
726    /// Buffer for re-injected characters (e.g., rest-of-line after heredoc delimiter).
727    /// Consumed before `cursor`.
728    reinject_buf: VecDeque<char>,
729    /// Cursor byte offset to restore once a heredoc replay buffer is exhausted.
730    reinject_resume_offset: Option<usize>,
731    /// Maximum allowed nesting depth for command substitution
732    max_subst_depth: usize,
733    initial_zsh_options: Option<ZshOptionState>,
734    zsh_timeline: Option<Arc<ZshOptionTimeline>>,
735    zsh_timeline_index: usize,
736    #[cfg(feature = "benchmarking")]
737    benchmark_counters: Option<LexerBenchmarkCounters>,
738}
739
740mod cursor;
741mod heredoc;
742mod quotes;
743mod substitutions;
744mod tokens;
745mod word;
746
747pub(super) use heredoc::heredoc_line_matches_delimiter;
748pub(super) use substitutions::{
749    line_has_unclosed_double_paren, scan_command_substitution_body_len,
750    scan_command_substitution_body_len_inner,
751};
752#[cfg(test)]
753mod tests;