1use std::{collections::VecDeque, ops::Range, sync::Arc};
6
7use memchr::{memchr, memchr_iter, memrchr};
8use shuck_ast::{Position, Span, TokenKind};
9use smallvec::SmallVec;
10
11use super::{ShellDialect, ShellProfile, ZshOptionState, ZshOptionTimeline};
12
/// Compact bit set attached to every lexed token.
///
/// Two bits are defined: one recording that the token's word payload carries
/// "cooked" (owned, non-source-backed) text, and one marking the token as
/// synthetic.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub(crate) struct TokenFlags(u8);

impl TokenFlags {
    /// Bit set when the token's text is cooked rather than taken verbatim.
    const COOKED_TEXT: u8 = 0b0000_0001;
    /// Bit set when the token has been flagged as synthetic.
    const SYNTHETIC: u8 = 0b0000_0010;

    /// Returns a flag set with no bits raised.
    const fn empty() -> Self {
        TokenFlags(0)
    }

    /// Returns a flag set with only the cooked-text bit raised.
    const fn cooked_text() -> Self {
        TokenFlags(Self::COOKED_TEXT)
    }

    /// Returns a copy of `self` with the synthetic bit raised.
    pub(crate) const fn with_synthetic(self) -> Self {
        let TokenFlags(bits) = self;
        TokenFlags(bits | Self::SYNTHETIC)
    }

    /// Reports whether the cooked-text bit is raised.
    pub(crate) const fn has_cooked_text(self) -> bool {
        (self.0 & Self::COOKED_TEXT) == Self::COOKED_TEXT
    }

    /// Reports whether the synthetic bit is raised.
    pub(crate) const fn is_synthetic(self) -> bool {
        (self.0 & Self::SYNTHETIC) == Self::SYNTHETIC
    }
}
40
41#[derive(Debug, Clone, PartialEq, Eq)]
42pub(crate) enum TokenText<'a> {
43 Borrowed(&'a str),
44 Shared {
45 source: Arc<str>,
46 range: Range<usize>,
47 },
48 Owned(String),
49}
50
51impl TokenText<'_> {
52 pub(crate) fn as_str(&self) -> &str {
53 match self {
54 Self::Borrowed(text) => text,
55 Self::Shared { source, range } => &source[range.clone()],
56 Self::Owned(text) => text,
57 }
58 }
59
60 fn into_owned<'a>(self) -> TokenText<'a> {
61 match self {
62 Self::Borrowed(text) => TokenText::Owned(text.to_string()),
63 Self::Shared { source, range } => TokenText::Shared { source, range },
64 Self::Owned(text) => TokenText::Owned(text),
65 }
66 }
67
68 fn into_shared<'a>(self, source: &Arc<str>, span: Option<Span>) -> TokenText<'a> {
69 match self {
70 Self::Borrowed(text) => span
71 .filter(|span| span.end.offset <= source.len())
72 .map_or_else(
73 || TokenText::Owned(text.to_string()),
74 |span| TokenText::Shared {
75 source: Arc::clone(source),
76 range: span.start.offset..span.end.offset,
77 },
78 ),
79 Self::Shared { source, range } => TokenText::Shared { source, range },
80 Self::Owned(text) => TokenText::Owned(text),
81 }
82 }
83}
84
/// Classification of a single segment inside a lexed word.
///
/// NOTE(review): the variant descriptions below are inferred from the variant
/// names and from `LexedToken::word_segment_kind`; confirm against the lexer
/// submodules that construct segments.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LexedWordSegmentKind {
    /// Unquoted text (used for `TokenKind::Word` tokens).
    Plain,
    /// Single-quoted text (used for `TokenKind::LiteralWord` tokens).
    SingleQuoted,
    /// Presumably `$'...'` text.
    DollarSingleQuoted,
    /// Double-quoted text (used for `TokenKind::QuotedWord` tokens).
    DoubleQuoted,
    /// Presumably `$"..."` text.
    DollarDoubleQuoted,
    /// Fallback kind used for every other token kind.
    Composite,
}
101
102#[derive(Debug, Clone, PartialEq, Eq)]
104pub(crate) struct LexedWordSegment<'a> {
105 kind: LexedWordSegmentKind,
106 text: TokenText<'a>,
107 span: Option<Span>,
108 wrapper_span: Option<Span>,
109}
110
111impl<'a> LexedWordSegment<'a> {
112 fn borrowed(kind: LexedWordSegmentKind, text: &'a str, span: Option<Span>) -> Self {
113 Self {
114 kind,
115 text: TokenText::Borrowed(text),
116 span,
117 wrapper_span: span,
118 }
119 }
120
121 fn borrowed_with_spans(
122 kind: LexedWordSegmentKind,
123 text: &'a str,
124 span: Option<Span>,
125 wrapper_span: Option<Span>,
126 ) -> Self {
127 Self {
128 kind,
129 text: TokenText::Borrowed(text),
130 span,
131 wrapper_span,
132 }
133 }
134
135 fn owned(kind: LexedWordSegmentKind, text: String) -> Self {
136 Self {
137 kind,
138 text: TokenText::Owned(text),
139 span: None,
140 wrapper_span: None,
141 }
142 }
143
144 fn owned_with_spans(
145 kind: LexedWordSegmentKind,
146 text: String,
147 span: Option<Span>,
148 wrapper_span: Option<Span>,
149 ) -> Self {
150 Self {
151 kind,
152 text: TokenText::Owned(text),
153 span,
154 wrapper_span,
155 }
156 }
157
158 pub(crate) fn as_str(&self) -> &str {
160 self.text.as_str()
161 }
162
163 pub(crate) const fn text_is_source_backed(&self) -> bool {
164 matches!(self.text, TokenText::Borrowed(_) | TokenText::Shared { .. })
165 }
166
167 pub(crate) const fn kind(&self) -> LexedWordSegmentKind {
169 self.kind
170 }
171
172 pub(crate) const fn span(&self) -> Option<Span> {
174 self.span
175 }
176
177 pub(crate) fn wrapper_span(&self) -> Option<Span> {
179 self.wrapper_span.or(self.span)
180 }
181
182 fn rebased(mut self, base: Position) -> Self {
183 self.span = self.span.map(|span| span.rebased(base));
184 self.wrapper_span = self.wrapper_span.map(|span| span.rebased(base));
185 self
186 }
187
188 fn into_owned<'b>(self) -> LexedWordSegment<'b> {
189 LexedWordSegment {
190 kind: self.kind,
191 text: self.text.into_owned(),
192 span: self.span,
193 wrapper_span: self.wrapper_span,
194 }
195 }
196
197 fn into_shared<'b>(self, source: &Arc<str>) -> LexedWordSegment<'b> {
198 LexedWordSegment {
199 kind: self.kind,
200 text: self.text.into_shared(source, self.span),
201 span: self.span,
202 wrapper_span: self.wrapper_span,
203 }
204 }
205}
206
207#[derive(Debug, Clone, PartialEq, Eq)]
209pub(crate) struct LexedWord<'a> {
210 primary_segment: LexedWordSegment<'a>,
211 trailing_segments: Vec<LexedWordSegment<'a>>,
212}
213
214impl<'a> LexedWord<'a> {
215 fn from_segment(primary_segment: LexedWordSegment<'a>) -> Self {
216 Self {
217 primary_segment,
218 trailing_segments: Vec::new(),
219 }
220 }
221
222 fn borrowed(kind: LexedWordSegmentKind, text: &'a str, span: Option<Span>) -> Self {
223 Self::from_segment(LexedWordSegment::borrowed(kind, text, span))
224 }
225
226 fn owned(kind: LexedWordSegmentKind, text: String) -> Self {
227 Self::from_segment(LexedWordSegment::owned(kind, text))
228 }
229
230 fn push_segment(&mut self, segment: LexedWordSegment<'a>) {
231 self.trailing_segments.push(segment);
232 }
233
234 pub(crate) fn segments(&self) -> impl Iterator<Item = &LexedWordSegment<'a>> {
236 std::iter::once(&self.primary_segment).chain(self.trailing_segments.iter())
237 }
238
239 pub(crate) fn text(&self) -> Option<&str> {
241 self.single_segment().map(LexedWordSegment::as_str)
242 }
243
244 pub(crate) fn joined_text(&self) -> String {
246 let mut text = String::new();
247 for segment in self.segments() {
248 text.push_str(segment.as_str());
249 }
250 text
251 }
252
253 pub(crate) fn single_segment(&self) -> Option<&LexedWordSegment<'a>> {
255 self.trailing_segments
256 .is_empty()
257 .then_some(&self.primary_segment)
258 }
259
260 fn has_cooked_text(&self) -> bool {
261 self.segments()
262 .any(|segment| matches!(segment.text, TokenText::Owned(_)))
263 }
264
265 fn rebased(mut self, base: Position) -> Self {
266 self.primary_segment = self.primary_segment.rebased(base);
267 self.trailing_segments = self
268 .trailing_segments
269 .into_iter()
270 .map(|segment| segment.rebased(base))
271 .collect();
272 self
273 }
274
275 fn into_owned<'b>(self) -> LexedWord<'b> {
276 LexedWord {
277 primary_segment: self.primary_segment.into_owned(),
278 trailing_segments: self
279 .trailing_segments
280 .into_iter()
281 .map(LexedWordSegment::into_owned)
282 .collect(),
283 }
284 }
285
286 fn into_shared<'b>(self, source: &Arc<str>) -> LexedWord<'b> {
287 LexedWord {
288 primary_segment: self.primary_segment.into_shared(source),
289 trailing_segments: self
290 .trailing_segments
291 .into_iter()
292 .map(|segment| segment.into_shared(source))
293 .collect(),
294 }
295 }
296}
297
/// The kinds of unterminated construct the lexer reports as an error token.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum LexerErrorKind {
    /// A command substitution that was never closed.
    CommandSubstitution,
    /// A backtick substitution that was never closed.
    BacktickSubstitution,
    /// A single-quoted string that was never closed.
    SingleQuote,
    /// A double-quoted string that was never closed.
    DoubleQuote,
}

impl LexerErrorKind {
    /// Returns the fixed human-readable message for this error kind.
    pub(crate) const fn message(self) -> &'static str {
        match self {
            LexerErrorKind::SingleQuote => "unterminated single quote",
            LexerErrorKind::DoubleQuote => "unterminated double quote",
            LexerErrorKind::BacktickSubstitution => "unterminated backtick substitution",
            LexerErrorKind::CommandSubstitution => "unterminated command substitution",
        }
    }
}
322
323#[derive(Debug, Clone, PartialEq, Eq)]
324pub(crate) enum TokenPayload<'a> {
325 None,
326 Word(LexedWord<'a>),
327 Fd(i32),
328 FdPair(i32, i32),
329 Error(LexerErrorKind),
330}
331
332#[derive(Debug, Clone, PartialEq, Eq)]
338pub struct LexedToken<'a> {
339 pub kind: TokenKind,
341 pub span: Span,
343 pub(crate) flags: TokenFlags,
344 payload: TokenPayload<'a>,
345}
346
347impl<'a> LexedToken<'a> {
348 fn word_segment_kind(kind: TokenKind) -> LexedWordSegmentKind {
349 match kind {
350 TokenKind::Word => LexedWordSegmentKind::Plain,
351 TokenKind::LiteralWord => LexedWordSegmentKind::SingleQuoted,
352 TokenKind::QuotedWord => LexedWordSegmentKind::DoubleQuoted,
353 _ => LexedWordSegmentKind::Composite,
354 }
355 }
356
357 pub(crate) fn punctuation(kind: TokenKind) -> Self {
358 Self {
359 kind,
360 span: Span::new(),
361 flags: TokenFlags::empty(),
362 payload: TokenPayload::None,
363 }
364 }
365
366 fn with_word_payload(kind: TokenKind, word: LexedWord<'a>) -> Self {
367 let flags = if word.has_cooked_text() {
368 TokenFlags::cooked_text()
369 } else {
370 TokenFlags::empty()
371 };
372
373 Self {
374 kind,
375 span: Span::new(),
376 flags,
377 payload: TokenPayload::Word(word),
378 }
379 }
380
381 fn borrowed_word(kind: TokenKind, text: &'a str, text_span: Option<Span>) -> Self {
382 Self::with_word_payload(
383 kind,
384 LexedWord::borrowed(Self::word_segment_kind(kind), text, text_span),
385 )
386 }
387
388 fn owned_word(kind: TokenKind, text: String) -> Self {
389 Self::with_word_payload(kind, LexedWord::owned(Self::word_segment_kind(kind), text))
390 }
391
392 fn comment() -> Self {
393 Self {
394 kind: TokenKind::Comment,
395 span: Span::new(),
396 flags: TokenFlags::empty(),
397 payload: TokenPayload::None,
398 }
399 }
400
401 fn fd(kind: TokenKind, fd: i32) -> Self {
402 Self {
403 kind,
404 span: Span::new(),
405 flags: TokenFlags::empty(),
406 payload: TokenPayload::Fd(fd),
407 }
408 }
409
410 fn fd_pair(kind: TokenKind, src_fd: i32, dst_fd: i32) -> Self {
411 Self {
412 kind,
413 span: Span::new(),
414 flags: TokenFlags::empty(),
415 payload: TokenPayload::FdPair(src_fd, dst_fd),
416 }
417 }
418
419 fn error(kind: LexerErrorKind) -> Self {
420 Self {
421 kind: TokenKind::Error,
422 span: Span::new(),
423 flags: TokenFlags::empty(),
424 payload: TokenPayload::Error(kind),
425 }
426 }
427
428 pub(crate) fn with_span(mut self, span: Span) -> Self {
429 self.span = span;
430 self
431 }
432
433 pub(crate) fn rebased(mut self, base: Position) -> Self {
434 self.span = self.span.rebased(base);
435 self.payload = match self.payload {
436 TokenPayload::Word(word) => TokenPayload::Word(word.rebased(base)),
437 payload => payload,
438 };
439 self
440 }
441
442 pub(crate) fn with_synthetic_flag(mut self) -> Self {
443 self.flags = self.flags.with_synthetic();
444 self
445 }
446
447 pub(crate) fn into_owned<'b>(self) -> LexedToken<'b> {
448 let payload = match self.payload {
449 TokenPayload::None => TokenPayload::None,
450 TokenPayload::Word(word) => TokenPayload::Word(word.into_owned()),
451 TokenPayload::Fd(fd) => TokenPayload::Fd(fd),
452 TokenPayload::FdPair(src_fd, dst_fd) => TokenPayload::FdPair(src_fd, dst_fd),
453 TokenPayload::Error(kind) => TokenPayload::Error(kind),
454 };
455
456 LexedToken {
457 kind: self.kind,
458 span: self.span,
459 flags: self.flags,
460 payload,
461 }
462 }
463
464 pub(crate) fn into_shared<'b>(self, source: &Arc<str>) -> LexedToken<'b> {
465 let payload = match self.payload {
466 TokenPayload::None => TokenPayload::None,
467 TokenPayload::Word(word) => TokenPayload::Word(word.into_shared(source)),
468 TokenPayload::Fd(fd) => TokenPayload::Fd(fd),
469 TokenPayload::FdPair(src_fd, dst_fd) => TokenPayload::FdPair(src_fd, dst_fd),
470 TokenPayload::Error(kind) => TokenPayload::Error(kind),
471 };
472
473 LexedToken {
474 kind: self.kind,
475 span: self.span,
476 flags: self.flags,
477 payload,
478 }
479 }
480
481 pub(crate) fn word_text(&self) -> Option<&str> {
483 self.kind
484 .is_word_like()
485 .then_some(())
486 .and_then(|_| match &self.payload {
487 TokenPayload::Word(word) => word.text(),
488 _ => None,
489 })
490 }
491
492 pub(crate) fn word_string(&self) -> Option<String> {
494 self.kind
495 .is_word_like()
496 .then_some(())
497 .and_then(|_| match &self.payload {
498 TokenPayload::Word(word) => Some(word.joined_text()),
499 _ => None,
500 })
501 }
502
503 pub(crate) fn word(&self) -> Option<&LexedWord<'a>> {
505 match &self.payload {
506 TokenPayload::Word(word) => Some(word),
507 _ => None,
508 }
509 }
510
511 pub(crate) fn source_slice<'b>(&self, source: &'b str) -> Option<&'b str> {
513 if !self.kind.is_word_like() || self.flags.has_cooked_text() || self.flags.is_synthetic() {
514 return None;
515 }
516
517 (self.span.start.offset <= self.span.end.offset && self.span.end.offset <= source.len())
518 .then(|| &source[self.span.start.offset..self.span.end.offset])
519 }
520
521 pub(crate) fn fd_value(&self) -> Option<i32> {
523 match self.payload {
524 TokenPayload::Fd(fd) => Some(fd),
525 _ => None,
526 }
527 }
528
529 pub(crate) fn fd_pair_value(&self) -> Option<(i32, i32)> {
531 match self.payload {
532 TokenPayload::FdPair(src_fd, dst_fd) => Some((src_fd, dst_fd)),
533 _ => None,
534 }
535 }
536
537 pub(crate) fn error_kind(&self) -> Option<LexerErrorKind> {
539 match self.payload {
540 TokenPayload::Error(kind) => Some(kind),
541 _ => None,
542 }
543 }
544}
545
546#[derive(Debug, Clone, PartialEq)]
548pub(crate) struct HeredocRead {
549 pub content: String,
551 pub content_span: Span,
553}
554
// NOTE(review): the uses of these limits are outside this part of the file.
// Presumably the first is the default for `Lexer::max_subst_depth` and the
// second bounds nested parameter-expansion scanning — confirm at the use sites.

/// Default limit on substitution nesting depth.
const DEFAULT_MAX_SUBST_DEPTH: usize = 50;
/// Limit on the nesting depth explored while scanning parameter expansions.
const MAX_PARAMETER_EXPANSION_SCAN_DEPTH: usize = 4;
559
560#[derive(Clone, Debug)]
561struct Cursor<'a> {
562 rest: &'a str,
563}
564
565impl<'a> Cursor<'a> {
566 fn new(source: &'a str) -> Self {
567 Self { rest: source }
568 }
569
570 fn first(&self) -> Option<char> {
571 self.rest.chars().next()
572 }
573
574 fn second(&self) -> Option<char> {
575 let mut chars = self.rest.chars();
576 chars.next()?;
577 chars.next()
578 }
579
580 fn third(&self) -> Option<char> {
581 let mut chars = self.rest.chars();
582 chars.next()?;
583 chars.next()?;
584 chars.next()
585 }
586
587 fn bump(&mut self) -> Option<char> {
588 let ch = self.first()?;
589 self.rest = &self.rest[ch.len_utf8()..];
590 Some(ch)
591 }
592
593 fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) -> &'a str {
594 let start = self.rest;
595 let mut end = 0;
596
597 for ch in start.chars() {
598 if !predicate(ch) {
599 break;
600 }
601 end += ch.len_utf8();
602 }
603
604 self.rest = &start[end..];
605 &start[..end]
606 }
607
608 fn rest(&self) -> &'a str {
609 self.rest
610 }
611
612 fn skip_bytes(&mut self, count: usize) {
613 self.rest = &self.rest[count..];
614 }
615
616 fn find_byte(&self, byte: u8) -> Option<usize> {
617 memchr(byte, self.rest.as_bytes())
618 }
619}
620
621#[derive(Clone, Debug)]
622struct PositionMap<'a> {
623 source: &'a str,
624 line_starts: Arc<[usize]>,
625 cached: Position,
626}
627
/// Counters collected by the lexer when the `benchmarking` feature is on.
#[cfg(feature = "benchmarking")]
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct LexerBenchmarkCounters {
    /// Running count of position lookups.
    /// NOTE(review): the code that increments this is not in this part of
    /// the file — confirm what exactly is counted.
    pub(crate) current_position_calls: u64,
}
633
634impl<'a> PositionMap<'a> {
635 fn new(source: &'a str) -> Self {
636 let mut line_starts =
637 Vec::with_capacity(source.bytes().filter(|byte| *byte == b'\n').count() + 1);
638 line_starts.push(0);
639 line_starts.extend(
640 source
641 .bytes()
642 .enumerate()
643 .filter_map(|(index, byte)| (byte == b'\n').then_some(index + 1)),
644 );
645
646 Self {
647 source,
648 line_starts: line_starts.into(),
649 cached: Position::new(),
650 }
651 }
652
653 fn position(&mut self, offset: usize) -> Position {
654 if offset == self.cached.offset {
655 return self.cached;
656 }
657
658 let position = if offset > self.cached.offset && offset <= self.source.len() {
659 Self::advance_from(self.cached, &self.source[self.cached.offset..offset])
660 } else {
661 self.position_uncached(offset)
662 };
663 self.cached = position;
664 position
665 }
666
667 fn position_uncached(&self, offset: usize) -> Position {
668 let offset = offset.min(self.source.len());
669 let line_index = self
670 .line_starts
671 .partition_point(|start| *start <= offset)
672 .saturating_sub(1);
673 let line_start = self.line_starts[line_index];
674 let line_text = &self.source[line_start..offset];
675 let column = if line_text.is_ascii() {
676 line_text.len() + 1
677 } else {
678 line_text.chars().count() + 1
679 };
680
681 Position {
682 line: line_index + 1,
683 column,
684 offset,
685 }
686 }
687
688 fn advance_from(mut position: Position, text: &str) -> Position {
689 position.offset += text.len();
690 let newline_count = memchr_iter(b'\n', text.as_bytes()).count();
691 if newline_count == 0 {
692 position.column += if text.is_ascii() {
693 text.len()
694 } else {
695 text.chars().count()
696 };
697 return position;
698 }
699
700 position.line += newline_count;
701 let tail_start = memrchr(b'\n', text.as_bytes())
702 .map(|index| index + 1)
703 .unwrap_or_default();
704 let tail = &text[tail_start..];
705 position.column = if tail.is_ascii() {
706 tail.len() + 1
707 } else {
708 tail.chars().count() + 1
709 };
710 position
711 }
712}
713
714#[derive(Clone)]
720pub struct Lexer<'a> {
721 input: &'a str,
722 offset: usize,
724 cursor: Cursor<'a>,
725 position_map: PositionMap<'a>,
726 reinject_buf: VecDeque<char>,
729 reinject_resume_offset: Option<usize>,
731 max_subst_depth: usize,
733 initial_zsh_options: Option<ZshOptionState>,
734 zsh_timeline: Option<Arc<ZshOptionTimeline>>,
735 zsh_timeline_index: usize,
736 #[cfg(feature = "benchmarking")]
737 benchmark_counters: Option<LexerBenchmarkCounters>,
738}
739
740mod cursor;
741mod heredoc;
742mod quotes;
743mod substitutions;
744mod tokens;
745mod word;
746
747pub(super) use heredoc::heredoc_line_matches_delimiter;
748pub(super) use substitutions::{
749 line_has_unclosed_double_paren, scan_command_substitution_body_len,
750 scan_command_substitution_body_len_inner,
751};
752#[cfg(test)]
753mod tests;