use std::borrow::Cow;
use std::fmt::Display;
use std::sync::Arc;
use utf8_chars::BufReadCharsExt;

/// Indicates why the tokenizer stopped producing the current token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was reached.
    SpecifiedTerminatingChar,
    /// A blank (non-newline) character ended the token.
    NonNewLineBlank,
    /// The body of a here document starts after this token.
    HereDocumentBodyStart,
    /// The body of a here document ends with this token.
    HereDocumentBodyEnd,
    /// This token is the end tag of a here document.
    HereDocumentEndTag,
    /// An operator began after this token.
    OperatorStart,
    /// The current operator token was completed.
    OperatorEnd,
    /// The token ended for another reason.
    Other,
}

/// Represents a position within source shell input.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The 0-based index of the position within the input.
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: usize,
    /// The 1-based line number.
    pub line: usize,
    /// The 1-based column number.
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: usize,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

#[cfg(feature = "diagnostics")]
impl From<&SourcePosition> for miette::SourceOffset {
    #[allow(clippy::cast_sign_loss)]
    fn from(position: &SourcePosition) -> Self {
        position.index.into()
    }
}

/// Represents the location of a token in its source shell input.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: Arc<SourcePosition>,
    /// The end position of the token.
    pub end: Arc<SourcePosition>,
}

impl TokenLocation {
    /// Returns the length of the token.
    pub fn length(&self) -> usize {
        self.end.index - self.start.index
    }

    /// Constructs a location spanning from the start of `start` to the end of `end`.
    pub(crate) fn within(start: &Self, end: &Self) -> Self {
        Self {
            start: start.start.clone(),
            end: end.end.clone(),
        }
    }
}

/// Represents a token extracted from shell input.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token, along with its location.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token, along with its location.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source input.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    fn from(token: &Token) -> Self {
        let start = token.location().start.as_ref();
        Self::new(start.into(), token.location().length())
    }
}

/// The result of a tokenization operation.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token was terminated.
    pub reason: TokenEndReason,
    /// The token, if any was produced.
    pub token: Option<Token>,
}

/// An error that occurred while tokenizing shell input.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    #[error("unterminated variable expression")]
    UnterminatedVariable,

    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns true if the error may be resolved by reading more input.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

/// A collection of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}

/// The tokenizer's current quoting mode.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting is active.
    None,
    /// In an ANSI C quoted string ($'...') started at the given position.
    AnsiC(SourcePosition),
    /// In a single-quoted string started at the given position.
    Single(SourcePosition),
    /// In a double-quoted string started at the given position.
    Double(SourcePosition),
}

/// Tracks the state of here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here documents are in progress.
    #[default]
    None,
    /// The next token parsed should be the tag of a here document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being parsed is the tag of a here document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The body of a here document starts on the next line.
    NextLineIsHereDoc,
    /// The tokenizer is consuming the bodies of pending here documents.
    InHereDocs,
}

/// Describes a pending here document awaiting its body.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text (with a trailing newline appended).
    tag: String,
    /// Whether the tag was escaped or quoted where it appeared.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be removed from the body (i.e., `<<-`).
    remove_tabs: bool,
    /// The position at which the tag was found.
    position: SourcePosition,
    /// Tokens already produced for this here document's redirection.
    tokens: Vec<TokenizeResult>,
    /// Tokens parsed after the tag but before the here-document body.
    pending_tokens_after: Vec<TokenizeResult>,
}

/// State tracked across multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The current position in the source input.
    cursor: SourcePosition,
    /// The current here-document processing state.
    here_state: HereState,
    /// Here tags whose bodies have not yet been consumed.
    current_here_tags: Vec<HereTag>,
    /// Tokens already computed and queued for later retrieval.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options used to control the behavior of the tokenizer.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX compliance mode.
    pub posix_mode: bool,
    /// Whether to operate in sh emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

/// A tokenizer that produces tokens from a stream of shell input.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// State tracked while parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Takes the accumulated text and returns it as a completed token,
    /// resetting this state for the next token.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = TokenLocation {
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token (if any) with the given reason, applying any
    /// pending here-document state transitions, and returns the resulting token.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Handle any pending here-document state before emitting the token.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The current token is the body of the oldest pending here document;
                // queue up its redirection tokens, its body, and its end tag.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

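/// Breaks the given input shell script string into tokens, returning them in
/// the order encountered, or a `TokenizerError` if tokenization fails.
///
/// As a minimal illustration, `tokenize_str("echo hello")` is expected to
/// yield two `Word` tokens, `"echo"` and `"hello"`.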
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Breaks the given input shell script string into tokens, using the provided
/// tokenizer options.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Breaks the given input shell script string into tokens, bypassing the
/// tokenization cache.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token from the input, or a token-less result when the
    /// end of the input is reached.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, /* include_space */ false)
    }

    /// Returns the next token, stopping early if the given terminating character
    /// is encountered while unquoted. If `include_space` is true, blank characters
    /// are accumulated into the token instead of being skipped.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Return any tokens already queued up (e.g., from here-document processing).
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the middle
                // of an escape sequence or an unterminated quoted string.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're consuming the body of one or more here documents.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop leading tabs when the here document uses `<<-`.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // See whether the next character extends the current operator.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Line continuation: drop the backslash-newline pair entirely.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                // Reached the closing single quote (or ANSI C quote).
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // Reached the closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if state.in_escape {
                // This character is escaped; take it as-is.
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                // A second '(' signals arithmetic expansion: $(( ... )).
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self
                                        .next_token_until(Some(')'), /* include_space */ true)?;

                                    // Defer here-document tokens; they're replayed after the
                                    // unescaped newline that precedes their bodies.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parentheses so we know how many
                                    // closing parentheses are still required.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final closing parenthesis.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Braced parameter expansion: ${ ... }.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self
                                        .next_token_until(Some('}'), /* include_space */ false)?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing brace.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not an expansion we track here; just take the '$'.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;
                    state.append_char(c);

                    // Consume until the matching (unescaped) closing backquote.
                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            } else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                // An extended glob pattern; consume through the matching closing parenthesis.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank but keep the token's start position up to date.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            } else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment; consume up to (but not including) the next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    fn is_operator(&self, s: &str) -> bool {
        // These operators are only recognized outside of sh emulation mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[expect(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

/// Returns whether the given character would newly affect quoting, given the
/// current token-parsing state.
const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're already escaped, nothing new affects quoting.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // In double quotes or ANSI C quotes, only a backslash starts something new.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // In single quotes, nothing affects quoting until the closing quote.
        QuoteMode::Single(_) => false,
        // When unquoted, any quoting character starts a quoted region.
        QuoteMode::None => is_quoting_char(c),
    }
}

const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

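/// Returns the given string with unescaped quoting characters removed and
/// backslash escapes unwrapped; for example, `"hello"` and `'hello'` both
/// become `hello`.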
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            // An escaped character is kept as-is.
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            // A backslash starts an escape sequence.
            '\\' => in_escape = true,
            // Unescaped quoting characters are dropped.
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }
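
    // Illustrative addition (not from the original source): a minimal sanity
    // check that simple blank-delimited input splits into word tokens. It
    // assumes default options and only exercises behavior implemented above.
    #[test]
    fn tokenize_simple_words() -> Result<()> {
        let tokens = tokenize_str("echo hello world")?;
        assert_eq!(tokens.len(), 3);
        assert_matches!(tokens[0], Token::Word(..));
        assert_eq!(tokens[0].to_str(), "echo");
        assert_eq!(tokens[2].to_str(), "world");
        Ok(())
    }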

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }
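
    // Illustrative addition (not from the original source): checks that an
    // unquoted operator is split out from surrounding words, without relying
    // on snapshots. Assumes default options.
    #[test]
    fn tokenize_operator_splits_words() -> Result<()> {
        let tokens = tokenize_str("a && b")?;
        assert_eq!(tokens.len(), 3);
        assert_matches!(tokens[1], Token::Operator(..));
        assert_eq!(tokens[1].to_str(), "&&");
        Ok(())
    }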

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat <<HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat << HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }
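
    // Illustrative addition (not from the original source): demonstrates that
    // errors caused by truncated input report themselves as incomplete via
    // `TokenizerError::is_incomplete()`.
    #[test]
    fn tokenize_incomplete_input() {
        let result = tokenize_str("'unterminated");
        let err = result.expect_err("expected tokenization to fail");
        assert!(err.is_incomplete());
    }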

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}