Skip to main content

brush_parser/
tokenizer.rs

1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// Describes why the tokenizer stopped accumulating the current token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The input stream was exhausted.
    EndOfInput,
    /// A newline that was neither escaped nor quoted was encountered.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character other than a newline was encountered.
    NonNewLineBlank,
    /// The body of a here-document is about to begin.
    HereDocumentBodyStart,
    /// The body of a here-document has ended.
    HereDocumentBodyEnd,
    /// The end tag of a here-document was encountered.
    HereDocumentEndTag,
    /// An operator began.
    OperatorStart,
    /// An operator ended.
    OperatorEnd,
    /// Any condition not covered by the other variants.
    Other,
}
30
31/// Compatibility alias for `SourceSpan`.
32pub type TokenLocation = SourceSpan;
33
34/// Represents a token extracted from a shell script.
35#[derive(Clone, Debug)]
36#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
37#[cfg_attr(
38    any(test, feature = "serde"),
39    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
40)]
41pub enum Token {
42    /// An operator token.
43    Operator(String, SourceSpan),
44    /// A word token.
45    Word(String, SourceSpan),
46}
47
48impl Token {
49    /// Returns the string value of the token.
50    pub fn to_str(&self) -> &str {
51        match self {
52            Self::Operator(s, _) => s,
53            Self::Word(s, _) => s,
54        }
55    }
56
57    /// Returns the location of the token in the source script.
58    pub const fn location(&self) -> &SourceSpan {
59        match self {
60            Self::Operator(_, l) => l,
61            Self::Word(_, l) => l,
62        }
63    }
64}
65
/// Converts a token's location into a `miette` span (start offset plus
/// length) so the token can be highlighted in diagnostics.
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    fn from(token: &Token) -> Self {
        let span = token.location();
        let start = span.start.as_ref();
        let length = span.length();
        Self::new(start.into(), length)
    }
}
73
74/// Encapsulates the result of tokenizing a shell script.
75#[derive(Clone, Debug)]
76pub(crate) struct TokenizeResult {
77    /// Reason for tokenization ending.
78    pub reason: TokenEndReason,
79    /// The token that was extracted, if any.
80    pub token: Option<Token>,
81}
82
83/// Represents an error that occurred during tokenization.
84#[derive(thiserror::Error, Debug)]
85pub enum TokenizerError {
86    /// An unterminated escape sequence was encountered at the end of the input stream.
87    #[error("unterminated escape sequence")]
88    UnterminatedEscapeSequence,
89
90    /// An unterminated single-quoted substring was encountered at the end of the input stream.
91    #[error("unterminated single quote at {0}")]
92    UnterminatedSingleQuote(SourcePosition),
93
94    /// An unterminated ANSI C-quoted substring was encountered at the end of the input stream.
95    #[error("unterminated ANSI C quote at {0}")]
96    UnterminatedAnsiCQuote(SourcePosition),
97
98    /// An unterminated double-quoted substring was encountered at the end of the input stream.
99    #[error("unterminated double quote at {0}")]
100    UnterminatedDoubleQuote(SourcePosition),
101
102    /// An unterminated back-quoted substring was encountered at the end of the input stream.
103    #[error("unterminated backquote near {0}")]
104    UnterminatedBackquote(SourcePosition),
105
106    /// An unterminated extended glob (extglob) pattern was encountered at the end of the input
107    /// stream.
108    #[error("unterminated extglob near {0}")]
109    UnterminatedExtendedGlob(SourcePosition),
110
111    /// An unterminated variable expression was encountered at the end of the input stream.
112    #[error("unterminated variable expression")]
113    UnterminatedVariable,
114
115    /// An unterminated command substitiion was encountered at the end of the input stream.
116    #[error("unterminated command substitution")]
117    UnterminatedCommandSubstitution,
118
119    /// An unterminated arithmetic or other expansion was encountered at the end of the input
120    /// stream.
121    #[error("unterminated expansion")]
122    UnterminatedExpansion,
123
124    /// An error occurred decoding UTF-8 characters in the input stream.
125    #[error("failed to decode UTF-8 characters")]
126    FailedDecoding,
127
128    /// An I/O here tag was missing.
129    #[error("missing here tag for here document body")]
130    MissingHereTagForDocumentBody,
131
132    /// The indicated I/O here tag was missing.
133    #[error("missing here tag '{0}'")]
134    MissingHereTag(String),
135
136    /// An unterminated here document sequence was encountered at the end of the input stream.
137    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
138    UnterminatedHereDocuments(String, String),
139
140    /// An I/O error occurred while reading from the input stream.
141    #[error("failed to read input")]
142    ReadError(#[from] std::io::Error),
143}
144
145impl TokenizerError {
146    /// Returns true if the error represents an error that could possibly be due
147    /// to an incomplete input stream.
148    pub const fn is_incomplete(&self) -> bool {
149        matches!(
150            self,
151            Self::UnterminatedEscapeSequence
152                | Self::UnterminatedAnsiCQuote(..)
153                | Self::UnterminatedSingleQuote(..)
154                | Self::UnterminatedDoubleQuote(..)
155                | Self::UnterminatedBackquote(..)
156                | Self::UnterminatedCommandSubstitution
157                | Self::UnterminatedExpansion
158                | Self::UnterminatedVariable
159                | Self::UnterminatedExtendedGlob(..)
160                | Self::UnterminatedHereDocuments(..)
161        )
162    }
163}
164
165/// Encapsulates a sequence of tokens.
166#[derive(Debug)]
167pub(crate) struct Tokens<'a> {
168    /// Sequence of tokens.
169    pub tokens: &'a [Token],
170}
171
172#[derive(Clone, Debug)]
173enum QuoteMode {
174    None,
175    AnsiC(SourcePosition),
176    Single(SourcePosition),
177    Double(SourcePosition),
178}
179
180#[derive(Clone, Debug, Default)]
181enum HereState {
182    /// In this state, we are not currently tracking any here-documents.
183    #[default]
184    None,
185    /// In this state, we expect that the next token will be a here tag.
186    NextTokenIsHereTag { remove_tabs: bool },
187    /// In this state, the *current* token is a here tag.
188    CurrentTokenIsHereTag {
189        remove_tabs: bool,
190        operator_token_result: TokenizeResult,
191    },
192    /// In this state, we expect that the *next line* will be the body of
193    /// a here-document.
194    NextLineIsHereDoc,
195    /// In this state, we are in the set of lines that comprise 1 or more
196    /// consecutive here-document bodies.
197    InHereDocs,
198}
199
200#[derive(Clone, Debug)]
201struct HereTag {
202    tag: String,
203    tag_was_escaped_or_quoted: bool,
204    remove_tabs: bool,
205    position: SourcePosition,
206    tokens: Vec<TokenizeResult>,
207    pending_tokens_after: Vec<TokenizeResult>,
208}
209
210#[derive(Clone, Debug)]
211struct CrossTokenParseState {
212    /// Cursor within the overall token stream; used for error reporting.
213    cursor: SourcePosition,
214    /// Current state of parsing here-documents.
215    here_state: HereState,
216    /// Ordered queue of here tags for which we're still looking for matching here-document bodies.
217    current_here_tags: Vec<HereTag>,
218    /// Tokens already tokenized that should be used first to serve requests for tokens.
219    queued_tokens: Vec<TokenizeResult>,
220    /// Are we in an arithmetic expansion?
221    arithmetic_expansion: bool,
222}
223
/// Settings that influence how the tokenizer behaves.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Enables recognition of extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Requests POSIX compliance mode.
    pub posix_mode: bool,
    /// Indicates that the tokenizer is running in SH emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    /// Extended globbing is on by default; POSIX mode and SH emulation are off.
    fn default() -> Self {
        let enable_extended_globbing = true;
        let posix_mode = false;
        let sh_mode = false;

        Self {
            enable_extended_globbing,
            posix_mode,
            sh_mode,
        }
    }
}
244
245/// A tokenizer for shell scripts.
246pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
247    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
248    cross_state: CrossTokenParseState,
249    options: TokenizerOptions,
250}
251
252/// Encapsulates the current token parsing state.
253#[derive(Clone, Debug)]
254struct TokenParseState {
255    pub start_position: SourcePosition,
256    pub token_so_far: String,
257    pub token_is_operator: bool,
258    pub in_escape: bool,
259    pub quote_mode: QuoteMode,
260}
261
262impl TokenParseState {
263    pub fn new(start_position: &SourcePosition) -> Self {
264        Self {
265            start_position: start_position.to_owned(),
266            token_so_far: String::new(),
267            token_is_operator: false,
268            in_escape: false,
269            quote_mode: QuoteMode::None,
270        }
271    }
272
273    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
274        let end = Arc::new(end_position.to_owned());
275        let token_location = SourceSpan {
276            start: Arc::new(std::mem::take(&mut self.start_position)),
277            end,
278        };
279
280        let token = if std::mem::take(&mut self.token_is_operator) {
281            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
282        } else {
283            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
284        };
285
286        end_position.clone_into(&mut self.start_position);
287        self.in_escape = false;
288        self.quote_mode = QuoteMode::None;
289
290        token
291    }
292
293    pub const fn started_token(&self) -> bool {
294        !self.token_so_far.is_empty()
295    }
296
297    pub fn append_char(&mut self, c: char) {
298        self.token_so_far.push(c);
299    }
300
301    pub fn append_str(&mut self, s: &str) {
302        self.token_so_far.push_str(s);
303    }
304
305    pub const fn unquoted(&self) -> bool {
306        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
307    }
308
309    pub fn current_token(&self) -> &str {
310        &self.token_so_far
311    }
312
313    pub fn is_specific_operator(&self, operator: &str) -> bool {
314        self.token_is_operator && self.current_token() == operator
315    }
316
317    pub const fn in_operator(&self) -> bool {
318        self.token_is_operator
319    }
320
321    fn is_newline(&self) -> bool {
322        self.token_so_far == "\n"
323    }
324
325    fn replace_with_here_doc(&mut self, s: String) {
326        self.token_so_far = s;
327    }
328
329    #[allow(clippy::too_many_lines)]
330    pub fn delimit_current_token(
331        &mut self,
332        reason: TokenEndReason,
333        cross_token_state: &mut CrossTokenParseState,
334    ) -> Result<Option<TokenizeResult>, TokenizerError> {
335        // If we don't have anything in the token, then don't yield an empty string token
336        // *unless* it's the body of a here document.
337        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
338            return Ok(Some(TokenizeResult {
339                reason,
340                token: None,
341            }));
342        }
343
344        // TODO(tokenizer): Make sure the here-tag meets criteria (and isn't a newline).
345        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
346        match current_here_state {
347            HereState::NextTokenIsHereTag { remove_tabs } => {
348                // Don't yield the operator as a token yet. We need to make sure we collect
349                // up everything we need for all the here-documents with tags on this line.
350                let operator_token_result = TokenizeResult {
351                    reason,
352                    token: Some(self.pop(&cross_token_state.cursor)),
353                };
354
355                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
356                    remove_tabs,
357                    operator_token_result,
358                };
359
360                return Ok(None);
361            }
362            HereState::CurrentTokenIsHereTag {
363                remove_tabs,
364                operator_token_result,
365            } => {
366                if self.is_newline() {
367                    return Err(TokenizerError::MissingHereTag(
368                        self.current_token().to_owned(),
369                    ));
370                }
371
372                cross_token_state.here_state = HereState::NextLineIsHereDoc;
373
374                // Include the trailing \n in the here tag so it's easier to check against.
375                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
376                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);
377
378                let tag_token_result = TokenizeResult {
379                    reason,
380                    token: Some(self.pop(&cross_token_state.cursor)),
381                };
382
383                cross_token_state.current_here_tags.push(HereTag {
384                    tag,
385                    tag_was_escaped_or_quoted,
386                    remove_tabs,
387                    position: cross_token_state.cursor.clone(),
388                    tokens: vec![operator_token_result, tag_token_result],
389                    pending_tokens_after: vec![],
390                });
391
392                return Ok(None);
393            }
394            HereState::NextLineIsHereDoc => {
395                if self.is_newline() {
396                    cross_token_state.here_state = HereState::InHereDocs;
397                } else {
398                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
399                }
400
401                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
402                    let token = self.pop(&cross_token_state.cursor);
403                    let result = TokenizeResult {
404                        reason,
405                        token: Some(token),
406                    };
407
408                    last_here_tag.pending_tokens_after.push(result);
409                } else {
410                    return Err(TokenizerError::MissingHereTagForDocumentBody);
411                }
412
413                return Ok(None);
414            }
415            HereState::InHereDocs => {
416                // We hit the end of the current here-document.
417                let completed_here_tag = cross_token_state.current_here_tags.remove(0);
418
419                // First queue the redirection operator and (start) here-tag.
420                for here_token in completed_here_tag.tokens {
421                    cross_token_state.queued_tokens.push(here_token);
422                }
423
424                // Leave a hint that we are about to start a here-document.
425                cross_token_state.queued_tokens.push(TokenizeResult {
426                    reason: TokenEndReason::HereDocumentBodyStart,
427                    token: None,
428                });
429
430                // Then queue the body document we just finished.
431                cross_token_state.queued_tokens.push(TokenizeResult {
432                    reason,
433                    token: Some(self.pop(&cross_token_state.cursor)),
434                });
435
436                // Then queue up the (end) here-tag.
437                let end_tag = if completed_here_tag.tag_was_escaped_or_quoted {
438                    unquote_str(&completed_here_tag.tag)
439                } else {
440                    completed_here_tag.tag
441                };
442                self.append_str(end_tag.trim_end_matches('\n'));
443                cross_token_state.queued_tokens.push(TokenizeResult {
444                    reason: TokenEndReason::HereDocumentEndTag,
445                    token: Some(self.pop(&cross_token_state.cursor)),
446                });
447
448                // Now we're ready to queue up any tokens that came between the completed
449                // here tag and the next here tag (or newline after it if it was the last).
450                for pending_token in completed_here_tag.pending_tokens_after {
451                    cross_token_state.queued_tokens.push(pending_token);
452                }
453
454                if cross_token_state.current_here_tags.is_empty() {
455                    cross_token_state.here_state = HereState::None;
456                } else {
457                    cross_token_state.here_state = HereState::InHereDocs;
458                }
459
460                return Ok(None);
461            }
462            HereState::None => (),
463        }
464
465        let token = self.pop(&cross_token_state.cursor);
466        let result = TokenizeResult {
467            reason,
468            token: Some(token),
469        };
470
471        Ok(Some(result))
472    }
473}
474
475/// Break the given input shell script string into tokens, returning the tokens.
476///
477/// # Arguments
478///
479/// * `input` - The shell script to tokenize.
480pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
481    tokenize_str_with_options(input, &TokenizerOptions::default())
482}
483
484/// Break the given input shell script string into tokens, returning the tokens.
485///
486/// # Arguments
487///
488/// * `input` - The shell script to tokenize.
489/// * `options` - Options controlling how the tokenizer operates.
490pub fn tokenize_str_with_options(
491    input: &str,
492    options: &TokenizerOptions,
493) -> Result<Vec<Token>, TokenizerError> {
494    uncached_tokenize_string(input.to_owned(), options.to_owned())
495}
496
497#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
498fn uncached_tokenize_string(
499    input: String,
500    options: TokenizerOptions,
501) -> Result<Vec<Token>, TokenizerError> {
502    uncached_tokenize_str(input.as_str(), &options)
503}
504
505/// Break the given input shell script string into tokens, returning the tokens.
506/// No caching is performed.
507///
508/// # Arguments
509///
510/// * `input` - The shell script to tokenize.
511pub fn uncached_tokenize_str(
512    input: &str,
513    options: &TokenizerOptions,
514) -> Result<Vec<Token>, TokenizerError> {
515    let mut reader = std::io::BufReader::new(input.as_bytes());
516    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
517
518    let mut tokens = vec![];
519    loop {
520        match tokenizer.next_token()? {
521            TokenizeResult {
522                token: Some(token), ..
523            } => tokens.push(token),
524            TokenizeResult {
525                reason: TokenEndReason::EndOfInput,
526                ..
527            } => break,
528            _ => (),
529        }
530    }
531
532    Ok(tokens)
533}
534
535impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
536    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
537        Tokenizer {
538            options: options.clone(),
539            char_reader: reader.chars().peekable(),
540            cross_state: CrossTokenParseState {
541                cursor: SourcePosition {
542                    index: 0,
543                    line: 1,
544                    column: 1,
545                },
546                here_state: HereState::None,
547                current_here_tags: vec![],
548                queued_tokens: vec![],
549                arithmetic_expansion: false,
550            },
551        }
552    }
553
554    #[expect(clippy::unnecessary_wraps)]
555    pub fn current_location(&self) -> Option<SourcePosition> {
556        Some(self.cross_state.cursor.clone())
557    }
558
559    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
560        let c = self
561            .char_reader
562            .next()
563            .transpose()
564            .map_err(TokenizerError::ReadError)?;
565
566        if let Some(ch) = c {
567            if ch == '\n' {
568                self.cross_state.cursor.line += 1;
569                self.cross_state.cursor.column = 1;
570            } else {
571                self.cross_state.cursor.column += 1;
572            }
573            self.cross_state.cursor.index += 1;
574        }
575
576        Ok(c)
577    }
578
579    fn consume_char(&mut self) -> Result<(), TokenizerError> {
580        let _ = self.next_char()?;
581        Ok(())
582    }
583
584    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
585        match self.char_reader.peek() {
586            Some(result) => match result {
587                Ok(c) => Ok(Some(*c)),
588                Err(_) => Err(TokenizerError::FailedDecoding),
589            },
590            None => Ok(None),
591        }
592    }
593
594    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
595        self.next_token_until(None, false /* include space? */)
596    }
597
598    /// Consumes a nested construct (e.g., `$((...))` or `$[...]`), handling nested delimiters
599    /// and here-documents.
600    ///
601    /// # Arguments
602    ///
603    /// * `state` - The current token parse state to append characters to.
604    /// * `terminating_char` - The character that terminates the construct (e.g., `)` or `]`).
605    /// * `nesting_open` - The character that increases nesting depth when encountered (e.g., `(` or
606    ///   `[`).
607    /// * `initial_nesting` - The initial nesting count (e.g., 2 for `$((`, 1 for `$[`).
608    fn consume_nested_construct(
609        &mut self,
610        state: &mut TokenParseState,
611        terminating_char: char,
612        nesting_open: &str,
613        mut nesting_count: u32,
614    ) -> Result<(), TokenizerError> {
615        let mut pending_here_doc_tokens = vec![];
616        let mut drain_here_doc_tokens = false;
617
618        loop {
619            let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
620                if pending_here_doc_tokens.len() == 1 {
621                    drain_here_doc_tokens = false;
622                }
623                pending_here_doc_tokens.remove(0)
624            } else {
625                let cur_token = self.next_token_until(Some(terminating_char), true)?;
626
627                if matches!(
628                    cur_token.reason,
629                    TokenEndReason::HereDocumentBodyStart
630                        | TokenEndReason::HereDocumentBodyEnd
631                        | TokenEndReason::HereDocumentEndTag
632                ) {
633                    pending_here_doc_tokens.push(cur_token);
634                    continue;
635                }
636                cur_token
637            };
638
639            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
640                && !pending_here_doc_tokens.is_empty()
641            {
642                pending_here_doc_tokens.push(cur_token);
643                drain_here_doc_tokens = true;
644                continue;
645            }
646
647            if let Some(cur_token_value) = cur_token.token {
648                state.append_str(cur_token_value.to_str());
649
650                if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
651                    nesting_count += 1;
652                }
653            }
654
655            match cur_token.reason {
656                TokenEndReason::HereDocumentBodyStart => {
657                    state.append_char('\n');
658                }
659                TokenEndReason::NonNewLineBlank => state.append_char(' '),
660                TokenEndReason::SpecifiedTerminatingChar => {
661                    nesting_count -= 1;
662                    if nesting_count == 0 {
663                        break;
664                    }
665                    state.append_char(self.next_char()?.unwrap());
666                }
667                TokenEndReason::EndOfInput => {
668                    return Err(TokenizerError::UnterminatedExpansion);
669                }
670                _ => (),
671            }
672        }
673
674        state.append_char(self.next_char()?.unwrap());
675        Ok(())
676    }
677
678    /// Returns the next token from the input stream, optionally stopping early when a specified
679    /// terminating character is encountered.
680    ///
681    /// # Arguments
682    ///
683    /// * `terminating_char` - An optional character that, if encountered, will stop the
684    ///   tokenization process and return the token up to that character.
685    /// * `include_space` - If true, include spaces in the tokenization process. This is not
686    ///   typically the case, but can be helpful when needing to preserve the original source text
687    ///   embedded within a command substitution or similar construct.
688    #[expect(clippy::cognitive_complexity)]
689    #[expect(clippy::if_same_then_else)]
690    #[expect(clippy::panic_in_result_fn)]
691    #[expect(clippy::too_many_lines)]
692    #[allow(clippy::unwrap_in_result)]
693    fn next_token_until(
694        &mut self,
695        terminating_char: Option<char>,
696        include_space: bool,
697    ) -> Result<TokenizeResult, TokenizerError> {
698        let mut state = TokenParseState::new(&self.cross_state.cursor);
699        let mut result: Option<TokenizeResult> = None;
700
701        while result.is_none() {
702            // First satisfy token results from our queue. Once we exhaust the queue then
703            // we'll look at the input stream.
704            if !self.cross_state.queued_tokens.is_empty() {
705                return Ok(self.cross_state.queued_tokens.remove(0));
706            }
707
708            let next = self.peek_char()?;
709            let c = next.unwrap_or('\0');
710
711            // When we hit the end of the input, then we're done with the current token (if there is
712            // one).
713            if next.is_none() {
714                // TODO(tokenizer): Verify we're not waiting on some terminating character?
715                // Verify we're out of all quotes.
716                if state.in_escape {
717                    return Err(TokenizerError::UnterminatedEscapeSequence);
718                }
719                match state.quote_mode {
720                    QuoteMode::None => (),
721                    QuoteMode::AnsiC(pos) => {
722                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
723                    }
724                    QuoteMode::Single(pos) => {
725                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
726                    }
727                    QuoteMode::Double(pos) => {
728                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
729                    }
730                }
731
732                // Verify we're not in a here document.
733                if !matches!(self.cross_state.here_state, HereState::None) {
734                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
735                        // If we hit end tag without a trailing newline, try to get next token.
736                        continue;
737                    }
738
739                    let tag_names = self
740                        .cross_state
741                        .current_here_tags
742                        .iter()
743                        .map(|tag| tag.tag.trim())
744                        .collect::<Vec<_>>()
745                        .join(", ");
746                    let tag_positions = self
747                        .cross_state
748                        .current_here_tags
749                        .iter()
750                        .map(|tag| std::format!("{}", tag.position))
751                        .collect::<Vec<_>>()
752                        .join(", ");
753                    return Err(TokenizerError::UnterminatedHereDocuments(
754                        tag_names,
755                        tag_positions,
756                    ));
757                }
758
759                result = state
760                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
761            //
762            // Handle being in a here document.
763            //
764            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
765                //
766                // For now, just include the character in the current token. We also check
767                // if there are leading tabs to be removed.
768                //
769                if !self.cross_state.current_here_tags.is_empty()
770                    && self.cross_state.current_here_tags[0].remove_tabs
771                    && (!state.started_token() || state.current_token().ends_with('\n'))
772                    && c == '\t'
773                {
774                    // Consume it but don't include it.
775                    self.consume_char()?;
776                } else {
777                    self.consume_char()?;
778                    state.append_char(c);
779
780                    // See if this was a newline character following the terminating here tag.
781                    if c == '\n' {
782                        self.remove_here_end_tag(&mut state, &mut result, true)?;
783                    }
784                }
785            //
786            // Look for the specially specified terminating char.
787            //
788            } else if state.unquoted() && terminating_char == Some(c) {
789                result = state.delimit_current_token(
790                    TokenEndReason::SpecifiedTerminatingChar,
791                    &mut self.cross_state,
792                )?;
793            } else if state.in_operator() {
794                //
795                // We're in an operator. See if this character continues an operator, or if it
796                // must be a separate token (because it wouldn't make a prefix of an operator).
797                //
798
799                let mut hypothetical_token = state.current_token().to_owned();
800                hypothetical_token.push(c);
801
802                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
803                    self.consume_char()?;
804                    state.append_char(c);
805                } else {
806                    assert!(state.started_token());
807
808                    //
809                    // N.B. If the completed operator indicates a here-document, then keep
810                    // track that the *next* token should be the here-tag.
811                    //
812                    if self.cross_state.arithmetic_expansion {
813                        //
814                        // We're in an arithmetic context; don't consider << and <<-
815                        // special. They're not here-docs, they're either a left-shift
816                        // operator or a left-shift operator followed by a unary
817                        // minus operator.
818                        //
819
820                        if state.is_specific_operator(")") && c == ')' {
821                            self.cross_state.arithmetic_expansion = false;
822                        }
823                    } else if state.is_specific_operator("<<") {
824                        self.cross_state.here_state =
825                            HereState::NextTokenIsHereTag { remove_tabs: false };
826                    } else if state.is_specific_operator("<<-") {
827                        self.cross_state.here_state =
828                            HereState::NextTokenIsHereTag { remove_tabs: true };
829                    } else if state.is_specific_operator("(") && c == '(' {
830                        self.cross_state.arithmetic_expansion = true;
831                    }
832
833                    let reason = if state.current_token() == "\n" {
834                        TokenEndReason::UnescapedNewLine
835                    } else {
836                        TokenEndReason::OperatorEnd
837                    };
838
839                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
840                }
841            //
842            // See if this is a character that changes the current escaping/quoting state.
843            //
844            } else if does_char_newly_affect_quoting(&state, c) {
845                if c == '\\' {
846                    // Consume the backslash ourselves so we can peek past it.
847                    self.consume_char()?;
848
849                    if matches!(self.peek_char()?, Some('\n')) {
850                        // Make sure the newline char gets consumed too.
851                        self.consume_char()?;
852
853                        // Make sure to include neither the backslash nor the newline character.
854                    } else {
855                        state.in_escape = true;
856                        state.append_char(c);
857                    }
858                } else if c == '\'' {
859                    if state.token_so_far.ends_with('$') {
860                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
861                    } else {
862                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
863                    }
864
865                    self.consume_char()?;
866                    state.append_char(c);
867                } else if c == '\"' {
868                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
869                    self.consume_char()?;
870                    state.append_char(c);
871                }
872            }
873            //
874            // Handle end of single-quote, double-quote, or ANSI-C quote.
875            else if !state.in_escape
876                && matches!(
877                    state.quote_mode,
878                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
879                )
880                && c == '\''
881            {
882                state.quote_mode = QuoteMode::None;
883                self.consume_char()?;
884                state.append_char(c);
885            } else if !state.in_escape
886                && matches!(state.quote_mode, QuoteMode::Double(..))
887                && c == '\"'
888            {
889                state.quote_mode = QuoteMode::None;
890                self.consume_char()?;
891                state.append_char(c);
892            }
893            //
894            // Handle end of escape sequence.
895            // TODO(tokenizer): Handle double-quote specific escape sequences.
896            else if state.in_escape {
897                state.in_escape = false;
898                self.consume_char()?;
899                state.append_char(c);
900            } else if (state.unquoted()
901                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
902                && (c == '$' || c == '`')
903            {
904                // TODO(tokenizer): handle quoted $ or ` in a double quote
905                if c == '$' {
906                    // Consume the '$' so we can peek beyond.
907                    self.consume_char()?;
908
909                    // Now peek beyond to see what we have.
910                    let char_after_dollar_sign = self.peek_char()?;
911                    match char_after_dollar_sign {
912                        Some('(') => {
913                            // Add the '$' we already consumed to the token.
914                            state.append_char('$');
915
916                            // Consume the '(' and add it to the token.
917                            state.append_char(self.next_char()?.unwrap());
918
919                            // Check to see if this is possibly an arithmetic expression
920                            // (i.e., one that starts with `$((`).
921                            let (initial_nesting, is_arithmetic) =
922                                if matches!(self.peek_char()?, Some('(')) {
923                                    // Consume the second '(' and add it to the token.
924                                    state.append_char(self.next_char()?.unwrap());
925                                    (2, true)
926                                } else {
927                                    (1, false)
928                                };
929
930                            if is_arithmetic {
931                                self.cross_state.arithmetic_expansion = true;
932                            }
933
934                            self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;
935
936                            if is_arithmetic {
937                                self.cross_state.arithmetic_expansion = false;
938                            }
939                        }
940
941                        Some('[') => {
942                            // Add the '$' we already consumed to the token.
943                            state.append_char('$');
944
945                            // Consume the '[' and add it to the token.
946                            state.append_char(self.next_char()?.unwrap());
947
948                            // Keep track that we're in an arithmetic expression, since
949                            // some text will be interpreted differently as a result.
950                            self.cross_state.arithmetic_expansion = true;
951
952                            self.consume_nested_construct(&mut state, ']', "[", 1)?;
953
954                            self.cross_state.arithmetic_expansion = false;
955                        }
956
957                        Some('{') => {
958                            // Add the '$' we already consumed to the token.
959                            state.append_char('$');
960
961                            // Consume the '{' and add it to the token.
962                            state.append_char(self.next_char()?.unwrap());
963
964                            let mut pending_here_doc_tokens = vec![];
965                            let mut drain_here_doc_tokens = false;
966
967                            loop {
968                                let cur_token = if drain_here_doc_tokens
969                                    && !pending_here_doc_tokens.is_empty()
970                                {
971                                    if pending_here_doc_tokens.len() == 1 {
972                                        drain_here_doc_tokens = false;
973                                    }
974
975                                    pending_here_doc_tokens.remove(0)
976                                } else {
977                                    let cur_token = self.next_token_until(
978                                        Some('}'),
979                                        false, /* include space? */
980                                    )?;
981
982                                    // See if this is a here-document-related token we need to hold
983                                    // onto until after we've seen all the tokens that need to show
984                                    // up before we get to the body.
985                                    if matches!(
986                                        cur_token.reason,
987                                        TokenEndReason::HereDocumentBodyStart
988                                            | TokenEndReason::HereDocumentBodyEnd
989                                            | TokenEndReason::HereDocumentEndTag
990                                    ) {
991                                        pending_here_doc_tokens.push(cur_token);
992                                        continue;
993                                    }
994
995                                    cur_token
996                                };
997
998                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
999                                    && !pending_here_doc_tokens.is_empty()
1000                                {
1001                                    pending_here_doc_tokens.push(cur_token);
1002                                    drain_here_doc_tokens = true;
1003                                    continue;
1004                                }
1005
1006                                if let Some(cur_token_value) = cur_token.token {
1007                                    state.append_str(cur_token_value.to_str());
1008                                }
1009
1010                                match cur_token.reason {
1011                                    TokenEndReason::HereDocumentBodyStart => {
1012                                        state.append_char('\n');
1013                                    }
1014                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
1015                                    TokenEndReason::SpecifiedTerminatingChar => {
1016                                        // We hit the end brace we were looking for but did not
1017                                        // yet consume it. Do so now.
1018                                        state.append_char(self.next_char()?.unwrap());
1019                                        break;
1020                                    }
1021                                    TokenEndReason::EndOfInput => {
1022                                        return Err(TokenizerError::UnterminatedVariable);
1023                                    }
1024                                    _ => (),
1025                                }
1026                            }
1027                        }
1028                        _ => {
1029                            // This is either a different character, or else the end of the string.
1030                            // Either way, add the '$' we already consumed to the token.
1031                            state.append_char('$');
1032                        }
1033                    }
1034                } else {
1035                    // We look for the terminating backquote. First disable normal consumption and
1036                    // consume the starting backquote.
1037                    let backquote_pos = self.cross_state.cursor.clone();
1038                    self.consume_char()?;
1039
1040                    // Add the opening backquote to the token.
1041                    state.append_char(c);
1042
1043                    // Now continue until we see an unescaped backquote.
1044                    let mut escaping_enabled = false;
1045                    let mut done = false;
1046                    while !done {
1047                        // Read (and consume) the next char.
1048                        let next_char_in_backquote = self.next_char()?;
1049                        if let Some(cib) = next_char_in_backquote {
1050                            // Include it in the token no matter what.
1051                            state.append_char(cib);
1052
1053                            // Watch out for escaping.
1054                            if !escaping_enabled && cib == '\\' {
1055                                escaping_enabled = true;
1056                            } else {
1057                                // Look for an unescaped backquote to terminate.
1058                                if !escaping_enabled && cib == '`' {
1059                                    done = true;
1060                                }
1061                                escaping_enabled = false;
1062                            }
1063                        } else {
1064                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
1065                        }
1066                    }
1067                }
1068            }
1069            //
1070            // [Extension]
1071            // If extended globbing is enabled, the last consumed character is an
1072            // unquoted start of an extglob pattern, *and* if the current character
1073            // is an open parenthesis, then this begins an extglob pattern.
1074            else if c == '('
1075                && self.options.enable_extended_globbing
1076                && state.unquoted()
1077                && !state.in_operator()
1078                && state
1079                    .current_token()
1080                    .ends_with(|x| Self::can_start_extglob(x))
1081            {
1082                // Consume the '(' and append it.
1083                self.consume_char()?;
1084                state.append_char(c);
1085
1086                let mut paren_depth = 1;
1087                let mut in_escape = false;
1088
1089                // Keep consuming until we see the matching end ')'.
1090                while paren_depth > 0 {
1091                    if let Some(extglob_char) = self.next_char()? {
1092                        // Include it in the token.
1093                        state.append_char(extglob_char);
1094
1095                        match extglob_char {
1096                            _ if in_escape => in_escape = false,
1097                            '\\' => in_escape = true,
1098                            '(' => paren_depth += 1,
1099                            ')' => paren_depth -= 1,
1100                            _ => (),
1101                        }
1102                    } else {
1103                        return Err(TokenizerError::UnterminatedExtendedGlob(
1104                            self.cross_state.cursor.clone(),
1105                        ));
1106                    }
1107                }
1108            //
1109            // If the character *can* start an operator, then it will.
1110            //
1111            } else if state.unquoted() && Self::can_start_operator(c) {
1112                if state.started_token() {
1113                    result = state.delimit_current_token(
1114                        TokenEndReason::OperatorStart,
1115                        &mut self.cross_state,
1116                    )?;
1117                } else {
1118                    state.token_is_operator = true;
1119                    self.consume_char()?;
1120                    state.append_char(c);
1121                }
1122            //
1123            // Whitespace gets discarded (and delimits tokens).
1124            //
1125            } else if state.unquoted() && is_blank(c) {
1126                if state.started_token() {
1127                    result = state.delimit_current_token(
1128                        TokenEndReason::NonNewLineBlank,
1129                        &mut self.cross_state,
1130                    )?;
1131                } else if include_space {
1132                    state.append_char(c);
1133                } else {
1134                    // Make sure we don't include this char in the token range.
1135                    state.start_position.column += 1;
1136                    state.start_position.index += 1;
1137                }
1138
1139                self.consume_char()?;
1140            }
1141            //
1142            // N.B. We need to remember if we were recursively called in a variable
1143            // expansion expression; in that case we won't think a token was started but...
1144            // we'd be wrong.
1145            else if !state.token_is_operator
1146                && (state.started_token() || matches!(terminating_char, Some('}')))
1147            {
1148                self.consume_char()?;
1149                state.append_char(c);
1150            } else if c == '#' {
1151                // Consume the '#'.
1152                self.consume_char()?;
1153
1154                let mut done = false;
1155                while !done {
1156                    done = match self.peek_char()? {
1157                        Some('\n') => true,
1158                        None => true,
1159                        _ => {
1160                            // Consume the peeked char; it's part of the comment.
1161                            self.consume_char()?;
1162                            false
1163                        }
1164                    };
1165                }
1166                // Re-start loop as if the comment never happened.
1167            } else if state.started_token() {
1168                // In all other cases where we have an in-progress token, we delimit here.
1169                result =
1170                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
1171            } else {
1172                // If we got here, then we don't have a token in progress and we're not starting an
1173                // operator. Add the character to a new token.
1174                self.consume_char()?;
1175                state.append_char(c);
1176            }
1177        }
1178
1179        let result = result.unwrap();
1180
1181        Ok(result)
1182    }
1183
1184    fn remove_here_end_tag(
1185        &mut self,
1186        state: &mut TokenParseState,
1187        result: &mut Option<TokenizeResult>,
1188        ends_with_newline: bool,
1189    ) -> Result<bool, TokenizerError> {
1190        // Bail immediately if we don't even have a *starting* here tag.
1191        if self.cross_state.current_here_tags.is_empty() {
1192            return Ok(false);
1193        }
1194
1195        let next_here_tag = &self.cross_state.current_here_tags[0];
1196
1197        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
1198            unquote_str(next_here_tag.tag.as_str()).into()
1199        } else {
1200            next_here_tag.tag.as_str().into()
1201        };
1202
1203        let tag_str = if !ends_with_newline {
1204            tag_str
1205                .strip_suffix('\n')
1206                .unwrap_or_else(|| tag_str.as_ref())
1207        } else {
1208            tag_str.as_ref()
1209        };
1210
1211        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
1212            // Make sure that was either the start of the here document, or there
1213            // was a newline between the preceding part
1214            // and the tag.
1215            if current_token_without_here_tag.is_empty()
1216                || current_token_without_here_tag.ends_with('\n')
1217            {
1218                state.replace_with_here_doc(current_token_without_here_tag.to_owned());
1219
1220                // Delimit the end of the here-document body.
1221                *result = state.delimit_current_token(
1222                    TokenEndReason::HereDocumentBodyEnd,
1223                    &mut self.cross_state,
1224                )?;
1225
1226                return Ok(true);
1227            }
1228        }
1229        Ok(false)
1230    }
1231
1232    const fn can_start_extglob(c: char) -> bool {
1233        matches!(c, '@' | '!' | '?' | '+' | '*')
1234    }
1235
1236    const fn can_start_operator(c: char) -> bool {
1237        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
1238    }
1239
1240    fn is_operator(&self, s: &str) -> bool {
1241        // Handle non-POSIX operators.
1242        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
1243            return true;
1244        }
1245
1246        matches!(
1247            s,
1248            "&" | "&&"
1249                | "("
1250                | ")"
1251                | ";"
1252                | ";;"
1253                | "\n"
1254                | "|"
1255                | "||"
1256                | "<"
1257                | ">"
1258                | ">|"
1259                | "<<"
1260                | ">>"
1261                | "<&"
1262                | ">&"
1263                | "<<-"
1264                | "<>"
1265        )
1266    }
1267}
1268
1269impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1270    type Item = Result<TokenizeResult, TokenizerError>;
1271
1272    fn next(&mut self) -> Option<Self::Item> {
1273        match self.next_token() {
1274            #[expect(clippy::manual_map)]
1275            Ok(result) => match result.token {
1276                Some(_) => Some(Ok(result)),
1277                None => None,
1278            },
1279            Err(e) => Some(Err(e)),
1280        }
1281    }
1282}
1283
/// Returns whether `c` is a blank, i.e. a space or a horizontal tab.
///
/// Newlines are deliberately not blanks.
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1287
1288const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1289    // If we're currently escaped, then nothing affects quoting.
1290    if state.in_escape {
1291        return false;
1292    }
1293
1294    match state.quote_mode {
1295        // When we're in a double quote or ANSI-C quote, only a subset of escape
1296        // sequences are recognized.
1297        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1298            if c == '\\' {
1299                // TODO(tokenizer): handle backslash in double quote
1300                true
1301            } else {
1302                false
1303            }
1304        }
1305        // When we're in a single quote, nothing affects quoting.
1306        QuoteMode::Single(_) => false,
1307        // When we're not already in a quote, then we can straightforwardly look for a
1308        // quote mark or backslash.
1309        QuoteMode::None => is_quoting_char(c),
1310    }
1311}
1312
/// Returns whether `c` is a quoting character: a backslash, a single quote, or
/// a double quote.
const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}
1316
/// Return a string with all the quoting removed.
///
/// Every unescaped single quote, double quote, and backslash is dropped. The
/// character following a backslash is always kept verbatim, and a trailing
/// backslash with nothing after it is discarded.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    // Unquoting only ever removes characters, so `s.len()` bounds the output size.
    let mut result = String::with_capacity(s.len());

    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        match c {
            // A backslash is dropped and whatever follows it is taken literally.
            '\\' => {
                if let Some(escaped) = chars.next() {
                    result.push(escaped);
                }
            }
            // Unescaped quote marks are removed.
            '\'' | '\"' => {}
            _ => result.push(c),
        }
    }

    result
}
1340
#[cfg(test)]
mod tests {

    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    /// Snapshot payload pairing an input script with the tokens produced for it.
    #[derive(serde::Serialize, serde::Deserialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    /// Tokenizes `input` and bundles it with the resulting tokens for snapshotting.
    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    /// Empty input yields no tokens.
    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    /// A backslash-newline pair in the middle of a word.
    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    /// An operator directly between two words, with no blanks.
    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    /// A `#` comment that is terminated by a newline.
    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    /// A `#` comment that runs to the end of the input.
    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    /// A here-document whose end tag immediately follows the start line.
    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    /// Here-documents followed by another command, a single newline, a blank
    /// line, and nothing at all (end tag at end of input).
    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    /// A `<<-` here-document whose body and end tag are indented with tabs.
    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    /// More tokens follow the here tag on the same line as the `<<` operator.
    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    /// Two here-documents started on the same line.
    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    /// A here-document whose end tag never appears is an error.
    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    /// A `<<` operator with no tag after it is an error.
    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    /// A here-document nested inside `$(...)`.
    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    /// A here-document nested inside a double-quoted `$(...)`.
    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat <<HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    /// Same as above, but with a space between `<<` and the here tag.
    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat << HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    /// Two here-documents plus a pipeline, all inside `$(...)`.
    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    /// A backquoted command substitution containing a blank.
    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    /// A backquoted command substitution containing an escaped backquote.
    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    /// A lone opening backquote is reported as an unterminated backquote.
    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        // $( is consumed before the tokenizer knows whether it's $( or $((,
        // so it goes through consume_nested_construct and yields UnterminatedExpansion.
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    /// An unclosed `$((` is reported as an unterminated expansion.
    #[test]
    fn tokenize_unterminated_arithmetic_expansion() {
        assert_matches!(
            tokenize_str("$(("),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    /// An unclosed `$[` is reported as an unterminated expansion.
    #[test]
    fn tokenize_unterminated_legacy_arithmetic_expansion() {
        assert_matches!(
            tokenize_str("$["),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    /// A `$(...)` embedded in the middle of a word, followed by another word.
    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    /// A `$( ... )` whose content is itself a parenthesized subshell.
    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    /// A `$(...)` whose content includes an extglob pattern.
    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    /// A `$((...))` embedded in the middle of a word, followed by another word.
    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // N.B. The spacing comes out a bit odd, but it gets processed okay
        // by later stages.
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    /// A `$((...))` containing a nested parenthesized sub-expression.
    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    /// `$` followed by each of the special-parameter characters `$ @ ! ? *`.
    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    /// `$name` on its own and as the tail of a word.
    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    /// An unclosed `${` is reported as an unterminated variable.
    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    /// `${name}` on its own and embedded in the middle of a word.
    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    /// A `${...}` containing an escaped closing brace.
    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    /// Words separated by single spaces.
    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    /// A backslash-escaped space inside a word.
    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    /// A single-quoted section, containing a space, in the middle of a word.
    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    /// A double-quoted section, containing a space, in the middle of a word.
    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    /// A `$(...)` inside double quotes in the middle of a word.
    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    /// A `$((...))` inside double quotes in the middle of a word.
    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    /// `unquote_str` drops quote marks and backslashes but keeps escaped chars.
    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
        // An escaped backslash survives; a dangling one is dropped.
        assert_eq!(unquote_str(r"a\\b"), r"a\b");
        assert_eq!(unquote_str(r"abc\"), "abc");
        assert_eq!(unquote_str(""), "");
    }
}