//! Tokenizer for shell scripts (`clash_brush_parser/tokenizer.rs`).

1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// Identifies the condition that caused the tokenizer to stop accumulating
/// the current token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// End of input was reached.
    EndOfInput,
    /// An unescaped newline char was reached.
    UnescapedNewLine,
    /// Specified terminating char.
    SpecifiedTerminatingChar,
    /// A non-newline blank char was reached.
    NonNewLineBlank,
    /// A here-document's body is starting.
    HereDocumentBodyStart,
    /// A here-document's body was terminated.
    HereDocumentBodyEnd,
    /// A here-document's end tag was reached.
    HereDocumentEndTag,
    /// An operator was started.
    OperatorStart,
    /// An operator was terminated.
    OperatorEnd,
    /// Some other condition was reached.
    Other,
}
30
/// Compatibility alias for `SourceSpan`.
///
/// Retained so existing call sites referring to `TokenLocation` keep working.
pub type TokenLocation = SourceSpan;
33
/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(
    any(test, feature = "serde"),
    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
)]
pub enum Token {
    /// An operator token; carries the operator text and its source span.
    Operator(String, SourceSpan),
    /// A word token; carries the word text and its source span.
    Word(String, SourceSpan),
}
47
48impl Token {
49    /// Returns the string value of the token.
50    pub fn to_str(&self) -> &str {
51        match self {
52            Self::Operator(s, _) => s,
53            Self::Word(s, _) => s,
54        }
55    }
56
57    /// Returns the location of the token in the source script.
58    pub const fn location(&self) -> &SourceSpan {
59        match self {
60            Self::Operator(_, l) => l,
61            Self::Word(_, l) => l,
62        }
63    }
64}
65
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    /// Converts a token's span into a miette source span for diagnostics.
    fn from(token: &Token) -> Self {
        let location = token.location();
        Self::new(location.start.as_ref().into(), location.length())
    }
}
73
/// Encapsulates the result of tokenizing a shell script.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Reason for tokenization ending.
    pub reason: TokenEndReason,
    /// The token that was extracted, if any. May be `None` for purely
    /// structural results (e.g., a here-document body-start marker).
    pub token: Option<Token>,
}
82
/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered at the end of the input stream.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered at the end of the input stream.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated ANSI C-quoted substring was encountered at the end of the input stream.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered at the end of the input stream.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated back-quoted substring was encountered at the end of the input stream.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob (extglob) pattern was encountered at the end of the input
    /// stream.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered at the end of the input stream.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered at the end of the input stream.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// An unterminated arithmetic or other expansion was encountered at the end of the input stream.
    #[error("unterminated expansion")]
    UnterminatedExpansion,

    /// An error occurred decoding UTF-8 characters in the input stream.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// An I/O here tag was missing.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The indicated I/O here tag was missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// An unterminated here document sequence was encountered at the end of the input stream.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading from the input stream.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
143
144impl TokenizerError {
145    /// Returns true if the error represents an error that could possibly be due
146    /// to an incomplete input stream.
147    pub const fn is_incomplete(&self) -> bool {
148        matches!(
149            self,
150            Self::UnterminatedEscapeSequence
151                | Self::UnterminatedAnsiCQuote(..)
152                | Self::UnterminatedSingleQuote(..)
153                | Self::UnterminatedDoubleQuote(..)
154                | Self::UnterminatedBackquote(..)
155                | Self::UnterminatedCommandSubstitution
156                | Self::UnterminatedExpansion
157                | Self::UnterminatedVariable
158                | Self::UnterminatedExtendedGlob(..)
159                | Self::UnterminatedHereDocuments(..)
160        )
161    }
162}
163
/// Encapsulates a sequence of tokens, borrowed from their owner.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// Sequence of tokens.
    pub tokens: &'a [Token],
}
170
/// Tracks the quoting context active while accumulating a token.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not currently inside any quoted region.
    None,
    /// Inside an ANSI C quote (`$'...'`); holds the position where it started.
    AnsiC(SourcePosition),
    /// Inside a single-quoted region; holds the position where it started.
    Single(SourcePosition),
    /// Inside a double-quoted region; holds the position where it started.
    Double(SourcePosition),
}
178
/// State machine for tracking here-document parsing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// In this state, we are not currently tracking any here-documents.
    #[default]
    None,
    /// In this state, we expect that the next token will be a here tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// In this state, the *current* token is a here tag.
    CurrentTokenIsHereTag {
        /// Whether leading tabs should be removed from body lines (`<<-` form).
        remove_tabs: bool,
        /// The already-popped redirection operator token, withheld from the
        /// caller until the here-document completes.
        operator_token_result: TokenizeResult,
    },
    /// In this state, we expect that the *next line* will be the body of
    /// a here-document.
    NextLineIsHereDoc,
    /// In this state, we are in the set of lines that comprise 1 or more
    /// consecutive here-document bodies.
    InHereDocs,
}
198
/// Tracks a single pending here-document introduced by `<<` or `<<-`.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, stored with a trailing newline to simplify matching.
    tag: String,
    /// Whether the tag text contained quoting characters.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be removed from body lines (`<<-` form).
    remove_tabs: bool,
    /// Position of the tag in the source; used in error reporting.
    position: SourcePosition,
    /// The redirection-operator and tag tokens, withheld until the body completes.
    tokens: Vec<TokenizeResult>,
    /// Tokens found after this tag but before its body; re-queued after the body.
    pending_tokens_after: Vec<TokenizeResult>,
}
208
/// State that must be maintained across the parsing of individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Cursor within the overall token stream; used for error reporting.
    cursor: SourcePosition,
    /// Current state of parsing here-documents.
    here_state: HereState,
    /// Ordered queue of here tags for which we're still looking for matching here-document bodies.
    current_here_tags: Vec<HereTag>,
    /// Tokens already tokenized that should be used first to serve requests for tokens.
    queued_tokens: Vec<TokenizeResult>,
    /// Are we in an arithmetic expansion? While set, `<<` is treated as a
    /// left-shift operator rather than a here-document introducer.
    arithmetic_expansion: bool,
}
222
/// Options controlling how the tokenizer operates.
///
/// Derives `Hash`/`Eq` so it can serve as part of a cache key
/// (see `uncached_tokenize_string`).
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether or not to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether or not to operate in POSIX compliance mode.
    pub posix_mode: bool,
    /// Whether or not we're running in SH emulation mode.
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235    fn default() -> Self {
236        Self {
237            enable_extended_globbing: true,
238            posix_mode: false,
239            sh_mode: false,
240        }
241    }
242}
243
/// A tokenizer for shell scripts.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable stream of decoded UTF-8 characters from the input reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State carried across individual token parses (cursor, here-docs, queued tokens).
    cross_state: CrossTokenParseState,
    /// Options controlling tokenizer behavior.
    options: TokenizerOptions,
}
250
/// Encapsulates the current token parsing state.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Position in the source where the current token started.
    pub start_position: SourcePosition,
    /// Text accumulated so far for the current token.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator (as opposed to a word).
    pub token_is_operator: bool,
    /// Whether we are immediately after a backslash escape.
    pub in_escape: bool,
    /// The quoting context currently in effect.
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates a new, empty parse state anchored at the given start position.
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a `Token` spanning from the recorded
    /// start position to `end_position`, then resets the state so accumulation
    /// of the next token begins at `end_position`.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = SourceSpan {
            // `take` leaves a default position behind; it's overwritten below.
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        // The next token starts where this one ended.
        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any text has been accumulated for the current token.
    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the current token's text.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the current token's text.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether we are outside of any escape or quoting context.
    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far for the current token.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the current token is an operator with exactly the given text.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether the current token is being parsed as an operator.
    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the current token consists of exactly one newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the current token's accumulated text wholesale.
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Completes the current token for the given reason, updating the
    /// cross-token here-document state machine as needed. Returns `Ok(None)`
    /// when the token is being withheld (buffered) as part of here-document
    /// processing rather than yielded immediately.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If we don't have anything in the token, then don't yield an empty string token
        // *unless* it's the body of a here document.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // TODO(tokenizer): Make sure the here-tag meets criteria (and isn't a newline).
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // Don't yield the operator as a token yet. We need to make sure we collect
                // up everything we need for all the here-documents with tags on this line.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // NOTE(review): the reported "tag" here is the newline token
                // itself — confirm that is the intended error payload.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the trailing \n in the here tag so it's easier to check against.
                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Buffer this token behind the most recent tag; it will be
                // re-queued after that tag's body has been emitted.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We hit the end of the current here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // First queue the redirection operator and (start) here-tag.
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // Leave a hint that we are about to start a here-document.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // Then queue the body document we just finished.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Then queue up the (end) here-tag.
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Now we're ready to queue up any tokens that came between the completed
                // here tag and the next here tag (or newline after it if it was the last).
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Not in any here-document state: yield the completed token directly.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
467
468/// Break the given input shell script string into tokens, returning the tokens.
469///
470/// # Arguments
471///
472/// * `input` - The shell script to tokenize.
473pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
474    tokenize_str_with_options(input, &TokenizerOptions::default())
475}
476
477/// Break the given input shell script string into tokens, returning the tokens.
478///
479/// # Arguments
480///
481/// * `input` - The shell script to tokenize.
482/// * `options` - Options controlling how the tokenizer operates.
483pub fn tokenize_str_with_options(
484    input: &str,
485    options: &TokenizerOptions,
486) -> Result<Vec<Token>, TokenizerError> {
487    uncached_tokenize_string(input.to_owned(), options.to_owned())
488}
489
490#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
491fn uncached_tokenize_string(
492    input: String,
493    options: TokenizerOptions,
494) -> Result<Vec<Token>, TokenizerError> {
495    uncached_tokenize_str(input.as_str(), &options)
496}
497
498/// Break the given input shell script string into tokens, returning the tokens.
499/// No caching is performed.
500///
501/// # Arguments
502///
503/// * `input` - The shell script to tokenize.
504pub fn uncached_tokenize_str(
505    input: &str,
506    options: &TokenizerOptions,
507) -> Result<Vec<Token>, TokenizerError> {
508    let mut reader = std::io::BufReader::new(input.as_bytes());
509    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
510
511    let mut tokens = vec![];
512    loop {
513        match tokenizer.next_token()? {
514            TokenizeResult {
515                token: Some(token), ..
516            } => tokens.push(token),
517            TokenizeResult {
518                reason: TokenEndReason::EndOfInput,
519                ..
520            } => break,
521            _ => (),
522        }
523    }
524
525    Ok(tokens)
526}
527
528impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
529    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
530        Tokenizer {
531            options: options.clone(),
532            char_reader: reader.chars().peekable(),
533            cross_state: CrossTokenParseState {
534                cursor: SourcePosition {
535                    index: 0,
536                    line: 1,
537                    column: 1,
538                },
539                here_state: HereState::None,
540                current_here_tags: vec![],
541                queued_tokens: vec![],
542                arithmetic_expansion: false,
543            },
544        }
545    }
546
547    #[expect(clippy::unnecessary_wraps)]
548    pub fn current_location(&self) -> Option<SourcePosition> {
549        Some(self.cross_state.cursor.clone())
550    }
551
552    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
553        let c = self
554            .char_reader
555            .next()
556            .transpose()
557            .map_err(TokenizerError::ReadError)?;
558
559        if let Some(ch) = c {
560            if ch == '\n' {
561                self.cross_state.cursor.line += 1;
562                self.cross_state.cursor.column = 1;
563            } else {
564                self.cross_state.cursor.column += 1;
565            }
566            self.cross_state.cursor.index += 1;
567        }
568
569        Ok(c)
570    }
571
572    fn consume_char(&mut self) -> Result<(), TokenizerError> {
573        let _ = self.next_char()?;
574        Ok(())
575    }
576
577    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
578        match self.char_reader.peek() {
579            Some(result) => match result {
580                Ok(c) => Ok(Some(*c)),
581                Err(_) => Err(TokenizerError::FailedDecoding),
582            },
583            None => Ok(None),
584        }
585    }
586
587    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
588        self.next_token_until(None, false /* include space? */)
589    }
590
    /// Consumes a nested construct (e.g., `$((...))` or `$[...]`), handling nested delimiters
    /// and here-documents.
    ///
    /// # Arguments
    ///
    /// * `state` - The current token parse state to append characters to.
    /// * `terminating_char` - The character that terminates the construct (e.g., `)` or `]`).
    /// * `nesting_open` - The character that increases nesting depth when encountered (e.g., `(` or `[`).
    /// * `nesting_count` - The initial nesting count (e.g., 2 for `$((`, 1 for `$[`).
    fn consume_nested_construct(
        &mut self,
        state: &mut TokenParseState,
        terminating_char: char,
        nesting_open: &str,
        mut nesting_count: u32,
    ) -> Result<(), TokenizerError> {
        // Here-document-related tokens are buffered here and replayed, in
        // order, once we reach the newline ending their introducing line.
        let mut pending_here_doc_tokens = vec![];
        let mut drain_here_doc_tokens = false;

        loop {
            let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
                // Replay buffered tokens; stop draining when taking the last one.
                if pending_here_doc_tokens.len() == 1 {
                    drain_here_doc_tokens = false;
                }
                pending_here_doc_tokens.remove(0)
            } else {
                let cur_token = self.next_token_until(Some(terminating_char), true)?;

                // Defer here-document markers until the end of the line.
                if matches!(
                    cur_token.reason,
                    TokenEndReason::HereDocumentBodyStart
                        | TokenEndReason::HereDocumentBodyEnd
                        | TokenEndReason::HereDocumentEndTag
                ) {
                    pending_here_doc_tokens.push(cur_token);
                    continue;
                }
                cur_token
            };

            // An unescaped newline with buffered here-doc tokens means those
            // tokens can now be replayed in order.
            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                && !pending_here_doc_tokens.is_empty()
            {
                pending_here_doc_tokens.push(cur_token);
                drain_here_doc_tokens = true;
                continue;
            }

            if let Some(cur_token_value) = cur_token.token {
                state.append_str(cur_token_value.to_str());

                // Seeing the opening delimiter again increases nesting depth.
                if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
                    nesting_count += 1;
                }
            }

            match cur_token.reason {
                TokenEndReason::HereDocumentBodyStart => {
                    state.append_char('\n');
                }
                TokenEndReason::NonNewLineBlank => state.append_char(' '),
                TokenEndReason::SpecifiedTerminatingChar => {
                    nesting_count -= 1;
                    if nesting_count == 0 {
                        break;
                    }
                    // A non-final close: keep the terminator in the token text.
                    // The terminator was only peeked, so `next_char` yields it.
                    state.append_char(self.next_char()?.unwrap());
                }
                TokenEndReason::EndOfInput => {
                    return Err(TokenizerError::UnterminatedExpansion);
                }
                _ => (),
            }
        }

        // Consume the final (outermost) terminating character into the token.
        state.append_char(self.next_char()?.unwrap());
        Ok(())
    }
669
670    /// Returns the next token from the input stream, optionally stopping early when a specified
671    /// terminating character is encountered.
672    ///
673    /// # Arguments
674    ///
675    /// * `terminating_char` - An optional character that, if encountered, will stop the
676    ///   tokenization process and return the token up to that character.
677    /// * `include_space` - If true, include spaces in the tokenization process. This is not
678    ///   typically the case, but can be helpful when needing to preserve the original source text
679    ///   embedded within a command substitution or similar construct.
680    #[expect(clippy::cognitive_complexity)]
681    #[expect(clippy::if_same_then_else)]
682    #[expect(clippy::panic_in_result_fn)]
683    #[expect(clippy::too_many_lines)]
684    #[allow(clippy::unwrap_in_result)]
685    fn next_token_until(
686        &mut self,
687        terminating_char: Option<char>,
688        include_space: bool,
689    ) -> Result<TokenizeResult, TokenizerError> {
690        let mut state = TokenParseState::new(&self.cross_state.cursor);
691        let mut result: Option<TokenizeResult> = None;
692
693        while result.is_none() {
694            // First satisfy token results from our queue. Once we exhaust the queue then
695            // we'll look at the input stream.
696            if !self.cross_state.queued_tokens.is_empty() {
697                return Ok(self.cross_state.queued_tokens.remove(0));
698            }
699
700            let next = self.peek_char()?;
701            let c = next.unwrap_or('\0');
702
703            // When we hit the end of the input, then we're done with the current token (if there is
704            // one).
705            if next.is_none() {
706                // TODO(tokenizer): Verify we're not waiting on some terminating character?
707                // Verify we're out of all quotes.
708                if state.in_escape {
709                    return Err(TokenizerError::UnterminatedEscapeSequence);
710                }
711                match state.quote_mode {
712                    QuoteMode::None => (),
713                    QuoteMode::AnsiC(pos) => {
714                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
715                    }
716                    QuoteMode::Single(pos) => {
717                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
718                    }
719                    QuoteMode::Double(pos) => {
720                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
721                    }
722                }
723
724                // Verify we're not in a here document.
725                if !matches!(self.cross_state.here_state, HereState::None) {
726                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
727                        // If we hit end tag without a trailing newline, try to get next token.
728                        continue;
729                    }
730
731                    let tag_names = self
732                        .cross_state
733                        .current_here_tags
734                        .iter()
735                        .map(|tag| tag.tag.trim())
736                        .collect::<Vec<_>>()
737                        .join(", ");
738                    let tag_positions = self
739                        .cross_state
740                        .current_here_tags
741                        .iter()
742                        .map(|tag| std::format!("{}", tag.position))
743                        .collect::<Vec<_>>()
744                        .join(", ");
745                    return Err(TokenizerError::UnterminatedHereDocuments(
746                        tag_names,
747                        tag_positions,
748                    ));
749                }
750
751                result = state
752                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
753            //
754            // Look for the specially specified terminating char.
755            //
756            } else if state.unquoted() && terminating_char == Some(c) {
757                result = state.delimit_current_token(
758                    TokenEndReason::SpecifiedTerminatingChar,
759                    &mut self.cross_state,
760                )?;
761            //
762            // Handle being in a here document.
763            //
764            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
765                //
766                // For now, just include the character in the current token. We also check
767                // if there are leading tabs to be removed.
768                //
769                if !self.cross_state.current_here_tags.is_empty()
770                    && self.cross_state.current_here_tags[0].remove_tabs
771                    && (!state.started_token() || state.current_token().ends_with('\n'))
772                    && c == '\t'
773                {
774                    // Consume it but don't include it.
775                    self.consume_char()?;
776                } else {
777                    self.consume_char()?;
778                    state.append_char(c);
779
780                    // See if this was a newline character following the terminating here tag.
781                    if c == '\n' {
782                        self.remove_here_end_tag(&mut state, &mut result, true)?;
783                    }
784                }
785            } else if state.in_operator() {
786                //
787                // We're in an operator. See if this character continues an operator, or if it
788                // must be a separate token (because it wouldn't make a prefix of an operator).
789                //
790
791                let mut hypothetical_token = state.current_token().to_owned();
792                hypothetical_token.push(c);
793
794                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
795                    self.consume_char()?;
796                    state.append_char(c);
797                } else {
798                    assert!(state.started_token());
799
800                    //
801                    // N.B. If the completed operator indicates a here-document, then keep
802                    // track that the *next* token should be the here-tag.
803                    //
804                    if self.cross_state.arithmetic_expansion {
805                        //
806                        // We're in an arithmetic context; don't consider << and <<-
807                        // special. They're not here-docs, they're either a left-shift
808                        // operator or a left-shift operator followed by a unary
809                        // minus operator.
810                        //
811
812                        if state.is_specific_operator(")") && c == ')' {
813                            self.cross_state.arithmetic_expansion = false;
814                        }
815                    } else if state.is_specific_operator("<<") {
816                        self.cross_state.here_state =
817                            HereState::NextTokenIsHereTag { remove_tabs: false };
818                    } else if state.is_specific_operator("<<-") {
819                        self.cross_state.here_state =
820                            HereState::NextTokenIsHereTag { remove_tabs: true };
821                    } else if state.is_specific_operator("(") && c == '(' {
822                        self.cross_state.arithmetic_expansion = true;
823                    }
824
825                    let reason = if state.current_token() == "\n" {
826                        TokenEndReason::UnescapedNewLine
827                    } else {
828                        TokenEndReason::OperatorEnd
829                    };
830
831                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
832                }
833            //
834            // See if this is a character that changes the current escaping/quoting state.
835            //
836            } else if does_char_newly_affect_quoting(&state, c) {
837                if c == '\\' {
838                    // Consume the backslash ourselves so we can peek past it.
839                    self.consume_char()?;
840
841                    if matches!(self.peek_char()?, Some('\n')) {
842                        // Make sure the newline char gets consumed too.
843                        self.consume_char()?;
844
845                        // Make sure to include neither the backslash nor the newline character.
846                    } else {
847                        state.in_escape = true;
848                        state.append_char(c);
849                    }
850                } else if c == '\'' {
851                    if state.token_so_far.ends_with('$') {
852                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
853                    } else {
854                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
855                    }
856
857                    self.consume_char()?;
858                    state.append_char(c);
859                } else if c == '\"' {
860                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
861                    self.consume_char()?;
862                    state.append_char(c);
863                }
864            }
865            //
866            // Handle end of single-quote, double-quote, or ANSI-C quote.
867            else if !state.in_escape
868                && matches!(
869                    state.quote_mode,
870                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
871                )
872                && c == '\''
873            {
874                state.quote_mode = QuoteMode::None;
875                self.consume_char()?;
876                state.append_char(c);
877            } else if !state.in_escape
878                && matches!(state.quote_mode, QuoteMode::Double(..))
879                && c == '\"'
880            {
881                state.quote_mode = QuoteMode::None;
882                self.consume_char()?;
883                state.append_char(c);
884            }
885            //
886            // Handle end of escape sequence.
887            // TODO(tokenizer): Handle double-quote specific escape sequences.
888            else if state.in_escape {
889                state.in_escape = false;
890                self.consume_char()?;
891                state.append_char(c);
892            } else if (state.unquoted()
893                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
894                && (c == '$' || c == '`')
895            {
896                // TODO(tokenizer): handle quoted $ or ` in a double quote
897                if c == '$' {
898                    // Consume the '$' so we can peek beyond.
899                    self.consume_char()?;
900
901                    // Now peek beyond to see what we have.
902                    let char_after_dollar_sign = self.peek_char()?;
903                    match char_after_dollar_sign {
904                        Some('(') => {
905                            // Add the '$' we already consumed to the token.
906                            state.append_char('$');
907
908                            // Consume the '(' and add it to the token.
909                            state.append_char(self.next_char()?.unwrap());
910
911                            // Check to see if this is possibly an arithmetic expression
912                            // (i.e., one that starts with `$((`).
913                            let (initial_nesting, is_arithmetic) =
914                                if matches!(self.peek_char()?, Some('(')) {
915                                    // Consume the second '(' and add it to the token.
916                                    state.append_char(self.next_char()?.unwrap());
917                                    (2, true)
918                                } else {
919                                    (1, false)
920                                };
921
922                            if is_arithmetic {
923                                self.cross_state.arithmetic_expansion = true;
924                            }
925
926                            self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;
927
928                            if is_arithmetic {
929                                self.cross_state.arithmetic_expansion = false;
930                            }
931                        }
932
933                        Some('[') => {
934                            // Add the '$' we already consumed to the token.
935                            state.append_char('$');
936
937                            // Consume the '[' and add it to the token.
938                            state.append_char(self.next_char()?.unwrap());
939
940                            // Keep track that we're in an arithmetic expression, since
941                            // some text will be interpreted differently as a result.
942                            self.cross_state.arithmetic_expansion = true;
943
944                            self.consume_nested_construct(&mut state, ']', "[", 1)?;
945
946                            self.cross_state.arithmetic_expansion = false;
947                        }
948
949                        Some('{') => {
950                            // Add the '$' we already consumed to the token.
951                            state.append_char('$');
952
953                            // Consume the '{' and add it to the token.
954                            state.append_char(self.next_char()?.unwrap());
955
956                            let mut pending_here_doc_tokens = vec![];
957                            let mut drain_here_doc_tokens = false;
958
959                            loop {
960                                let cur_token = if drain_here_doc_tokens
961                                    && !pending_here_doc_tokens.is_empty()
962                                {
963                                    if pending_here_doc_tokens.len() == 1 {
964                                        drain_here_doc_tokens = false;
965                                    }
966
967                                    pending_here_doc_tokens.remove(0)
968                                } else {
969                                    let cur_token = self.next_token_until(
970                                        Some('}'),
971                                        false, /* include space? */
972                                    )?;
973
974                                    // See if this is a here-document-related token we need to hold
975                                    // onto until after we've seen all the tokens that need to show
976                                    // up before we get to the body.
977                                    if matches!(
978                                        cur_token.reason,
979                                        TokenEndReason::HereDocumentBodyStart
980                                            | TokenEndReason::HereDocumentBodyEnd
981                                            | TokenEndReason::HereDocumentEndTag
982                                    ) {
983                                        pending_here_doc_tokens.push(cur_token);
984                                        continue;
985                                    }
986
987                                    cur_token
988                                };
989
990                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
991                                    && !pending_here_doc_tokens.is_empty()
992                                {
993                                    pending_here_doc_tokens.push(cur_token);
994                                    drain_here_doc_tokens = true;
995                                    continue;
996                                }
997
998                                if let Some(cur_token_value) = cur_token.token {
999                                    state.append_str(cur_token_value.to_str());
1000                                }
1001
1002                                match cur_token.reason {
1003                                    TokenEndReason::HereDocumentBodyStart => {
1004                                        state.append_char('\n');
1005                                    }
1006                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
1007                                    TokenEndReason::SpecifiedTerminatingChar => {
1008                                        // We hit the end brace we were looking for but did not
1009                                        // yet consume it. Do so now.
1010                                        state.append_char(self.next_char()?.unwrap());
1011                                        break;
1012                                    }
1013                                    TokenEndReason::EndOfInput => {
1014                                        return Err(TokenizerError::UnterminatedVariable);
1015                                    }
1016                                    _ => (),
1017                                }
1018                            }
1019                        }
1020                        _ => {
1021                            // This is either a different character, or else the end of the string.
1022                            // Either way, add the '$' we already consumed to the token.
1023                            state.append_char('$');
1024                        }
1025                    }
1026                } else {
1027                    // We look for the terminating backquote. First disable normal consumption and
1028                    // consume the starting backquote.
1029                    let backquote_pos = self.cross_state.cursor.clone();
1030                    self.consume_char()?;
1031
1032                    // Add the opening backquote to the token.
1033                    state.append_char(c);
1034
1035                    // Now continue until we see an unescaped backquote.
1036                    let mut escaping_enabled = false;
1037                    let mut done = false;
1038                    while !done {
1039                        // Read (and consume) the next char.
1040                        let next_char_in_backquote = self.next_char()?;
1041                        if let Some(cib) = next_char_in_backquote {
1042                            // Include it in the token no matter what.
1043                            state.append_char(cib);
1044
1045                            // Watch out for escaping.
1046                            if !escaping_enabled && cib == '\\' {
1047                                escaping_enabled = true;
1048                            } else {
1049                                // Look for an unescaped backquote to terminate.
1050                                if !escaping_enabled && cib == '`' {
1051                                    done = true;
1052                                }
1053                                escaping_enabled = false;
1054                            }
1055                        } else {
1056                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
1057                        }
1058                    }
1059                }
1060            }
1061            //
1062            // [Extension]
1063            // If extended globbing is enabled, the last consumed character is an
1064            // unquoted start of an extglob pattern, *and* if the current character
1065            // is an open parenthesis, then this begins an extglob pattern.
1066            else if c == '('
1067                && self.options.enable_extended_globbing
1068                && state.unquoted()
1069                && !state.in_operator()
1070                && state
1071                    .current_token()
1072                    .ends_with(|x| Self::can_start_extglob(x))
1073            {
1074                // Consume the '(' and append it.
1075                self.consume_char()?;
1076                state.append_char(c);
1077
1078                let mut paren_depth = 1;
1079                let mut in_escape = false;
1080
1081                // Keep consuming until we see the matching end ')'.
1082                while paren_depth > 0 {
1083                    if let Some(extglob_char) = self.next_char()? {
1084                        // Include it in the token.
1085                        state.append_char(extglob_char);
1086
1087                        match extglob_char {
1088                            _ if in_escape => in_escape = false,
1089                            '\\' => in_escape = true,
1090                            '(' => paren_depth += 1,
1091                            ')' => paren_depth -= 1,
1092                            _ => (),
1093                        }
1094                    } else {
1095                        return Err(TokenizerError::UnterminatedExtendedGlob(
1096                            self.cross_state.cursor.clone(),
1097                        ));
1098                    }
1099                }
1100            //
1101            // If the character *can* start an operator, then it will.
1102            //
1103            } else if state.unquoted() && Self::can_start_operator(c) {
1104                if state.started_token() {
1105                    result = state.delimit_current_token(
1106                        TokenEndReason::OperatorStart,
1107                        &mut self.cross_state,
1108                    )?;
1109                } else {
1110                    state.token_is_operator = true;
1111                    self.consume_char()?;
1112                    state.append_char(c);
1113                }
1114            //
1115            // Whitespace gets discarded (and delimits tokens).
1116            //
1117            } else if state.unquoted() && is_blank(c) {
1118                if state.started_token() {
1119                    result = state.delimit_current_token(
1120                        TokenEndReason::NonNewLineBlank,
1121                        &mut self.cross_state,
1122                    )?;
1123                } else if include_space {
1124                    state.append_char(c);
1125                } else {
1126                    // Make sure we don't include this char in the token range.
1127                    state.start_position.column += 1;
1128                    state.start_position.index += 1;
1129                }
1130
1131                self.consume_char()?;
1132            }
1133            //
1134            // N.B. We need to remember if we were recursively called in a variable
1135            // expansion expression; in that case we won't think a token was started but...
1136            // we'd be wrong.
1137            else if !state.token_is_operator
1138                && (state.started_token() || matches!(terminating_char, Some('}')))
1139            {
1140                self.consume_char()?;
1141                state.append_char(c);
1142            } else if c == '#' {
1143                // Consume the '#'.
1144                self.consume_char()?;
1145
1146                let mut done = false;
1147                while !done {
1148                    done = match self.peek_char()? {
1149                        Some('\n') => true,
1150                        None => true,
1151                        _ => {
1152                            // Consume the peeked char; it's part of the comment.
1153                            self.consume_char()?;
1154                            false
1155                        }
1156                    };
1157                }
1158                // Re-start loop as if the comment never happened.
1159            } else if state.started_token() {
1160                // In all other cases where we have an in-progress token, we delimit here.
1161                result =
1162                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
1163            } else {
1164                // If we got here, then we don't have a token in progress and we're not starting an
1165                // operator. Add the character to a new token.
1166                self.consume_char()?;
1167                state.append_char(c);
1168            }
1169        }
1170
1171        let result = result.unwrap();
1172
1173        Ok(result)
1174    }
1175
    /// Tries to strip the next pending here-document end tag from the end of
    /// the current in-progress token. On a match, the token's text is replaced
    /// with the here-doc body (without the tag) and the token is delimited as
    /// a `HereDocumentBodyEnd` result via `result`.
    ///
    /// # Arguments
    ///
    /// * `state` - The in-progress token parse state.
    /// * `result` - Receives the delimited here-doc body result on success.
    /// * `ends_with_newline` - Whether the candidate tag occurrence is
    ///   followed by a newline (false when we're at end of input).
    ///
    /// Returns `Ok(true)` if an end tag was found and removed.
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        // Bail immediately if we don't even have a *starting* here tag.
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        // Tags are matched in order; only the oldest pending tag can terminate
        // the body currently being read.
        let next_here_tag = &self.cross_state.current_here_tags[0];

        // If the tag was quoted or escaped when declared, compare against its
        // unquoted form.
        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        // NOTE(review): the stored tag appears to carry a trailing newline;
        // when the token doesn't end with one (e.g., tag at end of input),
        // drop it from the tag so the suffix comparison below can still
        // match — TODO confirm against where tags are recorded.
        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // Make sure that was either the start of the here document, or there
            // was a newline between the preceding part
            // and the tag.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                // Delimit the end of the here-document body.
                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }
1223
1224    const fn can_start_extglob(c: char) -> bool {
1225        matches!(c, '@' | '!' | '?' | '+' | '*')
1226    }
1227
1228    const fn can_start_operator(c: char) -> bool {
1229        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
1230    }
1231
1232    fn is_operator(&self, s: &str) -> bool {
1233        // Handle non-POSIX operators.
1234        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
1235            return true;
1236        }
1237
1238        matches!(
1239            s,
1240            "&" | "&&"
1241                | "("
1242                | ")"
1243                | ";"
1244                | ";;"
1245                | "\n"
1246                | "|"
1247                | "||"
1248                | "<"
1249                | ">"
1250                | ">|"
1251                | "<<"
1252                | ">>"
1253                | "<&"
1254                | ">&"
1255                | "<<-"
1256                | "<>"
1257        )
1258    }
1259}
1260
1261impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1262    type Item = Result<TokenizeResult, TokenizerError>;
1263
1264    fn next(&mut self) -> Option<Self::Item> {
1265        match self.next_token() {
1266            #[expect(clippy::manual_map)]
1267            Ok(result) => match result.token {
1268                Some(_) => Some(Ok(result)),
1269                None => None,
1270            },
1271            Err(e) => Some(Err(e)),
1272        }
1273    }
1274}
1275
/// Returns whether `c` is a non-newline blank character (space or tab).
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1279
1280const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1281    // If we're currently escaped, then nothing affects quoting.
1282    if state.in_escape {
1283        return false;
1284    }
1285
1286    match state.quote_mode {
1287        // When we're in a double quote or ANSI-C quote, only a subset of escape
1288        // sequences are recognized.
1289        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1290            if c == '\\' {
1291                // TODO(tokenizer): handle backslash in double quote
1292                true
1293            } else {
1294                false
1295            }
1296        }
1297        // When we're in a single quote, nothing affects quoting.
1298        QuoteMode::Single(_) => false,
1299        // When we're not already in a quote, then we can straightforwardly look for a
1300        // quote mark or backslash.
1301        QuoteMode::None => is_quoting_char(c),
1302    }
1303}
1304
/// Returns whether `c` is a character that can introduce quoting: a
/// backslash, single quote, or double quote.
const fn is_quoting_char(c: char) -> bool {
    match c {
        '\\' | '\'' | '"' => true,
        _ => false,
    }
}
1308
/// Return a string with all the quoting removed.
///
/// Unescaped quoting characters (backslash, single quote, double quote) are
/// dropped; a backslash causes the immediately following character to be kept
/// verbatim. A lone trailing backslash is discarded.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());

    let mut chars = s.chars();
    while let Some(c) = chars.next() {
        if c == '\\' {
            // Escaped: keep the next character (if any) verbatim, even if it
            // is itself a quoting character.
            if let Some(escaped) = chars.next() {
                unquoted.push(escaped);
            }
        } else if !matches!(c, '\'' | '\"') {
            // Unescaped quote marks are dropped; everything else is kept.
            unquoted.push(c);
        }
    }

    unquoted
}
1332
// Unit tests for the tokenizer. Most are snapshot tests: they tokenize a small
// script with `test_tokenizer` and record both the input and the resulting
// token stream via `insta`'s RON snapshots.
#[cfg(test)]
mod tests {

    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    /// Pairs an input script with its token stream so snapshots capture both.
    #[derive(serde::Serialize, serde::Deserialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    /// Tokenizes `input` and bundles the tokens with the input for
    /// snapshotting.
    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        // The backslash-newline pair should be elided, joining "a" and "bc".
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        // Variants cover trailing text after the end tag, a trailing newline,
        // a trailing blank line, and an end tag at end of input.
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        // `<<-` marks the here-doc for tab removal (`remove_tabs: true`).
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        // Tokens after the here-doc redirection (the pipeline) must still be
        // produced before the here-doc body.
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat <<HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r#"echo "$(cat << HERE
TEXT
HERE
)""#
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        // The escaped backquote must not terminate the substitution.
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        // $( is consumed before the tokenizer knows whether it's $( or $((,
        // so it goes through consume_nested_construct and yields UnterminatedExpansion.
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    #[test]
    fn tokenize_unterminated_arithmetic_expansion() {
        assert_matches!(
            tokenize_str("$(("),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    #[test]
    fn tokenize_unterminated_legacy_arithmetic_expansion() {
        assert_matches!(
            tokenize_str("$["),
            Err(TokenizerError::UnterminatedExpansion)
        );
    }

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // N.B. The spacing comes out a bit odd, but it gets processed okay
        // by later stages.
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }
    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        // The escaped closing brace should not terminate the expansion.
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}