1use std::borrow::Cow;
2use std::sync::Arc;
3use utf8_chars::BufReadCharsExt;
4
5use crate::{SourcePosition, SourceSpan};
6
/// Represents the reason why tokenization of a given token came to an end.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline character was encountered.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character (other than a newline) was encountered.
    NonNewLineBlank,
    /// The body of a here-document starts after this point.
    HereDocumentBodyStart,
    /// The body of a here-document ends at this point.
    HereDocumentBodyEnd,
    /// This token is a here-document end tag.
    HereDocumentEndTag,
    /// An operator begins after this token.
    OperatorStart,
    /// An operator token was completed.
    OperatorEnd,
    /// Some other delimiting condition occurred.
    Other,
}
30
/// The source location of a token.
pub type TokenLocation = SourceSpan;
33
/// A token produced by the tokenizer, together with its source location.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))]
#[cfg_attr(
    any(test, feature = "serde"),
    derive(PartialEq, Eq, serde::Serialize, serde::Deserialize)
)]
pub enum Token {
    /// An operator token (e.g. `&&`, `>>`).
    Operator(String, SourceSpan),
    /// A word token.
    Word(String, SourceSpan),
}
47
48impl Token {
49 pub fn to_str(&self) -> &str {
51 match self {
52 Self::Operator(s, _) => s,
53 Self::Word(s, _) => s,
54 }
55 }
56
57 pub const fn location(&self) -> &SourceSpan {
59 match self {
60 Self::Operator(_, l) => l,
61 Self::Word(_, l) => l,
62 }
63 }
64}
65
/// Allows a [`Token`]'s location to be used directly as a `miette` source span
/// when the `diagnostics` feature is enabled.
#[cfg(feature = "diagnostics")]
impl From<&Token> for miette::SourceSpan {
    fn from(token: &Token) -> Self {
        let start = token.location().start.as_ref();
        // NOTE(review): assumes `SourceSpan::length()` yields the span's length
        // in the units miette expects — defined elsewhere in the crate.
        Self::new(start.into(), token.location().length())
    }
}
73
/// Result of a single tokenization step.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason tokenization stopped.
    pub reason: TokenEndReason,
    /// The token produced, if any.
    pub token: Option<Token>,
}
82
/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// The input ended in the middle of an escape sequence.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// A single-quoted string opened at the given position was never closed.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An ANSI-C quoted string (`$'...'`) opened at the given position was never closed.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// A double-quoted string opened at the given position was never closed.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// A backquoted command substitution opened near the given position was never closed.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An extended glob pattern opened near the given position was never closed.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// A variable expression was never closed.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A command substitution was never closed.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// An expansion construct was never closed.
    #[error("unterminated expansion")]
    UnterminatedExpansion,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was encountered without a corresponding tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// A here-document redirection was not followed by a tag.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// One or more here-documents were never terminated by their end tags.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading the input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
143
144impl TokenizerError {
145 pub const fn is_incomplete(&self) -> bool {
148 matches!(
149 self,
150 Self::UnterminatedEscapeSequence
151 | Self::UnterminatedAnsiCQuote(..)
152 | Self::UnterminatedSingleQuote(..)
153 | Self::UnterminatedDoubleQuote(..)
154 | Self::UnterminatedBackquote(..)
155 | Self::UnterminatedCommandSubstitution
156 | Self::UnterminatedExpansion
157 | Self::UnterminatedVariable
158 | Self::UnterminatedExtendedGlob(..)
159 | Self::UnterminatedHereDocuments(..)
160 )
161 }
162}
163
/// A borrowed view over a sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}
170
/// The quoting construct currently in effect; position-carrying variants record
/// where the construct opened, for use in error reporting.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting in effect.
    None,
    /// ANSI-C quoting (`$'...'`), opened at the given position.
    AnsiC(SourcePosition),
    /// Single-quoting, opened at the given position.
    Single(SourcePosition),
    /// Double-quoting, opened at the given position.
    Double(SourcePosition),
}
178
/// State machine tracking progress through here-document parsing.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is being parsed.
    #[default]
    None,
    /// The *next* token to be parsed will be a here-doc tag (we just finished
    /// a `<<` or `<<-` operator).
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being parsed is a here-doc tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        /// The already-delimited redirection operator token, held so it can be
        /// emitted in the correct order once the here-doc body is complete.
        operator_token_result: TokenizeResult,
    },
    /// The next line will start the body of the pending here-document(s).
    NextLineIsHereDoc,
    /// Currently consuming the body of here-document(s).
    InHereDocs,
}
198
/// Bookkeeping for a single pending here-document tag.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, normalized to end with a newline.
    tag: String,
    /// Whether the tag contained quoting or escape characters.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be stripped from body lines (`<<-`).
    remove_tabs: bool,
    /// Where the tag appeared in the source.
    position: SourcePosition,
    /// The redirection operator and tag tokens, held for later emission.
    tokens: Vec<TokenizeResult>,
    /// Tokens seen after the tag on the same line; emitted after the body.
    pending_tokens_after: Vec<TokenizeResult>,
}
208
/// Tokenization state that persists across individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the input stream.
    cursor: SourcePosition,
    /// Current here-document parsing state.
    here_state: HereState,
    /// Here-doc tags whose bodies have not yet been fully consumed.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced (e.g. by here-doc processing) awaiting emission.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether tokenization is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}
222
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns (e.g. `@(...)`) are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to tokenize in POSIX-compliant mode.
    pub posix_mode: bool,
    /// Whether to tokenize in `sh` compatibility mode (restricts the
    /// recognized operator set).
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235 fn default() -> Self {
236 Self {
237 enable_extended_globbing: true,
238 posix_mode: false,
239 sh_mode: false,
240 }
241 }
242}
243
/// A tokenizer for shell input, reading characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable UTF-8 character stream over the underlying reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State that persists across tokens (cursor, here-doc bookkeeping, etc.).
    cross_state: CrossTokenParseState,
    /// Options controlling tokenization behavior.
    options: TokenizerOptions,
}
250
/// Parse state for the single token currently being accumulated.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Where the token starts in the source.
    pub start_position: SourcePosition,
    /// The token text accumulated so far.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator.
    pub token_is_operator: bool,
    /// Whether the next character is backslash-escaped.
    pub in_escape: bool,
    /// The quoting construct currently in effect, if any.
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates a fresh parse state for a token starting at the given position.
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.to_owned(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a [`Token`] ending at
    /// `end_position`, resetting the state so a new token can start at that
    /// same position.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let end = Arc::new(end_position.to_owned());
        let token_location = SourceSpan {
            start: Arc::new(std::mem::take(&mut self.start_position)),
            end,
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        // The next token starts where this one ended.
        end_position.clone_into(&mut self.start_position);
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any text has been accumulated for the current token.
    pub const fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the token being accumulated.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the token being accumulated.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether parsing is currently outside any escape or quoting
    /// construct.
    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far for the current token.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the current token is exactly the given operator string.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether the current token is being parsed as an operator.
    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the current token is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated token text (used to swap in a here-doc body
    /// with its end tag stripped).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token for the given reason, updating cross-token
    /// here-document state as needed.
    ///
    /// Returns `Ok(Some(..))` when a result should be yielded to the caller
    /// immediately; `Ok(None)` when the token was captured into here-document
    /// bookkeeping (queued for later emission) and scanning should continue.
    #[allow(clippy::too_many_lines)]
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // With no accumulated text there's no token to produce. Here-doc body
        // ends are the exception: an empty body is still significant.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // We just finished the redirection operator itself; hold on to
                // it until the tag token has been parsed.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A bare newline where the tag should be means it's missing.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Record the tag (normalized to end with a newline), along with
                // the operator and tag tokens, for emission once the body ends.
                let tag = std::format!("{}\n", self.current_token().trim_ascii_start());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Tokens between the tag and the end of its line belong after
                // the here-doc body in the final ordering; queue them on the
                // most recently recorded tag.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The body for the frontmost pending tag is complete. Flush, in
                // order: the operator + tag tokens, a body-start marker, the
                // body itself, the end tag, and any tokens that followed the
                // tag on its original line.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Emit the end tag itself, unquoted if the tag was quoted.
                let end_tag = if completed_here_tag.tag_was_escaped_or_quoted {
                    unquote_str(&completed_here_tag.tag)
                } else {
                    completed_here_tag.tag
                };
                self.append_str(end_tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Default path: no here-document processing in effect.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
473
474pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
480 tokenize_str_with_options(input, &TokenizerOptions::default())
481}
482
483pub fn tokenize_str_with_options(
490 input: &str,
491 options: &TokenizerOptions,
492) -> Result<Vec<Token>, TokenizerError> {
493 uncached_tokenize_string(input.to_owned(), options.to_owned())
494}
495
496#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
497fn uncached_tokenize_string(
498 input: String,
499 options: TokenizerOptions,
500) -> Result<Vec<Token>, TokenizerError> {
501 uncached_tokenize_str(input.as_str(), &options)
502}
503
504pub fn uncached_tokenize_str(
511 input: &str,
512 options: &TokenizerOptions,
513) -> Result<Vec<Token>, TokenizerError> {
514 let mut reader = std::io::BufReader::new(input.as_bytes());
515 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
516
517 let mut tokens = vec![];
518 loop {
519 match tokenizer.next_token()? {
520 TokenizeResult {
521 token: Some(token), ..
522 } => tokens.push(token),
523 TokenizeResult {
524 reason: TokenEndReason::EndOfInput,
525 ..
526 } => break,
527 _ => (),
528 }
529 }
530
531 Ok(tokens)
532}
533
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a new tokenizer over the given reader, with the given options.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                // Positions are 1-based for line/column, 0-based for index.
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input stream.
    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Consumes and returns the next character, advancing the cursor
    /// (line/column/index) to account for it.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character, discarding it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it or moving the cursor.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Tokenizes and returns the next token from the input stream.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, false /* include_space */)
    }

    /// Consumes a nested construct (e.g. the interior of `$(...)` or `$[...]`),
    /// appending its raw text to `state`. `terminating_char` closes one nesting
    /// level, and an operator equal to `nesting_open` opens one; on entry,
    /// `nesting_count` levels are assumed already open.
    fn consume_nested_construct(
        &mut self,
        state: &mut TokenParseState,
        terminating_char: char,
        nesting_open: &str,
        mut nesting_count: u32,
    ) -> Result<(), TokenizerError> {
        let mut pending_here_doc_tokens = vec![];
        let mut drain_here_doc_tokens = false;

        loop {
            let cur_token = if drain_here_doc_tokens && !pending_here_doc_tokens.is_empty() {
                if pending_here_doc_tokens.len() == 1 {
                    drain_here_doc_tokens = false;
                }
                pending_here_doc_tokens.remove(0)
            } else {
                let cur_token = self.next_token_until(Some(terminating_char), true)?;

                // Here-doc body tokens are deferred until the newline that ends
                // the line containing the redirection has been seen.
                if matches!(
                    cur_token.reason,
                    TokenEndReason::HereDocumentBodyStart
                        | TokenEndReason::HereDocumentBodyEnd
                        | TokenEndReason::HereDocumentEndTag
                ) {
                    pending_here_doc_tokens.push(cur_token);
                    continue;
                }
                cur_token
            };

            if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                && !pending_here_doc_tokens.is_empty()
            {
                pending_here_doc_tokens.push(cur_token);
                drain_here_doc_tokens = true;
                continue;
            }

            if let Some(cur_token_value) = cur_token.token {
                state.append_str(cur_token_value.to_str());

                // An occurrence of the opening operator deepens the nesting.
                if matches!(cur_token_value, Token::Operator(o, _) if o == nesting_open) {
                    nesting_count += 1;
                }
            }

            match cur_token.reason {
                TokenEndReason::HereDocumentBodyStart => {
                    state.append_char('\n');
                }
                TokenEndReason::NonNewLineBlank => state.append_char(' '),
                TokenEndReason::SpecifiedTerminatingChar => {
                    nesting_count -= 1;
                    if nesting_count == 0 {
                        break;
                    }
                    // Not the outermost close; consume it into the token text.
                    state.append_char(self.next_char()?.unwrap());
                }
                TokenEndReason::EndOfInput => {
                    return Err(TokenizerError::UnterminatedExpansion);
                }
                _ => (),
            }
        }

        // Consume the final terminating character.
        state.append_char(self.next_char()?.unwrap());
        Ok(())
    }

    /// Core tokenization loop: reads characters until a complete token (or a
    /// significant delimiter) is produced.
    ///
    /// `terminating_char`: an unquoted occurrence of this character delimits
    /// the token with `SpecifiedTerminatingChar` (used for nested constructs).
    /// `include_space`: when true, leading blanks are preserved in the token
    /// text rather than skipped.
    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Tokens queued by here-document processing take precedence.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: report an error if we're mid-escape/mid-quote.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    // A here-doc body may legitimately end at EOF without a
                    // trailing newline after the end tag.
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    // Otherwise, report all still-unterminated here-docs.
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // In a here-doc body: strip a leading tab at line start when the
                // frontmost tag requested it (`<<-`); otherwise accumulate
                // the character verbatim.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At end of line, check whether this line was the end tag.
                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.unquoted() && terminating_char == Some(c) {
                // Caller-specified terminator (not consumed here).
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if state.in_operator() {
                // Try to extend the operator with `c`; if the longer string is
                // no longer a valid operator, the operator token ends here.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // Inside an arithmetic expansion, `))` ends it; here-doc
                        // operators are not processed here.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Line continuation: drop both backslash and newline.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    // `$'...'` opens an ANSI-C quote; a bare quote opens a
                    // single-quoted string.
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Closing quote for single-quoted or ANSI-C quoted text.
            else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // Closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is taken verbatim.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Start of an expansion (`$...`) or backquoted command
                // substitution; both are recognized inside double quotes too.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution `$(...)`, or arithmetic
                            // expansion `$((...))` when a second paren follows.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let (initial_nesting, is_arithmetic) =
                                if matches!(self.peek_char()?, Some('(')) {
                                    state.append_char(self.next_char()?.unwrap());
                                    (2, true)
                                } else {
                                    (1, false)
                                };

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = true;
                            }

                            self.consume_nested_construct(&mut state, ')', "(", initial_nesting)?;

                            if is_arithmetic {
                                self.cross_state.arithmetic_expansion = false;
                            }
                        }

                        Some('[') => {
                            // Legacy arithmetic expansion `$[...]`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            self.cross_state.arithmetic_expansion = true;

                            self.consume_nested_construct(&mut state, ']', "[", 1)?;

                            self.cross_state.arithmetic_expansion = false;
                        }

                        Some('{') => {
                            // Parameter expansion `${...}`; tokenize its
                            // interior until the matching closing brace.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        false, /* include_space */
                                    )?;

                                    // Defer here-doc body tokens until the line
                                    // containing the redirection ends.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing brace.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Simple variable reference or a lone `$`; keep the
                            // dollar sign and continue normal scanning.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: consume verbatim through
                    // the matching unescaped backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob pattern (e.g. `@(...)`): consume through the
            // matching close paren, honoring backslash escapes.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;
                let mut in_escape = false;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        match extglob_char {
                            _ if in_escape => in_escape = false,
                            '\\' => in_escape = true,
                            '(' => paren_depth += 1,
                            ')' => paren_depth -= 1,
                            _ => (),
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                // Start of an operator: delimit any word in progress first.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                // Blanks delimit words; leading blanks are skipped (or kept,
                // when `include_space` is set).
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skipped blank: advance the token's recorded start.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // Mid-word characters continue the word; `#` is only special at a
            // word boundary, so it can't start a comment here.
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Comment: skip up to (but not including) the next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // First character of a new word.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// If the accumulated here-doc body ends with the frontmost pending end
    /// tag (alone on its line), strips the tag from the body, delimits the body
    /// token into `result`, and returns `Ok(true)`. `ends_with_newline`
    /// indicates whether the candidate line ended with a newline (false when
    /// checking at end of input).
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        // Compare against the unquoted form when the tag was quoted/escaped.
        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        // The stored tag ends with '\n'; drop it when matching at EOF.
        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The tag only counts when it appears alone on its own line.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns whether the given character may prefix an extended glob pattern.
    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether the given character may start an operator.
    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether the given string is a complete operator. Some operators
    /// are only recognized outside of `sh` compatibility mode.
    fn is_operator(&self, s: &str) -> bool {
        // Non-POSIX operators recognized only when not in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1266
1267impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1268 type Item = Result<TokenizeResult, TokenizerError>;
1269
1270 fn next(&mut self) -> Option<Self::Item> {
1271 match self.next_token() {
1272 #[expect(clippy::manual_map)]
1273 Ok(result) => match result.token {
1274 Some(_) => Some(Ok(result)),
1275 None => None,
1276 },
1277 Err(e) => Some(Err(e)),
1278 }
1279 }
1280}
1281
/// Returns whether the given character is a blank (space or tab).
const fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1285
1286const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1287 if state.in_escape {
1289 return false;
1290 }
1291
1292 match state.quote_mode {
1293 QuoteMode::Double(_) | QuoteMode::AnsiC(_) => {
1296 if c == '\\' {
1297 true
1299 } else {
1300 false
1301 }
1302 }
1303 QuoteMode::Single(_) => false,
1305 QuoteMode::None => is_quoting_char(c),
1308 }
1309}
1310
/// Returns whether the given character can introduce quoting: a backslash,
/// single quote, or double quote.
const fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '"'
}
1314
/// Returns the input string with quoting removed: unescaped single/double
/// quotes are dropped, and a backslash causes the following character to be
/// kept verbatim (the backslash itself is dropped). A trailing lone backslash
/// is discarded.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());
    let mut pending_escape = false;

    for ch in s.chars() {
        if pending_escape {
            // The escaped character is kept as-is, whatever it was.
            unquoted.push(ch);
            pending_escape = false;
        } else if ch == '\\' {
            pending_escape = true;
        } else if ch != '\'' && ch != '"' {
            // Unescaped quote characters are dropped; everything else is kept.
            unquoted.push(ch);
        }
    }

    unquoted
}
1338
1339#[cfg(test)]
1340mod tests {
1341
1342 use super::*;
1343 use anyhow::Result;
1344 use insta::assert_ron_snapshot;
1345 use pretty_assertions::{assert_eq, assert_matches};
1346
1347 #[derive(serde::Serialize, serde::Deserialize)]
1348 struct TokenizerResult<'a> {
1349 input: &'a str,
1350 result: Vec<Token>,
1351 }
1352
1353 fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
1354 Ok(TokenizerResult {
1355 input,
1356 result: tokenize_str(input)?,
1357 })
1358 }
1359
1360 #[test]
1361 fn tokenize_empty() -> Result<()> {
1362 let tokens = tokenize_str("")?;
1363 assert_eq!(tokens.len(), 0);
1364 Ok(())
1365 }
1366
1367 #[test]
1368 fn tokenize_line_continuation() -> Result<()> {
1369 assert_ron_snapshot!(test_tokenizer(
1370 r"a\
1371bc"
1372 )?);
1373 Ok(())
1374 }
1375
1376 #[test]
1377 fn tokenize_operators() -> Result<()> {
1378 assert_ron_snapshot!(test_tokenizer("a>>b")?);
1379 Ok(())
1380 }
1381
1382 #[test]
1383 fn tokenize_comment() -> Result<()> {
1384 assert_ron_snapshot!(test_tokenizer(
1385 r"a #comment
1386"
1387 )?);
1388 Ok(())
1389 }
1390
1391 #[test]
1392 fn tokenize_comment_at_eof() -> Result<()> {
1393 assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
1394 Ok(())
1395 }
1396
1397 #[test]
1398 fn tokenize_empty_here_doc() -> Result<()> {
1399 assert_ron_snapshot!(test_tokenizer(
1400 r"cat <<HERE
1401HERE
1402"
1403 )?);
1404 Ok(())
1405 }
1406
1407 #[test]
1408 fn tokenize_here_doc() -> Result<()> {
1409 assert_ron_snapshot!(test_tokenizer(
1410 r"cat <<HERE
1411SOMETHING
1412HERE
1413echo after
1414"
1415 )?);
1416 assert_ron_snapshot!(test_tokenizer(
1417 r"cat <<HERE
1418SOMETHING
1419HERE
1420"
1421 )?);
1422 assert_ron_snapshot!(test_tokenizer(
1423 r"cat <<HERE
1424SOMETHING
1425HERE
1426
1427"
1428 )?);
1429 assert_ron_snapshot!(test_tokenizer(
1430 r"cat <<HERE
1431SOMETHING
1432HERE"
1433 )?);
1434 Ok(())
1435 }
1436
1437 #[test]
1438 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1439 assert_ron_snapshot!(test_tokenizer(
1440 r"cat <<-HERE
1441 SOMETHING
1442 HERE
1443"
1444 )?);
1445 Ok(())
1446 }
1447
1448 #[test]
1449 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1450 assert_ron_snapshot!(test_tokenizer(
1451 r"cat <<EOF | wc -l
1452A B C
14531 2 3
1454D E F
1455EOF
1456"
1457 )?);
1458 Ok(())
1459 }
1460
1461 #[test]
1462 fn tokenize_multiple_here_docs() -> Result<()> {
1463 assert_ron_snapshot!(test_tokenizer(
1464 r"cat <<HERE1 <<HERE2
1465SOMETHING
1466HERE1
1467OTHER
1468HERE2
1469echo after
1470"
1471 )?);
1472 Ok(())
1473 }
1474
1475 #[test]
1476 fn tokenize_unterminated_here_doc() {
1477 let result = tokenize_str(
1478 r"cat <<HERE
1479SOMETHING
1480",
1481 );
1482 assert!(result.is_err());
1483 }
1484
1485 #[test]
1486 fn tokenize_missing_here_tag() {
1487 let result = tokenize_str(
1488 r"cat <<
1489",
1490 );
1491 assert!(result.is_err());
1492 }
1493
1494 #[test]
1495 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1496 assert_ron_snapshot!(test_tokenizer(
1497 r"echo $(cat <<HERE
1498TEXT
1499HERE
1500)"
1501 )?);
1502 Ok(())
1503 }
1504
1505 #[test]
1506 fn tokenize_here_doc_in_double_quoted_command_substitution() -> Result<()> {
1507 assert_ron_snapshot!(test_tokenizer(
1508 r#"echo "$(cat <<HERE
1509TEXT
1510HERE
1511)""#
1512 )?);
1513 Ok(())
1514 }
1515
1516 #[test]
1517 fn tokenize_here_doc_in_double_quoted_command_substitution_with_space() -> Result<()> {
1518 assert_ron_snapshot!(test_tokenizer(
1519 r#"echo "$(cat << HERE
1520TEXT
1521HERE
1522)""#
1523 )?);
1524 Ok(())
1525 }
1526
1527 #[test]
1528 fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
1529 assert_ron_snapshot!(test_tokenizer(
1530 r"echo $(cat <<HERE1 <<HERE2 | wc -l
1531TEXT
1532HERE1
1533OTHER
1534HERE2
1535)"
1536 )?);
1537 Ok(())
1538 }
1539
1540 #[test]
1541 fn tokenize_simple_backquote() -> Result<()> {
1542 assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
1543 Ok(())
1544 }
1545
1546 #[test]
1547 fn tokenize_backquote_with_escape() -> Result<()> {
1548 assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
1549 Ok(())
1550 }
1551
1552 #[test]
1553 fn tokenize_unterminated_backquote() {
1554 assert_matches!(
1555 tokenize_str("`"),
1556 Err(TokenizerError::UnterminatedBackquote(_))
1557 );
1558 }
1559
1560 #[test]
1561 fn tokenize_unterminated_command_substitution() {
1562 assert_matches!(
1565 tokenize_str("$("),
1566 Err(TokenizerError::UnterminatedExpansion)
1567 );
1568 }
1569
1570 #[test]
1571 fn tokenize_unterminated_arithmetic_expansion() {
1572 assert_matches!(
1573 tokenize_str("$(("),
1574 Err(TokenizerError::UnterminatedExpansion)
1575 );
1576 }
1577
1578 #[test]
1579 fn tokenize_unterminated_legacy_arithmetic_expansion() {
1580 assert_matches!(
1581 tokenize_str("$["),
1582 Err(TokenizerError::UnterminatedExpansion)
1583 );
1584 }
1585
1586 #[test]
1587 fn tokenize_command_substitution() -> Result<()> {
1588 assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
1589 Ok(())
1590 }
1591
1592 #[test]
1593 fn tokenize_command_substitution_with_subshell() -> Result<()> {
1594 assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
1595 Ok(())
1596 }
1597
1598 #[test]
1599 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1600 assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
1601 Ok(())
1602 }
1603
1604 #[test]
1605 fn tokenize_arithmetic_expression() -> Result<()> {
1606 assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
1607 Ok(())
1608 }
1609
1610 #[test]
1611 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1612 assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
1615 Ok(())
1616 }
1617 #[test]
1618 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1619 assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
1620 Ok(())
1621 }
1622
1623 #[test]
1624 fn tokenize_special_parameters() -> Result<()> {
1625 assert_ron_snapshot!(test_tokenizer("$$")?);
1626 assert_ron_snapshot!(test_tokenizer("$@")?);
1627 assert_ron_snapshot!(test_tokenizer("$!")?);
1628 assert_ron_snapshot!(test_tokenizer("$?")?);
1629 assert_ron_snapshot!(test_tokenizer("$*")?);
1630 Ok(())
1631 }
1632
1633 #[test]
1634 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1635 assert_ron_snapshot!(test_tokenizer("$x")?);
1636 assert_ron_snapshot!(test_tokenizer("a$x")?);
1637 Ok(())
1638 }
1639
1640 #[test]
1641 fn tokenize_unterminated_parameter_expansion() {
1642 assert_matches!(
1643 tokenize_str("${x"),
1644 Err(TokenizerError::UnterminatedVariable)
1645 );
1646 }
1647
1648 #[test]
1649 fn tokenize_braced_parameter_expansion() -> Result<()> {
1650 assert_ron_snapshot!(test_tokenizer("${x}")?);
1651 assert_ron_snapshot!(test_tokenizer("a${x}b")?);
1652 Ok(())
1653 }
1654
1655 #[test]
1656 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1657 assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
1658 Ok(())
1659 }
1660
1661 #[test]
1662 fn tokenize_whitespace() -> Result<()> {
1663 assert_ron_snapshot!(test_tokenizer("1 2 3")?);
1664 Ok(())
1665 }
1666
1667 #[test]
1668 fn tokenize_escaped_whitespace() -> Result<()> {
1669 assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
1670 Ok(())
1671 }
1672
1673 #[test]
1674 fn tokenize_single_quote() -> Result<()> {
1675 assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
1676 Ok(())
1677 }
1678
1679 #[test]
1680 fn tokenize_double_quote() -> Result<()> {
1681 assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
1682 Ok(())
1683 }
1684
1685 #[test]
1686 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1687 assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
1688 Ok(())
1689 }
1690
1691 #[test]
1692 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1693 assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
1694 Ok(())
1695 }
1696
1697 #[test]
1698 fn test_quote_removal() {
1699 assert_eq!(unquote_str(r#""hello""#), "hello");
1700 assert_eq!(unquote_str(r"'hello'"), "hello");
1701 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1702 assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
1703 }
1704}