use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

/// Reason why the tokenizer ended a token.
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A non-newline blank character (space or tab) ended the token.
    NonNewLineBlank,
    /// The body of a here-document is about to start.
    HereDocumentBodyStart,
    /// The body of a here-document ended.
    HereDocumentBodyEnd,
    /// The end tag of a here-document was reached.
    HereDocumentEndTag,
    /// An operator started.
    OperatorStart,
    /// An operator ended.
    OperatorEnd,
    /// Some other reason.
    Other,
}

/// Represents a position in source shell script text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The index of the character in the overall input (starting at 0).
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: i32,
    /// The line number (starting at 1).
    pub line: i32,
    /// The column number (starting at 1).
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token.
    pub end: SourcePosition,
}

/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source script.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

/// Encapsulates the result of tokenizing a single token.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token ended.
    pub reason: TokenEndReason,
    /// The token, if one was produced.
    pub token: Option<Token>,
}

#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    #[error("unterminated variable expression")]
    UnterminatedVariable,

    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns whether the error represents input that is merely incomplete (e.g., an
    /// unterminated quote, expansion, or here-document) rather than outright invalid.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

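// Illustrative sketch (not part of this module's API): a caller such as an interactive
// shell might use `is_incomplete()` to decide whether to prompt for a continuation line
// rather than reporting a hard error:
//
//     match tokenize_str(&buffer) {
//         Ok(tokens) => run(tokens),
//         Err(e) if e.is_incomplete() => prompt_for_more_input(),
//         Err(e) => report_error(e),
//     }
//
// `run`, `prompt_for_more_input`, and `report_error` are hypothetical helpers shown
// only for illustration.
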
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    pub tokens: &'a [Token],
}

#[derive(Clone, Debug)]
enum QuoteMode {
    None,
    AnsiC(SourcePosition),
    Single(SourcePosition),
    Double(SourcePosition),
}

/// Tracks here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// We are not currently tracking any here-documents.
    #[default]
    None,
    /// The *next* token will be the tag of a here-document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The *current* token is the tag of a here-document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The next line will start the body of a here-document.
    NextLineIsHereDoc,
    /// We are in the body of one or more here-documents.
    InHereDocs,
}

#[derive(Clone, Debug)]
struct HereTag {
    tag: String,
    tag_was_escaped_or_quoted: bool,
    remove_tabs: bool,
    position: SourcePosition,
    tokens: Vec<TokenizeResult>,
    pending_tokens_after: Vec<TokenizeResult>,
}

/// Tokenizer state that spans multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The current position in the source input.
    cursor: SourcePosition,
    /// The current here-document processing state.
    here_state: HereState,
    /// The here-document tags currently being tracked.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced but not yet returned to the caller.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options controlling how the tokenizer operates.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether to enable extended globbing (extglob) patterns.
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX compliance mode.
    pub posix_mode: bool,
    /// Whether to tokenize in `sh` compatibility mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

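// Illustrative sketch: constructing non-default options, e.g. for `sh`-style tokenization
// with extended globbing disabled (all names below are from this module):
//
//     let options = TokenizerOptions {
//         enable_extended_globbing: false,
//         posix_mode: true,
//         sh_mode: true,
//     };
//     let tokens = tokenize_str_with_options("echo hi", &options)?;
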
/// A tokenizer for shell script text.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// The state of parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If nothing has accumulated, don't produce a token -- unless we're ending a
        // here-document body, which may legitimately be empty.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Consult the current here-document state to see if this token needs special handling.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // The redirection operator just ended; the next token will be the here tag.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Keep a trailing newline on the tag to simplify matching the end of the body.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Hold on to any tokens seen after the tag until the here-document completes.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We just finished the body of the oldest outstanding here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // Queue the operator and tag tokens that introduced this here-document...
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // ...then mark the start of the here-document body...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // ...followed by the body itself...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...and the end tag.
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Finally, queue up any tokens that followed the tag on its original line.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

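/// Breaks the given input shell script string into tokens.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
///
/// # Example
///
/// Illustrative only; the exact crate path is assumed here:
///
/// ```ignore
/// let tokens = tokenize_str("echo hello | wc -l").unwrap();
/// assert!(matches!(&tokens[0], Token::Word(w, _) if w == "echo"));
/// ```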
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Breaks the given input shell script string into tokens, using the provided
/// tokenizer options.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - The options to use when tokenizing.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Breaks the given input shell script string into tokens without going through the
/// tokenization cache.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - The options to use when tokenizing.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    #[expect(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, false)
    }

    #[expect(clippy::cognitive_complexity)]
    #[expect(clippy::if_same_then_else)]
    #[expect(clippy::panic_in_result_fn)]
    #[expect(clippy::too_many_lines)]
    #[expect(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Return any tokens already queued up (e.g., from completed here-documents)
            // before consuming more input.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the middle of
                // an escape sequence or an unterminated quoted string.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // If here-documents are still outstanding, see if the accumulated text
                // closes the next one; otherwise report them as unterminated.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // We've hit the terminating character requested by our caller.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're in the body of one or more here-documents.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop leading tabs when the here-document was started with `<<-`.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At the end of each line, check whether the body was just terminated
                    // by its end tag.
                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // We're accumulating an operator; see whether the next character would
                // extend it into a longer, still-valid operator.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // `))` closes the arithmetic expansion we were in.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // A backslash-newline pair is a line continuation; swallow it entirely.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                // A single quote ends a single-quoted or ANSI-C-quoted string.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                // A double quote ends a double-quoted string.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if state.in_escape {
                // This character is escaped; append it literally.
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    self.consume_char()?;

                    // Look at what follows the dollar sign.
                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // A second '(' means arithmetic expansion, which needs two closing parens.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some(')'),
                                        true, /* include_space */
                                    )?;

                                    // Defer here-document tokens until we reach the end of the line.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parens so we know how many closers we need.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We found a ')'; if it balances the last open paren, we're done.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Otherwise consume the ')' and keep going.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final ')'.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Braced parameter expansion: ${...}
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        false, /* include_space */
                                    )?;

                                    // Defer here-document tokens until we reach the end of the line.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing '}' and finish the expansion.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not a substitution or expansion; keep the '$' as a literal.
                            state.append_char('$');
                        }
                    }
                } else {
                    // We saw a backquote; consume through the matching closing backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            } else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                // An extglob pattern like `!(...)`: consume through the balanced closing paren.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank but advance the token's start position past it.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            } else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                // The character is a normal part of the current (non-operator) token.
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment runs to the end of the line and is not part of any token.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // The character starts a new token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The tag only terminates the here-document if it appears on a line of its own.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }

        Ok(false)
    }

    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    fn is_operator(&self, s: &str) -> bool {
        // These extended operators are not recognized in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[expect(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

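// Illustrative sketch: the `Iterator` implementation above yields token results until it
// sees a result without a token (e.g., at end of input). A crate-internal caller could
// consume it like this:
//
//     let mut reader = std::io::BufReader::new("echo hi".as_bytes());
//     let tokenizer = Tokenizer::new(&mut reader, &TokenizerOptions::default());
//     for result in tokenizer {
//         let token_result = result?;
//         // ... inspect token_result.token and token_result.reason ...
//     }
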
const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, nothing changes the quoting state.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // Inside double quotes or ANSI-C quotes, only a backslash starts a new
        // quoting construct.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // Nothing affects quoting inside single quotes (until the closing quote).
        QuoteMode::Single(_) => false,
        QuoteMode::None => is_quoting_char(c),
    }
}

const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

/// Returns a new string with quoting removed from the given string.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}
1658}