use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

/// The reason why a token ended during tokenization.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped newline character was encountered.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character (other than a newline) was encountered.
    NonNewLineBlank,
    /// The body of a here-document starts here.
    HereDocumentBodyStart,
    /// The body of a here-document ends here.
    HereDocumentBodyEnd,
    /// The end tag of a here-document was encountered.
    HereDocumentEndTag,
    /// An operator token starts here.
    OperatorStart,
    /// An operator token ended.
    OperatorEnd,
    /// The token ended for some other reason.
    Other,
}

/// Represents a position in source text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Pos"))]
pub struct SourcePosition {
    /// The 0-based index of the character in the input stream.
    #[cfg_attr(test, serde(rename = "idx"))]
    pub index: i32,
    /// The 1-based line number.
    pub line: i32,
    /// The 1-based column number.
    #[cfg_attr(test, serde(rename = "col"))]
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source text.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
#[cfg_attr(test, serde(rename = "Loc"))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token.
    pub end: SourcePosition,
}

/// Represents a token extracted from source text.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
#[cfg_attr(test, derive(PartialEq, Eq, serde::Serialize))]
pub enum Token {
    /// An operator token.
    #[cfg_attr(test, serde(rename = "Op"))]
    Operator(String, TokenLocation),
    /// A word token.
    #[cfg_attr(test, serde(rename = "W"))]
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the token's string value.
    pub fn to_str(&self) -> &str {
        match self {
            Self::Operator(s, _) => s,
            Self::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in its source text.
    pub const fn location(&self) -> &TokenLocation {
        match self {
            Self::Operator(_, l) => l,
            Self::Word(_, l) => l,
        }
    }
}

/// Encapsulates the result of tokenizing: the reason the token ended, along
/// with the token itself (if one was produced).
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// The reason the token ended.
    pub reason: TokenEndReason,
    /// The token, if any.
    pub token: Option<Token>,
}

/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated ANSI C-quoted substring was encountered.
    #[error("unterminated ANSI C quote at {0}")]
    UnterminatedAnsiCQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated backquoted command substitution was encountered.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob pattern was encountered.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was found without a preceding here tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// An expected here tag was missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// One or more here-documents were left unterminated.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading the input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
    /// Returns whether the error indicates that the input was merely
    /// incomplete (e.g., an unterminated quote or here-document), as opposed
    /// to being irrecoverably invalid.
    pub const fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedAnsiCQuote(..)
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

/// A list of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The tokens.
    pub tokens: &'a [Token],
}

/// The active quoting mode, along with the position where the quoting began.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// No quoting is active.
    None,
    /// Inside an ANSI C quote (`$'...'`).
    AnsiC(SourcePosition),
    /// Inside a single-quoted substring.
    Single(SourcePosition),
    /// Inside a double-quoted substring.
    Double(SourcePosition),
}

/// The current state of here-document processing.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-documents are being processed.
    #[default]
    None,
    /// The next token will be the tag of a here-document.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The current token is the tag of a here-document.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// The next line will start a here-document body.
    NextLineIsHereDoc,
    /// One or more here-document bodies are being consumed.
    InHereDocs,
}

/// Tracks a here-document tag whose body has not yet been fully consumed.
#[derive(Clone, Debug)]
struct HereTag {
    tag: String,
    tag_was_escaped_or_quoted: bool,
    remove_tabs: bool,
    position: SourcePosition,
    tokens: Vec<TokenizeResult>,
    pending_tokens_after: Vec<TokenizeResult>,
}

/// Tokenization state that spans multiple tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// The cursor's current position in the input stream.
    cursor: SourcePosition,
    /// The current state of here-document processing.
    here_state: HereState,
    /// Here tags whose bodies are still pending, in order of appearance.
    current_here_tags: Vec<HereTag>,
    /// Tokens already produced and queued for consumption.
    queued_tokens: Vec<TokenizeResult>,
    /// Whether the tokenizer is currently inside an arithmetic expansion.
    arithmetic_expansion: bool,
}

/// Options governing tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended globbing patterns (extglob) are enabled.
    pub enable_extended_globbing: bool,
    /// Whether to run in POSIX-compliant mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to run in `sh` emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

/// A tokenizer for shell input.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// State used while parsing a single token.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        Self {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Takes the token accumulated so far, returning it and resetting the
    /// state for the next token.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub const fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub const fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Delimits the current token with the given reason, applying any pending
    /// here-document processing along the way.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If there's no token in progress, there's nothing to delimit; the one
        // exception is the end of a here-document body, which may
        // legitimately be empty.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // See if we're in the middle of processing a here-document.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // This token is the redirection operator itself; the *next*
                // token will be the here tag.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the trailing newline in the tag to make it easier
                // to match against the body.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Queue up the token we just finished so it can be yielded
                // after the here-document body.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We just finished a here-document body. First queue the
                // tokens that preceded it (the operator and the tag)...
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // ...then a marker for the start of the body...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // ...then the body itself...
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...then the end tag...
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // ...and finally any tokens that were delimited between the
                // tag and the start of the body.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

/// Tokenizes a shell string, returning the tokens found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Tokenizes a shell string using the given options, returning the tokens
/// found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
/// * `options` - Options controlling the tokenizer's behavior.
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

// Caching wrapper around `uncached_tokenize_str`; memoizes results keyed on
// the input string and the options used.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Tokenizes a shell string without consulting the tokenizer cache, returning
/// the tokens found within.
///
/// # Arguments
///
/// * `input` - The string to tokenize.
/// * `options` - Options controlling the tokenizer's behavior.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Self {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the tokenizer's current position in the input stream.
    #[allow(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Consumes and returns the next character, updating the cursor.
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Consumes the next character without returning it.
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Returns the next token from the input stream.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None, /* include_space */ false)
    }

    /// Returns the next token from the input stream, optionally stopping
    /// early.
    ///
    /// # Arguments
    ///
    /// * `terminating_char` - If present, an unquoted occurrence of this
    ///   character delimits the token and ends scanning.
    /// * `include_space` - Whether leading blank characters should be retained
    ///   in the token instead of being skipped.
    #[allow(clippy::cognitive_complexity)]
    #[allow(clippy::if_same_then_else)]
    #[allow(clippy::panic_in_result_fn)]
    #[allow(clippy::too_many_lines)]
    #[allow(clippy::unwrap_in_result)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
        include_space: bool,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // First satisfy any token results already in the queue; only once
            // it's exhausted do we look at the input stream.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // We've hit the end of the input; make sure we're not in the
                // middle of an escape sequence or quoted substring.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::AnsiC(pos) => {
                        return Err(TokenizerError::UnterminatedAnsiCQuote(pos));
                    }
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                if !matches!(self.cross_state.here_state, HereState::None) {
                    // See if a here-document body ends right at the end of
                    // the input.
                    if self.remove_here_end_tag(&mut state, &mut result, false)? {
                        continue;
                    }

                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.position.to_string())
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // We're inside a here-document body; accumulate characters,
                // checking for the end tag whenever we complete a line.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Drop the leading tab (the tag used `<<-`).
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    if c == '\n' {
                        self.remove_here_end_tag(&mut state, &mut result, true)?;
                    }
                }
            } else if state.in_operator() {
                // See whether this character extends the current operator
                // into a longer one.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    if self.cross_state.arithmetic_expansion {
                        // While in an arithmetic expansion, we don't start
                        // here-document processing; we only watch for the
                        // closing `))`.
                        if state.is_specific_operator(")") && c == ')' {
                            self.cross_state.arithmetic_expansion = false;
                        }
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    } else if state.is_specific_operator("(") && c == '(' {
                        self.cross_state.arithmetic_expansion = true;
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // This is a line continuation: consume the newline
                        // and don't include either character in the token.
                        self.consume_char()?;
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    // A single quote immediately preceded by `$` starts an
                    // ANSI C quote instead of a plain single quote.
                    if state.token_so_far.ends_with('$') {
                        state.quote_mode = QuoteMode::AnsiC(self.cross_state.cursor.clone());
                    } else {
                        state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    }

                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // A single quote closes either a single-quoted or ANSI C-quoted
            // substring.
            else if !state.in_escape
                && matches!(
                    state.quote_mode,
                    QuoteMode::Single(..) | QuoteMode::AnsiC(..)
                )
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(..))
                && c == '\"'
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // An escaped character is consumed as-is.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                if c == '$' {
                    // Consume the '$' and peek at what follows.
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution or arithmetic expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            // If a second '(' follows, this is `$((`, i.e.,
                            // the start of an arithmetic expansion, and two
                            // closing parentheses will be required.
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some(')'),
                                        /* include_space */ true,
                                    )?;

                                    // Here-document tokens get queued up until
                                    // the newline that starts their bodies.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested open parentheses so we know
                                    // how many closing ones to expect.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We hit a ')'; if it balances out the
                                        // last open parenthesis, we're done.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Otherwise, consume it and continue.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(
                                            TokenizerError::UnterminatedCommandSubstitution,
                                        );
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume the final closing parenthesis.
                            state.append_char(self.next_char()?.unwrap());
                        }
                        Some('{') => {
                            // Braced parameter expansion.
                            state.append_char('$');
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(
                                        Some('}'),
                                        /* include_space */ false,
                                    )?;

                                    // Queue up here-document tokens, as above.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n');
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume the closing '}'.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable);
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Not a command substitution or braced expansion;
                            // append the '$' and let the following characters
                            // be tokenized normally.
                            state.append_char('$');
                        }
                    }
                } else {
                    // A backquoted command substitution: consume everything
                    // up to the closing backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;
                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            // Look for an unescaped closing backquote.
                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // An extended glob pattern such as `@(...)`: consume the whole
            // parenthesized pattern, tracking nesting depth.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| Self::can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && Self::can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else if include_space {
                    state.append_char(c);
                } else {
                    // Skip the blank, but advance the start position so the
                    // next token doesn't include it.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // If we have a token in progress, most characters simply get
            // appended to it. (The same applies when scanning for a closing
            // '}', even if the token hasn't started yet.)
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // A comment: consume characters up to (but not including) the
                // next newline.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Checks whether the current token ends with the expected here-document
    /// end tag; if so, strips the tag from the token, delimits the body, and
    /// returns `true`.
    fn remove_here_end_tag(
        &mut self,
        state: &mut TokenParseState,
        result: &mut Option<TokenizeResult>,
        ends_with_newline: bool,
    ) -> Result<bool, TokenizerError> {
        if self.cross_state.current_here_tags.is_empty() {
            return Ok(false);
        }

        let next_here_tag = &self.cross_state.current_here_tags[0];

        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
            unquote_str(next_here_tag.tag.as_str()).into()
        } else {
            next_here_tag.tag.as_str().into()
        };

        let tag_str = if !ends_with_newline {
            tag_str
                .strip_suffix('\n')
                .unwrap_or_else(|| tag_str.as_ref())
        } else {
            tag_str.as_ref()
        };

        if let Some(current_token_without_here_tag) = state.current_token().strip_suffix(tag_str) {
            // The end tag only counts if it appears at the start of a line.
            if current_token_without_here_tag.is_empty()
                || current_token_without_here_tag.ends_with('\n')
            {
                state.replace_with_here_doc(current_token_without_here_tag.to_owned());

                // Delimit the here-document body.
                *result = state.delimit_current_token(
                    TokenEndReason::HereDocumentBodyEnd,
                    &mut self.cross_state,
                )?;

                return Ok(true);
            }
        }
        Ok(false)
    }

    /// Returns whether the given character can start an extended glob pattern.
    const fn can_start_extglob(c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether the given character can start an operator.
    const fn can_start_operator(c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether the given string is a recognized operator.
    fn is_operator(&self, s: &str) -> bool {
        // Handle operators that are only supported when not emulating `sh`.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[allow(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

/// Returns whether the given character is a blank (space or tab).
const fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

const fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, then nothing changes quoting.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // In double quotes or ANSI C quotes, only a backslash starts an escape.
        QuoteMode::Double(_) | QuoteMode::AnsiC(_) => c == '\\',
        // In single quotes, nothing affects quoting.
        QuoteMode::Single(_) => false,
        // When unquoted, backslashes and quote characters affect quoting.
        QuoteMode::None => is_quoting_char(c),
    }
}

/// Returns whether the given character can affect quoting (i.e., is a
/// backslash or quote character).
const fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

/// Returns the given string with one level of quoting removed: unescaped
/// quote characters are dropped and backslash escapes are unwrapped.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
#[allow(clippy::panic_in_result_fn)]
mod tests {

    use super::*;
    use anyhow::Result;
    use insta::assert_ron_snapshot;
    use pretty_assertions::{assert_eq, assert_matches};

    #[derive(serde::Serialize)]
    struct TokenizerResult<'a> {
        input: &'a str,
        result: Vec<Token>,
    }

    fn test_tokenizer(input: &str) -> Result<TokenizerResult<'_>> {
        Ok(TokenizerResult {
            input,
            result: tokenize_str(input)?,
        })
    }

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }
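
    // A quick non-snapshot sanity check of operator/word splitting: given the
    // blank- and operator-delimiting logic above, "a && b" should yield a
    // word, the `&&` operator, and another word.
    #[test]
    fn tokenize_operators_and_words() -> Result<()> {
        let tokens = tokenize_str("a && b")?;
        assert_eq!(tokens.len(), 3);
        assert!(matches!(tokens[0], Token::Word(..)));
        assert_eq!(tokens[0].to_str(), "a");
        assert!(matches!(tokens[1], Token::Operator(..)));
        assert_eq!(tokens[1].to_str(), "&&");
        assert_eq!(tokens[2].to_str(), "b");
        Ok(())
    }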

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a\
bc"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a>>b")?);
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"a #comment
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a #comment")?);
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
echo after
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE
"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE

"
        )?);
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE
SOMETHING
HERE"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<-HERE
	SOMETHING
	HERE
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }
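
    // Unterminated constructs should be reported as merely "incomplete" via
    // `TokenizerError::is_incomplete`, letting interactive callers prompt for
    // more input rather than fail outright.
    #[test]
    fn tokenize_incomplete_input() {
        let err = tokenize_str("'unterminated").unwrap_err();
        assert_matches!(err, TokenizerError::UnterminatedSingleQuote(_));
        assert!(err.is_incomplete());
    }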

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE
TEXT
HERE
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"
        )?);
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"echo `echo\`hi`")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }
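
    // The tokenizer can also be driven through its `Iterator` implementation;
    // iteration yields results until one arrives without a token (e.g., at
    // end of input).
    #[test]
    fn tokenize_via_iterator() -> Result<()> {
        let mut reader = std::io::BufReader::new("a b".as_bytes());
        let tokenizer = Tokenizer::new(&mut reader, &TokenizerOptions::default());

        let results = tokenizer.collect::<Result<Vec<_>, _>>()?;
        assert_eq!(results.len(), 2);
        assert_eq!(results[0].token.as_ref().map(Token::to_str), Some("a"));
        assert_eq!(results[1].token.as_ref().map(Token::to_str), Some("b"));
        Ok(())
    }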

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$(echo hi)b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_with_subshell() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$( (:) )")?);
        Ok(())
    }

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("echo $(echo !(x))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("a$((1+2))b c")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( 1 ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$(( (0) ))")?);
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$$")?);
        assert_ron_snapshot!(test_tokenizer("$@")?);
        assert_ron_snapshot!(test_tokenizer("$!")?);
        assert_ron_snapshot!(test_tokenizer("$?")?);
        assert_ron_snapshot!(test_tokenizer("$*")?);
        Ok(())
    }

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("$x")?);
        assert_ron_snapshot!(test_tokenizer("a$x")?);
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("${x}")?);
        assert_ron_snapshot!(test_tokenizer("a${x}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"a${x\}}b")?);
        Ok(())
    }

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer("1 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"1\ 2 3")?);
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r"x'a b'y")?);
        Ok(())
    }
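
    // An ANSI C quoted span (`$'...'`) should be kept intact within a single
    // word token, with its escape sequences preserved verbatim.
    #[test]
    fn tokenize_ansi_c_quote() -> Result<()> {
        let tokens = tokenize_str(r"$'a\tb'")?;
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].to_str(), r"$'a\tb'");
        Ok(())
    }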

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"a b"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$(echo hi)"y"#)?);
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_ron_snapshot!(test_tokenizer(r#"x"$((1+2))"y"#)?);
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r"'hello'"), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
    }
}