brush_parser/tokenizer.rs

use std::borrow::Cow;
use std::fmt::Display;
use utf8_chars::BufReadCharsExt;

#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// End of input was reached.
    EndOfInput,
    /// An unescaped newline char was reached.
    UnescapedNewLine,
    /// Specified terminating char.
    SpecifiedTerminatingChar,
    /// A non-newline blank char was reached.
    NonNewLineBlank,
    /// A here-document's body is starting.
    HereDocumentBodyStart,
    /// A here-document's body was terminated.
    HereDocumentBodyEnd,
    /// A here-document's end tag was reached.
    HereDocumentEndTag,
    /// An operator was started.
    OperatorStart,
    /// An operator was terminated.
    OperatorEnd,
    /// Some other condition was reached.
    Other,
}

/// Represents a position in a source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// The 0-based index of the character in the input stream.
    pub index: i32,
    /// The 1-based line number.
    pub line: i32,
    /// The 1-based column number.
    pub column: i32,
}

impl Display for SourcePosition {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
    }
}

/// Represents the location of a token in its source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token (exclusive).
    pub end: SourcePosition,
}

/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token.
    Operator(String, TokenLocation),
    /// A word token.
    Word(String, TokenLocation),
}

impl Token {
    /// Returns the string value of the token.
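    ///
    /// # Example
    ///
    /// A minimal sketch (fenced as `ignore` since it assumes `Token` and
    /// `TokenLocation` are re-exported where a doc test could see them):
    ///
    /// ```ignore
    /// let token = Token::Word("echo".to_owned(), TokenLocation::default());
    /// assert_eq!(token.to_str(), "echo");
    /// ```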
    pub fn to_str(&self) -> &str {
        match self {
            Token::Operator(s, _) => s,
            Token::Word(s, _) => s,
        }
    }

    /// Returns the location of the token in the source script.
    pub fn location(&self) -> &TokenLocation {
        match self {
            Token::Operator(_, l) => l,
            Token::Word(_, l) => l,
        }
    }
}

/// Encapsulates the result of tokenizing a shell script.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Reason for tokenization ending.
    pub reason: TokenEndReason,
    /// The token that was extracted, if any.
    pub token: Option<Token>,
}

/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered at the end of the input stream.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered at the end of the input stream.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered at the end of the input stream.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated back-quoted substring was encountered at the end of the input stream.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob (extglob) pattern was encountered at the end of the input
    /// stream.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered at the end of the input stream.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered at the end of the input stream.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// An error occurred decoding UTF-8 characters in the input stream.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here tag was missing where a here-document body was expected.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The indicated here tag was missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// An unterminated here document sequence was encountered at the end of the input stream.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading from the input stream.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}

impl TokenizerError {
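    /// Returns whether this error indicates that tokenization stopped because the
    /// input ended in the middle of a construct (e.g., an unterminated quote or
    /// here-document), rather than failing on complete input. Callers can use this
    /// to decide whether to prompt for a continuation line.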
    pub fn is_incomplete(&self) -> bool {
        matches!(
            self,
            Self::UnterminatedEscapeSequence
                | Self::UnterminatedSingleQuote(..)
                | Self::UnterminatedDoubleQuote(..)
                | Self::UnterminatedBackquote(..)
                | Self::UnterminatedCommandSubstitution
                | Self::UnterminatedVariable
                | Self::UnterminatedExtendedGlob(..)
                | Self::UnterminatedHereDocuments(..)
        )
    }
}

/// Encapsulates a sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// Sequence of tokens.
    pub tokens: &'a [Token],
}

#[derive(Clone, Debug)]
enum QuoteMode {
    None,
    Single(SourcePosition),
    Double(SourcePosition),
}

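/// State machine for tracking here-documents across tokens. For input such as
/// `cat <<EOF` followed by body lines and a closing `EOF` tag, the state
/// advances: `None` -> `NextTokenIsHereTag` (on seeing `<<` or `<<-`) ->
/// `CurrentTokenIsHereTag` -> `NextLineIsHereDoc` -> `InHereDocs` -> back to
/// `None` once all pending here-document bodies have been consumed.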
#[derive(Clone, Debug, Default)]
enum HereState {
    /// In this state, we are not currently tracking any here-documents.
    #[default]
    None,
    /// In this state, we expect that the next token will be a here tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// In this state, the *current* token is a here tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        operator_token_result: TokenizeResult,
    },
    /// In this state, we expect that the *next line* will be the body of
    /// a here-document.
    NextLineIsHereDoc,
    /// In this state, we are in the set of lines that comprise 1 or more
    /// consecutive here-document bodies.
    InHereDocs,
}

#[derive(Clone, Debug)]
struct HereTag {
    tag: String,
    tag_was_escaped_or_quoted: bool,
    remove_tabs: bool,
    position: SourcePosition,
    tokens: Vec<TokenizeResult>,
    pending_tokens_after: Vec<TokenizeResult>,
}

#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Cursor within the overall token stream; used for error reporting.
    cursor: SourcePosition,
    /// Current state of parsing here-documents.
    here_state: HereState,
    /// Ordered queue of here tags for which we're still looking for matching here-document bodies.
    current_here_tags: Vec<HereTag>,
    /// Tokens already tokenized that should be used first to serve requests for tokens.
    queued_tokens: Vec<TokenizeResult>,
    /// Are we in an arithmetic expansion?
    arithmetic_expansion: bool,
}

/// Options controlling how the tokenizer operates.
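///
/// # Example
///
/// A sketch of overriding one option while keeping the rest at their defaults
/// (fenced as `ignore` since it assumes the type is visible to a doc test):
///
/// ```ignore
/// let options = TokenizerOptions {
///     posix_mode: true,
///     ..TokenizerOptions::default()
/// };
/// ```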
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether or not to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether or not to operate in POSIX compliance mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether or not we're running in SH emulation mode.
    pub sh_mode: bool,
}

impl Default for TokenizerOptions {
    fn default() -> Self {
        Self {
            enable_extended_globbing: true,
            posix_mode: false,
            sh_mode: false,
        }
    }
}

/// A tokenizer for shell scripts.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    cross_state: CrossTokenParseState,
    options: TokenizerOptions,
}

/// Encapsulates the current token parsing state.
#[derive(Clone, Debug)]
struct TokenParseState {
    pub start_position: SourcePosition,
    pub token_so_far: String,
    pub token_is_operator: bool,
    pub in_escape: bool,
    pub quote_mode: QuoteMode,
}

impl TokenParseState {
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If we don't have anything in the token, then don't yield an empty string token
        // *unless* it's the body of a here document.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // TODO: Make sure the here-tag meets criteria (and isn't a newline).
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // Don't yield the operator as a token yet. We need to make sure we collect
                // up everything we need for all the here-documents with tags on this line.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the trailing \n in the here tag so it's easier to check against.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We hit the end of the current here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // First queue the redirection operator and (start) here-tag.
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // Leave a hint that we are about to start a here-document.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // Then queue the document body we just finished.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Then queue up the (end) here-tag.
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Now we're ready to queue up any tokens that came between the completed
                // here tag and the next here tag (or the newline after it, if it was the last).
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}

/// Break the given input shell script string into tokens, returning the tokens.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
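///
/// # Example
///
/// A minimal sketch of expected usage; the module path shown is an assumption,
/// and the fence is `ignore` so the example is illustrative only:
///
/// ```ignore
/// use brush_parser::tokenizer::tokenize_str;
///
/// let tokens = tokenize_str("echo hello")?;
/// assert_eq!(tokens.len(), 2);
/// assert_eq!(tokens[0].to_str(), "echo");
/// assert_eq!(tokens[1].to_str(), "hello");
/// ```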
pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
    tokenize_str_with_options(input, &TokenizerOptions::default())
}

/// Break the given input shell script string into tokens, returning the tokens.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - Options controlling how the tokenizer operates.
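///
/// # Example
///
/// A sketch (same module-path assumption as above) that disables extended globbing:
///
/// ```ignore
/// use brush_parser::tokenizer::{tokenize_str_with_options, TokenizerOptions};
///
/// let options = TokenizerOptions {
///     enable_extended_globbing: false,
///     ..TokenizerOptions::default()
/// };
/// let tokens = tokenize_str_with_options("echo *.txt", &options)?;
/// assert_eq!(tokens.len(), 2);
/// ```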
pub fn tokenize_str_with_options(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_string(input.to_owned(), options.to_owned())
}

#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}

/// Break the given input shell script string into tokens, returning the tokens.
/// No caching is performed.
///
/// # Arguments
///
/// * `input` - The shell script to tokenize.
/// * `options` - Options controlling how the tokenizer operates.
pub fn uncached_tokenize_str(
    input: &str,
    options: &TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    let mut reader = std::io::BufReader::new(input.as_bytes());
    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);

    let mut tokens = vec![];
    loop {
        match tokenizer.next_token()? {
            TokenizeResult {
                token: Some(token), ..
            } => tokens.push(token),
            TokenizeResult {
                reason: TokenEndReason::EndOfInput,
                ..
            } => break,
            _ => (),
        }
    }

    Ok(tokens)
}

impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }

    #[allow(clippy::if_same_then_else)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // First satisfy token results from our queue. Once we exhaust the queue then
            // we'll look at the input stream.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            // When we hit the end of the input, then we're done with the current token
            // (if there is one).
            if next.is_none() {
                // TODO: Verify we're not waiting on some terminating character?
                // Verify we're out of all quotes.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // Verify we're not in a here document.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            //
            // Look for the specially specified terminating char.
            //
            } else if state.unquoted() && terminating_char == Some(c) {
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            //
            // Handle being in a here document.
            //
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                //
                // For now, just include the character in the current token. We also check
                // if there are leading tabs to be removed.
                //
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // Consume it but don't include it.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // See if this was a newline character following the terminating here tag.
                    if c == '\n' {
                        let next_here_tag = &self.cross_state.current_here_tags[0];
                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
                            unquote_str(next_here_tag.tag.as_str()).into()
                        } else {
                            next_here_tag.tag.as_str().into()
                        };

                        if let Some(current_token_without_here_tag) =
                            state.current_token().strip_suffix(tag_str.as_ref())
                        {
                            // Make sure that was either the start of the here document, or
                            // there was a newline between the preceding part and the tag.
                            if current_token_without_here_tag.is_empty()
                                || current_token_without_here_tag.ends_with('\n')
                            {
                                state.replace_with_here_doc(
                                    current_token_without_here_tag.to_owned(),
                                );

                                // Delimit the end of the here-document body.
                                result = state.delimit_current_token(
                                    TokenEndReason::HereDocumentBodyEnd,
                                    &mut self.cross_state,
                                )?;
                            }
                        }
                    }
                }
            } else if state.in_operator() {
                //
                // We're in an operator. See if this character continues an operator, or if it
                // must be a separate token (because it wouldn't make a prefix of an operator).
                //

                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    //
                    // N.B. If the completed operator indicates a here-document, then keep
                    // track that the *next* token should be the here-tag.
                    //
                    if self.cross_state.arithmetic_expansion {
                        // Nothing to do; we're in an arithmetic expansion so << and <<-
                        // are not here-docs, they're either a left-shift operator or
                        // a left-shift operator followed by a unary minus operator.
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            //
            // See if this is a character that changes the current escaping/quoting state.
            //
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    // Consume the backslash ourselves so we can peek past it.
                    self.consume_char()?;

                    if matches!(self.peek_char()?, Some('\n')) {
                        // Make sure the newline char gets consumed too.
                        self.consume_char()?;

                        // Make sure to include neither the backslash nor the newline character.
                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            //
            // Handle end of single-quote or double-quote.
            else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Single(_))
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(_))
                && c == '\"'
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            //
            // Handle end of escape sequence.
            // TODO: Handle double-quote specific escape sequences.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // TODO: handle quoted $ or ` in a double quote
                if c == '$' {
                    // Consume the '$' so we can peek beyond.
                    self.consume_char()?;

                    // Now peek beyond to see what we have.
                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Add the '$' we already consumed to the token.
                            state.append_char('$');

                            // Consume the '(' and add it to the token.
                            state.append_char(self.next_char()?.unwrap());

                            // Check to see if this is possibly an arithmetic expression
                            // (i.e., one that starts with `$((`).
                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                // Consume the second '(' and add it to the token.
                                state.append_char(self.next_char()?.unwrap());
                                // Keep track that we'll need to see *2* end parentheses
                                // to leave this construct.
                                required_end_parens = 2;
                                // Keep track that we're in an arithmetic expression, since
                                // some text will be interpreted differently as a result
                                // (e.g., << is a left shift operator and not a here doc
                                // input redirection operator).
                                self.cross_state.arithmetic_expansion = true;
                            }

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some(')'))?;

                                    // See if this is a here-document-related token we need to hold
                                    // onto until after we've seen all the tokens that need to show
                                    // up before we get to the body.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // If we encounter an embedded open parenthesis, then note
                                    // that we'll have to see the matching end to it before we
                                    // worry about the end of the containing construct.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We hit the ')' we were looking for. If this is the last
                                        // end parenthesis we needed to find, then we'll exit the
                                        // loop and consume and append it.
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // This wasn't the *last* end parenthesis char, so let's
                                        // consume and append it here before we loop around again.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Add the '$' we already consumed to the token.
                            state.append_char('$');

                            // Consume the '{' and add it to the token.
                            state.append_char(self.next_char()?.unwrap());

                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    let cur_token = self.next_token_until(Some('}'))?;

                                    // See if this is a here-document-related token we need to hold
                                    // onto until after we've seen all the tokens that need to show
                                    // up before we get to the body.
                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str())
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // We hit the end brace we were looking for but did not
                                        // yet consume it. Do so now.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable)
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // This is either a different character, or else the end of the string.
                            // Either way, add the '$' we already consumed to the token.
                            state.append_char('$');
                        }
                    }
                } else {
                    // We look for the terminating backquote. First disable normal consumption and
                    // consume the starting backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    // Add the opening backquote to the token.
                    state.append_char(c);

                    // Now continue until we see an unescaped backquote.
                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        // Read (and consume) the next char.
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            // Include it in the token no matter what.
                            state.append_char(cib);

                            // Watch out for escaping.
                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                // Look for an unescaped backquote to terminate.
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            //
            // [Extension]
            // If extended globbing is enabled, the last consumed character is an
            // unquoted start of an extglob pattern, *and* the current character
            // is an open parenthesis, then this begins an extglob pattern.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| self.can_start_extglob(x))
            {
                // Consume the '(' and append it.
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                // Keep consuming until we see the matching end ')'.
                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        // Include it in the token.
                        state.append_char(extglob_char);

                        // Look for ')' to terminate.
                        // TODO: handle escaping?
                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            //
            // If the character *can* start an operator, then it will.
            //
            } else if state.unquoted() && self.can_start_operator(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            //
            // Whitespace gets discarded (and delimits tokens).
            //
            } else if state.unquoted() && is_blank(c) {
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else {
                    // Make sure we don't include this char in the token range.
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            //
            // N.B. We need to remember if we were recursively called in a variable
            // expansion expression; in that case we won't think a token was started but...
            // we'd be wrong.
            else if !state.token_is_operator
                && (state.started_token() || matches!(terminating_char, Some('}')))
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Consume the '#'.
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            // Consume the peeked char; it's part of the comment.
                            self.consume_char()?;
                            false
                        }
                    };
                }
                // Re-start loop as if the comment never happened.
                continue;
            //
            // In all other cases where we have an in-progress token, we delimit here.
            //
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // If we got here, then we don't have a token in progress and we're not starting an
                // operator. Add the character to a new token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    fn can_start_extglob(&self, c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    fn can_start_operator(&self, c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    fn is_operator(&self, s: &str) -> bool {
        // Handle non-POSIX operators.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}

impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
    type Item = Result<TokenizeResult, TokenizerError>;

    fn next(&mut self) -> Option<Self::Item> {
        match self.next_token() {
            #[allow(clippy::manual_map)]
            Ok(result) => match result.token {
                Some(_) => Some(Ok(result)),
                None => None,
            },
            Err(e) => Some(Err(e)),
        }
    }
}

fn is_blank(c: char) -> bool {
    c == ' ' || c == '\t'
}

fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
    // If we're currently escaped, then nothing affects quoting.
    if state.in_escape {
        return false;
    }

    match state.quote_mode {
        // When we're in a double quote, only a subset of escape sequences are recognized.
        QuoteMode::Double(_) => {
            // TODO: handle backslash in double quote
            c == '\\'
        }
        // When we're in a single quote, nothing affects quoting.
        QuoteMode::Single(_) => false,
        // When we're not already in a quote, then we can straightforwardly look for a
        // quote mark or backslash.
        QuoteMode::None => is_quoting_char(c),
    }
}

fn is_quoting_char(c: char) -> bool {
    matches!(c, '\\' | '\'' | '\"')
}

/// Return a string with all the quoting removed.
///
/// # Arguments
///
/// * `s` - The string to unquote.
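///
/// # Example
///
/// A minimal sketch (fenced as `ignore`; assumes the function is visible to a
/// doc test):
///
/// ```ignore
/// assert_eq!(unquote_str(r#""hello""#), "hello");
/// assert_eq!(unquote_str(r"a\ b"), "a b");
/// ```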
pub fn unquote_str(s: &str) -> String {
    let mut result = String::new();

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            '\\' => in_escape = true,
            c if is_quoting_char(c) => (),
            c => result.push(c),
        }
    }

    result
}

#[cfg(test)]
mod tests {
    use super::*;
    use anyhow::Result;
    use assert_matches::assert_matches;
    use pretty_assertions::assert_eq;

    #[test]
    fn tokenize_empty() -> Result<()> {
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }

    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        let tokens = tokenize_str(
            r"a\
bc",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "abc"
        );
        Ok(())
    }

    #[test]
    fn tokenize_operators() -> Result<()> {
        assert_matches!(
            &tokenize_str("a>>b")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == ">>" &&
                t3.to_str() == "b"
        );
        Ok(())
    }

    #[test]
    fn tokenize_comment() -> Result<()> {
        let tokens = tokenize_str(
            r#"a #comment
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"a #comment"#)?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a"
        );
        Ok(())
    }

    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<HERE
HERE
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_here_doc() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<HERE
SOMETHING
HERE
echo after
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n" &&
                t7.to_str() == "echo" &&
                t8.to_str() == "after" &&
                t9.to_str() == "\n"
        );
        Ok(())
    }

    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        let tokens = tokenize_str(
            r#"cat <<-HERE
	SOMETHING
	HERE
"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<-" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }
1393
1394    #[test]
1395    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1396        let tokens = tokenize_str(
1397            r#"cat <<EOF | wc -l
1398A B C
13991 2 3
1400D E F
1401EOF
1402"#,
1403        )?;
1404        assert_matches!(
1405            &tokens[..],
1406            [t1 @ Token::Word(..),
1407             t2 @ Token::Operator(..),
1408             t3 @ Token::Word(..),
1409             t4 @ Token::Word(..),
1410             t5 @ Token::Word(..),
1411             t6 @ Token::Operator(..),
1412             t7 @ Token::Word(..),
1413             t8 @ Token::Word(..),
1414             t9 @ Token::Operator(..)] if
1415                t1.to_str() == "cat" &&
1416                t2.to_str() == "<<" &&
1417                t3.to_str() == "EOF" &&
1418                t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
1419                t5.to_str() == "EOF" &&
1420                t6.to_str() == "|" &&
1421                t7.to_str() == "wc" &&
1422                t8.to_str() == "-l" &&
1423                t9.to_str() == "\n"
1424        );
1425
1426        Ok(())
1427    }
1428
1429    #[test]
1430    fn tokenize_multiple_here_docs() -> Result<()> {
1431        let tokens = tokenize_str(
1432            r#"cat <<HERE1 <<HERE2
1433SOMETHING
1434HERE1
1435OTHER
1436HERE2
1437echo after
1438"#,
1439        )?;
1440        assert_matches!(
1441            &tokens[..],
1442            [t1 @ Token::Word(..),
1443             t2 @ Token::Operator(..),
1444             t3 @ Token::Word(..),
1445             t4 @ Token::Word(..),
1446             t5 @ Token::Word(..),
1447             t6 @ Token::Operator(..),
1448             t7 @ Token::Word(..),
1449             t8 @ Token::Word(..),
1450             t9 @ Token::Word(..),
1451             t10 @ Token::Operator(..),
1452             t11 @ Token::Word(..),
1453             t12 @ Token::Word(..),
1454             t13 @ Token::Operator(..)] if
1455                t1.to_str() == "cat" &&
1456                t2.to_str() == "<<" &&
1457                t3.to_str() == "HERE1" &&
1458                t4.to_str() == "SOMETHING\n" &&
1459                t5.to_str() == "HERE1" &&
1460                t6.to_str() == "<<" &&
1461                t7.to_str() == "HERE2" &&
1462                t8.to_str() == "OTHER\n" &&
1463                t9.to_str() == "HERE2" &&
1464                t10.to_str() == "\n" &&
1465                t11.to_str() == "echo" &&
1466                t12.to_str() == "after" &&
1467                t13.to_str() == "\n"
1468        );
1469        Ok(())
1470    }
1471
    #[test]
    fn tokenize_unterminated_here_doc() {
        let result = tokenize_str(
            r#"cat <<HERE
SOMETHING
"#,
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_missing_here_tag() {
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }

    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE
TEXT
HERE
)"#,
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
        );
        Ok(())
    }

    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"#,
        )?;
        // N.B. The pipe spacing in the reconstructed substitution comes out a
        // bit odd ("|wc"); as with the arithmetic spacing case below, later
        // stages are expected to process it okay.
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
        );
        Ok(())
    }

    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"echo `echo hi`"#)?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "`echo hi`"
        );
        Ok(())
    }

    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"echo `echo\`hi`")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == r"`echo\`hi`"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_backquote() {
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_command_substitution() {
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }

    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        assert_matches!(
            &tokenize_str("a$(echo hi)b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$(echo hi)b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }
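
    // Hedged sketch, not from the original suite: a nested $(...) should stay
    // together as one word. This assumes the tokenizer balances parentheses
    // across nesting, as it already does for extglobs and here-docs inside
    // command substitutions.
    #[test]
    fn tokenize_nested_command_substitution() -> Result<()> {
        assert_matches!(
            &tokenize_str("echo $(echo $(echo hi))")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(echo $(echo hi))"
        );
        Ok(())
    }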

    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        assert_matches!(
            &tokenize_str("echo $(echo !(x))")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(echo !(x))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        assert_matches!(
            &tokenize_str("a$((1+2))b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$((1+2))b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // N.B. The spacing comes out a bit odd, but it gets processed okay
        // by later stages.
        assert_matches!(
            &tokenize_str("$(( 1 ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$((1 ))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        assert_matches!(
            &tokenize_str("$(( (0) ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$(((0)))"
        );
        Ok(())
    }

    #[test]
    fn tokenize_special_parameters() -> Result<()> {
        assert_matches!(
            &tokenize_str("$$")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$$"
        );
        assert_matches!(
            &tokenize_str("$@")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$@"
        );
        assert_matches!(
            &tokenize_str("$!")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$!"
        );
        assert_matches!(
            &tokenize_str("$?")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$?"
        );
        assert_matches!(
            &tokenize_str("$*")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$*"
        );
        Ok(())
    }
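
    // Hedged sketch, not from the original suite: `$#` is also a special
    // parameter. Assuming `#` does not start a comment mid-word, it should
    // tokenize as a single word like the cases above.
    #[test]
    fn tokenize_special_parameter_hash() -> Result<()> {
        assert_matches!(
            &tokenize_str("$#")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$#"
        );
        Ok(())
    }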

    #[test]
    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
        assert_matches!(
            &tokenize_str("$x")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "$x"
        );
        assert_matches!(
            &tokenize_str("a$x")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
        );
        Ok(())
    }

    #[test]
    fn tokenize_unterminated_parameter_expansion() {
        assert_matches!(
            tokenize_str("${x"),
            Err(TokenizerError::UnterminatedVariable)
        );
    }

    #[test]
    fn tokenize_braced_parameter_expansion() -> Result<()> {
        assert_matches!(
            &tokenize_str("${x}")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
        );
        assert_matches!(
            &tokenize_str("a${x}b")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
        );
        Ok(())
    }

    #[test]
    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"a${x\}}b")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
        );
        Ok(())
    }
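
    // Hedged sketch, not from the original suite: a default-value expansion
    // that nests another braced expansion should remain one word, assuming
    // the tokenizer tracks brace depth rather than stopping at the first `}`.
    #[test]
    fn tokenize_nested_braced_parameter_expansion() -> Result<()> {
        assert_matches!(
            &tokenize_str("${x:-${y}}")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "${x:-${y}}"
        );
        Ok(())
    }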

    #[test]
    fn tokenize_whitespace() -> Result<()> {
        assert_matches!(
            &tokenize_str("1 2 3")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
                t1.to_str() == "1" &&
                t2.to_str() == "2" &&
                t3.to_str() == "3"
        );
        Ok(())
    }
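
    // Hedged sketch, not from the original suite: tabs are non-newline blanks
    // in POSIX-style shells, so they should separate words just as spaces do.
    #[test]
    fn tokenize_tab_whitespace() -> Result<()> {
        assert_matches!(
            &tokenize_str("1\t2")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "1" &&
                t2.to_str() == "2"
        );
        Ok(())
    }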

    #[test]
    fn tokenize_escaped_whitespace() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"1\ 2 3")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == r"1\ 2" &&
                t2.to_str() == "3"
        );
        Ok(())
    }

    #[test]
    fn tokenize_single_quote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r"x'a b'y")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r"x'a b'y"
        );
        Ok(())
    }

    #[test]
    fn tokenize_double_quote() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"a b"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"a b"y"#
        );
        Ok(())
    }
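
    // Hedged sketches, not from the original suite: these mirror
    // tokenize_unterminated_backquote, assuming quotes left open at the end
    // of input surface as the corresponding TokenizerError variants.
    #[test]
    fn tokenize_unterminated_single_quote() {
        assert_matches!(
            tokenize_str("'hi"),
            Err(TokenizerError::UnterminatedSingleQuote(_))
        );
    }

    #[test]
    fn tokenize_unterminated_double_quote() {
        assert_matches!(
            tokenize_str("\"hi"),
            Err(TokenizerError::UnterminatedDoubleQuote(_))
        );
    }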

    #[test]
    fn tokenize_double_quoted_command_substitution() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"$(echo hi)"y"#
        );
        Ok(())
    }

    #[test]
    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
        assert_matches!(
            &tokenize_str(r#"x"$((1+2))"y"#)?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == r#"x"$((1+2))"y"#
        );
        Ok(())
    }

    #[test]
    fn test_quote_removal() {
        assert_eq!(unquote_str(r#""hello""#), "hello");
        assert_eq!(unquote_str(r#"'hello'"#), "hello");
        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
        assert_eq!(unquote_str(r#"'hel\'lo'"#), r#"hel'lo"#);
    }
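
    // Hedged sketch, not from the original suite: unquote_str is assumed to
    // pass input without any quotes through unchanged.
    #[test]
    fn test_quote_removal_on_unquoted_input() {
        assert_eq!(unquote_str("hello"), "hello");
    }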
}