brush_parser/
tokenizer.rs

1use std::borrow::Cow;
2use std::fmt::Display;
3use utf8_chars::BufReadCharsExt;
4
/// Describes why the tokenizer stopped accumulating the current token.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// End of input was reached.
    EndOfInput,
    /// An unescaped newline char was reached.
    UnescapedNewLine,
    /// Specified terminating char.
    SpecifiedTerminatingChar,
    /// A non-newline blank char was reached.
    NonNewLineBlank,
    /// A here-document's body is starting.
    HereDocumentBodyStart,
    /// A here-document's body was terminated.
    HereDocumentBodyEnd,
    /// A here-document's end tag was reached.
    HereDocumentEndTag,
    /// An operator was started.
    OperatorStart,
    /// An operator was terminated.
    OperatorEnd,
    /// Some other condition was reached.
    Other,
}
29
/// Represents a position in a source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// The 0-based index of the character in the input stream.
    pub index: i32,
    /// The 1-based line number.
    pub line: i32,
    /// The 1-based column number.
    pub column: i32,
}
41
42impl Display for SourcePosition {
43    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44        f.write_fmt(format_args!("line {} col {}", self.line, self.column))
45    }
46}
47
/// Represents the location of a token in its source shell script.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// The start position of the token.
    pub start: SourcePosition,
    /// The end position of the token (exclusive).
    pub end: SourcePosition,
}
57
/// Represents a token extracted from a shell script.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token; carries the operator text and its source location.
    Operator(String, TokenLocation),
    /// A word token; carries the word text and its source location.
    Word(String, TokenLocation),
}
67
68impl Token {
69    /// Returns the string value of the token.
70    pub fn to_str(&self) -> &str {
71        match self {
72            Token::Operator(s, _) => s,
73            Token::Word(s, _) => s,
74        }
75    }
76
77    /// Returns the location of the token in the source script.
78    pub fn location(&self) -> &TokenLocation {
79        match self {
80            Token::Operator(_, l) => l,
81            Token::Word(_, l) => l,
82        }
83    }
84}
85
/// Encapsulates the result of tokenizing a shell script.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Reason for tokenization ending.
    pub reason: TokenEndReason,
    /// The token that was extracted, if any; `None` when the end reason
    /// alone is the meaningful outcome (e.g., end of input).
    pub token: Option<Token>,
}
94
/// Represents an error that occurred during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// An unterminated escape sequence was encountered at the end of the input stream.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// An unterminated single-quoted substring was encountered at the end of the input stream.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// An unterminated double-quoted substring was encountered at the end of the input stream.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// An unterminated back-quoted substring was encountered at the end of the input stream.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An unterminated extended glob (extglob) pattern was encountered at the end of the input
    /// stream.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// An unterminated variable expression was encountered at the end of the input stream.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// An unterminated command substitution was encountered at the end of the input stream.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// An error occurred decoding UTF-8 characters in the input stream.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here tag was missing for a here-document body.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The indicated here tag was missing.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// An unterminated here document sequence was encountered at the end of the input stream.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An I/O error occurred while reading from the input stream.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
147
148impl TokenizerError {
149    /// Returns true if the error represents an error that could possibly be due
150    /// to an incomplete input stream.
151    pub fn is_incomplete(&self) -> bool {
152        matches!(
153            self,
154            Self::UnterminatedEscapeSequence
155                | Self::UnterminatedSingleQuote(..)
156                | Self::UnterminatedDoubleQuote(..)
157                | Self::UnterminatedBackquote(..)
158                | Self::UnterminatedCommandSubstitution
159                | Self::UnterminatedVariable
160                | Self::UnterminatedExtendedGlob(..)
161                | Self::UnterminatedHereDocuments(..)
162        )
163    }
164}
165
/// Encapsulates a borrowed sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// Sequence of tokens.
    pub tokens: &'a [Token],
}
172
/// Tracks the current quoting context during tokenization.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not inside any quoted region.
    None,
    /// Inside a single-quoted region; carries the position of the opening quote
    /// (used for error reporting if the quote is never closed).
    Single(SourcePosition),
    /// Inside a double-quoted region; carries the position of the opening quote.
    Double(SourcePosition),
}
179
/// State machine tracking here-document parsing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// In this state, we are not currently tracking any here-documents.
    #[default]
    None,
    /// In this state, we expect that the next token will be a here tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// In this state, the *current* token is a here tag.
    CurrentTokenIsHereTag {
        remove_tabs: bool,
        // The redirection operator's token result, held back until the
        // here-document machinery is ready to queue it.
        operator_token_result: TokenizeResult,
    },
    /// In this state, we expect that the *next line* will be the body of
    /// a here-document.
    NextLineIsHereDoc,
    /// In this state, we are in the set of lines that comprise 1 or more
    /// consecutive here-document bodies.
    InHereDocs,
}
199
/// Tracks a here-document tag whose body has not yet been fully consumed.
#[derive(Clone, Debug)]
struct HereTag {
    /// The tag text, stored with a trailing newline to simplify matching
    /// against the end of the accumulated body.
    tag: String,
    /// Whether the tag, as written, contained quoting/escaping characters.
    tag_was_escaped_or_quoted: bool,
    /// Whether leading tabs should be stripped from body lines (`<<-` form).
    remove_tabs: bool,
    /// Source position of the tag; used for error reporting.
    position: SourcePosition,
    /// Token results (redirection operator + tag) to queue once the body ends.
    tokens: Vec<TokenizeResult>,
    /// Token results seen after this tag but before its body completed.
    pending_tokens_after: Vec<TokenizeResult>,
}
209
/// Parse state that persists across individual token parses.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Cursor within the overall token stream; used for error reporting.
    cursor: SourcePosition,
    /// Current state of parsing here-documents.
    here_state: HereState,
    /// Ordered queue of here tags for which we're still looking for matching here-document bodies.
    current_here_tags: Vec<HereTag>,
    /// Tokens already tokenized that should be used first to serve requests for tokens.
    queued_tokens: Vec<TokenizeResult>,
    /// Are we in an arithmetic expansion? (Affects interpretation of `<<`,
    /// which is a left-shift operator there rather than a here-doc redirect.)
    arithmetic_expansion: bool,
}
223
/// Options controlling how the tokenizer operates.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether or not to enable extended globbing patterns (extglob).
    pub enable_extended_globbing: bool,
    /// Whether or not to operate in POSIX compliance mode.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether or not we're running in SH emulation mode.
    pub sh_mode: bool,
}
235
236impl Default for TokenizerOptions {
237    fn default() -> Self {
238        Self {
239            enable_extended_globbing: true,
240            posix_mode: false,
241            sh_mode: false,
242        }
243    }
244}
245
/// A tokenizer for shell scripts.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    /// Peekable stream of UTF-8 characters decoded from the underlying reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    /// State persisting across token parses (cursor, here-docs, queued tokens).
    cross_state: CrossTokenParseState,
    /// Options controlling tokenizer behavior.
    options: TokenizerOptions,
}
252
/// Encapsulates the current token parsing state.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Source position at which the current token started.
    pub start_position: SourcePosition,
    /// Text accumulated so far for the current token.
    pub token_so_far: String,
    /// Whether the accumulated text is being parsed as an operator.
    pub token_is_operator: bool,
    /// Whether we are immediately after an unprocessed backslash escape.
    pub in_escape: bool,
    /// Current quoting context (none, single-quote, or double-quote).
    pub quote_mode: QuoteMode,
}
262
impl TokenParseState {
    /// Creates an empty parse state whose next token starts at `start_position`.
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes and returns the accumulated token, resetting this state so it
    /// can begin accumulating a new token starting at `end_position`.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        // Taking the operator flag and accumulated text also resets them
        // for the next token.
        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any characters have been accumulated for the current token.
    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the token being accumulated.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the token being accumulated.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns true if we're neither mid-escape nor inside any quotes.
    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far for the current token.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns true if the current token is an operator equal to `operator`.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns true if the current token is being parsed as an operator.
    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns true if the accumulated token is exactly a newline character.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated token text wholesale; used to swap in a
    /// here-document body once its terminating tag has been stripped off.
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Completes the current token for the given `reason`, updating
    /// here-document tracking in `cross_token_state` as needed. Returns
    /// `Ok(None)` when the token was captured into here-document bookkeeping
    /// (to be queued and yielded later) rather than yielded immediately.
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // If we don't have anything in the token, then don't yield an empty string token
        // *unless* it's the body of a here document.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // TODO: Make sure the here-tag meets criteria (and isn't a newline).
        // N.B. `take` resets here_state to None; each arm below sets the
        // successor state explicitly.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // Don't yield the operator as a token yet. We need to make sure we collect
                // up everything we need for all the here-documents with tags on this line.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A bare newline here means the tag was never provided.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Include the trailing \n in the here tag so it's easier to check against.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                // Hold back the operator and tag tokens; they get queued once
                // this here-document's body is complete.
                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                // A newline signals the start of the here-document body on the
                // next line; until then, stay in this state.
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                // Tokens between the tag and the body start are parked on the
                // most recent here tag, to be re-queued after its body.
                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // We hit the end of the current here-document.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // First queue the redirection operator and (start) here-tag.
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                // Leave a hint that we are about to start a here-document.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // Then queue the body document we just finished.
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Then queue up the (end) here-tag.
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Now we're ready to queue up any tokens that came between the completed
                // here tag and the next here tag (or newline after it if it was the last).
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Not in a here-document: yield the token directly.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
468
469/// Break the given input shell script string into tokens, returning the tokens.
470///
471/// # Arguments
472///
473/// * `input` - The shell script to tokenize.
474pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
475    tokenize_str_with_options(input, &TokenizerOptions::default())
476}
477
478/// Break the given input shell script string into tokens, returning the tokens.
479///
480/// # Arguments
481///
482/// * `input` - The shell script to tokenize.
483/// * `options` - Options controlling how the tokenizer operates.
484pub fn tokenize_str_with_options(
485    input: &str,
486    options: &TokenizerOptions,
487) -> Result<Vec<Token>, TokenizerError> {
488    uncached_tokenize_string(input.to_owned(), options.to_owned())
489}
490
// Memoizes tokenization results keyed on (input, options), retaining up to 64
// recently used entries in the generated "TOKENIZE_CACHE" cache; only Ok
// results are cached (result = true).
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}
498
499/// Break the given input shell script string into tokens, returning the tokens.
500/// No caching is performed.
501///
502/// # Arguments
503///
504/// * `input` - The shell script to tokenize.
505pub fn uncached_tokenize_str(
506    input: &str,
507    options: &TokenizerOptions,
508) -> Result<Vec<Token>, TokenizerError> {
509    let mut reader = std::io::BufReader::new(input.as_bytes());
510    let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
511
512    let mut tokens = vec![];
513    loop {
514        match tokenizer.next_token()? {
515            TokenizeResult {
516                token: Some(token), ..
517            } => tokens.push(token),
518            TokenizeResult {
519                reason: TokenEndReason::EndOfInput,
520                ..
521            } => break,
522            _ => (),
523        }
524    }
525
526    Ok(tokens)
527}
528
529impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
530    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
531        Tokenizer {
532            options: options.clone(),
533            char_reader: reader.chars().peekable(),
534            cross_state: CrossTokenParseState {
535                cursor: SourcePosition {
536                    index: 0,
537                    line: 1,
538                    column: 1,
539                },
540                here_state: HereState::None,
541                current_here_tags: vec![],
542                queued_tokens: vec![],
543                arithmetic_expansion: false,
544            },
545        }
546    }
547
    /// Returns the tokenizer's current position in the input stream.
    #[allow(clippy::unnecessary_wraps)]
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }
552
553    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
554        let c = self
555            .char_reader
556            .next()
557            .transpose()
558            .map_err(TokenizerError::ReadError)?;
559
560        if let Some(ch) = c {
561            if ch == '\n' {
562                self.cross_state.cursor.line += 1;
563                self.cross_state.cursor.column = 1;
564            } else {
565                self.cross_state.cursor.column += 1;
566            }
567            self.cross_state.cursor.index += 1;
568        }
569
570        Ok(c)
571    }
572
573    fn consume_char(&mut self) -> Result<(), TokenizerError> {
574        let _ = self.next_char()?;
575        Ok(())
576    }
577
578    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
579        match self.char_reader.peek() {
580            Some(result) => match result {
581                Ok(c) => Ok(Some(*c)),
582                Err(_) => Err(TokenizerError::FailedDecoding),
583            },
584            None => Ok(None),
585        }
586    }
587
    /// Returns the next token result from the input stream, with no special
    /// terminating character in effect.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }
591
592    #[allow(clippy::if_same_then_else)]
593    #[allow(clippy::too_many_lines)]
594    #[allow(clippy::unwrap_in_result)]
595    #[allow(clippy::panic_in_result_fn)]
596    fn next_token_until(
597        &mut self,
598        terminating_char: Option<char>,
599    ) -> Result<TokenizeResult, TokenizerError> {
600        let mut state = TokenParseState::new(&self.cross_state.cursor);
601        let mut result: Option<TokenizeResult> = None;
602
603        while result.is_none() {
604            // First satisfy token results from our queue. Once we exhaust the queue then
605            // we'll look at the input stream.
606            if !self.cross_state.queued_tokens.is_empty() {
607                return Ok(self.cross_state.queued_tokens.remove(0));
608            }
609
610            let next = self.peek_char()?;
611            let c = next.unwrap_or('\0');
612
613            // When we hit the end of the input, then we're done with the current token (if there is
614            // one).
615            if next.is_none() {
616                // TODO: Verify we're not waiting on some terminating character?
617                // Verify we're out of all quotes.
618                if state.in_escape {
619                    return Err(TokenizerError::UnterminatedEscapeSequence);
620                }
621                match state.quote_mode {
622                    QuoteMode::None => (),
623                    QuoteMode::Single(pos) => {
624                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
625                    }
626                    QuoteMode::Double(pos) => {
627                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
628                    }
629                }
630
631                // Verify we're not in a here document.
632                if !matches!(self.cross_state.here_state, HereState::None) {
633                    let tag_names = self
634                        .cross_state
635                        .current_here_tags
636                        .iter()
637                        .map(|tag| tag.tag.trim())
638                        .collect::<Vec<_>>()
639                        .join(", ");
640                    let tag_positions = self
641                        .cross_state
642                        .current_here_tags
643                        .iter()
644                        .map(|tag| std::format!("{}", tag.position))
645                        .collect::<Vec<_>>()
646                        .join(", ");
647                    return Err(TokenizerError::UnterminatedHereDocuments(
648                        tag_names,
649                        tag_positions,
650                    ));
651                }
652
653                result = state
654                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
655            //
656            // Look for the specially specified terminating char.
657            //
658            } else if state.unquoted() && terminating_char == Some(c) {
659                result = state.delimit_current_token(
660                    TokenEndReason::SpecifiedTerminatingChar,
661                    &mut self.cross_state,
662                )?;
663            //
664            // Handle being in a here document.
665            //
666            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
667                //
668                // For now, just include the character in the current token. We also check
669                // if there are leading tabs to be removed.
670                //
671                if !self.cross_state.current_here_tags.is_empty()
672                    && self.cross_state.current_here_tags[0].remove_tabs
673                    && (!state.started_token() || state.current_token().ends_with('\n'))
674                    && c == '\t'
675                {
676                    // Consume it but don't include it.
677                    self.consume_char()?;
678                } else {
679                    self.consume_char()?;
680                    state.append_char(c);
681
682                    // See if this was a newline character following the terminating here tag.
683                    if c == '\n' {
684                        let next_here_tag = &self.cross_state.current_here_tags[0];
685                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
686                            unquote_str(next_here_tag.tag.as_str()).into()
687                        } else {
688                            next_here_tag.tag.as_str().into()
689                        };
690
691                        if let Some(current_token_without_here_tag) =
692                            state.current_token().strip_suffix(tag_str.as_ref())
693                        {
694                            // Make sure that was either the start of the here document, or there
695                            // was a newline between the preceding part
696                            // and the tag.
697                            if current_token_without_here_tag.is_empty()
698                                || current_token_without_here_tag.ends_with('\n')
699                            {
700                                state.replace_with_here_doc(
701                                    current_token_without_here_tag.to_owned(),
702                                );
703
704                                // Delimit the end of the here-document body.
705                                result = state.delimit_current_token(
706                                    TokenEndReason::HereDocumentBodyEnd,
707                                    &mut self.cross_state,
708                                )?;
709                            }
710                        }
711                    }
712                }
713            } else if state.in_operator() {
714                //
715                // We're in an operator. See if this character continues an operator, or if it
716                // must be a separate token (because it wouldn't make a prefix of an operator).
717                //
718
719                let mut hypothetical_token = state.current_token().to_owned();
720                hypothetical_token.push(c);
721
722                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
723                    self.consume_char()?;
724                    state.append_char(c);
725                } else {
726                    assert!(state.started_token());
727
728                    //
729                    // N.B. If the completed operator indicates a here-document, then keep
730                    // track that the *next* token should be the here-tag.
731                    //
732                    if self.cross_state.arithmetic_expansion {
733                        // Nothing to do; we're in an arithmetic expansion so << and <<-
734                        // are not here-docs, they're either a left-shift operator or
735                        // a left-shift operator followed by a unary minus operator.
736                    } else if state.is_specific_operator("<<") {
737                        self.cross_state.here_state =
738                            HereState::NextTokenIsHereTag { remove_tabs: false };
739                    } else if state.is_specific_operator("<<-") {
740                        self.cross_state.here_state =
741                            HereState::NextTokenIsHereTag { remove_tabs: true };
742                    }
743
744                    let reason = if state.current_token() == "\n" {
745                        TokenEndReason::UnescapedNewLine
746                    } else {
747                        TokenEndReason::OperatorEnd
748                    };
749
750                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
751                }
752            //
753            // See if this is a character that changes the current escaping/quoting state.
754            //
755            } else if does_char_newly_affect_quoting(&state, c) {
756                if c == '\\' {
757                    // Consume the backslash ourselves so we can peek past it.
758                    self.consume_char()?;
759
760                    if matches!(self.peek_char()?, Some('\n')) {
761                        // Make sure the newline char gets consumed too.
762                        self.consume_char()?;
763
764                        // Make sure to include neither the backslash nor the newline character.
765                    } else {
766                        state.in_escape = true;
767                        state.append_char(c);
768                    }
769                } else if c == '\'' {
770                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
771                    self.consume_char()?;
772                    state.append_char(c);
773                } else if c == '\"' {
774                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
775                    self.consume_char()?;
776                    state.append_char(c);
777                }
778            }
779            //
780            // Handle end of single-quote or double-quote.
781            else if !state.in_escape
782                && matches!(state.quote_mode, QuoteMode::Single(_))
783                && c == '\''
784            {
785                state.quote_mode = QuoteMode::None;
786                self.consume_char()?;
787                state.append_char(c);
788            } else if !state.in_escape
789                && matches!(state.quote_mode, QuoteMode::Double(_))
790                && c == '\"'
791            {
792                state.quote_mode = QuoteMode::None;
793                self.consume_char()?;
794                state.append_char(c);
795            }
796            //
797            // Handle end of escape sequence.
798            // TODO: Handle double-quote specific escape sequences.
799            else if state.in_escape {
800                state.in_escape = false;
801                self.consume_char()?;
802                state.append_char(c);
803            } else if (state.unquoted()
804                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
805                && (c == '$' || c == '`')
806            {
807                // TODO: handle quoted $ or ` in a double quote
808                if c == '$' {
809                    // Consume the '$' so we can peek beyond.
810                    self.consume_char()?;
811
812                    // Now peek beyond to see what we have.
813                    let char_after_dollar_sign = self.peek_char()?;
814                    match char_after_dollar_sign {
815                        Some('(') => {
816                            // Add the '$' we already consumed to the token.
817                            state.append_char('$');
818
819                            // Consume the '(' and add it to the token.
820                            state.append_char(self.next_char()?.unwrap());
821
822                            // Check to see if this is possibly an arithmetic expression
823                            // (i.e., one that starts with `$((`).
824                            let mut required_end_parens = 1;
825                            if matches!(self.peek_char()?, Some('(')) {
826                                // Consume the second '(' and add it to the token.
827                                state.append_char(self.next_char()?.unwrap());
828                                // Keep track that we'll need to see *2* end parentheses
829                                // to leave this construct.
830                                required_end_parens = 2;
831                                // Keep track that we're in an arithmetic expression, since
832                                // some text will be interpreted differently as a result
833                                // (e.g., << is a left shift operator and not a here doc
834                                // input redirection operator).
835                                self.cross_state.arithmetic_expansion = true;
836                            }
837
838                            let mut pending_here_doc_tokens = vec![];
839                            let mut drain_here_doc_tokens = false;
840
841                            loop {
842                                let cur_token = if drain_here_doc_tokens
843                                    && !pending_here_doc_tokens.is_empty()
844                                {
845                                    if pending_here_doc_tokens.len() == 1 {
846                                        drain_here_doc_tokens = false;
847                                    }
848
849                                    pending_here_doc_tokens.remove(0)
850                                } else {
851                                    let cur_token = self.next_token_until(Some(')'))?;
852
853                                    // See if this is a here-document-related token we need to hold
854                                    // onto until after we've seen all the tokens that need to show
855                                    // up before we get to the body.
856                                    if matches!(
857                                        cur_token.reason,
858                                        TokenEndReason::HereDocumentBodyStart
859                                            | TokenEndReason::HereDocumentBodyEnd
860                                            | TokenEndReason::HereDocumentEndTag
861                                    ) {
862                                        pending_here_doc_tokens.push(cur_token);
863                                        continue;
864                                    }
865
866                                    cur_token
867                                };
868
869                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
870                                    && !pending_here_doc_tokens.is_empty()
871                                {
872                                    pending_here_doc_tokens.push(cur_token);
873                                    drain_here_doc_tokens = true;
874                                    continue;
875                                }
876
877                                if let Some(cur_token_value) = cur_token.token {
878                                    state.append_str(cur_token_value.to_str());
879
880                                    // If we encounter an embedded open parenthesis, then note that
881                                    // we'll have to see the matching end to it before we worry
882                                    // about the end of the
883                                    // containing construct.
884                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
885                                    {
886                                        required_end_parens += 1;
887                                    }
888                                }
889
890                                match cur_token.reason {
891                                    TokenEndReason::HereDocumentBodyStart => {
892                                        state.append_char('\n');
893                                    }
894                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
895                                    TokenEndReason::SpecifiedTerminatingChar => {
896                                        // We hit the ')' we were looking for. If this is the last
897                                        // end parenthesis we needed to find, then we'll exit the
898                                        // loop and consume
899                                        // and append it.
900                                        required_end_parens -= 1;
901                                        if required_end_parens == 0 {
902                                            break;
903                                        }
904
905                                        // This wasn't the *last* end parenthesis char, so let's
906                                        // consume and append it here before we loop around again.
907                                        state.append_char(self.next_char()?.unwrap());
908                                    }
909                                    TokenEndReason::EndOfInput => {
910                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
911                                    }
912                                    _ => (),
913                                }
914                            }
915
916                            self.cross_state.arithmetic_expansion = false;
917
918                            state.append_char(self.next_char()?.unwrap());
919                        }
920
921                        Some('{') => {
922                            // Add the '$' we already consumed to the token.
923                            state.append_char('$');
924
925                            // Consume the '{' and add it to the token.
926                            state.append_char(self.next_char()?.unwrap());
927
928                            let mut pending_here_doc_tokens = vec![];
929                            let mut drain_here_doc_tokens = false;
930
931                            loop {
932                                let cur_token = if drain_here_doc_tokens
933                                    && !pending_here_doc_tokens.is_empty()
934                                {
935                                    if pending_here_doc_tokens.len() == 1 {
936                                        drain_here_doc_tokens = false;
937                                    }
938
939                                    pending_here_doc_tokens.remove(0)
940                                } else {
941                                    let cur_token = self.next_token_until(Some('}'))?;
942
943                                    // See if this is a here-document-related token we need to hold
944                                    // onto until after we've seen all the tokens that need to show
945                                    // up before we get to the body.
946                                    if matches!(
947                                        cur_token.reason,
948                                        TokenEndReason::HereDocumentBodyStart
949                                            | TokenEndReason::HereDocumentBodyEnd
950                                            | TokenEndReason::HereDocumentEndTag
951                                    ) {
952                                        pending_here_doc_tokens.push(cur_token);
953                                        continue;
954                                    }
955
956                                    cur_token
957                                };
958
959                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
960                                    && !pending_here_doc_tokens.is_empty()
961                                {
962                                    pending_here_doc_tokens.push(cur_token);
963                                    drain_here_doc_tokens = true;
964                                    continue;
965                                }
966
967                                if let Some(cur_token_value) = cur_token.token {
968                                    state.append_str(cur_token_value.to_str());
969                                }
970
971                                match cur_token.reason {
972                                    TokenEndReason::HereDocumentBodyStart => {
973                                        state.append_char('\n');
974                                    }
975                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
976                                    TokenEndReason::SpecifiedTerminatingChar => {
977                                        // We hit the end brace we were looking for but did not
978                                        // yet consume it. Do so now.
979                                        state.append_char(self.next_char()?.unwrap());
980                                        break;
981                                    }
982                                    TokenEndReason::EndOfInput => {
983                                        return Err(TokenizerError::UnterminatedVariable)
984                                    }
985                                    _ => (),
986                                }
987                            }
988                        }
989                        _ => {
990                            // This is either a different character, or else the end of the string.
991                            // Either way, add the '$' we already consumed to the token.
992                            state.append_char('$');
993                        }
994                    }
995                } else {
996                    // We look for the terminating backquote. First disable normal consumption and
997                    // consume the starting backquote.
998                    let backquote_pos = self.cross_state.cursor.clone();
999                    self.consume_char()?;
1000
1001                    // Add the opening backquote to the token.
1002                    state.append_char(c);
1003
1004                    // Now continue until we see an unescaped backquote.
1005                    let mut escaping_enabled = false;
1006                    let mut done = false;
1007                    while !done {
1008                        // Read (and consume) the next char.
1009                        let next_char_in_backquote = self.next_char()?;
1010                        if let Some(cib) = next_char_in_backquote {
1011                            // Include it in the token no matter what.
1012                            state.append_char(cib);
1013
1014                            // Watch out for escaping.
1015                            if !escaping_enabled && cib == '\\' {
1016                                escaping_enabled = true;
1017                            } else {
1018                                // Look for an unescaped backquote to terminate.
1019                                if !escaping_enabled && cib == '`' {
1020                                    done = true;
1021                                }
1022                                escaping_enabled = false;
1023                            }
1024                        } else {
1025                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
1026                        }
1027                    }
1028                }
1029            }
1030            //
1031            // [Extension]
1032            // If extended globbing is enabled, the last consumed character is an
1033            // unquoted start of an extglob pattern, *and* if the current character
1034            // is an open parenthesis, then this begins an extglob pattern.
1035            else if c == '('
1036                && self.options.enable_extended_globbing
1037                && state.unquoted()
1038                && !state.in_operator()
1039                && state
1040                    .current_token()
1041                    .ends_with(|x| Self::can_start_extglob(x))
1042            {
1043                // Consume the '(' and append it.
1044                self.consume_char()?;
1045                state.append_char(c);
1046
1047                let mut paren_depth = 1;
1048
1049                // Keep consuming until we see the matching end ')'.
1050                while paren_depth > 0 {
1051                    if let Some(extglob_char) = self.next_char()? {
1052                        // Include it in the token.
1053                        state.append_char(extglob_char);
1054
1055                        // Look for ')' to terminate.
1056                        // TODO: handle escaping?
1057                        if extglob_char == '(' {
1058                            paren_depth += 1;
1059                        } else if extglob_char == ')' {
1060                            paren_depth -= 1;
1061                        }
1062                    } else {
1063                        return Err(TokenizerError::UnterminatedExtendedGlob(
1064                            self.cross_state.cursor.clone(),
1065                        ));
1066                    }
1067                }
1068            //
1069            // If the character *can* start an operator, then it will.
1070            //
1071            } else if state.unquoted() && Self::can_start_operator(c) {
1072                if state.started_token() {
1073                    result = state.delimit_current_token(
1074                        TokenEndReason::OperatorStart,
1075                        &mut self.cross_state,
1076                    )?;
1077                } else {
1078                    state.token_is_operator = true;
1079                    self.consume_char()?;
1080                    state.append_char(c);
1081                }
1082            //
1083            // Whitespace gets discarded (and delimits tokens).
1084            //
1085            } else if state.unquoted() && is_blank(c) {
1086                if state.started_token() {
1087                    result = state.delimit_current_token(
1088                        TokenEndReason::NonNewLineBlank,
1089                        &mut self.cross_state,
1090                    )?;
1091                } else {
1092                    // Make sure we don't include this char in the token range.
1093                    state.start_position.column += 1;
1094                    state.start_position.index += 1;
1095                }
1096
1097                self.consume_char()?;
1098            }
1099            //
1100            // N.B. We need to remember if we were recursively called in a variable
1101            // expansion expression; in that case we won't think a token was started but...
1102            // we'd be wrong.
1103            else if !state.token_is_operator
1104                && (state.started_token() || matches!(terminating_char, Some('}')))
1105            {
1106                self.consume_char()?;
1107                state.append_char(c);
1108            } else if c == '#' {
1109                // Consume the '#'.
1110                self.consume_char()?;
1111
1112                let mut done = false;
1113                while !done {
1114                    done = match self.peek_char()? {
1115                        Some('\n') => true,
1116                        None => true,
1117                        _ => {
1118                            // Consume the peeked char; it's part of the comment.
1119                            self.consume_char()?;
1120                            false
1121                        }
1122                    };
1123                }
1124                // Re-start loop as if the comment never happened.
1125            } else if state.started_token() {
1126                // In all other cases where we have an in-progress token, we delimit here.
1127                result =
1128                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
1129            } else {
1130                // If we got here, then we don't have a token in progress and we're not starting an
1131                // operator. Add the character to a new token.
1132                self.consume_char()?;
1133                state.append_char(c);
1134            }
1135        }
1136
1137        let result = result.unwrap();
1138
1139        Ok(result)
1140    }
1141
1142    fn can_start_extglob(c: char) -> bool {
1143        matches!(c, '@' | '!' | '?' | '+' | '*')
1144    }
1145
1146    fn can_start_operator(c: char) -> bool {
1147        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
1148    }
1149
1150    fn is_operator(&self, s: &str) -> bool {
1151        // Handle non-POSIX operators.
1152        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
1153            return true;
1154        }
1155
1156        matches!(
1157            s,
1158            "&" | "&&"
1159                | "("
1160                | ")"
1161                | ";"
1162                | ";;"
1163                | "\n"
1164                | "|"
1165                | "||"
1166                | "<"
1167                | ">"
1168                | ">|"
1169                | "<<"
1170                | ">>"
1171                | "<&"
1172                | ">&"
1173                | "<<-"
1174                | "<>"
1175        )
1176    }
1177}
1178
1179impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1180    type Item = Result<TokenizeResult, TokenizerError>;
1181
1182    fn next(&mut self) -> Option<Self::Item> {
1183        match self.next_token() {
1184            #[allow(clippy::manual_map)]
1185            Ok(result) => match result.token {
1186                Some(_) => Some(Ok(result)),
1187                None => None,
1188            },
1189            Err(e) => Some(Err(e)),
1190        }
1191    }
1192}
1193
fn is_blank(c: char) -> bool {
    // Only space and tab count as (non-newline) blanks.
    matches!(c, ' ' | '\t')
}
1197
1198fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1199    // If we're currently escaped, then nothing affects quoting.
1200    if state.in_escape {
1201        return false;
1202    }
1203
1204    match state.quote_mode {
1205        // When we're in a double quote, only a subset of escape sequences are recognized.
1206        QuoteMode::Double(_) => {
1207            if c == '\\' {
1208                // TODO: handle backslash in double quote
1209                true
1210            } else {
1211                false
1212            }
1213        }
1214        // When we're in a single quote, nothing affects quoting.
1215        QuoteMode::Single(_) => false,
1216        // When we're not already in a quote, then we can straightforwardly look for a
1217        // quote mark or backslash.
1218        QuoteMode::None => is_quoting_char(c),
1219    }
1220}
1221
fn is_quoting_char(c: char) -> bool {
    // Backslash, single quote, and double quote all introduce quoting.
    c == '\\' || c == '\'' || c == '\"'
}
1225
/// Return a string with all the quoting removed.
///
/// Escaped characters are preserved (minus their escaping backslash), while
/// unescaped quoting characters (`\`, `'`, `"`) are dropped.
///
/// # Arguments
///
/// * `s` - The string to unquote.
pub fn unquote_str(s: &str) -> String {
    // The output can never be longer than the input, so reserve up front to
    // avoid reallocation while building the result.
    let mut result = String::with_capacity(s.len());

    let mut in_escape = false;
    for c in s.chars() {
        match c {
            // A preceding backslash escapes this character; keep it verbatim.
            c if in_escape => {
                result.push(c);
                in_escape = false;
            }
            // An unescaped backslash starts an escape sequence. (A trailing,
            // unmatched backslash is silently dropped.)
            '\\' => in_escape = true,
            // Unescaped quote characters are removed. Backslash was already
            // handled above, so these are the only remaining quoting chars.
            '\'' | '\"' => (),
            c => result.push(c),
        }
    }

    result
}
1249
1250#[cfg(test)]
1251#[allow(clippy::panic_in_result_fn)]
1252mod tests {
1253
1254    use super::*;
1255    use anyhow::Result;
1256    // use assert_matches::assert_matches;
1257    use pretty_assertions::{assert_eq, assert_matches};
1258
    #[test]
    fn tokenize_empty() -> Result<()> {
        // An empty input string produces no tokens at all.
        let tokens = tokenize_str("")?;
        assert_eq!(tokens.len(), 0);
        Ok(())
    }
1265
    #[test]
    fn tokenize_line_continuation() -> Result<()> {
        // A backslash-newline line continuation joins the pieces into one word.
        let tokens = tokenize_str(
            r"a\
bc",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "abc"
        );
        Ok(())
    }
1278
    #[test]
    fn tokenize_operators() -> Result<()> {
        // `>>` is recognized as an operator token even with no surrounding blanks.
        assert_matches!(
            &tokenize_str("a>>b")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == ">>" &&
                t3.to_str() == "b"
        );
        Ok(())
    }
1290
    #[test]
    fn tokenize_comment() -> Result<()> {
        // A `#` comment is discarded; the trailing newline still yields an
        // operator token.
        let tokens = tokenize_str(
            r"a #comment
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
                t1.to_str() == "a" &&
                t2.to_str() == "\n"
        );
        Ok(())
    }
1305
    #[test]
    fn tokenize_comment_at_eof() -> Result<()> {
        // A comment running to end of input is discarded without error.
        assert_matches!(
            &tokenize_str(r"a #comment")?[..],
            [t1 @ Token::Word(..)] if t1.to_str() == "a"
        );
        Ok(())
    }
1314
    #[test]
    fn tokenize_empty_here_doc() -> Result<()> {
        // An empty here-doc still yields an (empty) body word between the
        // opening tag and the end tag.
        let tokens = tokenize_str(
            r"cat <<HERE
HERE
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }
1339
    #[test]
    fn tokenize_here_doc() -> Result<()> {
        // The here-doc body is captured as a single word token (with its
        // trailing newline); tokenization continues normally afterwards.
        let tokens = tokenize_str(
            r"cat <<HERE
SOMETHING
HERE
echo after
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n" &&
                t7.to_str() == "echo" &&
                t8.to_str() == "after" &&
                t9.to_str() == "\n"
        );
        Ok(())
    }
1372
    #[test]
    fn tokenize_here_doc_with_tab_removal() -> Result<()> {
        // With the `<<-` operator, leading tabs are stripped from both the
        // body lines and the end tag line.
        let tokens = tokenize_str(
            r"cat <<-HERE
	SOMETHING
	HERE
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<-" &&
                t3.to_str() == "HERE" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE" &&
                t6.to_str() == "\n"
        );
        Ok(())
    }
1398
    #[test]
    fn tokenize_here_doc_with_other_tokens() -> Result<()> {
        // Tokens that follow the here-doc redirection on the same line (here,
        // `| wc -l`) are emitted after the body and end-tag tokens.
        let tokens = tokenize_str(
            r"cat <<EOF | wc -l
A B C
1 2 3
D E F
EOF
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "EOF" &&
                t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
                t5.to_str() == "EOF" &&
                t6.to_str() == "|" &&
                t7.to_str() == "wc" &&
                t8.to_str() == "-l" &&
                t9.to_str() == "\n"
        );

        Ok(())
    }
1433
    #[test]
    fn tokenize_multiple_here_docs() -> Result<()> {
        // With two here-doc redirections on one line, bodies are consumed in
        // order, each delimited by its own end tag.
        let tokens = tokenize_str(
            r"cat <<HERE1 <<HERE2
SOMETHING
HERE1
OTHER
HERE2
echo after
",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Operator(..),
             t3 @ Token::Word(..),
             t4 @ Token::Word(..),
             t5 @ Token::Word(..),
             t6 @ Token::Operator(..),
             t7 @ Token::Word(..),
             t8 @ Token::Word(..),
             t9 @ Token::Word(..),
             t10 @ Token::Operator(..),
             t11 @ Token::Word(..),
             t12 @ Token::Word(..),
             t13 @ Token::Operator(..)] if
                t1.to_str() == "cat" &&
                t2.to_str() == "<<" &&
                t3.to_str() == "HERE1" &&
                t4.to_str() == "SOMETHING\n" &&
                t5.to_str() == "HERE1" &&
                t6.to_str() == "<<" &&
                t7.to_str() == "HERE2" &&
                t8.to_str() == "OTHER\n" &&
                t9.to_str() == "HERE2" &&
                t10.to_str() == "\n" &&
                t11.to_str() == "echo" &&
                t12.to_str() == "after" &&
                t13.to_str() == "\n"
        );
        Ok(())
    }
1476
    #[test]
    fn tokenize_unterminated_here_doc() {
        // A here-doc whose end tag never appears is a tokenization error.
        let result = tokenize_str(
            r"cat <<HERE
SOMETHING
",
        );
        assert!(result.is_err());
    }
1486
    #[test]
    fn tokenize_missing_here_tag() {
        // A `<<` operator with no tag following it is a tokenization error.
        let result = tokenize_str(
            r"cat <<
",
        );
        assert!(result.is_err());
    }
1495
    #[test]
    fn tokenize_here_doc_in_command_substitution() -> Result<()> {
        // A here-doc inside `$( ... )` is folded into the single word token
        // produced for the whole command substitution.
        let tokens = tokenize_str(
            r"echo $(cat <<HERE
TEXT
HERE
)",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
        );
        Ok(())
    }
1513
    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        // Multiple here-docs plus a pipeline inside `$( ... )` still collapse
        // into one word token (note the reconstructed spacing around `|`).
        let tokens = tokenize_str(
            r"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)",
        )?;
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
             t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
        );
        Ok(())
    }
1533
    #[test]
    fn tokenize_simple_backquote() -> Result<()> {
        // A backquoted command substitution forms a single word token,
        // backquotes included.
        assert_matches!(
            &tokenize_str(r"echo `echo hi`")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "`echo hi`"
        );
        Ok(())
    }
1544
    #[test]
    fn tokenize_backquote_with_escape() -> Result<()> {
        // An escaped backquote inside backquotes does not terminate the
        // substitution; the escape is preserved in the token text.
        assert_matches!(
            &tokenize_str(r"echo `echo\`hi`")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == r"`echo\`hi`"
        );
        Ok(())
    }
1555
    #[test]
    fn tokenize_unterminated_backquote() {
        // An opening backquote with no closing one yields a dedicated error.
        assert_matches!(
            tokenize_str("`"),
            Err(TokenizerError::UnterminatedBackquote(_))
        );
    }
1563
    #[test]
    fn tokenize_unterminated_command_substitution() {
        // `$(` with no closing parenthesis yields a dedicated error.
        assert_matches!(
            tokenize_str("$("),
            Err(TokenizerError::UnterminatedCommandSubstitution)
        );
    }
1571
    #[test]
    fn tokenize_command_substitution() -> Result<()> {
        // A `$( ... )` embedded mid-word stays part of that single word token.
        assert_matches!(
            &tokenize_str("a$(echo hi)b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$(echo hi)b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }
1582
    #[test]
    fn tokenize_command_substitution_containing_extglob() -> Result<()> {
        // An extglob pattern (`!(x)`) inside a command substitution does not
        // prematurely close the `$( ... )` construct.
        assert_matches!(
            &tokenize_str("echo $(echo !(x))")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(echo !(x))"
        );
        Ok(())
    }
1593
    #[test]
    fn tokenize_arithmetic_expression() -> Result<()> {
        // A `$(( ... ))` arithmetic expansion embedded mid-word stays part of
        // that single word token.
        assert_matches!(
            &tokenize_str("a$((1+2))b c")?[..],
            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
                t1.to_str() == "a$((1+2))b" &&
                t2.to_str() == "c"
        );
        Ok(())
    }
1604
    #[test]
    fn tokenize_arithmetic_expression_with_space() -> Result<()> {
        // N.B. The spacing comes out a bit odd, but it gets processed okay
        // by later stages. (The leading space after `$((` is collapsed while
        // the trailing one is kept.)
        assert_matches!(
            &tokenize_str("$(( 1 ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$((1 ))"
        );
        Ok(())
    }
    #[test]
    fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
        // Nested parentheses inside `$(( ... ))` do not prematurely close the
        // arithmetic expansion.
        assert_matches!(
            &tokenize_str("$(( (0) ))")?[..],
            [t1 @ Token::Word(..)] if
                t1.to_str() == "$(((0)))"
        );
        Ok(())
    }
1625
1626    #[test]
1627    fn tokenize_special_parameters() -> Result<()> {
1628        assert_matches!(
1629            &tokenize_str("$$")?[..],
1630            [t1 @ Token::Word(..)] if t1.to_str() == "$$"
1631        );
1632        assert_matches!(
1633            &tokenize_str("$@")?[..],
1634            [t1 @ Token::Word(..)] if t1.to_str() == "$@"
1635        );
1636        assert_matches!(
1637            &tokenize_str("$!")?[..],
1638            [t1 @ Token::Word(..)] if t1.to_str() == "$!"
1639        );
1640        assert_matches!(
1641            &tokenize_str("$?")?[..],
1642            [t1 @ Token::Word(..)] if t1.to_str() == "$?"
1643        );
1644        assert_matches!(
1645            &tokenize_str("$*")?[..],
1646            [t1 @ Token::Word(..)] if t1.to_str() == "$*"
1647        );
1648        Ok(())
1649    }
1650
1651    #[test]
1652    fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1653        assert_matches!(
1654            &tokenize_str("$x")?[..],
1655            [t1 @ Token::Word(..)] if t1.to_str() == "$x"
1656        );
1657        assert_matches!(
1658            &tokenize_str("a$x")?[..],
1659            [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
1660        );
1661        Ok(())
1662    }
1663
1664    #[test]
1665    fn tokenize_unterminated_parameter_expansion() {
1666        assert_matches!(
1667            tokenize_str("${x"),
1668            Err(TokenizerError::UnterminatedVariable)
1669        );
1670    }
1671
1672    #[test]
1673    fn tokenize_braced_parameter_expansion() -> Result<()> {
1674        assert_matches!(
1675            &tokenize_str("${x}")?[..],
1676            [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
1677        );
1678        assert_matches!(
1679            &tokenize_str("a${x}b")?[..],
1680            [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
1681        );
1682        Ok(())
1683    }
1684
1685    #[test]
1686    fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1687        assert_matches!(
1688            &tokenize_str(r"a${x\}}b")?[..],
1689            [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
1690        );
1691        Ok(())
1692    }
1693
1694    #[test]
1695    fn tokenize_whitespace() -> Result<()> {
1696        assert_matches!(
1697            &tokenize_str("1 2 3")?[..],
1698            [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
1699                t1.to_str() == "1" &&
1700                t2.to_str() == "2" &&
1701                t3.to_str() == "3"
1702        );
1703        Ok(())
1704    }
1705
1706    #[test]
1707    fn tokenize_escaped_whitespace() -> Result<()> {
1708        assert_matches!(
1709            &tokenize_str(r"1\ 2 3")?[..],
1710            [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1711                t1.to_str() == r"1\ 2" &&
1712                t2.to_str() == "3"
1713        );
1714        Ok(())
1715    }
1716
1717    #[test]
1718    fn tokenize_single_quote() -> Result<()> {
1719        assert_matches!(
1720            &tokenize_str(r"x'a b'y")?[..],
1721            [t1 @ Token::Word(..)] if
1722                t1.to_str() == r"x'a b'y"
1723        );
1724        Ok(())
1725    }
1726
1727    #[test]
1728    fn tokenize_double_quote() -> Result<()> {
1729        assert_matches!(
1730            &tokenize_str(r#"x"a b"y"#)?[..],
1731            [t1 @ Token::Word(..)] if
1732                t1.to_str() == r#"x"a b"y"#
1733        );
1734        Ok(())
1735    }
1736
1737    #[test]
1738    fn tokenize_double_quoted_command_substitution() -> Result<()> {
1739        assert_matches!(
1740            &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
1741            [t1 @ Token::Word(..)] if
1742                t1.to_str() == r#"x"$(echo hi)"y"#
1743        );
1744        Ok(())
1745    }
1746
1747    #[test]
1748    fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1749        assert_matches!(
1750            &tokenize_str(r#"x"$((1+2))"y"#)?[..],
1751            [t1 @ Token::Word(..)] if
1752                t1.to_str() == r#"x"$((1+2))"y"#
1753        );
1754        Ok(())
1755    }
1756
1757    #[test]
1758    fn test_quote_removal() {
1759        assert_eq!(unquote_str(r#""hello""#), "hello");
1760        assert_eq!(unquote_str(r"'hello'"), "hello");
1761        assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1762        assert_eq!(unquote_str(r"'hel\'lo'"), r"hel'lo");
1763    }
1764}