1use std::borrow::Cow;
2use std::fmt::Display;
3use utf8_chars::BufReadCharsExt;
4
/// The reason why the tokenizer stopped accumulating the current token.
#[allow(dead_code)]
#[derive(Clone, Debug)]
pub(crate) enum TokenEndReason {
    /// The end of the input was reached.
    EndOfInput,
    /// An unescaped, unquoted newline ended the token.
    UnescapedNewLine,
    /// The caller-specified terminating character was encountered.
    SpecifiedTerminatingChar,
    /// A blank character other than newline (space or tab) ended the token.
    NonNewLineBlank,
    /// The body of a here-document starts immediately after this token.
    HereDocumentBodyStart,
    /// The body of a here-document ends here.
    HereDocumentBodyEnd,
    /// This token is the end tag of a here-document.
    HereDocumentEndTag,
    /// A character that can start an operator ended the preceding token.
    OperatorStart,
    /// The operator token being accumulated was completed.
    OperatorEnd,
    /// Some other condition ended the token.
    Other,
}
29
/// A position within the source text being tokenized.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct SourcePosition {
    /// 0-based character offset from the start of the input.
    pub index: i32,
    /// 1-based line number.
    pub line: i32,
    /// 1-based column number.
    pub column: i32,
}
41
42impl Display for SourcePosition {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 f.write_fmt(format_args!("line {} col {}", self.line, self.column))
45 }
46}
47
/// The source span covered by a token.
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub struct TokenLocation {
    /// Position of the first character of the token.
    pub start: SourcePosition,
    /// Position just past the last character of the token.
    pub end: SourcePosition,
}
57
/// A single token produced by the tokenizer.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "fuzz-testing", derive(arbitrary::Arbitrary))]
pub enum Token {
    /// An operator token (e.g., `>>`, `|`, `;`), with its source location.
    Operator(String, TokenLocation),
    /// A word token (anything that is not an operator), with its source location.
    Word(String, TokenLocation),
}
67
68impl Token {
69 pub fn to_str(&self) -> &str {
71 match self {
72 Token::Operator(s, _) => s,
73 Token::Word(s, _) => s,
74 }
75 }
76
77 pub fn location(&self) -> &TokenLocation {
79 match self {
80 Token::Operator(_, l) => l,
81 Token::Word(_, l) => l,
82 }
83 }
84}
85
/// The outcome of a single tokenization step.
#[derive(Clone, Debug)]
pub(crate) struct TokenizeResult {
    /// Why the tokenizer stopped at this point.
    pub reason: TokenEndReason,
    /// The token produced, if any (e.g., delimiter-only results carry no token).
    pub token: Option<Token>,
}
94
/// Errors that can arise during tokenization.
#[derive(thiserror::Error, Debug)]
pub enum TokenizerError {
    /// The input ended in the middle of an escape sequence.
    #[error("unterminated escape sequence")]
    UnterminatedEscapeSequence,

    /// A single-quoted string opened at the given position was never closed.
    #[error("unterminated single quote at {0}")]
    UnterminatedSingleQuote(SourcePosition),

    /// A double-quoted string opened at the given position was never closed.
    #[error("unterminated double quote at {0}")]
    UnterminatedDoubleQuote(SourcePosition),

    /// A backquoted command substitution was never closed.
    #[error("unterminated backquote near {0}")]
    UnterminatedBackquote(SourcePosition),

    /// An extended glob pattern (e.g., `@(...)`) was never closed.
    #[error("unterminated extglob near {0}")]
    UnterminatedExtendedGlob(SourcePosition),

    /// A `${...}` variable expression was never closed.
    #[error("unterminated variable expression")]
    UnterminatedVariable,

    /// A `$(...)` command substitution was never closed.
    #[error("unterminated command substitution")]
    UnterminatedCommandSubstitution,

    /// The input could not be decoded as UTF-8.
    #[error("failed to decode UTF-8 characters")]
    FailedDecoding,

    /// A here-document body was found without a preceding tag.
    #[error("missing here tag for here document body")]
    MissingHereTagForDocumentBody,

    /// The expected here-document tag was not found.
    #[error("missing here tag '{0}'")]
    MissingHereTag(String),

    /// The input ended before all pending here-document bodies were closed.
    #[error("unterminated here document sequence; tag(s) [{0}] found at: [{1}]")]
    UnterminatedHereDocuments(String, String),

    /// An underlying I/O error occurred while reading input.
    #[error("failed to read input")]
    ReadError(#[from] std::io::Error),
}
147
148impl TokenizerError {
149 pub fn is_incomplete(&self) -> bool {
150 matches!(
151 self,
152 Self::UnterminatedEscapeSequence
153 | Self::UnterminatedSingleQuote(..)
154 | Self::UnterminatedDoubleQuote(..)
155 | Self::UnterminatedBackquote(..)
156 | Self::UnterminatedCommandSubstitution
157 | Self::UnterminatedVariable
158 | Self::UnterminatedExtendedGlob(..)
159 | Self::UnterminatedHereDocuments(..)
160 )
161 }
162}
163
/// A borrowed sequence of tokens.
#[derive(Debug)]
pub(crate) struct Tokens<'a> {
    /// The underlying token slice.
    pub tokens: &'a [Token],
}
170
/// Tracks whether tokenization is currently inside a quoted region.
#[derive(Clone, Debug)]
enum QuoteMode {
    /// Not in a quoted region.
    None,
    /// Inside single quotes opened at the recorded position.
    Single(SourcePosition),
    /// Inside double quotes opened at the recorded position.
    Double(SourcePosition),
}
177
/// State machine tracking here-document processing across tokens.
#[derive(Clone, Debug, Default)]
enum HereState {
    /// No here-document is in progress.
    #[default]
    None,
    /// A `<<` or `<<-` operator was just seen; the *next* token is the tag.
    NextTokenIsHereTag { remove_tabs: bool },
    /// The token currently being accumulated is the here-document tag.
    CurrentTokenIsHereTag {
        // Whether `<<-` was used (leading tabs are stripped from body lines).
        remove_tabs: bool,
        // The already-delimited redirection operator token, held back until
        // the here-document completes.
        operator_token_result: TokenizeResult,
    },
    /// The tag has been read; the here-document body starts on the next line.
    NextLineIsHereDoc,
    /// Currently consuming here-document body text.
    InHereDocs,
}
197
/// A pending here-document tag awaiting its body.
#[derive(Clone, Debug)]
struct HereTag {
    // Tag text, stored with a trailing newline so the terminator line can be
    // matched via suffix stripping.
    tag: String,
    // Whether the tag contained quoting characters; if so, it is unquoted
    // before matching against the terminator line.
    tag_was_escaped_or_quoted: bool,
    // Whether `<<-` was used (strip leading tabs from body lines).
    remove_tabs: bool,
    // Position in the source where the tag was found.
    position: SourcePosition,
    // Token results (operator + tag) held back until the body is complete.
    tokens: Vec<TokenizeResult>,
    // Token results seen after this tag but before the body (e.g., the rest of
    // the command line); replayed after the body is emitted.
    pending_tokens_after: Vec<TokenizeResult>,
}
207
/// Tokenizer state that persists across individual tokens.
#[derive(Clone, Debug)]
struct CrossTokenParseState {
    /// Current position in the input.
    cursor: SourcePosition,
    /// Current here-document processing state.
    here_state: HereState,
    /// Here-document tags whose bodies have not yet been consumed.
    current_here_tags: Vec<HereTag>,
    /// Token results queued for emission before reading further input
    /// (used to replay tokens held back during here-document processing).
    queued_tokens: Vec<TokenizeResult>,
    /// Whether we are currently inside a `$((...))` arithmetic expansion
    /// (suppresses here-document operator recognition).
    arithmetic_expansion: bool,
}
221
/// Options controlling tokenizer behavior.
#[derive(Clone, Debug, Hash, Eq, PartialEq)]
pub struct TokenizerOptions {
    /// Whether extended glob patterns (e.g., `@(...)`) are recognized.
    pub enable_extended_globbing: bool,
    /// Whether to operate in POSIX-compliant mode. Currently unused here.
    #[allow(unused)]
    pub posix_mode: bool,
    /// Whether to restrict to sh-compatible behavior (disables recognition of
    /// bash-specific operators such as `<<<` and `|&`).
    pub sh_mode: bool,
}
233
234impl Default for TokenizerOptions {
235 fn default() -> Self {
236 Self {
237 enable_extended_globbing: true,
238 posix_mode: false,
239 sh_mode: false,
240 }
241 }
242}
243
/// A tokenizer that incrementally reads characters from a buffered reader.
pub(crate) struct Tokenizer<'a, R: ?Sized + std::io::BufRead> {
    // Peekable stream of decoded UTF-8 characters from the reader.
    char_reader: std::iter::Peekable<utf8_chars::Chars<'a, R>>,
    // State shared across tokens (cursor, here-doc machine, queued results).
    cross_state: CrossTokenParseState,
    // Behavior options supplied at construction.
    options: TokenizerOptions,
}
250
/// Per-token accumulation state.
#[derive(Clone, Debug)]
struct TokenParseState {
    /// Position where the current token started.
    pub start_position: SourcePosition,
    /// Text accumulated so far for the current token.
    pub token_so_far: String,
    /// Whether the accumulated text is an operator (vs. a word).
    pub token_is_operator: bool,
    /// Whether the previous character was an unquoted backslash.
    pub in_escape: bool,
    /// Current quoting mode (none/single/double).
    pub quote_mode: QuoteMode,
}
260
impl TokenParseState {
    /// Creates an empty parse state whose token starts at `start_position`.
    pub fn new(start_position: &SourcePosition) -> Self {
        TokenParseState {
            start_position: start_position.clone(),
            token_so_far: String::new(),
            token_is_operator: false,
            in_escape: false,
            quote_mode: QuoteMode::None,
        }
    }

    /// Finalizes the accumulated text into a `Token` spanning
    /// `start_position..end_position`, and resets this state so accumulation
    /// of the next token can begin at `end_position`.
    pub fn pop(&mut self, end_position: &SourcePosition) -> Token {
        let token_location = TokenLocation {
            start: std::mem::take(&mut self.start_position),
            end: end_position.clone(),
        };

        // `std::mem::take` simultaneously reads and resets the fields.
        let token = if std::mem::take(&mut self.token_is_operator) {
            Token::Operator(std::mem::take(&mut self.token_so_far), token_location)
        } else {
            Token::Word(std::mem::take(&mut self.token_so_far), token_location)
        };

        self.start_position = end_position.clone();
        self.in_escape = false;
        self.quote_mode = QuoteMode::None;

        token
    }

    /// Returns whether any text has been accumulated for the current token.
    pub fn started_token(&self) -> bool {
        !self.token_so_far.is_empty()
    }

    /// Appends a single character to the current token.
    pub fn append_char(&mut self, c: char) {
        self.token_so_far.push(c);
    }

    /// Appends a string to the current token.
    pub fn append_str(&mut self, s: &str) {
        self.token_so_far.push_str(s);
    }

    /// Returns whether we are outside any escape or quoting context.
    pub fn unquoted(&self) -> bool {
        !self.in_escape && matches!(self.quote_mode, QuoteMode::None)
    }

    /// Returns the text accumulated so far.
    pub fn current_token(&self) -> &str {
        &self.token_so_far
    }

    /// Returns whether the accumulated text is exactly the given operator.
    pub fn is_specific_operator(&self, operator: &str) -> bool {
        self.token_is_operator && self.current_token() == operator
    }

    /// Returns whether an operator token is being accumulated.
    pub fn in_operator(&self) -> bool {
        self.token_is_operator
    }

    /// Returns whether the accumulated text is a lone newline.
    fn is_newline(&self) -> bool {
        self.token_so_far == "\n"
    }

    /// Replaces the accumulated text wholesale (used when stripping the
    /// terminating tag off a here-document body).
    fn replace_with_here_doc(&mut self, s: String) {
        self.token_so_far = s;
    }

    /// Completes the current token for the given `reason`, advancing the
    /// here-document state machine as needed.
    ///
    /// Returns `Ok(Some(result))` when a result should be emitted now, or
    /// `Ok(None)` when the token was captured into here-document bookkeeping
    /// (it will be replayed later via `queued_tokens`).
    pub fn delimit_current_token(
        &mut self,
        reason: TokenEndReason,
        cross_token_state: &mut CrossTokenParseState,
    ) -> Result<Option<TokenizeResult>, TokenizerError> {
        // An empty token yields a token-less result, except that an empty
        // here-document body must still flow through the machinery below.
        if !self.started_token() && !matches!(reason, TokenEndReason::HereDocumentBodyEnd) {
            return Ok(Some(TokenizeResult {
                reason,
                token: None,
            }));
        }

        // Take the state; each arm below installs the successor state.
        let current_here_state = std::mem::take(&mut cross_token_state.here_state);
        match current_here_state {
            HereState::NextTokenIsHereTag { remove_tabs } => {
                // This token is the redirection operator itself; hold it back
                // until the here-document completes.
                let operator_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.here_state = HereState::CurrentTokenIsHereTag {
                    remove_tabs,
                    operator_token_result,
                };

                return Ok(None);
            }
            HereState::CurrentTokenIsHereTag {
                remove_tabs,
                operator_token_result,
            } => {
                // A newline here means no tag followed the operator.
                if self.is_newline() {
                    return Err(TokenizerError::MissingHereTag(
                        self.current_token().to_owned(),
                    ));
                }

                cross_token_state.here_state = HereState::NextLineIsHereDoc;

                // Store the tag with a trailing newline so the terminator
                // line can be matched via suffix stripping.
                let tag = std::format!("{}\n", self.current_token());
                let tag_was_escaped_or_quoted = tag.contains(is_quoting_char);

                let tag_token_result = TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                };

                cross_token_state.current_here_tags.push(HereTag {
                    tag,
                    tag_was_escaped_or_quoted,
                    remove_tabs,
                    position: cross_token_state.cursor.clone(),
                    tokens: vec![operator_token_result, tag_token_result],
                    pending_tokens_after: vec![],
                });

                return Ok(None);
            }
            HereState::NextLineIsHereDoc => {
                // The body starts only after the physical line ends; until
                // then, tokens belong to the command line after the tag.
                if self.is_newline() {
                    cross_token_state.here_state = HereState::InHereDocs;
                } else {
                    cross_token_state.here_state = HereState::NextLineIsHereDoc;
                }

                if let Some(last_here_tag) = cross_token_state.current_here_tags.last_mut() {
                    let token = self.pop(&cross_token_state.cursor);
                    let result = TokenizeResult {
                        reason,
                        token: Some(token),
                    };

                    // Held back; replayed after the here-doc body is emitted.
                    last_here_tag.pending_tokens_after.push(result);
                } else {
                    return Err(TokenizerError::MissingHereTagForDocumentBody);
                }

                return Ok(None);
            }
            HereState::InHereDocs => {
                // The body for the frontmost tag just completed; flush all
                // held-back tokens in their logical order.
                let completed_here_tag = cross_token_state.current_here_tags.remove(0);

                // First: the redirection operator and tag tokens.
                for here_token in completed_here_tag.tokens {
                    cross_token_state.queued_tokens.push(here_token);
                }

                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentBodyStart,
                    token: None,
                });

                // Then: the body itself (the text accumulated in this state).
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Then: the end tag (reconstructed without its stored newline).
                self.append_str(completed_here_tag.tag.trim_end_matches('\n'));
                cross_token_state.queued_tokens.push(TokenizeResult {
                    reason: TokenEndReason::HereDocumentEndTag,
                    token: Some(self.pop(&cross_token_state.cursor)),
                });

                // Finally: tokens that appeared after the tag on its line.
                for pending_token in completed_here_tag.pending_tokens_after {
                    cross_token_state.queued_tokens.push(pending_token);
                }

                if cross_token_state.current_here_tags.is_empty() {
                    cross_token_state.here_state = HereState::None;
                } else {
                    cross_token_state.here_state = HereState::InHereDocs;
                }

                return Ok(None);
            }
            HereState::None => (),
        }

        // Ordinary (non-here-doc) delimiting: emit the token directly.
        let token = self.pop(&cross_token_state.cursor);
        let result = TokenizeResult {
            reason,
            token: Some(token),
        };

        Ok(Some(result))
    }
}
466
467pub fn tokenize_str(input: &str) -> Result<Vec<Token>, TokenizerError> {
473 tokenize_str_with_options(input, &TokenizerOptions::default())
474}
475
476pub fn tokenize_str_with_options(
483 input: &str,
484 options: &TokenizerOptions,
485) -> Result<Vec<Token>, TokenizerError> {
486 uncached_tokenize_string(input.to_owned(), options.to_owned())
487}
488
/// Memoizing wrapper over [`uncached_tokenize_str`]; caches results for up to
/// 64 recently seen `(input, options)` pairs.
#[cached::proc_macro::cached(name = "TOKENIZE_CACHE", size = 64, result = true)]
fn uncached_tokenize_string(
    input: String,
    options: TokenizerOptions,
) -> Result<Vec<Token>, TokenizerError> {
    uncached_tokenize_str(input.as_str(), &options)
}
496
497pub fn uncached_tokenize_str(
504 input: &str,
505 options: &TokenizerOptions,
506) -> Result<Vec<Token>, TokenizerError> {
507 let mut reader = std::io::BufReader::new(input.as_bytes());
508 let mut tokenizer = crate::tokenizer::Tokenizer::new(&mut reader, options);
509
510 let mut tokens = vec![];
511 loop {
512 match tokenizer.next_token()? {
513 TokenizeResult {
514 token: Some(token), ..
515 } => tokens.push(token),
516 TokenizeResult {
517 reason: TokenEndReason::EndOfInput,
518 ..
519 } => break,
520 _ => (),
521 }
522 }
523
524 Ok(tokens)
525}
526
impl<'a, R: ?Sized + std::io::BufRead> Tokenizer<'a, R> {
    /// Creates a tokenizer over the given reader with the given options.
    /// The cursor starts at line 1, column 1, index 0.
    pub fn new(reader: &'a mut R, options: &TokenizerOptions) -> Tokenizer<'a, R> {
        Tokenizer {
            options: options.clone(),
            char_reader: reader.chars().peekable(),
            cross_state: CrossTokenParseState {
                cursor: SourcePosition {
                    index: 0,
                    line: 1,
                    column: 1,
                },
                here_state: HereState::None,
                current_here_tags: vec![],
                queued_tokens: vec![],
                arithmetic_expansion: false,
            },
        }
    }

    /// Returns the current cursor position. Always returns `Some`; the
    /// `Option` appears to exist for interface compatibility — NOTE(review):
    /// confirm against callers.
    pub fn current_location(&self) -> Option<SourcePosition> {
        Some(self.cross_state.cursor.clone())
    }

    /// Reads the next character, advancing the cursor (newlines bump the line
    /// counter and reset the column; the index always advances by one).
    fn next_char(&mut self) -> Result<Option<char>, TokenizerError> {
        let c = self
            .char_reader
            .next()
            .transpose()
            .map_err(TokenizerError::ReadError)?;

        if let Some(ch) = c {
            if ch == '\n' {
                self.cross_state.cursor.line += 1;
                self.cross_state.cursor.column = 1;
            } else {
                self.cross_state.cursor.column += 1;
            }
            self.cross_state.cursor.index += 1;
        }

        Ok(c)
    }

    /// Reads and discards the next character (cursor still advances).
    fn consume_char(&mut self) -> Result<(), TokenizerError> {
        let _ = self.next_char()?;
        Ok(())
    }

    /// Peeks at the next character without consuming it. Decode failures are
    /// reported as `FailedDecoding`.
    fn peek_char(&mut self) -> Result<Option<char>, TokenizerError> {
        match self.char_reader.peek() {
            Some(result) => match result {
                Ok(c) => Ok(Some(*c)),
                Err(_) => Err(TokenizerError::FailedDecoding),
            },
            None => Ok(None),
        }
    }

    /// Tokenizes and returns the next token result from the input.
    pub fn next_token(&mut self) -> Result<TokenizeResult, TokenizerError> {
        self.next_token_until(None)
    }

    /// Core tokenization loop. Accumulates characters into a token until a
    /// delimiting condition fires; `terminating_char`, when provided, also
    /// ends the token (used for nested `$(...)` / `${...}` scans).
    #[allow(clippy::if_same_then_else)]
    fn next_token_until(
        &mut self,
        terminating_char: Option<char>,
    ) -> Result<TokenizeResult, TokenizerError> {
        let mut state = TokenParseState::new(&self.cross_state.cursor);
        let mut result: Option<TokenizeResult> = None;

        while result.is_none() {
            // Tokens held back during here-document processing are replayed
            // before any further input is read.
            if !self.cross_state.queued_tokens.is_empty() {
                return Ok(self.cross_state.queued_tokens.remove(0));
            }

            let next = self.peek_char()?;
            let c = next.unwrap_or('\0');

            if next.is_none() {
                // End of input: unterminated constructs become errors.
                if state.in_escape {
                    return Err(TokenizerError::UnterminatedEscapeSequence);
                }
                match state.quote_mode {
                    QuoteMode::None => (),
                    QuoteMode::Single(pos) => {
                        return Err(TokenizerError::UnterminatedSingleQuote(pos));
                    }
                    QuoteMode::Double(pos) => {
                        return Err(TokenizerError::UnterminatedDoubleQuote(pos));
                    }
                }

                // Pending here-documents at EOF are an error too.
                if !matches!(self.cross_state.here_state, HereState::None) {
                    let tag_names = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| tag.tag.trim())
                        .collect::<Vec<_>>()
                        .join(", ");
                    let tag_positions = self
                        .cross_state
                        .current_here_tags
                        .iter()
                        .map(|tag| std::format!("{}", tag.position))
                        .collect::<Vec<_>>()
                        .join(", ");
                    return Err(TokenizerError::UnterminatedHereDocuments(
                        tag_names,
                        tag_positions,
                    ));
                }

                result = state
                    .delimit_current_token(TokenEndReason::EndOfInput, &mut self.cross_state)?;
            } else if state.unquoted() && terminating_char == Some(c) {
                // The caller's terminator (')' or '}') ends the token; the
                // character itself is left unconsumed for the caller.
                result = state.delimit_current_token(
                    TokenEndReason::SpecifiedTerminatingChar,
                    &mut self.cross_state,
                )?;
            } else if matches!(self.cross_state.here_state, HereState::InHereDocs) {
                // Inside a here-document body: text is taken verbatim.
                if !self.cross_state.current_here_tags.is_empty()
                    && self.cross_state.current_here_tags[0].remove_tabs
                    && (!state.started_token() || state.current_token().ends_with('\n'))
                    && c == '\t'
                {
                    // `<<-`: drop tabs at the start of each body line.
                    self.consume_char()?;
                } else {
                    self.consume_char()?;
                    state.append_char(c);

                    // At each newline, check whether the line just completed
                    // is the terminating tag.
                    if c == '\n' {
                        let next_here_tag = &self.cross_state.current_here_tags[0];
                        let tag_str: Cow<'_, str> = if next_here_tag.tag_was_escaped_or_quoted {
                            unquote_str(next_here_tag.tag.as_str()).into()
                        } else {
                            next_here_tag.tag.as_str().into()
                        };

                        if let Some(current_token_without_here_tag) =
                            state.current_token().strip_suffix(tag_str.as_ref())
                        {
                            // The tag only terminates if it occupies a whole line.
                            if current_token_without_here_tag.is_empty()
                                || current_token_without_here_tag.ends_with('\n')
                            {
                                state.replace_with_here_doc(
                                    current_token_without_here_tag.to_owned(),
                                );

                                result = state.delimit_current_token(
                                    TokenEndReason::HereDocumentBodyEnd,
                                    &mut self.cross_state,
                                )?;
                            }
                        }
                    }
                }
            } else if state.in_operator() {
                // Extend the operator greedily while the text remains a
                // valid operator prefix.
                let mut hypothetical_token = state.current_token().to_owned();
                hypothetical_token.push(c);

                if state.unquoted() && self.is_operator(hypothetical_token.as_ref()) {
                    self.consume_char()?;
                    state.append_char(c);
                } else {
                    assert!(state.started_token());

                    // `<<` / `<<-` kick off here-document handling, except
                    // inside arithmetic expansion (where `<<` is a shift).
                    if self.cross_state.arithmetic_expansion {
                    } else if state.is_specific_operator("<<") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: false };
                    } else if state.is_specific_operator("<<-") {
                        self.cross_state.here_state =
                            HereState::NextTokenIsHereTag { remove_tabs: true };
                    }

                    let reason = if state.current_token() == "\n" {
                        TokenEndReason::UnescapedNewLine
                    } else {
                        TokenEndReason::OperatorEnd
                    };

                    result = state.delimit_current_token(reason, &mut self.cross_state)?;
                }
            } else if does_char_newly_affect_quoting(&state, c) {
                if c == '\\' {
                    self.consume_char()?;

                    // Backslash-newline is a line continuation: drop both.
                    if matches!(self.peek_char()?, Some('\n')) {
                        self.consume_char()?;

                    } else {
                        state.in_escape = true;
                        state.append_char(c);
                    }
                } else if c == '\'' {
                    state.quote_mode = QuoteMode::Single(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                } else if c == '\"' {
                    state.quote_mode = QuoteMode::Double(self.cross_state.cursor.clone());
                    self.consume_char()?;
                    state.append_char(c);
                }
            }
            // Closing single quote.
            else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Single(_))
                && c == '\''
            {
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            } else if !state.in_escape
                && matches!(state.quote_mode, QuoteMode::Double(_))
                && c == '\"'
            {
                // Closing double quote.
                state.quote_mode = QuoteMode::None;
                self.consume_char()?;
                state.append_char(c);
            }
            // A character following a backslash is taken literally.
            else if state.in_escape {
                state.in_escape = false;
                self.consume_char()?;
                state.append_char(c);
            } else if (state.unquoted()
                || (matches!(state.quote_mode, QuoteMode::Double(_)) && !state.in_escape))
                && (c == '$' || c == '`')
            {
                // Expansions: `$(...)`, `$((...))`, `${...}`, `` `...` ``.
                if c == '$' {
                    self.consume_char()?;

                    let char_after_dollar_sign = self.peek_char()?;
                    match char_after_dollar_sign {
                        Some('(') => {
                            // Command substitution `$(...)`, possibly
                            // arithmetic expansion `$((...))`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            let mut required_end_parens = 1;
                            if matches!(self.peek_char()?, Some('(')) {
                                state.append_char(self.next_char()?.unwrap());
                                required_end_parens = 2;
                                self.cross_state.arithmetic_expansion = true;
                            }

                            // Here-doc related results produced inside the
                            // substitution are buffered and re-sequenced at
                            // the next unescaped newline.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    // Recursively tokenize up to ')'.
                                    let cur_token = self.next_token_until(Some(')'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str());

                                    // Track nested unquoted '(' operators so
                                    // the matching ')' count stays balanced.
                                    if matches!(cur_token_value, Token::Operator(o, _) if o == "(")
                                    {
                                        required_end_parens += 1;
                                    }
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        required_end_parens -= 1;
                                        if required_end_parens == 0 {
                                            break;
                                        }

                                        // Consume the inner ')' and continue.
                                        state.append_char(self.next_char()?.unwrap());
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedCommandSubstitution)
                                    }
                                    _ => (),
                                }
                            }

                            self.cross_state.arithmetic_expansion = false;

                            // Consume and append the final ')'.
                            state.append_char(self.next_char()?.unwrap());
                        }

                        Some('{') => {
                            // Parameter expansion `${...}`.
                            state.append_char('$');

                            state.append_char(self.next_char()?.unwrap());

                            // Same here-doc re-sequencing as for `$(...)`.
                            let mut pending_here_doc_tokens = vec![];
                            let mut drain_here_doc_tokens = false;

                            loop {
                                let cur_token = if drain_here_doc_tokens
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    if pending_here_doc_tokens.len() == 1 {
                                        drain_here_doc_tokens = false;
                                    }

                                    pending_here_doc_tokens.remove(0)
                                } else {
                                    // Recursively tokenize up to '}'.
                                    let cur_token = self.next_token_until(Some('}'))?;

                                    if matches!(
                                        cur_token.reason,
                                        TokenEndReason::HereDocumentBodyStart
                                            | TokenEndReason::HereDocumentBodyEnd
                                            | TokenEndReason::HereDocumentEndTag
                                    ) {
                                        pending_here_doc_tokens.push(cur_token);
                                        continue;
                                    }

                                    cur_token
                                };

                                if matches!(cur_token.reason, TokenEndReason::UnescapedNewLine)
                                    && !pending_here_doc_tokens.is_empty()
                                {
                                    pending_here_doc_tokens.push(cur_token);
                                    drain_here_doc_tokens = true;
                                    continue;
                                }

                                if let Some(cur_token_value) = cur_token.token {
                                    state.append_str(cur_token_value.to_str())
                                }

                                match cur_token.reason {
                                    TokenEndReason::HereDocumentBodyStart => {
                                        state.append_char('\n')
                                    }
                                    TokenEndReason::NonNewLineBlank => state.append_char(' '),
                                    TokenEndReason::SpecifiedTerminatingChar => {
                                        // Consume and append the closing '}'.
                                        state.append_char(self.next_char()?.unwrap());
                                        break;
                                    }
                                    TokenEndReason::EndOfInput => {
                                        return Err(TokenizerError::UnterminatedVariable)
                                    }
                                    _ => (),
                                }
                            }
                        }
                        _ => {
                            // Plain '$' (e.g., `$var`); keep it and continue.
                            state.append_char('$');
                        }
                    }
                } else {
                    // Backquoted command substitution: copy verbatim through
                    // the matching unescaped backquote.
                    let backquote_pos = self.cross_state.cursor.clone();
                    self.consume_char()?;

                    state.append_char(c);

                    let mut escaping_enabled = false;
                    let mut done = false;
                    while !done {
                        let next_char_in_backquote = self.next_char()?;
                        if let Some(cib) = next_char_in_backquote {
                            state.append_char(cib);

                            if !escaping_enabled && cib == '\\' {
                                escaping_enabled = true;
                            } else {
                                if !escaping_enabled && cib == '`' {
                                    done = true;
                                }
                                escaping_enabled = false;
                            }
                        } else {
                            return Err(TokenizerError::UnterminatedBackquote(backquote_pos));
                        }
                    }
                }
            }
            // Extended glob: a '(' following @ ! ? + * opens a pattern that
            // is copied verbatim through balanced parentheses.
            else if c == '('
                && self.options.enable_extended_globbing
                && state.unquoted()
                && !state.in_operator()
                && state
                    .current_token()
                    .ends_with(|x| self.can_start_extglob(x))
            {
                self.consume_char()?;
                state.append_char(c);

                let mut paren_depth = 1;

                while paren_depth > 0 {
                    if let Some(extglob_char) = self.next_char()? {
                        state.append_char(extglob_char);

                        if extglob_char == '(' {
                            paren_depth += 1;
                        } else if extglob_char == ')' {
                            paren_depth -= 1;
                        }
                    } else {
                        return Err(TokenizerError::UnterminatedExtendedGlob(
                            self.cross_state.cursor.clone(),
                        ));
                    }
                }
            } else if state.unquoted() && self.can_start_operator(c) {
                // Operator start: end any word in progress first.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::OperatorStart,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.token_is_operator = true;
                    self.consume_char()?;
                    state.append_char(c);
                }
            } else if state.unquoted() && is_blank(c) {
                // Blank: delimits a token in progress; otherwise just moves
                // the pending token's start position forward.
                if state.started_token() {
                    result = state.delimit_current_token(
                        TokenEndReason::NonNewLineBlank,
                        &mut self.cross_state,
                    )?;
                } else {
                    state.start_position.column += 1;
                    state.start_position.index += 1;
                }

                self.consume_char()?;
            }
            // A token already in progress absorbs ordinary characters (also
            // when scanning toward a caller-specified terminator).
            else if !state.token_is_operator
                && (state.started_token() || terminating_char.is_some())
            {
                self.consume_char()?;
                state.append_char(c);
            } else if c == '#' {
                // Comment: skip through end of line (newline not consumed).
                self.consume_char()?;

                let mut done = false;
                while !done {
                    done = match self.peek_char()? {
                        Some('\n') => true,
                        None => true,
                        _ => {
                            self.consume_char()?;
                            false
                        }
                    };
                }

                continue;
            } else if state.started_token() {
                result =
                    state.delimit_current_token(TokenEndReason::Other, &mut self.cross_state)?;
            } else {
                // First character of a fresh word token.
                self.consume_char()?;
                state.append_char(c);
            }
        }

        let result = result.unwrap();

        Ok(result)
    }

    /// Returns whether `c` may immediately precede '(' to form an extglob.
    fn can_start_extglob(&self, c: char) -> bool {
        matches!(c, '@' | '!' | '?' | '+' | '*')
    }

    /// Returns whether `c` can begin an operator token.
    fn can_start_operator(&self, c: char) -> bool {
        matches!(c, '&' | '(' | ')' | ';' | '\n' | '|' | '<' | '>')
    }

    /// Returns whether `s` is a complete operator. Outside sh mode, some
    /// additional bash-specific operators are accepted.
    fn is_operator(&self, s: &str) -> bool {
        // Bash-only operators, recognized only when not in sh mode.
        if !self.options.sh_mode && matches!(s, "<<<" | "&>" | "&>>" | ";;&" | ";&" | "|&") {
            return true;
        }

        matches!(
            s,
            "&" | "&&"
                | "("
                | ")"
                | ";"
                | ";;"
                | "\n"
                | "|"
                | "||"
                | "<"
                | ">"
                | ">|"
                | "<<"
                | ">>"
                | "<&"
                | ">&"
                | "<<-"
                | "<>"
        )
    }
}
1176
1177impl<R: ?Sized + std::io::BufRead> Iterator for Tokenizer<'_, R> {
1178 type Item = Result<TokenizeResult, TokenizerError>;
1179
1180 fn next(&mut self) -> Option<Self::Item> {
1181 match self.next_token() {
1182 #[allow(clippy::manual_map)]
1183 Ok(result) => match result.token {
1184 Some(_) => Some(Ok(result)),
1185 None => None,
1186 },
1187 Err(e) => Some(Err(e)),
1188 }
1189 }
1190}
1191
/// Returns whether `c` is a non-newline blank (space or tab).
fn is_blank(c: char) -> bool {
    matches!(c, ' ' | '\t')
}
1195
1196fn does_char_newly_affect_quoting(state: &TokenParseState, c: char) -> bool {
1197 if state.in_escape {
1199 return false;
1200 }
1201
1202 match state.quote_mode {
1203 QuoteMode::Double(_) => {
1205 if c == '\\' {
1206 true
1208 } else {
1209 false
1210 }
1211 }
1212 QuoteMode::Single(_) => false,
1214 QuoteMode::None => is_quoting_char(c),
1217 }
1218}
1219
/// Returns whether `c` is a shell quoting character: backslash, single
/// quote, or double quote.
fn is_quoting_char(c: char) -> bool {
    c == '\\' || c == '\'' || c == '"'
}
1223
/// Returns a copy of `s` with shell quoting removed: backslash escapes are
/// resolved to the escaped character, and bare quote characters are dropped.
pub fn unquote_str(s: &str) -> String {
    let mut unquoted = String::with_capacity(s.len());

    let mut escaped = false;
    for c in s.chars() {
        if escaped {
            // The character after a backslash is kept literally.
            unquoted.push(c);
            escaped = false;
        } else if c == '\\' {
            escaped = true;
        } else if c == '\'' || c == '"' {
            // Unescaped quote characters are removed.
        } else {
            unquoted.push(c);
        }
    }

    unquoted
}
1247
1248#[cfg(test)]
1249mod tests {
1250 use super::*;
1251 use anyhow::Result;
1252 use pretty_assertions::{assert_eq, assert_matches};
1254
1255 #[test]
1256 fn tokenize_empty() -> Result<()> {
1257 let tokens = tokenize_str("")?;
1258 assert_eq!(tokens.len(), 0);
1259 Ok(())
1260 }
1261
1262 #[test]
1263 fn tokenize_line_continuation() -> Result<()> {
1264 let tokens = tokenize_str(
1265 r"a\
1266bc",
1267 )?;
1268 assert_matches!(
1269 &tokens[..],
1270 [t1 @ Token::Word(..)] if t1.to_str() == "abc"
1271 );
1272 Ok(())
1273 }
1274
1275 #[test]
1276 fn tokenize_operators() -> Result<()> {
1277 assert_matches!(
1278 &tokenize_str("a>>b")?[..],
1279 [t1 @ Token::Word(..), t2 @ Token::Operator(..), t3 @ Token::Word(..)] if
1280 t1.to_str() == "a" &&
1281 t2.to_str() == ">>" &&
1282 t3.to_str() == "b"
1283 );
1284 Ok(())
1285 }
1286
1287 #[test]
1288 fn tokenize_comment() -> Result<()> {
1289 let tokens = tokenize_str(
1290 r#"a #comment
1291"#,
1292 )?;
1293 assert_matches!(
1294 &tokens[..],
1295 [t1 @ Token::Word(..), t2 @ Token::Operator(..)] if
1296 t1.to_str() == "a" &&
1297 t2.to_str() == "\n"
1298 );
1299 Ok(())
1300 }
1301
1302 #[test]
1303 fn tokenize_comment_at_eof() -> Result<()> {
1304 assert_matches!(
1305 &tokenize_str(r#"a #comment"#)?[..],
1306 [t1 @ Token::Word(..)] if t1.to_str() == "a"
1307 );
1308 Ok(())
1309 }
1310
1311 #[test]
1312 fn tokenize_empty_here_doc() -> Result<()> {
1313 let tokens = tokenize_str(
1314 r#"cat <<HERE
1315HERE
1316"#,
1317 )?;
1318 assert_matches!(
1319 &tokens[..],
1320 [t1 @ Token::Word(..),
1321 t2 @ Token::Operator(..),
1322 t3 @ Token::Word(..),
1323 t4 @ Token::Word(..),
1324 t5 @ Token::Word(..),
1325 t6 @ Token::Operator(..)] if
1326 t1.to_str() == "cat" &&
1327 t2.to_str() == "<<" &&
1328 t3.to_str() == "HERE" &&
1329 t4.to_str() == "" &&
1330 t5.to_str() == "HERE" &&
1331 t6.to_str() == "\n"
1332 );
1333 Ok(())
1334 }
1335
1336 #[test]
1337 fn tokenize_here_doc() -> Result<()> {
1338 let tokens = tokenize_str(
1339 r#"cat <<HERE
1340SOMETHING
1341HERE
1342echo after
1343"#,
1344 )?;
1345 assert_matches!(
1346 &tokens[..],
1347 [t1 @ Token::Word(..),
1348 t2 @ Token::Operator(..),
1349 t3 @ Token::Word(..),
1350 t4 @ Token::Word(..),
1351 t5 @ Token::Word(..),
1352 t6 @ Token::Operator(..),
1353 t7 @ Token::Word(..),
1354 t8 @ Token::Word(..),
1355 t9 @ Token::Operator(..)] if
1356 t1.to_str() == "cat" &&
1357 t2.to_str() == "<<" &&
1358 t3.to_str() == "HERE" &&
1359 t4.to_str() == "SOMETHING\n" &&
1360 t5.to_str() == "HERE" &&
1361 t6.to_str() == "\n" &&
1362 t7.to_str() == "echo" &&
1363 t8.to_str() == "after" &&
1364 t9.to_str() == "\n"
1365 );
1366 Ok(())
1367 }
1368
1369 #[test]
1370 fn tokenize_here_doc_with_tab_removal() -> Result<()> {
1371 let tokens = tokenize_str(
1372 r#"cat <<-HERE
1373 SOMETHING
1374 HERE
1375"#,
1376 )?;
1377 assert_matches!(
1378 &tokens[..],
1379 [t1 @ Token::Word(..),
1380 t2 @ Token::Operator(..),
1381 t3 @ Token::Word(..),
1382 t4 @ Token::Word(..),
1383 t5 @ Token::Word(..),
1384 t6 @ Token::Operator(..)] if
1385 t1.to_str() == "cat" &&
1386 t2.to_str() == "<<-" &&
1387 t3.to_str() == "HERE" &&
1388 t4.to_str() == "SOMETHING\n" &&
1389 t5.to_str() == "HERE" &&
1390 t6.to_str() == "\n"
1391 );
1392 Ok(())
1393 }
1394
1395 #[test]
1396 fn tokenize_here_doc_with_other_tokens() -> Result<()> {
1397 let tokens = tokenize_str(
1398 r#"cat <<EOF | wc -l
1399A B C
14001 2 3
1401D E F
1402EOF
1403"#,
1404 )?;
1405 assert_matches!(
1406 &tokens[..],
1407 [t1 @ Token::Word(..),
1408 t2 @ Token::Operator(..),
1409 t3 @ Token::Word(..),
1410 t4 @ Token::Word(..),
1411 t5 @ Token::Word(..),
1412 t6 @ Token::Operator(..),
1413 t7 @ Token::Word(..),
1414 t8 @ Token::Word(..),
1415 t9 @ Token::Operator(..)] if
1416 t1.to_str() == "cat" &&
1417 t2.to_str() == "<<" &&
1418 t3.to_str() == "EOF" &&
1419 t4.to_str() == "A B C\n1 2 3\nD E F\n" &&
1420 t5.to_str() == "EOF" &&
1421 t6.to_str() == "|" &&
1422 t7.to_str() == "wc" &&
1423 t8.to_str() == "-l" &&
1424 t9.to_str() == "\n"
1425 );
1426
1427 Ok(())
1428 }
1429
1430 #[test]
1431 fn tokenize_multiple_here_docs() -> Result<()> {
1432 let tokens = tokenize_str(
1433 r#"cat <<HERE1 <<HERE2
1434SOMETHING
1435HERE1
1436OTHER
1437HERE2
1438echo after
1439"#,
1440 )?;
1441 assert_matches!(
1442 &tokens[..],
1443 [t1 @ Token::Word(..),
1444 t2 @ Token::Operator(..),
1445 t3 @ Token::Word(..),
1446 t4 @ Token::Word(..),
1447 t5 @ Token::Word(..),
1448 t6 @ Token::Operator(..),
1449 t7 @ Token::Word(..),
1450 t8 @ Token::Word(..),
1451 t9 @ Token::Word(..),
1452 t10 @ Token::Operator(..),
1453 t11 @ Token::Word(..),
1454 t12 @ Token::Word(..),
1455 t13 @ Token::Operator(..)] if
1456 t1.to_str() == "cat" &&
1457 t2.to_str() == "<<" &&
1458 t3.to_str() == "HERE1" &&
1459 t4.to_str() == "SOMETHING\n" &&
1460 t5.to_str() == "HERE1" &&
1461 t6.to_str() == "<<" &&
1462 t7.to_str() == "HERE2" &&
1463 t8.to_str() == "OTHER\n" &&
1464 t9.to_str() == "HERE2" &&
1465 t10.to_str() == "\n" &&
1466 t11.to_str() == "echo" &&
1467 t12.to_str() == "after" &&
1468 t13.to_str() == "\n"
1469 );
1470 Ok(())
1471 }
1472
1473 #[test]
1474 fn tokenize_unterminated_here_doc() -> Result<()> {
1475 let result = tokenize_str(
1476 r#"cat <<HERE
1477SOMETHING
1478"#,
1479 );
1480 assert!(result.is_err());
1481 Ok(())
1482 }
1483
1484 #[test]
1485 fn tokenize_missing_here_tag() -> Result<()> {
1486 let result = tokenize_str(
1487 r"cat <<
1488",
1489 );
1490 assert!(result.is_err());
1491 Ok(())
1492 }
1493
1494 #[test]
1495 fn tokenize_here_doc_in_command_substitution() -> Result<()> {
1496 let tokens = tokenize_str(
1497 r#"echo $(cat <<HERE
1498TEXT
1499HERE
1500)"#,
1501 )?;
1502 assert_matches!(
1503 &tokens[..],
1504 [t1 @ Token::Word(..),
1505 t2 @ Token::Word(..)] if
1506 t1.to_str() == "echo" &&
1507 t2.to_str() == "$(cat <<HERE\nTEXT\nHERE\n)"
1508 );
1509 Ok(())
1510 }
1511
    /// Two here-documents plus a pipeline inside `$( … )` all collapse into a
    /// single command-substitution word.
    #[test]
    fn tokenize_complex_here_docs_in_command_substitution() -> Result<()> {
        let tokens = tokenize_str(
            r#"echo $(cat <<HERE1 <<HERE2 | wc -l
TEXT
HERE1
OTHER
HERE2
)"#,
        )?;
        // NOTE(review): the expected re-serialized word has `|wc` with no
        // space after the pipe, even though the input had `| wc` — this pins
        // the tokenizer's current normalization of the inner operator spacing.
        assert_matches!(
            &tokens[..],
            [t1 @ Token::Word(..),
            t2 @ Token::Word(..)] if
                t1.to_str() == "echo" &&
                t2.to_str() == "$(cat <<HERE1 <<HERE2 |wc -l\nTEXT\nHERE1\nOTHER\nHERE2\n)"
        );
        Ok(())
    }
1531
1532 #[test]
1533 fn tokenize_simple_backquote() -> Result<()> {
1534 assert_matches!(
1535 &tokenize_str(r#"echo `echo hi`"#)?[..],
1536 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1537 t1.to_str() == "echo" &&
1538 t2.to_str() == "`echo hi`"
1539 );
1540 Ok(())
1541 }
1542
1543 #[test]
1544 fn tokenize_backquote_with_escape() -> Result<()> {
1545 assert_matches!(
1546 &tokenize_str(r"echo `echo\`hi`")?[..],
1547 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1548 t1.to_str() == "echo" &&
1549 t2.to_str() == r"`echo\`hi`"
1550 );
1551 Ok(())
1552 }
1553
1554 #[test]
1555 fn tokenize_unterminated_backquote() {
1556 assert_matches!(
1557 tokenize_str("`"),
1558 Err(TokenizerError::UnterminatedBackquote(_))
1559 );
1560 }
1561
1562 #[test]
1563 fn tokenize_unterminated_command_substitution() {
1564 assert_matches!(
1565 tokenize_str("$("),
1566 Err(TokenizerError::UnterminatedCommandSubstitution)
1567 );
1568 }
1569
1570 #[test]
1571 fn tokenize_command_substitution() -> Result<()> {
1572 assert_matches!(
1573 &tokenize_str("a$(echo hi)b c")?[..],
1574 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1575 t1.to_str() == "a$(echo hi)b" &&
1576 t2.to_str() == "c"
1577 );
1578 Ok(())
1579 }
1580
1581 #[test]
1582 fn tokenize_command_substitution_containing_extglob() -> Result<()> {
1583 assert_matches!(
1584 &tokenize_str("echo $(echo !(x))")?[..],
1585 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1586 t1.to_str() == "echo" &&
1587 t2.to_str() == "$(echo !(x))"
1588 );
1589 Ok(())
1590 }
1591
1592 #[test]
1593 fn tokenize_arithmetic_expression() -> Result<()> {
1594 assert_matches!(
1595 &tokenize_str("a$((1+2))b c")?[..],
1596 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1597 t1.to_str() == "a$((1+2))b" &&
1598 t2.to_str() == "c"
1599 );
1600 Ok(())
1601 }
1602
1603 #[test]
1604 fn tokenize_arithmetic_expression_with_space() -> Result<()> {
1605 assert_matches!(
1608 &tokenize_str("$(( 1 ))")?[..],
1609 [t1 @ Token::Word(..)] if
1610 t1.to_str() == "$((1 ))"
1611 );
1612 Ok(())
1613 }
1614 #[test]
1615 fn tokenize_arithmetic_expression_with_parens() -> Result<()> {
1616 assert_matches!(
1617 &tokenize_str("$(( (0) ))")?[..],
1618 [t1 @ Token::Word(..)] if
1619 t1.to_str() == "$(((0)))"
1620 );
1621 Ok(())
1622 }
1623
1624 #[test]
1625 fn tokenize_special_parameters() -> Result<()> {
1626 assert_matches!(
1627 &tokenize_str("$$")?[..],
1628 [t1 @ Token::Word(..)] if t1.to_str() == "$$"
1629 );
1630 assert_matches!(
1631 &tokenize_str("$@")?[..],
1632 [t1 @ Token::Word(..)] if t1.to_str() == "$@"
1633 );
1634 assert_matches!(
1635 &tokenize_str("$!")?[..],
1636 [t1 @ Token::Word(..)] if t1.to_str() == "$!"
1637 );
1638 assert_matches!(
1639 &tokenize_str("$?")?[..],
1640 [t1 @ Token::Word(..)] if t1.to_str() == "$?"
1641 );
1642 assert_matches!(
1643 &tokenize_str("$*")?[..],
1644 [t1 @ Token::Word(..)] if t1.to_str() == "$*"
1645 );
1646 Ok(())
1647 }
1648
1649 #[test]
1650 fn tokenize_unbraced_parameter_expansion() -> Result<()> {
1651 assert_matches!(
1652 &tokenize_str("$x")?[..],
1653 [t1 @ Token::Word(..)] if t1.to_str() == "$x"
1654 );
1655 assert_matches!(
1656 &tokenize_str("a$x")?[..],
1657 [t1 @ Token::Word(..)] if t1.to_str() == "a$x"
1658 );
1659 Ok(())
1660 }
1661
1662 #[test]
1663 fn tokenize_unterminated_parameter_expansion() {
1664 assert_matches!(
1665 tokenize_str("${x"),
1666 Err(TokenizerError::UnterminatedVariable)
1667 );
1668 }
1669
1670 #[test]
1671 fn tokenize_braced_parameter_expansion() -> Result<()> {
1672 assert_matches!(
1673 &tokenize_str("${x}")?[..],
1674 [t1 @ Token::Word(..)] if t1.to_str() == "${x}"
1675 );
1676 assert_matches!(
1677 &tokenize_str("a${x}b")?[..],
1678 [t1 @ Token::Word(..)] if t1.to_str() == "a${x}b"
1679 );
1680 Ok(())
1681 }
1682
1683 #[test]
1684 fn tokenize_braced_parameter_expansion_with_escaping() -> Result<()> {
1685 assert_matches!(
1686 &tokenize_str(r"a${x\}}b")?[..],
1687 [t1 @ Token::Word(..)] if t1.to_str() == r"a${x\}}b"
1688 );
1689 Ok(())
1690 }
1691
1692 #[test]
1693 fn tokenize_whitespace() -> Result<()> {
1694 assert_matches!(
1695 &tokenize_str("1 2 3")?[..],
1696 [t1 @ Token::Word(..), t2 @ Token::Word(..), t3 @ Token::Word(..)] if
1697 t1.to_str() == "1" &&
1698 t2.to_str() == "2" &&
1699 t3.to_str() == "3"
1700 );
1701 Ok(())
1702 }
1703
1704 #[test]
1705 fn tokenize_escaped_whitespace() -> Result<()> {
1706 assert_matches!(
1707 &tokenize_str(r"1\ 2 3")?[..],
1708 [t1 @ Token::Word(..), t2 @ Token::Word(..)] if
1709 t1.to_str() == r"1\ 2" &&
1710 t2.to_str() == "3"
1711 );
1712 Ok(())
1713 }
1714
1715 #[test]
1716 fn tokenize_single_quote() -> Result<()> {
1717 assert_matches!(
1718 &tokenize_str(r"x'a b'y")?[..],
1719 [t1 @ Token::Word(..)] if
1720 t1.to_str() == r"x'a b'y"
1721 );
1722 Ok(())
1723 }
1724
1725 #[test]
1726 fn tokenize_double_quote() -> Result<()> {
1727 assert_matches!(
1728 &tokenize_str(r#"x"a b"y"#)?[..],
1729 [t1 @ Token::Word(..)] if
1730 t1.to_str() == r#"x"a b"y"#
1731 );
1732 Ok(())
1733 }
1734
1735 #[test]
1736 fn tokenize_double_quoted_command_substitution() -> Result<()> {
1737 assert_matches!(
1738 &tokenize_str(r#"x"$(echo hi)"y"#)?[..],
1739 [t1 @ Token::Word(..)] if
1740 t1.to_str() == r#"x"$(echo hi)"y"#
1741 );
1742 Ok(())
1743 }
1744
1745 #[test]
1746 fn tokenize_double_quoted_arithmetic_expression() -> Result<()> {
1747 assert_matches!(
1748 &tokenize_str(r#"x"$((1+2))"y"#)?[..],
1749 [t1 @ Token::Word(..)] if
1750 t1.to_str() == r#"x"$((1+2))"y"#
1751 );
1752 Ok(())
1753 }
1754
1755 #[test]
1756 fn test_quote_removal() {
1757 assert_eq!(unquote_str(r#""hello""#), "hello");
1758 assert_eq!(unquote_str(r#"'hello'"#), "hello");
1759 assert_eq!(unquote_str(r#""hel\"lo""#), r#"hel"lo"#);
1760 assert_eq!(unquote_str(r#"'hel\'lo'"#), r#"hel'lo"#);
1761 }
1762}