Skip to main content

shuck_parser/parser/lexer/
tokens.rs

1use super::*;
2
3impl<'a> Lexer<'a> {
4    /// Get the next source-backed token from the input, skipping line comments.
5    ///
6    /// Returned tokens expose their [`TokenKind`] and source [`Span`]. Comments
7    /// are omitted from this public stream; the parser uses an internal variant
8    /// when it needs to preserve them for AST attachment.
9    pub fn next_lexed_token(&mut self) -> Option<LexedToken<'a>> {
10        self.skip_whitespace();
11        let start = self.current_position();
12        let token = self.next_lexed_token_inner(false)?;
13        let end = self.current_position();
14        Some(token.with_span(Span::from_positions(start, end)))
15    }
16
17    /// Get the next source-backed token from the input, preserving line comments.
18    pub(in crate::parser) fn next_lexed_token_with_comments(&mut self) -> Option<LexedToken<'a>> {
19        self.skip_whitespace();
20        let start = self.current_position();
21        let token = self.next_lexed_token_inner(true)?;
22        let end = self.current_position();
23        Some(token.with_span(Span::from_positions(start, end)))
24    }
25
26    /// Internal: get next token without recording position (called after whitespace skip)
27    pub(in crate::parser) fn next_lexed_token_inner(
28        &mut self,
29        preserve_comments: bool,
30    ) -> Option<LexedToken<'a>> {
31        let ch = self.peek_char()?;
32
33        match ch {
34            '\n' => {
35                self.consume_ascii_chars(1);
36                Some(LexedToken::punctuation(TokenKind::Newline))
37            }
38            ';' => {
39                if self.second_char() == Some(';') {
40                    if self.third_char() == Some('&') {
41                        self.consume_ascii_chars(3);
42                        Some(LexedToken::punctuation(TokenKind::DoubleSemiAmp)) // ;;&
43                    } else {
44                        self.consume_ascii_chars(2);
45                        Some(LexedToken::punctuation(TokenKind::DoubleSemicolon)) // ;;
46                    }
47                } else if self.second_char() == Some('|') {
48                    self.consume_ascii_chars(2);
49                    Some(LexedToken::punctuation(TokenKind::SemiPipe)) // ;|
50                } else if self.second_char() == Some('&') {
51                    self.consume_ascii_chars(2);
52                    Some(LexedToken::punctuation(TokenKind::SemiAmp)) // ;&
53                } else {
54                    self.consume_ascii_chars(1);
55                    Some(LexedToken::punctuation(TokenKind::Semicolon))
56                }
57            }
58            '|' => {
59                if self.second_char() == Some('|') {
60                    self.consume_ascii_chars(2);
61                    Some(LexedToken::punctuation(TokenKind::Or))
62                } else if self.second_char() == Some('&') {
63                    self.consume_ascii_chars(2);
64                    Some(LexedToken::punctuation(TokenKind::PipeBoth))
65                } else {
66                    self.consume_ascii_chars(1);
67                    Some(LexedToken::punctuation(TokenKind::Pipe))
68                }
69            }
70            '&' => {
71                if self.second_char() == Some('&') {
72                    self.consume_ascii_chars(2);
73                    Some(LexedToken::punctuation(TokenKind::And))
74                } else if self.second_char() == Some('>') {
75                    if self.third_char() == Some('>') {
76                        self.consume_ascii_chars(3);
77                        Some(LexedToken::punctuation(TokenKind::RedirectBothAppend))
78                    } else {
79                        self.consume_ascii_chars(2);
80                        Some(LexedToken::punctuation(TokenKind::RedirectBoth))
81                    }
82                } else if self.second_char() == Some('|') {
83                    self.consume_ascii_chars(2);
84                    Some(LexedToken::punctuation(TokenKind::BackgroundPipe))
85                } else if self.second_char() == Some('!') {
86                    self.consume_ascii_chars(2);
87                    Some(LexedToken::punctuation(TokenKind::BackgroundBang))
88                } else {
89                    self.consume_ascii_chars(1);
90                    Some(LexedToken::punctuation(TokenKind::Background))
91                }
92            }
93            '>' => {
94                if self.second_char() == Some('>') {
95                    if self.third_char() == Some('|') {
96                        self.consume_ascii_chars(3);
97                    } else {
98                        self.consume_ascii_chars(2);
99                    }
100                    Some(LexedToken::punctuation(TokenKind::RedirectAppend))
101                } else if self.second_char() == Some('|') {
102                    self.consume_ascii_chars(2);
103                    Some(LexedToken::punctuation(TokenKind::Clobber))
104                } else if self.second_char() == Some('(') {
105                    self.consume_ascii_chars(2);
106                    Some(LexedToken::punctuation(TokenKind::ProcessSubOut))
107                } else if self.second_char() == Some('&') {
108                    self.consume_ascii_chars(2);
109                    Some(LexedToken::punctuation(TokenKind::DupOutput))
110                } else {
111                    self.consume_ascii_chars(1);
112                    Some(LexedToken::punctuation(TokenKind::RedirectOut))
113                }
114            }
115            '<' => {
116                if self.second_char() == Some('<') {
117                    if self.third_char() == Some('<') {
118                        self.consume_ascii_chars(3);
119                        Some(LexedToken::punctuation(TokenKind::HereString))
120                    } else if self.third_char() == Some('-') {
121                        self.consume_ascii_chars(3);
122                        Some(LexedToken::punctuation(TokenKind::HereDocStrip))
123                    } else {
124                        self.consume_ascii_chars(2);
125                        Some(LexedToken::punctuation(TokenKind::HereDoc))
126                    }
127                } else if self.second_char() == Some('>') {
128                    self.consume_ascii_chars(2);
129                    Some(LexedToken::punctuation(TokenKind::RedirectReadWrite))
130                } else if self.second_char() == Some('(') {
131                    self.consume_ascii_chars(2);
132                    Some(LexedToken::punctuation(TokenKind::ProcessSubIn))
133                } else if self.second_char() == Some('&') {
134                    self.consume_ascii_chars(2);
135                    Some(LexedToken::punctuation(TokenKind::DupInput))
136                } else {
137                    self.consume_ascii_chars(1);
138                    Some(LexedToken::punctuation(TokenKind::RedirectIn))
139                }
140            }
141            '(' => {
142                if self.second_char() == Some('(') {
143                    self.consume_ascii_chars(2);
144                    Some(LexedToken::punctuation(TokenKind::DoubleLeftParen))
145                } else {
146                    self.consume_ascii_chars(1);
147                    Some(LexedToken::punctuation(TokenKind::LeftParen))
148                }
149            }
150            ')' => {
151                if self.second_char() == Some(')') {
152                    self.consume_ascii_chars(2);
153                    Some(LexedToken::punctuation(TokenKind::DoubleRightParen))
154                } else {
155                    self.consume_ascii_chars(1);
156                    Some(LexedToken::punctuation(TokenKind::RightParen))
157                }
158            }
159            '{' => {
160                let start = self.current_position();
161                if self.ignore_braces_enabled() {
162                    self.consume_ascii_chars(1);
163                    match self.peek_char() {
164                        Some(' ') | Some('\t') | Some('\n') | None => {
165                            Some(LexedToken::borrowed_word(TokenKind::Word, "{", None))
166                        }
167                        _ => self.read_word_starting_with("{", start),
168                    }
169                } else if self.looks_like_brace_expansion() {
170                    // Look ahead to see if this is a brace expansion like {a,b,c} or {1..5}
171                    // vs a brace group like { cmd; }
172                    // Note: { must be followed by space/newline to be a brace group
173                    self.read_brace_expansion_word()
174                } else if self.is_brace_group_start() {
175                    self.advance();
176                    Some(LexedToken::punctuation(TokenKind::LeftBrace))
177                } else if self.brace_literal_starts_case_pattern_delimiter() {
178                    self.read_word_starting_with("{", start)
179                } else {
180                    self.read_brace_literal_word()
181                }
182            }
183            '}' => {
184                self.consume_ascii_chars(1);
185                if self.ignore_close_braces_enabled() {
186                    Some(LexedToken::borrowed_word(TokenKind::Word, "}", None))
187                } else {
188                    Some(LexedToken::punctuation(TokenKind::RightBrace))
189                }
190            }
191            '[' => {
192                let start = self.current_position();
193                self.consume_ascii_chars(1);
194                if self.peek_char() == Some('[')
195                    && matches!(
196                        self.second_char(),
197                        Some(' ') | Some('\t') | Some('\n') | None
198                    )
199                {
200                    self.consume_ascii_chars(1);
201                    Some(LexedToken::punctuation(TokenKind::DoubleLeftBracket))
202                } else {
203                    // `[` can start the test command when followed by whitespace, or it can be
204                    // ordinary word text such as a glob bracket expression.
205                    //
206                    // Read the whole token with the normal word scanner so forms like `[[z]`,
207                    // `[hello"]"`, and `[+(])` stay attached to one word instead of producing
208                    // structural tokens mid-word.
209                    match self.peek_char() {
210                        Some(' ') | Some('\t') | Some('\n') | None => {
211                            Some(LexedToken::borrowed_word(TokenKind::Word, "[", None))
212                        }
213                        _ => self.read_word_starting_with("[", start),
214                    }
215                }
216            }
217            ']' => {
218                if self.second_char() == Some(']') {
219                    self.consume_ascii_chars(2);
220                    Some(LexedToken::punctuation(TokenKind::DoubleRightBracket))
221                } else {
222                    self.consume_ascii_chars(1);
223                    Some(LexedToken::borrowed_word(TokenKind::Word, "]", None))
224                }
225            }
226            '\'' => self.read_single_quoted_string(),
227            '"' => self.read_double_quoted_string(),
228            '#' => {
229                if self.should_treat_hash_as_word_char() {
230                    let start = self.current_position();
231                    return self.read_word_starting_with("#", start);
232                }
233                if preserve_comments {
234                    self.read_comment();
235                    Some(LexedToken::comment())
236                } else {
237                    self.skip_comment();
238                    self.next_lexed_token_inner(false)
239                }
240            }
241            // Handle file descriptor redirects like 2> or 2>&1
242            '0'..='9' => self.read_word_or_fd_redirect(),
243            _ => self.read_word(),
244        }
245    }
246
247    pub(in crate::parser) fn skip_whitespace(&mut self) {
248        while let Some(ch) = self.peek_char() {
249            if self.reinject_buf.is_empty() {
250                let whitespace_len = self.source_horizontal_whitespace_len();
251                if whitespace_len > 0 {
252                    self.consume_source_bytes(whitespace_len);
253                    continue;
254                }
255
256                if self.cursor.rest().starts_with("\\\n") {
257                    self.consume_source_bytes(2);
258                    continue;
259                }
260            }
261
262            if ch == ' ' || ch == '\t' {
263                self.consume_ascii_chars(1);
264            } else if ch == '\\' {
265                // Check for backslash-newline (line continuation) between tokens
266                if self.second_char() == Some('\n') {
267                    self.consume_ascii_chars(2);
268                } else {
269                    break;
270                }
271            } else {
272                break;
273            }
274        }
275    }
276
277    pub(in crate::parser) fn skip_comment(&mut self) {
278        if self.reinject_buf.is_empty() {
279            let end = self
280                .cursor
281                .find_byte(b'\n')
282                .unwrap_or(self.cursor.rest().len());
283            self.consume_source_bytes(end);
284            return;
285        }
286
287        while let Some(ch) = self.peek_char() {
288            if ch == '\n' {
289                break;
290            }
291            self.advance();
292        }
293    }
294
295    pub(in crate::parser) fn read_comment(&mut self) {
296        debug_assert_eq!(self.peek_char(), Some('#'));
297
298        if self.reinject_buf.is_empty() {
299            let rest = self.cursor.rest();
300            let end = self.cursor.find_byte(b'\n').unwrap_or(rest.len());
301            self.consume_source_bytes(end);
302            return;
303        }
304
305        self.advance(); // consume '#'
306
307        while let Some(ch) = self.peek_char() {
308            if ch == '\n' {
309                break;
310            }
311            self.advance();
312        }
313    }
314
315    pub(in crate::parser) fn is_inside_unclosed_double_paren_on_line(&self) -> bool {
316        if !self.reinject_buf.is_empty() || self.offset > self.input.len() {
317            return false;
318        }
319
320        let line_start = self.input[..self.offset]
321            .rfind('\n')
322            .map_or(0, |index| index + 1);
323        let prefix = &self.input[line_start..self.offset];
324        line_has_unclosed_double_paren(prefix)
325    }
326}