// oak_bash/lexer/mod.rs — Bash lexer module.

#![doc = include_str!("readme.md")]
/// Bash token types and role definitions.
pub mod token_type;

pub use token_type::BashTokenType;

use crate::language::BashLanguage;
use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
use std::sync::LazyLock;
11pub(crate) type State<'a, S> = LexerState<'a, S, BashLanguage>;
12
/// Lexer for the Bash language.
#[derive(Clone)]
pub struct BashLexer<'config> {
    // Language configuration supplied at construction. Only stored by `new`;
    // no code in this module reads a field from it yet.
    config: &'config BashLanguage,
}
18
19impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
20    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<BashLanguage>) -> LexOutput<BashLanguage> {
21        let mut state = LexerState::new_with_cache(source, 0, cache);
22        let result = self.run(&mut state);
23        if result.is_ok() {
24            state.add_eof()
25        }
26        state.finish_with_cache(result, cache)
27    }
28}
29
30impl<'config> BashLexer<'config> {
    /// Creates a new `BashLexer` instance.
    ///
    /// Stores a borrow of `config` for the lexer's lifetime; nothing is
    /// copied out of it here.
    pub fn new(config: &'config BashLanguage) -> Self {
        Self { config }
    }
35
36    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
37        while state.not_at_end() {
38            let safe_point = state.get_position();
39            if self.skip_whitespace(state) {
40                continue;
41            }
42
43            if self.skip_comment(state) {
44                continue;
45            }
46
47            if self.lex_newline(state) {
48                continue;
49            }
50
51            if self.lex_string(state) {
52                continue;
53            }
54
55            if self.lex_variable(state) {
56                continue;
57            }
58
59            if self.lex_number(state) {
60                continue;
61            }
62
63            if self.lex_keyword_or_identifier(state) {
64                continue;
65            }
66
67            if self.lex_operator_or_delimiter(state) {
68                continue;
69            }
70
71            if self.lex_heredoc(state) {
72                continue;
73            }
74
75            if self.lex_glob_pattern(state) {
76                continue;
77            }
78
79            if self.lex_special_char(state) {
80                continue;
81            }
82
83            if self.lex_text(state) {
84                continue;
85            }
86
87            // If no pattern matches, skip one character and generate an Error token
88            let start_pos = state.get_position();
89            if let Some(ch) = state.peek() {
90                state.advance(ch.len_utf8());
91                state.add_token(BashTokenType::Error, start_pos, state.get_position())
92            }
93
94            state.advance_if_dead_lock(safe_point)
95        }
96        Ok(())
97    }
98
99    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
100        let start_pos = state.get_position();
101
102        while let Some(ch) = state.peek() {
103            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
104        }
105
106        if state.get_position() > start_pos {
107            state.add_token(BashTokenType::Whitespace, start_pos, state.get_position());
108            true
109        }
110        else {
111            false
112        }
113    }
114
115    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
116        let start_pos = state.get_position();
117
118        if let Some('#') = state.peek() {
119            state.advance(1);
120            while let Some(ch) = state.peek() {
121                if ch == '\n' || ch == '\r' {
122                    break;
123                }
124                state.advance(ch.len_utf8())
125            }
126            state.add_token(BashTokenType::Comment, start_pos, state.get_position());
127            true
128        }
129        else {
130            false
131        }
132    }
133
134    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
135        let start_pos = state.get_position();
136
137        if let Some('\n') = state.peek() {
138            state.advance(1);
139            state.add_token(BashTokenType::Newline, start_pos, state.get_position());
140            true
141        }
142        else if let Some('\r') = state.peek() {
143            state.advance(1);
144            if let Some('\n') = state.peek() {
145                state.advance(1)
146            }
147            state.add_token(BashTokenType::Newline, start_pos, state.get_position());
148            true
149        }
150        else {
151            false
152        }
153    }
154
155    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
156        let start_pos = state.get_position();
157
158        if let Some(quote) = state.peek() {
159            if quote == '"' || quote == '\'' {
160                state.advance(1);
161                let mut escaped = false;
162
163                while let Some(ch) = state.peek() {
164                    if escaped {
165                        escaped = false;
166                        state.advance(ch.len_utf8());
167                        continue;
168                    }
169
170                    if ch == '\\' {
171                        escaped = true;
172                        state.advance(1);
173                        continue;
174                    }
175
176                    if ch == quote {
177                        state.advance(1);
178                        break;
179                    }
180
181                    state.advance(ch.len_utf8())
182                }
183
184                state.add_token(BashTokenType::StringLiteral, start_pos, state.get_position());
185                return true;
186            }
187        }
188
189        false
190    }
191
192    fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
193        let start_pos = state.get_position();
194
195        if let Some('$') = state.peek() {
196            state.advance(1);
197
198            // Handle special variables like $0, $1, $?, $$, etc.
199            if let Some(ch) = state.peek() {
200                if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '@' || ch == '*' {
201                    state.advance(1);
202                    state.add_token(BashTokenType::Variable, start_pos, state.get_position());
203                    return true;
204                }
205            }
206
207            // Handle ${var} format
208            if let Some('{') = state.peek() {
209                state.advance(1);
210                while let Some(ch) = state.peek() {
211                    if ch == '}' {
212                        state.advance(1);
213                        break;
214                    }
215                    state.advance(ch.len_utf8())
216                }
217                state.add_token(BashTokenType::Variable, start_pos, state.get_position());
218                return true;
219            }
220
221            // Handle normal variable names
222            if let Some(ch) = state.peek() {
223                if ch.is_alphabetic() || ch == '_' {
224                    state.advance(ch.len_utf8());
225                    while let Some(ch) = state.peek() {
226                        if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
227                    }
228                    state.add_token(BashTokenType::Variable, start_pos, state.get_position());
229                    return true;
230                }
231            }
232
233            // If there is only $ without a valid variable name, backtrack
234            state.set_position(start_pos);
235        }
236
237        false
238    }
239
240    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
241        let start_pos = state.get_position();
242
243        if let Some(ch) = state.peek() {
244            if ch.is_ascii_digit() {
245                state.advance(1);
246                while let Some(ch) = state.peek() {
247                    if ch.is_ascii_digit() { state.advance(1) } else { break }
248                }
249                state.add_token(BashTokenType::NumberLiteral, start_pos, state.get_position());
250                return true;
251            }
252        }
253
254        false
255    }
256
257    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
258        let start_pos = state.get_position();
259
260        if let Some(ch) = state.peek() {
261            if ch.is_ascii_alphabetic() || ch == '_' {
262                state.advance(ch.len_utf8());
263                while let Some(ch) = state.peek() {
264                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
265                }
266
267                let text = state.get_text_in((start_pos..state.get_position()).into());
268                let kind = if BASH_KEYWORDS.contains(&text.as_ref()) { BashTokenType::Keyword } else { BashTokenType::Identifier };
269
270                state.add_token(kind, start_pos, state.get_position());
271                return true;
272            }
273        }
274
275        false
276    }
277
278    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
279        let start_pos = state.get_position();
280
281        if let Some(ch) = state.peek() {
282            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
283
284            // Check for two-character operators
285            if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
286                state.advance(2);
287                state.add_token(BashTokenType::Operator, start_pos, state.get_position());
288                return true;
289            }
290
291            // Check for single-character operators and delimiters
292            let ch_str = ch.to_string();
293            if BASH_OPERATORS.contains(&ch_str.as_str()) {
294                state.advance(1);
295                state.add_token(BashTokenType::Operator, start_pos, state.get_position());
296                return true;
297            }
298
299            if BASH_DELIMITERS.contains(&ch_str.as_str()) {
300                state.advance(1);
301                state.add_token(BashTokenType::Delimiter, start_pos, state.get_position());
302                return true;
303            }
304        }
305
306        false
307    }
308
309    fn lex_heredoc<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
310        let start_pos = state.get_position();
311
312        // Check for heredoc starting with <<
313        if let Some('<') = state.peek() {
314            if let Some('<') = state.peek_next_n(1) {
315                state.advance(2);
316
317                // Skip optional -
318                if let Some('-') = state.peek() {
319                    state.advance(1)
320                }
321
322                // Read identifier
323                while let Some(ch) = state.peek() {
324                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
325                }
326
327                state.add_token(BashTokenType::Heredoc, start_pos, state.get_position());
328                return true;
329            }
330        }
331
332        false
333    }
334
335    fn lex_glob_pattern<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
336        let start_pos = state.get_position();
337
338        if let Some(ch) = state.peek() {
339            if ch == '*' || ch == '?' || ch == '[' {
340                state.advance(1);
341
342                if ch == '[' {
343                    // Handle character classes [abc] or [!abc]
344                    if let Some('!') = state.peek() {
345                        state.advance(1)
346                    }
347                    while let Some(ch) = state.peek() {
348                        if ch == ']' {
349                            state.advance(1);
350                            break;
351                        }
352                        state.advance(ch.len_utf8())
353                    }
354                }
355
356                state.add_token(BashTokenType::GlobPattern, start_pos, state.get_position());
357                return true;
358            }
359        }
360
361        false
362    }
363
364    fn lex_special_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
365        let start_pos = state.get_position();
366
367        if let Some(ch) = state.peek() {
368            if BASH_SPECIAL_CHARS.contains(&ch) {
369                state.advance(1);
370                state.add_token(BashTokenType::SpecialChar, start_pos, state.get_position());
371                return true;
372            }
373        }
374
375        false
376    }
377
378    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
379        let start_pos = state.get_position();
380
381        if let Some(ch) = state.peek() {
382            if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
383                state.advance(ch.len_utf8());
384                state.add_token(BashTokenType::Text, start_pos, state.get_position());
385                return true;
386            }
387        }
388
389        false
390    }
391}
392
/// Reserved words and builtin-like command names classified as keywords.
///
/// A `&'static` slice literal is const-promoted and needs no runtime
/// initialization, so the pointless `LazyLock` wrapper was removed.
static BASH_KEYWORDS: &[&str] = &[
    "if", "then", "else", "elif", "fi", "case", "esac", "for", "while", "until", "do", "done", "function", "return", "break", "continue", "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit", "source", ".", "eval", "exec",
    "trap", "wait", "jobs", "bg", "fg", "disown", "suspend", "alias", "unalias", "history", "fc", "let", "test", "[", "[[", "]]", "time", "coproc", "select", "in",
];
399
400static BASH_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["+", "-", "*", "/", "%", "=", "!", "<", ">", "&", "|", "^", "~"]);
401
402static BASH_TWO_CHAR_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "**"]);
403
404static BASH_DELIMITERS: LazyLock<&[&str]> = LazyLock::new(|| &["(", ")", "{", "}", "[", "]", ";", ",", ":", "."]);
405
406static BASH_SPECIAL_CHARS: LazyLock<&[char]> = LazyLock::new(|| &['\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', '\\', ':', ';', '"', '\'', '<', '>', ',', '.', '?', '/', '!', '`']);