oak_bash/lexer/
mod.rs

1use crate::{kind::BashSyntaxKind, language::BashLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3use std::sync::LazyLock;
4
5type State<S> = LexerState<S, BashLanguage>;
6
/// Lexer that turns Bash source text into a flat stream of `BashSyntaxKind` tokens.
///
/// The lexer borrows its [`BashLanguage`] configuration for the lifetime `'config`,
/// so cloning it only copies a reference.
#[derive(Clone)]
pub struct BashLexer<'config> {
    /// Language configuration handed to [`BashLexer::new`].
    /// NOTE(review): none of the lexing code visible in this file reads this field yet.
    config: &'config BashLanguage,
}
11
12impl<'config> BashLexer<'config> {
13    pub fn new(config: &'config BashLanguage) -> Self {
14        Self { config }
15    }
16
17    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
18        while state.not_at_end() {
19            if self.skip_whitespace(state) {
20                continue;
21            }
22
23            if self.skip_comment(state) {
24                continue;
25            }
26
27            if self.lex_newline(state) {
28                continue;
29            }
30
31            if self.lex_string(state) {
32                continue;
33            }
34
35            if self.lex_variable(state) {
36                continue;
37            }
38
39            if self.lex_number(state) {
40                continue;
41            }
42
43            if self.lex_keyword_or_identifier(state) {
44                continue;
45            }
46
47            if self.lex_operator_or_delimiter(state) {
48                continue;
49            }
50
51            if self.lex_heredoc(state) {
52                continue;
53            }
54
55            if self.lex_glob_pattern(state) {
56                continue;
57            }
58
59            if self.lex_special_char(state) {
60                continue;
61            }
62
63            if self.lex_text(state) {
64                continue;
65            }
66
67            // 如果没有匹配任何模式,跳过一个字符
68            state.advance(1);
69        }
70        Ok(())
71    }
72
73    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
74        let start_pos = state.get_position();
75
76        while let Some(ch) = state.peek() {
77            if ch == ' ' || ch == '\t' {
78                state.advance(ch.len_utf8());
79            }
80            else {
81                break;
82            }
83        }
84
85        if state.get_position() > start_pos {
86            state.add_token(BashSyntaxKind::Whitespace, start_pos, state.get_position());
87            true
88        }
89        else {
90            false
91        }
92    }
93
94    fn skip_comment<S: Source>(&self, state: &mut State<S>) -> bool {
95        let start_pos = state.get_position();
96
97        if let Some('#') = state.peek() {
98            state.advance(1);
99            while let Some(ch) = state.peek() {
100                if ch == '\n' || ch == '\r' {
101                    break;
102                }
103                state.advance(ch.len_utf8());
104            }
105            state.add_token(BashSyntaxKind::Comment, start_pos, state.get_position());
106            true
107        }
108        else {
109            false
110        }
111    }
112
113    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
114        let start_pos = state.get_position();
115
116        if let Some('\n') = state.peek() {
117            state.advance(1);
118            state.add_token(BashSyntaxKind::Newline, start_pos, state.get_position());
119            true
120        }
121        else if let Some('\r') = state.peek() {
122            state.advance(1);
123            if let Some('\n') = state.peek() {
124                state.advance(1);
125            }
126            state.add_token(BashSyntaxKind::Newline, start_pos, state.get_position());
127            true
128        }
129        else {
130            false
131        }
132    }
133
134    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
135        let start_pos = state.get_position();
136
137        if let Some(quote) = state.peek() {
138            if quote == '"' || quote == '\'' {
139                state.advance(1);
140                let mut escaped = false;
141
142                while let Some(ch) = state.peek() {
143                    if escaped {
144                        escaped = false;
145                        state.advance(ch.len_utf8());
146                        continue;
147                    }
148
149                    if ch == '\\' {
150                        escaped = true;
151                        state.advance(1);
152                        continue;
153                    }
154
155                    if ch == quote {
156                        state.advance(1);
157                        break;
158                    }
159
160                    state.advance(ch.len_utf8());
161                }
162
163                state.add_token(BashSyntaxKind::StringLiteral, start_pos, state.get_position());
164                return true;
165            }
166        }
167
168        false
169    }
170
171    fn lex_variable<S: Source>(&self, state: &mut State<S>) -> bool {
172        let start_pos = state.get_position();
173
174        if let Some('$') = state.peek() {
175            state.advance(1);
176
177            // 处理特殊变量 $0, $1, $?, $$ 等
178            if let Some(ch) = state.peek() {
179                if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '@' || ch == '*' {
180                    state.advance(1);
181                    state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
182                    return true;
183                }
184            }
185
186            // 处理 ${var} 形式
187            if let Some('{') = state.peek() {
188                state.advance(1);
189                while let Some(ch) = state.peek() {
190                    if ch == '}' {
191                        state.advance(1);
192                        break;
193                    }
194                    state.advance(ch.len_utf8());
195                }
196                state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
197                return true;
198            }
199
200            // 处理普通变量名
201            if let Some(ch) = state.peek() {
202                if ch.is_alphabetic() || ch == '_' {
203                    state.advance(ch.len_utf8());
204                    while let Some(ch) = state.peek() {
205                        if ch.is_alphanumeric() || ch == '_' {
206                            state.advance(ch.len_utf8());
207                        }
208                        else {
209                            break;
210                        }
211                    }
212                    state.add_token(BashSyntaxKind::Variable, start_pos, state.get_position());
213                    return true;
214                }
215            }
216
217            // 如果只有 $ 没有有效变量名,回退
218            state.set_position(start_pos);
219        }
220
221        false
222    }
223
224    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
225        let start_pos = state.get_position();
226
227        if let Some(ch) = state.peek() {
228            if ch.is_ascii_digit() {
229                state.advance(1);
230                while let Some(ch) = state.peek() {
231                    if ch.is_ascii_digit() {
232                        state.advance(1);
233                    }
234                    else {
235                        break;
236                    }
237                }
238                state.add_token(BashSyntaxKind::NumberLiteral, start_pos, state.get_position());
239                return true;
240            }
241        }
242
243        false
244    }
245
246    fn lex_keyword_or_identifier<S: Source>(&self, state: &mut State<S>) -> bool {
247        let start_pos = state.get_position();
248
249        if let Some(ch) = state.peek() {
250            if ch.is_alphabetic() || ch == '_' {
251                state.advance(ch.len_utf8());
252                while let Some(ch) = state.peek() {
253                    if ch.is_alphanumeric() || ch == '_' {
254                        state.advance(ch.len_utf8());
255                    }
256                    else {
257                        break;
258                    }
259                }
260
261                let text = state.get_text_in((start_pos..state.get_position()).into());
262                let kind = if BASH_KEYWORDS.contains(&text) { BashSyntaxKind::Keyword } else { BashSyntaxKind::Identifier };
263
264                state.add_token(kind, start_pos, state.get_position());
265                return true;
266            }
267        }
268
269        false
270    }
271
272    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
273        let start_pos = state.get_position();
274
275        if let Some(ch) = state.peek() {
276            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
277
278            // 检查双字符操作符
279            if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
280                state.advance(2);
281                state.add_token(BashSyntaxKind::Operator, start_pos, state.get_position());
282                return true;
283            }
284
285            // 检查单字符操作符和分隔符
286            let ch_str = ch.to_string();
287            if BASH_OPERATORS.contains(&ch_str.as_str()) {
288                state.advance(1);
289                state.add_token(BashSyntaxKind::Operator, start_pos, state.get_position());
290                return true;
291            }
292
293            if BASH_DELIMITERS.contains(&ch_str.as_str()) {
294                state.advance(1);
295                state.add_token(BashSyntaxKind::Delimiter, start_pos, state.get_position());
296                return true;
297            }
298        }
299
300        false
301    }
302
303    fn lex_heredoc<S: Source>(&self, state: &mut State<S>) -> bool {
304        let start_pos = state.get_position();
305
306        // 检查 << 开始的 heredoc
307        if let Some('<') = state.peek() {
308            if let Some('<') = state.peek_next_n(1) {
309                state.advance(2);
310
311                // 跳过可选的 -
312                if let Some('-') = state.peek() {
313                    state.advance(1);
314                }
315
316                // 读取标识符
317                while let Some(ch) = state.peek() {
318                    if ch.is_alphanumeric() || ch == '_' {
319                        state.advance(ch.len_utf8());
320                    }
321                    else {
322                        break;
323                    }
324                }
325
326                state.add_token(BashSyntaxKind::Heredoc, start_pos, state.get_position());
327                return true;
328            }
329        }
330
331        false
332    }
333
334    fn lex_glob_pattern<S: Source>(&self, state: &mut State<S>) -> bool {
335        let start_pos = state.get_position();
336
337        if let Some(ch) = state.peek() {
338            if ch == '*' || ch == '?' || ch == '[' {
339                state.advance(1);
340
341                if ch == '[' {
342                    // 处理字符类 [abc] 或 [!abc]
343                    if let Some('!') = state.peek() {
344                        state.advance(1);
345                    }
346                    while let Some(ch) = state.peek() {
347                        if ch == ']' {
348                            state.advance(1);
349                            break;
350                        }
351                        state.advance(ch.len_utf8());
352                    }
353                }
354
355                state.add_token(BashSyntaxKind::GlobPattern, start_pos, state.get_position());
356                return true;
357            }
358        }
359
360        false
361    }
362
363    fn lex_special_char<S: Source>(&self, state: &mut State<S>) -> bool {
364        let start_pos = state.get_position();
365
366        if let Some(ch) = state.peek() {
367            if BASH_SPECIAL_CHARS.contains(&ch) {
368                state.advance(1);
369                state.add_token(BashSyntaxKind::SpecialChar, start_pos, state.get_position());
370                return true;
371            }
372        }
373
374        false
375    }
376
377    fn lex_text<S: Source>(&self, state: &mut State<S>) -> bool {
378        let start_pos = state.get_position();
379
380        if let Some(ch) = state.peek() {
381            if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
382                state.advance(ch.len_utf8());
383                state.add_token(BashSyntaxKind::Text, start_pos, state.get_position());
384                return true;
385            }
386        }
387
388        false
389    }
390}
391
392impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
393    fn lex_incremental(
394        &self,
395        source: impl Source,
396        _changed: usize,
397        _cache: IncrementalCache<BashLanguage>,
398    ) -> LexOutput<BashLanguage> {
399        let mut state = LexerState::new_with_cache(source, _changed, _cache);
400        let result = self.run(&mut state);
401        if result.is_ok() {
402            let eof_pos = state.get_position();
403            state.add_token(BashSyntaxKind::Eof, eof_pos, eof_pos);
404        }
405        state.finish(result)
406    }
407}
408
/// Words reported as `Keyword` rather than `Identifier`.
///
/// The entries `"."`, `"["`, `"[["` and `"]]"` contain no identifier characters, so a
/// lookup keyed on an identifier-shaped word never matches them.
static BASH_KEYWORDS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        // Conditionals and loops.
        "if", "then", "else", "elif", "fi", "case", "esac",
        "for", "while", "until", "do", "done",
        // Functions and flow control.
        "function", "return", "break", "continue",
        // Declarations and environment.
        "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit",
        // Sourcing and evaluation.
        "source", ".", "eval", "exec", "trap", "wait",
        // Job control.
        "jobs", "bg", "fg", "disown", "suspend",
        // Aliases, history and arithmetic.
        "alias", "unalias", "history", "fc", "let",
        // Tests and miscellaneous reserved words.
        "test", "[", "[[", "]]", "time", "coproc", "select", "in",
    ]
});
417
/// Single-character operators, each reported as an `Operator` token.
static BASH_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        // Arithmetic.
        "+", "-", "*", "/", "%",
        // Assignment, negation and redirection/comparison.
        "=", "!", "<", ">",
        // Bitwise and pipeline.
        "&", "|", "^", "~",
    ]
});
419
/// Multi-character operators, each reported as a single `Operator` token.
///
/// Despite the name, the table also carries the three-character operators `<<=` and
/// `>>=`; a lookup has to be made with a three-character candidate for those to match.
static BASH_TWO_CHAR_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        // Comparison.
        "==", "!=", "<=", ">=",
        // Logical.
        "&&", "||",
        // Shifts and increment/decrement.
        "<<", ">>", "++", "--",
        // Compound assignment.
        "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=",
        // Exponentiation.
        "**",
    ]
});
426
/// Single-character delimiters, each reported as a `Delimiter` token.
static BASH_DELIMITERS: LazyLock<&[&str]> = LazyLock::new(|| {
    &[
        // Bracketing pairs.
        "(", ")", "{", "}", "[", "]",
        // Separators.
        ";", ",", ":", ".",
    ]
});
428
/// Punctuation characters reported as `SpecialChar` tokens and excluded from `Text`.
///
/// Each character is listed exactly once; membership is the only property callers
/// in this file rely on.
static BASH_SPECIAL_CHARS: LazyLock<&[char]> = LazyLock::new(|| {
    &[
        '\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', ':', ';',
        '"', '\'', '<', '>', ',', '.', '?', '/', '!',
    ]
});