// oak_bash/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4pub use token_type::BashTokenType;
5
6use crate::language::BashLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8use std::sync::LazyLock;
9
/// Shorthand for the core lexer state specialized to the Bash language.
type State<'a, S> = LexerState<'a, S, BashLanguage>;
11
/// Lexer for Bash scripts, borrowing a shared language configuration.
#[derive(Clone)]
pub struct BashLexer<'config> {
    // Held for the lexer's lifetime but not read by any current rule
    // (underscore-prefixed); kept so rules can consult the config later.
    _config: &'config BashLanguage,
}
16
impl<'config> Lexer<BashLanguage> for BashLexer<'config> {
    /// Lexes `source` from offset 0 and returns the token stream.
    ///
    /// `_edits` is currently ignored — there is no incremental relexing yet;
    /// the cache is still consulted/updated via `new_with_cache` /
    /// `finish_with_cache`.
    fn lex<'a, S: Source + ?Sized>(&self, source: &S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<BashLanguage>) -> LexOutput<BashLanguage> {
        let mut state = LexerState::new_with_cache(source, 0, cache);
        let result = self.run(&mut state);
        // Only append the EOF token on success; a failed run is reported as-is.
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}
27
28impl<'config> BashLexer<'config> {
29    pub fn new(config: &'config BashLanguage) -> Self {
30        Self { _config: config }
31    }
32
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36            if self.skip_whitespace(state) {
37                continue;
38            }
39
40            if self.skip_comment(state) {
41                continue;
42            }
43
44            if self.lex_newline(state) {
45                continue;
46            }
47
48            if self.lex_string(state) {
49                continue;
50            }
51
52            if self.lex_variable(state) {
53                continue;
54            }
55
56            if self.lex_number(state) {
57                continue;
58            }
59
60            if self.lex_keyword_or_identifier(state) {
61                continue;
62            }
63
64            if self.lex_operator_or_delimiter(state) {
65                continue;
66            }
67
68            if self.lex_heredoc(state) {
69                continue;
70            }
71
72            if self.lex_glob_pattern(state) {
73                continue;
74            }
75
76            if self.lex_special_char(state) {
77                continue;
78            }
79
80            if self.lex_text(state) {
81                continue;
82            }
83
84            // 如果没有匹配任何模式,跳过一个字符并生成 Error token
85            let start_pos = state.get_position();
86            if let Some(ch) = state.peek() {
87                state.advance(ch.len_utf8());
88                state.add_token(BashTokenType::Error, start_pos, state.get_position())
89            }
90
91            state.advance_if_dead_lock(safe_point)
92        }
93        Ok(())
94    }
95
96    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
97        let start_pos = state.get_position();
98
99        while let Some(ch) = state.peek() {
100            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
101        }
102
103        if state.get_position() > start_pos {
104            state.add_token(BashTokenType::Whitespace, start_pos, state.get_position());
105            true
106        }
107        else {
108            false
109        }
110    }
111
112    fn skip_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
113        let start_pos = state.get_position();
114
115        if let Some('#') = state.peek() {
116            state.advance(1);
117            while let Some(ch) = state.peek() {
118                if ch == '\n' || ch == '\r' {
119                    break;
120                }
121                state.advance(ch.len_utf8())
122            }
123            state.add_token(BashTokenType::Comment, start_pos, state.get_position());
124            true
125        }
126        else {
127            false
128        }
129    }
130
131    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
132        let start_pos = state.get_position();
133
134        if let Some('\n') = state.peek() {
135            state.advance(1);
136            state.add_token(BashTokenType::Newline, start_pos, state.get_position());
137            true
138        }
139        else if let Some('\r') = state.peek() {
140            state.advance(1);
141            if let Some('\n') = state.peek() {
142                state.advance(1)
143            }
144            state.add_token(BashTokenType::Newline, start_pos, state.get_position());
145            true
146        }
147        else {
148            false
149        }
150    }
151
152    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
153        let start_pos = state.get_position();
154
155        if let Some(quote) = state.peek() {
156            if quote == '"' || quote == '\'' {
157                state.advance(1);
158                let mut escaped = false;
159
160                while let Some(ch) = state.peek() {
161                    if escaped {
162                        escaped = false;
163                        state.advance(ch.len_utf8());
164                        continue;
165                    }
166
167                    if ch == '\\' {
168                        escaped = true;
169                        state.advance(1);
170                        continue;
171                    }
172
173                    if ch == quote {
174                        state.advance(1);
175                        break;
176                    }
177
178                    state.advance(ch.len_utf8())
179                }
180
181                state.add_token(BashTokenType::StringLiteral, start_pos, state.get_position());
182                return true;
183            }
184        }
185
186        false
187    }
188
189    fn lex_variable<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
190        let start_pos = state.get_position();
191
192        if let Some('$') = state.peek() {
193            state.advance(1);
194
195            // 处理特殊变量 $0, $1, $?, $$ 等
196            if let Some(ch) = state.peek() {
197                if ch.is_ascii_digit() || ch == '?' || ch == '$' || ch == '#' || ch == '↯' || ch == '*' {
198                    state.advance(1);
199                    state.add_token(BashTokenType::Variable, start_pos, state.get_position());
200                    return true;
201                }
202            }
203
204            // 处理 ${var} 形式
205            if let Some('{') = state.peek() {
206                state.advance(1);
207                while let Some(ch) = state.peek() {
208                    if ch == '}' {
209                        state.advance(1);
210                        break;
211                    }
212                    state.advance(ch.len_utf8())
213                }
214                state.add_token(BashTokenType::Variable, start_pos, state.get_position());
215                return true;
216            }
217
218            // 处理普通变量名
219            if let Some(ch) = state.peek() {
220                if ch.is_alphabetic() || ch == '_' {
221                    state.advance(ch.len_utf8());
222                    while let Some(ch) = state.peek() {
223                        if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
224                    }
225                    state.add_token(BashTokenType::Variable, start_pos, state.get_position());
226                    return true;
227                }
228            }
229
230            // 如果只有 $ 没有有效变量名,回退
231            state.set_position(start_pos);
232        }
233
234        false
235    }
236
237    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
238        let start_pos = state.get_position();
239
240        if let Some(ch) = state.peek() {
241            if ch.is_ascii_digit() {
242                state.advance(1);
243                while let Some(ch) = state.peek() {
244                    if ch.is_ascii_digit() { state.advance(1) } else { break }
245                }
246                state.add_token(BashTokenType::NumberLiteral, start_pos, state.get_position());
247                return true;
248            }
249        }
250
251        false
252    }
253
254    fn lex_keyword_or_identifier<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
255        let start_pos = state.get_position();
256
257        if let Some(ch) = state.peek() {
258            if ch.is_ascii_alphabetic() || ch == '_' {
259                state.advance(ch.len_utf8());
260                while let Some(ch) = state.peek() {
261                    if ch.is_ascii_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
262                }
263
264                let text = state.get_text_in((start_pos..state.get_position()).into());
265                let kind = if BASH_KEYWORDS.contains(&text.as_ref()) { BashTokenType::Keyword } else { BashTokenType::Identifier };
266
267                state.add_token(kind, start_pos, state.get_position());
268                return true;
269            }
270        }
271
272        false
273    }
274
275    fn lex_operator_or_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
276        let start_pos = state.get_position();
277
278        if let Some(ch) = state.peek() {
279            let two_char = if let Some(next_ch) = state.peek_next_n(1) { format!("{}{}", ch, next_ch) } else { String::new() };
280
281            // 检查双字符操作符
282            if BASH_TWO_CHAR_OPERATORS.contains(&two_char.as_str()) {
283                state.advance(2);
284                state.add_token(BashTokenType::Operator, start_pos, state.get_position());
285                return true;
286            }
287
288            // 检查单字符操作符和分隔符
289            let ch_str = ch.to_string();
290            if BASH_OPERATORS.contains(&ch_str.as_str()) {
291                state.advance(1);
292                state.add_token(BashTokenType::Operator, start_pos, state.get_position());
293                return true;
294            }
295
296            if BASH_DELIMITERS.contains(&ch_str.as_str()) {
297                state.advance(1);
298                state.add_token(BashTokenType::Delimiter, start_pos, state.get_position());
299                return true;
300            }
301        }
302
303        false
304    }
305
306    fn lex_heredoc<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
307        let start_pos = state.get_position();
308
309        // 检查 << 开始的 heredoc
310        if let Some('<') = state.peek() {
311            if let Some('<') = state.peek_next_n(1) {
312                state.advance(2);
313
314                // 跳过可选的 -
315                if let Some('-') = state.peek() {
316                    state.advance(1)
317                }
318
319                // 读取标识符
320                while let Some(ch) = state.peek() {
321                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
322                }
323
324                state.add_token(BashTokenType::Heredoc, start_pos, state.get_position());
325                return true;
326            }
327        }
328
329        false
330    }
331
332    fn lex_glob_pattern<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
333        let start_pos = state.get_position();
334
335        if let Some(ch) = state.peek() {
336            if ch == '*' || ch == '?' || ch == '[' {
337                state.advance(1);
338
339                if ch == '[' {
340                    // 处理字符类 [abc] 或 [!abc]
341                    if let Some('!') = state.peek() {
342                        state.advance(1)
343                    }
344                    while let Some(ch) = state.peek() {
345                        if ch == ']' {
346                            state.advance(1);
347                            break;
348                        }
349                        state.advance(ch.len_utf8())
350                    }
351                }
352
353                state.add_token(BashTokenType::GlobPattern, start_pos, state.get_position());
354                return true;
355            }
356        }
357
358        false
359    }
360
361    fn lex_special_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
362        let start_pos = state.get_position();
363
364        if let Some(ch) = state.peek() {
365            if BASH_SPECIAL_CHARS.contains(&ch) {
366                state.advance(1);
367                state.add_token(BashTokenType::SpecialChar, start_pos, state.get_position());
368                return true;
369            }
370        }
371
372        false
373    }
374
375    fn lex_text<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
376        let start_pos = state.get_position();
377
378        if let Some(ch) = state.peek() {
379            if !ch.is_whitespace() && !BASH_SPECIAL_CHARS.contains(&ch) {
380                state.advance(ch.len_utf8());
381                state.add_token(BashTokenType::Text, start_pos, state.get_position());
382                return true;
383            }
384        }
385
386        false
387    }
388}
389
/// Reserved words recognized by the identifier lexer.
///
/// A `LazyLock` is unnecessary here: the slice is already `'static` data, so
/// a plain `static` is simpler and has zero runtime cost. NOTE(review):
/// entries such as "[", "[[", "]]" and "." cannot be produced by the
/// identifier scanner (it only accepts `[A-Za-z0-9_]`), so they are inert.
static BASH_KEYWORDS: &[&str] = &[
    "if", "then", "else", "elif", "fi", "case", "esac", "for", "while", "until", "do", "done", "function", "return", "break", "continue", "local", "export", "readonly", "declare", "typeset", "unset", "shift", "exit", "source", ".", "eval", "exec",
    "trap", "wait", "jobs", "bg", "fg", "disown", "suspend", "alias", "unalias", "history", "fc", "let", "test", "[", "[[", "]]", "time", "coproc", "select", "in",
];
396
/// Single-character operators. Plain `static` instead of `LazyLock` — the
/// slice is `'static` data and needs no lazy initialization.
static BASH_OPERATORS: &[&str] = &["+", "-", "*", "/", "%", "=", "!", "<", ">", "&", "|", "^", "~"];
398
399static BASH_TWO_CHAR_OPERATORS: LazyLock<&[&str]> = LazyLock::new(|| &["==", "!=", "<=", ">=", "&&", "||", "<<", ">>", "++", "--", "+=", "-=", "*=", "/=", "%=", "&=", "|=", "^=", "<<=", ">>=", "**"]);
400
/// Single-character delimiters (grouping and separators). Plain `static`
/// instead of `LazyLock` — the slice is `'static` data.
static BASH_DELIMITERS: &[&str] = &["(", ")", "{", "}", "[", "]", ";", ",", ":", "."];
402
/// Characters treated as "special" by the fallback lexer rules.
/// BUGFIX: the mojibake character '↯' is replaced with the intended '@'
/// (matching the `$@` special parameter), and the duplicate '\\' and '`'
/// entries are removed — membership behavior is otherwise unchanged. Plain
/// `static` instead of `LazyLock` — the slice is `'static` data.
static BASH_SPECIAL_CHARS: &[char] = &['\\', '`', '~', '@', '#', '$', '%', '^', '&', '*', '(', ')', '-', '+', '=', '{', '}', '[', ']', '|', ':', ';', '"', '\'', '<', '>', ',', '.', '?', '/', '!'];