// oak_tcl/lexer/mod.rs

1use crate::{kind::TclSyntaxKind, language::TclLanguage};
2use oak_core::{
3    Lexer, LexerState, OakError,
4    lexer::{CommentConfig, LexOutput, LexerCache, StringConfig, WhitespaceConfig},
5    source::Source,
6};
7
/// Lexer state specialised to [`TclLanguage`]; `'s` is the lifetime of the
/// borrowed source and `S` is the source type.
type State<'s, S> = LexerState<'s, S, TclLanguage>;
9
/// Whitespace scanning rules used by `skip_whitespace`; Unicode whitespace is enabled.
static TCL_WHITESPACE: WhitespaceConfig = WhitespaceConfig { unicode_whitespace: true };
/// Comment scanning rules used by `skip_comment`: `#` is the line marker and both
/// block markers are empty strings.
// NOTE(review): presumably empty block markers mean "no block comments" — confirm
// how `CommentConfig::scan` treats an empty `block_start`.
static TCL_COMMENT: CommentConfig = CommentConfig { line_marker: "#", block_start: "", block_end: "", nested_blocks: false };
/// Quoted-string scanning rules used by `lex_string_literal`: `"` is the only quote
/// character and `\` is the escape character.
static TCL_STRING: StringConfig = StringConfig { quotes: &['"'], escape: Some('\\') };
13
/// Lexer that turns Tcl source text into a token stream.
///
/// It borrows a [`TclLanguage`] configuration for the lifetime `'config`. None of
/// the lexing rules visible in this file read that configuration, hence the
/// underscore-prefixed field name.
#[derive(Clone)]
pub struct TclLexer<'config> {
    /// Language configuration the lexer was constructed with.
    _config: &'config TclLanguage,
}
18
19impl<'config> TclLexer<'config> {
20    pub fn new(config: &'config TclLanguage) -> Self {
21        Self { _config: config }
22    }
23
24    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
25        while state.not_at_end() {
26            let safe_point = state.get_position();
27
28            if self.skip_whitespace(state) {
29                continue;
30            }
31
32            if self.lex_newline(state) {
33                continue;
34            }
35
36            if self.skip_comment(state) {
37                continue;
38            }
39
40            if self.lex_string_literal(state) {
41                continue;
42            }
43
44            if self.lex_brace_string(state) {
45                continue;
46            }
47
48            if self.lex_numeric_literal(state) {
49                continue;
50            }
51
52            if self.lex_identifier_or_keyword(state) {
53                continue;
54            }
55
56            if self.lex_operators(state) {
57                continue;
58            }
59
60            if self.lex_single_char_tokens(state) {
61                continue;
62            }
63
64            // 如果所有规则都不匹配,跳过当前字符并标记为错误
65            if let Some(ch) = state.current() {
66                state.advance(ch.len_utf8());
67            }
68
69            state.advance_if_dead_lock(safe_point);
70        }
71
72        state.add_eof();
73        Ok(())
74    }
75}
76
77impl<'config> Lexer<TclLanguage> for TclLexer<'config> {
78    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<TclLanguage>) -> LexOutput<TclLanguage> {
79        let mut state = State::new(source);
80        let result = self.run(&mut state);
81        state.finish_with_cache(result, cache)
82    }
83}
84
85impl<'config> TclLexer<'config> {
    /// Delegates to `TCL_WHITESPACE.scan`, tagging any match as
    /// `TclSyntaxKind::Whitespace`.
    ///
    /// Returns the scan's result; `run` treats `true` as "matched, restart the
    /// rule loop".
    // NOTE(review): this rule runs before `lex_newline`. With
    // `unicode_whitespace` enabled, confirm whether the scan also consumes
    // `\n` / `\r`, which would pre-empt `Newline` tokens.
    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        TCL_WHITESPACE.scan(state, TclSyntaxKind::Whitespace)
    }
89
90    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
91        if let Some(ch) = state.current() {
92            if ch == '\n' {
93                let start = state.get_position();
94                state.advance(1);
95                state.add_token(TclSyntaxKind::Newline, start, state.get_position());
96                return true;
97            }
98            else if ch == '\r' {
99                let start = state.get_position();
100                state.advance(1);
101                if state.current() == Some('\n') {
102                    state.advance(1);
103                }
104                state.add_token(TclSyntaxKind::Newline, start, state.get_position());
105                return true;
106            }
107        }
108        false
109    }
110
    /// Delegates to `TCL_COMMENT.scan`, passing `TclSyntaxKind::Comment` for
    /// both kind arguments.
    ///
    /// Returns the scan's result; `run` treats `true` as "matched, restart the
    /// rule loop".
    // NOTE(review): `run` tries this rule at every position, so presumably any
    // `#` starts a comment regardless of where it appears — confirm this is
    // the intended behaviour for Tcl.
    fn skip_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        TCL_COMMENT.scan(state, TclSyntaxKind::Comment, TclSyntaxKind::Comment)
    }
114
    /// Delegates to `TCL_STRING.scan` (double-quoted strings with `\` escapes),
    /// tagging any match as `TclSyntaxKind::StringLiteral`.
    ///
    /// Returns the scan's result; `run` treats `true` as "matched, restart the
    /// rule loop".
    fn lex_string_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
        TCL_STRING.scan(state, TclSyntaxKind::StringLiteral)
    }
118
119    fn lex_brace_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
120        let start = state.get_position();
121
122        if state.current() != Some('{') {
123            return false;
124        }
125
126        state.advance(1);
127        let mut brace_count = 1;
128
129        while let Some(ch) = state.peek() {
130            if ch == '{' {
131                brace_count += 1;
132            }
133            else if ch == '}' {
134                brace_count -= 1;
135                if brace_count == 0 {
136                    state.advance(1);
137                    break;
138                }
139            }
140            state.advance(ch.len_utf8());
141        }
142
143        state.add_token(TclSyntaxKind::StringLiteral, start, state.get_position());
144        true
145    }
146
147    fn lex_numeric_literal<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
148        let start = state.get_position();
149        let first = match state.current() {
150            Some(c) => c,
151            None => return false,
152        };
153
154        if !first.is_ascii_digit() && !(first == '-' && state.peek().map_or(false, |c| c.is_ascii_digit())) {
155            return false;
156        }
157
158        if first == '-' {
159            state.advance(1);
160        }
161
162        // 整数部分
163        while let Some(c) = state.current() {
164            if c.is_ascii_digit() {
165                state.advance(1);
166            }
167            else {
168                break;
169            }
170        }
171
172        // 小数部分
173        if state.current() == Some('.') && state.peek().map_or(false, |c| c.is_ascii_digit()) {
174            state.advance(1); // consume '.'
175            while let Some(c) = state.current() {
176                if c.is_ascii_digit() {
177                    state.advance(1);
178                }
179                else {
180                    break;
181                }
182            }
183        }
184
185        // 科学计数法
186        if let Some(c) = state.current() {
187            if c == 'e' || c == 'E' {
188                let next = state.peek();
189                if next == Some('+') || next == Some('-') || next.map_or(false, |d| d.is_ascii_digit()) {
190                    state.advance(1);
191                    if let Some(sign) = state.current() {
192                        if sign == '+' || sign == '-' {
193                            state.advance(1);
194                        }
195                    }
196                    while let Some(d) = state.current() {
197                        if d.is_ascii_digit() {
198                            state.advance(1);
199                        }
200                        else {
201                            break;
202                        }
203                    }
204                }
205            }
206        }
207
208        state.add_token(TclSyntaxKind::Number, start, state.get_position());
209        true
210    }
211
212    fn lex_identifier_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
213        let start = state.get_position();
214        let ch = match state.current() {
215            Some(c) => c,
216            None => return false,
217        };
218
219        if !(ch.is_ascii_alphabetic() || ch == '_') {
220            return false;
221        }
222
223        state.advance(ch.len_utf8());
224        while let Some(c) = state.current() {
225            if c.is_ascii_alphanumeric() || c == '_' {
226                state.advance(c.len_utf8());
227            }
228            else {
229                break;
230            }
231        }
232
233        let end = state.get_position();
234        let text = state.source().get_text_in(oak_core::Range { start, end });
235        let kind = match text.as_ref() {
236            "if" => TclSyntaxKind::If,
237            "else" => TclSyntaxKind::Else,
238            "elseif" => TclSyntaxKind::ElseIf,
239            "for" => TclSyntaxKind::For,
240            "while" => TclSyntaxKind::While,
241            "foreach" => TclSyntaxKind::ForEach,
242            "proc" => TclSyntaxKind::Proc,
243            "return" => TclSyntaxKind::Return,
244            "break" => TclSyntaxKind::Break,
245            "continue" => TclSyntaxKind::Continue,
246            "set" => TclSyntaxKind::Set,
247            "unset" => TclSyntaxKind::Unset,
248            "global" => TclSyntaxKind::Global,
249            "upvar" => TclSyntaxKind::Upvar,
250            "variable" => TclSyntaxKind::Variable,
251            _ => TclSyntaxKind::Identifier,
252        };
253
254        state.add_token(kind, start, state.get_position());
255        true
256    }
257
258    fn lex_operators<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
259        let start = state.get_position();
260
261        // 多字符操作符
262        let patterns: &[(&str, TclSyntaxKind)] =
263            &[("==", TclSyntaxKind::Equal), ("!=", TclSyntaxKind::NotEqual), ("<=", TclSyntaxKind::LessEqual), (">=", TclSyntaxKind::GreaterEqual), ("&&", TclSyntaxKind::AmpersandAmpersand), ("||", TclSyntaxKind::PipePipe)];
264
265        for (pat, kind) in patterns {
266            let mut matches = true;
267            for (i, c) in pat.chars().enumerate() {
268                if state.peek_next_n(i) != Some(c) {
269                    matches = false;
270                    break;
271                }
272            }
273
274            if matches {
275                state.advance(pat.len());
276                state.add_token(*kind, start, state.get_position());
277                return true;
278            }
279        }
280
281        // 单字符操作符
282        if let Some(ch) = state.current() {
283            let kind = match ch {
284                '+' => Some(TclSyntaxKind::Plus),
285                '-' => Some(TclSyntaxKind::Minus),
286                '*' => Some(TclSyntaxKind::Star),
287                '/' => Some(TclSyntaxKind::Slash),
288                '%' => Some(TclSyntaxKind::Percent),
289                '<' => Some(TclSyntaxKind::Less),
290                '>' => Some(TclSyntaxKind::Greater),
291                '!' => Some(TclSyntaxKind::Exclamation),
292                '&' => Some(TclSyntaxKind::Ampersand),
293                '|' => Some(TclSyntaxKind::Pipe),
294                '=' => Some(TclSyntaxKind::Equal),
295                _ => None,
296            };
297
298            if let Some(k) = kind {
299                state.advance(ch.len_utf8());
300                state.add_token(k, start, state.get_position());
301                return true;
302            }
303        }
304        false
305    }
306
307    fn lex_single_char_tokens<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
308        let start = state.get_position();
309
310        if let Some(ch) = state.current() {
311            let kind = match ch {
312                '(' => TclSyntaxKind::LeftParen,
313                ')' => TclSyntaxKind::RightParen,
314                '[' => TclSyntaxKind::LeftBracket,
315                ']' => TclSyntaxKind::RightBracket,
316                '{' => TclSyntaxKind::LeftBrace,
317                '}' => TclSyntaxKind::RightBrace,
318                ';' => TclSyntaxKind::Semicolon,
319                ',' => TclSyntaxKind::Comma,
320                '$' => TclSyntaxKind::Dollar,
321                _ => return false,
322            };
323
324            state.advance(ch.len_utf8());
325            state.add_token(kind, start, state.get_position());
326            true
327        }
328        else {
329            false
330        }
331    }
332}