oak_lean/lexer/
mod.rs

1//! Lean 语言词法分析器
2//!
3//! 提供 Lean 语言的词法分析功能,将源代码文本转换为标记流
4
5use crate::{kind::LeanSyntaxKind, language::LeanLanguage};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7
8type State<'a, S> = LexerState<'a, S, LeanLanguage>;
9
10/// Lean 词法分析器
11#[derive(Debug, Clone, Copy)]
12pub struct LeanLexer;
13
14impl Lexer<LeanLanguage> for LeanLexer {
15    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<LeanLanguage>) -> LexOutput<LeanLanguage> {
16        let mut state = State::new(text);
17        let result = self.run(&mut state);
18        if result.is_ok() {
19            state.add_eof();
20        }
21        state.finish_with_cache(result, cache)
22    }
23}
24
25impl LeanLexer {
26    /// 创建新的 Lean 词法分析器
27    pub fn new(_config: &LeanLanguage) -> Self {
28        Self
29    }
30
31    /// 跳过空白字符
32    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
33        let start_pos = state.get_position();
34
35        while let Some(ch) = state.peek() {
36            if ch == ' ' || ch == '\t' {
37                state.advance(ch.len_utf8());
38            }
39            else {
40                break;
41            }
42        }
43
44        if state.get_position() > start_pos {
45            state.add_token(LeanSyntaxKind::Whitespace, start_pos, state.get_position());
46            true
47        }
48        else {
49            false
50        }
51    }
52
53    /// 处理换行
54    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
55        let start_pos = state.get_position();
56
57        if let Some('\n') = state.peek() {
58            state.advance(1);
59            state.add_token(LeanSyntaxKind::Newline, start_pos, state.get_position());
60            true
61        }
62        else if let Some('\r') = state.peek() {
63            state.advance(1);
64            if let Some('\n') = state.peek() {
65                state.advance(1);
66            }
67            state.add_token(LeanSyntaxKind::Newline, start_pos, state.get_position());
68            true
69        }
70        else {
71            false
72        }
73    }
74
75    /// 处理注释
76    fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
77        let start_pos = state.get_position();
78
79        if let Some('-') = state.peek() {
80            state.advance(1);
81            if let Some('-') = state.peek() {
82                // 单行注释
83                state.advance(1);
84                while let Some(ch) = state.peek() {
85                    if ch == '\n' || ch == '\r' {
86                        break;
87                    }
88                    state.advance(ch.len_utf8());
89                }
90                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
91                true
92            }
93            else {
94                // 回退,这是减号操作符
95                state.set_position(start_pos);
96                false
97            }
98        }
99        else if let Some('/') = state.peek() {
100            state.advance(1);
101            if let Some('-') = state.peek() {
102                // 块注释开始
103                state.advance(1);
104                let mut depth = 1;
105                while depth > 0 && state.not_at_end() {
106                    if let Some('/') = state.peek() {
107                        state.advance(1);
108                        if let Some('-') = state.peek() {
109                            state.advance(1);
110                            depth += 1;
111                        }
112                    }
113                    else if let Some('-') = state.peek() {
114                        state.advance(1);
115                        if let Some('/') = state.peek() {
116                            state.advance(1);
117                            depth -= 1;
118                        }
119                    }
120                    else if let Some(ch) = state.peek() {
121                        state.advance(ch.len_utf8());
122                    }
123                    else {
124                        break;
125                    }
126                }
127                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
128                true
129            }
130            else {
131                // 回退,这是除法操作符
132                state.set_position(start_pos);
133                false
134            }
135        }
136        else {
137            false
138        }
139    }
140
141    /// 处理字符串字面量
142    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
143        let start_pos = state.get_position();
144
145        if let Some('"') = state.peek() {
146            state.advance(1);
147
148            while let Some(ch) = state.peek() {
149                if ch == '"' {
150                    state.advance(1);
151                    break;
152                }
153                else if ch == '\\' {
154                    state.advance(1);
155                    if let Some(escaped) = state.peek() {
156                        state.advance(escaped.len_utf8());
157                    }
158                }
159                else if ch == '\n' || ch == '\r' {
160                    break; // 字符串不能跨行
161                }
162                else {
163                    state.advance(ch.len_utf8());
164                }
165            }
166            state.add_token(LeanSyntaxKind::StringLiteral, start_pos, state.get_position());
167            true
168        }
169        else {
170            false
171        }
172    }
173
174    /// 处理字符字面量
175    fn lex_char<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
176        let start_pos = state.get_position();
177
178        if let Some('\'') = state.peek() {
179            state.advance(1);
180
181            while let Some(ch) = state.peek() {
182                if ch == '\'' {
183                    state.advance(1);
184                    break;
185                }
186                else if ch == '\\' {
187                    state.advance(1);
188                    if let Some(escaped) = state.peek() {
189                        state.advance(escaped.len_utf8());
190                    }
191                }
192                else if ch == '\n' || ch == '\r' {
193                    break;
194                }
195                else {
196                    state.advance(ch.len_utf8());
197                }
198            }
199            state.add_token(LeanSyntaxKind::CharLiteral, start_pos, state.get_position());
200            true
201        }
202        else {
203            false
204        }
205    }
206
207    /// 处理数字字面量
208    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
209        let start_pos = state.get_position();
210
211        if let Some(ch) = state.peek() {
212            if ch.is_ascii_digit() {
213                // 扫描数字
214                while let Some(ch) = state.peek() {
215                    if ch.is_ascii_digit() {
216                        state.advance(ch.len_utf8());
217                    }
218                    else {
219                        break;
220                    }
221                }
222
223                // 检查小数点
224                if let Some('.') = state.peek() {
225                    state.advance(1); // 跳过小数点
226                    if let Some(next_char) = state.peek() {
227                        if next_char.is_ascii_digit() {
228                            while let Some(ch) = state.peek() {
229                                if ch.is_ascii_digit() {
230                                    state.advance(ch.len_utf8());
231                                }
232                                else {
233                                    break;
234                                }
235                            }
236                        }
237                    }
238                }
239
240                // 检查指数部分
241                if let Some(ch) = state.peek() {
242                    if ch == 'e' || ch == 'E' {
243                        state.advance(1);
244                        if let Some(sign) = state.peek() {
245                            if sign == '+' || sign == '-' {
246                                state.advance(1);
247                            }
248                        }
249                        while let Some(ch) = state.peek() {
250                            if ch.is_ascii_digit() {
251                                state.advance(ch.len_utf8());
252                            }
253                            else {
254                                break;
255                            }
256                        }
257                    }
258                }
259
260                state.add_token(LeanSyntaxKind::IntegerLiteral, start_pos, state.get_position());
261                true
262            }
263            else {
264                false
265            }
266        }
267        else {
268            false
269        }
270    }
271
272    /// 处理标识符和关键字
273    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
274        let start_pos = state.get_position();
275
276        if let Some(ch) = state.peek() {
277            if ch.is_alphabetic() || ch == '_' {
278                state.advance(ch.len_utf8());
279
280                while let Some(ch) = state.peek() {
281                    if ch.is_alphanumeric() || ch == '_' || ch == '\'' {
282                        state.advance(ch.len_utf8());
283                    }
284                    else {
285                        break;
286                    }
287                }
288
289                let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
290                let kind = match text.as_ref() {
291                    "axiom" => LeanSyntaxKind::Axiom,
292                    "constant" => LeanSyntaxKind::Constant,
293                    "def" => LeanSyntaxKind::Def,
294                    "example" => LeanSyntaxKind::Example,
295                    "inductive" => LeanSyntaxKind::Inductive,
296                    "lemma" => LeanSyntaxKind::Lemma,
297                    "namespace" => LeanSyntaxKind::Namespace,
298                    "open" => LeanSyntaxKind::Open,
299                    "private" => LeanSyntaxKind::Private,
300                    "protected" => LeanSyntaxKind::Protected,
301                    "section" => LeanSyntaxKind::Section,
302                    "structure" => LeanSyntaxKind::Structure,
303                    "theorem" => LeanSyntaxKind::Theorem,
304                    "universe" => LeanSyntaxKind::Universe,
305                    "variable" => LeanSyntaxKind::Variable,
306                    "variables" => LeanSyntaxKind::Variables,
307                    "end" => LeanSyntaxKind::End,
308                    "import" => LeanSyntaxKind::Import,
309                    "export" => LeanSyntaxKind::Export,
310                    "prelude" => LeanSyntaxKind::Prelude,
311                    "noncomputable" => LeanSyntaxKind::Noncomputable,
312                    "partial" => LeanSyntaxKind::Partial,
313                    "unsafe" => LeanSyntaxKind::Unsafe,
314                    "mutual" => LeanSyntaxKind::Mutual,
315                    "where" => LeanSyntaxKind::Where,
316                    "have" => LeanSyntaxKind::Have,
317                    "show" => LeanSyntaxKind::Show,
318                    "suffices" => LeanSyntaxKind::Suffices,
319                    "let" => LeanSyntaxKind::Let,
320                    "in" => LeanSyntaxKind::In,
321                    "if" => LeanSyntaxKind::If,
322                    "then" => LeanSyntaxKind::Then,
323                    "else" => LeanSyntaxKind::Else,
324                    "match" => LeanSyntaxKind::Match,
325                    "with" => LeanSyntaxKind::With,
326                    "fun" => LeanSyntaxKind::Fun,
327                    "do" => LeanSyntaxKind::Do,
328                    "for" => LeanSyntaxKind::For,
329                    "while" => LeanSyntaxKind::While,
330                    "break" => LeanSyntaxKind::Break,
331                    "continue" => LeanSyntaxKind::Continue,
332                    "return" => LeanSyntaxKind::Return,
333                    "try" => LeanSyntaxKind::Try,
334                    "catch" => LeanSyntaxKind::Catch,
335                    "finally" => LeanSyntaxKind::Finally,
336                    "throw" => LeanSyntaxKind::Throw,
337                    _ => LeanSyntaxKind::Identifier,
338                };
339
340                state.add_token(kind, start_pos, state.get_position());
341                true
342            }
343            else {
344                false
345            }
346        }
347        else {
348            false
349        }
350    }
351
352    /// 处理操作符和分隔符
353    fn lex_operator_or_delimiter<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
354        let start_pos = state.get_position();
355
356        if let Some(ch) = state.peek() {
357            let kind = match ch {
358                '(' => LeanSyntaxKind::LeftParen,
359                ')' => LeanSyntaxKind::RightParen,
360                '{' => LeanSyntaxKind::LeftBrace,
361                '}' => LeanSyntaxKind::RightBrace,
362                '[' => LeanSyntaxKind::LeftBracket,
363                ']' => LeanSyntaxKind::RightBracket,
364                ',' => LeanSyntaxKind::Comma,
365                ';' => LeanSyntaxKind::Semicolon,
366                '+' => LeanSyntaxKind::Plus,
367                '*' => LeanSyntaxKind::Star,
368                '/' => LeanSyntaxKind::Slash,
369                '%' => LeanSyntaxKind::Percent,
370                '^' => LeanSyntaxKind::Caret,
371                '#' => LeanSyntaxKind::Hash,
372                '&' => LeanSyntaxKind::Ampersand,
373                '|' => LeanSyntaxKind::Pipe,
374                '~' => LeanSyntaxKind::Tilde,
375                '!' => LeanSyntaxKind::Bang,
376                '?' => LeanSyntaxKind::Question,
377                '@' => LeanSyntaxKind::At,
378                '$' => LeanSyntaxKind::Dollar,
379                '<' => LeanSyntaxKind::Lt,
380                '>' => LeanSyntaxKind::Gt,
381                '=' => LeanSyntaxKind::Eq,
382                _ => {
383                    // 检查多字符操作符
384                    match ch {
385                        ':' => {
386                            state.advance(1);
387                            if let Some('=') = state.peek() {
388                                state.advance(1);
389                                state.add_token(LeanSyntaxKind::ColonEq, start_pos, state.get_position());
390                                return true;
391                            }
392                            else if let Some(':') = state.peek() {
393                                state.advance(1);
394                                state.add_token(LeanSyntaxKind::ColonColon, start_pos, state.get_position());
395                                return true;
396                            }
397                            else {
398                                state.add_token(LeanSyntaxKind::Colon, start_pos, state.get_position());
399                                return true;
400                            }
401                        }
402                        '.' => {
403                            state.advance(1);
404                            if let Some('.') = state.peek() {
405                                state.advance(1);
406                                state.add_token(LeanSyntaxKind::DotDot, start_pos, state.get_position());
407                                return true;
408                            }
409                            else {
410                                state.add_token(LeanSyntaxKind::Dot, start_pos, state.get_position());
411                                return true;
412                            }
413                        }
414                        '-' => {
415                            state.advance(1);
416                            if let Some('>') = state.peek() {
417                                state.advance(1);
418                                state.add_token(LeanSyntaxKind::Arrow, start_pos, state.get_position());
419                                return true;
420                            }
421                            else {
422                                state.add_token(LeanSyntaxKind::Minus, start_pos, state.get_position());
423                                return true;
424                            }
425                        }
426                        _ => return false,
427                    }
428                }
429            };
430
431            state.advance(ch.len_utf8());
432            state.add_token(kind, start_pos, state.get_position());
433            true
434        }
435        else {
436            false
437        }
438    }
439
440    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
441        while state.not_at_end() {
442            let safe_point = state.get_position();
443            if self.skip_whitespace(state) {
444                continue;
445            }
446
447            if self.lex_newline(state) {
448                continue;
449            }
450
451            if self.lex_comment(state) {
452                continue;
453            }
454
455            if self.lex_string(state) {
456                continue;
457            }
458
459            if self.lex_char(state) {
460                continue;
461            }
462
463            if self.lex_number(state) {
464                continue;
465            }
466
467            if self.lex_identifier_or_keyword(state) {
468                continue;
469            }
470
471            if self.lex_operator_or_delimiter(state) {
472                continue;
473            }
474
475            // 如果所有规则都不匹配,跳过当前字符并标记为错误
476            let start_pos = state.get_position();
477            if let Some(ch) = state.peek() {
478                state.advance(ch.len_utf8());
479                state.add_token(LeanSyntaxKind::Error, start_pos, state.get_position());
480            }
481
482            state.advance_if_dead_lock(safe_point);
483        }
484
485        Ok(())
486    }
487}