Skip to main content

oak_lean/lexer/
mod.rs

//! Lexer for the Lean language
//!
//! Performs lexical analysis of Lean source text, converting it into a token stream.

5use crate::{kind::LeanSyntaxKind, language::LeanLanguage};
6use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
7
/// Shorthand for the shared lexer state, specialised to the Lean language.
type State<'a, S> = LexerState<'a, S, LeanLanguage>;
9
/// Lexer for the Lean language.
#[derive(Debug, Clone)]
pub struct LeanLexer<'config> {
    // Language configuration. Currently unused by the lexing rules themselves,
    // hence the underscore; kept so the constructor signature stays stable.
    _config: &'config LeanLanguage,
}
15
16impl<'config> Lexer<LeanLanguage> for LeanLexer<'config> {
17    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<LeanLanguage>) -> LexOutput<LeanLanguage> {
18        let mut state = State::new(text);
19        let result = self.run(&mut state);
20        if result.is_ok() {
21            state.add_eof();
22        }
23        state.finish_with_cache(result, cache)
24    }
25}
26
27impl<'config> LeanLexer<'config> {
28    /// 创建新的 Lean 词法分析器
29    pub fn new(config: &'config LeanLanguage) -> Self {
30        Self { _config: config }
31    }
32
33    /// 跳过空白字符
34    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
35        let start_pos = state.get_position();
36
37        while let Some(ch) = state.peek() {
38            if ch == ' ' || ch == '\t' {
39                state.advance(ch.len_utf8());
40            }
41            else {
42                break;
43            }
44        }
45
46        if state.get_position() > start_pos {
47            state.add_token(LeanSyntaxKind::Whitespace, start_pos, state.get_position());
48            true
49        }
50        else {
51            false
52        }
53    }
54
55    /// 处理换行
56    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
57        let start_pos = state.get_position();
58
59        if let Some('\n') = state.peek() {
60            state.advance(1);
61            state.add_token(LeanSyntaxKind::Newline, start_pos, state.get_position());
62            true
63        }
64        else if let Some('\r') = state.peek() {
65            state.advance(1);
66            if let Some('\n') = state.peek() {
67                state.advance(1);
68            }
69            state.add_token(LeanSyntaxKind::Newline, start_pos, state.get_position());
70            true
71        }
72        else {
73            false
74        }
75    }
76
77    /// 处理注释
78    fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
79        let start_pos = state.get_position();
80
81        if let Some('-') = state.peek() {
82            state.advance(1);
83            if let Some('-') = state.peek() {
84                // 单行注释
85                state.advance(1);
86                while let Some(ch) = state.peek() {
87                    if ch == '\n' || ch == '\r' {
88                        break;
89                    }
90                    state.advance(ch.len_utf8());
91                }
92                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
93                true
94            }
95            else {
96                // 回退,这是减号操作符
97                state.set_position(start_pos);
98                false
99            }
100        }
101        else if let Some('/') = state.peek() {
102            state.advance(1);
103            if let Some('-') = state.peek() {
104                // 块注释开始
105                state.advance(1);
106                let mut depth = 1;
107                while depth > 0 && state.not_at_end() {
108                    if let Some('/') = state.peek() {
109                        state.advance(1);
110                        if let Some('-') = state.peek() {
111                            state.advance(1);
112                            depth += 1;
113                        }
114                    }
115                    else if let Some('-') = state.peek() {
116                        state.advance(1);
117                        if let Some('/') = state.peek() {
118                            state.advance(1);
119                            depth -= 1;
120                        }
121                    }
122                    else if let Some(ch) = state.peek() {
123                        state.advance(ch.len_utf8());
124                    }
125                    else {
126                        break;
127                    }
128                }
129                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
130                true
131            }
132            else {
133                // 回退,这是除法操作符
134                state.set_position(start_pos);
135                false
136            }
137        }
138        else {
139            false
140        }
141    }
142
143    /// 处理字符串字面量
144    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
145        let start_pos = state.get_position();
146
147        if let Some('"') = state.peek() {
148            state.advance(1);
149
150            while let Some(ch) = state.peek() {
151                if ch == '"' {
152                    state.advance(1);
153                    break;
154                }
155                else if ch == '\\' {
156                    state.advance(1);
157                    if let Some(escaped) = state.peek() {
158                        state.advance(escaped.len_utf8());
159                    }
160                }
161                else if ch == '\n' || ch == '\r' {
162                    break; // 字符串不能跨行
163                }
164                else {
165                    state.advance(ch.len_utf8());
166                }
167            }
168            state.add_token(LeanSyntaxKind::StringLiteral, start_pos, state.get_position());
169            true
170        }
171        else {
172            false
173        }
174    }
175
176    /// 处理字符字面量
177    fn lex_char<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
178        let start_pos = state.get_position();
179
180        if let Some('\'') = state.peek() {
181            state.advance(1);
182
183            while let Some(ch) = state.peek() {
184                if ch == '\'' {
185                    state.advance(1);
186                    break;
187                }
188                else if ch == '\\' {
189                    state.advance(1);
190                    if let Some(escaped) = state.peek() {
191                        state.advance(escaped.len_utf8());
192                    }
193                }
194                else if ch == '\n' || ch == '\r' {
195                    break;
196                }
197                else {
198                    state.advance(ch.len_utf8());
199                }
200            }
201            state.add_token(LeanSyntaxKind::CharLiteral, start_pos, state.get_position());
202            true
203        }
204        else {
205            false
206        }
207    }
208
209    /// 处理数字字面量
210    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
211        let start_pos = state.get_position();
212
213        if let Some(ch) = state.peek() {
214            if ch.is_ascii_digit() {
215                // 扫描数字
216                while let Some(ch) = state.peek() {
217                    if ch.is_ascii_digit() {
218                        state.advance(ch.len_utf8());
219                    }
220                    else {
221                        break;
222                    }
223                }
224
225                // 检查小数点
226                if let Some('.') = state.peek() {
227                    state.advance(1); // 跳过小数点
228                    if let Some(next_char) = state.peek() {
229                        if next_char.is_ascii_digit() {
230                            while let Some(ch) = state.peek() {
231                                if ch.is_ascii_digit() {
232                                    state.advance(ch.len_utf8());
233                                }
234                                else {
235                                    break;
236                                }
237                            }
238                        }
239                    }
240                }
241
242                // 检查指数部分
243                if let Some(ch) = state.peek() {
244                    if ch == 'e' || ch == 'E' {
245                        state.advance(1);
246                        if let Some(sign) = state.peek() {
247                            if sign == '+' || sign == '-' {
248                                state.advance(1);
249                            }
250                        }
251                        while let Some(ch) = state.peek() {
252                            if ch.is_ascii_digit() {
253                                state.advance(ch.len_utf8());
254                            }
255                            else {
256                                break;
257                            }
258                        }
259                    }
260                }
261
262                state.add_token(LeanSyntaxKind::IntegerLiteral, start_pos, state.get_position());
263                true
264            }
265            else {
266                false
267            }
268        }
269        else {
270            false
271        }
272    }
273
274    /// 处理标识符和关键字
275    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
276        let start_pos = state.get_position();
277
278        if let Some(ch) = state.peek() {
279            if ch.is_alphabetic() || ch == '_' {
280                state.advance(ch.len_utf8());
281
282                while let Some(ch) = state.peek() {
283                    if ch.is_alphanumeric() || ch == '_' || ch == '\'' {
284                        state.advance(ch.len_utf8());
285                    }
286                    else {
287                        break;
288                    }
289                }
290
291                let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
292                let kind = match text.as_ref() {
293                    "axiom" => LeanSyntaxKind::Axiom,
294                    "constant" => LeanSyntaxKind::Constant,
295                    "def" => LeanSyntaxKind::Def,
296                    "example" => LeanSyntaxKind::Example,
297                    "inductive" => LeanSyntaxKind::Inductive,
298                    "lemma" => LeanSyntaxKind::Lemma,
299                    "namespace" => LeanSyntaxKind::Namespace,
300                    "open" => LeanSyntaxKind::Open,
301                    "private" => LeanSyntaxKind::Private,
302                    "protected" => LeanSyntaxKind::Protected,
303                    "section" => LeanSyntaxKind::Section,
304                    "structure" => LeanSyntaxKind::Structure,
305                    "theorem" => LeanSyntaxKind::Theorem,
306                    "universe" => LeanSyntaxKind::Universe,
307                    "variable" => LeanSyntaxKind::Variable,
308                    "variables" => LeanSyntaxKind::Variables,
309                    "end" => LeanSyntaxKind::End,
310                    "import" => LeanSyntaxKind::Import,
311                    "export" => LeanSyntaxKind::Export,
312                    "prelude" => LeanSyntaxKind::Prelude,
313                    "noncomputable" => LeanSyntaxKind::Noncomputable,
314                    "partial" => LeanSyntaxKind::Partial,
315                    "unsafe" => LeanSyntaxKind::Unsafe,
316                    "mutual" => LeanSyntaxKind::Mutual,
317                    "where" => LeanSyntaxKind::Where,
318                    "have" => LeanSyntaxKind::Have,
319                    "show" => LeanSyntaxKind::Show,
320                    "suffices" => LeanSyntaxKind::Suffices,
321                    "let" => LeanSyntaxKind::Let,
322                    "in" => LeanSyntaxKind::In,
323                    "if" => LeanSyntaxKind::If,
324                    "then" => LeanSyntaxKind::Then,
325                    "else" => LeanSyntaxKind::Else,
326                    "match" => LeanSyntaxKind::Match,
327                    "with" => LeanSyntaxKind::With,
328                    "fun" => LeanSyntaxKind::Fun,
329                    "do" => LeanSyntaxKind::Do,
330                    "for" => LeanSyntaxKind::For,
331                    "while" => LeanSyntaxKind::While,
332                    "break" => LeanSyntaxKind::Break,
333                    "continue" => LeanSyntaxKind::Continue,
334                    "return" => LeanSyntaxKind::Return,
335                    "try" => LeanSyntaxKind::Try,
336                    "catch" => LeanSyntaxKind::Catch,
337                    "finally" => LeanSyntaxKind::Finally,
338                    "throw" => LeanSyntaxKind::Throw,
339                    _ => LeanSyntaxKind::Identifier,
340                };
341
342                state.add_token(kind, start_pos, state.get_position());
343                true
344            }
345            else {
346                false
347            }
348        }
349        else {
350            false
351        }
352    }
353
354    /// 处理操作符和分隔符
355    fn lex_operator_or_delimiter<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
356        let start_pos = state.get_position();
357
358        if let Some(ch) = state.peek() {
359            let kind = match ch {
360                '(' => LeanSyntaxKind::LeftParen,
361                ')' => LeanSyntaxKind::RightParen,
362                '{' => LeanSyntaxKind::LeftBrace,
363                '}' => LeanSyntaxKind::RightBrace,
364                '[' => LeanSyntaxKind::LeftBracket,
365                ']' => LeanSyntaxKind::RightBracket,
366                ',' => LeanSyntaxKind::Comma,
367                ';' => LeanSyntaxKind::Semicolon,
368                '+' => LeanSyntaxKind::Plus,
369                '*' => LeanSyntaxKind::Star,
370                '/' => LeanSyntaxKind::Slash,
371                '%' => LeanSyntaxKind::Percent,
372                '^' => LeanSyntaxKind::Caret,
373                '#' => LeanSyntaxKind::Hash,
374                '&' => LeanSyntaxKind::Ampersand,
375                '|' => LeanSyntaxKind::Pipe,
376                '~' => LeanSyntaxKind::Tilde,
377                '!' => LeanSyntaxKind::Bang,
378                '?' => LeanSyntaxKind::Question,
379                '@' => LeanSyntaxKind::At,
380                '$' => LeanSyntaxKind::Dollar,
381                '<' => LeanSyntaxKind::Lt,
382                '>' => LeanSyntaxKind::Gt,
383                '=' => LeanSyntaxKind::Eq,
384                _ => {
385                    // 检查多字符操作符
386                    match ch {
387                        ':' => {
388                            state.advance(1);
389                            if let Some('=') = state.peek() {
390                                state.advance(1);
391                                state.add_token(LeanSyntaxKind::ColonEq, start_pos, state.get_position());
392                                return true;
393                            }
394                            else if let Some(':') = state.peek() {
395                                state.advance(1);
396                                state.add_token(LeanSyntaxKind::ColonColon, start_pos, state.get_position());
397                                return true;
398                            }
399                            else {
400                                state.add_token(LeanSyntaxKind::Colon, start_pos, state.get_position());
401                                return true;
402                            }
403                        }
404                        '.' => {
405                            state.advance(1);
406                            if let Some('.') = state.peek() {
407                                state.advance(1);
408                                state.add_token(LeanSyntaxKind::DotDot, start_pos, state.get_position());
409                                return true;
410                            }
411                            else {
412                                state.add_token(LeanSyntaxKind::Dot, start_pos, state.get_position());
413                                return true;
414                            }
415                        }
416                        '-' => {
417                            state.advance(1);
418                            if let Some('>') = state.peek() {
419                                state.advance(1);
420                                state.add_token(LeanSyntaxKind::Arrow, start_pos, state.get_position());
421                                return true;
422                            }
423                            else {
424                                state.add_token(LeanSyntaxKind::Minus, start_pos, state.get_position());
425                                return true;
426                            }
427                        }
428                        _ => return false,
429                    }
430                }
431            };
432
433            state.advance(ch.len_utf8());
434            state.add_token(kind, start_pos, state.get_position());
435            true
436        }
437        else {
438            false
439        }
440    }
441
442    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
443        while state.not_at_end() {
444            let safe_point = state.get_position();
445            if self.skip_whitespace(state) {
446                continue;
447            }
448
449            if self.lex_newline(state) {
450                continue;
451            }
452
453            if self.lex_comment(state) {
454                continue;
455            }
456
457            if self.lex_string(state) {
458                continue;
459            }
460
461            if self.lex_char(state) {
462                continue;
463            }
464
465            if self.lex_number(state) {
466                continue;
467            }
468
469            if self.lex_identifier_or_keyword(state) {
470                continue;
471            }
472
473            if self.lex_operator_or_delimiter(state) {
474                continue;
475            }
476
477            // 如果所有规则都不匹配,跳过当前字符并标记为错误
478            let start_pos = state.get_position();
479            if let Some(ch) = state.peek() {
480                state.advance(ch.len_utf8());
481                state.add_token(LeanSyntaxKind::Error, start_pos, state.get_position());
482            }
483
484            state.advance_if_dead_lock(safe_point);
485        }
486
487        Ok(())
488    }
489}