oak_lean/lexer/
mod.rs

1//! Lean 语言词法分析器
2//!
3//! 提供 Lean 语言的词法分析功能,将源代码文本转换为标记流
4
5use crate::{kind::LeanSyntaxKind, language::LeanLanguage};
6use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
7
8type State<S> = LexerState<S, LeanLanguage>;
9
10/// Lean 词法分析器
11pub struct LeanLexer<'config> {
12    config: &'config LeanLanguage,
13}
14
15impl<'config> LeanLexer<'config> {
16    /// 创建新的 Lean 词法分析器
17    pub fn new(config: &'config LeanLanguage) -> Self {
18        Self { config }
19    }
20
21    /// 跳过空白字符
22    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
23        let start_pos = state.get_position();
24
25        while let Some(ch) = state.peek() {
26            if ch == ' ' || ch == '\t' {
27                state.advance(ch.len_utf8());
28            }
29            else {
30                break;
31            }
32        }
33
34        if state.get_position() > start_pos {
35            state.add_token(LeanSyntaxKind::Whitespace, start_pos, state.get_position());
36            true
37        }
38        else {
39            false
40        }
41    }
42
43    /// 处理换行
44    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
45        let start_pos = state.get_position();
46
47        if let Some('\n') = state.peek() {
48            state.advance(1);
49            state.add_token(LeanSyntaxKind::Whitespace, start_pos, state.get_position());
50            true
51        }
52        else if let Some('\r') = state.peek() {
53            state.advance(1);
54            if let Some('\n') = state.peek() {
55                state.advance(1);
56            }
57            state.add_token(LeanSyntaxKind::Whitespace, start_pos, state.get_position());
58            true
59        }
60        else {
61            false
62        }
63    }
64
65    /// 处理注释
66    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
67        let start_pos = state.get_position();
68
69        if let Some('-') = state.peek() {
70            state.advance(1);
71            if let Some('-') = state.peek() {
72                // 单行注释
73                state.advance(1);
74                while let Some(ch) = state.peek() {
75                    if ch == '\n' || ch == '\r' {
76                        break;
77                    }
78                    state.advance(ch.len_utf8());
79                }
80                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
81                true
82            }
83            else {
84                // 回退,这是减号操作符
85                state.set_position(start_pos);
86                false
87            }
88        }
89        else if let Some('/') = state.peek() {
90            state.advance(1);
91            if let Some('-') = state.peek() {
92                // 块注释开始
93                state.advance(1);
94                let mut depth = 1;
95                while depth > 0 && state.not_at_end() {
96                    if let Some('/') = state.peek() {
97                        state.advance(1);
98                        if let Some('-') = state.peek() {
99                            state.advance(1);
100                            depth += 1;
101                        }
102                    }
103                    else if let Some('-') = state.peek() {
104                        state.advance(1);
105                        if let Some('/') = state.peek() {
106                            state.advance(1);
107                            depth -= 1;
108                        }
109                    }
110                    else {
111                        let ch = state.peek().unwrap();
112                        state.advance(ch.len_utf8());
113                    }
114                }
115                state.add_token(LeanSyntaxKind::Comment, start_pos, state.get_position());
116                true
117            }
118            else {
119                // 回退,这是除法操作符
120                state.set_position(start_pos);
121                false
122            }
123        }
124        else {
125            false
126        }
127    }
128
129    /// 处理字符串字面量
130    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
131        let start_pos = state.get_position();
132
133        if let Some('"') = state.peek() {
134            state.advance(1);
135
136            while let Some(ch) = state.peek() {
137                if ch == '"' {
138                    state.advance(1);
139                    break;
140                }
141                else if ch == '\\' {
142                    state.advance(1);
143                    if let Some(escaped) = state.peek() {
144                        state.advance(escaped.len_utf8());
145                    }
146                }
147                else if ch == '\n' || ch == '\r' {
148                    break; // 字符串不能跨行
149                }
150                else {
151                    state.advance(ch.len_utf8());
152                }
153            }
154            state.add_token(LeanSyntaxKind::StringLiteral, start_pos, state.get_position());
155            true
156        }
157        else {
158            false
159        }
160    }
161
162    /// 处理字符字面量
163    fn lex_char<S: Source>(&self, state: &mut State<S>) -> bool {
164        let start_pos = state.get_position();
165
166        if let Some('\'') = state.peek() {
167            state.advance(1);
168
169            while let Some(ch) = state.peek() {
170                if ch == '\'' {
171                    state.advance(1);
172                    break;
173                }
174                else if ch == '\\' {
175                    state.advance(1);
176                    if let Some(escaped) = state.peek() {
177                        state.advance(escaped.len_utf8());
178                    }
179                }
180                else if ch == '\n' || ch == '\r' {
181                    break;
182                }
183                else {
184                    state.advance(ch.len_utf8());
185                }
186            }
187            state.add_token(LeanSyntaxKind::CharLiteral, start_pos, state.get_position());
188            true
189        }
190        else {
191            false
192        }
193    }
194
195    /// 处理数字字面量
196    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
197        let start_pos = state.get_position();
198
199        if let Some(ch) = state.peek() {
200            if ch.is_ascii_digit() {
201                // 扫描数字
202                while let Some(ch) = state.peek() {
203                    if ch.is_ascii_digit() {
204                        state.advance(ch.len_utf8());
205                    }
206                    else {
207                        break;
208                    }
209                }
210
211                // 检查小数点
212                if let Some('.') = state.peek() {
213                    state.advance(1); // 跳过小数点
214                    if let Some(next_char) = state.peek() {
215                        if next_char.is_ascii_digit() {
216                            while let Some(ch) = state.peek() {
217                                if ch.is_ascii_digit() {
218                                    state.advance(ch.len_utf8());
219                                }
220                                else {
221                                    break;
222                                }
223                            }
224                        }
225                    }
226                }
227
228                // 检查指数部分
229                if let Some(ch) = state.peek() {
230                    if ch == 'e' || ch == 'E' {
231                        state.advance(1);
232                        if let Some(sign) = state.peek() {
233                            if sign == '+' || sign == '-' {
234                                state.advance(1);
235                            }
236                        }
237                        while let Some(ch) = state.peek() {
238                            if ch.is_ascii_digit() {
239                                state.advance(ch.len_utf8());
240                            }
241                            else {
242                                break;
243                            }
244                        }
245                    }
246                }
247
248                state.add_token(LeanSyntaxKind::IntegerLiteral, start_pos, state.get_position());
249                true
250            }
251            else {
252                false
253            }
254        }
255        else {
256            false
257        }
258    }
259
260    /// 处理标识符和关键字
261    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
262        let start_pos = state.get_position();
263
264        if let Some(ch) = state.peek() {
265            if ch.is_alphabetic() || ch == '_' {
266                state.advance(ch.len_utf8());
267
268                while let Some(ch) = state.peek() {
269                    if ch.is_alphanumeric() || ch == '_' || ch == '\'' {
270                        state.advance(ch.len_utf8());
271                    }
272                    else {
273                        break;
274                    }
275                }
276
277                let text = state.get_text_in((start_pos..state.get_position()).into());
278                let kind = match text {
279                    "axiom" => LeanSyntaxKind::Axiom,
280                    "constant" => LeanSyntaxKind::Constant,
281                    "def" => LeanSyntaxKind::Def,
282                    "example" => LeanSyntaxKind::Example,
283                    "inductive" => LeanSyntaxKind::Inductive,
284                    "lemma" => LeanSyntaxKind::Lemma,
285                    "namespace" => LeanSyntaxKind::Namespace,
286                    "open" => LeanSyntaxKind::Open,
287                    "private" => LeanSyntaxKind::Private,
288                    "protected" => LeanSyntaxKind::Protected,
289                    "section" => LeanSyntaxKind::Section,
290                    "structure" => LeanSyntaxKind::Structure,
291                    "theorem" => LeanSyntaxKind::Theorem,
292                    "universe" => LeanSyntaxKind::Universe,
293                    "variable" => LeanSyntaxKind::Variable,
294                    "variables" => LeanSyntaxKind::Variables,
295                    "end" => LeanSyntaxKind::End,
296                    "import" => LeanSyntaxKind::Import,
297                    "export" => LeanSyntaxKind::Export,
298                    "prelude" => LeanSyntaxKind::Prelude,
299                    "noncomputable" => LeanSyntaxKind::Noncomputable,
300                    "partial" => LeanSyntaxKind::Partial,
301                    "unsafe" => LeanSyntaxKind::Unsafe,
302                    "mutual" => LeanSyntaxKind::Mutual,
303                    "where" => LeanSyntaxKind::Where,
304                    "have" => LeanSyntaxKind::Have,
305                    "show" => LeanSyntaxKind::Show,
306                    "suffices" => LeanSyntaxKind::Suffices,
307                    "let" => LeanSyntaxKind::Let,
308                    "in" => LeanSyntaxKind::In,
309                    "if" => LeanSyntaxKind::If,
310                    "then" => LeanSyntaxKind::Then,
311                    "else" => LeanSyntaxKind::Else,
312                    "match" => LeanSyntaxKind::Match,
313                    "with" => LeanSyntaxKind::With,
314                    "fun" => LeanSyntaxKind::Fun,
315                    "do" => LeanSyntaxKind::Do,
316                    "for" => LeanSyntaxKind::For,
317                    "while" => LeanSyntaxKind::While,
318                    "break" => LeanSyntaxKind::Break,
319                    "continue" => LeanSyntaxKind::Continue,
320                    "return" => LeanSyntaxKind::Return,
321                    "try" => LeanSyntaxKind::Try,
322                    "catch" => LeanSyntaxKind::Catch,
323                    "finally" => LeanSyntaxKind::Finally,
324                    "throw" => LeanSyntaxKind::Throw,
325                    _ => LeanSyntaxKind::Identifier,
326                };
327
328                state.add_token(kind, start_pos, state.get_position());
329                true
330            }
331            else {
332                false
333            }
334        }
335        else {
336            false
337        }
338    }
339
340    /// 处理操作符和分隔符
341    fn lex_operator_or_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
342        let start_pos = state.get_position();
343
344        if let Some(ch) = state.peek() {
345            let kind = match ch {
346                '(' => LeanSyntaxKind::LeftParen,
347                ')' => LeanSyntaxKind::RightParen,
348                '{' => LeanSyntaxKind::LeftBrace,
349                '}' => LeanSyntaxKind::RightBrace,
350                '[' => LeanSyntaxKind::LeftBracket,
351                ']' => LeanSyntaxKind::RightBracket,
352                ',' => LeanSyntaxKind::Comma,
353                ';' => LeanSyntaxKind::Semicolon,
354                '+' => LeanSyntaxKind::Plus,
355                '*' => LeanSyntaxKind::Star,
356                '/' => LeanSyntaxKind::Slash,
357                '%' => LeanSyntaxKind::Percent,
358                '^' => LeanSyntaxKind::Caret,
359                '#' => LeanSyntaxKind::Hash,
360                '&' => LeanSyntaxKind::Ampersand,
361                '|' => LeanSyntaxKind::Pipe,
362                '~' => LeanSyntaxKind::Tilde,
363                '!' => LeanSyntaxKind::Bang,
364                '?' => LeanSyntaxKind::Question,
365                '@' => LeanSyntaxKind::At,
366                '$' => LeanSyntaxKind::Dollar,
367                '<' => LeanSyntaxKind::Lt,
368                '>' => LeanSyntaxKind::Gt,
369                '=' => LeanSyntaxKind::Eq,
370                _ => {
371                    // 检查多字符操作符
372                    match ch {
373                        ':' => {
374                            state.advance(1);
375                            if let Some('=') = state.peek() {
376                                state.advance(1);
377                                state.add_token(LeanSyntaxKind::ColonEq, start_pos, state.get_position());
378                                return true;
379                            }
380                            else if let Some(':') = state.peek() {
381                                state.advance(1);
382                                state.add_token(LeanSyntaxKind::ColonColon, start_pos, state.get_position());
383                                return true;
384                            }
385                            else {
386                                state.add_token(LeanSyntaxKind::Colon, start_pos, state.get_position());
387                                return true;
388                            }
389                        }
390                        '.' => {
391                            state.advance(1);
392                            if let Some('.') = state.peek() {
393                                state.advance(1);
394                                state.add_token(LeanSyntaxKind::DotDot, start_pos, state.get_position());
395                                return true;
396                            }
397                            else {
398                                state.add_token(LeanSyntaxKind::Dot, start_pos, state.get_position());
399                                return true;
400                            }
401                        }
402                        '-' => {
403                            state.advance(1);
404                            if let Some('>') = state.peek() {
405                                state.advance(1);
406                                state.add_token(LeanSyntaxKind::Arrow, start_pos, state.get_position());
407                                return true;
408                            }
409                            else {
410                                state.add_token(LeanSyntaxKind::Minus, start_pos, state.get_position());
411                                return true;
412                            }
413                        }
414                        _ => return false,
415                    }
416                }
417            };
418
419            state.advance(ch.len_utf8());
420            state.add_token(kind, start_pos, state.get_position());
421            true
422        }
423        else {
424            false
425        }
426    }
427}
428
429impl<'config> Lexer<LeanLanguage> for LeanLexer<'config> {
430    fn lex_incremental(
431        &self,
432        source: impl Source,
433        changed: usize,
434        cache: IncrementalCache<LeanLanguage>,
435    ) -> LexOutput<LeanLanguage> {
436        let mut state = LexerState::new_with_cache(source, changed, cache);
437        let result = self.run(&mut state);
438        state.finish(result)
439    }
440
441    fn lex(&self, source: impl Source) -> LexOutput<LeanLanguage> {
442        let mut state = State::new(source);
443        let result = self.run(&mut state);
444        state.finish(result)
445    }
446}
447
448impl<'config> LeanLexer<'config> {
449    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
450        while state.not_at_end() {
451            if self.skip_whitespace(state) {
452                continue;
453            }
454
455            if self.lex_newline(state) {
456                continue;
457            }
458
459            if self.lex_comment(state) {
460                continue;
461            }
462
463            if self.lex_string(state) {
464                continue;
465            }
466
467            if self.lex_char(state) {
468                continue;
469            }
470
471            if self.lex_number(state) {
472                continue;
473            }
474
475            if self.lex_identifier_or_keyword(state) {
476                continue;
477            }
478
479            if self.lex_operator_or_delimiter(state) {
480                continue;
481            }
482
483            // 如果所有规则都不匹配,跳过当前字符并标记为错误
484            let start_pos = state.get_position();
485            if let Some(ch) = state.peek() {
486                state.advance(ch.len_utf8());
487                state.add_token(LeanSyntaxKind::Error, start_pos, state.get_position());
488            }
489        }
490
491        // 添加 EOF kind
492        let eof_pos = state.get_position();
493        state.add_token(LeanSyntaxKind::Eof, eof_pos, eof_pos);
494
495        Ok(())
496    }
497}