#![doc = include_str!("readme.md")]
pub mod token_type;

// Lexer for the Lean language: converts source text into a stream of tokens.
// (Plain `//` comments here — `///` doc comments before a `use` item are
// rejected/ignored by rustc as unused doc comments.)
use crate::{language::LeanLanguage, lexer::token_type::LeanTokenType};
use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};
9
/// Shorthand for the shared lexer state, specialized to the Lean language.
type State<'a, S> = LexerState<'a, S, LeanLanguage>;

/// Lexer for the Lean language.
///
/// Converts Lean source text into a flat stream of [`LeanTokenType`] tokens.
#[derive(Debug, Clone)]
pub struct LeanLexer<'config> {
    /// Borrowed language configuration; currently unused beyond tying the
    /// lexer's lifetime to the configuration it was created from.
    _config: &'config LeanLanguage,
}
17
18impl<'config> Lexer<LeanLanguage> for LeanLexer<'config> {
19    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<LeanLanguage>) -> LexOutput<LeanLanguage> {
20        let mut state = State::new(text);
21        let result = self.run(&mut state);
22        if result.is_ok() {
23            state.add_eof()
24        }
25        state.finish_with_cache(result, cache)
26    }
27}
28
29impl<'config> LeanLexer<'config> {
30    /// 创建新的 Lean 词法分析器
31    pub fn new(config: &'config LeanLanguage) -> Self {
32        Self { _config: config }
33    }
34
35    /// 跳过空白字符
36    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
37        let start_pos = state.get_position();
38
39        while let Some(ch) = state.peek() {
40            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
41        }
42
43        if state.get_position() > start_pos {
44            state.add_token(LeanTokenType::Whitespace, start_pos, state.get_position());
45            true
46        }
47        else {
48            false
49        }
50    }
51
52    /// 处理换行
53    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
54        let start_pos = state.get_position();
55
56        if let Some('\n') = state.peek() {
57            state.advance(1);
58            state.add_token(LeanTokenType::Newline, start_pos, state.get_position());
59            true
60        }
61        else if let Some('\r') = state.peek() {
62            state.advance(1);
63            if let Some('\n') = state.peek() {
64                state.advance(1)
65            }
66            state.add_token(LeanTokenType::Newline, start_pos, state.get_position());
67            true
68        }
69        else {
70            false
71        }
72    }
73
74    /// 处理注释
75    fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
76        let start_pos = state.get_position();
77
78        if let Some('-') = state.peek() {
79            state.advance(1);
80            if let Some('-') = state.peek() {
81                // 单行注释
82                state.advance(1);
83                while let Some(ch) = state.peek() {
84                    if ch == '\n' || ch == '\r' {
85                        break;
86                    }
87                    state.advance(ch.len_utf8())
88                }
89                state.add_token(LeanTokenType::Comment, start_pos, state.get_position());
90                true
91            }
92            else {
93                // 回退,这是减号操作符
94                state.set_position(start_pos);
95                false
96            }
97        }
98        else if let Some('/') = state.peek() {
99            state.advance(1);
100            if let Some('-') = state.peek() {
101                // 块注释开始
102                state.advance(1);
103                let mut depth = 1;
104                while depth > 0 && state.not_at_end() {
105                    if let Some('/') = state.peek() {
106                        state.advance(1);
107                        if let Some('-') = state.peek() {
108                            state.advance(1);
109                            depth += 1
110                        }
111                    }
112                    else if let Some('-') = state.peek() {
113                        state.advance(1);
114                        if let Some('/') = state.peek() {
115                            state.advance(1);
116                            depth -= 1
117                        }
118                    }
119                    else if let Some(ch) = state.peek() {
120                        state.advance(ch.len_utf8())
121                    }
122                    else {
123                        break;
124                    }
125                }
126                state.add_token(LeanTokenType::Comment, start_pos, state.get_position());
127                true
128            }
129            else {
130                // 回退,这是除法操作符
131                state.set_position(start_pos);
132                false
133            }
134        }
135        else {
136            false
137        }
138    }
139
140    /// 处理字符串字面量
141    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
142        let start_pos = state.get_position();
143
144        if let Some('"') = state.peek() {
145            state.advance(1);
146
147            while let Some(ch) = state.peek() {
148                if ch == '"' {
149                    state.advance(1);
150                    break;
151                }
152                else if ch == '\\' {
153                    state.advance(1);
154                    if let Some(escaped) = state.peek() {
155                        state.advance(escaped.len_utf8())
156                    }
157                }
158                else if ch == '\n' || ch == '\r' {
159                    break; // 字符串不能跨行
160                }
161                else {
162                    state.advance(ch.len_utf8())
163                }
164            }
165            state.add_token(LeanTokenType::StringLiteral, start_pos, state.get_position());
166            true
167        }
168        else {
169            false
170        }
171    }
172
173    /// 处理字符字面量
174    fn lex_char<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
175        let start_pos = state.get_position();
176
177        if let Some('\'') = state.peek() {
178            state.advance(1);
179
180            while let Some(ch) = state.peek() {
181                if ch == '\'' {
182                    state.advance(1);
183                    break;
184                }
185                else if ch == '\\' {
186                    state.advance(1);
187                    if let Some(escaped) = state.peek() {
188                        state.advance(escaped.len_utf8())
189                    }
190                }
191                else if ch == '\n' || ch == '\r' {
192                    break;
193                }
194                else {
195                    state.advance(ch.len_utf8())
196                }
197            }
198            state.add_token(LeanTokenType::CharLiteral, start_pos, state.get_position());
199            true
200        }
201        else {
202            false
203        }
204    }
205
206    /// 处理数字字面量
207    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
208        let start_pos = state.get_position();
209
210        if let Some(ch) = state.peek() {
211            if ch.is_ascii_digit() {
212                // 扫描数字
213                while let Some(ch) = state.peek() {
214                    if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
215                }
216
217                // 检查小数点
218                if let Some('.') = state.peek() {
219                    state.advance(1); // 跳过小数点
220                    if let Some(next_char) = state.peek() {
221                        if next_char.is_ascii_digit() {
222                            while let Some(ch) = state.peek() {
223                                if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
224                            }
225                        }
226                    }
227                }
228
229                // 检查指数部分
230                if let Some(ch) = state.peek() {
231                    if ch == 'e' || ch == 'E' {
232                        state.advance(1);
233                        if let Some(sign) = state.peek() {
234                            if sign == '+' || sign == '-' {
235                                state.advance(1)
236                            }
237                        }
238                        while let Some(ch) = state.peek() {
239                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
240                        }
241                    }
242                }
243
244                state.add_token(LeanTokenType::IntegerLiteral, start_pos, state.get_position());
245                true
246            }
247            else {
248                false
249            }
250        }
251        else {
252            false
253        }
254    }
255
256    /// 处理标识符和关键字
257    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
258        let start_pos = state.get_position();
259
260        if let Some(ch) = state.peek() {
261            if ch.is_alphabetic() || ch == '_' {
262                state.advance(ch.len_utf8());
263
264                while let Some(ch) = state.peek() {
265                    if ch.is_alphanumeric() || ch == '_' || ch == '\'' { state.advance(ch.len_utf8()) } else { break }
266                }
267
268                let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
269                let kind = match text.as_ref() {
270                    "axiom" => LeanTokenType::Axiom,
271                    "constant" => LeanTokenType::Constant,
272                    "def" => LeanTokenType::Def,
273                    "example" => LeanTokenType::Example,
274                    "inductive" => LeanTokenType::Inductive,
275                    "lemma" => LeanTokenType::Lemma,
276                    "namespace" => LeanTokenType::Namespace,
277                    "open" => LeanTokenType::Open,
278                    "private" => LeanTokenType::Private,
279                    "protected" => LeanTokenType::Protected,
280                    "section" => LeanTokenType::Section,
281                    "structure" => LeanTokenType::Structure,
282                    "theorem" => LeanTokenType::Theorem,
283                    "universe" => LeanTokenType::Universe,
284                    "variable" => LeanTokenType::Variable,
285                    "variables" => LeanTokenType::Variables,
286                    "end" => LeanTokenType::End,
287                    "import" => LeanTokenType::Import,
288                    "export" => LeanTokenType::Export,
289                    "prelude" => LeanTokenType::Prelude,
290                    "noncomputable" => LeanTokenType::Noncomputable,
291                    "partial" => LeanTokenType::Partial,
292                    "unsafe" => LeanTokenType::Unsafe,
293                    "mutual" => LeanTokenType::Mutual,
294                    "where" => LeanTokenType::Where,
295                    "have" => LeanTokenType::Have,
296                    "show" => LeanTokenType::Show,
297                    "suffices" => LeanTokenType::Suffices,
298                    "let" => LeanTokenType::Let,
299                    "in" => LeanTokenType::In,
300                    "if" => LeanTokenType::If,
301                    "then" => LeanTokenType::Then,
302                    "else" => LeanTokenType::Else,
303                    "match" => LeanTokenType::Match,
304                    "with" => LeanTokenType::With,
305                    "fun" => LeanTokenType::Fun,
306                    "do" => LeanTokenType::Do,
307                    "for" => LeanTokenType::For,
308                    "while" => LeanTokenType::While,
309                    "break" => LeanTokenType::Break,
310                    "continue" => LeanTokenType::Continue,
311                    "return" => LeanTokenType::Return,
312                    "try" => LeanTokenType::Try,
313                    "catch" => LeanTokenType::Catch,
314                    "finally" => LeanTokenType::Finally,
315                    "throw" => LeanTokenType::Throw,
316                    _ => LeanTokenType::Identifier,
317                };
318
319                state.add_token(kind, start_pos, state.get_position());
320                true
321            }
322            else {
323                false
324            }
325        }
326        else {
327            false
328        }
329    }
330
331    /// 处理操作符和分隔符
332    fn lex_operator_or_delimiter<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
333        let start_pos = state.get_position();
334
335        if let Some(ch) = state.peek() {
336            let kind = match ch {
337                '(' => LeanTokenType::LeftParen,
338                ')' => LeanTokenType::RightParen,
339                '{' => LeanTokenType::LeftBrace,
340                '}' => LeanTokenType::RightBrace,
341                '[' => LeanTokenType::LeftBracket,
342                ']' => LeanTokenType::RightBracket,
343                ',' => LeanTokenType::Comma,
344                ';' => LeanTokenType::Semicolon,
345                '+' => LeanTokenType::Plus,
346                '*' => LeanTokenType::Star,
347                '/' => LeanTokenType::Slash,
348                '%' => LeanTokenType::Percent,
349                '^' => LeanTokenType::Caret,
350                '#' => LeanTokenType::Hash,
351                '&' => LeanTokenType::Ampersand,
352                '|' => LeanTokenType::Pipe,
353                '~' => LeanTokenType::Tilde,
354                '!' => LeanTokenType::Bang,
355                '?' => LeanTokenType::Question,
356                '@' => LeanTokenType::At,
357                '$' => LeanTokenType::Dollar,
358                '<' => LeanTokenType::Lt,
359                '>' => LeanTokenType::Gt,
360                '=' => LeanTokenType::Eq,
361                _ => return false,
362            };
363
364            state.advance(ch.len_utf8());
365            state.add_token(kind, start_pos, state.get_position());
366            true
367        }
368        else {
369            false
370        }
371    }
372
373    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
374        while state.not_at_end() {
375            if self.skip_whitespace(state) || self.lex_newline(state) || self.lex_comment(state) || self.lex_string(state) || self.lex_char(state) || self.lex_number(state) || self.lex_identifier_or_keyword(state) || self.lex_operator_or_delimiter(state) {
376                continue;
377            }
378
379            // 如果所有规则都不匹配,跳过当前字符并标记为错误
380            let start_pos = state.get_position();
381            if let Some(ch) = state.peek() {
382                state.advance(ch.len_utf8());
383                state.add_token(LeanTokenType::Error, start_pos, state.get_position());
384            }
385        }
386
387        Ok(())
388    }
389}