// oak_lean/lexer/mod.rs

#![doc = include_str!("readme.md")]
/// Token type definitions for the Lean language lexer.
pub mod token_type;

// Lean language lexer
//
// Provides lexical analysis for the Lean language, converting source text into a token stream.
use crate::{language::LeanLanguage, lexer::token_type::LeanTokenType};
use oak_core::{Lexer, LexerCache, LexerState, OakError, TextEdit, lexer::LexOutput, source::Source};

/// Lexer state specialised to the Lean language; `S` is the source text type.
pub(crate) type State<'a, S> = LexerState<'a, S, LeanLanguage>;
/// Lean lexer
///
/// Borrows its configuration from a [`LeanLanguage`] for the lifetime `'config`.
#[derive(Debug, Clone)]
pub struct LeanLexer<'config> {
    // Language configuration (currently only stored; no rule in this file reads it).
    config: &'config LeanLanguage,
}

impl<'config> Lexer<LeanLanguage> for LeanLexer<'config> {
    /// Lexes `text` into a token stream.
    ///
    /// `_edits` is ignored: the input is always re-lexed from scratch rather
    /// than incrementally. The EOF token is appended only when scanning
    /// succeeded, and the output is finalised through `cache`.
    fn lex<'a, S: Source + ?Sized>(&self, text: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<LeanLanguage>) -> LexOutput<LeanLanguage> {
        let mut state = State::new(text);
        let result = self.run(&mut state);
        if result.is_ok() {
            state.add_eof()
        }
        state.finish_with_cache(result, cache)
    }
}

impl<'config> LeanLexer<'config> {
    /// Creates a new Lean lexer borrowing the given language configuration.
    pub fn new(config: &'config LeanLanguage) -> Self {
        LeanLexer { config }
    }

36    /// Skips whitespace
37    fn skip_whitespace<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
38        let start_pos = state.get_position();
39
40        while let Some(ch) = state.peek() {
41            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
42        }
43
44        if state.get_position() > start_pos {
45            state.add_token(LeanTokenType::Whitespace, start_pos, state.get_position());
46            true
47        }
48        else {
49            false
50        }
51    }
52
53    /// Handles newlines
54    fn lex_newline<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
55        let start_pos = state.get_position();
56
57        if let Some('\n') = state.peek() {
58            state.advance(1);
59            state.add_token(LeanTokenType::Newline, start_pos, state.get_position());
60            true
61        }
62        else if let Some('\r') = state.peek() {
63            state.advance(1);
64            if let Some('\n') = state.peek() {
65                state.advance(1)
66            }
67            state.add_token(LeanTokenType::Newline, start_pos, state.get_position());
68            true
69        }
70        else {
71            false
72        }
73    }
74
    /// Handles comments.
    ///
    /// Recognises `--` single-line comments (up to but excluding the line
    /// break) and `/- … -/` block comments, which may nest; both are emitted
    /// as a single `Comment` token. When the first character matches but the
    /// second does not (a lone `-` or `/`), the position is rewound so the
    /// operator rules can lex it instead.
    ///
    /// NOTE(review): an unterminated block comment is still emitted as a
    /// `Comment` token reaching end of input rather than an error — confirm
    /// that this leniency is intended.
    fn lex_comment<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        let start_pos = state.get_position();

        if let Some('-') = state.peek() {
            state.advance(1);
            if let Some('-') = state.peek() {
                // Single-line comment
                state.advance(1);
                while let Some(ch) = state.peek() {
                    if ch == '\n' || ch == '\r' {
                        break;
                    }
                    state.advance(ch.len_utf8())
                }
                state.add_token(LeanTokenType::Comment, start_pos, state.get_position());
                true
            }
            else {
                // Backtrack, this is a minus operator
                state.set_position(start_pos);
                false
            }
        }
        else if let Some('/') = state.peek() {
            state.advance(1);
            if let Some('-') = state.peek() {
                // Block comment start
                state.advance(1);
                // depth tracks nesting: each inner `/-` increments, each `-/` decrements.
                let mut depth = 1;
                while depth > 0 && state.not_at_end() {
                    if let Some('/') = state.peek() {
                        state.advance(1);
                        // Only `/` immediately followed by `-` opens a nested comment.
                        if let Some('-') = state.peek() {
                            state.advance(1);
                            depth += 1
                        }
                    }
                    else if let Some('-') = state.peek() {
                        state.advance(1);
                        // Only `-` immediately followed by `/` closes a level.
                        if let Some('/') = state.peek() {
                            state.advance(1);
                            depth -= 1
                        }
                    }
                    else if let Some(ch) = state.peek() {
                        state.advance(ch.len_utf8())
                    }
                    else {
                        break;
                    }
                }
                state.add_token(LeanTokenType::Comment, start_pos, state.get_position());
                true
            }
            else {
                // Backtrack, this is a division operator
                state.set_position(start_pos);
                false
            }
        }
        else {
            false
        }
    }

141    /// Handles string literals
142    fn lex_string<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
143        let start_pos = state.get_position();
144
145        if let Some('"') = state.peek() {
146            state.advance(1);
147
148            while let Some(ch) = state.peek() {
149                if ch == '"' {
150                    state.advance(1);
151                    break;
152                }
153                else if ch == '\\' {
154                    state.advance(1);
155                    if let Some(escaped) = state.peek() {
156                        state.advance(escaped.len_utf8())
157                    }
158                }
159                else if ch == '\n' || ch == '\r' {
160                    break; // String cannot span lines
161                }
162                else {
163                    state.advance(ch.len_utf8())
164                }
165            }
166            state.add_token(LeanTokenType::StringLiteral, start_pos, state.get_position());
167            true
168        }
169        else {
170            false
171        }
172    }
173
174    /// Handles character literals
175    fn lex_char<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
176        let start_pos = state.get_position();
177
178        if let Some('\'') = state.peek() {
179            state.advance(1);
180
181            while let Some(ch) = state.peek() {
182                if ch == '\'' {
183                    state.advance(1);
184                    break;
185                }
186                else if ch == '\\' {
187                    state.advance(1);
188                    if let Some(escaped) = state.peek() {
189                        state.advance(escaped.len_utf8())
190                    }
191                }
192                else if ch == '\n' || ch == '\r' {
193                    break;
194                }
195                else {
196                    state.advance(ch.len_utf8())
197                }
198            }
199            state.add_token(LeanTokenType::CharLiteral, start_pos, state.get_position());
200            true
201        }
202        else {
203            false
204        }
205    }
206
207    /// Handles number literals.
208    fn lex_number<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
209        let start_pos = state.get_position();
210
211        if let Some(ch) = state.peek() {
212            if ch.is_ascii_digit() {
213                // Scan digits
214                while let Some(ch) = state.peek() {
215                    if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
216                }
217
218                // Check decimal point
219                if let Some('.') = state.peek() {
220                    state.advance(1); // Skip decimal point
221                    if let Some(next_char) = state.peek() {
222                        if next_char.is_ascii_digit() {
223                            while let Some(ch) = state.peek() {
224                                if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
225                            }
226                        }
227                    }
228                }
229
230                // Check exponent part
231                if let Some(ch) = state.peek() {
232                    if ch == 'e' || ch == 'E' {
233                        state.advance(1);
234                        if let Some(sign) = state.peek() {
235                            if sign == '+' || sign == '-' {
236                                state.advance(1)
237                            }
238                        }
239                        while let Some(ch) = state.peek() {
240                            if ch.is_ascii_digit() { state.advance(ch.len_utf8()) } else { break }
241                        }
242                    }
243                }
244
245                state.add_token(LeanTokenType::IntegerLiteral, start_pos, state.get_position());
246                true
247            }
248            else {
249                false
250            }
251        }
252        else {
253            false
254        }
255    }
256
257    /// Handles identifiers and keywords.
258    fn lex_identifier_or_keyword<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
259        let start_pos = state.get_position();
260
261        if let Some(ch) = state.peek() {
262            if ch.is_alphabetic() || ch == '_' {
263                state.advance(ch.len_utf8());
264
265                while let Some(ch) = state.peek() {
266                    if ch.is_alphanumeric() || ch == '_' || ch == '\'' { state.advance(ch.len_utf8()) } else { break }
267                }
268
269                let text = state.get_text_in(oak_core::Range { start: start_pos, end: state.get_position() });
270                let kind = match text.as_ref() {
271                    "axiom" => LeanTokenType::Axiom,
272                    "constant" => LeanTokenType::Constant,
273                    "def" => LeanTokenType::Def,
274                    "example" => LeanTokenType::Example,
275                    "inductive" => LeanTokenType::Inductive,
276                    "lemma" => LeanTokenType::Lemma,
277                    "namespace" => LeanTokenType::Namespace,
278                    "open" => LeanTokenType::Open,
279                    "private" => LeanTokenType::Private,
280                    "protected" => LeanTokenType::Protected,
281                    "section" => LeanTokenType::Section,
282                    "structure" => LeanTokenType::Structure,
283                    "theorem" => LeanTokenType::Theorem,
284                    "universe" => LeanTokenType::Universe,
285                    "variable" => LeanTokenType::Variable,
286                    "variables" => LeanTokenType::Variables,
287                    "end" => LeanTokenType::End,
288                    "import" => LeanTokenType::Import,
289                    "export" => LeanTokenType::Export,
290                    "prelude" => LeanTokenType::Prelude,
291                    "noncomputable" => LeanTokenType::Noncomputable,
292                    "partial" => LeanTokenType::Partial,
293                    "unsafe" => LeanTokenType::Unsafe,
294                    "mutual" => LeanTokenType::Mutual,
295                    "where" => LeanTokenType::Where,
296                    "have" => LeanTokenType::Have,
297                    "show" => LeanTokenType::Show,
298                    "suffices" => LeanTokenType::Suffices,
299                    "let" => LeanTokenType::Let,
300                    "in" => LeanTokenType::In,
301                    "if" => LeanTokenType::If,
302                    "then" => LeanTokenType::Then,
303                    "else" => LeanTokenType::Else,
304                    "match" => LeanTokenType::Match,
305                    "with" => LeanTokenType::With,
306                    "fun" => LeanTokenType::Fun,
307                    "do" => LeanTokenType::Do,
308                    "for" => LeanTokenType::For,
309                    "while" => LeanTokenType::While,
310                    "break" => LeanTokenType::Break,
311                    "continue" => LeanTokenType::Continue,
312                    "return" => LeanTokenType::Return,
313                    "try" => LeanTokenType::Try,
314                    "catch" => LeanTokenType::Catch,
315                    "finally" => LeanTokenType::Finally,
316                    "throw" => LeanTokenType::Throw,
317                    _ => LeanTokenType::Identifier,
318                };
319
320                state.add_token(kind, start_pos, state.get_position());
321                true
322            }
323            else {
324                false
325            }
326        }
327        else {
328            false
329        }
330    }
331
    /// Handles operators and delimiters.
    ///
    /// Emits a single-character token; multi-character operators (e.g. `:=`,
    /// `->`, `==`) are not combined here.
    ///
    /// NOTE(review): `-`, `.` and `:` are absent from this table even though
    /// `lex_comment` backtracks a lone `-` expecting a minus operator to be
    /// lexed afterwards — those characters currently fall through to `Error`
    /// tokens in `run`. Confirm whether dedicated token types exist for them.
    fn lex_operator_or_delimiter<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let kind = match ch {
                '(' => LeanTokenType::LeftParen,
                ')' => LeanTokenType::RightParen,
                '{' => LeanTokenType::LeftBrace,
                '}' => LeanTokenType::RightBrace,
                '[' => LeanTokenType::LeftBracket,
                ']' => LeanTokenType::RightBracket,
                ',' => LeanTokenType::Comma,
                ';' => LeanTokenType::Semicolon,
                '+' => LeanTokenType::Plus,
                '*' => LeanTokenType::Star,
                '/' => LeanTokenType::Slash,
                '%' => LeanTokenType::Percent,
                '^' => LeanTokenType::Caret,
                '#' => LeanTokenType::Hash,
                '&' => LeanTokenType::Ampersand,
                '|' => LeanTokenType::Pipe,
                '~' => LeanTokenType::Tilde,
                '!' => LeanTokenType::Bang,
                '?' => LeanTokenType::Question,
                '@' => LeanTokenType::At,
                '$' => LeanTokenType::Dollar,
                '<' => LeanTokenType::Lt,
                '>' => LeanTokenType::Gt,
                '=' => LeanTokenType::Eq,
                _ => return false,
            };

            state.advance(ch.len_utf8());
            state.add_token(kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }

374    fn run<S: Source + ?Sized>(&self, state: &mut State<'_, S>) -> Result<(), OakError> {
375        while state.not_at_end() {
376            if self.skip_whitespace(state) || self.lex_newline(state) || self.lex_comment(state) || self.lex_string(state) || self.lex_char(state) || self.lex_number(state) || self.lex_identifier_or_keyword(state) || self.lex_operator_or_delimiter(state) {
377                continue;
378            }
379
380            // If no rules match, skip current character and mark as error.
381            let start_pos = state.get_position();
382            if let Some(ch) = state.peek() {
383                state.advance(ch.len_utf8());
384                state.add_token(LeanTokenType::Error, start_pos, state.get_position());
385            }
386        }
387
388        Ok(())
389    }
390}