oak_haskell/lexer/
mod.rs

1use crate::{kind::HaskellSyntaxKind, language::HaskellLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, HaskellLanguage>;
5
6#[derive(Clone)]
7pub struct HaskellLexer<'config> {
8    _config: &'config HaskellLanguage,
9}
10
11impl<'config> HaskellLexer<'config> {
12    pub fn new(config: &'config HaskellLanguage) -> Self {
13        Self { _config: config }
14    }
15
16    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
17        let start_pos = state.get_position();
18        while let Some(ch) = state.peek() {
19            if ch == ' ' || ch == '\t' {
20                state.bump();
21            }
22            else {
23                break;
24            }
25        }
26
27        if state.get_position() > start_pos {
28            state.add_token(HaskellSyntaxKind::Whitespace, start_pos, state.get_position());
29            true
30        }
31        else {
32            false
33        }
34    }
35
36    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
37        let start_pos = state.get_position();
38
39        if let Some('\n') = state.peek() {
40            state.bump();
41            state.add_token(HaskellSyntaxKind::Newline, start_pos, state.get_position());
42            true
43        }
44        else if let Some('\r') = state.peek() {
45            state.bump();
46            if let Some('\n') = state.peek() {
47                state.bump();
48            }
49            state.add_token(HaskellSyntaxKind::Newline, start_pos, state.get_position());
50            true
51        }
52        else {
53            false
54        }
55    }
56
57    fn lex_single_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
58        let start_pos = state.get_position();
59
60        if let Some('-') = state.peek() {
61            if let Some('-') = state.peek_next_n(1) {
62                state.advance(2);
63                while let Some(ch) = state.peek() {
64                    if ch == '\n' || ch == '\r' {
65                        break;
66                    }
67                    state.bump();
68                }
69                state.add_token(HaskellSyntaxKind::Comment, start_pos, state.get_position());
70                true
71            }
72            else {
73                false
74            }
75        }
76        else {
77            false
78        }
79    }
80
81    fn lex_multi_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
82        let start_pos = state.get_position();
83
84        if let Some('{') = state.peek() {
85            if let Some('-') = state.peek_next_n(1) {
86                state.advance(2);
87                let mut depth = 1;
88                while let Some(ch) = state.peek() {
89                    if ch == '{' && state.peek_next_n(1) == Some('-') {
90                        depth += 1;
91                        state.advance(2);
92                    }
93                    else if ch == '-' && state.peek_next_n(1) == Some('}') {
94                        depth -= 1;
95                        state.advance(2);
96                        if depth == 0 {
97                            break;
98                        }
99                    }
100                    else {
101                        state.bump();
102                    }
103                }
104                state.add_token(HaskellSyntaxKind::Comment, start_pos, state.get_position());
105                true
106            }
107            else {
108                false
109            }
110        }
111        else {
112            false
113        }
114    }
115
116    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
117        let start_pos = state.get_position();
118
119        if let Some(ch) = state.peek() {
120            if ch.is_ascii_alphabetic() || ch == '_' {
121                state.bump();
122
123                while let Some(ch) = state.peek() {
124                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
125                        state.bump();
126                    }
127                    else {
128                        break;
129                    }
130                }
131
132                let end_pos = state.get_position();
133                let text = state.get_text_in((start_pos..end_pos).into());
134                let kind = self.keyword_or_identifier(text.as_ref());
135
136                state.add_token(kind, start_pos, end_pos);
137                true
138            }
139            else {
140                false
141            }
142        }
143        else {
144            false
145        }
146    }
147
148    fn keyword_or_identifier(&self, text: &str) -> HaskellSyntaxKind {
149        match text {
150            "case" => HaskellSyntaxKind::Case,
151            "class" => HaskellSyntaxKind::Class,
152            "data" => HaskellSyntaxKind::Data,
153            "default" => HaskellSyntaxKind::Default,
154            "deriving" => HaskellSyntaxKind::Deriving,
155            "do" => HaskellSyntaxKind::Do,
156            "else" => HaskellSyntaxKind::Else,
157            "if" => HaskellSyntaxKind::If,
158            "import" => HaskellSyntaxKind::Import,
159            "in" => HaskellSyntaxKind::In,
160            "infix" => HaskellSyntaxKind::Infix,
161            "infixl" => HaskellSyntaxKind::Infixl,
162            "infixr" => HaskellSyntaxKind::Infixr,
163            "instance" => HaskellSyntaxKind::Instance,
164            "let" => HaskellSyntaxKind::Let,
165            "module" => HaskellSyntaxKind::Module,
166            "newtype" => HaskellSyntaxKind::Newtype,
167            "of" => HaskellSyntaxKind::Of,
168            "then" => HaskellSyntaxKind::Then,
169            "type" => HaskellSyntaxKind::Type,
170            "where" => HaskellSyntaxKind::Where,
171            _ => HaskellSyntaxKind::Identifier,
172        }
173    }
174
175    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
176        let start_pos = state.get_position();
177
178        if let Some(ch) = state.peek() {
179            if ch.is_ascii_digit() {
180                state.bump();
181
182                while let Some(ch) = state.peek() {
183                    if ch.is_ascii_digit() {
184                        state.bump();
185                    }
186                    else if ch == '.' {
187                        state.bump();
188                        while let Some(ch) = state.peek() {
189                            if ch.is_ascii_digit() {
190                                state.bump();
191                            }
192                            else {
193                                break;
194                            }
195                        }
196                        break;
197                    }
198                    else {
199                        break;
200                    }
201                }
202
203                state.add_token(HaskellSyntaxKind::Number, start_pos, state.get_position());
204                true
205            }
206            else {
207                false
208            }
209        }
210        else {
211            false
212        }
213    }
214
215    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
216        let start_pos = state.get_position();
217
218        if let Some('"') = state.peek() {
219            state.bump();
220
221            while let Some(ch) = state.peek() {
222                if ch == '"' {
223                    state.bump();
224                    state.add_token(HaskellSyntaxKind::StringLiteral, start_pos, state.get_position());
225                    return true;
226                }
227                else if ch == '\\' {
228                    state.bump();
229                    if let Some(_) = state.peek() {
230                        state.bump();
231                    }
232                }
233                else {
234                    state.bump();
235                }
236            }
237
238            state.add_token(HaskellSyntaxKind::StringLiteral, start_pos, state.get_position());
239            true
240        }
241        else {
242            false
243        }
244    }
245
246    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
247        let start_pos = state.get_position();
248
249        if let Some('\'') = state.peek() {
250            state.bump();
251
252            if let Some(ch) = state.peek() {
253                if ch == '\\' {
254                    state.bump();
255                    if let Some(_) = state.peek() {
256                        state.bump();
257                    }
258                }
259                else if ch != '\'' {
260                    state.bump();
261                }
262            }
263
264            if let Some('\'') = state.peek() {
265                state.bump();
266                state.add_token(HaskellSyntaxKind::CharLiteral, start_pos, state.get_position());
267                true
268            }
269            else {
270                state.add_token(HaskellSyntaxKind::CharLiteral, start_pos, state.get_position());
271                true
272            }
273        }
274        else {
275            false
276        }
277    }
278
279    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
280        let start_pos = state.get_position();
281
282        if let Some(ch) = state.peek() {
283            let token_kind = match ch {
284                '+' => {
285                    state.bump();
286                    if let Some('+') = state.peek() {
287                        state.bump();
288                        HaskellSyntaxKind::Append
289                    }
290                    else {
291                        HaskellSyntaxKind::Plus
292                    }
293                }
294                '-' => {
295                    state.bump();
296                    if let Some('>') = state.peek() {
297                        state.bump();
298                        HaskellSyntaxKind::Arrow
299                    }
300                    else {
301                        HaskellSyntaxKind::Minus
302                    }
303                }
304                '*' => {
305                    state.bump();
306                    HaskellSyntaxKind::Star
307                }
308                '/' => {
309                    state.bump();
310                    HaskellSyntaxKind::Slash
311                }
312                '=' => {
313                    state.bump();
314                    if let Some('=') = state.peek() {
315                        state.bump();
316                        HaskellSyntaxKind::Equal
317                    }
318                    else {
319                        HaskellSyntaxKind::Assign
320                    }
321                }
322                '<' => {
323                    state.bump();
324                    if let Some('=') = state.peek() {
325                        state.bump();
326                        HaskellSyntaxKind::LessEqual
327                    }
328                    else if let Some('-') = state.peek() {
329                        state.bump();
330                        HaskellSyntaxKind::LeftArrow
331                    }
332                    else {
333                        HaskellSyntaxKind::Less
334                    }
335                }
336                '>' => {
337                    state.bump();
338                    if let Some('=') = state.peek() {
339                        state.bump();
340                        HaskellSyntaxKind::GreaterEqual
341                    }
342                    else {
343                        HaskellSyntaxKind::Greater
344                    }
345                }
346                ':' => {
347                    state.bump();
348                    if let Some(':') = state.peek() {
349                        state.bump();
350                        HaskellSyntaxKind::DoubleColon
351                    }
352                    else {
353                        HaskellSyntaxKind::Colon
354                    }
355                }
356                '|' => {
357                    state.bump();
358                    HaskellSyntaxKind::Pipe
359                }
360                '&' => {
361                    state.bump();
362                    HaskellSyntaxKind::Ampersand
363                }
364                '!' => {
365                    state.bump();
366                    HaskellSyntaxKind::Bang
367                }
368                '?' => {
369                    state.bump();
370                    HaskellSyntaxKind::Question
371                }
372                ';' => {
373                    state.bump();
374                    HaskellSyntaxKind::Semicolon
375                }
376                ',' => {
377                    state.bump();
378                    HaskellSyntaxKind::Comma
379                }
380                '.' => {
381                    state.bump();
382                    if let Some('.') = state.peek() {
383                        state.bump();
384                        HaskellSyntaxKind::DoubleDot
385                    }
386                    else {
387                        HaskellSyntaxKind::Dot
388                    }
389                }
390                '$' => {
391                    state.bump();
392                    HaskellSyntaxKind::Dollar
393                }
394                '@' => {
395                    state.bump();
396                    HaskellSyntaxKind::At
397                }
398                '~' => {
399                    state.bump();
400                    HaskellSyntaxKind::Tilde
401                }
402                '\\' => {
403                    state.bump();
404                    HaskellSyntaxKind::Backslash
405                }
406                '`' => {
407                    state.bump();
408                    HaskellSyntaxKind::Backtick
409                }
410                _ => return false,
411            };
412
413            state.add_token(token_kind, start_pos, state.get_position());
414            true
415        }
416        else {
417            false
418        }
419    }
420
421    fn lex_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
422        let start_pos = state.get_position();
423
424        if let Some(ch) = state.peek() {
425            let token_kind = match ch {
426                '(' => {
427                    state.bump();
428                    HaskellSyntaxKind::LeftParen
429                }
430                ')' => {
431                    state.bump();
432                    HaskellSyntaxKind::RightParen
433                }
434                '[' => {
435                    state.bump();
436                    HaskellSyntaxKind::LeftBracket
437                }
438                ']' => {
439                    state.bump();
440                    HaskellSyntaxKind::RightBracket
441                }
442                '{' => {
443                    state.bump();
444                    HaskellSyntaxKind::LeftBrace
445                }
446                '}' => {
447                    state.bump();
448                    HaskellSyntaxKind::RightBrace
449                }
450                _ => return false,
451            };
452
453            state.add_token(token_kind, start_pos, state.get_position());
454            true
455        }
456        else {
457            false
458        }
459    }
460}
461
462impl<'config> Lexer<HaskellLanguage> for HaskellLexer<'config> {
463    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HaskellLanguage>) -> LexOutput<HaskellLanguage> {
464        let mut state = State::new(source);
465
466        while state.not_at_end() {
467            let safe_point = state.get_position();
468            if self.skip_whitespace(&mut state) {
469                continue;
470            }
471
472            if self.lex_newline(&mut state) {
473                continue;
474            }
475
476            if self.lex_single_line_comment(&mut state) {
477                continue;
478            }
479
480            if self.lex_multi_line_comment(&mut state) {
481                continue;
482            }
483
484            if self.lex_identifier_or_keyword(&mut state) {
485                continue;
486            }
487
488            if self.lex_number(&mut state) {
489                continue;
490            }
491
492            if self.lex_string(&mut state) {
493                continue;
494            }
495
496            if self.lex_char(&mut state) {
497                continue;
498            }
499
500            if self.lex_operators(&mut state) {
501                continue;
502            }
503
504            if self.lex_delimiters(&mut state) {
505                continue;
506            }
507
508            // 如果没有匹配到任何模式,跳过当前字符并标记为错误
509            let start_pos = state.get_position();
510            if state.peek().is_some() {
511                state.advance(1);
512                state.add_token(HaskellSyntaxKind::Error, start_pos, state.get_position());
513            }
514
515            state.advance_if_dead_lock(safe_point);
516        }
517
518        // 添加 EOF token
519        let pos = state.get_position();
520        state.add_token(HaskellSyntaxKind::Eof, pos, pos);
521
522        state.finish_with_cache(Ok(()), cache)
523    }
524}