// oak_haskell/lexer/mod.rs
1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::HaskellLanguage, lexer::token_type::HaskellTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
/// Shorthand for the shared lexer state specialised to Haskell.
type State<'a, S> = LexerState<'a, S, HaskellLanguage>;

/// Hand-written, non-incremental lexer for Haskell source.
#[derive(Clone)]
pub struct HaskellLexer<'config> {
    /// Borrowed language configuration; currently unread (hence the underscore).
    _config: &'config HaskellLanguage,
}
13
14impl<'config> HaskellLexer<'config> {
    /// Creates a lexer borrowing the given language configuration
    /// (stored but currently unused).
    pub fn new(config: &'config HaskellLanguage) -> Self {
        Self { _config: config }
    }
18
19    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
20        let start_pos = state.get_position();
21        while let Some(ch) = state.peek() {
22            if ch == ' ' || ch == '\t' {
23                state.bump();
24            }
25            else {
26                break;
27            }
28        }
29
30        if state.get_position() > start_pos {
31            state.add_token(HaskellTokenType::Whitespace, start_pos, state.get_position());
32            true
33        }
34        else {
35            false
36        }
37    }
38
39    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
40        let start_pos = state.get_position();
41
42        if let Some('\n') = state.peek() {
43            state.bump();
44            state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
45            true
46        }
47        else if let Some('\r') = state.peek() {
48            state.bump();
49            if let Some('\n') = state.peek() {
50                state.bump();
51            }
52            state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
53            true
54        }
55        else {
56            false
57        }
58    }
59
60    fn lex_single_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
61        let start_pos = state.get_position();
62
63        if let Some('-') = state.peek() {
64            if let Some('-') = state.peek_next_n(1) {
65                state.advance(2);
66                while let Some(ch) = state.peek() {
67                    if ch == '\n' || ch == '\r' {
68                        break;
69                    }
70                    state.bump();
71                }
72                state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
73                true
74            }
75            else {
76                false
77            }
78        }
79        else {
80            false
81        }
82    }
83
84    fn lex_multi_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        let start_pos = state.get_position();
86
87        if let Some('{') = state.peek() {
88            if let Some('-') = state.peek_next_n(1) {
89                state.advance(2);
90                let mut depth = 1;
91                while let Some(ch) = state.peek() {
92                    if ch == '{' && state.peek_next_n(1) == Some('-') {
93                        depth += 1;
94                        state.advance(2)
95                    }
96                    else if ch == '-' && state.peek_next_n(1) == Some('}') {
97                        depth -= 1;
98                        state.advance(2);
99                        if depth == 0 {
100                            break;
101                        }
102                    }
103                    else {
104                        state.bump();
105                    }
106                }
107                state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
108                true
109            }
110            else {
111                false
112            }
113        }
114        else {
115            false
116        }
117    }
118
119    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
120        let start_pos = state.get_position();
121
122        if let Some(ch) = state.peek() {
123            if ch.is_ascii_alphabetic() || ch == '_' {
124                state.bump();
125
126                while let Some(ch) = state.peek() {
127                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
128                        state.bump();
129                    }
130                    else {
131                        break;
132                    }
133                }
134
135                let end_pos = state.get_position();
136                let text = state.get_text_in((start_pos..end_pos).into());
137                let kind = self.keyword_or_identifier(text.as_ref());
138
139                state.add_token(kind, start_pos, end_pos);
140                true
141            }
142            else {
143                false
144            }
145        }
146        else {
147            false
148        }
149    }
150
    /// Classifies `text` as a Haskell reserved word, falling back to
    /// `Identifier` for everything else.
    ///
    /// NOTE(review): the Haskell 2010 reserved word `foreign` is not listed
    /// here and will lex as an identifier — confirm whether that is intended.
    fn keyword_or_identifier(&self, text: &str) -> HaskellTokenType {
        match text {
            "case" => HaskellTokenType::Case,
            "class" => HaskellTokenType::Class,
            "data" => HaskellTokenType::Data,
            "default" => HaskellTokenType::Default,
            "deriving" => HaskellTokenType::Deriving,
            "do" => HaskellTokenType::Do,
            "else" => HaskellTokenType::Else,
            "if" => HaskellTokenType::If,
            "import" => HaskellTokenType::Import,
            "in" => HaskellTokenType::In,
            "infix" => HaskellTokenType::Infix,
            "infixl" => HaskellTokenType::Infixl,
            "infixr" => HaskellTokenType::Infixr,
            "instance" => HaskellTokenType::Instance,
            "let" => HaskellTokenType::Let,
            "module" => HaskellTokenType::Module,
            "newtype" => HaskellTokenType::Newtype,
            "of" => HaskellTokenType::Of,
            "then" => HaskellTokenType::Then,
            "type" => HaskellTokenType::Type,
            "where" => HaskellTokenType::Where,
            _ => HaskellTokenType::Identifier,
        }
    }
177
178    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
179        let start_pos = state.get_position();
180
181        if let Some(ch) = state.peek() {
182            if ch.is_ascii_digit() {
183                state.bump();
184
185                while let Some(ch) = state.peek() {
186                    if ch.is_ascii_digit() {
187                        state.bump();
188                    }
189                    else if ch == '.' {
190                        state.bump();
191                        while let Some(ch) = state.peek() {
192                            if ch.is_ascii_digit() {
193                                state.bump();
194                            }
195                            else {
196                                break;
197                            }
198                        }
199                        break;
200                    }
201                    else {
202                        break;
203                    }
204                }
205
206                state.add_token(HaskellTokenType::Number, start_pos, state.get_position());
207                true
208            }
209            else {
210                false
211            }
212        }
213        else {
214            false
215        }
216    }
217
218    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
219        let start_pos = state.get_position();
220
221        if let Some('"') = state.peek() {
222            state.bump();
223
224            while let Some(ch) = state.peek() {
225                if ch == '"' {
226                    state.bump();
227                    state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
228                    return true;
229                }
230                else if ch == '\\' {
231                    state.bump();
232                    if let Some(_) = state.peek() {
233                        state.bump();
234                    }
235                }
236                else {
237                    state.bump();
238                }
239            }
240
241            state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
242            true
243        }
244        else {
245            false
246        }
247    }
248
249    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
250        let start_pos = state.get_position();
251
252        if let Some('\'') = state.peek() {
253            state.bump();
254
255            if let Some(ch) = state.peek() {
256                if ch == '\\' {
257                    state.bump();
258                    if let Some(_) = state.peek() {
259                        state.bump();
260                    }
261                }
262                else if ch != '\'' {
263                    state.bump();
264                }
265            }
266
267            if let Some('\'') = state.peek() {
268                state.bump();
269                state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
270                true
271            }
272            else {
273                state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
274                true
275            }
276        }
277        else {
278            false
279        }
280    }
281
282    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
283        let start_pos = state.get_position();
284
285        if let Some(ch) = state.peek() {
286            let token_kind = match ch {
287                '+' => {
288                    state.bump();
289                    if let Some('+') = state.peek() {
290                        state.bump();
291                        HaskellTokenType::Append
292                    }
293                    else {
294                        HaskellTokenType::Plus
295                    }
296                }
297                '-' => {
298                    state.bump();
299                    if let Some('>') = state.peek() {
300                        state.bump();
301                        HaskellTokenType::Arrow
302                    }
303                    else {
304                        HaskellTokenType::Minus
305                    }
306                }
307                '*' => {
308                    state.bump();
309                    HaskellTokenType::Star
310                }
311                '/' => {
312                    state.bump();
313                    HaskellTokenType::Slash
314                }
315                '=' => {
316                    state.bump();
317                    if let Some('=') = state.peek() {
318                        state.bump();
319                        HaskellTokenType::Equal
320                    }
321                    else {
322                        HaskellTokenType::Assign
323                    }
324                }
325                '<' => {
326                    state.bump();
327                    if let Some('=') = state.peek() {
328                        state.bump();
329                        HaskellTokenType::LessEqual
330                    }
331                    else if let Some('-') = state.peek() {
332                        state.bump();
333                        HaskellTokenType::LeftArrow
334                    }
335                    else {
336                        HaskellTokenType::Less
337                    }
338                }
339                '>' => {
340                    state.bump();
341                    if let Some('=') = state.peek() {
342                        state.bump();
343                        HaskellTokenType::GreaterEqual
344                    }
345                    else {
346                        HaskellTokenType::Greater
347                    }
348                }
349                ':' => {
350                    state.bump();
351                    if let Some(':') = state.peek() {
352                        state.bump();
353                        HaskellTokenType::DoubleColon
354                    }
355                    else {
356                        HaskellTokenType::Colon
357                    }
358                }
359                '|' => {
360                    state.bump();
361                    HaskellTokenType::Pipe
362                }
363                '&' => {
364                    state.bump();
365                    HaskellTokenType::Ampersand
366                }
367                '!' => {
368                    state.bump();
369                    HaskellTokenType::Bang
370                }
371                '?' => {
372                    state.bump();
373                    HaskellTokenType::Question
374                }
375                ';' => {
376                    state.bump();
377                    HaskellTokenType::Semicolon
378                }
379                ',' => {
380                    state.bump();
381                    HaskellTokenType::Comma
382                }
383                '.' => {
384                    state.bump();
385                    if let Some('.') = state.peek() {
386                        state.bump();
387                        HaskellTokenType::DoubleDot
388                    }
389                    else {
390                        HaskellTokenType::Dot
391                    }
392                }
393                '$' => {
394                    state.bump();
395                    HaskellTokenType::Dollar
396                }
397                '@' => {
398                    state.bump();
399                    HaskellTokenType::At
400                }
401                '~' => {
402                    state.bump();
403                    HaskellTokenType::Tilde
404                }
405                '\\' => {
406                    state.bump();
407                    HaskellTokenType::Backslash
408                }
409                '`' => {
410                    state.bump();
411                    HaskellTokenType::Backtick
412                }
413                _ => return false,
414            };
415
416            state.add_token(token_kind, start_pos, state.get_position());
417            true
418        }
419        else {
420            false
421        }
422    }
423
424    fn lex_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
425        let start_pos = state.get_position();
426
427        if let Some(ch) = state.peek() {
428            let token_kind = match ch {
429                '(' => {
430                    state.bump();
431                    HaskellTokenType::LeftParen
432                }
433                ')' => {
434                    state.bump();
435                    HaskellTokenType::RightParen
436                }
437                '[' => {
438                    state.bump();
439                    HaskellTokenType::LeftBracket
440                }
441                ']' => {
442                    state.bump();
443                    HaskellTokenType::RightBracket
444                }
445                '{' => {
446                    state.bump();
447                    HaskellTokenType::LeftBrace
448                }
449                '}' => {
450                    state.bump();
451                    HaskellTokenType::RightBrace
452                }
453                _ => return false,
454            };
455
456            state.add_token(token_kind, start_pos, state.get_position());
457            true
458        }
459        else {
460            false
461        }
462    }
463}
464
465impl<'config> Lexer<HaskellLanguage> for HaskellLexer<'config> {
466    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HaskellLanguage>) -> LexOutput<HaskellLanguage> {
467        let mut state = State::new(source);
468
469        while state.not_at_end() {
470            let safe_point = state.get_position();
471            if self.skip_whitespace(&mut state) {
472                continue;
473            }
474
475            if self.lex_newline(&mut state) {
476                continue;
477            }
478
479            if self.lex_single_line_comment(&mut state) {
480                continue;
481            }
482
483            if self.lex_multi_line_comment(&mut state) {
484                continue;
485            }
486
487            if self.lex_identifier_or_keyword(&mut state) {
488                continue;
489            }
490
491            if self.lex_number(&mut state) {
492                continue;
493            }
494
495            if self.lex_string(&mut state) {
496                continue;
497            }
498
499            if self.lex_char(&mut state) {
500                continue;
501            }
502
503            if self.lex_operators(&mut state) {
504                continue;
505            }
506
507            if self.lex_delimiters(&mut state) {
508                continue;
509            }
510
511            // 如果没有匹配到任何模式,跳过当前字符并标记为错误
512            let start_pos = state.get_position();
513            if state.peek().is_some() {
514                state.advance(1);
515                state.add_token(HaskellTokenType::Error, start_pos, state.get_position())
516            }
517
518            state.advance_if_dead_lock(safe_point)
519        }
520
521        // 添加 EOF token
522        let pos = state.get_position();
523        state.add_token(HaskellTokenType::Eof, pos, pos);
524
525        state.finish_with_cache(Ok(()), cache)
526    }
527}