// oak_haskell/lexer/mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::HaskellLanguage, lexer::token_type::HaskellTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, TextEdit, lexer::LexOutput, source::Source};
6
/// Shorthand for the generic lexer state specialised to Haskell.
pub(crate) type State<'a, S> = LexerState<'a, S, HaskellLanguage>;
8
/// Lexer for the Haskell language.
///
/// Borrows its [`HaskellLanguage`] configuration for the lifetime `'config`;
/// cloning the lexer only copies the borrow.
#[derive(Clone)]
pub struct HaskellLexer<'config> {
    /// Language configuration (currently unread by the lexing methods below).
    config: &'config HaskellLanguage,
}
15
16impl<'config> HaskellLexer<'config> {
17    /// Creates a new Haskell lexer with the given configuration.
18    pub fn new(config: &'config HaskellLanguage) -> Self {
19        Self { config }
20    }
21
22    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
23        let start_pos = state.get_position();
24        while let Some(ch) = state.peek() {
25            if ch == ' ' || ch == '\t' {
26                state.bump();
27            }
28            else {
29                break;
30            }
31        }
32
33        if state.get_position() > start_pos {
34            state.add_token(HaskellTokenType::Whitespace, start_pos, state.get_position());
35            true
36        }
37        else {
38            false
39        }
40    }
41
42    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
43        let start_pos = state.get_position();
44
45        if let Some('\n') = state.peek() {
46            state.bump();
47            state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
48            true
49        }
50        else if let Some('\r') = state.peek() {
51            state.bump();
52            if let Some('\n') = state.peek() {
53                state.bump();
54            }
55            state.add_token(HaskellTokenType::Newline, start_pos, state.get_position());
56            true
57        }
58        else {
59            false
60        }
61    }
62
63    fn lex_single_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
64        let start_pos = state.get_position();
65
66        if let Some('-') = state.peek() {
67            if let Some('-') = state.peek_next_n(1) {
68                state.advance(2);
69                while let Some(ch) = state.peek() {
70                    if ch == '\n' || ch == '\r' {
71                        break;
72                    }
73                    state.bump();
74                }
75                state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
76                true
77            }
78            else {
79                false
80            }
81        }
82        else {
83            false
84        }
85    }
86
87    fn lex_multi_line_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
88        let start_pos = state.get_position();
89
90        if let Some('{') = state.peek() {
91            if let Some('-') = state.peek_next_n(1) {
92                state.advance(2);
93                let mut depth = 1;
94                while let Some(ch) = state.peek() {
95                    if ch == '{' && state.peek_next_n(1) == Some('-') {
96                        depth += 1;
97                        state.advance(2)
98                    }
99                    else if ch == '-' && state.peek_next_n(1) == Some('}') {
100                        depth -= 1;
101                        state.advance(2);
102                        if depth == 0 {
103                            break;
104                        }
105                    }
106                    else {
107                        state.bump();
108                    }
109                }
110                state.add_token(HaskellTokenType::Comment, start_pos, state.get_position());
111                true
112            }
113            else {
114                false
115            }
116        }
117        else {
118            false
119        }
120    }
121
122    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
123        let start_pos = state.get_position();
124
125        if let Some(ch) = state.peek() {
126            if ch.is_ascii_alphabetic() || ch == '_' {
127                state.bump();
128
129                while let Some(ch) = state.peek() {
130                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
131                        state.bump();
132                    }
133                    else {
134                        break;
135                    }
136                }
137
138                let end_pos = state.get_position();
139                let text = state.get_text_in((start_pos..end_pos).into());
140                let kind = self.keyword_or_identifier(text.as_ref());
141
142                state.add_token(kind, start_pos, end_pos);
143                true
144            }
145            else {
146                false
147            }
148        }
149        else {
150            false
151        }
152    }
153
    /// Maps the reserved Haskell words this lexer recognises to their keyword
    /// token kinds; any other identifier text yields `Identifier`.
    ///
    /// NOTE(review): the Haskell 2010 report also reserves `foreign` (and
    /// `mdo`/`rec` under extensions) — confirm whether `token_type` should
    /// grow variants for them.
    fn keyword_or_identifier(&self, text: &str) -> HaskellTokenType {
        match text {
            "case" => HaskellTokenType::Case,
            "class" => HaskellTokenType::Class,
            "data" => HaskellTokenType::Data,
            "default" => HaskellTokenType::Default,
            "deriving" => HaskellTokenType::Deriving,
            "do" => HaskellTokenType::Do,
            "else" => HaskellTokenType::Else,
            "if" => HaskellTokenType::If,
            "import" => HaskellTokenType::Import,
            "in" => HaskellTokenType::In,
            "infix" => HaskellTokenType::Infix,
            "infixl" => HaskellTokenType::Infixl,
            "infixr" => HaskellTokenType::Infixr,
            "instance" => HaskellTokenType::Instance,
            "let" => HaskellTokenType::Let,
            "module" => HaskellTokenType::Module,
            "newtype" => HaskellTokenType::Newtype,
            "of" => HaskellTokenType::Of,
            "then" => HaskellTokenType::Then,
            "type" => HaskellTokenType::Type,
            "where" => HaskellTokenType::Where,
            _ => HaskellTokenType::Identifier,
        }
    }
180
181    fn lex_number<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
182        let start_pos = state.get_position();
183
184        if let Some(ch) = state.peek() {
185            if ch.is_ascii_digit() {
186                state.bump();
187
188                while let Some(ch) = state.peek() {
189                    if ch.is_ascii_digit() {
190                        state.bump();
191                    }
192                    else if ch == '.' {
193                        state.bump();
194                        while let Some(ch) = state.peek() {
195                            if ch.is_ascii_digit() {
196                                state.bump();
197                            }
198                            else {
199                                break;
200                            }
201                        }
202                        break;
203                    }
204                    else {
205                        break;
206                    }
207                }
208
209                state.add_token(HaskellTokenType::Number, start_pos, state.get_position());
210                true
211            }
212            else {
213                false
214            }
215        }
216        else {
217            false
218        }
219    }
220
221    fn lex_string<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
222        let start_pos = state.get_position();
223
224        if let Some('"') = state.peek() {
225            state.bump();
226
227            while let Some(ch) = state.peek() {
228                if ch == '"' {
229                    state.bump();
230                    state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
231                    return true;
232                }
233                else if ch == '\\' {
234                    state.bump();
235                    if let Some(_) = state.peek() {
236                        state.bump();
237                    }
238                }
239                else {
240                    state.bump();
241                }
242            }
243
244            state.add_token(HaskellTokenType::StringLiteral, start_pos, state.get_position());
245            true
246        }
247        else {
248            false
249        }
250    }
251
252    fn lex_char<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
253        let start_pos = state.get_position();
254
255        if let Some('\'') = state.peek() {
256            state.bump();
257
258            if let Some(ch) = state.peek() {
259                if ch == '\\' {
260                    state.bump();
261                    if let Some(_) = state.peek() {
262                        state.bump();
263                    }
264                }
265                else if ch != '\'' {
266                    state.bump();
267                }
268            }
269
270            if let Some('\'') = state.peek() {
271                state.bump();
272                state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
273                true
274            }
275            else {
276                state.add_token(HaskellTokenType::CharLiteral, start_pos, state.get_position());
277                true
278            }
279        }
280        else {
281            false
282        }
283    }
284
    /// Lexes operator and punctuation tokens, preferring two-character
    /// operators (`++`, `->`, `==`, `<=`, `<-`, `>=`, `::`, `..`) over their
    /// one-character prefixes. Returns `false` (consuming nothing) when the
    /// current character starts no known operator.
    ///
    /// NOTE(review): `=>` (class contexts) and `>>`/`>>=`/`>=>` are not
    /// recognised here and will lex as separate tokens (e.g. `Assign` +
    /// `Greater`) — confirm whether the parser reassembles them or whether
    /// `token_type` needs dedicated variants. `--` never reaches this method
    /// because line comments are tried first in the dispatch loop.
    fn lex_operators<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
        let start_pos = state.get_position();

        if let Some(ch) = state.peek() {
            let token_kind = match ch {
                '+' => {
                    state.bump();
                    if let Some('+') = state.peek() {
                        state.bump();
                        HaskellTokenType::Append
                    }
                    else {
                        HaskellTokenType::Plus
                    }
                }
                '-' => {
                    state.bump();
                    if let Some('>') = state.peek() {
                        state.bump();
                        HaskellTokenType::Arrow
                    }
                    else {
                        HaskellTokenType::Minus
                    }
                }
                '*' => {
                    state.bump();
                    HaskellTokenType::Star
                }
                '/' => {
                    state.bump();
                    HaskellTokenType::Slash
                }
                '=' => {
                    state.bump();
                    if let Some('=') = state.peek() {
                        state.bump();
                        HaskellTokenType::Equal
                    }
                    else {
                        HaskellTokenType::Assign
                    }
                }
                '<' => {
                    state.bump();
                    if let Some('=') = state.peek() {
                        state.bump();
                        HaskellTokenType::LessEqual
                    }
                    else if let Some('-') = state.peek() {
                        state.bump();
                        HaskellTokenType::LeftArrow
                    }
                    else {
                        HaskellTokenType::Less
                    }
                }
                '>' => {
                    state.bump();
                    if let Some('=') = state.peek() {
                        state.bump();
                        HaskellTokenType::GreaterEqual
                    }
                    else {
                        HaskellTokenType::Greater
                    }
                }
                ':' => {
                    state.bump();
                    if let Some(':') = state.peek() {
                        state.bump();
                        HaskellTokenType::DoubleColon
                    }
                    else {
                        HaskellTokenType::Colon
                    }
                }
                '|' => {
                    state.bump();
                    HaskellTokenType::Pipe
                }
                '&' => {
                    state.bump();
                    HaskellTokenType::Ampersand
                }
                '!' => {
                    state.bump();
                    HaskellTokenType::Bang
                }
                '?' => {
                    state.bump();
                    HaskellTokenType::Question
                }
                ';' => {
                    state.bump();
                    HaskellTokenType::Semicolon
                }
                ',' => {
                    state.bump();
                    HaskellTokenType::Comma
                }
                '.' => {
                    state.bump();
                    if let Some('.') = state.peek() {
                        state.bump();
                        HaskellTokenType::DoubleDot
                    }
                    else {
                        HaskellTokenType::Dot
                    }
                }
                '$' => {
                    state.bump();
                    HaskellTokenType::Dollar
                }
                '@' => {
                    state.bump();
                    HaskellTokenType::At
                }
                '~' => {
                    state.bump();
                    HaskellTokenType::Tilde
                }
                '\\' => {
                    state.bump();
                    HaskellTokenType::Backslash
                }
                '`' => {
                    state.bump();
                    HaskellTokenType::Backtick
                }
                _ => return false,
            };

            state.add_token(token_kind, start_pos, state.get_position());
            true
        }
        else {
            false
        }
    }
426
427    fn lex_delimiters<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
428        let start_pos = state.get_position();
429
430        if let Some(ch) = state.peek() {
431            let token_kind = match ch {
432                '(' => {
433                    state.bump();
434                    HaskellTokenType::LeftParen
435                }
436                ')' => {
437                    state.bump();
438                    HaskellTokenType::RightParen
439                }
440                '[' => {
441                    state.bump();
442                    HaskellTokenType::LeftBracket
443                }
444                ']' => {
445                    state.bump();
446                    HaskellTokenType::RightBracket
447                }
448                '{' => {
449                    state.bump();
450                    HaskellTokenType::LeftBrace
451                }
452                '}' => {
453                    state.bump();
454                    HaskellTokenType::RightBrace
455                }
456                _ => return false,
457            };
458
459            state.add_token(token_kind, start_pos, state.get_position());
460            true
461        }
462        else {
463            false
464        }
465    }
466}
467
468impl<'config> Lexer<HaskellLanguage> for HaskellLexer<'config> {
469    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[TextEdit], cache: &'a mut impl LexerCache<HaskellLanguage>) -> LexOutput<HaskellLanguage> {
470        let mut state = State::new(source);
471
472        while state.not_at_end() {
473            let safe_point = state.get_position();
474            if self.skip_whitespace(&mut state) {
475                continue;
476            }
477
478            if self.lex_newline(&mut state) {
479                continue;
480            }
481
482            if self.lex_single_line_comment(&mut state) {
483                continue;
484            }
485
486            if self.lex_multi_line_comment(&mut state) {
487                continue;
488            }
489
490            if self.lex_identifier_or_keyword(&mut state) {
491                continue;
492            }
493
494            if self.lex_number(&mut state) {
495                continue;
496            }
497
498            if self.lex_string(&mut state) {
499                continue;
500            }
501
502            if self.lex_char(&mut state) {
503                continue;
504            }
505
506            if self.lex_operators(&mut state) {
507                continue;
508            }
509
510            if self.lex_delimiters(&mut state) {
511                continue;
512            }
513
514            // If no pattern matches, skip the current character and mark as error
515            let start_pos = state.get_position();
516            if state.peek().is_some() {
517                state.advance(1);
518                state.add_token(HaskellTokenType::Error, start_pos, state.get_position())
519            }
520
521            state.advance_if_dead_lock(safe_point)
522        }
523
524        // Add EOF token
525        let pos = state.get_position();
526        state.add_token(HaskellTokenType::Eof, pos, pos);
527
528        state.finish_with_cache(Ok(()), cache)
529    }
530}