Skip to main content

oak_purescript/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PurescriptLanguage, lexer::token_type::PurescriptTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
7pub(crate) type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
8
9#[derive(Clone)]
10/// Lexer for PureScript source code.
11pub struct PurescriptLexer<'config> {
12    config: &'config PurescriptLanguage,
13}
14
15impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
16    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
17        let mut state = State::new_with_cache(source, 0, cache);
18        let result = self.run(&mut state);
19        if result.is_ok() {
20            state.add_eof();
21        }
22        state.finish_with_cache(result, cache)
23    }
24}
25
26impl<'config> PurescriptLexer<'config> {
27    /// Creates a new PurescriptLexer
28    /// Creates a new PurescriptLexer with the given language configuration.
29    pub fn new(config: &'config PurescriptLanguage) -> Self {
30        Self { config }
31    }
32
33    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
34        while state.not_at_end() {
35            let safe_point = state.get_position();
36            if self.skip_whitespace(state) {
37                continue;
38            }
39
40            if self.lex_newline(state) {
41                continue;
42            }
43
44            if self.lex_comment(state) {
45                continue;
46            }
47
48            if self.lex_identifier_or_keyword(state) {
49                continue;
50            }
51
52            if self.lex_number_literal(state) {
53                continue;
54            }
55
56            if self.lex_string_literal(state) {
57                continue;
58            }
59
60            if self.lex_char_literal(state) {
61                continue;
62            }
63
64            if self.lex_operator(state) {
65                continue;
66            }
67
68            if self.lex_delimiter(state) {
69                continue;
70            }
71
72            // If no rules match, skip current character and mark as error
73            let start_pos = state.get_position();
74            if let Some(ch) = state.peek() {
75                state.advance(ch.len_utf8());
76                state.add_token(PurescriptTokenType::Error, start_pos, state.get_position())
77            }
78
79            state.advance_if_dead_lock(safe_point)
80        }
81
82        Ok(())
83    }
84
85    /// Skips whitespace
86    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
87        let start_pos = state.get_position();
88
89        while let Some(ch) = state.peek() {
90            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
91        }
92
93        if state.get_position() > start_pos {
94            state.add_token(PurescriptTokenType::Whitespace, start_pos, state.get_position());
95            true
96        }
97        else {
98            false
99        }
100    }
101
102    /// Handles newlines
103    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
104        let start_pos = state.get_position();
105
106        if let Some('\n') = state.peek() {
107            state.advance(1);
108            state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
109            true
110        }
111        else if let Some('\r') = state.peek() {
112            state.advance(1);
113            if let Some('\n') = state.peek() {
114                state.advance(1)
115            }
116            state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
117            true
118        }
119        else {
120            false
121        }
122    }
123
124    /// Handles comments
125    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
126        let start_pos = state.get_position();
127
128        if let Some('-') = state.peek() {
129            state.advance(1);
130            if let Some('-') = state.peek() {
131                // Single-line comment
132                state.advance(1);
133                while let Some(ch) = state.peek() {
134                    if ch == '\n' || ch == '\r' {
135                        break;
136                    }
137                    state.advance(ch.len_utf8())
138                }
139                state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
140                true
141            }
142            else {
143                state.set_position(start_pos);
144                false
145            }
146        }
147        else if let Some('{') = state.peek() {
148            state.advance(1);
149            if let Some('-') = state.peek() {
150                // Multi-line comment
151                state.advance(1);
152                let mut depth = 1;
153                while let Some(ch) = state.peek() {
154                    if ch == '{' {
155                        state.advance(1);
156                        if let Some('-') = state.peek() {
157                            depth += 1;
158                            state.advance(1)
159                        }
160                    }
161                    else if ch == '-' {
162                        state.advance(1);
163                        if let Some('}') = state.peek() {
164                            depth -= 1;
165                            state.advance(1);
166                            if depth == 0 {
167                                break;
168                            }
169                        }
170                    }
171                    else {
172                        state.advance(ch.len_utf8())
173                    }
174                }
175                state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
176                true
177            }
178            else {
179                state.set_position(start_pos);
180                false
181            }
182        }
183        else {
184            false
185        }
186    }
187
188    /// Handles identifiers or keywords
189    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
190        let start_pos = state.get_position();
191
192        if let Some(ch) = state.peek() {
193            if ch.is_ascii_alphabetic() || ch == '_' {
194                state.advance(ch.len_utf8());
195
196                while let Some(ch) = state.peek() {
197                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
198                        state.advance(ch.len_utf8());
199                    }
200                    else {
201                        break;
202                    }
203                }
204
205                // Check if it's a keyword
206                let text = state.get_text_in((start_pos..state.get_position()).into());
207
208                let token_kind = match text.as_ref() {
209                    "ado" => PurescriptTokenType::Ado,
210                    "case" => PurescriptTokenType::Case,
211                    "class" => PurescriptTokenType::Class,
212                    "data" => PurescriptTokenType::Data,
213                    "derive" => PurescriptTokenType::Derive,
214                    "do" => PurescriptTokenType::Do,
215                    "else" => PurescriptTokenType::Else,
216                    "false" => PurescriptTokenType::False,
217                    "forall" => PurescriptTokenType::Forall,
218                    "foreign" => PurescriptTokenType::Foreign,
219                    "if" => PurescriptTokenType::If,
220                    "import" => PurescriptTokenType::Import,
221                    "in" => PurescriptTokenType::In,
222                    "infix" => PurescriptTokenType::Infix,
223                    "infixl" => PurescriptTokenType::Infixl,
224                    "infixr" => PurescriptTokenType::Infixr,
225                    "instance" => PurescriptTokenType::Instance,
226                    "let" => PurescriptTokenType::Let,
227                    "module" => PurescriptTokenType::Module,
228                    "newtype" => PurescriptTokenType::Newtype,
229                    "of" => PurescriptTokenType::Of,
230                    "then" => PurescriptTokenType::Then,
231                    "true" => PurescriptTokenType::True,
232                    "type" => PurescriptTokenType::Type,
233                    "where" => PurescriptTokenType::Where,
234                    _ => PurescriptTokenType::Identifier,
235                };
236                state.add_token(token_kind, start_pos, state.get_position());
237                true
238            }
239            else {
240                false
241            }
242        }
243        else {
244            false
245        }
246    }
247
248    /// Handles number literals
249    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
250        let start_pos = state.get_position();
251
252        if let Some(ch) = state.peek() {
253            if ch.is_ascii_digit() {
254                state.advance(1);
255
256                // Handles hexadecimal numbers
257                if ch == '0' {
258                    if let Some('x') | Some('X') = state.peek() {
259                        state.advance(1);
260                        while let Some(ch) = state.peek() {
261                            if ch.is_ascii_hexdigit() {
262                                state.advance(1);
263                            }
264                            else {
265                                break;
266                            }
267                        }
268                    }
269                    else {
270                        // Handles regular numbers
271                        while let Some(ch) = state.peek() {
272                            if ch.is_ascii_digit() { state.advance(1) } else { break }
273                        }
274                    }
275                }
276                else {
277                    // Handles decimal numbers
278                    while let Some(ch) = state.peek() {
279                        if ch.is_ascii_digit() { state.advance(1) } else { break }
280                    }
281                }
282
283                // Handles decimals
284                if let Some('.') = state.peek() {
285                    state.advance(1);
286                    while let Some(ch) = state.peek() {
287                        if ch.is_ascii_digit() { state.advance(1) } else { break }
288                    }
289                }
290
291                // Handles exponents
292                if let Some('e') | Some('E') = state.peek() {
293                    state.advance(1);
294                    if let Some('+') | Some('-') = state.peek() {
295                        state.advance(1)
296                    }
297                    while let Some(ch) = state.peek() {
298                        if ch.is_ascii_digit() { state.advance(1) } else { break }
299                    }
300                }
301
302                state.add_token(PurescriptTokenType::NumberLiteral, start_pos, state.get_position());
303                true
304            }
305            else {
306                false
307            }
308        }
309        else {
310            false
311        }
312    }
313
314    /// Handles string literals
315    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
316        let start_pos = state.get_position();
317
318        if let Some('"') = state.peek() {
319            state.advance(1);
320
321            while let Some(ch) = state.peek() {
322                if ch == '"' {
323                    state.advance(1);
324                    break;
325                }
326                else if ch == '\\' {
327                    state.advance(1);
328                    if let Some(_) = state.peek() {
329                        state.advance(1)
330                    }
331                }
332                else if ch == '\n' || ch == '\r' {
333                    break; // Strings cannot span multiple lines
334                }
335                else {
336                    state.advance(ch.len_utf8())
337                }
338            }
339
340            state.add_token(PurescriptTokenType::StringLiteral, start_pos, state.get_position());
341            true
342        }
343        else {
344            false
345        }
346    }
347
348    /// Handles character literals
349    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
350        let start_pos = state.get_position();
351
352        if let Some('\'') = state.peek() {
353            state.advance(1);
354
355            if let Some(ch) = state.peek() {
356                if ch == '\\' {
357                    state.advance(1);
358                    if let Some(_) = state.peek() {
359                        state.advance(1)
360                    }
361                }
362                else if ch != '\'' {
363                    state.advance(ch.len_utf8())
364                }
365            }
366
367            if let Some('\'') = state.peek() {
368                state.advance(1);
369                state.add_token(PurescriptTokenType::CharLiteral, start_pos, state.get_position());
370                true
371            }
372            else {
373                state.set_position(start_pos);
374                false
375            }
376        }
377        else {
378            false
379        }
380    }
381
382    /// Handles operators
383    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
384        let start_pos = state.get_position();
385
386        if let Some(ch) = state.peek() {
387            let token_kind = match ch {
388                '+' => {
389                    state.advance(1);
390                    PurescriptTokenType::Plus
391                }
392                '-' => {
393                    state.advance(1);
394                    if let Some('>') = state.peek() {
395                        state.advance(1);
396                        PurescriptTokenType::Arrow
397                    }
398                    else {
399                        PurescriptTokenType::Minus
400                    }
401                }
402                '*' => {
403                    state.advance(1);
404                    if let Some('*') = state.peek() {
405                        state.advance(1);
406                        PurescriptTokenType::Caret // Use Caret instead of Power
407                    }
408                    else {
409                        PurescriptTokenType::Star
410                    }
411                }
412                '/' => {
413                    state.advance(1);
414                    if let Some('=') = state.peek() {
415                        state.advance(1);
416                        PurescriptTokenType::NotEqual
417                    }
418                    else {
419                        PurescriptTokenType::Slash
420                    }
421                }
422                '%' => {
423                    state.advance(1);
424                    PurescriptTokenType::Percent
425                }
426                '=' => {
427                    state.advance(1);
428                    match state.peek() {
429                        Some('=') => {
430                            state.advance(1);
431                            PurescriptTokenType::Equal
432                        }
433                        Some('>') => {
434                            state.advance(1);
435                            PurescriptTokenType::FatArrow
436                        }
437                        _ => PurescriptTokenType::Equal,
438                    }
439                }
440                '<' => {
441                    state.advance(1);
442                    match state.peek() {
443                        Some('=') => {
444                            state.advance(1);
445                            PurescriptTokenType::LessEqual
446                        }
447                        Some('-') => {
448                            state.advance(1);
449                            PurescriptTokenType::Bind
450                        }
451                        _ => PurescriptTokenType::Less,
452                    }
453                }
454                '>' => {
455                    state.advance(1);
456                    if let Some('=') = state.peek() {
457                        state.advance(1);
458                        PurescriptTokenType::GreaterEqual
459                    }
460                    else {
461                        PurescriptTokenType::Greater
462                    }
463                }
464                '&' => {
465                    state.advance(1);
466                    if let Some('&') = state.peek() {
467                        state.advance(1);
468                        PurescriptTokenType::And
469                    }
470                    else {
471                        return false;
472                    }
473                }
474                '|' => {
475                    state.advance(1);
476                    if let Some('|') = state.peek() {
477                        state.advance(1);
478                        PurescriptTokenType::Or
479                    }
480                    else {
481                        PurescriptTokenType::Pipe
482                    }
483                }
484                '\\' => {
485                    state.advance(1);
486                    PurescriptTokenType::Backslash
487                }
488                _ => return false,
489            };
490
491            state.add_token(token_kind, start_pos, state.get_position());
492            true
493        }
494        else {
495            false
496        }
497    }
498
499    /// Handles delimiters
500    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
501        let start_pos = state.get_position();
502
503        if let Some(ch) = state.peek() {
504            let token_kind = match ch {
505                '(' => PurescriptTokenType::LeftParen,
506                ')' => PurescriptTokenType::RightParen,
507                '[' => PurescriptTokenType::LeftBracket,
508                ']' => PurescriptTokenType::RightBracket,
509                '{' => PurescriptTokenType::LeftBrace,
510                '}' => PurescriptTokenType::RightBrace,
511                ',' => PurescriptTokenType::Comma,
512                ';' => PurescriptTokenType::Semicolon,
513                '.' => PurescriptTokenType::Dot,
514                ':' => {
515                    state.advance(1);
516                    if let Some(':') = state.peek() {
517                        state.advance(1);
518                        state.add_token(PurescriptTokenType::ColonColon, start_pos, state.get_position());
519                        return true;
520                    }
521                    else {
522                        state.add_token(PurescriptTokenType::Colon, start_pos, state.get_position());
523                        return true;
524                    }
525                }
526                '?' => PurescriptTokenType::Question,
527                '_' => PurescriptTokenType::Underscore,
528                '@' => PurescriptTokenType::At,
529                '`' => PurescriptTokenType::Tick,
530                _ => return false,
531            };
532
533            state.advance(ch.len_utf8());
534            state.add_token(token_kind, start_pos, state.get_position());
535            true
536        }
537        else {
538            false
539        }
540    }
541}