oak_purescript/lexer/
mod.rs

1use crate::{kind::PurescriptSyntaxKind, language::PurescriptLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
/// Shorthand for the shared lexer state specialised to the PureScript language,
/// generic over the source type `S`.
4type State<S> = LexerState<S, PurescriptLanguage>;
5
/// Lexer that splits PureScript source text into `PurescriptSyntaxKind` tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
6#[derive(Clone)]
7pub struct PurescriptLexer<'config> {
    // Borrowed language configuration. It is stored by `new`; none of the
    // lexing methods visible in this file read it.
8    config: &'config PurescriptLanguage,
9}
10
11impl<'config> PurescriptLexer<'config> {
12    pub fn new(config: &'config PurescriptLanguage) -> Self {
13        Self { config }
14    }
15
16    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            if self.skip_whitespace(state) {
19                continue;
20            }
21
22            if self.lex_newline(state) {
23                continue;
24            }
25
26            if self.lex_comment(state) {
27                continue;
28            }
29
30            if self.lex_identifier_or_keyword(state) {
31                continue;
32            }
33
34            if self.lex_number_literal(state) {
35                continue;
36            }
37
38            if self.lex_string_literal(state) {
39                continue;
40            }
41
42            if self.lex_char_literal(state) {
43                continue;
44            }
45
46            if self.lex_operator(state) {
47                continue;
48            }
49
50            if self.lex_delimiter(state) {
51                continue;
52            }
53
54            // 如果所有规则都不匹配,跳过当前字符并标记为错误
55            let start_pos = state.get_position();
56            if let Some(ch) = state.peek() {
57                state.advance(ch.len_utf8());
58                state.add_token(PurescriptSyntaxKind::Error, start_pos, state.get_position());
59            }
60        }
61
62        // 添加 EOF token
63        let eof_pos = state.get_position();
64        state.add_token(PurescriptSyntaxKind::Eof, eof_pos, eof_pos);
65
66        Ok(())
67    }
68
69    /// 跳过空白字符
70    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
71        let start_pos = state.get_position();
72
73        while let Some(ch) = state.peek() {
74            if ch == ' ' || ch == '\t' {
75                state.advance(ch.len_utf8());
76            }
77            else {
78                break;
79            }
80        }
81
82        if state.get_position() > start_pos {
83            state.add_token(PurescriptSyntaxKind::Whitespace, start_pos, state.get_position());
84            true
85        }
86        else {
87            false
88        }
89    }
90
91    /// 处理换行
92    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
93        let start_pos = state.get_position();
94
95        if let Some('\n') = state.peek() {
96            state.advance(1);
97            state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
98            true
99        }
100        else if let Some('\r') = state.peek() {
101            state.advance(1);
102            if let Some('\n') = state.peek() {
103                state.advance(1);
104            }
105            state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
106            true
107        }
108        else {
109            false
110        }
111    }
112
113    /// 处理注释
114    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
115        let start_pos = state.get_position();
116
117        if let Some('-') = state.peek() {
118            state.advance(1);
119            if let Some('-') = state.peek() {
120                // 单行注释
121                state.advance(1);
122                while let Some(ch) = state.peek() {
123                    if ch == '\n' || ch == '\r' {
124                        break;
125                    }
126                    state.advance(ch.len_utf8());
127                }
128                state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
129                true
130            }
131            else {
132                state.set_position(start_pos);
133                false
134            }
135        }
136        else if let Some('{') = state.peek() {
137            state.advance(1);
138            if let Some('-') = state.peek() {
139                // 多行注释
140                state.advance(1);
141                let mut depth = 1;
142                while let Some(ch) = state.peek() {
143                    if ch == '{' {
144                        state.advance(1);
145                        if let Some('-') = state.peek() {
146                            depth += 1;
147                            state.advance(1);
148                        }
149                    }
150                    else if ch == '-' {
151                        state.advance(1);
152                        if let Some('}') = state.peek() {
153                            depth -= 1;
154                            state.advance(1);
155                            if depth == 0 {
156                                break;
157                            }
158                        }
159                    }
160                    else {
161                        state.advance(ch.len_utf8());
162                    }
163                }
164                state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
165                true
166            }
167            else {
168                state.set_position(start_pos);
169                false
170            }
171        }
172        else {
173            false
174        }
175    }
176
177    /// 处理标识符或关键
178    fn lex_identifier_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
179        let start_pos = state.get_position();
180
181        if let Some(ch) = state.peek() {
182            if ch.is_ascii_alphabetic() || ch == '_' {
183                state.advance(ch.len_utf8());
184
185                while let Some(ch) = state.peek() {
186                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
187                        state.advance(ch.len_utf8());
188                    }
189                    else {
190                        break;
191                    }
192                }
193
194                // 检查是否为关键字
195                let text = state.get_text_in((start_pos..state.get_position()).into());
196
197                let token_kind = match text.as_ref() {
198                    "ado" => PurescriptSyntaxKind::Ado,
199                    "case" => PurescriptSyntaxKind::Case,
200                    "class" => PurescriptSyntaxKind::Class,
201                    "data" => PurescriptSyntaxKind::Data,
202                    "derive" => PurescriptSyntaxKind::Derive,
203                    "do" => PurescriptSyntaxKind::Do,
204                    "else" => PurescriptSyntaxKind::Else,
205                    "false" => PurescriptSyntaxKind::False,
206                    "forall" => PurescriptSyntaxKind::Forall,
207                    "foreign" => PurescriptSyntaxKind::Foreign,
208                    "if" => PurescriptSyntaxKind::If,
209                    "import" => PurescriptSyntaxKind::Import,
210                    "in" => PurescriptSyntaxKind::In,
211                    "infix" => PurescriptSyntaxKind::Infix,
212                    "infixl" => PurescriptSyntaxKind::Infixl,
213                    "infixr" => PurescriptSyntaxKind::Infixr,
214                    "instance" => PurescriptSyntaxKind::Instance,
215                    "let" => PurescriptSyntaxKind::Let,
216                    "module" => PurescriptSyntaxKind::Module,
217                    "newtype" => PurescriptSyntaxKind::Newtype,
218                    "of" => PurescriptSyntaxKind::Of,
219                    "then" => PurescriptSyntaxKind::Then,
220                    "true" => PurescriptSyntaxKind::True,
221                    "type" => PurescriptSyntaxKind::Type,
222                    "where" => PurescriptSyntaxKind::Where,
223                    _ => PurescriptSyntaxKind::Identifier,
224                };
225                state.add_token(token_kind, start_pos, state.get_position());
226                true
227            }
228            else {
229                false
230            }
231        }
232        else {
233            false
234        }
235    }
236
237    /// 处理数字字面
238    fn lex_number_literal<S: Source>(&self, state: &mut State<S>) -> bool {
239        let start_pos = state.get_position();
240
241        if let Some(ch) = state.peek() {
242            if ch.is_ascii_digit() {
243                state.advance(1);
244
245                // 处理十六进制数字
246                if ch == '0' {
247                    if let Some('x') | Some('X') = state.peek() {
248                        state.advance(1);
249                        while let Some(ch) = state.peek() {
250                            if ch.is_ascii_hexdigit() {
251                                state.advance(1);
252                            }
253                            else {
254                                break;
255                            }
256                        }
257                    }
258                    else {
259                        // 处理普通数
260                        while let Some(ch) = state.peek() {
261                            if ch.is_ascii_digit() {
262                                state.advance(1);
263                            }
264                            else {
265                                break;
266                            }
267                        }
268                    }
269                }
270                else {
271                    // 处理十进制数
272                    while let Some(ch) = state.peek() {
273                        if ch.is_ascii_digit() {
274                            state.advance(1);
275                        }
276                        else {
277                            break;
278                        }
279                    }
280                }
281
282                // 处理小数
283                if let Some('.') = state.peek() {
284                    state.advance(1);
285                    while let Some(ch) = state.peek() {
286                        if ch.is_ascii_digit() {
287                            state.advance(1);
288                        }
289                        else {
290                            break;
291                        }
292                    }
293                }
294
295                // 处理指数
296                if let Some('e') | Some('E') = state.peek() {
297                    state.advance(1);
298                    if let Some('+') | Some('-') = state.peek() {
299                        state.advance(1);
300                    }
301                    while let Some(ch) = state.peek() {
302                        if ch.is_ascii_digit() {
303                            state.advance(1);
304                        }
305                        else {
306                            break;
307                        }
308                    }
309                }
310
311                state.add_token(PurescriptSyntaxKind::NumberLiteral, start_pos, state.get_position());
312                true
313            }
314            else {
315                false
316            }
317        }
318        else {
319            false
320        }
321    }
322
323    /// 处理字符串字面量
324    fn lex_string_literal<S: Source>(&self, state: &mut State<S>) -> bool {
325        let start_pos = state.get_position();
326
327        if let Some('"') = state.peek() {
328            state.advance(1);
329
330            while let Some(ch) = state.peek() {
331                if ch == '"' {
332                    state.advance(1);
333                    break;
334                }
335                else if ch == '\\' {
336                    state.advance(1);
337                    if let Some(_) = state.peek() {
338                        state.advance(1);
339                    }
340                }
341                else if ch == '\n' || ch == '\r' {
342                    break; // 字符串不能跨行
343                }
344                else {
345                    state.advance(ch.len_utf8());
346                }
347            }
348
349            state.add_token(PurescriptSyntaxKind::StringLiteral, start_pos, state.get_position());
350            true
351        }
352        else {
353            false
354        }
355    }
356
357    /// 处理字符字面
358    fn lex_char_literal<S: Source>(&self, state: &mut State<S>) -> bool {
359        let start_pos = state.get_position();
360
361        if let Some('\'') = state.peek() {
362            state.advance(1);
363
364            if let Some(ch) = state.peek() {
365                if ch == '\\' {
366                    state.advance(1);
367                    if let Some(_) = state.peek() {
368                        state.advance(1);
369                    }
370                }
371                else if ch != '\'' {
372                    state.advance(ch.len_utf8());
373                }
374            }
375
376            if let Some('\'') = state.peek() {
377                state.advance(1);
378                state.add_token(PurescriptSyntaxKind::CharLiteral, start_pos, state.get_position());
379                true
380            }
381            else {
382                state.set_position(start_pos);
383                false
384            }
385        }
386        else {
387            false
388        }
389    }
390
391    /// 处理操作
392    fn lex_operator<S: Source>(&self, state: &mut State<S>) -> bool {
393        let start_pos = state.get_position();
394
395        if let Some(ch) = state.peek() {
396            let token_kind = match ch {
397                '+' => {
398                    state.advance(1);
399                    PurescriptSyntaxKind::Plus
400                }
401                '-' => {
402                    state.advance(1);
403                    if let Some('>') = state.peek() {
404                        state.advance(1);
405                        PurescriptSyntaxKind::Arrow
406                    }
407                    else {
408                        PurescriptSyntaxKind::Minus
409                    }
410                }
411                '*' => {
412                    state.advance(1);
413                    if let Some('*') = state.peek() {
414                        state.advance(1);
415                        PurescriptSyntaxKind::Caret // 使用 Caret 代替 Power
416                    }
417                    else {
418                        PurescriptSyntaxKind::Star
419                    }
420                }
421                '/' => {
422                    state.advance(1);
423                    if let Some('=') = state.peek() {
424                        state.advance(1);
425                        PurescriptSyntaxKind::NotEqual
426                    }
427                    else {
428                        PurescriptSyntaxKind::Slash
429                    }
430                }
431                '%' => {
432                    state.advance(1);
433                    PurescriptSyntaxKind::Percent
434                }
435                '=' => {
436                    state.advance(1);
437                    match state.peek() {
438                        Some('=') => {
439                            state.advance(1);
440                            PurescriptSyntaxKind::Equal
441                        }
442                        Some('>') => {
443                            state.advance(1);
444                            PurescriptSyntaxKind::FatArrow
445                        }
446                        _ => PurescriptSyntaxKind::Equal,
447                    }
448                }
449                '<' => {
450                    state.advance(1);
451                    match state.peek() {
452                        Some('=') => {
453                            state.advance(1);
454                            PurescriptSyntaxKind::LessEqual
455                        }
456                        Some('-') => {
457                            state.advance(1);
458                            PurescriptSyntaxKind::Bind
459                        }
460                        _ => PurescriptSyntaxKind::Less,
461                    }
462                }
463                '>' => {
464                    state.advance(1);
465                    if let Some('=') = state.peek() {
466                        state.advance(1);
467                        PurescriptSyntaxKind::GreaterEqual
468                    }
469                    else {
470                        PurescriptSyntaxKind::Greater
471                    }
472                }
473                '&' => {
474                    state.advance(1);
475                    if let Some('&') = state.peek() {
476                        state.advance(1);
477                        PurescriptSyntaxKind::And
478                    }
479                    else {
480                        return false;
481                    }
482                }
483                '|' => {
484                    state.advance(1);
485                    if let Some('|') = state.peek() {
486                        state.advance(1);
487                        PurescriptSyntaxKind::Or
488                    }
489                    else {
490                        PurescriptSyntaxKind::Pipe
491                    }
492                }
493                '\\' => {
494                    state.advance(1);
495                    PurescriptSyntaxKind::Backslash
496                }
497                _ => return false,
498            };
499
500            state.add_token(token_kind, start_pos, state.get_position());
501            true
502        }
503        else {
504            false
505        }
506    }
507
508    /// 处理分隔
509    fn lex_delimiter<S: Source>(&self, state: &mut State<S>) -> bool {
510        let start_pos = state.get_position();
511
512        if let Some(ch) = state.peek() {
513            let token_kind = match ch {
514                '(' => PurescriptSyntaxKind::LeftParen,
515                ')' => PurescriptSyntaxKind::RightParen,
516                '[' => PurescriptSyntaxKind::LeftBracket,
517                ']' => PurescriptSyntaxKind::RightBracket,
518                '{' => PurescriptSyntaxKind::LeftBrace,
519                '}' => PurescriptSyntaxKind::RightBrace,
520                ',' => PurescriptSyntaxKind::Comma,
521                ';' => PurescriptSyntaxKind::Semicolon,
522                '.' => PurescriptSyntaxKind::Dot,
523                ':' => {
524                    state.advance(1);
525                    if let Some(':') = state.peek() {
526                        state.advance(1);
527                        state.add_token(PurescriptSyntaxKind::ColonColon, start_pos, state.get_position());
528                        return true;
529                    }
530                    else {
531                        state.add_token(PurescriptSyntaxKind::Colon, start_pos, state.get_position());
532                        return true;
533                    }
534                }
535                '?' => PurescriptSyntaxKind::Question,
536                '_' => PurescriptSyntaxKind::Underscore,
537                '@' => PurescriptSyntaxKind::At,
538                _ => return false,
539            };
540
541            state.advance(ch.len_utf8());
542            state.add_token(token_kind, start_pos, state.get_position());
543            true
544        }
545        else {
546            false
547        }
548    }
549}
550
551impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
552    fn lex_incremental(
553        &self,
554        source: impl Source,
555        changed: usize,
556        cache: IncrementalCache<PurescriptLanguage>,
557    ) -> LexOutput<PurescriptLanguage> {
558        let mut state = LexerState::new_with_cache(source, changed, cache);
559        let result = self.run(&mut state);
560        state.finish(result)
561    }
562}