oak_purescript/lexer/
mod.rs

1use crate::{kind::PurescriptSyntaxKind, language::PurescriptLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
5
6#[derive(Clone)]
7pub struct PurescriptLexer<'config> {
8    _config: &'config PurescriptLanguage,
9}
10
11impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
12    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
13        let mut state = State::new_with_cache(source, 0, cache);
14        let result = self.run(&mut state);
15        if result.is_ok() {
16            state.add_eof();
17        }
18        state.finish_with_cache(result, cache)
19    }
20}
21
22impl<'config> PurescriptLexer<'config> {
23    pub fn new(config: &'config PurescriptLanguage) -> Self {
24        Self { _config: config }
25    }
26
27    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
28        while state.not_at_end() {
29            let safe_point = state.get_position();
30            if self.skip_whitespace(state) {
31                continue;
32            }
33
34            if self.lex_newline(state) {
35                continue;
36            }
37
38            if self.lex_comment(state) {
39                continue;
40            }
41
42            if self.lex_identifier_or_keyword(state) {
43                continue;
44            }
45
46            if self.lex_number_literal(state) {
47                continue;
48            }
49
50            if self.lex_string_literal(state) {
51                continue;
52            }
53
54            if self.lex_char_literal(state) {
55                continue;
56            }
57
58            if self.lex_operator(state) {
59                continue;
60            }
61
62            if self.lex_delimiter(state) {
63                continue;
64            }
65
66            // 如果所有规则都不匹配,跳过当前字符并标记为错误
67            let start_pos = state.get_position();
68            if let Some(ch) = state.peek() {
69                state.advance(ch.len_utf8());
70                state.add_token(PurescriptSyntaxKind::Error, start_pos, state.get_position());
71            }
72
73            state.advance_if_dead_lock(safe_point);
74        }
75
76        Ok(())
77    }
78
79    /// 跳过空白字符
80    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
81        let start_pos = state.get_position();
82
83        while let Some(ch) = state.peek() {
84            if ch == ' ' || ch == '\t' {
85                state.advance(ch.len_utf8());
86            }
87            else {
88                break;
89            }
90        }
91
92        if state.get_position() > start_pos {
93            state.add_token(PurescriptSyntaxKind::Whitespace, start_pos, state.get_position());
94            true
95        }
96        else {
97            false
98        }
99    }
100
101    /// 处理换行
102    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
103        let start_pos = state.get_position();
104
105        if let Some('\n') = state.peek() {
106            state.advance(1);
107            state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
108            true
109        }
110        else if let Some('\r') = state.peek() {
111            state.advance(1);
112            if let Some('\n') = state.peek() {
113                state.advance(1);
114            }
115            state.add_token(PurescriptSyntaxKind::Newline, start_pos, state.get_position());
116            true
117        }
118        else {
119            false
120        }
121    }
122
123    /// 处理注释
124    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
125        let start_pos = state.get_position();
126
127        if let Some('-') = state.peek() {
128            state.advance(1);
129            if let Some('-') = state.peek() {
130                // 单行注释
131                state.advance(1);
132                while let Some(ch) = state.peek() {
133                    if ch == '\n' || ch == '\r' {
134                        break;
135                    }
136                    state.advance(ch.len_utf8());
137                }
138                state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
139                true
140            }
141            else {
142                state.set_position(start_pos);
143                false
144            }
145        }
146        else if let Some('{') = state.peek() {
147            state.advance(1);
148            if let Some('-') = state.peek() {
149                // 多行注释
150                state.advance(1);
151                let mut depth = 1;
152                while let Some(ch) = state.peek() {
153                    if ch == '{' {
154                        state.advance(1);
155                        if let Some('-') = state.peek() {
156                            depth += 1;
157                            state.advance(1);
158                        }
159                    }
160                    else if ch == '-' {
161                        state.advance(1);
162                        if let Some('}') = state.peek() {
163                            depth -= 1;
164                            state.advance(1);
165                            if depth == 0 {
166                                break;
167                            }
168                        }
169                    }
170                    else {
171                        state.advance(ch.len_utf8());
172                    }
173                }
174                state.add_token(PurescriptSyntaxKind::Comment, start_pos, state.get_position());
175                true
176            }
177            else {
178                state.set_position(start_pos);
179                false
180            }
181        }
182        else {
183            false
184        }
185    }
186
187    /// 处理标识符或关键
188    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
189        let start_pos = state.get_position();
190
191        if let Some(ch) = state.peek() {
192            if ch.is_ascii_alphabetic() || ch == '_' {
193                state.advance(ch.len_utf8());
194
195                while let Some(ch) = state.peek() {
196                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
197                        state.advance(ch.len_utf8());
198                    }
199                    else {
200                        break;
201                    }
202                }
203
204                // 检查是否为关键字
205                let text = state.get_text_in((start_pos..state.get_position()).into());
206
207                let token_kind = match text.as_ref() {
208                    "ado" => PurescriptSyntaxKind::Ado,
209                    "case" => PurescriptSyntaxKind::Case,
210                    "class" => PurescriptSyntaxKind::Class,
211                    "data" => PurescriptSyntaxKind::Data,
212                    "derive" => PurescriptSyntaxKind::Derive,
213                    "do" => PurescriptSyntaxKind::Do,
214                    "else" => PurescriptSyntaxKind::Else,
215                    "false" => PurescriptSyntaxKind::False,
216                    "forall" => PurescriptSyntaxKind::Forall,
217                    "foreign" => PurescriptSyntaxKind::Foreign,
218                    "if" => PurescriptSyntaxKind::If,
219                    "import" => PurescriptSyntaxKind::Import,
220                    "in" => PurescriptSyntaxKind::In,
221                    "infix" => PurescriptSyntaxKind::Infix,
222                    "infixl" => PurescriptSyntaxKind::Infixl,
223                    "infixr" => PurescriptSyntaxKind::Infixr,
224                    "instance" => PurescriptSyntaxKind::Instance,
225                    "let" => PurescriptSyntaxKind::Let,
226                    "module" => PurescriptSyntaxKind::Module,
227                    "newtype" => PurescriptSyntaxKind::Newtype,
228                    "of" => PurescriptSyntaxKind::Of,
229                    "then" => PurescriptSyntaxKind::Then,
230                    "true" => PurescriptSyntaxKind::True,
231                    "type" => PurescriptSyntaxKind::Type,
232                    "where" => PurescriptSyntaxKind::Where,
233                    _ => PurescriptSyntaxKind::Identifier,
234                };
235                state.add_token(token_kind, start_pos, state.get_position());
236                true
237            }
238            else {
239                false
240            }
241        }
242        else {
243            false
244        }
245    }
246
247    /// 处理数字字面
248    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
249        let start_pos = state.get_position();
250
251        if let Some(ch) = state.peek() {
252            if ch.is_ascii_digit() {
253                state.advance(1);
254
255                // 处理十六进制数字
256                if ch == '0' {
257                    if let Some('x') | Some('X') = state.peek() {
258                        state.advance(1);
259                        while let Some(ch) = state.peek() {
260                            if ch.is_ascii_hexdigit() {
261                                state.advance(1);
262                            }
263                            else {
264                                break;
265                            }
266                        }
267                    }
268                    else {
269                        // 处理普通数
270                        while let Some(ch) = state.peek() {
271                            if ch.is_ascii_digit() {
272                                state.advance(1);
273                            }
274                            else {
275                                break;
276                            }
277                        }
278                    }
279                }
280                else {
281                    // 处理十进制数
282                    while let Some(ch) = state.peek() {
283                        if ch.is_ascii_digit() {
284                            state.advance(1);
285                        }
286                        else {
287                            break;
288                        }
289                    }
290                }
291
292                // 处理小数
293                if let Some('.') = state.peek() {
294                    state.advance(1);
295                    while let Some(ch) = state.peek() {
296                        if ch.is_ascii_digit() {
297                            state.advance(1);
298                        }
299                        else {
300                            break;
301                        }
302                    }
303                }
304
305                // 处理指数
306                if let Some('e') | Some('E') = state.peek() {
307                    state.advance(1);
308                    if let Some('+') | Some('-') = state.peek() {
309                        state.advance(1);
310                    }
311                    while let Some(ch) = state.peek() {
312                        if ch.is_ascii_digit() {
313                            state.advance(1);
314                        }
315                        else {
316                            break;
317                        }
318                    }
319                }
320
321                state.add_token(PurescriptSyntaxKind::NumberLiteral, start_pos, state.get_position());
322                true
323            }
324            else {
325                false
326            }
327        }
328        else {
329            false
330        }
331    }
332
333    /// 处理字符串字面量
334    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
335        let start_pos = state.get_position();
336
337        if let Some('"') = state.peek() {
338            state.advance(1);
339
340            while let Some(ch) = state.peek() {
341                if ch == '"' {
342                    state.advance(1);
343                    break;
344                }
345                else if ch == '\\' {
346                    state.advance(1);
347                    if let Some(_) = state.peek() {
348                        state.advance(1);
349                    }
350                }
351                else if ch == '\n' || ch == '\r' {
352                    break; // 字符串不能跨行
353                }
354                else {
355                    state.advance(ch.len_utf8());
356                }
357            }
358
359            state.add_token(PurescriptSyntaxKind::StringLiteral, start_pos, state.get_position());
360            true
361        }
362        else {
363            false
364        }
365    }
366
367    /// 处理字符字面
368    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
369        let start_pos = state.get_position();
370
371        if let Some('\'') = state.peek() {
372            state.advance(1);
373
374            if let Some(ch) = state.peek() {
375                if ch == '\\' {
376                    state.advance(1);
377                    if let Some(_) = state.peek() {
378                        state.advance(1);
379                    }
380                }
381                else if ch != '\'' {
382                    state.advance(ch.len_utf8());
383                }
384            }
385
386            if let Some('\'') = state.peek() {
387                state.advance(1);
388                state.add_token(PurescriptSyntaxKind::CharLiteral, start_pos, state.get_position());
389                true
390            }
391            else {
392                state.set_position(start_pos);
393                false
394            }
395        }
396        else {
397            false
398        }
399    }
400
401    /// 处理操作
402    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
403        let start_pos = state.get_position();
404
405        if let Some(ch) = state.peek() {
406            let token_kind = match ch {
407                '+' => {
408                    state.advance(1);
409                    PurescriptSyntaxKind::Plus
410                }
411                '-' => {
412                    state.advance(1);
413                    if let Some('>') = state.peek() {
414                        state.advance(1);
415                        PurescriptSyntaxKind::Arrow
416                    }
417                    else {
418                        PurescriptSyntaxKind::Minus
419                    }
420                }
421                '*' => {
422                    state.advance(1);
423                    if let Some('*') = state.peek() {
424                        state.advance(1);
425                        PurescriptSyntaxKind::Caret // 使用 Caret 代替 Power
426                    }
427                    else {
428                        PurescriptSyntaxKind::Star
429                    }
430                }
431                '/' => {
432                    state.advance(1);
433                    if let Some('=') = state.peek() {
434                        state.advance(1);
435                        PurescriptSyntaxKind::NotEqual
436                    }
437                    else {
438                        PurescriptSyntaxKind::Slash
439                    }
440                }
441                '%' => {
442                    state.advance(1);
443                    PurescriptSyntaxKind::Percent
444                }
445                '=' => {
446                    state.advance(1);
447                    match state.peek() {
448                        Some('=') => {
449                            state.advance(1);
450                            PurescriptSyntaxKind::Equal
451                        }
452                        Some('>') => {
453                            state.advance(1);
454                            PurescriptSyntaxKind::FatArrow
455                        }
456                        _ => PurescriptSyntaxKind::Equal,
457                    }
458                }
459                '<' => {
460                    state.advance(1);
461                    match state.peek() {
462                        Some('=') => {
463                            state.advance(1);
464                            PurescriptSyntaxKind::LessEqual
465                        }
466                        Some('-') => {
467                            state.advance(1);
468                            PurescriptSyntaxKind::Bind
469                        }
470                        _ => PurescriptSyntaxKind::Less,
471                    }
472                }
473                '>' => {
474                    state.advance(1);
475                    if let Some('=') = state.peek() {
476                        state.advance(1);
477                        PurescriptSyntaxKind::GreaterEqual
478                    }
479                    else {
480                        PurescriptSyntaxKind::Greater
481                    }
482                }
483                '&' => {
484                    state.advance(1);
485                    if let Some('&') = state.peek() {
486                        state.advance(1);
487                        PurescriptSyntaxKind::And
488                    }
489                    else {
490                        return false;
491                    }
492                }
493                '|' => {
494                    state.advance(1);
495                    if let Some('|') = state.peek() {
496                        state.advance(1);
497                        PurescriptSyntaxKind::Or
498                    }
499                    else {
500                        PurescriptSyntaxKind::Pipe
501                    }
502                }
503                '\\' => {
504                    state.advance(1);
505                    PurescriptSyntaxKind::Backslash
506                }
507                _ => return false,
508            };
509
510            state.add_token(token_kind, start_pos, state.get_position());
511            true
512        }
513        else {
514            false
515        }
516    }
517
518    /// 处理分隔
519    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
520        let start_pos = state.get_position();
521
522        if let Some(ch) = state.peek() {
523            let token_kind = match ch {
524                '(' => PurescriptSyntaxKind::LeftParen,
525                ')' => PurescriptSyntaxKind::RightParen,
526                '[' => PurescriptSyntaxKind::LeftBracket,
527                ']' => PurescriptSyntaxKind::RightBracket,
528                '{' => PurescriptSyntaxKind::LeftBrace,
529                '}' => PurescriptSyntaxKind::RightBrace,
530                ',' => PurescriptSyntaxKind::Comma,
531                ';' => PurescriptSyntaxKind::Semicolon,
532                '.' => PurescriptSyntaxKind::Dot,
533                ':' => {
534                    state.advance(1);
535                    if let Some(':') = state.peek() {
536                        state.advance(1);
537                        state.add_token(PurescriptSyntaxKind::ColonColon, start_pos, state.get_position());
538                        return true;
539                    }
540                    else {
541                        state.add_token(PurescriptSyntaxKind::Colon, start_pos, state.get_position());
542                        return true;
543                    }
544                }
545                '?' => PurescriptSyntaxKind::Question,
546                '_' => PurescriptSyntaxKind::Underscore,
547                '@' => PurescriptSyntaxKind::At,
548                _ => return false,
549            };
550
551            state.advance(ch.len_utf8());
552            state.add_token(token_kind, start_pos, state.get_position());
553            true
554        }
555        else {
556            false
557        }
558    }
559}