Skip to main content

oak_purescript/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3
4use crate::{language::PurescriptLanguage, lexer::token_type::PurescriptTokenType};
5use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
6
/// Shorthand for the generic `LexerState` fixed to [`PurescriptLanguage`];
/// every scanning routine in this file takes its state through this alias.
type State<'a, S> = LexerState<'a, S, PurescriptLanguage>;
8
/// Lexer that turns PureScript source text into a stream of
/// `PurescriptTokenType` tokens.
///
/// The lexer only borrows its language configuration, so cloning it is cheap.
#[derive(Clone)]
pub struct PurescriptLexer<'config> {
    /// Language configuration handed to [`PurescriptLexer::new`].
    /// It is stored but not read by any of the scanning routines shown in this file.
    _config: &'config PurescriptLanguage,
}
13
14impl<'config> Lexer<PurescriptLanguage> for PurescriptLexer<'config> {
15    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::TextEdit], cache: &'a mut impl LexerCache<PurescriptLanguage>) -> LexOutput<PurescriptLanguage> {
16        let mut state = State::new_with_cache(source, 0, cache);
17        let result = self.run(&mut state);
18        if result.is_ok() {
19            state.add_eof();
20        }
21        state.finish_with_cache(result, cache)
22    }
23}
24
25impl<'config> PurescriptLexer<'config> {
26    /// 创建一个新的 PurescriptLexer
27    pub fn new(config: &'config PurescriptLanguage) -> Self {
28        Self { _config: config }
29    }
30
31    fn run<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> Result<(), OakError> {
32        while state.not_at_end() {
33            let safe_point = state.get_position();
34            if self.skip_whitespace(state) {
35                continue;
36            }
37
38            if self.lex_newline(state) {
39                continue;
40            }
41
42            if self.lex_comment(state) {
43                continue;
44            }
45
46            if self.lex_identifier_or_keyword(state) {
47                continue;
48            }
49
50            if self.lex_number_literal(state) {
51                continue;
52            }
53
54            if self.lex_string_literal(state) {
55                continue;
56            }
57
58            if self.lex_char_literal(state) {
59                continue;
60            }
61
62            if self.lex_operator(state) {
63                continue;
64            }
65
66            if self.lex_delimiter(state) {
67                continue;
68            }
69
70            // 如果所有规则都不匹配,跳过当前字符并标记为错误
71            let start_pos = state.get_position();
72            if let Some(ch) = state.peek() {
73                state.advance(ch.len_utf8());
74                state.add_token(PurescriptTokenType::Error, start_pos, state.get_position())
75            }
76
77            state.advance_if_dead_lock(safe_point)
78        }
79
80        Ok(())
81    }
82
83    /// 跳过空白字符
84    fn skip_whitespace<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
85        let start_pos = state.get_position();
86
87        while let Some(ch) = state.peek() {
88            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
89        }
90
91        if state.get_position() > start_pos {
92            state.add_token(PurescriptTokenType::Whitespace, start_pos, state.get_position());
93            true
94        }
95        else {
96            false
97        }
98    }
99
100    /// 处理换行
101    fn lex_newline<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
102        let start_pos = state.get_position();
103
104        if let Some('\n') = state.peek() {
105            state.advance(1);
106            state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
107            true
108        }
109        else if let Some('\r') = state.peek() {
110            state.advance(1);
111            if let Some('\n') = state.peek() {
112                state.advance(1)
113            }
114            state.add_token(PurescriptTokenType::Newline, start_pos, state.get_position());
115            true
116        }
117        else {
118            false
119        }
120    }
121
122    /// 处理注释
123    fn lex_comment<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
124        let start_pos = state.get_position();
125
126        if let Some('-') = state.peek() {
127            state.advance(1);
128            if let Some('-') = state.peek() {
129                // 单行注释
130                state.advance(1);
131                while let Some(ch) = state.peek() {
132                    if ch == '\n' || ch == '\r' {
133                        break;
134                    }
135                    state.advance(ch.len_utf8())
136                }
137                state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
138                true
139            }
140            else {
141                state.set_position(start_pos);
142                false
143            }
144        }
145        else if let Some('{') = state.peek() {
146            state.advance(1);
147            if let Some('-') = state.peek() {
148                // 多行注释
149                state.advance(1);
150                let mut depth = 1;
151                while let Some(ch) = state.peek() {
152                    if ch == '{' {
153                        state.advance(1);
154                        if let Some('-') = state.peek() {
155                            depth += 1;
156                            state.advance(1)
157                        }
158                    }
159                    else if ch == '-' {
160                        state.advance(1);
161                        if let Some('}') = state.peek() {
162                            depth -= 1;
163                            state.advance(1);
164                            if depth == 0 {
165                                break;
166                            }
167                        }
168                    }
169                    else {
170                        state.advance(ch.len_utf8())
171                    }
172                }
173                state.add_token(PurescriptTokenType::Comment, start_pos, state.get_position());
174                true
175            }
176            else {
177                state.set_position(start_pos);
178                false
179            }
180        }
181        else {
182            false
183        }
184    }
185
186    /// 处理标识符或关键字
187    fn lex_identifier_or_keyword<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
188        let start_pos = state.get_position();
189
190        if let Some(ch) = state.peek() {
191            if ch.is_ascii_alphabetic() || ch == '_' {
192                state.advance(ch.len_utf8());
193
194                while let Some(ch) = state.peek() {
195                    if ch.is_ascii_alphanumeric() || ch == '_' || ch == '\'' {
196                        state.advance(ch.len_utf8());
197                    }
198                    else {
199                        break;
200                    }
201                }
202
203                // 检查是否为关键字
204                let text = state.get_text_in((start_pos..state.get_position()).into());
205
206                let token_kind = match text.as_ref() {
207                    "ado" => PurescriptTokenType::Ado,
208                    "case" => PurescriptTokenType::Case,
209                    "class" => PurescriptTokenType::Class,
210                    "data" => PurescriptTokenType::Data,
211                    "derive" => PurescriptTokenType::Derive,
212                    "do" => PurescriptTokenType::Do,
213                    "else" => PurescriptTokenType::Else,
214                    "false" => PurescriptTokenType::False,
215                    "forall" => PurescriptTokenType::Forall,
216                    "foreign" => PurescriptTokenType::Foreign,
217                    "if" => PurescriptTokenType::If,
218                    "import" => PurescriptTokenType::Import,
219                    "in" => PurescriptTokenType::In,
220                    "infix" => PurescriptTokenType::Infix,
221                    "infixl" => PurescriptTokenType::Infixl,
222                    "infixr" => PurescriptTokenType::Infixr,
223                    "instance" => PurescriptTokenType::Instance,
224                    "let" => PurescriptTokenType::Let,
225                    "module" => PurescriptTokenType::Module,
226                    "newtype" => PurescriptTokenType::Newtype,
227                    "of" => PurescriptTokenType::Of,
228                    "then" => PurescriptTokenType::Then,
229                    "true" => PurescriptTokenType::True,
230                    "type" => PurescriptTokenType::Type,
231                    "where" => PurescriptTokenType::Where,
232                    _ => PurescriptTokenType::Identifier,
233                };
234                state.add_token(token_kind, start_pos, state.get_position());
235                true
236            }
237            else {
238                false
239            }
240        }
241        else {
242            false
243        }
244    }
245
246    /// 处理数字字面
247    fn lex_number_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
248        let start_pos = state.get_position();
249
250        if let Some(ch) = state.peek() {
251            if ch.is_ascii_digit() {
252                state.advance(1);
253
254                // 处理十六进制数字
255                if ch == '0' {
256                    if let Some('x') | Some('X') = state.peek() {
257                        state.advance(1);
258                        while let Some(ch) = state.peek() {
259                            if ch.is_ascii_hexdigit() {
260                                state.advance(1);
261                            }
262                            else {
263                                break;
264                            }
265                        }
266                    }
267                    else {
268                        // 处理普通数
269                        while let Some(ch) = state.peek() {
270                            if ch.is_ascii_digit() { state.advance(1) } else { break }
271                        }
272                    }
273                }
274                else {
275                    // 处理十进制数
276                    while let Some(ch) = state.peek() {
277                        if ch.is_ascii_digit() { state.advance(1) } else { break }
278                    }
279                }
280
281                // 处理小数
282                if let Some('.') = state.peek() {
283                    state.advance(1);
284                    while let Some(ch) = state.peek() {
285                        if ch.is_ascii_digit() { state.advance(1) } else { break }
286                    }
287                }
288
289                // 处理指数
290                if let Some('e') | Some('E') = state.peek() {
291                    state.advance(1);
292                    if let Some('+') | Some('-') = state.peek() {
293                        state.advance(1)
294                    }
295                    while let Some(ch) = state.peek() {
296                        if ch.is_ascii_digit() { state.advance(1) } else { break }
297                    }
298                }
299
300                state.add_token(PurescriptTokenType::NumberLiteral, start_pos, state.get_position());
301                true
302            }
303            else {
304                false
305            }
306        }
307        else {
308            false
309        }
310    }
311
312    /// 处理字符串字面量
313    fn lex_string_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
314        let start_pos = state.get_position();
315
316        if let Some('"') = state.peek() {
317            state.advance(1);
318
319            while let Some(ch) = state.peek() {
320                if ch == '"' {
321                    state.advance(1);
322                    break;
323                }
324                else if ch == '\\' {
325                    state.advance(1);
326                    if let Some(_) = state.peek() {
327                        state.advance(1)
328                    }
329                }
330                else if ch == '\n' || ch == '\r' {
331                    break; // 字符串不能跨行
332                }
333                else {
334                    state.advance(ch.len_utf8())
335                }
336            }
337
338            state.add_token(PurescriptTokenType::StringLiteral, start_pos, state.get_position());
339            true
340        }
341        else {
342            false
343        }
344    }
345
346    /// 处理字符字面
347    fn lex_char_literal<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
348        let start_pos = state.get_position();
349
350        if let Some('\'') = state.peek() {
351            state.advance(1);
352
353            if let Some(ch) = state.peek() {
354                if ch == '\\' {
355                    state.advance(1);
356                    if let Some(_) = state.peek() {
357                        state.advance(1)
358                    }
359                }
360                else if ch != '\'' {
361                    state.advance(ch.len_utf8())
362                }
363            }
364
365            if let Some('\'') = state.peek() {
366                state.advance(1);
367                state.add_token(PurescriptTokenType::CharLiteral, start_pos, state.get_position());
368                true
369            }
370            else {
371                state.set_position(start_pos);
372                false
373            }
374        }
375        else {
376            false
377        }
378    }
379
380    /// 处理操作
381    fn lex_operator<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
382        let start_pos = state.get_position();
383
384        if let Some(ch) = state.peek() {
385            let token_kind = match ch {
386                '+' => {
387                    state.advance(1);
388                    PurescriptTokenType::Plus
389                }
390                '-' => {
391                    state.advance(1);
392                    if let Some('>') = state.peek() {
393                        state.advance(1);
394                        PurescriptTokenType::Arrow
395                    }
396                    else {
397                        PurescriptTokenType::Minus
398                    }
399                }
400                '*' => {
401                    state.advance(1);
402                    if let Some('*') = state.peek() {
403                        state.advance(1);
404                        PurescriptTokenType::Caret // 使用 Caret 代替 Power
405                    }
406                    else {
407                        PurescriptTokenType::Star
408                    }
409                }
410                '/' => {
411                    state.advance(1);
412                    if let Some('=') = state.peek() {
413                        state.advance(1);
414                        PurescriptTokenType::NotEqual
415                    }
416                    else {
417                        PurescriptTokenType::Slash
418                    }
419                }
420                '%' => {
421                    state.advance(1);
422                    PurescriptTokenType::Percent
423                }
424                '=' => {
425                    state.advance(1);
426                    match state.peek() {
427                        Some('=') => {
428                            state.advance(1);
429                            PurescriptTokenType::Equal
430                        }
431                        Some('>') => {
432                            state.advance(1);
433                            PurescriptTokenType::FatArrow
434                        }
435                        _ => PurescriptTokenType::Equal,
436                    }
437                }
438                '<' => {
439                    state.advance(1);
440                    match state.peek() {
441                        Some('=') => {
442                            state.advance(1);
443                            PurescriptTokenType::LessEqual
444                        }
445                        Some('-') => {
446                            state.advance(1);
447                            PurescriptTokenType::Bind
448                        }
449                        _ => PurescriptTokenType::Less,
450                    }
451                }
452                '>' => {
453                    state.advance(1);
454                    if let Some('=') = state.peek() {
455                        state.advance(1);
456                        PurescriptTokenType::GreaterEqual
457                    }
458                    else {
459                        PurescriptTokenType::Greater
460                    }
461                }
462                '&' => {
463                    state.advance(1);
464                    if let Some('&') = state.peek() {
465                        state.advance(1);
466                        PurescriptTokenType::And
467                    }
468                    else {
469                        return false;
470                    }
471                }
472                '|' => {
473                    state.advance(1);
474                    if let Some('|') = state.peek() {
475                        state.advance(1);
476                        PurescriptTokenType::Or
477                    }
478                    else {
479                        PurescriptTokenType::Pipe
480                    }
481                }
482                '\\' => {
483                    state.advance(1);
484                    PurescriptTokenType::Backslash
485                }
486                _ => return false,
487            };
488
489            state.add_token(token_kind, start_pos, state.get_position());
490            true
491        }
492        else {
493            false
494        }
495    }
496
497    /// 处理分隔
498    fn lex_delimiter<'a, S: Source + ?Sized>(&self, state: &mut State<'a, S>) -> bool {
499        let start_pos = state.get_position();
500
501        if let Some(ch) = state.peek() {
502            let token_kind = match ch {
503                '(' => PurescriptTokenType::LeftParen,
504                ')' => PurescriptTokenType::RightParen,
505                '[' => PurescriptTokenType::LeftBracket,
506                ']' => PurescriptTokenType::RightBracket,
507                '{' => PurescriptTokenType::LeftBrace,
508                '}' => PurescriptTokenType::RightBrace,
509                ',' => PurescriptTokenType::Comma,
510                ';' => PurescriptTokenType::Semicolon,
511                '.' => PurescriptTokenType::Dot,
512                ':' => {
513                    state.advance(1);
514                    if let Some(':') = state.peek() {
515                        state.advance(1);
516                        state.add_token(PurescriptTokenType::ColonColon, start_pos, state.get_position());
517                        return true;
518                    }
519                    else {
520                        state.add_token(PurescriptTokenType::Colon, start_pos, state.get_position());
521                        return true;
522                    }
523                }
524                '?' => PurescriptTokenType::Question,
525                '_' => PurescriptTokenType::Underscore,
526                '@' => PurescriptTokenType::At,
527                '`' => PurescriptTokenType::Tick,
528                _ => return false,
529            };
530
531            state.advance(ch.len_utf8());
532            state.add_token(token_kind, start_pos, state.get_position());
533            true
534        }
535        else {
536            false
537        }
538    }
539}