Skip to main content

oak_prolog/lexer/
mod.rs

1use crate::{kind::PrologSyntaxKind, language::PrologLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<'s, S> = LexerState<'s, S, PrologLanguage>;
5
6#[derive(Clone, Debug)]
7pub struct PrologLexer<'config> {
8    _config: &'config PrologLanguage,
9}
10
11impl<'config> PrologLexer<'config> {
12    pub fn new(config: &'config PrologLanguage) -> Self {
13        Self { _config: config }
14    }
15
16    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            let safe_point = state.get_position();
19
20            if self.skip_whitespace(state) {
21                continue;
22            }
23
24            if self.lex_newline(state) {
25                continue;
26            }
27
28            if self.lex_comment(state) {
29                continue;
30            }
31
32            if self.lex_string(state) {
33                continue;
34            }
35
36            if self.lex_number(state) {
37                continue;
38            }
39
40            if self.lex_atom_or_keyword(state) {
41                continue;
42            }
43
44            if self.lex_variable(state) {
45                continue;
46            }
47
48            if self.lex_operators_and_punctuation(state) {
49                continue;
50            }
51
52            // 如果没有匹配任何规则,跳过当前字符
53            if let Some(ch) = state.peek() {
54                let start_pos = state.get_position();
55                state.advance(ch.len_utf8());
56                state.add_token(PrologSyntaxKind::Error, start_pos, state.get_position());
57            }
58
59            state.advance_if_dead_lock(safe_point);
60        }
61
62        Ok(())
63    }
64
65    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
66        let start_pos = state.get_position();
67
68        while let Some(ch) = state.peek() {
69            if ch == ' ' || ch == '\t' {
70                state.advance(ch.len_utf8());
71            }
72            else {
73                break;
74            }
75        }
76
77        if state.get_position() > start_pos {
78            state.add_token(PrologSyntaxKind::Whitespace, start_pos, state.get_position());
79            true
80        }
81        else {
82            false
83        }
84    }
85
86    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
87        let start_pos = state.get_position();
88
89        if let Some('\n') = state.peek() {
90            state.advance(1);
91            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
92            true
93        }
94        else if let Some('\r') = state.peek() {
95            state.advance(1);
96            if let Some('\n') = state.peek() {
97                state.advance(1);
98            }
99            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
100            true
101        }
102        else {
103            false
104        }
105    }
106
107    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
108        let start_pos = state.get_position();
109
110        if let Some('%') = state.peek() {
111            state.advance(1);
112            // 单行注释
113            while let Some(ch) = state.peek() {
114                if ch == '\n' || ch == '\r' {
115                    break;
116                }
117                state.advance(ch.len_utf8());
118            }
119            state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
120            true
121        }
122        else if let Some('/') = state.peek() {
123            state.advance(1);
124            if let Some('*') = state.peek() {
125                state.advance(1);
126                // 多行注释 /* ... */
127                while let Some(ch) = state.peek() {
128                    if ch == '*' {
129                        state.advance(1);
130                        if let Some('/') = state.peek() {
131                            state.advance(1);
132                            break;
133                        }
134                    }
135                    else {
136                        state.advance(ch.len_utf8());
137                    }
138                }
139                state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
140                true
141            }
142            else {
143                // 回退,这不是注释
144                state.set_position(start_pos);
145                false
146            }
147        }
148        else {
149            false
150        }
151    }
152
153    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
154        let start_pos = state.get_position();
155
156        if let Some(quote_char) = state.peek() {
157            if quote_char == '"' || quote_char == '\'' {
158                state.advance(1); // 跳过开始引号
159
160                let mut escaped = false;
161                while let Some(ch) = state.peek() {
162                    if escaped {
163                        escaped = false;
164                        state.advance(ch.len_utf8());
165                    }
166                    else if ch == '\\' {
167                        escaped = true;
168                        state.advance(1);
169                    }
170                    else if ch == quote_char {
171                        state.advance(1); // 跳过结束引号
172                        break;
173                    }
174                    else if ch == '\n' || ch == '\r' {
175                        // 字符串不能跨行
176                        break;
177                    }
178                    else {
179                        state.advance(ch.len_utf8());
180                    }
181                }
182
183                state.add_token(PrologSyntaxKind::String, start_pos, state.get_position());
184                true
185            }
186            else {
187                false
188            }
189        }
190        else {
191            false
192        }
193    }
194
195    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
196        if let Some(ch) = state.peek() {
197            if ch.is_ascii_digit() {
198                let start_pos = state.get_position();
199
200                // 读取整数部分
201                while let Some(ch) = state.peek() {
202                    if ch.is_ascii_digit() {
203                        state.advance(1);
204                    }
205                    else {
206                        break;
207                    }
208                }
209
210                // 检查小数点
211                if let Some('.') = state.peek() {
212                    state.advance(1);
213                    // 读取小数部分
214                    while let Some(ch) = state.peek() {
215                        if ch.is_ascii_digit() {
216                            state.advance(1);
217                        }
218                        else {
219                            break;
220                        }
221                    }
222                }
223
224                // 检查科学记数法
225                if let Some(ch) = state.peek() {
226                    if ch == 'e' || ch == 'E' {
227                        state.advance(1);
228                        if let Some(ch) = state.peek() {
229                            if ch == '+' || ch == '-' {
230                                state.advance(1);
231                            }
232                        }
233                        while let Some(ch) = state.peek() {
234                            if ch.is_ascii_digit() {
235                                state.advance(1);
236                            }
237                            else {
238                                break;
239                            }
240                        }
241                    }
242                }
243
244                state.add_token(PrologSyntaxKind::Integer, start_pos, state.get_position());
245                true
246            }
247            else {
248                false
249            }
250        }
251        else {
252            false
253        }
254    }
255
256    fn lex_atom_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
257        if let Some(ch) = state.peek() {
258            if ch.is_ascii_lowercase() || ch == '_' {
259                let start_pos = state.get_position();
260                let mut text = String::new();
261
262                // 读取原子
263                while let Some(ch) = state.peek() {
264                    if ch.is_alphanumeric() || ch == '_' {
265                        text.push(ch);
266                        state.advance(ch.len_utf8());
267                    }
268                    else {
269                        break;
270                    }
271                }
272
273                // 检查是否是关键字
274                let kind = match text.as_str() {
275                    "is" => PrologSyntaxKind::Is,
276                    "mod" => PrologSyntaxKind::Modulo,
277                    _ => PrologSyntaxKind::Atom,
278                };
279
280                state.add_token(kind, start_pos, state.get_position());
281                true
282            }
283            else {
284                false
285            }
286        }
287        else {
288            false
289        }
290    }
291
292    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
293        if let Some(ch) = state.peek() {
294            if ch.is_ascii_uppercase() || ch == '_' {
295                let start_pos = state.get_position();
296
297                // 读取变量名
298                while let Some(ch) = state.peek() {
299                    if ch.is_alphanumeric() || ch == '_' {
300                        state.advance(ch.len_utf8());
301                    }
302                    else {
303                        break;
304                    }
305                }
306
307                state.add_token(PrologSyntaxKind::Variable, start_pos, state.get_position());
308                true
309            }
310            else {
311                false
312            }
313        }
314        else {
315            false
316        }
317    }
318
319    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
320        if let Some(ch) = state.peek() {
321            let start_pos = state.get_position();
322
323            let kind = match ch {
324                '+' => {
325                    state.advance(1);
326                    PrologSyntaxKind::Plus
327                }
328                '-' => {
329                    state.advance(1);
330                    PrologSyntaxKind::Minus
331                }
332                '*' => {
333                    state.advance(1);
334                    if let Some('*') = state.peek() {
335                        state.advance(1);
336                        PrologSyntaxKind::Power
337                    }
338                    else {
339                        PrologSyntaxKind::Multiply
340                    }
341                }
342                '/' => {
343                    state.advance(1);
344                    if let Some('/') = state.peek() {
345                        state.advance(1);
346                        PrologSyntaxKind::IntDivide
347                    }
348                    else {
349                        PrologSyntaxKind::Divide
350                    }
351                }
352                '=' => {
353                    state.advance(1);
354                    if let Some('=') = state.peek() {
355                        state.advance(1);
356                        PrologSyntaxKind::Equal
357                    }
358                    else if let Some(':') = state.peek() {
359                        state.advance(1);
360                        if let Some('=') = state.peek() {
361                            state.advance(1);
362                            PrologSyntaxKind::ArithEqual
363                        }
364                        else {
365                            // 回退
366                            state.set_position(start_pos + 1);
367                            PrologSyntaxKind::Unify
368                        }
369                    }
370                    else if let Some('\\') = state.peek() {
371                        state.advance(1);
372                        if let Some('=') = state.peek() {
373                            state.advance(1);
374                            PrologSyntaxKind::NotUnify
375                        }
376                        else {
377                            // 回退
378                            state.set_position(start_pos + 1);
379                            PrologSyntaxKind::Unify
380                        }
381                    }
382                    else if let Some('<') = state.peek() {
383                        state.advance(1);
384                        PrologSyntaxKind::ArithNotEqual
385                    }
386                    else {
387                        PrologSyntaxKind::Unify
388                    }
389                }
390                '<' => {
391                    state.advance(1);
392                    if let Some('=') = state.peek() {
393                        state.advance(1);
394                        PrologSyntaxKind::LessEqual
395                    }
396                    else {
397                        PrologSyntaxKind::Less
398                    }
399                }
400                '>' => {
401                    state.advance(1);
402                    if let Some('=') = state.peek() {
403                        state.advance(1);
404                        PrologSyntaxKind::GreaterEqual
405                    }
406                    else {
407                        PrologSyntaxKind::Greater
408                    }
409                }
410                '\\' => {
411                    state.advance(1);
412                    if let Some('=') = state.peek() {
413                        state.advance(1);
414                        if let Some('=') = state.peek() {
415                            state.advance(1);
416                            PrologSyntaxKind::NotEqual
417                        }
418                        else {
419                            PrologSyntaxKind::NotUnify
420                        }
421                    }
422                    else {
423                        PrologSyntaxKind::BitwiseNot
424                    }
425                }
426                '!' => {
427                    state.advance(1);
428                    PrologSyntaxKind::Cut
429                }
430                '?' => {
431                    state.advance(1);
432                    PrologSyntaxKind::Question
433                }
434                ':' => {
435                    state.advance(1);
436                    if let Some('-') = state.peek() {
437                        state.advance(1);
438                        PrologSyntaxKind::ColonMinus
439                    }
440                    else {
441                        PrologSyntaxKind::Colon
442                    }
443                }
444                ';' => {
445                    state.advance(1);
446                    PrologSyntaxKind::Semicolon
447                }
448                ',' => {
449                    state.advance(1);
450                    PrologSyntaxKind::Comma
451                }
452                '.' => {
453                    state.advance(1);
454                    PrologSyntaxKind::Dot
455                }
456                '(' => {
457                    state.advance(1);
458                    PrologSyntaxKind::LeftParen
459                }
460                ')' => {
461                    state.advance(1);
462                    PrologSyntaxKind::RightParen
463                }
464                '[' => {
465                    state.advance(1);
466                    PrologSyntaxKind::LeftBracket
467                }
468                ']' => {
469                    state.advance(1);
470                    PrologSyntaxKind::RightBracket
471                }
472                '{' => {
473                    state.advance(1);
474                    PrologSyntaxKind::LeftBrace
475                }
476                '}' => {
477                    state.advance(1);
478                    PrologSyntaxKind::RightBrace
479                }
480                '|' => {
481                    state.advance(1);
482                    PrologSyntaxKind::Pipe
483                }
484                '^' => {
485                    state.advance(1);
486                    PrologSyntaxKind::BitwiseXor
487                }
488                _ => return false,
489            };
490
491            state.add_token(kind, start_pos, state.get_position());
492            true
493        }
494        else {
495            false
496        }
497    }
498}
499
500impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
501    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PrologLanguage>) -> LexOutput<PrologLanguage> {
502        let mut state = State::new_with_cache(source, 0, cache);
503        let result = self.run(&mut state);
504        if result.is_ok() {
505            state.add_eof();
506        }
507        state.finish_with_cache(result, cache)
508    }
509}