oak_prolog/lexer/
mod.rs

1use crate::{kind::PrologSyntaxKind, language::PrologLanguage};
2use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<'s, S> = LexerState<'s, S, PrologLanguage>;
5
/// Tokenizer for Prolog source text.
///
/// The struct has no fields, so it holds no per-instance state; all scanning
/// state lives in the lexer state value passed to its methods.
#[derive(Clone, Debug, Default)]
pub struct PrologLexer {}
8
9impl PrologLexer {
10    pub fn new(_config: &PrologLanguage) -> Self {
11        Self {}
12    }
13
14    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
15        while state.not_at_end() {
16            if self.skip_whitespace(state) {
17                continue;
18            }
19
20            if self.lex_newline(state) {
21                continue;
22            }
23
24            if self.lex_comment(state) {
25                continue;
26            }
27
28            if self.lex_string(state) {
29                continue;
30            }
31
32            if self.lex_number(state) {
33                continue;
34            }
35
36            if self.lex_atom_or_keyword(state) {
37                continue;
38            }
39
40            if self.lex_variable(state) {
41                continue;
42            }
43
44            if self.lex_operators_and_punctuation(state) {
45                continue;
46            }
47
48            // 如果没有匹配任何规则,跳过当前字符
49            if let Some(ch) = state.peek() {
50                let start_pos = state.get_position();
51                state.advance(ch.len_utf8());
52                state.add_token(PrologSyntaxKind::Error, start_pos, state.get_position());
53            }
54            else {
55                // 如果已到达文件末尾,退出循环
56                break;
57            }
58        }
59
60        // Add EOF token
61        let pos = state.get_position();
62        state.add_token(PrologSyntaxKind::Eof, pos, pos);
63
64        Ok(())
65    }
66
67    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
68        let start_pos = state.get_position();
69
70        while let Some(ch) = state.peek() {
71            if ch == ' ' || ch == '\t' {
72                state.advance(ch.len_utf8());
73            }
74            else {
75                break;
76            }
77        }
78
79        if state.get_position() > start_pos {
80            state.add_token(PrologSyntaxKind::Whitespace, start_pos, state.get_position());
81            true
82        }
83        else {
84            false
85        }
86    }
87
88    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        let start_pos = state.get_position();
90
91        if let Some('\n') = state.peek() {
92            state.advance(1);
93            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
94            true
95        }
96        else if let Some('\r') = state.peek() {
97            state.advance(1);
98            if let Some('\n') = state.peek() {
99                state.advance(1);
100            }
101            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
110        let start_pos = state.get_position();
111
112        if let Some('%') = state.peek() {
113            state.advance(1);
114            // 单行注释
115            while let Some(ch) = state.peek() {
116                if ch == '\n' || ch == '\r' {
117                    break;
118                }
119                state.advance(ch.len_utf8());
120            }
121            state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
122            true
123        }
124        else if let Some('/') = state.peek() {
125            state.advance(1);
126            if let Some('*') = state.peek() {
127                state.advance(1);
128                // 多行注释 /* ... */
129                while let Some(ch) = state.peek() {
130                    if ch == '*' {
131                        state.advance(1);
132                        if let Some('/') = state.peek() {
133                            state.advance(1);
134                            break;
135                        }
136                    }
137                    else {
138                        state.advance(ch.len_utf8());
139                    }
140                }
141                state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
142                true
143            }
144            else {
145                // 回退,这不是注释
146                state.set_position(start_pos);
147                false
148            }
149        }
150        else {
151            false
152        }
153    }
154
155    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
156        let start_pos = state.get_position();
157
158        if let Some(quote_char) = state.peek() {
159            if quote_char == '"' || quote_char == '\'' {
160                state.advance(1); // 跳过开始引号
161
162                let mut escaped = false;
163                while let Some(ch) = state.peek() {
164                    if escaped {
165                        escaped = false;
166                        state.advance(ch.len_utf8());
167                    }
168                    else if ch == '\\' {
169                        escaped = true;
170                        state.advance(1);
171                    }
172                    else if ch == quote_char {
173                        state.advance(1); // 跳过结束引号
174                        break;
175                    }
176                    else if ch == '\n' || ch == '\r' {
177                        // 字符串不能跨行
178                        break;
179                    }
180                    else {
181                        state.advance(ch.len_utf8());
182                    }
183                }
184
185                state.add_token(PrologSyntaxKind::String, start_pos, state.get_position());
186                true
187            }
188            else {
189                false
190            }
191        }
192        else {
193            false
194        }
195    }
196
197    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
198        if let Some(ch) = state.peek() {
199            if ch.is_ascii_digit() {
200                let start_pos = state.get_position();
201
202                // 读取整数部分
203                while let Some(ch) = state.peek() {
204                    if ch.is_ascii_digit() {
205                        state.advance(1);
206                    }
207                    else {
208                        break;
209                    }
210                }
211
212                // 检查小数点
213                if let Some('.') = state.peek() {
214                    state.advance(1);
215                    // 读取小数部分
216                    while let Some(ch) = state.peek() {
217                        if ch.is_ascii_digit() {
218                            state.advance(1);
219                        }
220                        else {
221                            break;
222                        }
223                    }
224                }
225
226                // 检查科学记数法
227                if let Some(ch) = state.peek() {
228                    if ch == 'e' || ch == 'E' {
229                        state.advance(1);
230                        if let Some(ch) = state.peek() {
231                            if ch == '+' || ch == '-' {
232                                state.advance(1);
233                            }
234                        }
235                        while let Some(ch) = state.peek() {
236                            if ch.is_ascii_digit() {
237                                state.advance(1);
238                            }
239                            else {
240                                break;
241                            }
242                        }
243                    }
244                }
245
246                state.add_token(PrologSyntaxKind::Integer, start_pos, state.get_position());
247                true
248            }
249            else {
250                false
251            }
252        }
253        else {
254            false
255        }
256    }
257
258    fn lex_atom_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
259        if let Some(ch) = state.peek() {
260            if ch.is_ascii_lowercase() || ch == '_' {
261                let start_pos = state.get_position();
262                let mut text = String::new();
263
264                // 读取原子
265                while let Some(ch) = state.peek() {
266                    if ch.is_alphanumeric() || ch == '_' {
267                        text.push(ch);
268                        state.advance(ch.len_utf8());
269                    }
270                    else {
271                        break;
272                    }
273                }
274
275                // 检查是否是关键字
276                let kind = match text.as_str() {
277                    "is" => PrologSyntaxKind::Is,
278                    "mod" => PrologSyntaxKind::Modulo,
279                    _ => PrologSyntaxKind::Atom,
280                };
281
282                state.add_token(kind, start_pos, state.get_position());
283                true
284            }
285            else {
286                false
287            }
288        }
289        else {
290            false
291        }
292    }
293
294    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
295        if let Some(ch) = state.peek() {
296            if ch.is_ascii_uppercase() || ch == '_' {
297                let start_pos = state.get_position();
298
299                // 读取变量名
300                while let Some(ch) = state.peek() {
301                    if ch.is_alphanumeric() || ch == '_' {
302                        state.advance(ch.len_utf8());
303                    }
304                    else {
305                        break;
306                    }
307                }
308
309                state.add_token(PrologSyntaxKind::Variable, start_pos, state.get_position());
310                true
311            }
312            else {
313                false
314            }
315        }
316        else {
317            false
318        }
319    }
320
321    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
322        if let Some(ch) = state.peek() {
323            let start_pos = state.get_position();
324
325            let kind = match ch {
326                '+' => {
327                    state.advance(1);
328                    PrologSyntaxKind::Plus
329                }
330                '-' => {
331                    state.advance(1);
332                    PrologSyntaxKind::Minus
333                }
334                '*' => {
335                    state.advance(1);
336                    if let Some('*') = state.peek() {
337                        state.advance(1);
338                        PrologSyntaxKind::Power
339                    }
340                    else {
341                        PrologSyntaxKind::Multiply
342                    }
343                }
344                '/' => {
345                    state.advance(1);
346                    if let Some('/') = state.peek() {
347                        state.advance(1);
348                        PrologSyntaxKind::IntDivide
349                    }
350                    else {
351                        PrologSyntaxKind::Divide
352                    }
353                }
354                '=' => {
355                    state.advance(1);
356                    if let Some('=') = state.peek() {
357                        state.advance(1);
358                        PrologSyntaxKind::Equal
359                    }
360                    else if let Some(':') = state.peek() {
361                        state.advance(1);
362                        if let Some('=') = state.peek() {
363                            state.advance(1);
364                            PrologSyntaxKind::ArithEqual
365                        }
366                        else {
367                            // 回退
368                            state.set_position(start_pos + 1);
369                            PrologSyntaxKind::Unify
370                        }
371                    }
372                    else if let Some('\\') = state.peek() {
373                        state.advance(1);
374                        if let Some('=') = state.peek() {
375                            state.advance(1);
376                            PrologSyntaxKind::NotUnify
377                        }
378                        else {
379                            // 回退
380                            state.set_position(start_pos + 1);
381                            PrologSyntaxKind::Unify
382                        }
383                    }
384                    else if let Some('<') = state.peek() {
385                        state.advance(1);
386                        PrologSyntaxKind::ArithNotEqual
387                    }
388                    else {
389                        PrologSyntaxKind::Unify
390                    }
391                }
392                '<' => {
393                    state.advance(1);
394                    if let Some('=') = state.peek() {
395                        state.advance(1);
396                        PrologSyntaxKind::LessEqual
397                    }
398                    else {
399                        PrologSyntaxKind::Less
400                    }
401                }
402                '>' => {
403                    state.advance(1);
404                    if let Some('=') = state.peek() {
405                        state.advance(1);
406                        PrologSyntaxKind::GreaterEqual
407                    }
408                    else {
409                        PrologSyntaxKind::Greater
410                    }
411                }
412                '\\' => {
413                    state.advance(1);
414                    if let Some('=') = state.peek() {
415                        state.advance(1);
416                        if let Some('=') = state.peek() {
417                            state.advance(1);
418                            PrologSyntaxKind::NotEqual
419                        }
420                        else {
421                            PrologSyntaxKind::NotUnify
422                        }
423                    }
424                    else {
425                        PrologSyntaxKind::BitwiseNot
426                    }
427                }
428                '!' => {
429                    state.advance(1);
430                    PrologSyntaxKind::Cut
431                }
432                '?' => {
433                    state.advance(1);
434                    PrologSyntaxKind::Question
435                }
436                ':' => {
437                    state.advance(1);
438                    if let Some('-') = state.peek() {
439                        state.advance(1);
440                        PrologSyntaxKind::ColonMinus
441                    }
442                    else {
443                        PrologSyntaxKind::Colon
444                    }
445                }
446                ';' => {
447                    state.advance(1);
448                    PrologSyntaxKind::Semicolon
449                }
450                ',' => {
451                    state.advance(1);
452                    PrologSyntaxKind::Comma
453                }
454                '.' => {
455                    state.advance(1);
456                    PrologSyntaxKind::Dot
457                }
458                '(' => {
459                    state.advance(1);
460                    PrologSyntaxKind::LeftParen
461                }
462                ')' => {
463                    state.advance(1);
464                    PrologSyntaxKind::RightParen
465                }
466                '[' => {
467                    state.advance(1);
468                    PrologSyntaxKind::LeftBracket
469                }
470                ']' => {
471                    state.advance(1);
472                    PrologSyntaxKind::RightBracket
473                }
474                '{' => {
475                    state.advance(1);
476                    PrologSyntaxKind::LeftBrace
477                }
478                '}' => {
479                    state.advance(1);
480                    PrologSyntaxKind::RightBrace
481                }
482                '|' => {
483                    state.advance(1);
484                    PrologSyntaxKind::Pipe
485                }
486                '^' => {
487                    state.advance(1);
488                    PrologSyntaxKind::BitwiseXor
489                }
490                _ => return false,
491            };
492
493            state.add_token(kind, start_pos, state.get_position());
494            true
495        }
496        else {
497            false
498        }
499    }
500}
501
502impl Lexer<PrologLanguage> for PrologLexer {
503    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PrologLanguage>) -> LexOutput<PrologLanguage> {
504        let mut state = LexerState::new(source);
505        let result = self.run(&mut state);
506        if result.is_ok() {
507            // state.run already adds EOF, but LexerState::new doesn't include it by default
508            // PrologLexer::run adds EOF manually, so we don't need to add it again here
509        }
510        state.finish_with_cache(result, cache)
511    }
512}