oak_prolog/lexer/
mod.rs

1use crate::{kind::PrologSyntaxKind, language::PrologLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, PrologLanguage>;
5
6#[derive(Clone)]
7pub struct PrologLexer<'config> {
8    config: &'config PrologLanguage,
9}
10
11impl<'config> PrologLexer<'config> {
12    pub fn new(config: &'config PrologLanguage) -> Self {
13        Self { config }
14    }
15
16    fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17        while state.not_at_end() {
18            if self.skip_whitespace(state) {
19                continue;
20            }
21
22            if self.lex_newline(state) {
23                continue;
24            }
25
26            if self.lex_comment(state) {
27                continue;
28            }
29
30            if self.lex_string(state) {
31                continue;
32            }
33
34            if self.lex_number(state) {
35                continue;
36            }
37
38            if self.lex_atom_or_keyword(state) {
39                continue;
40            }
41
42            if self.lex_variable(state) {
43                continue;
44            }
45
46            if self.lex_operators_and_punctuation(state) {
47                continue;
48            }
49
50            // 如果没有匹配任何规则,跳过当前字符
51            if let Some(ch) = state.peek() {
52                let start_pos = state.get_position();
53                state.advance(ch.len_utf8());
54                state.add_token(PrologSyntaxKind::Error, start_pos, state.get_position());
55            }
56            else {
57                // 如果已到达文件末尾,退出循环
58                break;
59            }
60        }
61
62        // Add EOF token
63        let pos = state.get_position();
64        state.add_token(PrologSyntaxKind::Eof, pos, pos);
65
66        Ok(())
67    }
68
69    fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
70        let start_pos = state.get_position();
71
72        while let Some(ch) = state.peek() {
73            if ch == ' ' || ch == '\t' {
74                state.advance(ch.len_utf8());
75            }
76            else {
77                break;
78            }
79        }
80
81        if state.get_position() > start_pos {
82            state.add_token(PrologSyntaxKind::Whitespace, start_pos, state.get_position());
83            true
84        }
85        else {
86            false
87        }
88    }
89
90    fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
91        let start_pos = state.get_position();
92
93        if let Some('\n') = state.peek() {
94            state.advance(1);
95            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
96            true
97        }
98        else if let Some('\r') = state.peek() {
99            state.advance(1);
100            if let Some('\n') = state.peek() {
101                state.advance(1);
102            }
103            state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
104            true
105        }
106        else {
107            false
108        }
109    }
110
111    fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
112        let start_pos = state.get_position();
113
114        if let Some('%') = state.peek() {
115            state.advance(1);
116            // 单行注释
117            while let Some(ch) = state.peek() {
118                if ch == '\n' || ch == '\r' {
119                    break;
120                }
121                state.advance(ch.len_utf8());
122            }
123            state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
124            true
125        }
126        else if let Some('/') = state.peek() {
127            state.advance(1);
128            if let Some('*') = state.peek() {
129                state.advance(1);
130                // 多行注释 /* ... */
131                while let Some(ch) = state.peek() {
132                    if ch == '*' {
133                        state.advance(1);
134                        if let Some('/') = state.peek() {
135                            state.advance(1);
136                            break;
137                        }
138                    }
139                    else {
140                        state.advance(ch.len_utf8());
141                    }
142                }
143                state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
144                true
145            }
146            else {
147                // 回退,这不是注释
148                state.set_position(start_pos);
149                false
150            }
151        }
152        else {
153            false
154        }
155    }
156
157    fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
158        let start_pos = state.get_position();
159
160        if let Some(quote_char) = state.peek() {
161            if quote_char == '"' || quote_char == '\'' {
162                state.advance(1); // 跳过开始引号
163
164                let mut escaped = false;
165                while let Some(ch) = state.peek() {
166                    if escaped {
167                        escaped = false;
168                        state.advance(ch.len_utf8());
169                    }
170                    else if ch == '\\' {
171                        escaped = true;
172                        state.advance(1);
173                    }
174                    else if ch == quote_char {
175                        state.advance(1); // 跳过结束引号
176                        break;
177                    }
178                    else if ch == '\n' || ch == '\r' {
179                        // 字符串不能跨行
180                        break;
181                    }
182                    else {
183                        state.advance(ch.len_utf8());
184                    }
185                }
186
187                state.add_token(PrologSyntaxKind::String, start_pos, state.get_position());
188                true
189            }
190            else {
191                false
192            }
193        }
194        else {
195            false
196        }
197    }
198
199    fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
200        if let Some(ch) = state.peek() {
201            if ch.is_ascii_digit() {
202                let start_pos = state.get_position();
203
204                // 读取整数部分
205                while let Some(ch) = state.peek() {
206                    if ch.is_ascii_digit() {
207                        state.advance(1);
208                    }
209                    else {
210                        break;
211                    }
212                }
213
214                // 检查小数点
215                if let Some('.') = state.peek() {
216                    state.advance(1);
217                    // 读取小数部分
218                    while let Some(ch) = state.peek() {
219                        if ch.is_ascii_digit() {
220                            state.advance(1);
221                        }
222                        else {
223                            break;
224                        }
225                    }
226                }
227
228                // 检查科学记数法
229                if let Some(ch) = state.peek() {
230                    if ch == 'e' || ch == 'E' {
231                        state.advance(1);
232                        if let Some(ch) = state.peek() {
233                            if ch == '+' || ch == '-' {
234                                state.advance(1);
235                            }
236                        }
237                        while let Some(ch) = state.peek() {
238                            if ch.is_ascii_digit() {
239                                state.advance(1);
240                            }
241                            else {
242                                break;
243                            }
244                        }
245                    }
246                }
247
248                state.add_token(PrologSyntaxKind::Integer, start_pos, state.get_position());
249                true
250            }
251            else {
252                false
253            }
254        }
255        else {
256            false
257        }
258    }
259
260    fn lex_atom_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
261        if let Some(ch) = state.peek() {
262            if ch.is_ascii_lowercase() || ch == '_' {
263                let start_pos = state.get_position();
264                let mut text = String::new();
265
266                // 读取原子
267                while let Some(ch) = state.peek() {
268                    if ch.is_alphanumeric() || ch == '_' {
269                        text.push(ch);
270                        state.advance(ch.len_utf8());
271                    }
272                    else {
273                        break;
274                    }
275                }
276
277                // 检查是否是关键字
278                let kind = match text.as_str() {
279                    "is" => PrologSyntaxKind::Is,
280                    "mod" => PrologSyntaxKind::Modulo,
281                    _ => PrologSyntaxKind::Atom,
282                };
283
284                state.add_token(kind, start_pos, state.get_position());
285                true
286            }
287            else {
288                false
289            }
290        }
291        else {
292            false
293        }
294    }
295
296    fn lex_variable<S: Source>(&self, state: &mut State<S>) -> bool {
297        if let Some(ch) = state.peek() {
298            if ch.is_ascii_uppercase() || ch == '_' {
299                let start_pos = state.get_position();
300
301                // 读取变量名
302                while let Some(ch) = state.peek() {
303                    if ch.is_alphanumeric() || ch == '_' {
304                        state.advance(ch.len_utf8());
305                    }
306                    else {
307                        break;
308                    }
309                }
310
311                state.add_token(PrologSyntaxKind::Variable, start_pos, state.get_position());
312                true
313            }
314            else {
315                false
316            }
317        }
318        else {
319            false
320        }
321    }
322
323    fn lex_operators_and_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
324        if let Some(ch) = state.peek() {
325            let start_pos = state.get_position();
326
327            let kind = match ch {
328                '+' => {
329                    state.advance(1);
330                    PrologSyntaxKind::Plus
331                }
332                '-' => {
333                    state.advance(1);
334                    PrologSyntaxKind::Minus
335                }
336                '*' => {
337                    state.advance(1);
338                    if let Some('*') = state.peek() {
339                        state.advance(1);
340                        PrologSyntaxKind::Power
341                    }
342                    else {
343                        PrologSyntaxKind::Multiply
344                    }
345                }
346                '/' => {
347                    state.advance(1);
348                    if let Some('/') = state.peek() {
349                        state.advance(1);
350                        PrologSyntaxKind::IntDivide
351                    }
352                    else {
353                        PrologSyntaxKind::Divide
354                    }
355                }
356                '=' => {
357                    state.advance(1);
358                    if let Some('=') = state.peek() {
359                        state.advance(1);
360                        PrologSyntaxKind::Equal
361                    }
362                    else if let Some(':') = state.peek() {
363                        state.advance(1);
364                        if let Some('=') = state.peek() {
365                            state.advance(1);
366                            PrologSyntaxKind::ArithEqual
367                        }
368                        else {
369                            // 回退
370                            state.set_position(start_pos + 1);
371                            PrologSyntaxKind::Unify
372                        }
373                    }
374                    else if let Some('\\') = state.peek() {
375                        state.advance(1);
376                        if let Some('=') = state.peek() {
377                            state.advance(1);
378                            PrologSyntaxKind::NotUnify
379                        }
380                        else {
381                            // 回退
382                            state.set_position(start_pos + 1);
383                            PrologSyntaxKind::Unify
384                        }
385                    }
386                    else if let Some('<') = state.peek() {
387                        state.advance(1);
388                        PrologSyntaxKind::ArithNotEqual
389                    }
390                    else {
391                        PrologSyntaxKind::Unify
392                    }
393                }
394                '<' => {
395                    state.advance(1);
396                    if let Some('=') = state.peek() {
397                        state.advance(1);
398                        PrologSyntaxKind::LessEqual
399                    }
400                    else {
401                        PrologSyntaxKind::Less
402                    }
403                }
404                '>' => {
405                    state.advance(1);
406                    if let Some('=') = state.peek() {
407                        state.advance(1);
408                        PrologSyntaxKind::GreaterEqual
409                    }
410                    else {
411                        PrologSyntaxKind::Greater
412                    }
413                }
414                '\\' => {
415                    state.advance(1);
416                    if let Some('=') = state.peek() {
417                        state.advance(1);
418                        if let Some('=') = state.peek() {
419                            state.advance(1);
420                            PrologSyntaxKind::NotEqual
421                        }
422                        else {
423                            PrologSyntaxKind::NotUnify
424                        }
425                    }
426                    else {
427                        PrologSyntaxKind::BitwiseNot
428                    }
429                }
430                '!' => {
431                    state.advance(1);
432                    PrologSyntaxKind::Cut
433                }
434                '?' => {
435                    state.advance(1);
436                    PrologSyntaxKind::Question
437                }
438                ':' => {
439                    state.advance(1);
440                    if let Some('-') = state.peek() {
441                        state.advance(1);
442                        PrologSyntaxKind::ColonMinus
443                    }
444                    else {
445                        PrologSyntaxKind::Colon
446                    }
447                }
448                ';' => {
449                    state.advance(1);
450                    PrologSyntaxKind::Semicolon
451                }
452                ',' => {
453                    state.advance(1);
454                    PrologSyntaxKind::Comma
455                }
456                '.' => {
457                    state.advance(1);
458                    PrologSyntaxKind::Dot
459                }
460                '(' => {
461                    state.advance(1);
462                    PrologSyntaxKind::LeftParen
463                }
464                ')' => {
465                    state.advance(1);
466                    PrologSyntaxKind::RightParen
467                }
468                '[' => {
469                    state.advance(1);
470                    PrologSyntaxKind::LeftBracket
471                }
472                ']' => {
473                    state.advance(1);
474                    PrologSyntaxKind::RightBracket
475                }
476                '{' => {
477                    state.advance(1);
478                    PrologSyntaxKind::LeftBrace
479                }
480                '}' => {
481                    state.advance(1);
482                    PrologSyntaxKind::RightBrace
483                }
484                '|' => {
485                    state.advance(1);
486                    PrologSyntaxKind::Pipe
487                }
488                '^' => {
489                    state.advance(1);
490                    PrologSyntaxKind::BitwiseXor
491                }
492                _ => return false,
493            };
494
495            state.add_token(kind, start_pos, state.get_position());
496            true
497        }
498        else {
499            false
500        }
501    }
502}
503
504impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
505    fn lex_incremental(
506        &self,
507        source: impl Source,
508        _changed: usize,
509        _cache: IncrementalCache<PrologLanguage>,
510    ) -> LexOutput<PrologLanguage> {
511        let mut state = LexerState::new(source);
512        let result = self.run(&mut state);
513        state.finish(result)
514    }
515}