Skip to main content

oak_prolog/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2pub mod token_type;
3pub use token_type::PrologTokenType;
4
5use crate::language::PrologLanguage;
6use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
7
8type State<'s, S> = LexerState<'s, S, PrologLanguage>;
9
10#[derive(Clone, Debug)]
11pub struct PrologLexer<'config> {
12    _config: &'config PrologLanguage,
13}
14
15impl<'config> PrologLexer<'config> {
16    pub fn new(config: &'config PrologLanguage) -> Self {
17        Self { _config: config }
18    }
19
20    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
21        while state.not_at_end() {
22            let safe_point = state.get_position();
23
24            if self.skip_whitespace(state) {
25                continue;
26            }
27
28            if self.lex_newline(state) {
29                continue;
30            }
31
32            if self.lex_comment(state) {
33                continue;
34            }
35
36            if self.lex_string(state) {
37                continue;
38            }
39
40            if self.lex_number(state) {
41                continue;
42            }
43
44            if self.lex_atom_or_keyword(state) {
45                continue;
46            }
47
48            if self.lex_variable(state) {
49                continue;
50            }
51
52            if self.lex_operators_and_punctuation(state) {
53                continue;
54            }
55
56            // 如果没有匹配任何规则,跳过当前字符
57            if let Some(ch) = state.peek() {
58                let start_pos = state.get_position();
59                state.advance(ch.len_utf8());
60                state.add_token(PrologTokenType::Error, start_pos, state.get_position())
61            }
62
63            state.advance_if_dead_lock(safe_point)
64        }
65
66        Ok(())
67    }
68
69    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
70        let start_pos = state.get_position();
71
72        while let Some(ch) = state.peek() {
73            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
74        }
75
76        if state.get_position() > start_pos {
77            state.add_token(PrologTokenType::Whitespace, start_pos, state.get_position());
78            true
79        }
80        else {
81            false
82        }
83    }
84
85    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
86        let start_pos = state.get_position();
87
88        if let Some('\n') = state.peek() {
89            state.advance(1);
90            state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
91            true
92        }
93        else if let Some('\r') = state.peek() {
94            state.advance(1);
95            if let Some('\n') = state.peek() {
96                state.advance(1)
97            }
98            state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
99            true
100        }
101        else {
102            false
103        }
104    }
105
106    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
107        let start_pos = state.get_position();
108
109        if let Some('%') = state.peek() {
110            state.advance(1);
111            // 单行注释
112            while let Some(ch) = state.peek() {
113                if ch == '\n' || ch == '\r' {
114                    break;
115                }
116                state.advance(ch.len_utf8())
117            }
118            state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
119            true
120        }
121        else if let Some('/') = state.peek() {
122            state.advance(1);
123            if let Some('*') = state.peek() {
124                state.advance(1);
125                // 多行注释 /* ... */
126                while let Some(ch) = state.peek() {
127                    if ch == '*' {
128                        state.advance(1);
129                        if let Some('/') = state.peek() {
130                            state.advance(1);
131                            break;
132                        }
133                    }
134                    else {
135                        state.advance(ch.len_utf8())
136                    }
137                }
138                state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
139                true
140            }
141            else {
142                // 回退,这不是注释
143                state.set_position(start_pos);
144                false
145            }
146        }
147        else {
148            false
149        }
150    }
151
152    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
153        let start_pos = state.get_position();
154
155        if let Some(quote_char) = state.peek() {
156            if quote_char == '"' || quote_char == '\'' {
157                state.advance(1); // 跳过开始引号
158
159                let mut escaped = false;
160                while let Some(ch) = state.peek() {
161                    if escaped {
162                        escaped = false;
163                        state.advance(ch.len_utf8())
164                    }
165                    else if ch == '\\' {
166                        escaped = true;
167                        state.advance(1)
168                    }
169                    else if ch == quote_char {
170                        state.advance(1); // 跳过结束引号
171                        break;
172                    }
173                    else if ch == '\n' || ch == '\r' {
174                        // 字符串不能跨行
175                        break;
176                    }
177                    else {
178                        state.advance(ch.len_utf8())
179                    }
180                }
181
182                state.add_token(PrologTokenType::String, start_pos, state.get_position());
183                true
184            }
185            else {
186                false
187            }
188        }
189        else {
190            false
191        }
192    }
193
194    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
195        if let Some(ch) = state.peek() {
196            if ch.is_ascii_digit() {
197                let start_pos = state.get_position();
198
199                // 读取整数部分
200                while let Some(ch) = state.peek() {
201                    if ch.is_ascii_digit() { state.advance(1) } else { break }
202                }
203
204                // 检查小数点
205                if let Some('.') = state.peek() {
206                    state.advance(1);
207                    // 读取小数部分
208                    while let Some(ch) = state.peek() {
209                        if ch.is_ascii_digit() { state.advance(1) } else { break }
210                    }
211                }
212
213                // 检查科学记数法
214                if let Some(ch) = state.peek() {
215                    if ch == 'e' || ch == 'E' {
216                        state.advance(1);
217                        if let Some(ch) = state.peek() {
218                            if ch == '+' || ch == '-' {
219                                state.advance(1)
220                            }
221                        }
222                        while let Some(ch) = state.peek() {
223                            if ch.is_ascii_digit() { state.advance(1) } else { break }
224                        }
225                    }
226                }
227
228                state.add_token(PrologTokenType::Integer, start_pos, state.get_position());
229                true
230            }
231            else {
232                false
233            }
234        }
235        else {
236            false
237        }
238    }
239
240    fn lex_atom_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
241        if let Some(ch) = state.peek() {
242            if ch.is_ascii_lowercase() || ch == '_' {
243                let start_pos = state.get_position();
244                let mut text = String::new();
245
246                // 读取原子
247                while let Some(ch) = state.peek() {
248                    if ch.is_alphanumeric() || ch == '_' {
249                        text.push(ch);
250                        state.advance(ch.len_utf8())
251                    }
252                    else {
253                        break;
254                    }
255                }
256
257                // 检查是否是关键字
258                let kind = match text.as_str() {
259                    "is" => PrologTokenType::Is,
260                    "mod" => PrologTokenType::Modulo,
261                    _ => PrologTokenType::Atom,
262                };
263
264                state.add_token(kind, start_pos, state.get_position());
265                true
266            }
267            else {
268                false
269            }
270        }
271        else {
272            false
273        }
274    }
275
276    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
277        if let Some(ch) = state.peek() {
278            if ch.is_ascii_uppercase() || ch == '_' {
279                let start_pos = state.get_position();
280
281                // 读取变量名
282                while let Some(ch) = state.peek() {
283                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
284                }
285
286                state.add_token(PrologTokenType::Variable, start_pos, state.get_position());
287                true
288            }
289            else {
290                false
291            }
292        }
293        else {
294            false
295        }
296    }
297
298    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
299        if let Some(ch) = state.peek() {
300            let start_pos = state.get_position();
301
302            let kind = match ch {
303                '+' => {
304                    state.advance(1);
305                    PrologTokenType::Plus
306                }
307                '-' => {
308                    state.advance(1);
309                    PrologTokenType::Minus
310                }
311                '*' => {
312                    state.advance(1);
313                    if let Some('*') = state.peek() {
314                        state.advance(1);
315                        PrologTokenType::Power
316                    }
317                    else {
318                        PrologTokenType::Multiply
319                    }
320                }
321                '/' => {
322                    state.advance(1);
323                    if let Some('/') = state.peek() {
324                        state.advance(1);
325                        PrologTokenType::IntDivide
326                    }
327                    else {
328                        PrologTokenType::Divide
329                    }
330                }
331                '=' => {
332                    state.advance(1);
333                    if let Some('=') = state.peek() {
334                        state.advance(1);
335                        PrologTokenType::Equal
336                    }
337                    else if let Some(':') = state.peek() {
338                        state.advance(1);
339                        if let Some('=') = state.peek() {
340                            state.advance(1);
341                            PrologTokenType::ArithEqual
342                        }
343                        else {
344                            // 回退
345                            state.set_position(start_pos + 1);
346                            PrologTokenType::Unify
347                        }
348                    }
349                    else if let Some('\\') = state.peek() {
350                        state.advance(1);
351                        if let Some('=') = state.peek() {
352                            state.advance(1);
353                            PrologTokenType::NotUnify
354                        }
355                        else {
356                            // 回退
357                            state.set_position(start_pos + 1);
358                            PrologTokenType::Unify
359                        }
360                    }
361                    else if let Some('<') = state.peek() {
362                        state.advance(1);
363                        PrologTokenType::ArithNotEqual
364                    }
365                    else {
366                        PrologTokenType::Unify
367                    }
368                }
369                '<' => {
370                    state.advance(1);
371                    if let Some('=') = state.peek() {
372                        state.advance(1);
373                        PrologTokenType::LessEqual
374                    }
375                    else {
376                        PrologTokenType::Less
377                    }
378                }
379                '>' => {
380                    state.advance(1);
381                    if let Some('=') = state.peek() {
382                        state.advance(1);
383                        PrologTokenType::GreaterEqual
384                    }
385                    else {
386                        PrologTokenType::Greater
387                    }
388                }
389                '\\' => {
390                    state.advance(1);
391                    if let Some('=') = state.peek() {
392                        state.advance(1);
393                        if let Some('=') = state.peek() {
394                            state.advance(1);
395                            PrologTokenType::NotEqual
396                        }
397                        else {
398                            PrologTokenType::NotUnify
399                        }
400                    }
401                    else {
402                        PrologTokenType::BitwiseNot
403                    }
404                }
405                '!' => {
406                    state.advance(1);
407                    PrologTokenType::Cut
408                }
409                '?' => {
410                    state.advance(1);
411                    PrologTokenType::Question
412                }
413                ':' => {
414                    state.advance(1);
415                    if let Some('-') = state.peek() {
416                        state.advance(1);
417                        PrologTokenType::ColonMinus
418                    }
419                    else {
420                        PrologTokenType::Colon
421                    }
422                }
423                ';' => {
424                    state.advance(1);
425                    PrologTokenType::Semicolon
426                }
427                ',' => {
428                    state.advance(1);
429                    PrologTokenType::Comma
430                }
431                '.' => {
432                    state.advance(1);
433                    PrologTokenType::Dot
434                }
435                '(' => {
436                    state.advance(1);
437                    PrologTokenType::LeftParen
438                }
439                ')' => {
440                    state.advance(1);
441                    PrologTokenType::RightParen
442                }
443                '[' => {
444                    state.advance(1);
445                    PrologTokenType::LeftBracket
446                }
447                ']' => {
448                    state.advance(1);
449                    PrologTokenType::RightBracket
450                }
451                '{' => {
452                    state.advance(1);
453                    PrologTokenType::LeftBrace
454                }
455                '}' => {
456                    state.advance(1);
457                    PrologTokenType::RightBrace
458                }
459                '|' => {
460                    state.advance(1);
461                    PrologTokenType::Pipe
462                }
463                '^' => {
464                    state.advance(1);
465                    PrologTokenType::BitwiseXor
466                }
467                _ => return false,
468            };
469
470            state.add_token(kind, start_pos, state.get_position());
471            true
472        }
473        else {
474            false
475        }
476    }
477}
478
479impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
480    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PrologLanguage>) -> LexOutput<PrologLanguage> {
481        let mut state = State::new_with_cache(source, 0, cache);
482        let result = self.run(&mut state);
483        if result.is_ok() {
484            state.add_eof()
485        }
486        state.finish_with_cache(result, cache)
487    }
488}