Skip to main content

oak_prolog/lexer/
mod.rs

1#![doc = include_str!("readme.md")]
2/// Token type definitions.
3pub mod token_type;
4pub use token_type::PrologTokenType;
5
6use crate::language::PrologLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8
9type State<'s, S> = LexerState<'s, S, PrologLanguage>;
10
11/// Prolog lexer.
12#[derive(Clone, Debug)]
13pub struct PrologLexer<'config> {
14    config: &'config PrologLanguage,
15}
16
17impl<'config> PrologLexer<'config> {
18    /// Creates a new `PrologLexer` with the given configuration.
19    pub fn new(config: &'config PrologLanguage) -> Self {
20        Self { config }
21    }
22
23    fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
24        while state.not_at_end() {
25            let safe_point = state.get_position();
26
27            if self.skip_whitespace(state) {
28                continue;
29            }
30
31            if self.lex_newline(state) {
32                continue;
33            }
34
35            if self.lex_comment(state) {
36                continue;
37            }
38
39            if self.lex_string(state) {
40                continue;
41            }
42
43            if self.lex_number(state) {
44                continue;
45            }
46
47            if self.lex_atom_or_keyword(state) {
48                continue;
49            }
50
51            if self.lex_variable(state) {
52                continue;
53            }
54
55            if self.lex_operators_and_punctuation(state) {
56                continue;
57            }
58
59            // If no rules match, skip the current character
60            if let Some(ch) = state.peek() {
61                let start_pos = state.get_position();
62                state.advance(ch.len_utf8());
63                state.add_token(PrologTokenType::Error, start_pos, state.get_position())
64            }
65
66            state.advance_if_dead_lock(safe_point)
67        }
68
69        Ok(())
70    }
71
72    fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
73        let start_pos = state.get_position();
74
75        while let Some(ch) = state.peek() {
76            if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
77        }
78
79        if state.get_position() > start_pos {
80            state.add_token(PrologTokenType::Whitespace, start_pos, state.get_position());
81            true
82        }
83        else {
84            false
85        }
86    }
87
88    fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89        let start_pos = state.get_position();
90
91        if let Some('\n') = state.peek() {
92            state.advance(1);
93            state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
94            true
95        }
96        else if let Some('\r') = state.peek() {
97            state.advance(1);
98            if let Some('\n') = state.peek() {
99                state.advance(1)
100            }
101            state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
102            true
103        }
104        else {
105            false
106        }
107    }
108
109    fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
110        let start_pos = state.get_position();
111
112        if let Some('%') = state.peek() {
113            state.advance(1);
114            // Single-line comment
115            while let Some(ch) = state.peek() {
116                if ch == '\n' || ch == '\r' {
117                    break;
118                }
119                state.advance(ch.len_utf8())
120            }
121            state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
122            true
123        }
124        else if let Some('/') = state.peek() {
125            state.advance(1);
126            if let Some('*') = state.peek() {
127                state.advance(1);
128                // Multi-line comment /* ... */
129                while let Some(ch) = state.peek() {
130                    if ch == '*' {
131                        state.advance(1);
132                        if let Some('/') = state.peek() {
133                            state.advance(1);
134                            break;
135                        }
136                    }
137                    else {
138                        state.advance(ch.len_utf8())
139                    }
140                }
141                state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
142                true
143            }
144            else {
145                // Backtrack, this is not a comment
146                state.set_position(start_pos);
147                false
148            }
149        }
150        else {
151            false
152        }
153    }
154
155    fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
156        let start_pos = state.get_position();
157
158        if let Some(quote_char) = state.peek() {
159            if quote_char == '"' || quote_char == '\'' {
160                state.advance(1); // Skip start quote
161
162                let mut escaped = false;
163                while let Some(ch) = state.peek() {
164                    if escaped {
165                        escaped = false;
166                        state.advance(ch.len_utf8())
167                    }
168                    else if ch == '\\' {
169                        escaped = true;
170                        state.advance(1)
171                    }
172                    else if ch == quote_char {
173                        state.advance(1); // Skip end quote
174                        break;
175                    }
176                    else if ch == '\n' || ch == '\r' {
177                        // Strings cannot span lines
178                        break;
179                    }
180                    else {
181                        state.advance(ch.len_utf8())
182                    }
183                }
184
185                state.add_token(PrologTokenType::String, start_pos, state.get_position());
186                true
187            }
188            else {
189                false
190            }
191        }
192        else {
193            false
194        }
195    }
196
197    fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
198        if let Some(ch) = state.peek() {
199            if ch.is_ascii_digit() {
200                let start_pos = state.get_position();
201
202                // Read integer part
203                while let Some(ch) = state.peek() {
204                    if ch.is_ascii_digit() { state.advance(1) } else { break }
205                }
206
207                // Check decimal point
208                if let Some('.') = state.peek() {
209                    state.advance(1);
210                    // Read fractional part
211                    while let Some(ch) = state.peek() {
212                        if ch.is_ascii_digit() { state.advance(1) } else { break }
213                    }
214                }
215
216                // Check scientific notation
217                if let Some(ch) = state.peek() {
218                    if ch == 'e' || ch == 'E' {
219                        state.advance(1);
220                        if let Some(ch) = state.peek() {
221                            if ch == '+' || ch == '-' {
222                                state.advance(1)
223                            }
224                        }
225                        while let Some(ch) = state.peek() {
226                            if ch.is_ascii_digit() { state.advance(1) } else { break }
227                        }
228                    }
229                }
230
231                state.add_token(PrologTokenType::Integer, start_pos, state.get_position());
232                true
233            }
234            else {
235                false
236            }
237        }
238        else {
239            false
240        }
241    }
242
243    fn lex_atom_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
244        if let Some(ch) = state.peek() {
245            if ch.is_ascii_lowercase() || ch == '_' {
246                let start_pos = state.get_position();
247                let mut text = String::new();
248
249                // Read atom
250                while let Some(ch) = state.peek() {
251                    if ch.is_alphanumeric() || ch == '_' {
252                        text.push(ch);
253                        state.advance(ch.len_utf8())
254                    }
255                    else {
256                        break;
257                    }
258                }
259
260                // Check if it's a keyword
261                let kind = match text.as_str() {
262                    "is" => PrologTokenType::Is,
263                    "mod" => PrologTokenType::Modulo,
264                    _ => PrologTokenType::Atom,
265                };
266
267                state.add_token(kind, start_pos, state.get_position());
268                true
269            }
270            else {
271                false
272            }
273        }
274        else {
275            false
276        }
277    }
278
279    fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
280        if let Some(ch) = state.peek() {
281            if ch.is_ascii_uppercase() || ch == '_' {
282                let start_pos = state.get_position();
283
284                // Read variable name
285                while let Some(ch) = state.peek() {
286                    if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
287                }
288
289                state.add_token(PrologTokenType::Variable, start_pos, state.get_position());
290                true
291            }
292            else {
293                false
294            }
295        }
296        else {
297            false
298        }
299    }
300
301    fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
302        if let Some(ch) = state.peek() {
303            let start_pos = state.get_position();
304
305            let kind = match ch {
306                '+' => {
307                    state.advance(1);
308                    PrologTokenType::Plus
309                }
310                '-' => {
311                    state.advance(1);
312                    PrologTokenType::Minus
313                }
314                '*' => {
315                    state.advance(1);
316                    if let Some('*') = state.peek() {
317                        state.advance(1);
318                        PrologTokenType::Power
319                    }
320                    else {
321                        PrologTokenType::Multiply
322                    }
323                }
324                '/' => {
325                    state.advance(1);
326                    if let Some('/') = state.peek() {
327                        state.advance(1);
328                        PrologTokenType::IntDivide
329                    }
330                    else {
331                        PrologTokenType::Divide
332                    }
333                }
334                '=' => {
335                    state.advance(1);
336                    if let Some('=') = state.peek() {
337                        state.advance(1);
338                        PrologTokenType::Equal
339                    }
340                    else if let Some(':') = state.peek() {
341                        state.advance(1);
342                        if let Some('=') = state.peek() {
343                            state.advance(1);
344                            PrologTokenType::ArithEqual
345                        }
346                        else {
347                            // Backtrack
348                            state.set_position(start_pos + 1);
349                            PrologTokenType::Unify
350                        }
351                    }
352                    else if let Some('\\') = state.peek() {
353                        state.advance(1);
354                        if let Some('=') = state.peek() {
355                            state.advance(1);
356                            PrologTokenType::NotUnify
357                        }
358                        else {
359                            // Backtrack
360                            state.set_position(start_pos + 1);
361                            PrologTokenType::Unify
362                        }
363                    }
364                    else if let Some('<') = state.peek() {
365                        state.advance(1);
366                        PrologTokenType::ArithNotEqual
367                    }
368                    else {
369                        PrologTokenType::Unify
370                    }
371                }
372                '<' => {
373                    state.advance(1);
374                    if let Some('=') = state.peek() {
375                        state.advance(1);
376                        PrologTokenType::LessEqual
377                    }
378                    else {
379                        PrologTokenType::Less
380                    }
381                }
382                '>' => {
383                    state.advance(1);
384                    if let Some('=') = state.peek() {
385                        state.advance(1);
386                        PrologTokenType::GreaterEqual
387                    }
388                    else {
389                        PrologTokenType::Greater
390                    }
391                }
392                '\\' => {
393                    state.advance(1);
394                    if let Some('=') = state.peek() {
395                        state.advance(1);
396                        if let Some('=') = state.peek() {
397                            state.advance(1);
398                            PrologTokenType::NotEqual
399                        }
400                        else {
401                            PrologTokenType::NotUnify
402                        }
403                    }
404                    else {
405                        PrologTokenType::BitwiseNot
406                    }
407                }
408                '!' => {
409                    state.advance(1);
410                    PrologTokenType::Cut
411                }
412                '?' => {
413                    state.advance(1);
414                    PrologTokenType::Question
415                }
416                ':' => {
417                    state.advance(1);
418                    if let Some('-') = state.peek() {
419                        state.advance(1);
420                        PrologTokenType::ColonMinus
421                    }
422                    else {
423                        PrologTokenType::Colon
424                    }
425                }
426                ';' => {
427                    state.advance(1);
428                    PrologTokenType::Semicolon
429                }
430                ',' => {
431                    state.advance(1);
432                    PrologTokenType::Comma
433                }
434                '.' => {
435                    state.advance(1);
436                    PrologTokenType::Dot
437                }
438                '(' => {
439                    state.advance(1);
440                    PrologTokenType::LeftParen
441                }
442                ')' => {
443                    state.advance(1);
444                    PrologTokenType::RightParen
445                }
446                '[' => {
447                    state.advance(1);
448                    PrologTokenType::LeftBracket
449                }
450                ']' => {
451                    state.advance(1);
452                    PrologTokenType::RightBracket
453                }
454                '{' => {
455                    state.advance(1);
456                    PrologTokenType::LeftBrace
457                }
458                '}' => {
459                    state.advance(1);
460                    PrologTokenType::RightBrace
461                }
462                '|' => {
463                    state.advance(1);
464                    PrologTokenType::Pipe
465                }
466                '^' => {
467                    state.advance(1);
468                    PrologTokenType::BitwiseXor
469                }
470                _ => return false,
471            };
472
473            state.add_token(kind, start_pos, state.get_position());
474            true
475        }
476        else {
477            false
478        }
479    }
480}
481
482impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
483    fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PrologLanguage>) -> LexOutput<PrologLanguage> {
484        let mut state = State::new_with_cache(source, 0, cache);
485        let result = self.run(&mut state);
486        if result.is_ok() {
487            state.add_eof()
488        }
489        state.finish_with_cache(result, cache)
490    }
491}