1use crate::{kind::PrologSyntaxKind, language::PrologLanguage};
2use oak_core::{IncrementalCache, Lexer, LexerState, OakError, lexer::LexOutput, source::Source};
3
4type State<S> = LexerState<S, PrologLanguage>;
5
6#[derive(Clone)]
7pub struct PrologLexer<'config> {
8 config: &'config PrologLanguage,
9}
10
11impl<'config> PrologLexer<'config> {
12 pub fn new(config: &'config PrologLanguage) -> Self {
13 Self { config }
14 }
15
16 fn run<S: Source>(&self, state: &mut State<S>) -> Result<(), OakError> {
17 while state.not_at_end() {
18 if self.skip_whitespace(state) {
19 continue;
20 }
21
22 if self.lex_newline(state) {
23 continue;
24 }
25
26 if self.lex_comment(state) {
27 continue;
28 }
29
30 if self.lex_string(state) {
31 continue;
32 }
33
34 if self.lex_number(state) {
35 continue;
36 }
37
38 if self.lex_atom_or_keyword(state) {
39 continue;
40 }
41
42 if self.lex_variable(state) {
43 continue;
44 }
45
46 if self.lex_operators_and_punctuation(state) {
47 continue;
48 }
49
50 if let Some(ch) = state.peek() {
52 let start_pos = state.get_position();
53 state.advance(ch.len_utf8());
54 state.add_token(PrologSyntaxKind::Error, start_pos, state.get_position());
55 }
56 else {
57 break;
59 }
60 }
61
62 let pos = state.get_position();
64 state.add_token(PrologSyntaxKind::Eof, pos, pos);
65
66 Ok(())
67 }
68
69 fn skip_whitespace<S: Source>(&self, state: &mut State<S>) -> bool {
70 let start_pos = state.get_position();
71
72 while let Some(ch) = state.peek() {
73 if ch == ' ' || ch == '\t' {
74 state.advance(ch.len_utf8());
75 }
76 else {
77 break;
78 }
79 }
80
81 if state.get_position() > start_pos {
82 state.add_token(PrologSyntaxKind::Whitespace, start_pos, state.get_position());
83 true
84 }
85 else {
86 false
87 }
88 }
89
90 fn lex_newline<S: Source>(&self, state: &mut State<S>) -> bool {
91 let start_pos = state.get_position();
92
93 if let Some('\n') = state.peek() {
94 state.advance(1);
95 state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
96 true
97 }
98 else if let Some('\r') = state.peek() {
99 state.advance(1);
100 if let Some('\n') = state.peek() {
101 state.advance(1);
102 }
103 state.add_token(PrologSyntaxKind::Newline, start_pos, state.get_position());
104 true
105 }
106 else {
107 false
108 }
109 }
110
111 fn lex_comment<S: Source>(&self, state: &mut State<S>) -> bool {
112 let start_pos = state.get_position();
113
114 if let Some('%') = state.peek() {
115 state.advance(1);
116 while let Some(ch) = state.peek() {
118 if ch == '\n' || ch == '\r' {
119 break;
120 }
121 state.advance(ch.len_utf8());
122 }
123 state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
124 true
125 }
126 else if let Some('/') = state.peek() {
127 state.advance(1);
128 if let Some('*') = state.peek() {
129 state.advance(1);
130 while let Some(ch) = state.peek() {
132 if ch == '*' {
133 state.advance(1);
134 if let Some('/') = state.peek() {
135 state.advance(1);
136 break;
137 }
138 }
139 else {
140 state.advance(ch.len_utf8());
141 }
142 }
143 state.add_token(PrologSyntaxKind::Comment, start_pos, state.get_position());
144 true
145 }
146 else {
147 state.set_position(start_pos);
149 false
150 }
151 }
152 else {
153 false
154 }
155 }
156
157 fn lex_string<S: Source>(&self, state: &mut State<S>) -> bool {
158 let start_pos = state.get_position();
159
160 if let Some(quote_char) = state.peek() {
161 if quote_char == '"' || quote_char == '\'' {
162 state.advance(1); let mut escaped = false;
165 while let Some(ch) = state.peek() {
166 if escaped {
167 escaped = false;
168 state.advance(ch.len_utf8());
169 }
170 else if ch == '\\' {
171 escaped = true;
172 state.advance(1);
173 }
174 else if ch == quote_char {
175 state.advance(1); break;
177 }
178 else if ch == '\n' || ch == '\r' {
179 break;
181 }
182 else {
183 state.advance(ch.len_utf8());
184 }
185 }
186
187 state.add_token(PrologSyntaxKind::String, start_pos, state.get_position());
188 true
189 }
190 else {
191 false
192 }
193 }
194 else {
195 false
196 }
197 }
198
199 fn lex_number<S: Source>(&self, state: &mut State<S>) -> bool {
200 if let Some(ch) = state.peek() {
201 if ch.is_ascii_digit() {
202 let start_pos = state.get_position();
203
204 while let Some(ch) = state.peek() {
206 if ch.is_ascii_digit() {
207 state.advance(1);
208 }
209 else {
210 break;
211 }
212 }
213
214 if let Some('.') = state.peek() {
216 state.advance(1);
217 while let Some(ch) = state.peek() {
219 if ch.is_ascii_digit() {
220 state.advance(1);
221 }
222 else {
223 break;
224 }
225 }
226 }
227
228 if let Some(ch) = state.peek() {
230 if ch == 'e' || ch == 'E' {
231 state.advance(1);
232 if let Some(ch) = state.peek() {
233 if ch == '+' || ch == '-' {
234 state.advance(1);
235 }
236 }
237 while let Some(ch) = state.peek() {
238 if ch.is_ascii_digit() {
239 state.advance(1);
240 }
241 else {
242 break;
243 }
244 }
245 }
246 }
247
248 state.add_token(PrologSyntaxKind::Integer, start_pos, state.get_position());
249 true
250 }
251 else {
252 false
253 }
254 }
255 else {
256 false
257 }
258 }
259
260 fn lex_atom_or_keyword<S: Source>(&self, state: &mut State<S>) -> bool {
261 if let Some(ch) = state.peek() {
262 if ch.is_ascii_lowercase() || ch == '_' {
263 let start_pos = state.get_position();
264 let mut text = String::new();
265
266 while let Some(ch) = state.peek() {
268 if ch.is_alphanumeric() || ch == '_' {
269 text.push(ch);
270 state.advance(ch.len_utf8());
271 }
272 else {
273 break;
274 }
275 }
276
277 let kind = match text.as_str() {
279 "is" => PrologSyntaxKind::Is,
280 "mod" => PrologSyntaxKind::Modulo,
281 _ => PrologSyntaxKind::Atom,
282 };
283
284 state.add_token(kind, start_pos, state.get_position());
285 true
286 }
287 else {
288 false
289 }
290 }
291 else {
292 false
293 }
294 }
295
296 fn lex_variable<S: Source>(&self, state: &mut State<S>) -> bool {
297 if let Some(ch) = state.peek() {
298 if ch.is_ascii_uppercase() || ch == '_' {
299 let start_pos = state.get_position();
300
301 while let Some(ch) = state.peek() {
303 if ch.is_alphanumeric() || ch == '_' {
304 state.advance(ch.len_utf8());
305 }
306 else {
307 break;
308 }
309 }
310
311 state.add_token(PrologSyntaxKind::Variable, start_pos, state.get_position());
312 true
313 }
314 else {
315 false
316 }
317 }
318 else {
319 false
320 }
321 }
322
323 fn lex_operators_and_punctuation<S: Source>(&self, state: &mut State<S>) -> bool {
324 if let Some(ch) = state.peek() {
325 let start_pos = state.get_position();
326
327 let kind = match ch {
328 '+' => {
329 state.advance(1);
330 PrologSyntaxKind::Plus
331 }
332 '-' => {
333 state.advance(1);
334 PrologSyntaxKind::Minus
335 }
336 '*' => {
337 state.advance(1);
338 if let Some('*') = state.peek() {
339 state.advance(1);
340 PrologSyntaxKind::Power
341 }
342 else {
343 PrologSyntaxKind::Multiply
344 }
345 }
346 '/' => {
347 state.advance(1);
348 if let Some('/') = state.peek() {
349 state.advance(1);
350 PrologSyntaxKind::IntDivide
351 }
352 else {
353 PrologSyntaxKind::Divide
354 }
355 }
356 '=' => {
357 state.advance(1);
358 if let Some('=') = state.peek() {
359 state.advance(1);
360 PrologSyntaxKind::Equal
361 }
362 else if let Some(':') = state.peek() {
363 state.advance(1);
364 if let Some('=') = state.peek() {
365 state.advance(1);
366 PrologSyntaxKind::ArithEqual
367 }
368 else {
369 state.set_position(start_pos + 1);
371 PrologSyntaxKind::Unify
372 }
373 }
374 else if let Some('\\') = state.peek() {
375 state.advance(1);
376 if let Some('=') = state.peek() {
377 state.advance(1);
378 PrologSyntaxKind::NotUnify
379 }
380 else {
381 state.set_position(start_pos + 1);
383 PrologSyntaxKind::Unify
384 }
385 }
386 else if let Some('<') = state.peek() {
387 state.advance(1);
388 PrologSyntaxKind::ArithNotEqual
389 }
390 else {
391 PrologSyntaxKind::Unify
392 }
393 }
394 '<' => {
395 state.advance(1);
396 if let Some('=') = state.peek() {
397 state.advance(1);
398 PrologSyntaxKind::LessEqual
399 }
400 else {
401 PrologSyntaxKind::Less
402 }
403 }
404 '>' => {
405 state.advance(1);
406 if let Some('=') = state.peek() {
407 state.advance(1);
408 PrologSyntaxKind::GreaterEqual
409 }
410 else {
411 PrologSyntaxKind::Greater
412 }
413 }
414 '\\' => {
415 state.advance(1);
416 if let Some('=') = state.peek() {
417 state.advance(1);
418 if let Some('=') = state.peek() {
419 state.advance(1);
420 PrologSyntaxKind::NotEqual
421 }
422 else {
423 PrologSyntaxKind::NotUnify
424 }
425 }
426 else {
427 PrologSyntaxKind::BitwiseNot
428 }
429 }
430 '!' => {
431 state.advance(1);
432 PrologSyntaxKind::Cut
433 }
434 '?' => {
435 state.advance(1);
436 PrologSyntaxKind::Question
437 }
438 ':' => {
439 state.advance(1);
440 if let Some('-') = state.peek() {
441 state.advance(1);
442 PrologSyntaxKind::ColonMinus
443 }
444 else {
445 PrologSyntaxKind::Colon
446 }
447 }
448 ';' => {
449 state.advance(1);
450 PrologSyntaxKind::Semicolon
451 }
452 ',' => {
453 state.advance(1);
454 PrologSyntaxKind::Comma
455 }
456 '.' => {
457 state.advance(1);
458 PrologSyntaxKind::Dot
459 }
460 '(' => {
461 state.advance(1);
462 PrologSyntaxKind::LeftParen
463 }
464 ')' => {
465 state.advance(1);
466 PrologSyntaxKind::RightParen
467 }
468 '[' => {
469 state.advance(1);
470 PrologSyntaxKind::LeftBracket
471 }
472 ']' => {
473 state.advance(1);
474 PrologSyntaxKind::RightBracket
475 }
476 '{' => {
477 state.advance(1);
478 PrologSyntaxKind::LeftBrace
479 }
480 '}' => {
481 state.advance(1);
482 PrologSyntaxKind::RightBrace
483 }
484 '|' => {
485 state.advance(1);
486 PrologSyntaxKind::Pipe
487 }
488 '^' => {
489 state.advance(1);
490 PrologSyntaxKind::BitwiseXor
491 }
492 _ => return false,
493 };
494
495 state.add_token(kind, start_pos, state.get_position());
496 true
497 }
498 else {
499 false
500 }
501 }
502}
503
504impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
505 fn lex_incremental(
506 &self,
507 source: impl Source,
508 _changed: usize,
509 _cache: IncrementalCache<PrologLanguage>,
510 ) -> LexOutput<PrologLanguage> {
511 let mut state = LexerState::new(source);
512 let result = self.run(&mut state);
513 state.finish(result)
514 }
515}