1#![doc = include_str!("readme.md")]
2pub mod token_type;
4pub use token_type::PrologTokenType;
5
6use crate::language::PrologLanguage;
7use oak_core::{Lexer, LexerCache, LexerState, OakError, lexer::LexOutput, source::Source};
8
9type State<'s, S> = LexerState<'s, S, PrologLanguage>;
10
/// Lexer that turns Prolog source text into a stream of [`PrologTokenType`] tokens.
///
/// The lexer itself holds no mutable state; all scanning progress lives in the
/// `LexerState` that is created for each `lex` call.
#[derive(Clone, Debug)]
pub struct PrologLexer<'config> {
    /// Borrowed language configuration supplied to [`PrologLexer::new`].
    // NOTE(review): none of the lexing methods in this file read this field — confirm
    // whether it is kept only for future configuration options.
    config: &'config PrologLanguage,
}
16
17impl<'config> PrologLexer<'config> {
18 pub fn new(config: &'config PrologLanguage) -> Self {
20 Self { config }
21 }
22
23 fn run<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> Result<(), OakError> {
24 while state.not_at_end() {
25 let safe_point = state.get_position();
26
27 if self.skip_whitespace(state) {
28 continue;
29 }
30
31 if self.lex_newline(state) {
32 continue;
33 }
34
35 if self.lex_comment(state) {
36 continue;
37 }
38
39 if self.lex_string(state) {
40 continue;
41 }
42
43 if self.lex_number(state) {
44 continue;
45 }
46
47 if self.lex_atom_or_keyword(state) {
48 continue;
49 }
50
51 if self.lex_variable(state) {
52 continue;
53 }
54
55 if self.lex_operators_and_punctuation(state) {
56 continue;
57 }
58
59 if let Some(ch) = state.peek() {
61 let start_pos = state.get_position();
62 state.advance(ch.len_utf8());
63 state.add_token(PrologTokenType::Error, start_pos, state.get_position())
64 }
65
66 state.advance_if_dead_lock(safe_point)
67 }
68
69 Ok(())
70 }
71
72 fn skip_whitespace<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
73 let start_pos = state.get_position();
74
75 while let Some(ch) = state.peek() {
76 if ch == ' ' || ch == '\t' { state.advance(ch.len_utf8()) } else { break }
77 }
78
79 if state.get_position() > start_pos {
80 state.add_token(PrologTokenType::Whitespace, start_pos, state.get_position());
81 true
82 }
83 else {
84 false
85 }
86 }
87
88 fn lex_newline<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
89 let start_pos = state.get_position();
90
91 if let Some('\n') = state.peek() {
92 state.advance(1);
93 state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
94 true
95 }
96 else if let Some('\r') = state.peek() {
97 state.advance(1);
98 if let Some('\n') = state.peek() {
99 state.advance(1)
100 }
101 state.add_token(PrologTokenType::Newline, start_pos, state.get_position());
102 true
103 }
104 else {
105 false
106 }
107 }
108
109 fn lex_comment<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
110 let start_pos = state.get_position();
111
112 if let Some('%') = state.peek() {
113 state.advance(1);
114 while let Some(ch) = state.peek() {
116 if ch == '\n' || ch == '\r' {
117 break;
118 }
119 state.advance(ch.len_utf8())
120 }
121 state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
122 true
123 }
124 else if let Some('/') = state.peek() {
125 state.advance(1);
126 if let Some('*') = state.peek() {
127 state.advance(1);
128 while let Some(ch) = state.peek() {
130 if ch == '*' {
131 state.advance(1);
132 if let Some('/') = state.peek() {
133 state.advance(1);
134 break;
135 }
136 }
137 else {
138 state.advance(ch.len_utf8())
139 }
140 }
141 state.add_token(PrologTokenType::Comment, start_pos, state.get_position());
142 true
143 }
144 else {
145 state.set_position(start_pos);
147 false
148 }
149 }
150 else {
151 false
152 }
153 }
154
155 fn lex_string<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
156 let start_pos = state.get_position();
157
158 if let Some(quote_char) = state.peek() {
159 if quote_char == '"' || quote_char == '\'' {
160 state.advance(1); let mut escaped = false;
163 while let Some(ch) = state.peek() {
164 if escaped {
165 escaped = false;
166 state.advance(ch.len_utf8())
167 }
168 else if ch == '\\' {
169 escaped = true;
170 state.advance(1)
171 }
172 else if ch == quote_char {
173 state.advance(1); break;
175 }
176 else if ch == '\n' || ch == '\r' {
177 break;
179 }
180 else {
181 state.advance(ch.len_utf8())
182 }
183 }
184
185 state.add_token(PrologTokenType::String, start_pos, state.get_position());
186 true
187 }
188 else {
189 false
190 }
191 }
192 else {
193 false
194 }
195 }
196
197 fn lex_number<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
198 if let Some(ch) = state.peek() {
199 if ch.is_ascii_digit() {
200 let start_pos = state.get_position();
201
202 while let Some(ch) = state.peek() {
204 if ch.is_ascii_digit() { state.advance(1) } else { break }
205 }
206
207 if let Some('.') = state.peek() {
209 state.advance(1);
210 while let Some(ch) = state.peek() {
212 if ch.is_ascii_digit() { state.advance(1) } else { break }
213 }
214 }
215
216 if let Some(ch) = state.peek() {
218 if ch == 'e' || ch == 'E' {
219 state.advance(1);
220 if let Some(ch) = state.peek() {
221 if ch == '+' || ch == '-' {
222 state.advance(1)
223 }
224 }
225 while let Some(ch) = state.peek() {
226 if ch.is_ascii_digit() { state.advance(1) } else { break }
227 }
228 }
229 }
230
231 state.add_token(PrologTokenType::Integer, start_pos, state.get_position());
232 true
233 }
234 else {
235 false
236 }
237 }
238 else {
239 false
240 }
241 }
242
243 fn lex_atom_or_keyword<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
244 if let Some(ch) = state.peek() {
245 if ch.is_ascii_lowercase() || ch == '_' {
246 let start_pos = state.get_position();
247 let mut text = String::new();
248
249 while let Some(ch) = state.peek() {
251 if ch.is_alphanumeric() || ch == '_' {
252 text.push(ch);
253 state.advance(ch.len_utf8())
254 }
255 else {
256 break;
257 }
258 }
259
260 let kind = match text.as_str() {
262 "is" => PrologTokenType::Is,
263 "mod" => PrologTokenType::Modulo,
264 _ => PrologTokenType::Atom,
265 };
266
267 state.add_token(kind, start_pos, state.get_position());
268 true
269 }
270 else {
271 false
272 }
273 }
274 else {
275 false
276 }
277 }
278
279 fn lex_variable<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
280 if let Some(ch) = state.peek() {
281 if ch.is_ascii_uppercase() || ch == '_' {
282 let start_pos = state.get_position();
283
284 while let Some(ch) = state.peek() {
286 if ch.is_alphanumeric() || ch == '_' { state.advance(ch.len_utf8()) } else { break }
287 }
288
289 state.add_token(PrologTokenType::Variable, start_pos, state.get_position());
290 true
291 }
292 else {
293 false
294 }
295 }
296 else {
297 false
298 }
299 }
300
301 fn lex_operators_and_punctuation<'s, S: Source + ?Sized>(&self, state: &mut State<'s, S>) -> bool {
302 if let Some(ch) = state.peek() {
303 let start_pos = state.get_position();
304
305 let kind = match ch {
306 '+' => {
307 state.advance(1);
308 PrologTokenType::Plus
309 }
310 '-' => {
311 state.advance(1);
312 PrologTokenType::Minus
313 }
314 '*' => {
315 state.advance(1);
316 if let Some('*') = state.peek() {
317 state.advance(1);
318 PrologTokenType::Power
319 }
320 else {
321 PrologTokenType::Multiply
322 }
323 }
324 '/' => {
325 state.advance(1);
326 if let Some('/') = state.peek() {
327 state.advance(1);
328 PrologTokenType::IntDivide
329 }
330 else {
331 PrologTokenType::Divide
332 }
333 }
334 '=' => {
335 state.advance(1);
336 if let Some('=') = state.peek() {
337 state.advance(1);
338 PrologTokenType::Equal
339 }
340 else if let Some(':') = state.peek() {
341 state.advance(1);
342 if let Some('=') = state.peek() {
343 state.advance(1);
344 PrologTokenType::ArithEqual
345 }
346 else {
347 state.set_position(start_pos + 1);
349 PrologTokenType::Unify
350 }
351 }
352 else if let Some('\\') = state.peek() {
353 state.advance(1);
354 if let Some('=') = state.peek() {
355 state.advance(1);
356 PrologTokenType::NotUnify
357 }
358 else {
359 state.set_position(start_pos + 1);
361 PrologTokenType::Unify
362 }
363 }
364 else if let Some('<') = state.peek() {
365 state.advance(1);
366 PrologTokenType::ArithNotEqual
367 }
368 else {
369 PrologTokenType::Unify
370 }
371 }
372 '<' => {
373 state.advance(1);
374 if let Some('=') = state.peek() {
375 state.advance(1);
376 PrologTokenType::LessEqual
377 }
378 else {
379 PrologTokenType::Less
380 }
381 }
382 '>' => {
383 state.advance(1);
384 if let Some('=') = state.peek() {
385 state.advance(1);
386 PrologTokenType::GreaterEqual
387 }
388 else {
389 PrologTokenType::Greater
390 }
391 }
392 '\\' => {
393 state.advance(1);
394 if let Some('=') = state.peek() {
395 state.advance(1);
396 if let Some('=') = state.peek() {
397 state.advance(1);
398 PrologTokenType::NotEqual
399 }
400 else {
401 PrologTokenType::NotUnify
402 }
403 }
404 else {
405 PrologTokenType::BitwiseNot
406 }
407 }
408 '!' => {
409 state.advance(1);
410 PrologTokenType::Cut
411 }
412 '?' => {
413 state.advance(1);
414 PrologTokenType::Question
415 }
416 ':' => {
417 state.advance(1);
418 if let Some('-') = state.peek() {
419 state.advance(1);
420 PrologTokenType::ColonMinus
421 }
422 else {
423 PrologTokenType::Colon
424 }
425 }
426 ';' => {
427 state.advance(1);
428 PrologTokenType::Semicolon
429 }
430 ',' => {
431 state.advance(1);
432 PrologTokenType::Comma
433 }
434 '.' => {
435 state.advance(1);
436 PrologTokenType::Dot
437 }
438 '(' => {
439 state.advance(1);
440 PrologTokenType::LeftParen
441 }
442 ')' => {
443 state.advance(1);
444 PrologTokenType::RightParen
445 }
446 '[' => {
447 state.advance(1);
448 PrologTokenType::LeftBracket
449 }
450 ']' => {
451 state.advance(1);
452 PrologTokenType::RightBracket
453 }
454 '{' => {
455 state.advance(1);
456 PrologTokenType::LeftBrace
457 }
458 '}' => {
459 state.advance(1);
460 PrologTokenType::RightBrace
461 }
462 '|' => {
463 state.advance(1);
464 PrologTokenType::Pipe
465 }
466 '^' => {
467 state.advance(1);
468 PrologTokenType::BitwiseXor
469 }
470 _ => return false,
471 };
472
473 state.add_token(kind, start_pos, state.get_position());
474 true
475 }
476 else {
477 false
478 }
479 }
480}
481
482impl<'config> Lexer<PrologLanguage> for PrologLexer<'config> {
483 fn lex<'a, S: Source + ?Sized>(&self, source: &'a S, _edits: &[oak_core::source::TextEdit], cache: &'a mut impl LexerCache<PrologLanguage>) -> LexOutput<PrologLanguage> {
484 let mut state = State::new_with_cache(source, 0, cache);
485 let result = self.run(&mut state);
486 if result.is_ok() {
487 state.add_eof()
488 }
489 state.finish_with_cache(result, cache)
490 }
491}