1use serde::{Deserialize, Serialize};
2
3#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
5pub enum TokenKind {
6 Atom(String), Variable(String), Integer(i64),
10 Float(f64),
11
12 Neck, QueryOp, Equals, NotEquals, Is, Lt, Gt, Lte, Gte, ArithEq, ArithNeq, Plus, Minus, Star, Slash, IntDiv, Mod, Rem, Not, Cut, Arrow, Semicolon, Dot, Comma, LParen, RParen, LBracket, RBracket, Pipe, Eof,
47}
48
49#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
50pub struct Token {
51 pub kind: TokenKind,
52 pub line: usize,
53 pub col: usize,
54}
55
/// Cursor over a borrowed source text that hands out tokens one at a time.
pub struct Tokenizer<'a> {
    /// The source text, viewed as raw bytes.
    input: &'a [u8],
    /// Index into `input` of the next unread byte.
    pos: usize,
    /// 1-based line number of the byte at `pos`.
    line: usize,
    /// 1-based column of the byte at `pos`.
    col: usize,
}
62
63impl<'a> Tokenizer<'a> {
64 pub fn new(input: &'a str) -> Self {
65 Tokenizer {
66 input: input.as_bytes(),
67 pos: 0,
68 line: 1,
69 col: 1,
70 }
71 }
72
73 pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
74 let mut tok = Tokenizer::new(input);
75 let mut tokens = Vec::new();
76 loop {
77 let t = tok.next_token()?;
78 if t.kind == TokenKind::Eof {
79 tokens.push(t);
80 break;
81 }
82 tokens.push(t);
83 }
84 Ok(tokens)
85 }
86
87 fn peek(&self) -> Option<u8> {
88 if self.pos < self.input.len() {
89 Some(self.input[self.pos])
90 } else {
91 None
92 }
93 }
94
95 fn peek_at(&self, offset: usize) -> Option<u8> {
96 let idx = self.pos + offset;
97 if idx < self.input.len() {
98 Some(self.input[idx])
99 } else {
100 None
101 }
102 }
103
104 fn advance(&mut self) -> u8 {
105 let ch = self.input[self.pos];
106 self.pos += 1;
107 if ch == b'\n' {
108 self.line += 1;
109 self.col = 1;
110 } else {
111 self.col += 1;
112 }
113 ch
114 }
115
116 fn skip_whitespace(&mut self) {
117 while let Some(ch) = self.peek() {
118 match ch {
119 b' ' | b'\t' | b'\r' | b'\n' => {
120 self.advance();
121 }
122 b'%' => {
123 while let Some(ch) = self.peek() {
125 if ch == b'\n' {
126 break;
127 }
128 self.advance();
129 }
130 }
131 b'/' if self.peek_at(1) == Some(b'*') => {
132 self.advance(); self.advance(); loop {
136 match self.peek() {
137 None => break,
138 Some(b'*') if self.peek_at(1) == Some(b'/') => {
139 self.advance();
140 self.advance();
141 break;
142 }
143 _ => {
144 self.advance();
145 }
146 }
147 }
148 }
149 _ => break,
150 }
151 }
152 }
153
154 fn next_token(&mut self) -> Result<Token, String> {
155 self.skip_whitespace();
156
157 let line = self.line;
158 let col = self.col;
159
160 let ch = match self.peek() {
161 None => {
162 return Ok(Token {
163 kind: TokenKind::Eof,
164 line,
165 col,
166 })
167 }
168 Some(ch) => ch,
169 };
170
171 match ch {
172 b'(' => {
173 self.advance();
174 Ok(Token {
175 kind: TokenKind::LParen,
176 line,
177 col,
178 })
179 }
180 b')' => {
181 self.advance();
182 Ok(Token {
183 kind: TokenKind::RParen,
184 line,
185 col,
186 })
187 }
188 b'[' => {
189 self.advance();
190 if self.peek() == Some(b']') {
192 self.advance();
193 Ok(Token {
194 kind: TokenKind::Atom("[]".into()),
195 line,
196 col,
197 })
198 } else {
199 Ok(Token {
200 kind: TokenKind::LBracket,
201 line,
202 col,
203 })
204 }
205 }
206 b']' => {
207 self.advance();
208 Ok(Token {
209 kind: TokenKind::RBracket,
210 line,
211 col,
212 })
213 }
214 b'|' => {
215 self.advance();
216 Ok(Token {
217 kind: TokenKind::Pipe,
218 line,
219 col,
220 })
221 }
222 b',' => {
223 self.advance();
224 Ok(Token {
225 kind: TokenKind::Comma,
226 line,
227 col,
228 })
229 }
230 b'!' => {
231 self.advance();
232 Ok(Token {
233 kind: TokenKind::Cut,
234 line,
235 col,
236 })
237 }
238 b';' => {
239 self.advance();
240 Ok(Token {
241 kind: TokenKind::Semicolon,
242 line,
243 col,
244 })
245 }
246
247 b'.' => {
248 self.advance();
249 Ok(Token {
252 kind: TokenKind::Dot,
253 line,
254 col,
255 })
256 }
257
258 b':' => {
259 self.advance();
260 if self.peek() == Some(b'-') {
261 self.advance();
262 Ok(Token {
263 kind: TokenKind::Neck,
264 line,
265 col,
266 })
267 } else {
268 Err(format!("Unexpected ':' at line {} col {}", line, col))
269 }
270 }
271
272 b'?' => {
273 self.advance();
274 if self.peek() == Some(b'-') {
275 self.advance();
276 Ok(Token {
277 kind: TokenKind::QueryOp,
278 line,
279 col,
280 })
281 } else {
282 Err(format!("Unexpected '?' at line {} col {}", line, col))
283 }
284 }
285
286 b'=' => {
287 self.advance();
288 match self.peek() {
289 Some(b':') if self.peek_at(1) == Some(b'=') => {
290 self.advance();
291 self.advance();
292 Ok(Token {
293 kind: TokenKind::ArithEq,
294 line,
295 col,
296 })
297 }
298 Some(b'\\') if self.peek_at(1) == Some(b'=') => {
299 self.advance();
300 self.advance();
301 Ok(Token {
302 kind: TokenKind::ArithNeq,
303 line,
304 col,
305 })
306 }
307 Some(b'<') => {
308 self.advance();
309 Ok(Token {
310 kind: TokenKind::Lte,
311 line,
312 col,
313 })
314 }
315 Some(b'.') if self.peek_at(1) == Some(b'.') => {
316 self.advance();
317 self.advance();
318 Ok(Token {
319 kind: TokenKind::Atom("=..".into()),
320 line,
321 col,
322 })
323 }
324 _ => Ok(Token {
325 kind: TokenKind::Equals,
326 line,
327 col,
328 }),
329 }
330 }
331
332 b'\\' => {
333 self.advance();
334 match self.peek() {
335 Some(b'=') => {
336 self.advance();
337 Ok(Token {
338 kind: TokenKind::NotEquals,
339 line,
340 col,
341 })
342 }
343 Some(b'+') => {
344 self.advance();
345 Ok(Token {
346 kind: TokenKind::Not,
347 line,
348 col,
349 })
350 }
351 _ => Err(format!("Unexpected '\\' at line {} col {}", line, col)),
352 }
353 }
354
355 b'<' => {
356 self.advance();
357 Ok(Token {
358 kind: TokenKind::Lt,
359 line,
360 col,
361 })
362 }
363 b'>' => {
364 self.advance();
365 if self.peek() == Some(b'=') {
366 self.advance();
367 Ok(Token {
368 kind: TokenKind::Gte,
369 line,
370 col,
371 })
372 } else {
373 Ok(Token {
374 kind: TokenKind::Gt,
375 line,
376 col,
377 })
378 }
379 }
380
381 b'@' => {
382 self.advance();
383 match self.peek() {
384 Some(b'<') => {
385 self.advance();
386 Ok(Token {
387 kind: TokenKind::Atom("@<".into()),
388 line,
389 col,
390 })
391 }
392 Some(b'>') => {
393 self.advance();
394 if self.peek() == Some(b'=') {
395 self.advance();
396 Ok(Token {
397 kind: TokenKind::Atom("@>=".into()),
398 line,
399 col,
400 })
401 } else {
402 Ok(Token {
403 kind: TokenKind::Atom("@>".into()),
404 line,
405 col,
406 })
407 }
408 }
409 Some(b'=') if self.peek_at(1) == Some(b'<') => {
410 self.advance();
411 self.advance();
412 Ok(Token {
413 kind: TokenKind::Atom("@=<".into()),
414 line,
415 col,
416 })
417 }
418 _ => Err(format!("Unexpected '@' at line {} col {}", line, col)),
419 }
420 }
421
422 b'+' => {
423 self.advance();
424 Ok(Token {
425 kind: TokenKind::Plus,
426 line,
427 col,
428 })
429 }
430 b'*' => {
431 self.advance();
432 Ok(Token {
433 kind: TokenKind::Star,
434 line,
435 col,
436 })
437 }
438 b'/' => {
439 self.advance();
440 if self.peek() == Some(b'/') {
441 self.advance();
442 Ok(Token {
443 kind: TokenKind::IntDiv,
444 line,
445 col,
446 })
447 } else {
448 Ok(Token {
449 kind: TokenKind::Slash,
450 line,
451 col,
452 })
453 }
454 }
455
456 b'-' => {
457 self.advance();
458 if self.peek() == Some(b'>') {
460 self.advance();
461 return Ok(Token {
462 kind: TokenKind::Arrow,
463 line,
464 col,
465 });
466 }
467 if let Some(d) = self.peek() {
469 if d.is_ascii_digit() {
470 return Ok(Token {
471 kind: TokenKind::Minus,
472 line,
473 col,
474 });
475 }
476 }
477 Ok(Token {
478 kind: TokenKind::Minus,
479 line,
480 col,
481 })
482 }
483
484 b'\'' => self.read_quoted_atom(line, col),
485
486 b'0'..=b'9' => self.read_number(line, col),
487
488 b'a'..=b'z' => self.read_atom(line, col),
489
490 b'A'..=b'Z' | b'_' => self.read_variable(line, col),
491
492 _ => {
493 self.advance();
494 Err(format!(
495 "Unexpected character '{}' at line {} col {}",
496 ch as char, line, col
497 ))
498 }
499 }
500 }
501
502 fn read_atom(&mut self, line: usize, col: usize) -> Result<Token, String> {
503 let mut s = String::new();
504 while let Some(ch) = self.peek() {
505 if ch.is_ascii_alphanumeric() || ch == b'_' {
506 s.push(self.advance() as char);
507 } else {
508 break;
509 }
510 }
511 let kind = match s.as_str() {
513 "is" => TokenKind::Is,
514 "mod" => TokenKind::Mod,
515 "rem" => TokenKind::Rem,
516 _ => TokenKind::Atom(s),
517 };
518 Ok(Token { kind, line, col })
519 }
520
521 fn read_variable(&mut self, line: usize, col: usize) -> Result<Token, String> {
522 let mut s = String::new();
523 while let Some(ch) = self.peek() {
524 if ch.is_ascii_alphanumeric() || ch == b'_' {
525 s.push(self.advance() as char);
526 } else {
527 break;
528 }
529 }
530 Ok(Token {
531 kind: TokenKind::Variable(s),
532 line,
533 col,
534 })
535 }
536
537 fn read_number(&mut self, line: usize, col: usize) -> Result<Token, String> {
538 let mut s = String::new();
539 let mut is_float = false;
540
541 while let Some(ch) = self.peek() {
542 if ch.is_ascii_digit() {
543 s.push(self.advance() as char);
544 } else if ch == b'.' {
545 if let Some(next) = self.peek_at(1) {
547 if next.is_ascii_digit() {
548 is_float = true;
549 s.push(self.advance() as char); while let Some(d) = self.peek() {
551 if d.is_ascii_digit() {
552 s.push(self.advance() as char);
553 } else {
554 break;
555 }
556 }
557 } else {
558 break; }
560 } else {
561 break; }
563 } else {
564 break;
565 }
566 }
567
568 if is_float {
569 let val: f64 = s
570 .parse()
571 .map_err(|e| format!("Invalid float '{}': {}", s, e))?;
572 if val.is_infinite() {
573 return Err(format!(
574 "Float literal '{}' overflows f64 at line {} col {}",
575 s, line, col
576 ));
577 }
578 Ok(Token {
579 kind: TokenKind::Float(val),
580 line,
581 col,
582 })
583 } else {
584 let val: i64 = s
585 .parse()
586 .map_err(|e| format!("Invalid integer '{}': {}", s, e))?;
587 Ok(Token {
588 kind: TokenKind::Integer(val),
589 line,
590 col,
591 })
592 }
593 }
594
595 fn read_quoted_atom(&mut self, line: usize, col: usize) -> Result<Token, String> {
596 self.advance(); let mut s = String::new();
598 loop {
599 match self.peek() {
600 None => {
601 return Err(format!(
602 "Unterminated quoted atom at line {} col {}",
603 line, col
604 ))
605 }
606 Some(b'\'') => {
607 self.advance();
608 if self.peek() == Some(b'\'') {
610 s.push('\'');
611 self.advance();
612 } else {
613 break;
614 }
615 }
616 Some(b'\\') => {
617 self.advance();
618 match self.peek() {
619 Some(b'\'') => {
620 s.push('\'');
621 self.advance();
622 }
623 Some(b'\\') => {
624 s.push('\\');
625 self.advance();
626 }
627 Some(b'n') => {
628 s.push('\n');
629 self.advance();
630 }
631 Some(b't') => {
632 s.push('\t');
633 self.advance();
634 }
635 Some(ch) => {
636 s.push(ch as char);
637 self.advance();
638 }
639 None => {
640 return Err(format!(
641 "Unterminated escape at line {} col {}",
642 self.line, self.col
643 ))
644 }
645 }
646 }
647 Some(ch) => {
648 s.push(ch as char);
649 self.advance();
650 }
651 }
652 }
653 Ok(Token {
654 kind: TokenKind::Atom(s),
655 line,
656 col,
657 })
658 }
659}
660
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src` and returns the token kinds, dropping the final `Eof`.
    /// Panics if tokenization fails.
    fn kinds(src: &str) -> Vec<TokenKind> {
        let mut out = Vec::new();
        for token in Tokenizer::tokenize(src).unwrap() {
            if token.kind != TokenKind::Eof {
                out.push(token.kind);
            }
        }
        out
    }

    /// Shorthand for an atom token kind.
    fn atom(name: &str) -> TokenKind {
        TokenKind::Atom(name.to_string())
    }

    /// Shorthand for a variable token kind.
    fn var(name: &str) -> TokenKind {
        TokenKind::Variable(name.to_string())
    }

    /// Shorthand for an integer token kind.
    fn int(value: i64) -> TokenKind {
        TokenKind::Integer(value)
    }

    #[test]
    fn test_atoms() {
        for name in ["hello", "foo_bar", "a123"] {
            assert_eq!(kinds(name), vec![atom(name)]);
        }
    }

    #[test]
    fn test_quoted_atoms() {
        assert_eq!(kinds("'hello world'"), vec![atom("hello world")]);
        assert_eq!(kinds("'it''s'"), vec![atom("it's")]);
    }

    #[test]
    fn test_variables() {
        for name in ["X", "_foo", "_", "MyVar"] {
            assert_eq!(kinds(name), vec![var(name)]);
        }
    }

    #[test]
    fn test_numbers() {
        assert_eq!(kinds("42"), vec![int(42)]);
        assert_eq!(kinds("3.14"), vec![TokenKind::Float(3.14)]);
        assert_eq!(kinds("0"), vec![int(0)]);
    }

    #[test]
    fn test_operators() {
        let cases = vec![
            (":-", TokenKind::Neck),
            ("?-", TokenKind::QueryOp),
            ("=", TokenKind::Equals),
            ("\\=", TokenKind::NotEquals),
            ("is", TokenKind::Is),
            ("<", TokenKind::Lt),
            (">", TokenKind::Gt),
            ("=<", TokenKind::Lte),
            (">=", TokenKind::Gte),
            ("=:=", TokenKind::ArithEq),
            ("=\\=", TokenKind::ArithNeq),
            ("\\+", TokenKind::Not),
        ];
        for (src, expected) in cases {
            assert_eq!(kinds(src), vec![expected]);
        }
    }

    #[test]
    fn test_punctuation() {
        let expected = vec![
            TokenKind::LParen,
            TokenKind::RParen,
            TokenKind::Pipe,
            TokenKind::Comma,
            TokenKind::Dot,
        ];
        assert_eq!(kinds("( ) | , ."), expected);

        // With a space between them the brackets stay separate tokens.
        assert_eq!(
            kinds("[ ]"),
            vec![TokenKind::LBracket, TokenKind::RBracket]
        );
    }

    #[test]
    fn test_cut() {
        assert_eq!(kinds("!"), vec![TokenKind::Cut]);
    }

    #[test]
    fn test_clause() {
        let expected = vec![
            atom("parent"),
            TokenKind::LParen,
            atom("tom"),
            TokenKind::Comma,
            atom("mary"),
            TokenKind::RParen,
            TokenKind::Dot,
        ];
        assert_eq!(kinds("parent(tom, mary)."), expected);
    }

    #[test]
    fn test_rule() {
        let expected = vec![
            atom("happy"),
            TokenKind::LParen,
            var("X"),
            TokenKind::RParen,
            TokenKind::Neck,
            atom("likes"),
            TokenKind::LParen,
            var("X"),
            TokenKind::Comma,
            atom("food"),
            TokenKind::RParen,
            TokenKind::Dot,
        ];
        assert_eq!(kinds("happy(X) :- likes(X, food)."), expected);
    }

    #[test]
    fn test_arithmetic() {
        let expected = vec![
            var("X"),
            TokenKind::Is,
            int(2),
            TokenKind::Plus,
            int(3),
            TokenKind::Star,
            int(4),
            TokenKind::Dot,
        ];
        assert_eq!(kinds("X is 2 + 3 * 4."), expected);
    }

    #[test]
    fn test_line_comment() {
        assert_eq!(
            kinds("foo % this is a comment\nbar"),
            vec![atom("foo"), atom("bar")]
        );
    }

    #[test]
    fn test_block_comment() {
        assert_eq!(
            kinds("foo /* block */ bar"),
            vec![atom("foo"), atom("bar")]
        );
    }

    #[test]
    fn test_empty_list() {
        assert_eq!(kinds("[]"), vec![atom("[]")]);
    }

    #[test]
    fn test_list_syntax() {
        let expected = vec![
            TokenKind::LBracket,
            int(1),
            TokenKind::Comma,
            int(2),
            TokenKind::Comma,
            int(3),
            TokenKind::RBracket,
        ];
        assert_eq!(kinds("[1, 2, 3]"), expected);
    }

    #[test]
    fn test_minus_operator() {
        assert_eq!(kinds("5 - 3"), vec![int(5), TokenKind::Minus, int(3)]);
    }
}