Skip to main content

patch_prolog_core/
tokenizer.rs

1use serde::{Deserialize, Serialize};
2
3/// Token types for Edinburgh Prolog.
4#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
5pub enum TokenKind {
6    // Identifiers
7    Atom(String),     // lowercase-starting or single-quoted
8    Variable(String), // uppercase-starting or _
9    Integer(i64),
10    Float(f64),
11
12    // Operators
13    Neck,      // :-
14    QueryOp,   // ?-
15    Equals,    // =
16    NotEquals, // \=
17    Is,        // is
18    Lt,        // <
19    Gt,        // >
20    Lte,       // =<
21    Gte,       // >=
22    ArithEq,   // =:=
23    ArithNeq,  // =\=
24    Plus,      // +
25    Minus,     // -
26    Star,      // *
27    Slash,     // /
28    IntDiv,    // //
29    Mod,       // mod
30    Rem,       // rem
31    Not,       // \+
32    Cut,       // !
33    Arrow,     // ->
34    Semicolon, // ;
35
36    // Punctuation
37    Dot,      // .
38    Comma,    // ,
39    LParen,   // (
40    RParen,   // )
41    LBracket, // [
42    RBracket, // ]
43    Pipe,     // |
44
45    // End of input
46    Eof,
47}
48
49#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
50pub struct Token {
51    pub kind: TokenKind,
52    pub line: usize,
53    pub col: usize,
54}
55
/// Scanner state for turning Prolog source text into [`Token`]s.
///
/// The tokenizer borrows its input for `'a` and walks it byte by byte.
pub struct Tokenizer<'a> {
    /// The raw bytes of the source text being scanned.
    input: &'a [u8],
    /// Index into `input` of the next byte to be read.
    pos: usize,
    /// Line of the next byte to be read (starts at 1).
    line: usize,
    /// Column of the next byte to be read (starts at 1, reset after `\n`).
    col: usize,
}
62
63impl<'a> Tokenizer<'a> {
64    pub fn new(input: &'a str) -> Self {
65        Tokenizer {
66            input: input.as_bytes(),
67            pos: 0,
68            line: 1,
69            col: 1,
70        }
71    }
72
73    pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
74        let mut tok = Tokenizer::new(input);
75        let mut tokens = Vec::new();
76        loop {
77            let t = tok.next_token()?;
78            if t.kind == TokenKind::Eof {
79                tokens.push(t);
80                break;
81            }
82            tokens.push(t);
83        }
84        Ok(tokens)
85    }
86
87    fn peek(&self) -> Option<u8> {
88        if self.pos < self.input.len() {
89            Some(self.input[self.pos])
90        } else {
91            None
92        }
93    }
94
95    fn peek_at(&self, offset: usize) -> Option<u8> {
96        let idx = self.pos + offset;
97        if idx < self.input.len() {
98            Some(self.input[idx])
99        } else {
100            None
101        }
102    }
103
104    fn advance(&mut self) -> u8 {
105        let ch = self.input[self.pos];
106        self.pos += 1;
107        if ch == b'\n' {
108            self.line += 1;
109            self.col = 1;
110        } else {
111            self.col += 1;
112        }
113        ch
114    }
115
116    fn skip_whitespace(&mut self) {
117        while let Some(ch) = self.peek() {
118            match ch {
119                b' ' | b'\t' | b'\r' | b'\n' => {
120                    self.advance();
121                }
122                b'%' => {
123                    // Line comment
124                    while let Some(ch) = self.peek() {
125                        if ch == b'\n' {
126                            break;
127                        }
128                        self.advance();
129                    }
130                }
131                b'/' if self.peek_at(1) == Some(b'*') => {
132                    // Block comment
133                    self.advance(); // /
134                    self.advance(); // *
135                    loop {
136                        match self.peek() {
137                            None => break,
138                            Some(b'*') if self.peek_at(1) == Some(b'/') => {
139                                self.advance();
140                                self.advance();
141                                break;
142                            }
143                            _ => {
144                                self.advance();
145                            }
146                        }
147                    }
148                }
149                _ => break,
150            }
151        }
152    }
153
154    fn next_token(&mut self) -> Result<Token, String> {
155        self.skip_whitespace();
156
157        let line = self.line;
158        let col = self.col;
159
160        let ch = match self.peek() {
161            None => {
162                return Ok(Token {
163                    kind: TokenKind::Eof,
164                    line,
165                    col,
166                })
167            }
168            Some(ch) => ch,
169        };
170
171        match ch {
172            b'(' => {
173                self.advance();
174                Ok(Token {
175                    kind: TokenKind::LParen,
176                    line,
177                    col,
178                })
179            }
180            b')' => {
181                self.advance();
182                Ok(Token {
183                    kind: TokenKind::RParen,
184                    line,
185                    col,
186                })
187            }
188            b'[' => {
189                self.advance();
190                // Check for []
191                if self.peek() == Some(b']') {
192                    self.advance();
193                    Ok(Token {
194                        kind: TokenKind::Atom("[]".into()),
195                        line,
196                        col,
197                    })
198                } else {
199                    Ok(Token {
200                        kind: TokenKind::LBracket,
201                        line,
202                        col,
203                    })
204                }
205            }
206            b']' => {
207                self.advance();
208                Ok(Token {
209                    kind: TokenKind::RBracket,
210                    line,
211                    col,
212                })
213            }
214            b'|' => {
215                self.advance();
216                Ok(Token {
217                    kind: TokenKind::Pipe,
218                    line,
219                    col,
220                })
221            }
222            b',' => {
223                self.advance();
224                Ok(Token {
225                    kind: TokenKind::Comma,
226                    line,
227                    col,
228                })
229            }
230            b'!' => {
231                self.advance();
232                Ok(Token {
233                    kind: TokenKind::Cut,
234                    line,
235                    col,
236                })
237            }
238            b';' => {
239                self.advance();
240                Ok(Token {
241                    kind: TokenKind::Semicolon,
242                    line,
243                    col,
244                })
245            }
246
247            b'.' => {
248                self.advance();
249                // Check if followed by whitespace/EOF/comment (end of clause)
250                // vs followed by digit (float - but we handle that in number parsing)
251                Ok(Token {
252                    kind: TokenKind::Dot,
253                    line,
254                    col,
255                })
256            }
257
258            b':' => {
259                self.advance();
260                if self.peek() == Some(b'-') {
261                    self.advance();
262                    Ok(Token {
263                        kind: TokenKind::Neck,
264                        line,
265                        col,
266                    })
267                } else {
268                    Err(format!("Unexpected ':' at line {} col {}", line, col))
269                }
270            }
271
272            b'?' => {
273                self.advance();
274                if self.peek() == Some(b'-') {
275                    self.advance();
276                    Ok(Token {
277                        kind: TokenKind::QueryOp,
278                        line,
279                        col,
280                    })
281                } else {
282                    Err(format!("Unexpected '?' at line {} col {}", line, col))
283                }
284            }
285
286            b'=' => {
287                self.advance();
288                match self.peek() {
289                    Some(b':') if self.peek_at(1) == Some(b'=') => {
290                        self.advance();
291                        self.advance();
292                        Ok(Token {
293                            kind: TokenKind::ArithEq,
294                            line,
295                            col,
296                        })
297                    }
298                    Some(b'\\') if self.peek_at(1) == Some(b'=') => {
299                        self.advance();
300                        self.advance();
301                        Ok(Token {
302                            kind: TokenKind::ArithNeq,
303                            line,
304                            col,
305                        })
306                    }
307                    Some(b'<') => {
308                        self.advance();
309                        Ok(Token {
310                            kind: TokenKind::Lte,
311                            line,
312                            col,
313                        })
314                    }
315                    Some(b'.') if self.peek_at(1) == Some(b'.') => {
316                        self.advance();
317                        self.advance();
318                        Ok(Token {
319                            kind: TokenKind::Atom("=..".into()),
320                            line,
321                            col,
322                        })
323                    }
324                    _ => Ok(Token {
325                        kind: TokenKind::Equals,
326                        line,
327                        col,
328                    }),
329                }
330            }
331
332            b'\\' => {
333                self.advance();
334                match self.peek() {
335                    Some(b'=') => {
336                        self.advance();
337                        Ok(Token {
338                            kind: TokenKind::NotEquals,
339                            line,
340                            col,
341                        })
342                    }
343                    Some(b'+') => {
344                        self.advance();
345                        Ok(Token {
346                            kind: TokenKind::Not,
347                            line,
348                            col,
349                        })
350                    }
351                    _ => Err(format!("Unexpected '\\' at line {} col {}", line, col)),
352                }
353            }
354
355            b'<' => {
356                self.advance();
357                Ok(Token {
358                    kind: TokenKind::Lt,
359                    line,
360                    col,
361                })
362            }
363            b'>' => {
364                self.advance();
365                if self.peek() == Some(b'=') {
366                    self.advance();
367                    Ok(Token {
368                        kind: TokenKind::Gte,
369                        line,
370                        col,
371                    })
372                } else {
373                    Ok(Token {
374                        kind: TokenKind::Gt,
375                        line,
376                        col,
377                    })
378                }
379            }
380
381            b'@' => {
382                self.advance();
383                match self.peek() {
384                    Some(b'<') => {
385                        self.advance();
386                        Ok(Token {
387                            kind: TokenKind::Atom("@<".into()),
388                            line,
389                            col,
390                        })
391                    }
392                    Some(b'>') => {
393                        self.advance();
394                        if self.peek() == Some(b'=') {
395                            self.advance();
396                            Ok(Token {
397                                kind: TokenKind::Atom("@>=".into()),
398                                line,
399                                col,
400                            })
401                        } else {
402                            Ok(Token {
403                                kind: TokenKind::Atom("@>".into()),
404                                line,
405                                col,
406                            })
407                        }
408                    }
409                    Some(b'=') if self.peek_at(1) == Some(b'<') => {
410                        self.advance();
411                        self.advance();
412                        Ok(Token {
413                            kind: TokenKind::Atom("@=<".into()),
414                            line,
415                            col,
416                        })
417                    }
418                    _ => Err(format!("Unexpected '@' at line {} col {}", line, col)),
419                }
420            }
421
422            b'+' => {
423                self.advance();
424                Ok(Token {
425                    kind: TokenKind::Plus,
426                    line,
427                    col,
428                })
429            }
430            b'*' => {
431                self.advance();
432                Ok(Token {
433                    kind: TokenKind::Star,
434                    line,
435                    col,
436                })
437            }
438            b'/' => {
439                self.advance();
440                if self.peek() == Some(b'/') {
441                    self.advance();
442                    Ok(Token {
443                        kind: TokenKind::IntDiv,
444                        line,
445                        col,
446                    })
447                } else {
448                    Ok(Token {
449                        kind: TokenKind::Slash,
450                        line,
451                        col,
452                    })
453                }
454            }
455
456            b'-' => {
457                self.advance();
458                // Check for -> (arrow)
459                if self.peek() == Some(b'>') {
460                    self.advance();
461                    return Ok(Token {
462                        kind: TokenKind::Arrow,
463                        line,
464                        col,
465                    });
466                }
467                // Check if this is a negative number: dash followed by digit
468                if let Some(d) = self.peek() {
469                    if d.is_ascii_digit() {
470                        return Ok(Token {
471                            kind: TokenKind::Minus,
472                            line,
473                            col,
474                        });
475                    }
476                }
477                Ok(Token {
478                    kind: TokenKind::Minus,
479                    line,
480                    col,
481                })
482            }
483
484            b'\'' => self.read_quoted_atom(line, col),
485
486            b'0'..=b'9' => self.read_number(line, col),
487
488            b'a'..=b'z' => self.read_atom(line, col),
489
490            b'A'..=b'Z' | b'_' => self.read_variable(line, col),
491
492            _ => {
493                self.advance();
494                Err(format!(
495                    "Unexpected character '{}' at line {} col {}",
496                    ch as char, line, col
497                ))
498            }
499        }
500    }
501
502    fn read_atom(&mut self, line: usize, col: usize) -> Result<Token, String> {
503        let mut s = String::new();
504        while let Some(ch) = self.peek() {
505            if ch.is_ascii_alphanumeric() || ch == b'_' {
506                s.push(self.advance() as char);
507            } else {
508                break;
509            }
510        }
511        // Check for keyword operators
512        let kind = match s.as_str() {
513            "is" => TokenKind::Is,
514            "mod" => TokenKind::Mod,
515            "rem" => TokenKind::Rem,
516            _ => TokenKind::Atom(s),
517        };
518        Ok(Token { kind, line, col })
519    }
520
521    fn read_variable(&mut self, line: usize, col: usize) -> Result<Token, String> {
522        let mut s = String::new();
523        while let Some(ch) = self.peek() {
524            if ch.is_ascii_alphanumeric() || ch == b'_' {
525                s.push(self.advance() as char);
526            } else {
527                break;
528            }
529        }
530        Ok(Token {
531            kind: TokenKind::Variable(s),
532            line,
533            col,
534        })
535    }
536
537    fn read_number(&mut self, line: usize, col: usize) -> Result<Token, String> {
538        let mut s = String::new();
539        let mut is_float = false;
540
541        while let Some(ch) = self.peek() {
542            if ch.is_ascii_digit() {
543                s.push(self.advance() as char);
544            } else if ch == b'.' {
545                // Check if next char after dot is a digit (float), otherwise it's a clause terminator
546                if let Some(next) = self.peek_at(1) {
547                    if next.is_ascii_digit() {
548                        is_float = true;
549                        s.push(self.advance() as char); // consume .
550                        while let Some(d) = self.peek() {
551                            if d.is_ascii_digit() {
552                                s.push(self.advance() as char);
553                            } else {
554                                break;
555                            }
556                        }
557                    } else {
558                        break; // dot is clause terminator
559                    }
560                } else {
561                    break; // dot at EOF
562                }
563            } else {
564                break;
565            }
566        }
567
568        if is_float {
569            let val: f64 = s
570                .parse()
571                .map_err(|e| format!("Invalid float '{}': {}", s, e))?;
572            if val.is_infinite() {
573                return Err(format!(
574                    "Float literal '{}' overflows f64 at line {} col {}",
575                    s, line, col
576                ));
577            }
578            Ok(Token {
579                kind: TokenKind::Float(val),
580                line,
581                col,
582            })
583        } else {
584            let val: i64 = s
585                .parse()
586                .map_err(|e| format!("Invalid integer '{}': {}", s, e))?;
587            Ok(Token {
588                kind: TokenKind::Integer(val),
589                line,
590                col,
591            })
592        }
593    }
594
595    fn read_quoted_atom(&mut self, line: usize, col: usize) -> Result<Token, String> {
596        self.advance(); // skip opening quote
597        let mut s = String::new();
598        loop {
599            match self.peek() {
600                None => {
601                    return Err(format!(
602                        "Unterminated quoted atom at line {} col {}",
603                        line, col
604                    ))
605                }
606                Some(b'\'') => {
607                    self.advance();
608                    // Check for escaped quote ''
609                    if self.peek() == Some(b'\'') {
610                        s.push('\'');
611                        self.advance();
612                    } else {
613                        break;
614                    }
615                }
616                Some(b'\\') => {
617                    self.advance();
618                    match self.peek() {
619                        Some(b'\'') => {
620                            s.push('\'');
621                            self.advance();
622                        }
623                        Some(b'\\') => {
624                            s.push('\\');
625                            self.advance();
626                        }
627                        Some(b'n') => {
628                            s.push('\n');
629                            self.advance();
630                        }
631                        Some(b't') => {
632                            s.push('\t');
633                            self.advance();
634                        }
635                        Some(ch) => {
636                            s.push(ch as char);
637                            self.advance();
638                        }
639                        None => {
640                            return Err(format!(
641                                "Unterminated escape at line {} col {}",
642                                self.line, self.col
643                            ))
644                        }
645                    }
646                }
647                Some(ch) => {
648                    s.push(ch as char);
649                    self.advance();
650                }
651            }
652        }
653        Ok(Token {
654            kind: TokenKind::Atom(s),
655            line,
656            col,
657        })
658    }
659}
660
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `src` (panicking on a lexical error) and returns the token
    /// kinds with the trailing `Eof` removed.
    fn kinds_of(src: &str) -> Vec<TokenKind> {
        let mut kinds = Vec::new();
        for token in Tokenizer::tokenize(src).unwrap() {
            if token.kind != TokenKind::Eof {
                kinds.push(token.kind);
            }
        }
        kinds
    }

    /// Shorthand for an expected atom token.
    fn atom(name: &str) -> TokenKind {
        TokenKind::Atom(name.into())
    }

    /// Shorthand for an expected variable token.
    fn var(name: &str) -> TokenKind {
        TokenKind::Variable(name.into())
    }

    /// Shorthand for an expected integer token.
    fn int(value: i64) -> TokenKind {
        TokenKind::Integer(value)
    }

    #[test]
    fn test_atoms() {
        for name in vec!["hello", "foo_bar", "a123"] {
            assert_eq!(kinds_of(name), vec![atom(name)], "source: {:?}", name);
        }
    }

    #[test]
    fn test_quoted_atoms() {
        assert_eq!(kinds_of("'hello world'"), vec![atom("hello world")]);
        // A doubled quote inside a quoted atom stands for one quote.
        assert_eq!(kinds_of("'it''s'"), vec![atom("it's")]);
    }

    #[test]
    fn test_variables() {
        for name in vec!["X", "_foo", "_", "MyVar"] {
            assert_eq!(kinds_of(name), vec![var(name)], "source: {:?}", name);
        }
    }

    #[test]
    fn test_numbers() {
        assert_eq!(kinds_of("42"), vec![int(42)]);
        assert_eq!(kinds_of("3.14"), vec![TokenKind::Float(3.14)]);
        assert_eq!(kinds_of("0"), vec![int(0)]);
    }

    #[test]
    fn test_operators() {
        let cases = vec![
            (":-", TokenKind::Neck),
            ("?-", TokenKind::QueryOp),
            ("=", TokenKind::Equals),
            ("\\=", TokenKind::NotEquals),
            ("is", TokenKind::Is),
            ("<", TokenKind::Lt),
            (">", TokenKind::Gt),
            ("=<", TokenKind::Lte),
            (">=", TokenKind::Gte),
            ("=:=", TokenKind::ArithEq),
            ("=\\=", TokenKind::ArithNeq),
            ("\\+", TokenKind::Not),
        ];
        for (src, expected) in cases {
            assert_eq!(kinds_of(src), vec![expected], "source: {:?}", src);
        }
    }

    #[test]
    fn test_punctuation() {
        let expected = vec![
            TokenKind::LParen,
            TokenKind::RParen,
            TokenKind::Pipe,
            TokenKind::Comma,
            TokenKind::Dot,
        ];
        assert_eq!(kinds_of("( ) | , ."), expected);

        // With a space in between, the brackets are two tokens, not the `[]` atom.
        assert_eq!(
            kinds_of("[ ]"),
            vec![TokenKind::LBracket, TokenKind::RBracket]
        );
    }

    #[test]
    fn test_cut() {
        assert_eq!(kinds_of("!"), vec![TokenKind::Cut]);
    }

    #[test]
    fn test_clause() {
        let expected = vec![
            atom("parent"),
            TokenKind::LParen,
            atom("tom"),
            TokenKind::Comma,
            atom("mary"),
            TokenKind::RParen,
            TokenKind::Dot,
        ];
        assert_eq!(kinds_of("parent(tom, mary)."), expected);
    }

    #[test]
    fn test_rule() {
        let expected = vec![
            atom("happy"),
            TokenKind::LParen,
            var("X"),
            TokenKind::RParen,
            TokenKind::Neck,
            atom("likes"),
            TokenKind::LParen,
            var("X"),
            TokenKind::Comma,
            atom("food"),
            TokenKind::RParen,
            TokenKind::Dot,
        ];
        assert_eq!(kinds_of("happy(X) :- likes(X, food)."), expected);
    }

    #[test]
    fn test_arithmetic() {
        let expected = vec![
            var("X"),
            TokenKind::Is,
            int(2),
            TokenKind::Plus,
            int(3),
            TokenKind::Star,
            int(4),
            TokenKind::Dot,
        ];
        assert_eq!(kinds_of("X is 2 + 3 * 4."), expected);
    }

    #[test]
    fn test_line_comment() {
        assert_eq!(
            kinds_of("foo % this is a comment\nbar"),
            vec![atom("foo"), atom("bar")]
        );
    }

    #[test]
    fn test_block_comment() {
        assert_eq!(
            kinds_of("foo /* block */ bar"),
            vec![atom("foo"), atom("bar")]
        );
    }

    #[test]
    fn test_empty_list() {
        assert_eq!(kinds_of("[]"), vec![atom("[]")]);
    }

    #[test]
    fn test_list_syntax() {
        let expected = vec![
            TokenKind::LBracket,
            int(1),
            TokenKind::Comma,
            int(2),
            TokenKind::Comma,
            int(3),
            TokenKind::RBracket,
        ];
        assert_eq!(kinds_of("[1, 2, 3]"), expected);
    }

    #[test]
    fn test_minus_operator() {
        assert_eq!(kinds_of("5 - 3"), vec![int(5), TokenKind::Minus, int(3)]);
    }
}
848}