Skip to main content

patch_prolog_frontend/tokenizer/
mod.rs

1//! Tokenizer for ISO Prolog source.
2//!
3//! Ported from patch-prolog's `tokenizer.rs`, split into focused submodules:
4//! - [`token`]: `TokenKind` / `Token` value types and their `Display`.
5//! - [`chars`]: unquoted atoms and variables.
6//! - [`numbers`]: integer / float literals.
7//! - [`quoted`]: single-quoted atoms.
8//! - [`symbols`]: multi-character symbolic operator dispatch.
9//!
10//! The driver ([`Tokenizer::next_token`]) handles whitespace/comments,
11//! single-character punctuation, and dispatches everything else.
12
13mod chars;
14mod numbers;
15mod quoted;
16mod symbols;
17mod token;
18
19pub use token::{Token, TokenKind};
20
21use crate::parse_error::ParseError;
22use plg_shared::Span;
23
/// Byte-oriented scanner over Prolog source text.
///
/// Borrows the input for `'a` and keeps a cursor plus a human-readable
/// line/column position that is updated as bytes are consumed.
#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    /// The source text viewed as raw bytes (obtained from a `&str` in
    /// `Tokenizer::new`).
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Current line number; starts at 1 and is bumped on every `\n` consumed.
    line: usize,
    /// Current column; starts at 1, resets to 1 after `\n`, and otherwise
    /// advances by one per byte consumed (bytes, not characters).
    col: usize,
}
30
31impl<'a> Tokenizer<'a> {
32    pub fn new(input: &'a str) -> Self {
33        Tokenizer {
34            input: input.as_bytes(),
35            pos: 0,
36            line: 1,
37            col: 1,
38        }
39    }
40
41    pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
42        let mut tok = Tokenizer::new(input);
43        let mut tokens = Vec::new();
44        loop {
45            let t = tok.next_token()?;
46            if t.kind == TokenKind::Eof {
47                tokens.push(t);
48                break;
49            }
50            tokens.push(t);
51        }
52        Ok(tokens)
53    }
54
55    pub(super) fn peek(&self) -> Option<u8> {
56        if self.pos < self.input.len() {
57            Some(self.input[self.pos])
58        } else {
59            None
60        }
61    }
62
63    pub(super) fn peek_at(&self, offset: usize) -> Option<u8> {
64        let idx = self.pos + offset;
65        if idx < self.input.len() {
66            Some(self.input[idx])
67        } else {
68            None
69        }
70    }
71
72    pub(super) fn advance(&mut self) -> u8 {
73        let ch = self.input[self.pos];
74        self.pos += 1;
75        if ch == b'\n' {
76            self.line += 1;
77            self.col = 1;
78        } else {
79            self.col += 1;
80        }
81        ch
82    }
83
84    fn skip_whitespace(&mut self) {
85        while let Some(ch) = self.peek() {
86            match ch {
87                b' ' | b'\t' | b'\r' | b'\n' => {
88                    self.advance();
89                }
90                b'%' => {
91                    // Line comment
92                    while let Some(ch) = self.peek() {
93                        if ch == b'\n' {
94                            break;
95                        }
96                        self.advance();
97                    }
98                }
99                b'/' if self.peek_at(1) == Some(b'*') => {
100                    // Block comment
101                    self.advance(); // /
102                    self.advance(); // *
103                    loop {
104                        match self.peek() {
105                            None => break,
106                            Some(b'*') if self.peek_at(1) == Some(b'/') => {
107                                self.advance();
108                                self.advance();
109                                break;
110                            }
111                            _ => {
112                                self.advance();
113                            }
114                        }
115                    }
116                }
117                _ => break,
118            }
119        }
120    }
121
122    fn next_token(&mut self) -> Result<Token, ParseError> {
123        self.skip_whitespace();
124        let lo = self.pos as u32;
125        let mut token = self.next_token_inner()?;
126        // Stamp byte offsets once, at the single dispatch point, so the
127        // per-kind helpers don't each have to track them.
128        token.lo = lo;
129        token.hi = self.pos as u32;
130        Ok(token)
131    }
132
133    fn next_token_inner(&mut self) -> Result<Token, ParseError> {
134        let line = self.line;
135        let col = self.col;
136
137        let ch = match self.peek() {
138            None => return Ok(Token::new(TokenKind::Eof, line, col)),
139            Some(ch) => ch,
140        };
141
142        match ch {
143            b'(' => self.single(TokenKind::LParen, line, col),
144            b')' => self.single(TokenKind::RParen, line, col),
145            b'[' => {
146                self.advance();
147                // Check for []
148                if self.peek() == Some(b']') {
149                    self.advance();
150                    Ok(Token::new(TokenKind::Atom("[]".into()), line, col))
151                } else {
152                    Ok(Token::new(TokenKind::LBracket, line, col))
153                }
154            }
155            b']' => self.single(TokenKind::RBracket, line, col),
156            b'|' => self.single(TokenKind::Pipe, line, col),
157            b',' => self.single(TokenKind::Comma, line, col),
158            b'!' => self.single(TokenKind::Cut, line, col),
159            b';' => self.single(TokenKind::Semicolon, line, col),
160            b'.' => {
161                // A bare `.` is the clause terminator. Numbers handle their own
162                // fractional dot in `read_number`.
163                self.single(TokenKind::Dot, line, col)
164            }
165
166            b':' | b'?' | b'=' | b'\\' | b'<' | b'>' | b'@' | b'+' | b'*' | b'^' | b'/' | b'-' => {
167                self.read_symbol(ch, line, col)
168            }
169
170            b'\'' => self.read_quoted_atom(line, col),
171
172            b'0'..=b'9' => self.read_number(line, col),
173
174            b'a'..=b'z' => self.read_atom(line, col),
175
176            b'A'..=b'Z' | b'_' => self.read_variable(line, col),
177
178            _ => {
179                let lo = self.pos as u32;
180                self.advance();
181                Err(ParseError::new(
182                    format!("Unexpected character '{}'", ch as char),
183                    Span::new(0, lo, self.pos as u32),
184                ))
185            }
186        }
187    }
188
189    /// Build a lexer error pointing at the current byte position (where the
190    /// scanner stalled). Used for end-of-input and bad-token cases.
191    fn lex_error(&self, message: impl Into<String>) -> ParseError {
192        ParseError::new(message, Span::point(0, self.pos as u32))
193    }
194
195    /// Consume one byte and emit a fixed single-character token.
196    fn single(&mut self, kind: TokenKind, line: usize, col: usize) -> Result<Token, ParseError> {
197        self.advance();
198        Ok(Token::new(kind, line, col))
199    }
200}
201
202#[cfg(test)]
203mod tests;