patch_prolog_frontend/tokenizer/
mod.rs1mod chars;
14mod numbers;
15mod quoted;
16mod symbols;
17mod token;
18
19pub use token::{Token, TokenKind};
20
21use crate::parse_error::ParseError;
22use plg_shared::Span;
23
/// Byte-oriented tokenizer state over a borrowed source string.
///
/// The source is scanned as raw bytes; `line` and `col` are kept alongside the
/// byte offset so tokens can carry a human-readable position.
#[derive(Debug, Clone)]
pub struct Tokenizer<'a> {
    /// Bytes of the source text being tokenized.
    input: &'a [u8],
    /// Byte offset of the next unread byte in `input`.
    pos: usize,
    /// Line of the next unread byte; starts at 1 and grows after each `\n`.
    line: usize,
    /// Column of the next unread byte; starts at 1 and is counted in bytes.
    col: usize,
}
30
31impl<'a> Tokenizer<'a> {
32 pub fn new(input: &'a str) -> Self {
33 Tokenizer {
34 input: input.as_bytes(),
35 pos: 0,
36 line: 1,
37 col: 1,
38 }
39 }
40
41 pub fn tokenize(input: &str) -> Result<Vec<Token>, ParseError> {
42 let mut tok = Tokenizer::new(input);
43 let mut tokens = Vec::new();
44 loop {
45 let t = tok.next_token()?;
46 if t.kind == TokenKind::Eof {
47 tokens.push(t);
48 break;
49 }
50 tokens.push(t);
51 }
52 Ok(tokens)
53 }
54
55 pub(super) fn peek(&self) -> Option<u8> {
56 if self.pos < self.input.len() {
57 Some(self.input[self.pos])
58 } else {
59 None
60 }
61 }
62
63 pub(super) fn peek_at(&self, offset: usize) -> Option<u8> {
64 let idx = self.pos + offset;
65 if idx < self.input.len() {
66 Some(self.input[idx])
67 } else {
68 None
69 }
70 }
71
72 pub(super) fn advance(&mut self) -> u8 {
73 let ch = self.input[self.pos];
74 self.pos += 1;
75 if ch == b'\n' {
76 self.line += 1;
77 self.col = 1;
78 } else {
79 self.col += 1;
80 }
81 ch
82 }
83
84 fn skip_whitespace(&mut self) {
85 while let Some(ch) = self.peek() {
86 match ch {
87 b' ' | b'\t' | b'\r' | b'\n' => {
88 self.advance();
89 }
90 b'%' => {
91 while let Some(ch) = self.peek() {
93 if ch == b'\n' {
94 break;
95 }
96 self.advance();
97 }
98 }
99 b'/' if self.peek_at(1) == Some(b'*') => {
100 self.advance(); self.advance(); loop {
104 match self.peek() {
105 None => break,
106 Some(b'*') if self.peek_at(1) == Some(b'/') => {
107 self.advance();
108 self.advance();
109 break;
110 }
111 _ => {
112 self.advance();
113 }
114 }
115 }
116 }
117 _ => break,
118 }
119 }
120 }
121
122 fn next_token(&mut self) -> Result<Token, ParseError> {
123 self.skip_whitespace();
124 let lo = self.pos as u32;
125 let mut token = self.next_token_inner()?;
126 token.lo = lo;
129 token.hi = self.pos as u32;
130 Ok(token)
131 }
132
133 fn next_token_inner(&mut self) -> Result<Token, ParseError> {
134 let line = self.line;
135 let col = self.col;
136
137 let ch = match self.peek() {
138 None => return Ok(Token::new(TokenKind::Eof, line, col)),
139 Some(ch) => ch,
140 };
141
142 match ch {
143 b'(' => self.single(TokenKind::LParen, line, col),
144 b')' => self.single(TokenKind::RParen, line, col),
145 b'[' => {
146 self.advance();
147 if self.peek() == Some(b']') {
149 self.advance();
150 Ok(Token::new(TokenKind::Atom("[]".into()), line, col))
151 } else {
152 Ok(Token::new(TokenKind::LBracket, line, col))
153 }
154 }
155 b']' => self.single(TokenKind::RBracket, line, col),
156 b'|' => self.single(TokenKind::Pipe, line, col),
157 b',' => self.single(TokenKind::Comma, line, col),
158 b'!' => self.single(TokenKind::Cut, line, col),
159 b';' => self.single(TokenKind::Semicolon, line, col),
160 b'.' => {
161 self.single(TokenKind::Dot, line, col)
164 }
165
166 b':' | b'?' | b'=' | b'\\' | b'<' | b'>' | b'@' | b'+' | b'*' | b'^' | b'/' | b'-' => {
167 self.read_symbol(ch, line, col)
168 }
169
170 b'\'' => self.read_quoted_atom(line, col),
171
172 b'0'..=b'9' => self.read_number(line, col),
173
174 b'a'..=b'z' => self.read_atom(line, col),
175
176 b'A'..=b'Z' | b'_' => self.read_variable(line, col),
177
178 _ => {
179 let lo = self.pos as u32;
180 self.advance();
181 Err(ParseError::new(
182 format!("Unexpected character '{}'", ch as char),
183 Span::new(0, lo, self.pos as u32),
184 ))
185 }
186 }
187 }
188
189 fn lex_error(&self, message: impl Into<String>) -> ParseError {
192 ParseError::new(message, Span::point(0, self.pos as u32))
193 }
194
195 fn single(&mut self, kind: TokenKind, line: usize, col: usize) -> Result<Token, ParseError> {
197 self.advance();
198 Ok(Token::new(kind, line, col))
199 }
200}
201
202#[cfg(test)]
203mod tests;