sieve/compiler/grammar/expr/
tokenizer.rs1use std::{
25 iter::{Enumerate, Peekable},
26 slice::Iter,
27};
28
29use crate::{compiler::Number, runtime::eval::IntoString};
30
31use super::{BinaryOperator, Token, UnaryOperator};
32
33pub(crate) struct Tokenizer<'x, F>
34where
35 F: Fn(&str, bool) -> Result<Token, String>,
36{
37 pub(crate) iter: Peekable<Enumerate<Iter<'x, u8>>>,
38 token_map: F,
39 buf: Vec<u8>,
40 depth: u32,
41 next_token: Vec<Token>,
42 has_number: bool,
43 has_dot: bool,
44 has_alpha: bool,
45 is_start: bool,
46 is_eof: bool,
47}
48
49impl<'x, F> Tokenizer<'x, F>
50where
51 F: Fn(&str, bool) -> Result<Token, String>,
52{
53 #[cfg(test)]
54 pub fn new(expr: &'x str, token_map: F) -> Self {
55 Self::from_iter(expr.as_bytes().iter().enumerate().peekable(), token_map)
56 }
57
58 #[allow(clippy::should_implement_trait)]
59 pub(crate) fn from_iter(iter: Peekable<Enumerate<Iter<'x, u8>>>, token_map: F) -> Self {
60 Self {
61 iter,
62 buf: Vec::new(),
63 depth: 0,
64 next_token: Vec::with_capacity(2),
65 has_number: false,
66 has_dot: false,
67 has_alpha: false,
68 is_start: true,
69 is_eof: false,
70 token_map,
71 }
72 }
73
74 #[allow(clippy::should_implement_trait)]
75 pub(crate) fn next(&mut self) -> Result<Option<Token>, String> {
76 if let Some(token) = self.next_token.pop() {
77 return Ok(Some(token));
78 } else if self.is_eof {
79 return Ok(None);
80 }
81
82 while let Some((_, &ch)) = self.iter.next() {
83 match ch {
84 b'A'..=b'Z' | b'a'..=b'z' | b'_' => {
85 self.buf.push(ch);
86 self.has_alpha = true;
87 }
88 b'0'..=b'9' => {
89 self.buf.push(ch);
90 self.has_number = true;
91 }
92 b'.' => {
93 self.buf.push(ch);
94 self.has_dot = true;
95 }
96 b'}' => {
97 self.is_eof = true;
98 break;
99 }
100 b'[' if matches!(self.buf.get(0..7), Some(b"header.")) => {
101 self.buf.push(ch);
102 }
103 b'-' if self.buf.last().map_or(false, |c| *c == b'[')
104 || matches!(self.buf.get(0..7), Some(b"header.")) =>
105 {
106 self.buf.push(ch);
107 }
108 b':' if self.buf.contains(&b'.') => {
109 self.buf.push(ch);
110 }
111 b']' if self.buf.contains(&b'[') => {
112 self.buf.push(b']');
113 }
114 b'*' if self.buf.last().map_or(false, |&c| c == b'[' || c == b'.') => {
115 self.buf.push(ch);
116 }
117 _ => {
118 let prev_token = if !self.buf.is_empty() {
119 self.is_start = false;
120 self.parse_buf()?.into()
121 } else {
122 None
123 };
124 let token = match ch {
125 b'&' => {
126 if matches!(self.iter.peek(), Some((_, b'&'))) {
127 self.iter.next();
128 }
129 Token::BinaryOperator(BinaryOperator::And)
130 }
131 b'|' => {
132 if matches!(self.iter.peek(), Some((_, b'|'))) {
133 self.iter.next();
134 }
135 Token::BinaryOperator(BinaryOperator::Or)
136 }
137 b'!' => {
138 if matches!(self.iter.peek(), Some((_, b'='))) {
139 self.iter.next();
140 Token::BinaryOperator(BinaryOperator::Ne)
141 } else {
142 Token::UnaryOperator(UnaryOperator::Not)
143 }
144 }
145 b'^' => Token::BinaryOperator(BinaryOperator::Xor),
146 b'(' => {
147 self.depth += 1;
148 Token::OpenParen
149 }
150 b')' => {
151 if self.depth == 0 {
152 return Err("Unmatched close parenthesis".to_string());
153 }
154 self.depth -= 1;
155 Token::CloseParen
156 }
157 b'+' => Token::BinaryOperator(BinaryOperator::Add),
158 b'*' => Token::BinaryOperator(BinaryOperator::Multiply),
159 b'/' => Token::BinaryOperator(BinaryOperator::Divide),
160 b'-' => {
161 if self.is_start {
162 Token::UnaryOperator(UnaryOperator::Minus)
163 } else {
164 Token::BinaryOperator(BinaryOperator::Subtract)
165 }
166 }
167 b'=' => match self.iter.next() {
168 Some((_, b'=')) => Token::BinaryOperator(BinaryOperator::Eq),
169 Some((_, b'>')) => Token::BinaryOperator(BinaryOperator::Ge),
170 Some((_, b'<')) => Token::BinaryOperator(BinaryOperator::Le),
171 _ => Token::BinaryOperator(BinaryOperator::Eq),
172 },
173 b'>' => match self.iter.peek() {
174 Some((_, b'=')) => {
175 self.iter.next();
176 Token::BinaryOperator(BinaryOperator::Ge)
177 }
178 _ => Token::BinaryOperator(BinaryOperator::Gt),
179 },
180 b'<' => match self.iter.peek() {
181 Some((_, b'=')) => {
182 self.iter.next();
183 Token::BinaryOperator(BinaryOperator::Le)
184 }
185 _ => Token::BinaryOperator(BinaryOperator::Lt),
186 },
187 b',' => Token::Comma,
188 b'[' => Token::OpenBracket,
189 b']' => Token::CloseBracket,
190 b' ' | b'\r' | b'\n' => {
191 if prev_token.is_some() {
192 return Ok(prev_token);
193 } else {
194 continue;
195 }
196 }
197 b'\"' | b'\'' => {
198 let mut buf = Vec::with_capacity(16);
199 let stop_ch = ch;
200 let mut last_ch = 0;
201 let mut found_end = false;
202
203 for (_, &ch) in self.iter.by_ref() {
204 if last_ch != b'\\' {
205 if ch != stop_ch {
206 buf.push(ch);
207 } else {
208 found_end = true;
209 break;
210 }
211 } else {
212 match ch {
213 b'n' => {
214 buf.push(b'\n');
215 }
216 b'r' => {
217 buf.push(b'\r');
218 }
219 b't' => {
220 buf.push(b'\t');
221 }
222 _ => {
223 buf.push(ch);
224 }
225 }
226 }
227
228 last_ch = ch;
229 }
230
231 if found_end {
232 Token::String(
233 String::from_utf8(buf)
234 .map_err(|_| "Invalid UTF-8".to_string())?,
235 )
236 } else {
237 return Err("Unterminated string".to_string());
238 }
239 }
240 _ => {
241 return Err(format!("Invalid character {:?}", char::from(ch),));
242 }
243 };
244 self.is_start = matches!(
245 token,
246 Token::OpenParen | Token::Comma | Token::BinaryOperator(_)
247 );
248
249 return if prev_token.is_some() {
250 self.next_token.push(token);
251 Ok(prev_token)
252 } else {
253 Ok(Some(token))
254 };
255 }
256 }
257 }
258
259 if self.depth > 0 {
260 Err("Unmatched open parenthesis".to_string())
261 } else if !self.buf.is_empty() {
262 self.parse_buf().map(Some)
263 } else {
264 Ok(None)
265 }
266 }
267
268 fn parse_buf(&mut self) -> Result<Token, String> {
269 let buf = std::mem::take(&mut self.buf).into_string();
270 if self.has_number && !self.has_alpha {
271 self.has_number = false;
272 if self.has_dot {
273 self.has_dot = false;
274
275 buf.parse::<f64>()
276 .map(|f| Token::Number(Number::Float(f)))
277 .map_err(|_| format!("Invalid float value {}", buf,))
278 } else {
279 buf.parse::<i64>()
280 .map(|i| Token::Number(Number::Integer(i)))
281 .map_err(|_| format!("Invalid integer value {}", buf,))
282 }
283 } else {
284 let has_dot = self.has_dot;
285 let has_number = self.has_number;
286
287 self.has_alpha = false;
288 self.has_number = false;
289 self.has_dot = false;
290
291 if !has_number && !has_dot && [4, 5].contains(&buf.len()) {
292 if buf == "true" {
293 return Ok(Token::Number(Number::Integer(1)));
294 } else if buf == "false" {
295 return Ok(Token::Number(Number::Integer(0)));
296 }
297 }
298
299 (self.token_map)(&buf, has_dot)
300 }
301 }
302}