sieve/compiler/grammar/expr/
tokenizer.rs1use std::{
8 iter::{Enumerate, Peekable},
9 slice::Iter,
10};
11
12use crate::{compiler::Number, runtime::eval::IntoString};
13
14use super::{BinaryOperator, Token, UnaryOperator};
15
/// Streaming tokenizer for expressions, reading from a peekable byte iterator.
///
/// `F` is the callback used to resolve any word that is neither a numeric
/// literal nor the `true`/`false` keywords; it receives the word and a flag
/// telling whether the word contained a `.`.
pub(crate) struct Tokenizer<'x, F>
where
    F: Fn(&str, bool) -> Result<Token, String>,
{
    /// Source bytes paired with their index; peeked to recognise two-byte operators.
    pub(crate) iter: Peekable<Enumerate<Iter<'x, u8>>>,
    /// Callback that turns a buffered word into a token (called from `parse_buf`).
    token_map: F,
    /// Bytes of the word (identifier or number) currently being accumulated.
    buf: Vec<u8>,
    /// Number of parentheses opened and not yet closed.
    depth: u32,
    /// Tokens already produced but not yet returned; `next` pops from the end.
    next_token: Vec<Token>,
    /// The current word contains at least one ASCII digit.
    has_number: bool,
    /// The current word contains at least one `.`.
    has_dot: bool,
    /// The current word contains at least one ASCII letter or `_`.
    has_alpha: bool,
    /// When true, a `-` is tokenized as unary minus instead of subtraction.
    /// True initially and after `(`, `,` or a binary operator.
    is_start: bool,
    /// Set once a `}` has been read; no further input is consumed afterwards.
    is_eof: bool,
}
31
32impl<'x, F> Tokenizer<'x, F>
33where
34 F: Fn(&str, bool) -> Result<Token, String>,
35{
36 #[cfg(test)]
37 pub fn new(expr: &'x str, token_map: F) -> Self {
38 Self::from_iter(expr.as_bytes().iter().enumerate().peekable(), token_map)
39 }
40
41 #[allow(clippy::should_implement_trait)]
42 pub(crate) fn from_iter(iter: Peekable<Enumerate<Iter<'x, u8>>>, token_map: F) -> Self {
43 Self {
44 iter,
45 buf: Vec::new(),
46 depth: 0,
47 next_token: Vec::with_capacity(2),
48 has_number: false,
49 has_dot: false,
50 has_alpha: false,
51 is_start: true,
52 is_eof: false,
53 token_map,
54 }
55 }
56
57 #[allow(clippy::should_implement_trait)]
58 pub(crate) fn next(&mut self) -> Result<Option<Token>, String> {
59 if let Some(token) = self.next_token.pop() {
60 return Ok(Some(token));
61 } else if self.is_eof {
62 return Ok(None);
63 }
64
65 while let Some((_, &ch)) = self.iter.next() {
66 match ch {
67 b'A'..=b'Z' | b'a'..=b'z' | b'_' => {
68 self.buf.push(ch);
69 self.has_alpha = true;
70 }
71 b'0'..=b'9' => {
72 self.buf.push(ch);
73 self.has_number = true;
74 }
75 b'.' => {
76 self.buf.push(ch);
77 self.has_dot = true;
78 }
79 b'}' => {
80 self.is_eof = true;
81 break;
82 }
83 b'[' if matches!(self.buf.get(0..7), Some(b"header.")) => {
84 self.buf.push(ch);
85 }
86 b'-' if self.buf.last().is_some_and( |c| *c == b'[')
87 || matches!(self.buf.get(0..7), Some(b"header.")) =>
88 {
89 self.buf.push(ch);
90 }
91 b':' if self.buf.contains(&b'.') => {
92 self.buf.push(ch);
93 }
94 b']' if self.buf.contains(&b'[') => {
95 self.buf.push(b']');
96 }
97 b'*' if self.buf.last().is_some_and( |&c| c == b'[' || c == b'.') => {
98 self.buf.push(ch);
99 }
100 _ => {
101 let prev_token = if !self.buf.is_empty() {
102 self.is_start = false;
103 self.parse_buf()?.into()
104 } else {
105 None
106 };
107 let token = match ch {
108 b'&' => {
109 if matches!(self.iter.peek(), Some((_, b'&'))) {
110 self.iter.next();
111 }
112 Token::BinaryOperator(BinaryOperator::And)
113 }
114 b'|' => {
115 if matches!(self.iter.peek(), Some((_, b'|'))) {
116 self.iter.next();
117 }
118 Token::BinaryOperator(BinaryOperator::Or)
119 }
120 b'!' => {
121 if matches!(self.iter.peek(), Some((_, b'='))) {
122 self.iter.next();
123 Token::BinaryOperator(BinaryOperator::Ne)
124 } else {
125 Token::UnaryOperator(UnaryOperator::Not)
126 }
127 }
128 b'^' => Token::BinaryOperator(BinaryOperator::Xor),
129 b'(' => {
130 self.depth += 1;
131 Token::OpenParen
132 }
133 b')' => {
134 if self.depth == 0 {
135 return Err("Unmatched close parenthesis".to_string());
136 }
137 self.depth -= 1;
138 Token::CloseParen
139 }
140 b'+' => Token::BinaryOperator(BinaryOperator::Add),
141 b'*' => Token::BinaryOperator(BinaryOperator::Multiply),
142 b'/' => Token::BinaryOperator(BinaryOperator::Divide),
143 b'-' => {
144 if self.is_start {
145 Token::UnaryOperator(UnaryOperator::Minus)
146 } else {
147 Token::BinaryOperator(BinaryOperator::Subtract)
148 }
149 }
150 b'=' => match self.iter.next() {
151 Some((_, b'=')) => Token::BinaryOperator(BinaryOperator::Eq),
152 Some((_, b'>')) => Token::BinaryOperator(BinaryOperator::Ge),
153 Some((_, b'<')) => Token::BinaryOperator(BinaryOperator::Le),
154 _ => Token::BinaryOperator(BinaryOperator::Eq),
155 },
156 b'>' => match self.iter.peek() {
157 Some((_, b'=')) => {
158 self.iter.next();
159 Token::BinaryOperator(BinaryOperator::Ge)
160 }
161 _ => Token::BinaryOperator(BinaryOperator::Gt),
162 },
163 b'<' => match self.iter.peek() {
164 Some((_, b'=')) => {
165 self.iter.next();
166 Token::BinaryOperator(BinaryOperator::Le)
167 }
168 _ => Token::BinaryOperator(BinaryOperator::Lt),
169 },
170 b',' => Token::Comma,
171 b'[' => Token::OpenBracket,
172 b']' => Token::CloseBracket,
173 b' ' | b'\r' | b'\n' => {
174 if prev_token.is_some() {
175 return Ok(prev_token);
176 } else {
177 continue;
178 }
179 }
180 b'\"' | b'\'' => {
181 let mut buf = Vec::with_capacity(16);
182 let stop_ch = ch;
183 let mut last_ch = 0;
184 let mut found_end = false;
185
186 for (_, &ch) in self.iter.by_ref() {
187 if last_ch != b'\\' {
188 if ch != stop_ch {
189 buf.push(ch);
190 } else {
191 found_end = true;
192 break;
193 }
194 } else {
195 match ch {
196 b'n' => {
197 buf.push(b'\n');
198 }
199 b'r' => {
200 buf.push(b'\r');
201 }
202 b't' => {
203 buf.push(b'\t');
204 }
205 _ => {
206 buf.push(ch);
207 }
208 }
209 }
210
211 last_ch = ch;
212 }
213
214 if found_end {
215 Token::String(
216 String::from_utf8(buf)
217 .map_err(|_| "Invalid UTF-8".to_string())?,
218 )
219 } else {
220 return Err("Unterminated string".to_string());
221 }
222 }
223 _ => {
224 return Err(format!("Invalid character {:?}", char::from(ch),));
225 }
226 };
227 self.is_start = matches!(
228 token,
229 Token::OpenParen | Token::Comma | Token::BinaryOperator(_)
230 );
231
232 return if prev_token.is_some() {
233 self.next_token.push(token);
234 Ok(prev_token)
235 } else {
236 Ok(Some(token))
237 };
238 }
239 }
240 }
241
242 if self.depth > 0 {
243 Err("Unmatched open parenthesis".to_string())
244 } else if !self.buf.is_empty() {
245 self.parse_buf().map(Some)
246 } else {
247 Ok(None)
248 }
249 }
250
251 fn parse_buf(&mut self) -> Result<Token, String> {
252 let buf = std::mem::take(&mut self.buf).into_string();
253 if self.has_number && !self.has_alpha {
254 self.has_number = false;
255 if self.has_dot {
256 self.has_dot = false;
257
258 buf.parse::<f64>()
259 .map(|f| Token::Number(Number::Float(f)))
260 .map_err(|_| format!("Invalid float value {}", buf,))
261 } else {
262 buf.parse::<i64>()
263 .map(|i| Token::Number(Number::Integer(i)))
264 .map_err(|_| format!("Invalid integer value {}", buf,))
265 }
266 } else {
267 let has_dot = self.has_dot;
268 let has_number = self.has_number;
269
270 self.has_alpha = false;
271 self.has_number = false;
272 self.has_dot = false;
273
274 if !has_number && !has_dot && [4, 5].contains(&buf.len()) {
275 if buf == "true" {
276 return Ok(Token::Number(Number::Integer(1)));
277 } else if buf == "false" {
278 return Ok(Token::Number(Number::Integer(0)));
279 }
280 }
281
282 (self.token_map)(&buf, has_dot)
283 }
284 }
285}