1use std::fmt;
4
/// Location of a token inside the source text, stored as a start offset plus a length.
///
/// `tokenize` fills these in with byte offsets into its input string.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TokenSpan {
    /// Offset at which the token begins.
    pub start: usize,
    /// Number of units the token covers, counted from `start`.
    pub len: usize,
}

impl TokenSpan {
    /// Builds a span that begins at `start` and covers `len` units.
    pub fn new(start: usize, len: usize) -> Self {
        TokenSpan { start, len }
    }

    /// Returns the exclusive end offset of the span, i.e. `start + len`.
    pub fn end(self) -> usize {
        let TokenSpan { start, len } = self;
        start + len
    }
}
24
/// Error reported when the tokenizer meets text that cannot form a token.
///
/// Implements [`std::error::Error`], so it can be boxed as `dyn Error` and
/// propagated with `?` alongside other error types.
#[derive(Debug, Clone)]
pub struct TokenizeError {
    /// Human-readable description; this is exactly what `Display` prints.
    pub message: String,
    /// Byte offset in the input at which the offending text starts.
    pub position: usize,
    /// Length in bytes of the offending text.
    pub bad_len: usize,
}

impl fmt::Display for TokenizeError {
    /// Writes `message` verbatim; position and length are not included.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(&self.message)
    }
}

// No underlying cause to expose, so the default `source()` (returning `None`) is correct.
impl std::error::Error for TokenizeError {}
40
/// A lexical token of the expression language.
///
/// The `Display` implementation renders each token as its source spelling;
/// keywords are always rendered in upper case.
#[derive(Debug, Clone, PartialEq)]
pub enum Token {
    /// A bare identifier such as an attribute or function name.
    Identifier(String),
    /// A `#name` placeholder; the stored text includes the leading `#`.
    NameRef(String),
    /// A `:name` placeholder; the stored text includes the leading `:`.
    ValueRef(String),
    /// `=`
    Eq,
    /// `<>`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `+`
    Plus,
    /// `-`
    Minus,
    /// Keyword `AND`.
    And,
    /// Keyword `OR`.
    Or,
    /// Keyword `NOT`.
    Not,
    /// Keyword `BETWEEN`.
    Between,
    /// Keyword `IN`.
    In,
    /// Keyword `SET`.
    Set,
    /// Keyword `REMOVE`.
    Remove,
    /// Keyword `ADD`.
    Add,
    /// Keyword `DELETE`.
    Delete,

    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `.`
    Dot,
    /// `,`
    Comma,
    /// A run of digits, kept as the original text rather than parsed.
    Number(String),
}

impl fmt::Display for Token {
    /// Writes the token's textual form: payload text for the string-carrying
    /// variants, the fixed spelling for everything else.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let text: &str = match self {
            Token::Identifier(text)
            | Token::NameRef(text)
            | Token::ValueRef(text)
            | Token::Number(text) => text.as_str(),
            Token::Eq => "=",
            Token::Ne => "<>",
            Token::Lt => "<",
            Token::Le => "<=",
            Token::Gt => ">",
            Token::Ge => ">=",
            Token::Plus => "+",
            Token::Minus => "-",
            Token::And => "AND",
            Token::Or => "OR",
            Token::Not => "NOT",
            Token::Between => "BETWEEN",
            Token::In => "IN",
            Token::Set => "SET",
            Token::Remove => "REMOVE",
            Token::Add => "ADD",
            Token::Delete => "DELETE",
            Token::LParen => "(",
            Token::RParen => ")",
            Token::LBracket => "[",
            Token::RBracket => "]",
            Token::Dot => ".",
            Token::Comma => ",",
        };
        f.write_str(text)
    }
}
114
115pub fn tokenize(input: &str) -> Result<Vec<(Token, TokenSpan)>, TokenizeError> {
119 let mut tokens: Vec<(Token, TokenSpan)> = Vec::new();
120 let bytes = input.as_bytes();
121 let mut i = 0;
122
123 while i < bytes.len() {
124 let start = i;
125 let c = bytes[i] as char;
126
127 if c.is_whitespace() {
129 i += 1;
130 continue;
131 }
132
133 match c {
134 '#' => {
136 i += 1;
137 let name_start = i;
138 while i < bytes.len() && is_name_char(bytes[i] as char) {
139 i += 1;
140 }
141 if i == name_start {
142 return Err(TokenizeError {
143 message: "Syntax error; token: \"#\"".to_string(),
144 position: start,
145 bad_len: 1,
146 });
147 }
148 let name = &input[name_start..i];
149 tokens.push((
150 Token::NameRef(format!("#{name}")),
151 TokenSpan::new(start, i - start),
152 ));
153 }
154
155 ':' => {
157 i += 1;
158 let name_start = i;
159 while i < bytes.len() && is_name_char(bytes[i] as char) {
160 i += 1;
161 }
162 if i == name_start {
163 return Err(TokenizeError {
164 message: "Syntax error; token: \":\"".to_string(),
165 position: start,
166 bad_len: 1,
167 });
168 }
169 let name = &input[name_start..i];
170 tokens.push((
171 Token::ValueRef(format!(":{name}")),
172 TokenSpan::new(start, i - start),
173 ));
174 }
175
176 '<' => {
178 i += 1;
179 if i < bytes.len() && bytes[i] as char == '>' {
180 i += 1;
181 tokens.push((Token::Ne, TokenSpan::new(start, 2)));
182 } else if i < bytes.len() && bytes[i] as char == '=' {
183 i += 1;
184 tokens.push((Token::Le, TokenSpan::new(start, 2)));
185 } else {
186 tokens.push((Token::Lt, TokenSpan::new(start, 1)));
187 }
188 }
189
190 '>' => {
191 i += 1;
192 if i < bytes.len() && bytes[i] as char == '=' {
193 i += 1;
194 tokens.push((Token::Ge, TokenSpan::new(start, 2)));
195 } else {
196 tokens.push((Token::Gt, TokenSpan::new(start, 1)));
197 }
198 }
199
200 '=' => {
201 i += 1;
202 tokens.push((Token::Eq, TokenSpan::new(start, 1)));
203 }
204 '+' => {
205 i += 1;
206 tokens.push((Token::Plus, TokenSpan::new(start, 1)));
207 }
208 '-' => {
209 i += 1;
210 tokens.push((Token::Minus, TokenSpan::new(start, 1)));
211 }
212
213 '(' => {
215 i += 1;
216 tokens.push((Token::LParen, TokenSpan::new(start, 1)));
217 }
218 ')' => {
219 i += 1;
220 tokens.push((Token::RParen, TokenSpan::new(start, 1)));
221 }
222 '[' => {
223 i += 1;
224 tokens.push((Token::LBracket, TokenSpan::new(start, 1)));
225 let num_start = i;
227 while i < bytes.len() && (bytes[i] as char).is_ascii_digit() {
228 i += 1;
229 }
230 if i > num_start {
231 let num = input[num_start..i].to_string();
232 tokens.push((Token::Number(num), TokenSpan::new(num_start, i - num_start)));
233 }
234 }
235 ']' => {
236 i += 1;
237 tokens.push((Token::RBracket, TokenSpan::new(start, 1)));
238 }
239 '.' => {
240 i += 1;
241 tokens.push((Token::Dot, TokenSpan::new(start, 1)));
242 }
243 ',' => {
244 i += 1;
245 tokens.push((Token::Comma, TokenSpan::new(start, 1)));
246 }
247
248 c if is_ident_start(c) => {
250 let ident_start = i;
251 while i < bytes.len() && is_name_char(bytes[i] as char) {
252 i += 1;
253 }
254 let word = &input[ident_start..i];
255 let token = match word.to_uppercase().as_str() {
256 "AND" => Token::And,
257 "OR" => Token::Or,
258 "NOT" => Token::Not,
259 "BETWEEN" => Token::Between,
260 "IN" => Token::In,
261 "SET" => Token::Set,
262 "REMOVE" => Token::Remove,
263 "ADD" => Token::Add,
264 "DELETE" => Token::Delete,
265 _ => Token::Identifier(word.to_string()),
266 };
267 tokens.push((token, TokenSpan::new(ident_start, i - ident_start)));
268 }
269
270 c => {
271 return Err(TokenizeError {
272 message: format!("Syntax error; token: \"{c}\""),
273 position: start,
274 bad_len: c.len_utf8(),
275 });
276 }
277 }
278 }
279
280 Ok(tokens)
281}
282
/// Returns `true` if `c` may begin an identifier: an ASCII letter or an underscore.
fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
286
/// Returns `true` if `c` may appear inside a name: an ASCII letter, an ASCII digit,
/// or an underscore.
fn is_name_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_')
}
290
291pub struct TokenStream {
293 tokens: Vec<(Token, TokenSpan)>,
294 pos: usize,
295}
296
297impl TokenStream {
298 pub fn new(tokens: Vec<(Token, TokenSpan)>) -> Self {
299 Self { tokens, pos: 0 }
300 }
301
302 pub fn peek(&self) -> Option<&Token> {
303 self.tokens.get(self.pos).map(|(t, _)| t)
304 }
305
306 pub fn peek_span(&self) -> Option<TokenSpan> {
308 self.tokens.get(self.pos).map(|(_, s)| *s)
309 }
310
311 pub fn current_span(&self) -> Option<TokenSpan> {
313 if self.pos == 0 {
314 None
315 } else {
316 self.tokens.get(self.pos - 1).map(|(_, s)| *s)
317 }
318 }
319
320 #[allow(clippy::should_implement_trait)]
321 pub fn next(&mut self) -> Option<&Token> {
322 let token = self.tokens.get(self.pos).map(|(t, _)| t);
323 if token.is_some() {
324 self.pos += 1;
325 }
326 token
327 }
328
329 pub fn expect(&mut self, expected: &Token) -> Result<(), String> {
330 match self.next() {
331 Some(t) if t == expected => Ok(()),
332 Some(t) => Err(format!("Expected {expected}, got {t}")),
333 None => Err(format!("Expected {expected}, got end of expression")),
334 }
335 }
336
337 pub fn at_end(&self) -> bool {
338 self.pos >= self.tokens.len()
339 }
340
341 pub fn position(&self) -> usize {
342 self.pos
343 }
344
345 pub fn pos(&self) -> usize {
347 self.pos
348 }
349
350 pub fn set_pos(&mut self, pos: usize) {
352 self.pos = pos;
353 }
354}
355
356pub fn near_window_parser(source: &str, offending: TokenSpan, next: Option<TokenSpan>) -> &str {
363 let end = match next {
364 Some(span) => span.end(),
365 None => offending.end(),
366 };
367 let end = end.min(source.len());
368 &source[offending.start..end]
369}
370
/// Returns the slice of `source` to quote when reporting a tokenizer error.
///
/// The slice holds the character at byte offset `position` plus the character
/// after it, unless that following character is whitespace or does not exist.
///
/// The window is measured in whole characters, so multi-byte characters are never
/// split. An empty string is returned when `position` is at or past the end of
/// `source`, or does not fall on a character boundary.
pub fn near_window_tokenizer(source: &str, position: usize) -> &str {
    // `get` yields `None` for an out-of-range or mid-character offset, where
    // direct indexing would panic.
    let rest = match source.get(position..) {
        Some(rest) => rest,
        None => return "",
    };

    let mut chars = rest.chars();
    let offending_len = match chars.next() {
        Some(c) => c.len_utf8(),
        None => return "",
    };
    // Extend by one more character only when it is part of the same "word".
    let follower_len = match chars.next() {
        Some(c) if !c.is_whitespace() => c.len_utf8(),
        _ => 0,
    };

    &rest[..offending_len + follower_len]
}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `input`, panicking on failure, and keeps only the tokens.
    fn just_tokens(input: &str) -> Vec<Token> {
        let spanned = tokenize(input).unwrap();
        spanned.into_iter().map(|pair| pair.0).collect()
    }

    #[test]
    fn test_tokenize_simple_condition() {
        let expected = vec![
            Token::NameRef("#status".into()),
            Token::Eq,
            Token::ValueRef(":val".into()),
        ];
        assert_eq!(just_tokens("#status = :val"), expected);
    }

    #[test]
    fn test_tokenize_comparison_operators() {
        // Each input puts the operator under test at token index 1.
        let cases = [
            ("a < b", Token::Lt),
            ("a <= b", Token::Le),
            ("a > b", Token::Gt),
            ("a >= b", Token::Ge),
            ("a <> b", Token::Ne),
        ];
        for (input, operator) in cases {
            assert_eq!(just_tokens(input)[1], operator);
        }
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = just_tokens("a AND b OR NOT c BETWEEN d IN e");
        let keywords = [
            (1, Token::And),
            (3, Token::Or),
            (4, Token::Not),
            (6, Token::Between),
            (8, Token::In),
        ];
        for (index, keyword) in keywords {
            assert_eq!(tokens[index], keyword);
        }
    }

    #[test]
    fn test_tokenize_update_keywords() {
        let tokens = just_tokens("SET a = :v REMOVE b ADD c :d DELETE e :f");
        let keywords = [
            (0, Token::Set),
            (4, Token::Remove),
            (6, Token::Add),
            (9, Token::Delete),
        ];
        for (index, keyword) in keywords {
            assert_eq!(tokens[index], keyword);
        }
    }

    #[test]
    fn test_tokenize_path_expression() {
        let expected = vec![
            Token::Identifier("a".into()),
            Token::Dot,
            Token::Identifier("b".into()),
            Token::LBracket,
            Token::Number("0".into()),
            Token::RBracket,
            Token::Dot,
            Token::Identifier("c".into()),
        ];
        assert_eq!(just_tokens("a.b[0].c"), expected);
    }

    #[test]
    fn test_tokenize_function_call() {
        let expected = vec![
            Token::Identifier("attribute_exists".into()),
            Token::LParen,
            Token::NameRef("#name".into()),
            Token::RParen,
        ];
        assert_eq!(just_tokens("attribute_exists(#name)"), expected);
    }

    #[test]
    fn test_tokenize_arithmetic() {
        assert_eq!(just_tokens("Price + :inc")[1], Token::Plus);
        assert_eq!(just_tokens("Price - :dec")[1], Token::Minus);
    }

    #[test]
    fn test_tokenize_case_insensitive_keywords() {
        let tokens = just_tokens("set AND or");
        assert_eq!(tokens[0], Token::Set);
        assert_eq!(tokens[1], Token::And);
        assert_eq!(tokens[2], Token::Or);
    }

    #[test]
    fn test_tokenize_returns_byte_spans() {
        let spans: Vec<TokenSpan> = tokenize("INVALID SYNTAX HERE")
            .unwrap()
            .into_iter()
            .map(|pair| pair.1)
            .collect();
        assert_eq!(spans.len(), 3);
        assert_eq!(spans[0], TokenSpan::new(0, 7));
        assert_eq!(spans[1], TokenSpan::new(8, 6));
        assert_eq!(spans[2], TokenSpan::new(15, 4));
    }

    #[test]
    fn test_tokenize_error_carries_position() {
        let failure = tokenize("!!!").unwrap_err();
        assert_eq!(failure.message, "Syntax error; token: \"!\"");
        assert_eq!(failure.position, 0);
    }

    #[test]
    fn test_near_window_parser_uses_next_token() {
        let window = near_window_parser(
            "INVALID SYNTAX HERE",
            TokenSpan::new(0, 7),
            Some(TokenSpan::new(8, 6)),
        );
        assert_eq!(window, "INVALID SYNTAX");
    }

    #[test]
    fn test_near_window_parser_falls_back_to_offending_when_no_next() {
        let window = near_window_parser("BARE", TokenSpan::new(0, 4), None);
        assert_eq!(window, "BARE");
    }

    #[test]
    fn test_near_window_tokenizer_extends_one_char() {
        let window = near_window_tokenizer("!!! INVALID !!!", 0);
        assert_eq!(window, "!!");
    }

    #[test]
    fn test_near_window_tokenizer_stops_at_whitespace() {
        let window = near_window_tokenizer("! foo", 0);
        assert_eq!(window, "!");
    }
}