1use std::fmt;
4
5#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub struct TokenSpan {
11 pub start: usize,
12 pub len: usize,
13}
14
15impl TokenSpan {
16 pub fn new(start: usize, len: usize) -> Self {
17 Self { start, len }
18 }
19
20 pub fn end(self) -> usize {
21 self.start + self.len
22 }
23}
24
25#[derive(Debug, Clone)]
29pub struct TokenizeError {
30 pub message: String,
31 pub position: usize,
32 pub bad_len: usize,
33}
34
35impl fmt::Display for TokenizeError {
36 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37 write!(f, "{}", self.message)
38 }
39}
40
41#[derive(Debug, Clone, PartialEq)]
42pub enum Token {
43 Identifier(String), NameRef(String), ValueRef(String), Eq, Ne, Lt, Le, Gt, Ge, Plus, Minus, And,
60 Or,
61 Not,
62 Between,
63 In,
64 Set,
65 Remove,
66 Add,
67 Delete,
68
69 LParen, RParen, LBracket, RBracket, Dot, Comma, Number(String), }
80
81impl fmt::Display for Token {
82 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
83 match self {
84 Token::Identifier(s) => write!(f, "{s}"),
85 Token::NameRef(s) => write!(f, "{s}"),
86 Token::ValueRef(s) => write!(f, "{s}"),
87 Token::Eq => write!(f, "="),
88 Token::Ne => write!(f, "<>"),
89 Token::Lt => write!(f, "<"),
90 Token::Le => write!(f, "<="),
91 Token::Gt => write!(f, ">"),
92 Token::Ge => write!(f, ">="),
93 Token::Plus => write!(f, "+"),
94 Token::Minus => write!(f, "-"),
95 Token::And => write!(f, "AND"),
96 Token::Or => write!(f, "OR"),
97 Token::Not => write!(f, "NOT"),
98 Token::Between => write!(f, "BETWEEN"),
99 Token::In => write!(f, "IN"),
100 Token::Set => write!(f, "SET"),
101 Token::Remove => write!(f, "REMOVE"),
102 Token::Add => write!(f, "ADD"),
103 Token::Delete => write!(f, "DELETE"),
104 Token::LParen => write!(f, "("),
105 Token::RParen => write!(f, ")"),
106 Token::LBracket => write!(f, "["),
107 Token::RBracket => write!(f, "]"),
108 Token::Dot => write!(f, "."),
109 Token::Comma => write!(f, ","),
110 Token::Number(n) => write!(f, "{n}"),
111 }
112 }
113}
114
115pub fn tokenize(input: &str) -> Result<Vec<(Token, TokenSpan)>, TokenizeError> {
119 let mut tokens: Vec<(Token, TokenSpan)> = Vec::new();
120 let bytes = input.as_bytes();
121 let mut i = 0;
122
123 while i < bytes.len() {
124 let start = i;
125 let c = bytes[i] as char;
126
127 if c.is_whitespace() {
129 i += 1;
130 continue;
131 }
132
133 match c {
134 '#' => {
136 i += 1;
137 let name_start = i;
138 while i < bytes.len() && is_name_char(bytes[i] as char) {
139 i += 1;
140 }
141 if i == name_start {
142 return Err(TokenizeError {
143 message: "Syntax error; token: \"#\"".to_string(),
144 position: start,
145 bad_len: 1,
146 });
147 }
148 let name = &input[name_start..i];
149 tokens.push((
150 Token::NameRef(format!("#{name}")),
151 TokenSpan::new(start, i - start),
152 ));
153 }
154
155 ':' => {
157 i += 1;
158 let name_start = i;
159 while i < bytes.len() && is_name_char(bytes[i] as char) {
160 i += 1;
161 }
162 if i == name_start {
163 return Err(TokenizeError {
164 message: "Syntax error; token: \":\"".to_string(),
165 position: start,
166 bad_len: 1,
167 });
168 }
169 let name = &input[name_start..i];
170 tokens.push((
171 Token::ValueRef(format!(":{name}")),
172 TokenSpan::new(start, i - start),
173 ));
174 }
175
176 '<' => {
178 i += 1;
179 if i < bytes.len() && bytes[i] as char == '>' {
180 i += 1;
181 tokens.push((Token::Ne, TokenSpan::new(start, 2)));
182 } else if i < bytes.len() && bytes[i] as char == '=' {
183 i += 1;
184 tokens.push((Token::Le, TokenSpan::new(start, 2)));
185 } else {
186 tokens.push((Token::Lt, TokenSpan::new(start, 1)));
187 }
188 }
189
190 '>' => {
191 i += 1;
192 if i < bytes.len() && bytes[i] as char == '=' {
193 i += 1;
194 tokens.push((Token::Ge, TokenSpan::new(start, 2)));
195 } else {
196 tokens.push((Token::Gt, TokenSpan::new(start, 1)));
197 }
198 }
199
200 '=' => {
201 i += 1;
202 tokens.push((Token::Eq, TokenSpan::new(start, 1)));
203 }
204 '+' => {
205 i += 1;
206 tokens.push((Token::Plus, TokenSpan::new(start, 1)));
207 }
208 '-' => {
209 i += 1;
210 tokens.push((Token::Minus, TokenSpan::new(start, 1)));
211 }
212
213 '(' => {
215 i += 1;
216 tokens.push((Token::LParen, TokenSpan::new(start, 1)));
217 }
218 ')' => {
219 i += 1;
220 tokens.push((Token::RParen, TokenSpan::new(start, 1)));
221 }
222 '[' => {
223 i += 1;
224 tokens.push((Token::LBracket, TokenSpan::new(start, 1)));
225 let num_start = i;
227 while i < bytes.len() && (bytes[i] as char).is_ascii_digit() {
228 i += 1;
229 }
230 if i > num_start {
231 let num = input[num_start..i].to_string();
232 tokens.push((Token::Number(num), TokenSpan::new(num_start, i - num_start)));
233 }
234 }
235 ']' => {
236 i += 1;
237 tokens.push((Token::RBracket, TokenSpan::new(start, 1)));
238 }
239 '.' => {
240 i += 1;
241 tokens.push((Token::Dot, TokenSpan::new(start, 1)));
242 }
243 ',' => {
244 i += 1;
245 tokens.push((Token::Comma, TokenSpan::new(start, 1)));
246 }
247
248 c if is_ident_start(c) => {
250 let ident_start = i;
251 while i < bytes.len() && is_name_char(bytes[i] as char) {
252 i += 1;
253 }
254 let word = &input[ident_start..i];
255 let token = match word.to_uppercase().as_str() {
256 "AND" => Token::And,
257 "OR" => Token::Or,
258 "NOT" => Token::Not,
259 "BETWEEN" => Token::Between,
260 "IN" => Token::In,
261 "SET" => Token::Set,
262 "REMOVE" => Token::Remove,
263 "ADD" => Token::Add,
264 "DELETE" => Token::Delete,
265 _ => Token::Identifier(word.to_string()),
266 };
267 tokens.push((token, TokenSpan::new(ident_start, i - ident_start)));
268 }
269
270 c => {
271 return Err(TokenizeError {
272 message: format!("Syntax error; token: \"{c}\""),
273 position: start,
274 bad_len: c.len_utf8(),
275 });
276 }
277 }
278 }
279
280 Ok(tokens)
281}
282
283fn is_ident_start(c: char) -> bool {
284 c.is_ascii_alphabetic() || c == '_'
285}
286
287fn is_name_char(c: char) -> bool {
288 c.is_ascii_alphanumeric() || c == '_'
289}
290
291pub struct TokenStream {
293 tokens: Vec<(Token, TokenSpan)>,
294 pos: usize,
295}
296
297impl TokenStream {
298 pub fn new(tokens: Vec<(Token, TokenSpan)>) -> Self {
299 Self { tokens, pos: 0 }
300 }
301
302 pub fn peek(&self) -> Option<&Token> {
303 self.tokens.get(self.pos).map(|(t, _)| t)
304 }
305
306 pub fn peek_span(&self) -> Option<TokenSpan> {
308 self.tokens.get(self.pos).map(|(_, s)| *s)
309 }
310
311 pub fn current_span(&self) -> Option<TokenSpan> {
313 if self.pos == 0 {
314 None
315 } else {
316 self.tokens.get(self.pos - 1).map(|(_, s)| *s)
317 }
318 }
319
320 #[allow(clippy::should_implement_trait)]
321 pub fn next(&mut self) -> Option<&Token> {
322 let token = self.tokens.get(self.pos).map(|(t, _)| t);
323 if token.is_some() {
324 self.pos += 1;
325 }
326 token
327 }
328
329 pub fn expect(&mut self, expected: &Token) -> Result<(), String> {
330 match self.next() {
331 Some(t) if t == expected => Ok(()),
332 Some(t) => Err(format!("Expected {expected}, got {t}")),
333 None => Err(format!("Expected {expected}, got end of expression")),
334 }
335 }
336
337 pub fn at_end(&self) -> bool {
338 self.pos >= self.tokens.len()
339 }
340
341 pub fn position(&self) -> usize {
342 self.pos
343 }
344
345 pub fn pos(&self) -> usize {
347 self.pos
348 }
349
350 pub fn set_pos(&mut self, pos: usize) {
352 self.pos = pos;
353 }
354}
355
356pub fn near_window_parser(source: &str, offending: TokenSpan, next: Option<TokenSpan>) -> &str {
363 let end = match next {
364 Some(span) => span.end(),
365 None => offending.end(),
366 };
367 let end = end.min(source.len());
368 &source[offending.start..end]
369}
370
371pub fn near_window_tokenizer(source: &str, position: usize) -> &str {
377 let bytes = source.as_bytes();
378 let mut end = position + 1;
379 if end <= bytes.len() && end < bytes.len() && !(bytes[end] as char).is_whitespace() {
380 end += 1;
381 }
382 let end = end.min(bytes.len());
383 &source[position..end]
384}
385
386pub fn check_redundant_parens(tokens: &[(Token, TokenSpan)]) -> Result<(), String> {
393 let toks: Vec<&Token> = tokens.iter().map(|(t, _)| t).collect();
394 let match_close = |open: usize| -> Option<usize> {
395 let mut depth = 0;
396 for (j, tok) in toks.iter().enumerate().skip(open) {
397 match tok {
398 Token::LParen => depth += 1,
399 Token::RParen => {
400 depth -= 1;
401 if depth == 0 {
402 return Some(j);
403 }
404 }
405 _ => {}
406 }
407 }
408 None
409 };
410 for i in 0..toks.len() {
411 if !matches!(toks[i], Token::LParen) {
412 continue;
413 }
414 let Some(outer_close) = match_close(i) else {
415 continue;
416 };
417 if matches!(toks.get(i + 1), Some(Token::LParen)) {
421 if let Some(inner_close) = match_close(i + 1) {
422 if inner_close == outer_close - 1 {
423 return Err("The expression has redundant parentheses;".to_string());
424 }
425 }
426 }
427 }
428 Ok(())
429}
430
431#[cfg(test)]
432mod tests {
433 use super::*;
434
435 fn just_tokens(input: &str) -> Vec<Token> {
436 tokenize(input)
437 .unwrap()
438 .into_iter()
439 .map(|(t, _)| t)
440 .collect()
441 }
442
443 #[test]
444 fn test_tokenize_simple_condition() {
445 assert_eq!(
446 just_tokens("#status = :val"),
447 vec![
448 Token::NameRef("#status".into()),
449 Token::Eq,
450 Token::ValueRef(":val".into()),
451 ]
452 );
453 }
454
455 #[test]
456 fn test_tokenize_comparison_operators() {
457 let tokens = just_tokens("a < b");
458 assert!(matches!(tokens[1], Token::Lt));
459
460 let tokens = just_tokens("a <= b");
461 assert!(matches!(tokens[1], Token::Le));
462
463 let tokens = just_tokens("a > b");
464 assert!(matches!(tokens[1], Token::Gt));
465
466 let tokens = just_tokens("a >= b");
467 assert!(matches!(tokens[1], Token::Ge));
468
469 let tokens = just_tokens("a <> b");
470 assert!(matches!(tokens[1], Token::Ne));
471 }
472
473 #[test]
474 fn test_tokenize_keywords() {
475 let tokens = just_tokens("a AND b OR NOT c BETWEEN d IN e");
476 assert!(matches!(tokens[1], Token::And));
477 assert!(matches!(tokens[3], Token::Or));
478 assert!(matches!(tokens[4], Token::Not));
479 assert!(matches!(tokens[6], Token::Between));
480 assert!(matches!(tokens[8], Token::In));
481 }
482
483 #[test]
484 fn test_tokenize_update_keywords() {
485 let tokens = just_tokens("SET a = :v REMOVE b ADD c :d DELETE e :f");
486 assert!(matches!(tokens[0], Token::Set));
487 assert!(matches!(tokens[4], Token::Remove));
488 assert!(matches!(tokens[6], Token::Add));
489 assert!(matches!(tokens[9], Token::Delete));
490 }
491
492 #[test]
493 fn test_tokenize_path_expression() {
494 assert_eq!(
495 just_tokens("a.b[0].c"),
496 vec![
497 Token::Identifier("a".into()),
498 Token::Dot,
499 Token::Identifier("b".into()),
500 Token::LBracket,
501 Token::Number("0".into()),
502 Token::RBracket,
503 Token::Dot,
504 Token::Identifier("c".into()),
505 ]
506 );
507 }
508
509 #[test]
510 fn test_tokenize_function_call() {
511 assert_eq!(
512 just_tokens("attribute_exists(#name)"),
513 vec![
514 Token::Identifier("attribute_exists".into()),
515 Token::LParen,
516 Token::NameRef("#name".into()),
517 Token::RParen,
518 ]
519 );
520 }
521
522 #[test]
523 fn test_tokenize_arithmetic() {
524 let tokens = just_tokens("Price + :inc");
525 assert!(matches!(tokens[1], Token::Plus));
526
527 let tokens = just_tokens("Price - :dec");
528 assert!(matches!(tokens[1], Token::Minus));
529 }
530
531 #[test]
532 fn test_tokenize_case_insensitive_keywords() {
533 let tokens = just_tokens("set AND or");
534 assert!(matches!(tokens[0], Token::Set));
535 assert!(matches!(tokens[1], Token::And));
536 assert!(matches!(tokens[2], Token::Or));
537 }
538
539 #[test]
540 fn test_tokenize_returns_byte_spans() {
541 let tokens = tokenize("INVALID SYNTAX HERE").unwrap();
542 assert_eq!(tokens.len(), 3);
543 assert_eq!(tokens[0].1, TokenSpan::new(0, 7));
544 assert_eq!(tokens[1].1, TokenSpan::new(8, 6));
545 assert_eq!(tokens[2].1, TokenSpan::new(15, 4));
546 }
547
548 #[test]
549 fn test_tokenize_error_carries_position() {
550 let err = tokenize("!!!").unwrap_err();
551 assert_eq!(err.message, "Syntax error; token: \"!\"");
552 assert_eq!(err.position, 0);
553 }
554
555 #[test]
556 fn test_near_window_parser_uses_next_token() {
557 let source = "INVALID SYNTAX HERE";
558 let offending = TokenSpan::new(0, 7);
559 let next = Some(TokenSpan::new(8, 6));
560 assert_eq!(
561 near_window_parser(source, offending, next),
562 "INVALID SYNTAX"
563 );
564 }
565
566 #[test]
567 fn test_near_window_parser_falls_back_to_offending_when_no_next() {
568 let source = "BARE";
569 let offending = TokenSpan::new(0, 4);
570 assert_eq!(near_window_parser(source, offending, None), "BARE");
571 }
572
573 #[test]
574 fn test_near_window_tokenizer_extends_one_char() {
575 assert_eq!(near_window_tokenizer("!!! INVALID !!!", 0), "!!");
576 }
577
578 #[test]
579 fn test_near_window_tokenizer_stops_at_whitespace() {
580 assert_eq!(near_window_tokenizer("! foo", 0), "!");
582 }
583}