Skip to main content

dynoxide/expressions/
tokenizer.rs

1//! Shared tokenizer for all DynamoDB expression types.
2
3use std::fmt;
4
5/// Byte-offset span of a token (or tokenizer error) into the original source string.
6/// Spans are byte offsets, not char indices; UpdateExpression and ProjectionExpression
7/// callers slice the original source via `&source[span.start..span.start + span.len]`
8/// to build the AWS-style `near: "..."` window in error messages.
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub struct TokenSpan {
11    pub start: usize,
12    pub len: usize,
13}
14
15impl TokenSpan {
16    pub fn new(start: usize, len: usize) -> Self {
17        Self { start, len }
18    }
19
20    pub fn end(self) -> usize {
21        self.start + self.len
22    }
23}
24
25/// Structured tokenizer error carrying the byte position of the offending input,
26/// so callers (UpdateExpression, ProjectionExpression) can build their own
27/// `near: "..."` window from the original source rather than just a flat message.
28#[derive(Debug, Clone)]
29pub struct TokenizeError {
30    pub message: String,
31    pub position: usize,
32    pub bad_len: usize,
33}
34
35impl fmt::Display for TokenizeError {
36    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
37        write!(f, "{}", self.message)
38    }
39}
40
41#[derive(Debug, Clone, PartialEq)]
42pub enum Token {
43    // Identifiers and references
44    Identifier(String), // attribute name (e.g., `pk`, `myAttr`)
45    NameRef(String),    // #name reference
46    ValueRef(String),   // :value reference
47
48    // Operators
49    Eq,    // =
50    Ne,    // <>
51    Lt,    // <
52    Le,    // <=
53    Gt,    // >
54    Ge,    // >=
55    Plus,  // +
56    Minus, // -
57
58    // Keywords (case-insensitive)
59    And,
60    Or,
61    Not,
62    Between,
63    In,
64    Set,
65    Remove,
66    Add,
67    Delete,
68
69    // Punctuation
70    LParen,   // (
71    RParen,   // )
72    LBracket, // [
73    RBracket, // ]
74    Dot,      // .
75    Comma,    // ,
76
77    // Literals
78    Number(String), // numeric literal in brackets [0], [1]
79}
80
81impl fmt::Display for Token {
82    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
83        match self {
84            Token::Identifier(s) => write!(f, "{s}"),
85            Token::NameRef(s) => write!(f, "{s}"),
86            Token::ValueRef(s) => write!(f, "{s}"),
87            Token::Eq => write!(f, "="),
88            Token::Ne => write!(f, "<>"),
89            Token::Lt => write!(f, "<"),
90            Token::Le => write!(f, "<="),
91            Token::Gt => write!(f, ">"),
92            Token::Ge => write!(f, ">="),
93            Token::Plus => write!(f, "+"),
94            Token::Minus => write!(f, "-"),
95            Token::And => write!(f, "AND"),
96            Token::Or => write!(f, "OR"),
97            Token::Not => write!(f, "NOT"),
98            Token::Between => write!(f, "BETWEEN"),
99            Token::In => write!(f, "IN"),
100            Token::Set => write!(f, "SET"),
101            Token::Remove => write!(f, "REMOVE"),
102            Token::Add => write!(f, "ADD"),
103            Token::Delete => write!(f, "DELETE"),
104            Token::LParen => write!(f, "("),
105            Token::RParen => write!(f, ")"),
106            Token::LBracket => write!(f, "["),
107            Token::RBracket => write!(f, "]"),
108            Token::Dot => write!(f, "."),
109            Token::Comma => write!(f, ","),
110            Token::Number(n) => write!(f, "{n}"),
111        }
112    }
113}
114
115/// Tokenize a DynamoDB expression string, returning each token together with its
116/// byte-offset span into the original input. Callers use the spans to build
117/// `near: "..."` windows in syntax error messages.
118pub fn tokenize(input: &str) -> Result<Vec<(Token, TokenSpan)>, TokenizeError> {
119    let mut tokens: Vec<(Token, TokenSpan)> = Vec::new();
120    let bytes = input.as_bytes();
121    let mut i = 0;
122
123    while i < bytes.len() {
124        let start = i;
125        let c = bytes[i] as char;
126
127        // Skip whitespace
128        if c.is_whitespace() {
129            i += 1;
130            continue;
131        }
132
133        match c {
134            // Attribute name reference: #name
135            '#' => {
136                i += 1;
137                let name_start = i;
138                while i < bytes.len() && is_name_char(bytes[i] as char) {
139                    i += 1;
140                }
141                if i == name_start {
142                    return Err(TokenizeError {
143                        message: "Syntax error; token: \"#\"".to_string(),
144                        position: start,
145                        bad_len: 1,
146                    });
147                }
148                let name = &input[name_start..i];
149                tokens.push((
150                    Token::NameRef(format!("#{name}")),
151                    TokenSpan::new(start, i - start),
152                ));
153            }
154
155            // Attribute value reference: :value
156            ':' => {
157                i += 1;
158                let name_start = i;
159                while i < bytes.len() && is_name_char(bytes[i] as char) {
160                    i += 1;
161                }
162                if i == name_start {
163                    return Err(TokenizeError {
164                        message: "Syntax error; token: \":\"".to_string(),
165                        position: start,
166                        bad_len: 1,
167                    });
168                }
169                let name = &input[name_start..i];
170                tokens.push((
171                    Token::ValueRef(format!(":{name}")),
172                    TokenSpan::new(start, i - start),
173                ));
174            }
175
176            // Comparison operators
177            '<' => {
178                i += 1;
179                if i < bytes.len() && bytes[i] as char == '>' {
180                    i += 1;
181                    tokens.push((Token::Ne, TokenSpan::new(start, 2)));
182                } else if i < bytes.len() && bytes[i] as char == '=' {
183                    i += 1;
184                    tokens.push((Token::Le, TokenSpan::new(start, 2)));
185                } else {
186                    tokens.push((Token::Lt, TokenSpan::new(start, 1)));
187                }
188            }
189
190            '>' => {
191                i += 1;
192                if i < bytes.len() && bytes[i] as char == '=' {
193                    i += 1;
194                    tokens.push((Token::Ge, TokenSpan::new(start, 2)));
195                } else {
196                    tokens.push((Token::Gt, TokenSpan::new(start, 1)));
197                }
198            }
199
200            '=' => {
201                i += 1;
202                tokens.push((Token::Eq, TokenSpan::new(start, 1)));
203            }
204            '+' => {
205                i += 1;
206                tokens.push((Token::Plus, TokenSpan::new(start, 1)));
207            }
208            '-' => {
209                i += 1;
210                tokens.push((Token::Minus, TokenSpan::new(start, 1)));
211            }
212
213            // Punctuation
214            '(' => {
215                i += 1;
216                tokens.push((Token::LParen, TokenSpan::new(start, 1)));
217            }
218            ')' => {
219                i += 1;
220                tokens.push((Token::RParen, TokenSpan::new(start, 1)));
221            }
222            '[' => {
223                i += 1;
224                tokens.push((Token::LBracket, TokenSpan::new(start, 1)));
225                // Try to read a number inside brackets
226                let num_start = i;
227                while i < bytes.len() && (bytes[i] as char).is_ascii_digit() {
228                    i += 1;
229                }
230                if i > num_start {
231                    let num = input[num_start..i].to_string();
232                    tokens.push((Token::Number(num), TokenSpan::new(num_start, i - num_start)));
233                }
234            }
235            ']' => {
236                i += 1;
237                tokens.push((Token::RBracket, TokenSpan::new(start, 1)));
238            }
239            '.' => {
240                i += 1;
241                tokens.push((Token::Dot, TokenSpan::new(start, 1)));
242            }
243            ',' => {
244                i += 1;
245                tokens.push((Token::Comma, TokenSpan::new(start, 1)));
246            }
247
248            // Identifiers and keywords
249            c if is_ident_start(c) => {
250                let ident_start = i;
251                while i < bytes.len() && is_name_char(bytes[i] as char) {
252                    i += 1;
253                }
254                let word = &input[ident_start..i];
255                let token = match word.to_uppercase().as_str() {
256                    "AND" => Token::And,
257                    "OR" => Token::Or,
258                    "NOT" => Token::Not,
259                    "BETWEEN" => Token::Between,
260                    "IN" => Token::In,
261                    "SET" => Token::Set,
262                    "REMOVE" => Token::Remove,
263                    "ADD" => Token::Add,
264                    "DELETE" => Token::Delete,
265                    _ => Token::Identifier(word.to_string()),
266                };
267                tokens.push((token, TokenSpan::new(ident_start, i - ident_start)));
268            }
269
270            c => {
271                return Err(TokenizeError {
272                    message: format!("Syntax error; token: \"{c}\""),
273                    position: start,
274                    bad_len: c.len_utf8(),
275                });
276            }
277        }
278    }
279
280    Ok(tokens)
281}
282
283fn is_ident_start(c: char) -> bool {
284    c.is_ascii_alphabetic() || c == '_'
285}
286
287fn is_name_char(c: char) -> bool {
288    c.is_ascii_alphanumeric() || c == '_'
289}
290
291/// A cursor over a token stream for parsing.
292pub struct TokenStream {
293    tokens: Vec<(Token, TokenSpan)>,
294    pos: usize,
295}
296
297impl TokenStream {
298    pub fn new(tokens: Vec<(Token, TokenSpan)>) -> Self {
299        Self { tokens, pos: 0 }
300    }
301
302    pub fn peek(&self) -> Option<&Token> {
303        self.tokens.get(self.pos).map(|(t, _)| t)
304    }
305
306    /// Span of the token currently at `pos` (the next call to `peek`/`next` would return it).
307    pub fn peek_span(&self) -> Option<TokenSpan> {
308        self.tokens.get(self.pos).map(|(_, s)| *s)
309    }
310
311    /// Span of the token most recently returned by `next` (i.e. at `pos - 1`).
312    pub fn current_span(&self) -> Option<TokenSpan> {
313        if self.pos == 0 {
314            None
315        } else {
316            self.tokens.get(self.pos - 1).map(|(_, s)| *s)
317        }
318    }
319
320    #[allow(clippy::should_implement_trait)]
321    pub fn next(&mut self) -> Option<&Token> {
322        let token = self.tokens.get(self.pos).map(|(t, _)| t);
323        if token.is_some() {
324            self.pos += 1;
325        }
326        token
327    }
328
329    pub fn expect(&mut self, expected: &Token) -> Result<(), String> {
330        match self.next() {
331            Some(t) if t == expected => Ok(()),
332            Some(t) => Err(format!("Expected {expected}, got {t}")),
333            None => Err(format!("Expected {expected}, got end of expression")),
334        }
335    }
336
337    pub fn at_end(&self) -> bool {
338        self.pos >= self.tokens.len()
339    }
340
341    pub fn position(&self) -> usize {
342        self.pos
343    }
344
345    /// Get the current position (alias for `position`).
346    pub fn pos(&self) -> usize {
347        self.pos
348    }
349
350    /// Set the stream position (used for backtracking).
351    pub fn set_pos(&mut self, pos: usize) {
352        self.pos = pos;
353    }
354}
355
356/// Build the AWS-style `near: "..."` window for a parser-level syntax error
357/// (such as UpdateExpression's "unexpected leading token"), where the offending
358/// token has a known span and the window extends to include the next token if
359/// one is present. The returned window is the original source slice from the
360/// offending token's start to the next token's end (or the offending token's
361/// end if no following token exists).
362pub fn near_window_parser(source: &str, offending: TokenSpan, next: Option<TokenSpan>) -> &str {
363    let end = match next {
364        Some(span) => span.end(),
365        None => offending.end(),
366    };
367    let end = end.min(source.len());
368    &source[offending.start..end]
369}
370
371/// Build the AWS-style `near: "..."` window for a tokenizer-level error (such
372/// as ProjectionExpression's stray `!`), where the failure point is a single
373/// byte position in the source. The window captures the offending byte plus
374/// at most one more contiguous non-whitespace byte, capped at the original
375/// AWS-observed shape (e.g. `!!!` -> `!!`).
376pub fn near_window_tokenizer(source: &str, position: usize) -> &str {
377    let bytes = source.as_bytes();
378    let mut end = position + 1;
379    if end <= bytes.len() && end < bytes.len() && !(bytes[end] as char).is_whitespace() {
380        end += 1;
381    }
382    let end = end.min(bytes.len());
383    &source[position..end]
384}
385
386#[cfg(test)]
387mod tests {
388    use super::*;
389
390    fn just_tokens(input: &str) -> Vec<Token> {
391        tokenize(input)
392            .unwrap()
393            .into_iter()
394            .map(|(t, _)| t)
395            .collect()
396    }
397
398    #[test]
399    fn test_tokenize_simple_condition() {
400        assert_eq!(
401            just_tokens("#status = :val"),
402            vec![
403                Token::NameRef("#status".into()),
404                Token::Eq,
405                Token::ValueRef(":val".into()),
406            ]
407        );
408    }
409
410    #[test]
411    fn test_tokenize_comparison_operators() {
412        let tokens = just_tokens("a < b");
413        assert!(matches!(tokens[1], Token::Lt));
414
415        let tokens = just_tokens("a <= b");
416        assert!(matches!(tokens[1], Token::Le));
417
418        let tokens = just_tokens("a > b");
419        assert!(matches!(tokens[1], Token::Gt));
420
421        let tokens = just_tokens("a >= b");
422        assert!(matches!(tokens[1], Token::Ge));
423
424        let tokens = just_tokens("a <> b");
425        assert!(matches!(tokens[1], Token::Ne));
426    }
427
428    #[test]
429    fn test_tokenize_keywords() {
430        let tokens = just_tokens("a AND b OR NOT c BETWEEN d IN e");
431        assert!(matches!(tokens[1], Token::And));
432        assert!(matches!(tokens[3], Token::Or));
433        assert!(matches!(tokens[4], Token::Not));
434        assert!(matches!(tokens[6], Token::Between));
435        assert!(matches!(tokens[8], Token::In));
436    }
437
438    #[test]
439    fn test_tokenize_update_keywords() {
440        let tokens = just_tokens("SET a = :v REMOVE b ADD c :d DELETE e :f");
441        assert!(matches!(tokens[0], Token::Set));
442        assert!(matches!(tokens[4], Token::Remove));
443        assert!(matches!(tokens[6], Token::Add));
444        assert!(matches!(tokens[9], Token::Delete));
445    }
446
447    #[test]
448    fn test_tokenize_path_expression() {
449        assert_eq!(
450            just_tokens("a.b[0].c"),
451            vec![
452                Token::Identifier("a".into()),
453                Token::Dot,
454                Token::Identifier("b".into()),
455                Token::LBracket,
456                Token::Number("0".into()),
457                Token::RBracket,
458                Token::Dot,
459                Token::Identifier("c".into()),
460            ]
461        );
462    }
463
464    #[test]
465    fn test_tokenize_function_call() {
466        assert_eq!(
467            just_tokens("attribute_exists(#name)"),
468            vec![
469                Token::Identifier("attribute_exists".into()),
470                Token::LParen,
471                Token::NameRef("#name".into()),
472                Token::RParen,
473            ]
474        );
475    }
476
477    #[test]
478    fn test_tokenize_arithmetic() {
479        let tokens = just_tokens("Price + :inc");
480        assert!(matches!(tokens[1], Token::Plus));
481
482        let tokens = just_tokens("Price - :dec");
483        assert!(matches!(tokens[1], Token::Minus));
484    }
485
486    #[test]
487    fn test_tokenize_case_insensitive_keywords() {
488        let tokens = just_tokens("set AND or");
489        assert!(matches!(tokens[0], Token::Set));
490        assert!(matches!(tokens[1], Token::And));
491        assert!(matches!(tokens[2], Token::Or));
492    }
493
494    #[test]
495    fn test_tokenize_returns_byte_spans() {
496        let tokens = tokenize("INVALID SYNTAX HERE").unwrap();
497        assert_eq!(tokens.len(), 3);
498        assert_eq!(tokens[0].1, TokenSpan::new(0, 7));
499        assert_eq!(tokens[1].1, TokenSpan::new(8, 6));
500        assert_eq!(tokens[2].1, TokenSpan::new(15, 4));
501    }
502
503    #[test]
504    fn test_tokenize_error_carries_position() {
505        let err = tokenize("!!!").unwrap_err();
506        assert_eq!(err.message, "Syntax error; token: \"!\"");
507        assert_eq!(err.position, 0);
508    }
509
510    #[test]
511    fn test_near_window_parser_uses_next_token() {
512        let source = "INVALID SYNTAX HERE";
513        let offending = TokenSpan::new(0, 7);
514        let next = Some(TokenSpan::new(8, 6));
515        assert_eq!(
516            near_window_parser(source, offending, next),
517            "INVALID SYNTAX"
518        );
519    }
520
521    #[test]
522    fn test_near_window_parser_falls_back_to_offending_when_no_next() {
523        let source = "BARE";
524        let offending = TokenSpan::new(0, 4);
525        assert_eq!(near_window_parser(source, offending, None), "BARE");
526    }
527
528    #[test]
529    fn test_near_window_tokenizer_extends_one_char() {
530        assert_eq!(near_window_tokenizer("!!! INVALID !!!", 0), "!!");
531    }
532
533    #[test]
534    fn test_near_window_tokenizer_stops_at_whitespace() {
535        // Only one offending char before whitespace; window stays single-char.
536        assert_eq!(near_window_tokenizer("! foo", 0), "!");
537    }
538}