Skip to main content

dynoxide/expressions/
tokenizer.rs

1//! Shared tokenizer for all DynamoDB expression types.
2
3use std::fmt;
4
/// A single lexical unit produced by [`tokenize`].
///
/// `Eq` is derived alongside `PartialEq` because every payload is a `String`,
/// so equality is total; this lets tokens be used wherever an `Eq` bound is
/// required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    // Identifiers and references
    /// Bare attribute name or function name (e.g., `pk`, `myAttr`).
    Identifier(String),
    /// Expression attribute name placeholder; the stored text keeps the leading `#`.
    NameRef(String),
    /// Expression attribute value placeholder; the stored text keeps the leading `:`.
    ValueRef(String),

    // Operators
    /// `=`
    Eq,
    /// `<>`
    Ne,
    /// `<`
    Lt,
    /// `<=`
    Le,
    /// `>`
    Gt,
    /// `>=`
    Ge,
    /// `+`
    Plus,
    /// `-`
    Minus,

    // Keywords (matched case-insensitively by the tokenizer)
    /// `AND`
    And,
    /// `OR`
    Or,
    /// `NOT`
    Not,
    /// `BETWEEN`
    Between,
    /// `IN`
    In,
    /// `SET`
    Set,
    /// `REMOVE`
    Remove,
    /// `ADD`
    Add,
    /// `DELETE`
    Delete,

    // Punctuation
    /// `(`
    LParen,
    /// `)`
    RParen,
    /// `[`
    LBracket,
    /// `]`
    RBracket,
    /// `.`
    Dot,
    /// `,`
    Comma,

    // Literals
    /// Run of ASCII digits found directly after `[` (a list index such as `[0]`),
    /// kept as the original digit text.
    Number(String),
}
44
45impl fmt::Display for Token {
46    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
47        match self {
48            Token::Identifier(s) => write!(f, "{s}"),
49            Token::NameRef(s) => write!(f, "{s}"),
50            Token::ValueRef(s) => write!(f, "{s}"),
51            Token::Eq => write!(f, "="),
52            Token::Ne => write!(f, "<>"),
53            Token::Lt => write!(f, "<"),
54            Token::Le => write!(f, "<="),
55            Token::Gt => write!(f, ">"),
56            Token::Ge => write!(f, ">="),
57            Token::Plus => write!(f, "+"),
58            Token::Minus => write!(f, "-"),
59            Token::And => write!(f, "AND"),
60            Token::Or => write!(f, "OR"),
61            Token::Not => write!(f, "NOT"),
62            Token::Between => write!(f, "BETWEEN"),
63            Token::In => write!(f, "IN"),
64            Token::Set => write!(f, "SET"),
65            Token::Remove => write!(f, "REMOVE"),
66            Token::Add => write!(f, "ADD"),
67            Token::Delete => write!(f, "DELETE"),
68            Token::LParen => write!(f, "("),
69            Token::RParen => write!(f, ")"),
70            Token::LBracket => write!(f, "["),
71            Token::RBracket => write!(f, "]"),
72            Token::Dot => write!(f, "."),
73            Token::Comma => write!(f, ","),
74            Token::Number(n) => write!(f, "{n}"),
75        }
76    }
77}
78
79/// Tokenize a DynamoDB expression string.
80pub fn tokenize(input: &str) -> Result<Vec<Token>, String> {
81    let mut tokens = Vec::new();
82    let chars: Vec<char> = input.chars().collect();
83    let mut i = 0;
84
85    while i < chars.len() {
86        // Skip whitespace
87        if chars[i].is_whitespace() {
88            i += 1;
89            continue;
90        }
91
92        match chars[i] {
93            // Attribute name reference: #name
94            '#' => {
95                i += 1;
96                let start = i;
97                while i < chars.len() && is_name_char(chars[i]) {
98                    i += 1;
99                }
100                if i == start {
101                    return Err("Syntax error; token: \"#\"".to_string());
102                }
103                let name: String = chars[start..i].iter().collect();
104                tokens.push(Token::NameRef(format!("#{name}")));
105            }
106
107            // Attribute value reference: :value
108            ':' => {
109                i += 1;
110                let start = i;
111                while i < chars.len() && is_name_char(chars[i]) {
112                    i += 1;
113                }
114                if i == start {
115                    return Err("Syntax error; token: \":\"".to_string());
116                }
117                let name: String = chars[start..i].iter().collect();
118                tokens.push(Token::ValueRef(format!(":{name}")));
119            }
120
121            // Comparison operators
122            '<' => {
123                i += 1;
124                if i < chars.len() && chars[i] == '>' {
125                    tokens.push(Token::Ne);
126                    i += 1;
127                } else if i < chars.len() && chars[i] == '=' {
128                    tokens.push(Token::Le);
129                    i += 1;
130                } else {
131                    tokens.push(Token::Lt);
132                }
133            }
134
135            '>' => {
136                i += 1;
137                if i < chars.len() && chars[i] == '=' {
138                    tokens.push(Token::Ge);
139                    i += 1;
140                } else {
141                    tokens.push(Token::Gt);
142                }
143            }
144
145            '=' => {
146                tokens.push(Token::Eq);
147                i += 1;
148            }
149
150            '+' => {
151                tokens.push(Token::Plus);
152                i += 1;
153            }
154            '-' => {
155                tokens.push(Token::Minus);
156                i += 1;
157            }
158
159            // Punctuation
160            '(' => {
161                tokens.push(Token::LParen);
162                i += 1;
163            }
164            ')' => {
165                tokens.push(Token::RParen);
166                i += 1;
167            }
168            '[' => {
169                // Parse bracket with numeric index
170                tokens.push(Token::LBracket);
171                i += 1;
172                // Try to read a number inside brackets
173                let start = i;
174                while i < chars.len() && chars[i].is_ascii_digit() {
175                    i += 1;
176                }
177                if i > start {
178                    let num: String = chars[start..i].iter().collect();
179                    tokens.push(Token::Number(num));
180                }
181            }
182            ']' => {
183                tokens.push(Token::RBracket);
184                i += 1;
185            }
186            '.' => {
187                tokens.push(Token::Dot);
188                i += 1;
189            }
190            ',' => {
191                tokens.push(Token::Comma);
192                i += 1;
193            }
194
195            // Identifiers and keywords
196            c if is_ident_start(c) => {
197                let start = i;
198                while i < chars.len() && is_name_char(chars[i]) {
199                    i += 1;
200                }
201                let word: String = chars[start..i].iter().collect();
202                let token = match word.to_uppercase().as_str() {
203                    "AND" => Token::And,
204                    "OR" => Token::Or,
205                    "NOT" => Token::Not,
206                    "BETWEEN" => Token::Between,
207                    "IN" => Token::In,
208                    "SET" => Token::Set,
209                    "REMOVE" => Token::Remove,
210                    "ADD" => Token::Add,
211                    "DELETE" => Token::Delete,
212                    _ => Token::Identifier(word),
213                };
214                tokens.push(token);
215            }
216
217            c => {
218                return Err(format!("Syntax error; token: \"{c}\""));
219            }
220        }
221    }
222
223    Ok(tokens)
224}
225
/// Whether `c` may begin a bare identifier: an ASCII letter or an underscore.
fn is_ident_start(c: char) -> bool {
    matches!(c, 'a'..='z' | 'A'..='Z' | '_')
}
229
/// Whether `c` may continue a name: an ASCII letter, an ASCII digit, or an underscore.
fn is_name_char(c: char) -> bool {
    matches!(c, '0'..='9' | 'a'..='z' | 'A'..='Z' | '_')
}
233
234/// A cursor over a token stream for parsing.
235pub struct TokenStream {
236    tokens: Vec<Token>,
237    pos: usize,
238}
239
240impl TokenStream {
241    pub fn new(tokens: Vec<Token>) -> Self {
242        Self { tokens, pos: 0 }
243    }
244
245    pub fn peek(&self) -> Option<&Token> {
246        self.tokens.get(self.pos)
247    }
248
249    #[allow(clippy::should_implement_trait)]
250    pub fn next(&mut self) -> Option<&Token> {
251        let token = self.tokens.get(self.pos);
252        if token.is_some() {
253            self.pos += 1;
254        }
255        token
256    }
257
258    pub fn expect(&mut self, expected: &Token) -> Result<(), String> {
259        match self.next() {
260            Some(t) if t == expected => Ok(()),
261            Some(t) => Err(format!("Expected {expected}, got {t}")),
262            None => Err(format!("Expected {expected}, got end of expression")),
263        }
264    }
265
266    pub fn at_end(&self) -> bool {
267        self.pos >= self.tokens.len()
268    }
269
270    pub fn position(&self) -> usize {
271        self.pos
272    }
273
274    /// Get the current position (alias for `position`).
275    pub fn pos(&self) -> usize {
276        self.pos
277    }
278
279    /// Set the stream position (used for backtracking).
280    pub fn set_pos(&mut self, pos: usize) {
281        self.pos = pos;
282    }
283}
284
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenize `src`, panicking if the tokenizer rejects it.
    fn lex(src: &str) -> Vec<Token> {
        tokenize(src).unwrap()
    }

    /// Shorthand for building an identifier token.
    fn ident(name: &str) -> Token {
        Token::Identifier(name.into())
    }

    #[test]
    fn test_tokenize_simple_condition() {
        let expected = vec![
            Token::NameRef("#status".into()),
            Token::Eq,
            Token::ValueRef(":val".into()),
        ];
        assert_eq!(lex("#status = :val"), expected);
    }

    #[test]
    fn test_tokenize_comparison_operators() {
        // Each expression has the operator as its second token.
        let cases = vec![
            ("a < b", Token::Lt),
            ("a <= b", Token::Le),
            ("a > b", Token::Gt),
            ("a >= b", Token::Ge),
            ("a <> b", Token::Ne),
        ];
        for (src, operator) in cases {
            assert_eq!(lex(src)[1], operator);
        }
    }

    #[test]
    fn test_tokenize_keywords() {
        let tokens = lex("a AND b OR NOT c BETWEEN d IN e");
        let expected = vec![
            (1, Token::And),
            (3, Token::Or),
            (4, Token::Not),
            (6, Token::Between),
            (8, Token::In),
        ];
        for (index, keyword) in expected {
            assert_eq!(tokens[index], keyword);
        }
    }

    #[test]
    fn test_tokenize_update_keywords() {
        let tokens = lex("SET a = :v REMOVE b ADD c :d DELETE e :f");
        let expected = vec![
            (0, Token::Set),
            (4, Token::Remove),
            (6, Token::Add),
            (9, Token::Delete),
        ];
        for (index, keyword) in expected {
            assert_eq!(tokens[index], keyword);
        }
    }

    #[test]
    fn test_tokenize_path_expression() {
        let expected = vec![
            ident("a"),
            Token::Dot,
            ident("b"),
            Token::LBracket,
            Token::Number("0".into()),
            Token::RBracket,
            Token::Dot,
            ident("c"),
        ];
        assert_eq!(lex("a.b[0].c"), expected);
    }

    #[test]
    fn test_tokenize_function_call() {
        let expected = vec![
            ident("attribute_exists"),
            Token::LParen,
            Token::NameRef("#name".into()),
            Token::RParen,
        ];
        assert_eq!(lex("attribute_exists(#name)"), expected);
    }

    #[test]
    fn test_tokenize_arithmetic() {
        assert_eq!(lex("Price + :inc")[1], Token::Plus);
        assert_eq!(lex("Price - :dec")[1], Token::Minus);
    }

    #[test]
    fn test_tokenize_case_insensitive_keywords() {
        // Mixed-case spellings must all resolve to keyword tokens.
        let tokens = lex("set AND or");
        assert_eq!(tokens[0], Token::Set);
        assert_eq!(tokens[1], Token::And);
        assert_eq!(tokens[2], Token::Or);
    }
}