Skip to main content

powdb_query/
lexer.rs

1use crate::token::Token;
2
/// Maximum allowed size of a string literal: 16 MB (`16 * 1024 * 1024` bytes).
///
/// `lex` compares this against the UTF-8 byte length of a literal *after* its
/// escape sequences have been decoded, and rejects anything larger. The limit
/// exists to guard against queries that embed multi-gigabyte strings.
const MAX_STRING_LITERAL: usize = 16 * 1024 * 1024;
6
/// Error produced when the lexer cannot tokenize its input.
///
/// `Clone`, `PartialEq` and `Eq` are derived so that callers (and tests) can
/// store, duplicate and compare errors directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Where the problem was detected, as an index into the input's `char`s
    /// (Unicode scalar values) — not a byte offset.
    pub position: usize,
}
12
13impl std::fmt::Display for LexError {
14    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15        write!(f, "at position {}: {}", self.position, self.message)
16    }
17}
18
// No methods are overridden: `LexError` carries only a message and a position
// and wraps no underlying error, so the trait's defaults apply.
impl std::error::Error for LexError {}
20
21/// Tokenize a PowQL input string into a stream of tokens.
22///
23/// # Examples
24///
25/// ```
26/// use powdb_query::lexer::lex;
27/// use powdb_query::token::Token;
28///
29/// let tokens = lex("User filter .age > 30").unwrap();
30/// assert_eq!(tokens[0], Token::Ident("User".to_string()));
31/// assert_eq!(tokens[1], Token::Filter);
32/// assert_eq!(tokens[2], Token::DotIdent("age".to_string()));
33/// ```
34pub fn lex(input: &str) -> Result<Vec<Token>, LexError> {
35    let mut tokens = Vec::new();
36    let chars: Vec<char> = input.chars().collect();
37    let mut pos = 0;
38
39    while pos < chars.len() {
40        // Skip whitespace
41        if chars[pos].is_whitespace() {
42            pos += 1;
43            continue;
44        }
45
46        // Skip comments
47        if chars[pos] == '#' {
48            while pos < chars.len() && chars[pos] != '\n' {
49                pos += 1;
50            }
51            continue;
52        }
53
54        // Dot-ident: .fieldname
55        if chars[pos] == '.'
56            && pos + 1 < chars.len()
57            && (chars[pos + 1].is_alphabetic() || chars[pos + 1] == '_')
58        {
59            pos += 1; // skip dot
60            let start = pos;
61            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
62                pos += 1;
63            }
64            let name: String = chars[start..pos].iter().collect();
65            tokens.push(Token::DotIdent(name));
66            continue;
67        }
68
69        // Param: $name
70        if chars[pos] == '$' {
71            pos += 1;
72            let start = pos;
73            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
74                pos += 1;
75            }
76            let name: String = chars[start..pos].iter().collect();
77            tokens.push(Token::Param(name));
78            continue;
79        }
80
81        // String literal
82        if chars[pos] == '"' {
83            pos += 1;
84            let mut s = String::new();
85            while pos < chars.len() && chars[pos] != '"' {
86                if chars[pos] == '\\' && pos + 1 < chars.len() {
87                    match chars[pos + 1] {
88                        '"' => {
89                            s.push('"');
90                            pos += 2;
91                        }
92                        '\\' => {
93                            s.push('\\');
94                            pos += 2;
95                        }
96                        'n' => {
97                            s.push('\n');
98                            pos += 2;
99                        }
100                        't' => {
101                            s.push('\t');
102                            pos += 2;
103                        }
104                        _ => {
105                            s.push(chars[pos + 1]);
106                            pos += 2;
107                        }
108                    }
109                } else {
110                    s.push(chars[pos]);
111                    pos += 1;
112                }
113            }
114            if pos >= chars.len() {
115                return Err(LexError {
116                    message: "unterminated string".into(),
117                    position: pos,
118                });
119            }
120            pos += 1; // closing quote
121            if s.len() > MAX_STRING_LITERAL {
122                return Err(LexError {
123                    message: format!(
124                        "string literal exceeds maximum size of {}MB",
125                        MAX_STRING_LITERAL / (1024 * 1024)
126                    ),
127                    position: pos,
128                });
129            }
130            tokens.push(Token::StringLit(s));
131            continue;
132        }
133
134        // Number (int or float)
135        if chars[pos].is_ascii_digit()
136            || (chars[pos] == '-' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit())
137        {
138            let start = pos;
139            if chars[pos] == '-' {
140                pos += 1;
141            }
142            while pos < chars.len() && chars[pos].is_ascii_digit() {
143                pos += 1;
144            }
145            if pos < chars.len()
146                && chars[pos] == '.'
147                && pos + 1 < chars.len()
148                && chars[pos + 1].is_ascii_digit()
149            {
150                pos += 1;
151                while pos < chars.len() && chars[pos].is_ascii_digit() {
152                    pos += 1;
153                }
154                let s: String = chars[start..pos].iter().collect();
155                let value = s.parse::<f64>().map_err(|_| LexError {
156                    message: format!("float literal out of range: {s}"),
157                    position: start,
158                })?;
159                tokens.push(Token::FloatLit(value));
160            } else {
161                let s: String = chars[start..pos].iter().collect();
162                let value = s.parse::<i64>().map_err(|_| LexError {
163                    message: format!("integer literal out of range for i64: {s}"),
164                    position: start,
165                })?;
166                tokens.push(Token::IntLit(value));
167            }
168            continue;
169        }
170
171        // Identifiers and keywords
172        if chars[pos].is_alphabetic() || chars[pos] == '_' {
173            let start = pos;
174            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
175                pos += 1;
176            }
177            let word: String = chars[start..pos].iter().collect();
178            let token = match word.as_str() {
179                "type" => Token::Type,
180                "filter" => Token::Filter,
181                "order" => Token::Order,
182                "limit" => Token::Limit,
183                "offset" => Token::Offset,
184                "insert" => Token::Insert,
185                "update" => Token::Update,
186                "delete" => Token::Delete,
187                "upsert" => Token::Upsert,
188                "conflict" => Token::Conflict,
189                "select" => Token::Select,
190                "required" => Token::Required,
191                "multi" => Token::Multi,
192                "link" => Token::Link,
193                "index" => Token::Index,
194                "unique" => Token::Unique,
195                "on" => Token::On,
196                "asc" => Token::Asc,
197                "desc" => Token::Desc,
198                "and" => Token::And,
199                "or" => Token::Or,
200                "not" => Token::Not,
201                "exists" => Token::Exists,
202                "let" => Token::Let,
203                "as" => Token::As,
204                "match" => Token::Match,
205                "group" => Token::Group,
206                "join" => Token::Join,
207                "inner" => Token::Inner,
208                "left" => Token::LeftKw,
209                "right" => Token::RightKw,
210                "outer" => Token::Outer,
211                "cross" => Token::Cross,
212                "transaction" => Token::Transaction,
213                "begin" => Token::Begin,
214                "commit" => Token::Commit,
215                "rollback" => Token::Rollback,
216                "view" => Token::View,
217                "materialized" => Token::Materialized,
218                "materialize" => Token::Materialized,
219                "refresh" => Token::Refresh,
220                "union" => Token::Union,
221                "having" => Token::Having,
222                "distinct" => Token::Distinct,
223                "in" => Token::In,
224                "between" => Token::Between,
225                "like" => Token::Like,
226                "count" => Token::Count,
227                "avg" => Token::Avg,
228                "sum" => Token::Sum,
229                "min" => Token::Min,
230                "max" => Token::Max,
231                "is" => Token::Is,
232                "null" => Token::Null,
233                "upper" => Token::Upper,
234                "lower" => Token::Lower,
235                "length" => Token::Length,
236                "trim" => Token::Trim,
237                "substring" => Token::Substring,
238                "concat" => Token::Concat,
239                "abs" => Token::Abs,
240                "round" => Token::Round,
241                "ceil" => Token::Ceil,
242                "floor" => Token::Floor,
243                "sqrt" => Token::Sqrt,
244                "pow" => Token::Pow,
245                "now" => Token::Now,
246                "extract" => Token::Extract,
247                "date_add" => Token::DateAdd,
248                "date_diff" => Token::DateDiff,
249                "cast" => Token::Cast,
250                "case" => Token::Case,
251                "when" => Token::When,
252                "then" => Token::Then,
253                "else" => Token::Else,
254                "end" => Token::End,
255                "over" => Token::Over,
256                "partition" => Token::Partition,
257                "row_number" => Token::RowNumber,
258                "rank" => Token::Rank,
259                "dense_rank" => Token::DenseRank,
260                "alter" => Token::Alter,
261                "drop" => Token::Drop,
262                "add" => Token::Add,
263                "column" => Token::Column,
264                "explain" => Token::Explain,
265                "true" => Token::BoolLit(true),
266                "false" => Token::BoolLit(false),
267                _ => Token::Ident(word),
268            };
269            tokens.push(token);
270            continue;
271        }
272
273        // Two-char operators
274        if pos + 1 < chars.len() {
275            let two: String = chars[pos..pos + 2].iter().collect();
276            match two.as_str() {
277                ":=" => {
278                    tokens.push(Token::Assign);
279                    pos += 2;
280                    continue;
281                }
282                "->" => {
283                    tokens.push(Token::Arrow);
284                    pos += 2;
285                    continue;
286                }
287                "!=" => {
288                    tokens.push(Token::Neq);
289                    pos += 2;
290                    continue;
291                }
292                "<=" => {
293                    tokens.push(Token::Lte);
294                    pos += 2;
295                    continue;
296                }
297                ">=" => {
298                    tokens.push(Token::Gte);
299                    pos += 2;
300                    continue;
301                }
302                "??" => {
303                    tokens.push(Token::Coalesce);
304                    pos += 2;
305                    continue;
306                }
307                _ => {}
308            }
309        }
310
311        // Single-char operators
312        let token = match chars[pos] {
313            '=' => Token::Eq,
314            '<' => Token::Lt,
315            '>' => Token::Gt,
316            '|' => Token::Pipe,
317            '+' => Token::Plus,
318            '-' => Token::Minus,
319            '*' => Token::Star,
320            '/' => Token::Slash,
321            '{' => Token::LBrace,
322            '}' => Token::RBrace,
323            '(' => Token::LParen,
324            ')' => Token::RParen,
325            ',' => Token::Comma,
326            ':' => Token::Colon,
327            '.' => Token::Dot,
328            c => {
329                return Err(LexError {
330                    message: format!("unexpected character: {c}"),
331                    position: pos,
332                })
333            }
334        };
335        tokens.push(token);
336        pos += 1;
337    }
338
339    tokens.push(Token::Eof);
340    Ok(tokens)
341}
342
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token;

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Lexes `source` (which must succeed) and compares the complete token
    /// stream, trailing `Eof` included, against `expected`.
    fn assert_lexes_to(source: &str, expected: Vec<Token>) {
        let actual = lex(source).unwrap();
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_lex_simple_query() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::IntLit(30),
            Token::Eof,
        ];
        assert_lexes_to("User filter .age > 30", expected);
    }

    #[test]
    fn test_lex_projection() {
        let expected = vec![
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Comma,
            ident("email"),
            Token::RBrace,
            Token::Eof,
        ];
        assert_lexes_to("User { name, email }", expected);
    }

    #[test]
    fn test_lex_insert() {
        let expected = vec![
            Token::Insert,
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Assign,
            Token::StringLit("Alice".into()),
            Token::Comma,
            ident("age"),
            Token::Assign,
            Token::IntLit(30),
            Token::RBrace,
            Token::Eof,
        ];
        assert_lexes_to(r#"insert User { name := "Alice", age := 30 }"#, expected);
    }

    #[test]
    fn test_lex_params() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::Param("min_age".into()),
            Token::Eof,
        ];
        assert_lexes_to("User filter .age > $min_age", expected);
    }

    #[test]
    fn test_lex_string_with_escapes() {
        let expected = vec![Token::StringLit("hello \"world\"".into()), Token::Eof];
        assert_lexes_to(r#""hello \"world\"""#, expected);
    }

    #[test]
    fn test_lex_aggregation() {
        let expected = vec![
            Token::Count,
            Token::LParen,
            ident("User"),
            Token::RParen,
            Token::Eof,
        ];
        assert_lexes_to("count(User)", expected);
    }

    /// Regression test for issue #24. An integer literal too long for `i64`
    /// used to hit `s.parse::<i64>().unwrap()` and panic; the lexer has to
    /// report a `LexError` instead.
    #[test]
    fn test_lex_intlit_overflow_returns_err() {
        // 22 digits, comfortably beyond the 19 digits of i64::MAX.
        let oversized = "4444444441111111144444";
        let err = lex(oversized).expect_err("must error, not panic");
        assert!(
            err.message.contains("integer literal out of range"),
            "unexpected message: {}",
            err.message
        );
        assert_eq!(err.position, 0);
    }

    /// The same defect, triggered by the exact reproducer from the libFuzzer
    /// artifact attached to issue #24 (base64
    /// `YXMJCQkJCQkJCQkJCQkJNDQ0NDQ0NDQ0MTExMTExMTQ0NDQJCQkJCQk=`).
    #[test]
    fn test_lex_fuzz_repro_issue_24() {
        let reproducer = "as\t\t\t\t\t\t\t\t\t\t\t\t\t44444444411111114444\t\t\t\t\t\t";
        let err = lex(reproducer).expect_err("fuzz reproducer must now error, not panic");
        assert!(err.message.contains("integer literal"));
    }
}