Skip to main content

powdb_query/
lexer.rs

1use crate::token::Token;
2
/// Lowercase words recognized by the PowQL lexer as keywords or built-in
/// literal words. CLI completion imports this list so it cannot drift from
/// the parser surface.
///
/// Maintenance notes:
/// - Entries are kept in ascending alphabetical order.
/// - Every entry has a matching arm in the lexer's keyword match in this
///   file; when a keyword is added there, add it here as well.
/// - `"materialize"` and `"materialized"` are both listed because the lexer
///   accepts either spelling and maps both to `Token::Materialized`.
/// - `"true"` and `"false"` are the built-in literal words; the lexer turns
///   them into `Token::BoolLit` rather than a dedicated keyword token.
/// - Matching in the lexer is case-sensitive, so only these exact lowercase
///   spellings are treated as keywords.
pub const POWQL_KEYWORDS: &[&str] = &[
    "abs",
    "add",
    "alter",
    "and",
    "as",
    "asc",
    "avg",
    "begin",
    "between",
    "case",
    "cast",
    "ceil",
    "column",
    "commit",
    "concat",
    "conflict",
    "count",
    "cross",
    "date_add",
    "date_diff",
    "delete",
    "dense_rank",
    "desc",
    "distinct",
    "drop",
    "else",
    "end",
    "exists",
    "explain",
    "extract",
    "false",
    "filter",
    "floor",
    "group",
    "having",
    "in",
    "index",
    "inner",
    "insert",
    "is",
    "join",
    "left",
    "length",
    "let",
    "like",
    "limit",
    "link",
    "lower",
    "match",
    "materialize",
    "materialized",
    "max",
    "min",
    "multi",
    "not",
    "now",
    "null",
    "offset",
    "on",
    "or",
    "order",
    "outer",
    "over",
    "partition",
    "pow",
    "rank",
    "refresh",
    "required",
    "right",
    "rollback",
    "round",
    "row_number",
    "select",
    "sqrt",
    "substring",
    "sum",
    "then",
    "transaction",
    "trim",
    "true",
    "type",
    "union",
    "unique",
    "update",
    "upper",
    "upsert",
    "view",
    "when",
];
96
/// Maximum allowed length for a string literal (16 MB).
/// Prevents unbounded memory consumption from queries with multi-gigabyte strings.
///
/// The limit is compared against the decoded literal's `String::len()`, i.e.
/// its size in UTF-8 bytes after escape sequences have been resolved, and the
/// comparison happens once the closing quote has been reached.
const MAX_STRING_LITERAL: usize = 16 * 1024 * 1024;
100
/// Error returned by the lexer when the input cannot be tokenized.
///
/// Derives `Clone`, `PartialEq` and `Eq` so callers and tests can compare
/// and duplicate errors directly instead of inspecting fields one by one.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Zero-based index into the input at which the lexer reported the
    /// problem. The lexer walks a `Vec<char>`, so this counts `char`s
    /// (Unicode scalar values), not bytes.
    pub position: usize,
}

impl std::fmt::Display for LexError {
    /// Formats the error as `at position <position>: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "at position {}: {}", self.position, self.message)
    }
}

// No underlying source error: the default `Error` methods are sufficient.
impl std::error::Error for LexError {}
114
115/// Tokenize a PowQL input string into a stream of tokens.
116///
117/// # Examples
118///
119/// ```
120/// use powdb_query::lexer::lex;
121/// use powdb_query::token::Token;
122///
123/// let tokens = lex("User filter .age > 30").unwrap();
124/// assert_eq!(tokens[0], Token::Ident("User".to_string()));
125/// assert_eq!(tokens[1], Token::Filter);
126/// assert_eq!(tokens[2], Token::DotIdent("age".to_string()));
127/// ```
128pub fn lex(input: &str) -> Result<Vec<Token>, LexError> {
129    let mut tokens = Vec::new();
130    let chars: Vec<char> = input.chars().collect();
131    let mut pos = 0;
132
133    while pos < chars.len() {
134        // Skip whitespace
135        if chars[pos].is_whitespace() {
136            pos += 1;
137            continue;
138        }
139
140        // Skip comments
141        if chars[pos] == '#' {
142            while pos < chars.len() && chars[pos] != '\n' {
143                pos += 1;
144            }
145            continue;
146        }
147
148        // Dot-ident: .fieldname
149        if chars[pos] == '.'
150            && pos + 1 < chars.len()
151            && (chars[pos + 1].is_alphabetic() || chars[pos + 1] == '_')
152        {
153            pos += 1; // skip dot
154            let start = pos;
155            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
156                pos += 1;
157            }
158            let name: String = chars[start..pos].iter().collect();
159            tokens.push(Token::DotIdent(name));
160            continue;
161        }
162
163        // Param: $name
164        if chars[pos] == '$' {
165            pos += 1;
166            let start = pos;
167            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
168                pos += 1;
169            }
170            let name: String = chars[start..pos].iter().collect();
171            tokens.push(Token::Param(name));
172            continue;
173        }
174
175        // String literal
176        if chars[pos] == '"' {
177            pos += 1;
178            let mut s = String::new();
179            while pos < chars.len() && chars[pos] != '"' {
180                if chars[pos] == '\\' && pos + 1 < chars.len() {
181                    match chars[pos + 1] {
182                        '"' => {
183                            s.push('"');
184                            pos += 2;
185                        }
186                        '\\' => {
187                            s.push('\\');
188                            pos += 2;
189                        }
190                        'n' => {
191                            s.push('\n');
192                            pos += 2;
193                        }
194                        't' => {
195                            s.push('\t');
196                            pos += 2;
197                        }
198                        _ => {
199                            s.push(chars[pos + 1]);
200                            pos += 2;
201                        }
202                    }
203                } else {
204                    s.push(chars[pos]);
205                    pos += 1;
206                }
207            }
208            if pos >= chars.len() {
209                return Err(LexError {
210                    message: "unterminated string".into(),
211                    position: pos,
212                });
213            }
214            pos += 1; // closing quote
215            if s.len() > MAX_STRING_LITERAL {
216                return Err(LexError {
217                    message: format!(
218                        "string literal exceeds maximum size of {}MB",
219                        MAX_STRING_LITERAL / (1024 * 1024)
220                    ),
221                    position: pos,
222                });
223            }
224            tokens.push(Token::StringLit(s));
225            continue;
226        }
227
228        // Number (int or float)
229        if chars[pos].is_ascii_digit()
230            || (chars[pos] == '-' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit())
231        {
232            let start = pos;
233            if chars[pos] == '-' {
234                pos += 1;
235            }
236            while pos < chars.len() && chars[pos].is_ascii_digit() {
237                pos += 1;
238            }
239            if pos < chars.len()
240                && chars[pos] == '.'
241                && pos + 1 < chars.len()
242                && chars[pos + 1].is_ascii_digit()
243            {
244                pos += 1;
245                while pos < chars.len() && chars[pos].is_ascii_digit() {
246                    pos += 1;
247                }
248                let s: String = chars[start..pos].iter().collect();
249                let value = s.parse::<f64>().map_err(|_| LexError {
250                    message: format!("float literal out of range: {s}"),
251                    position: start,
252                })?;
253                tokens.push(Token::FloatLit(value));
254            } else {
255                let s: String = chars[start..pos].iter().collect();
256                let value = s.parse::<i64>().map_err(|_| LexError {
257                    message: format!("integer literal out of range for i64: {s}"),
258                    position: start,
259                })?;
260                tokens.push(Token::IntLit(value));
261            }
262            continue;
263        }
264
265        // Identifiers and keywords
266        if chars[pos].is_alphabetic() || chars[pos] == '_' {
267            let start = pos;
268            while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
269                pos += 1;
270            }
271            let word: String = chars[start..pos].iter().collect();
272            let token = match word.as_str() {
273                "type" => Token::Type,
274                "filter" => Token::Filter,
275                "order" => Token::Order,
276                "limit" => Token::Limit,
277                "offset" => Token::Offset,
278                "insert" => Token::Insert,
279                "update" => Token::Update,
280                "delete" => Token::Delete,
281                "upsert" => Token::Upsert,
282                "conflict" => Token::Conflict,
283                "select" => Token::Select,
284                "required" => Token::Required,
285                "multi" => Token::Multi,
286                "link" => Token::Link,
287                "index" => Token::Index,
288                "unique" => Token::Unique,
289                "on" => Token::On,
290                "asc" => Token::Asc,
291                "desc" => Token::Desc,
292                "and" => Token::And,
293                "or" => Token::Or,
294                "not" => Token::Not,
295                "exists" => Token::Exists,
296                "let" => Token::Let,
297                "as" => Token::As,
298                "match" => Token::Match,
299                "group" => Token::Group,
300                "join" => Token::Join,
301                "inner" => Token::Inner,
302                "left" => Token::LeftKw,
303                "right" => Token::RightKw,
304                "outer" => Token::Outer,
305                "cross" => Token::Cross,
306                "transaction" => Token::Transaction,
307                "begin" => Token::Begin,
308                "commit" => Token::Commit,
309                "rollback" => Token::Rollback,
310                "view" => Token::View,
311                "materialized" => Token::Materialized,
312                "materialize" => Token::Materialized,
313                "refresh" => Token::Refresh,
314                "union" => Token::Union,
315                "having" => Token::Having,
316                "distinct" => Token::Distinct,
317                "in" => Token::In,
318                "between" => Token::Between,
319                "like" => Token::Like,
320                "count" => Token::Count,
321                "avg" => Token::Avg,
322                "sum" => Token::Sum,
323                "min" => Token::Min,
324                "max" => Token::Max,
325                "is" => Token::Is,
326                "null" => Token::Null,
327                "upper" => Token::Upper,
328                "lower" => Token::Lower,
329                "length" => Token::Length,
330                "trim" => Token::Trim,
331                "substring" => Token::Substring,
332                "concat" => Token::Concat,
333                "abs" => Token::Abs,
334                "round" => Token::Round,
335                "ceil" => Token::Ceil,
336                "floor" => Token::Floor,
337                "sqrt" => Token::Sqrt,
338                "pow" => Token::Pow,
339                "now" => Token::Now,
340                "extract" => Token::Extract,
341                "date_add" => Token::DateAdd,
342                "date_diff" => Token::DateDiff,
343                "cast" => Token::Cast,
344                "case" => Token::Case,
345                "when" => Token::When,
346                "then" => Token::Then,
347                "else" => Token::Else,
348                "end" => Token::End,
349                "over" => Token::Over,
350                "partition" => Token::Partition,
351                "row_number" => Token::RowNumber,
352                "rank" => Token::Rank,
353                "dense_rank" => Token::DenseRank,
354                "alter" => Token::Alter,
355                "drop" => Token::Drop,
356                "add" => Token::Add,
357                "column" => Token::Column,
358                "explain" => Token::Explain,
359                "true" => Token::BoolLit(true),
360                "false" => Token::BoolLit(false),
361                _ => Token::Ident(word),
362            };
363            tokens.push(token);
364            continue;
365        }
366
367        // Two-char operators
368        if pos + 1 < chars.len() {
369            let two: String = chars[pos..pos + 2].iter().collect();
370            match two.as_str() {
371                ":=" => {
372                    tokens.push(Token::Assign);
373                    pos += 2;
374                    continue;
375                }
376                "->" => {
377                    tokens.push(Token::Arrow);
378                    pos += 2;
379                    continue;
380                }
381                "!=" => {
382                    tokens.push(Token::Neq);
383                    pos += 2;
384                    continue;
385                }
386                "<=" => {
387                    tokens.push(Token::Lte);
388                    pos += 2;
389                    continue;
390                }
391                ">=" => {
392                    tokens.push(Token::Gte);
393                    pos += 2;
394                    continue;
395                }
396                "??" => {
397                    tokens.push(Token::Coalesce);
398                    pos += 2;
399                    continue;
400                }
401                _ => {}
402            }
403        }
404
405        // Single-char operators
406        let token = match chars[pos] {
407            '=' => Token::Eq,
408            '<' => Token::Lt,
409            '>' => Token::Gt,
410            '|' => Token::Pipe,
411            '+' => Token::Plus,
412            '-' => Token::Minus,
413            '*' => Token::Star,
414            '/' => Token::Slash,
415            '{' => Token::LBrace,
416            '}' => Token::RBrace,
417            '(' => Token::LParen,
418            ')' => Token::RParen,
419            ',' => Token::Comma,
420            ':' => Token::Colon,
421            '.' => Token::Dot,
422            c => {
423                return Err(LexError {
424                    message: format!("unexpected character: {c}"),
425                    position: pos,
426                })
427            }
428        };
429        tokens.push(token);
430        pos += 1;
431    }
432
433    tokens.push(Token::Eof);
434    Ok(tokens)
435}
436
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token;

    /// Lexes `source`, panicking if the lexer rejects it.
    fn lex_ok(source: &str) -> Vec<Token> {
        lex(source).unwrap()
    }

    /// Shorthand for an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// A filter query yields ident, keyword, dot-ident, operator, int, EOF.
    #[test]
    fn test_lex_simple_query() {
        let expected = [
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::IntLit(30),
            Token::Eof,
        ];
        assert_eq!(lex_ok("User filter .age > 30"), expected);
    }

    /// Braces and commas in a projection are emitted as their own tokens.
    #[test]
    fn test_lex_projection() {
        let expected = [
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Comma,
            ident("email"),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(lex_ok("User { name, email }"), expected);
    }

    /// `:=` lexes as a single assign token; string and int literals decode.
    #[test]
    fn test_lex_insert() {
        let expected = [
            Token::Insert,
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Assign,
            Token::StringLit("Alice".into()),
            Token::Comma,
            ident("age"),
            Token::Assign,
            Token::IntLit(30),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(
            lex_ok(r#"insert User { name := "Alice", age := 30 }"#),
            expected
        );
    }

    /// `$name` becomes a param token carrying the name without the `$`.
    #[test]
    fn test_lex_params() {
        let expected = [
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::Param("min_age".into()),
            Token::Eof,
        ];
        assert_eq!(lex_ok("User filter .age > $min_age"), expected);
    }

    /// Escaped quotes inside a string literal are decoded to plain quotes.
    #[test]
    fn test_lex_string_with_escapes() {
        let expected = [Token::StringLit("hello \"world\"".into()), Token::Eof];
        assert_eq!(lex_ok(r#""hello \"world\"""#), expected);
    }

    /// An aggregate name is a keyword token, followed by its parenthesized arg.
    #[test]
    fn test_lex_aggregation() {
        let expected = [
            Token::Count,
            Token::LParen,
            ident("User"),
            Token::RParen,
            Token::Eof,
        ];
        assert_eq!(lex_ok("count(User)"), expected);
    }

    /// Regression for issue #24: an integer literal with more digits than
    /// i64 can hold previously reached `s.parse::<i64>().unwrap()` and
    /// panicked. It must return a `LexError` instead.
    #[test]
    fn test_lex_intlit_overflow_returns_err() {
        // 22 digits, well past i64::MAX (19 digits).
        let oversized = "4444444441111111144444";
        let err = lex(oversized).expect_err("must error, not panic");
        assert!(
            err.message.contains("integer literal out of range"),
            "unexpected message: {}",
            err.message
        );
        assert_eq!(err.position, 0);
    }

    /// Same bug, reached via the exact fuzzer reproducer from the
    /// libFuzzer artifact attached to issue #24 (base64
    /// `YXMJCQkJCQkJCQkJCQkJNDQ0NDQ0NDQ0MTExMTExMTQ0NDQJCQkJCQk=`).
    #[test]
    fn test_lex_fuzz_repro_issue_24() {
        let reproducer = "as\t\t\t\t\t\t\t\t\t\t\t\t\t44444444411111114444\t\t\t\t\t\t";
        let err = lex(reproducer).expect_err("fuzz reproducer must now error, not panic");
        assert!(err.message.contains("integer literal"));
    }
}