1use crate::token::Token;
2
/// Every reserved word of PowQL, in lowercase and sorted alphabetically.
///
/// [`lex`] compares words against these spellings exactly (case-sensitively): a word found
/// here is emitted as its dedicated keyword token (`true` / `false` as boolean literals)
/// rather than as an identifier.
pub const POWQL_KEYWORDS: &[&str] = &[
    "abs",
    "add",
    "alter",
    "and",
    "as",
    "asc",
    "avg",
    "begin",
    "between",
    "case",
    "cast",
    "ceil",
    "column",
    "commit",
    "concat",
    "conflict",
    "count",
    "cross",
    "date_add",
    "date_diff",
    "delete",
    "dense_rank",
    "desc",
    "distinct",
    "drop",
    "else",
    "end",
    "exists",
    "explain",
    "extract",
    "false",
    "filter",
    "floor",
    "group",
    "having",
    "in",
    "index",
    "inner",
    "insert",
    "is",
    "join",
    "left",
    "length",
    "let",
    "like",
    "limit",
    "link",
    "lower",
    "match",
    // NOTE(review): `lex` maps both "materialize" and "materialized" to the same token.
    "materialize",
    "materialized",
    "max",
    "min",
    "multi",
    "not",
    "now",
    "null",
    "offset",
    "on",
    "or",
    "order",
    "outer",
    "over",
    "partition",
    "pow",
    "rank",
    "refresh",
    "required",
    "right",
    "rollback",
    "round",
    "row_number",
    "select",
    "sqrt",
    "substring",
    "sum",
    "then",
    "transaction",
    "trim",
    "true",
    "type",
    "union",
    "unique",
    "update",
    "upper",
    "upsert",
    "view",
    "when",
];
96
/// Largest string literal [`lex`] accepts: 16 MiB, measured in bytes of the decoded
/// (escape-processed) contents. Longer literals are rejected with a [`LexError`].
const MAX_STRING_LITERAL: usize = 16 * 1024 * 1024;
100
/// Error produced by [`lex`] when the input cannot be turned into tokens.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so callers and tests can
/// compare and duplicate errors directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of what went wrong.
    pub message: String,
    /// Zero-based index into the input associated with the error, counted in `char`s
    /// (Unicode scalar values), not bytes.
    pub position: usize,
}
106
107impl std::fmt::Display for LexError {
108 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
109 write!(f, "at position {}: {}", self.position, self.message)
110 }
111}
112
// Marker impl with no overridden methods: lets `LexError` be used wherever a
// `std::error::Error` is expected (for example boxed as `Box<dyn std::error::Error>`).
impl std::error::Error for LexError {}
114
115pub fn lex(input: &str) -> Result<Vec<Token>, LexError> {
129 let mut tokens = Vec::new();
130 let chars: Vec<char> = input.chars().collect();
131 let mut pos = 0;
132
133 while pos < chars.len() {
134 if chars[pos].is_whitespace() {
136 pos += 1;
137 continue;
138 }
139
140 if chars[pos] == '#' {
142 while pos < chars.len() && chars[pos] != '\n' {
143 pos += 1;
144 }
145 continue;
146 }
147
148 if chars[pos] == '.'
150 && pos + 1 < chars.len()
151 && (chars[pos + 1].is_alphabetic() || chars[pos + 1] == '_')
152 {
153 pos += 1; let start = pos;
155 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
156 pos += 1;
157 }
158 let name: String = chars[start..pos].iter().collect();
159 tokens.push(Token::DotIdent(name));
160 continue;
161 }
162
163 if chars[pos] == '$' {
165 pos += 1;
166 let start = pos;
167 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
168 pos += 1;
169 }
170 let name: String = chars[start..pos].iter().collect();
171 tokens.push(Token::Param(name));
172 continue;
173 }
174
175 if chars[pos] == '"' {
177 pos += 1;
178 let mut s = String::new();
179 while pos < chars.len() && chars[pos] != '"' {
180 if chars[pos] == '\\' && pos + 1 < chars.len() {
181 match chars[pos + 1] {
182 '"' => {
183 s.push('"');
184 pos += 2;
185 }
186 '\\' => {
187 s.push('\\');
188 pos += 2;
189 }
190 'n' => {
191 s.push('\n');
192 pos += 2;
193 }
194 't' => {
195 s.push('\t');
196 pos += 2;
197 }
198 _ => {
199 s.push(chars[pos + 1]);
200 pos += 2;
201 }
202 }
203 } else {
204 s.push(chars[pos]);
205 pos += 1;
206 }
207 }
208 if pos >= chars.len() {
209 return Err(LexError {
210 message: "unterminated string".into(),
211 position: pos,
212 });
213 }
214 pos += 1; if s.len() > MAX_STRING_LITERAL {
216 return Err(LexError {
217 message: format!(
218 "string literal exceeds maximum size of {}MB",
219 MAX_STRING_LITERAL / (1024 * 1024)
220 ),
221 position: pos,
222 });
223 }
224 tokens.push(Token::StringLit(s));
225 continue;
226 }
227
228 if chars[pos].is_ascii_digit()
230 || (chars[pos] == '-' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit())
231 {
232 let start = pos;
233 if chars[pos] == '-' {
234 pos += 1;
235 }
236 while pos < chars.len() && chars[pos].is_ascii_digit() {
237 pos += 1;
238 }
239 if pos < chars.len()
240 && chars[pos] == '.'
241 && pos + 1 < chars.len()
242 && chars[pos + 1].is_ascii_digit()
243 {
244 pos += 1;
245 while pos < chars.len() && chars[pos].is_ascii_digit() {
246 pos += 1;
247 }
248 let s: String = chars[start..pos].iter().collect();
249 let value = s.parse::<f64>().map_err(|_| LexError {
250 message: format!("float literal out of range: {s}"),
251 position: start,
252 })?;
253 tokens.push(Token::FloatLit(value));
254 } else {
255 let s: String = chars[start..pos].iter().collect();
256 let value = s.parse::<i64>().map_err(|_| LexError {
257 message: format!("integer literal out of range for i64: {s}"),
258 position: start,
259 })?;
260 tokens.push(Token::IntLit(value));
261 }
262 continue;
263 }
264
265 if chars[pos].is_alphabetic() || chars[pos] == '_' {
267 let start = pos;
268 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
269 pos += 1;
270 }
271 let word: String = chars[start..pos].iter().collect();
272 let token = match word.as_str() {
273 "type" => Token::Type,
274 "filter" => Token::Filter,
275 "order" => Token::Order,
276 "limit" => Token::Limit,
277 "offset" => Token::Offset,
278 "insert" => Token::Insert,
279 "update" => Token::Update,
280 "delete" => Token::Delete,
281 "upsert" => Token::Upsert,
282 "conflict" => Token::Conflict,
283 "select" => Token::Select,
284 "required" => Token::Required,
285 "multi" => Token::Multi,
286 "link" => Token::Link,
287 "index" => Token::Index,
288 "unique" => Token::Unique,
289 "on" => Token::On,
290 "asc" => Token::Asc,
291 "desc" => Token::Desc,
292 "and" => Token::And,
293 "or" => Token::Or,
294 "not" => Token::Not,
295 "exists" => Token::Exists,
296 "let" => Token::Let,
297 "as" => Token::As,
298 "match" => Token::Match,
299 "group" => Token::Group,
300 "join" => Token::Join,
301 "inner" => Token::Inner,
302 "left" => Token::LeftKw,
303 "right" => Token::RightKw,
304 "outer" => Token::Outer,
305 "cross" => Token::Cross,
306 "transaction" => Token::Transaction,
307 "begin" => Token::Begin,
308 "commit" => Token::Commit,
309 "rollback" => Token::Rollback,
310 "view" => Token::View,
311 "materialized" => Token::Materialized,
312 "materialize" => Token::Materialized,
313 "refresh" => Token::Refresh,
314 "union" => Token::Union,
315 "having" => Token::Having,
316 "distinct" => Token::Distinct,
317 "in" => Token::In,
318 "between" => Token::Between,
319 "like" => Token::Like,
320 "count" => Token::Count,
321 "avg" => Token::Avg,
322 "sum" => Token::Sum,
323 "min" => Token::Min,
324 "max" => Token::Max,
325 "is" => Token::Is,
326 "null" => Token::Null,
327 "upper" => Token::Upper,
328 "lower" => Token::Lower,
329 "length" => Token::Length,
330 "trim" => Token::Trim,
331 "substring" => Token::Substring,
332 "concat" => Token::Concat,
333 "abs" => Token::Abs,
334 "round" => Token::Round,
335 "ceil" => Token::Ceil,
336 "floor" => Token::Floor,
337 "sqrt" => Token::Sqrt,
338 "pow" => Token::Pow,
339 "now" => Token::Now,
340 "extract" => Token::Extract,
341 "date_add" => Token::DateAdd,
342 "date_diff" => Token::DateDiff,
343 "cast" => Token::Cast,
344 "case" => Token::Case,
345 "when" => Token::When,
346 "then" => Token::Then,
347 "else" => Token::Else,
348 "end" => Token::End,
349 "over" => Token::Over,
350 "partition" => Token::Partition,
351 "row_number" => Token::RowNumber,
352 "rank" => Token::Rank,
353 "dense_rank" => Token::DenseRank,
354 "alter" => Token::Alter,
355 "drop" => Token::Drop,
356 "add" => Token::Add,
357 "column" => Token::Column,
358 "explain" => Token::Explain,
359 "true" => Token::BoolLit(true),
360 "false" => Token::BoolLit(false),
361 _ => Token::Ident(word),
362 };
363 tokens.push(token);
364 continue;
365 }
366
367 if pos + 1 < chars.len() {
369 let two: String = chars[pos..pos + 2].iter().collect();
370 match two.as_str() {
371 ":=" => {
372 tokens.push(Token::Assign);
373 pos += 2;
374 continue;
375 }
376 "->" => {
377 tokens.push(Token::Arrow);
378 pos += 2;
379 continue;
380 }
381 "!=" => {
382 tokens.push(Token::Neq);
383 pos += 2;
384 continue;
385 }
386 "<=" => {
387 tokens.push(Token::Lte);
388 pos += 2;
389 continue;
390 }
391 ">=" => {
392 tokens.push(Token::Gte);
393 pos += 2;
394 continue;
395 }
396 "??" => {
397 tokens.push(Token::Coalesce);
398 pos += 2;
399 continue;
400 }
401 _ => {}
402 }
403 }
404
405 let token = match chars[pos] {
407 '=' => Token::Eq,
408 '<' => Token::Lt,
409 '>' => Token::Gt,
410 '|' => Token::Pipe,
411 '+' => Token::Plus,
412 '-' => Token::Minus,
413 '*' => Token::Star,
414 '/' => Token::Slash,
415 '{' => Token::LBrace,
416 '}' => Token::RBrace,
417 '(' => Token::LParen,
418 ')' => Token::RParen,
419 ',' => Token::Comma,
420 ':' => Token::Colon,
421 '.' => Token::Dot,
422 c => {
423 return Err(LexError {
424 message: format!("unexpected character: {c}"),
425 position: pos,
426 })
427 }
428 };
429 tokens.push(token);
430 pos += 1;
431 }
432
433 tokens.push(Token::Eof);
434 Ok(tokens)
435}
436
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token;

    /// Lexes `source` and returns its tokens, panicking if the lexer reports an error.
    fn lexed(source: &str) -> Vec<Token> {
        lex(source).unwrap()
    }

    /// Builds an identifier token.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Builds a `.field` token.
    fn dot_ident(name: &str) -> Token {
        Token::DotIdent(name.into())
    }

    /// A filter expression mixes identifiers, a keyword, a field access, an operator and an
    /// integer literal.
    #[test]
    fn test_lex_simple_query() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            dot_ident("age"),
            Token::Gt,
            Token::IntLit(30),
            Token::Eof,
        ];
        assert_eq!(lexed("User filter .age > 30"), expected);
    }

    /// Braces and commas of a projection are emitted as punctuation tokens.
    #[test]
    fn test_lex_projection() {
        let expected = vec![
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Comma,
            ident("email"),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(lexed("User { name, email }"), expected);
    }

    /// `:=` is a single assignment token, and string / integer literals carry their values.
    #[test]
    fn test_lex_insert() {
        let expected = vec![
            Token::Insert,
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Assign,
            Token::StringLit("Alice".into()),
            Token::Comma,
            ident("age"),
            Token::Assign,
            Token::IntLit(30),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(
            lexed(r#"insert User { name := "Alice", age := 30 }"#),
            expected
        );
    }

    /// `$name` yields a parameter token without the leading `$`.
    #[test]
    fn test_lex_params() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            dot_ident("age"),
            Token::Gt,
            Token::Param("min_age".into()),
            Token::Eof,
        ];
        assert_eq!(lexed("User filter .age > $min_age"), expected);
    }

    /// Escaped quotes inside a string literal are decoded rather than ending the literal.
    #[test]
    fn test_lex_string_with_escapes() {
        let expected = vec![Token::StringLit("hello \"world\"".into()), Token::Eof];
        assert_eq!(lexed(r#""hello \"world\"""#), expected);
    }

    /// Aggregate names are keywords, not identifiers.
    #[test]
    fn test_lex_aggregation() {
        let expected = vec![
            Token::Count,
            Token::LParen,
            ident("User"),
            Token::RParen,
            Token::Eof,
        ];
        assert_eq!(lexed("count(User)"), expected);
    }

    /// An integer literal too large for `i64` is reported as an error located at the start
    /// of the literal.
    #[test]
    fn test_lex_intlit_overflow_returns_err() {
        let LexError { message, position } =
            lex("4444444441111111144444").expect_err("must error, not panic");
        assert!(
            message.contains("integer literal out of range"),
            "unexpected message: {}",
            message
        );
        assert_eq!(position, 0);
    }

    /// Regression input: a keyword, runs of tabs, then an integer literal too large for
    /// `i64` must be reported as an error.
    #[test]
    fn test_lex_fuzz_repro_issue_24() {
        let input = "as\t\t\t\t\t\t\t\t\t\t\t\t\t44444444411111114444\t\t\t\t\t\t";
        let outcome = lex(input);
        let err = outcome.expect_err("fuzz reproducer must now error, not panic");
        assert!(err.message.contains("integer literal"));
    }
}