1use crate::token::Token;
2
/// Largest decoded string literal, in bytes, that `lex` accepts (16 MiB).
const MAX_STRING_LITERAL: usize = 16 << 20;
6
/// Error returned when source text cannot be turned into tokens.
///
/// Derives `Clone`, `PartialEq` and `Eq` in addition to `Debug` so that callers
/// and tests can compare, store and duplicate errors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexError {
    /// Human-readable description of the problem.
    pub message: String,
    /// Location the error refers to, as a `char` index (not a byte offset)
    /// into the lexed input.
    pub position: usize,
}
12
13impl std::fmt::Display for LexError {
14 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
15 write!(f, "at position {}: {}", self.position, self.message)
16 }
17}
18
19impl std::error::Error for LexError {}
20
21pub fn lex(input: &str) -> Result<Vec<Token>, LexError> {
35 let mut tokens = Vec::new();
36 let chars: Vec<char> = input.chars().collect();
37 let mut pos = 0;
38
39 while pos < chars.len() {
40 if chars[pos].is_whitespace() {
42 pos += 1;
43 continue;
44 }
45
46 if chars[pos] == '#' {
48 while pos < chars.len() && chars[pos] != '\n' {
49 pos += 1;
50 }
51 continue;
52 }
53
54 if chars[pos] == '.'
56 && pos + 1 < chars.len()
57 && (chars[pos + 1].is_alphabetic() || chars[pos + 1] == '_')
58 {
59 pos += 1; let start = pos;
61 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
62 pos += 1;
63 }
64 let name: String = chars[start..pos].iter().collect();
65 tokens.push(Token::DotIdent(name));
66 continue;
67 }
68
69 if chars[pos] == '$' {
71 pos += 1;
72 let start = pos;
73 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
74 pos += 1;
75 }
76 let name: String = chars[start..pos].iter().collect();
77 tokens.push(Token::Param(name));
78 continue;
79 }
80
81 if chars[pos] == '"' {
83 pos += 1;
84 let mut s = String::new();
85 while pos < chars.len() && chars[pos] != '"' {
86 if chars[pos] == '\\' && pos + 1 < chars.len() {
87 match chars[pos + 1] {
88 '"' => {
89 s.push('"');
90 pos += 2;
91 }
92 '\\' => {
93 s.push('\\');
94 pos += 2;
95 }
96 'n' => {
97 s.push('\n');
98 pos += 2;
99 }
100 't' => {
101 s.push('\t');
102 pos += 2;
103 }
104 _ => {
105 s.push(chars[pos + 1]);
106 pos += 2;
107 }
108 }
109 } else {
110 s.push(chars[pos]);
111 pos += 1;
112 }
113 }
114 if pos >= chars.len() {
115 return Err(LexError {
116 message: "unterminated string".into(),
117 position: pos,
118 });
119 }
120 pos += 1; if s.len() > MAX_STRING_LITERAL {
122 return Err(LexError {
123 message: format!(
124 "string literal exceeds maximum size of {}MB",
125 MAX_STRING_LITERAL / (1024 * 1024)
126 ),
127 position: pos,
128 });
129 }
130 tokens.push(Token::StringLit(s));
131 continue;
132 }
133
134 if chars[pos].is_ascii_digit()
136 || (chars[pos] == '-' && pos + 1 < chars.len() && chars[pos + 1].is_ascii_digit())
137 {
138 let start = pos;
139 if chars[pos] == '-' {
140 pos += 1;
141 }
142 while pos < chars.len() && chars[pos].is_ascii_digit() {
143 pos += 1;
144 }
145 if pos < chars.len()
146 && chars[pos] == '.'
147 && pos + 1 < chars.len()
148 && chars[pos + 1].is_ascii_digit()
149 {
150 pos += 1;
151 while pos < chars.len() && chars[pos].is_ascii_digit() {
152 pos += 1;
153 }
154 let s: String = chars[start..pos].iter().collect();
155 let value = s.parse::<f64>().map_err(|_| LexError {
156 message: format!("float literal out of range: {s}"),
157 position: start,
158 })?;
159 tokens.push(Token::FloatLit(value));
160 } else {
161 let s: String = chars[start..pos].iter().collect();
162 let value = s.parse::<i64>().map_err(|_| LexError {
163 message: format!("integer literal out of range for i64: {s}"),
164 position: start,
165 })?;
166 tokens.push(Token::IntLit(value));
167 }
168 continue;
169 }
170
171 if chars[pos].is_alphabetic() || chars[pos] == '_' {
173 let start = pos;
174 while pos < chars.len() && (chars[pos].is_alphanumeric() || chars[pos] == '_') {
175 pos += 1;
176 }
177 let word: String = chars[start..pos].iter().collect();
178 let token = match word.as_str() {
179 "type" => Token::Type,
180 "filter" => Token::Filter,
181 "order" => Token::Order,
182 "limit" => Token::Limit,
183 "offset" => Token::Offset,
184 "insert" => Token::Insert,
185 "update" => Token::Update,
186 "delete" => Token::Delete,
187 "upsert" => Token::Upsert,
188 "conflict" => Token::Conflict,
189 "select" => Token::Select,
190 "required" => Token::Required,
191 "multi" => Token::Multi,
192 "link" => Token::Link,
193 "index" => Token::Index,
194 "unique" => Token::Unique,
195 "on" => Token::On,
196 "asc" => Token::Asc,
197 "desc" => Token::Desc,
198 "and" => Token::And,
199 "or" => Token::Or,
200 "not" => Token::Not,
201 "exists" => Token::Exists,
202 "let" => Token::Let,
203 "as" => Token::As,
204 "match" => Token::Match,
205 "group" => Token::Group,
206 "join" => Token::Join,
207 "inner" => Token::Inner,
208 "left" => Token::LeftKw,
209 "right" => Token::RightKw,
210 "outer" => Token::Outer,
211 "cross" => Token::Cross,
212 "transaction" => Token::Transaction,
213 "begin" => Token::Begin,
214 "commit" => Token::Commit,
215 "rollback" => Token::Rollback,
216 "view" => Token::View,
217 "materialized" => Token::Materialized,
218 "materialize" => Token::Materialized,
219 "refresh" => Token::Refresh,
220 "union" => Token::Union,
221 "having" => Token::Having,
222 "distinct" => Token::Distinct,
223 "in" => Token::In,
224 "between" => Token::Between,
225 "like" => Token::Like,
226 "count" => Token::Count,
227 "avg" => Token::Avg,
228 "sum" => Token::Sum,
229 "min" => Token::Min,
230 "max" => Token::Max,
231 "is" => Token::Is,
232 "null" => Token::Null,
233 "upper" => Token::Upper,
234 "lower" => Token::Lower,
235 "length" => Token::Length,
236 "trim" => Token::Trim,
237 "substring" => Token::Substring,
238 "concat" => Token::Concat,
239 "abs" => Token::Abs,
240 "round" => Token::Round,
241 "ceil" => Token::Ceil,
242 "floor" => Token::Floor,
243 "sqrt" => Token::Sqrt,
244 "pow" => Token::Pow,
245 "now" => Token::Now,
246 "extract" => Token::Extract,
247 "date_add" => Token::DateAdd,
248 "date_diff" => Token::DateDiff,
249 "cast" => Token::Cast,
250 "case" => Token::Case,
251 "when" => Token::When,
252 "then" => Token::Then,
253 "else" => Token::Else,
254 "end" => Token::End,
255 "over" => Token::Over,
256 "partition" => Token::Partition,
257 "row_number" => Token::RowNumber,
258 "rank" => Token::Rank,
259 "dense_rank" => Token::DenseRank,
260 "alter" => Token::Alter,
261 "drop" => Token::Drop,
262 "add" => Token::Add,
263 "column" => Token::Column,
264 "explain" => Token::Explain,
265 "true" => Token::BoolLit(true),
266 "false" => Token::BoolLit(false),
267 _ => Token::Ident(word),
268 };
269 tokens.push(token);
270 continue;
271 }
272
273 if pos + 1 < chars.len() {
275 let two: String = chars[pos..pos + 2].iter().collect();
276 match two.as_str() {
277 ":=" => {
278 tokens.push(Token::Assign);
279 pos += 2;
280 continue;
281 }
282 "->" => {
283 tokens.push(Token::Arrow);
284 pos += 2;
285 continue;
286 }
287 "!=" => {
288 tokens.push(Token::Neq);
289 pos += 2;
290 continue;
291 }
292 "<=" => {
293 tokens.push(Token::Lte);
294 pos += 2;
295 continue;
296 }
297 ">=" => {
298 tokens.push(Token::Gte);
299 pos += 2;
300 continue;
301 }
302 "??" => {
303 tokens.push(Token::Coalesce);
304 pos += 2;
305 continue;
306 }
307 _ => {}
308 }
309 }
310
311 let token = match chars[pos] {
313 '=' => Token::Eq,
314 '<' => Token::Lt,
315 '>' => Token::Gt,
316 '|' => Token::Pipe,
317 '+' => Token::Plus,
318 '-' => Token::Minus,
319 '*' => Token::Star,
320 '/' => Token::Slash,
321 '{' => Token::LBrace,
322 '}' => Token::RBrace,
323 '(' => Token::LParen,
324 ')' => Token::RParen,
325 ',' => Token::Comma,
326 ':' => Token::Colon,
327 '.' => Token::Dot,
328 c => {
329 return Err(LexError {
330 message: format!("unexpected character: {c}"),
331 position: pos,
332 })
333 }
334 };
335 tokens.push(token);
336 pos += 1;
337 }
338
339 tokens.push(Token::Eof);
340 Ok(tokens)
341}
342
#[cfg(test)]
mod tests {
    use super::*;
    use crate::token::Token;

    /// Builds the identifier token for `name`.
    fn ident(name: &str) -> Token {
        Token::Ident(name.into())
    }

    /// Lexes `source`, panicking if the lexer reports an error.
    fn lex_ok(source: &str) -> Vec<Token> {
        lex(source).unwrap()
    }

    /// A bare query with a filter clause on a dotted field.
    #[test]
    fn test_lex_simple_query() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::IntLit(30),
            Token::Eof,
        ];
        assert_eq!(lex_ok("User filter .age > 30"), expected);
    }

    /// A brace-delimited projection of two fields.
    #[test]
    fn test_lex_projection() {
        let expected = vec![
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Comma,
            ident("email"),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(lex_ok("User { name, email }"), expected);
    }

    /// An insert statement mixing string and integer assignments.
    #[test]
    fn test_lex_insert() {
        let expected = vec![
            Token::Insert,
            ident("User"),
            Token::LBrace,
            ident("name"),
            Token::Assign,
            Token::StringLit("Alice".into()),
            Token::Comma,
            ident("age"),
            Token::Assign,
            Token::IntLit(30),
            Token::RBrace,
            Token::Eof,
        ];
        assert_eq!(
            lex_ok(r#"insert User { name := "Alice", age := 30 }"#),
            expected
        );
    }

    /// A `$`-prefixed name is lexed as a parameter token.
    #[test]
    fn test_lex_params() {
        let expected = vec![
            ident("User"),
            Token::Filter,
            Token::DotIdent("age".into()),
            Token::Gt,
            Token::Param("min_age".into()),
            Token::Eof,
        ];
        assert_eq!(lex_ok("User filter .age > $min_age"), expected);
    }

    /// Escaped quotes inside a string literal are decoded.
    #[test]
    fn test_lex_string_with_escapes() {
        let expected = vec![Token::StringLit("hello \"world\"".into()), Token::Eof];
        assert_eq!(lex_ok(r#""hello \"world\"""#), expected);
    }

    /// An aggregate keyword followed by a parenthesised argument.
    #[test]
    fn test_lex_aggregation() {
        let expected = vec![
            Token::Count,
            Token::LParen,
            ident("User"),
            Token::RParen,
            Token::Eof,
        ];
        assert_eq!(lex_ok("count(User)"), expected);
    }

    /// An integer literal too large for `i64` is reported as an error at its
    /// first character.
    #[test]
    fn test_lex_intlit_overflow_returns_err() {
        let result = lex("4444444441111111144444");
        let err = result.expect_err("must error, not panic");
        let mentions_range = err.message.contains("integer literal out of range");
        assert!(mentions_range, "unexpected message: {}", err.message);
        assert_eq!(err.position, 0);
    }

    /// Input labelled a fuzz reproducer (issue 24): an oversized integer
    /// surrounded by tabs must produce an error.
    #[test]
    fn test_lex_fuzz_repro_issue_24() {
        let input = "as\t\t\t\t\t\t\t\t\t\t\t\t\t44444444411111114444\t\t\t\t\t\t";
        let result = lex(input);
        let err = result.expect_err("fuzz reproducer must now error, not panic");
        assert!(err.message.contains("integer literal"));
    }
}