1use chumsky::{
2 error::Cheap,
3 prelude::*,
4 text::{newline, Character},
5};
6
7use prql_ast::expr::*;
8
9#[derive(Clone, PartialEq, Debug)]
10pub enum Token {
11 NewLine,
12
13 Ident(String),
14 Keyword(String),
15 Literal(Literal),
16 Param(String),
17
18 Range {
19 bind_left: bool,
20 bind_right: bool,
21 },
22 Interpolation(char, String),
23
24 Control(char),
26
27 ArrowThin, ArrowFat, Eq, Ne, Gte, Lte, RegexSearch, And, Or, Coalesce, DivInt, Annotate, }
40
/// Builds the lexer: a parser from a stream of characters to a list of
/// `(Token, span)` pairs, where each span is the range of input the token covers.
///
/// Comments, inline whitespace and backslash line continuations are skipped between
/// tokens. The whole input must be consumed for the parse to succeed.
pub fn lexer() -> impl Parser<char, Vec<(Token, std::ops::Range<usize>)>, Error = Cheap<char>> {
    // One or more inline whitespace characters; newlines are handled separately.
    let whitespace = filter(|x: &char| x.is_inline_whitespace())
        .repeated()
        .at_least(1)
        .ignored();

    // Multi-character operators. These are tried before the single-character
    // `control` symbols so that e.g. `==` is not lexed as two `=`.
    let control_multi = choice((
        just("->").to(Token::ArrowThin),
        just("=>").to(Token::ArrowFat),
        just("==").to(Token::Eq),
        just("!=").to(Token::Ne),
        just(">=").to(Token::Gte),
        just("<=").to(Token::Lte),
        just("~=").to(Token::RegexSearch),
        // `&&` and `||` only match when followed by something `end_expr` accepts.
        just("&&").then_ignore(end_expr()).to(Token::And),
        just("||").then_ignore(end_expr()).to(Token::Or),
        just("??").to(Token::Coalesce),
        just("//").to(Token::DivInt),
        // `@` is an annotation marker unless a digit follows; `@` followed by a digit
        // is left for the date/time literals.
        just("@").then(digits(1).not().rewind()).to(Token::Annotate),
    ));

    let control = one_of("></%=+-*[]().,:|!{}").map(Token::Control);

    let ident = ident_part().map(Token::Ident);

    // Reserved words; each must be followed by something `end_expr` accepts, so that
    // an identifier that merely starts with a keyword is not split.
    let keyword = choice((
        just("let"),
        just("into"),
        just("case"),
        just("prql"),
        just("type"),
        just("module"),
        just("internal"),
        just("func"),
    ))
    .then_ignore(end_expr())
    .map(|x| x.to_string())
    .map(Token::Keyword);

    let literal = literal().map(Token::Literal);

    // `$` followed by any run (possibly empty) of alphanumerics, `_` or `.`.
    let param = just('$')
        .ignore_then(filter(|c: &char| c.is_alphanumeric() || *c == '_' || *c == '.').repeated())
        .collect::<String>()
        .map(Token::Param);

    // `s"..."` / `f"..."` strings; the prefix character is kept in the token.
    let interpolation = one_of("sf")
        .then(quoted_string(true))
        .map(|(c, s)| Token::Interpolation(c, s));

    // Shadows the imported `newline` function with a parser value that is reused below.
    let newline = newline();

    // The order of alternatives matters: earlier entries take precedence.
    let token = choice((
        newline.to(Token::NewLine),
        control_multi,
        interpolation,
        param,
        control,
        literal,
        keyword,
        ident,
    ))
    // Error recovery: skip input and retry, using chumsky's `skip_then_retry_until` strategy.
    .recover_with(skip_then_retry_until([]).skip_start());

    // A `#` comment runs to the end of the line. Consecutive comment lines (separated
    // by a newline and optional leading whitespace) are consumed as one run.
    let comment = just('#')
        .then(newline.not().repeated())
        .separated_by(newline.then(whitespace.or_not()))
        .at_least(1)
        .ignored();

    // `..` together with any whitespace around it. The presence or absence of that
    // whitespace is recorded in the token as `bind_left` / `bind_right`.
    let range = (whitespace.or_not())
        .then_ignore(just(".."))
        .then(whitespace.or_not())
        .map(|(left, right)| Token::Range {
            bind_left: left.is_none(),
            bind_right: right.is_none(),
        })
        .map_with_span(|tok, span| (tok, span));

    // A line continuation: a newline, then any number of blank or comment-only lines,
    // then optional whitespace and a backslash. The whole sequence is discarded, so
    // no `NewLine` token is produced for it.
    let line_wrap = newline
        .then(
            whitespace
                .or_not()
                .then(comment.or_not())
                .then(newline)
                .repeated(),
        )
        .then(whitespace.repeated())
        .then(just('\\'))
        .ignored();

    // Everything that may appear between tokens without producing one.
    let ignored = choice((comment, whitespace, line_wrap)).repeated();

    choice((
        // `range` is tried first, before `ignored` strips whitespace, because it needs
        // to observe the whitespace on either side of `..`.
        range,
        ignored.ignore_then(token.map_with_span(|tok, span| (tok, span))),
    ))
    .repeated()
    .then_ignore(ignored)
    .then_ignore(end())
}
149
150pub fn ident_part() -> impl Parser<char, String, Error = Cheap<char>> + Clone {
151 let plain = filter(|c: &char| c.is_alphabetic() || *c == '_')
152 .chain(filter(|c: &char| c.is_alphanumeric() || *c == '_').repeated());
153
154 let backticks = none_of('`').repeated().delimited_by(just('`'), just('`'));
155
156 plain.or(backticks).collect()
157}
158
159fn literal() -> impl Parser<char, Literal, Error = Cheap<char>> {
160 let exp = one_of("eE").chain(one_of("+-").or_not().chain::<char, _, _>(text::digits(10)));
161
162 let integer = filter(|c: &char| c.is_ascii_digit() && *c != '0')
163 .chain::<_, Vec<char>, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated())
164 .or(just('0').map(|c| vec![c]));
165
166 let frac = just('.')
167 .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit()))
168 .chain::<char, _, _>(filter(|c: &char| c.is_ascii_digit() || *c == '_').repeated());
169
170 let number = one_of("+-")
171 .or_not()
172 .chain::<char, _, _>(integer)
173 .chain::<char, _, _>(frac.or_not().flatten())
174 .chain::<char, _, _>(exp.or_not().flatten())
175 .try_map(|chars, span| {
176 let str = chars.into_iter().filter(|c| *c != '_').collect::<String>();
177
178 if let Ok(i) = str.parse::<i64>() {
179 Ok(Literal::Integer(i))
180 } else if let Ok(f) = str.parse::<f64>() {
181 Ok(Literal::Float(f))
182 } else {
183 Err(Cheap::expected_input_found(span, None, None))
184 }
185 })
186 .labelled("number");
187
188 let string = quoted_string(true).map(Literal::String);
189
190 let raw_string = just("r")
191 .ignore_then(quoted_string(false))
192 .map(Literal::String);
193
194 let bool = (just("true").to(true))
195 .or(just("false").to(false))
196 .then_ignore(end_expr())
197 .map(Literal::Boolean);
198
199 let null = just("null").to(Literal::Null).then_ignore(end_expr());
200
201 let value_and_unit = integer
202 .then(choice((
203 just("microseconds"),
204 just("milliseconds"),
205 just("seconds"),
206 just("minutes"),
207 just("hours"),
208 just("days"),
209 just("weeks"),
210 just("months"),
211 just("years"),
212 )))
213 .then_ignore(end_expr())
214 .try_map(|(number, unit), span| {
215 let str = number.into_iter().filter(|c| *c != '_').collect::<String>();
216 if let Ok(n) = str.parse::<i64>() {
217 let unit = unit.to_string();
218 Ok(ValueAndUnit { n, unit })
219 } else {
220 Err(Cheap::expected_input_found(span, None, None))
221 }
222 })
223 .map(Literal::ValueAndUnit);
224
225 let date_inner = digits(4)
226 .chain(just('-'))
227 .chain::<char, _, _>(digits(2))
228 .chain::<char, _, _>(just('-'))
229 .chain::<char, _, _>(digits(2))
230 .boxed();
231
232 let time_inner = digits(2)
233 .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
235 .chain::<char, _, _>(just(':').chain(digits(2)).or_not().flatten())
237 .chain::<char, _, _>(
239 just('.')
240 .chain(
241 filter(|c: &char| c.is_ascii_digit())
242 .repeated()
243 .at_least(1)
244 .at_most(6),
245 )
246 .or_not()
247 .flatten(),
248 )
249 .chain::<char, _, _>(
251 one_of("-+")
252 .chain(
253 (digits(2).then_ignore(just(':').or_not()).chain(digits(2)))
254 .or(just('Z').map(|x| vec![x])),
255 )
256 .or_not()
257 .flatten(),
258 )
259 .boxed();
260
261 let dt_prefix = just('@').then(just('{').not().rewind());
263
264 let date = dt_prefix
265 .ignore_then(date_inner.clone())
266 .then_ignore(end_expr())
267 .collect::<String>()
268 .map(Literal::Date);
269
270 let time = dt_prefix
271 .ignore_then(time_inner.clone())
272 .then_ignore(end_expr())
273 .collect::<String>()
274 .map(Literal::Time);
275
276 let datetime = dt_prefix
277 .ignore_then(date_inner)
278 .chain(just('T'))
279 .chain::<char, _, _>(time_inner)
280 .then_ignore(end_expr())
281 .collect::<String>()
282 .map(Literal::Timestamp);
283
284 choice((
285 string,
286 raw_string,
287 value_and_unit,
288 number,
289 bool,
290 null,
291 datetime,
292 date,
293 time,
294 ))
295}
296
297fn quoted_string(escaped: bool) -> impl Parser<char, String, Error = Cheap<char>> {
298 choice((
299 quoted_string_of_quote(&'"', escaped),
300 quoted_string_of_quote(&'\'', escaped),
301 ))
302 .collect::<String>()
303 .labelled("string")
304}
305
306fn quoted_string_of_quote(
307 quote: &char,
308 escaping: bool,
309) -> impl Parser<char, Vec<char>, Error = Cheap<char>> + '_ {
310 let opening = just(*quote).repeated().at_least(1);
311
312 opening.then_with(move |opening| {
313 if opening.len() % 2 == 0 {
314 return (just(vec![])).boxed();
316 }
317 let delimiter = just(*quote).repeated().exactly(opening.len());
318
319 let inner = if escaping {
320 choice((
321 (delimiter.or(just(vec!['\\']))).not(),
324 escaped_character(),
325 just('\\').ignore_then(just(*quote)),
327 ))
328 .boxed()
329 } else {
330 delimiter.not().boxed()
331 };
332
333 inner.repeated().then_ignore(delimiter).boxed()
334 })
335}
336
337fn escaped_character() -> impl Parser<char, char, Error = Cheap<char>> {
338 just('\\').ignore_then(choice((
339 just('\\'),
340 just('/'),
341 just('b').to('\x08'),
342 just('f').to('\x0C'),
343 just('n').to('\n'),
344 just('r').to('\r'),
345 just('t').to('\t'),
346 (just('u').ignore_then(
347 filter(|c: &char| c.is_ascii_hexdigit())
348 .repeated()
349 .exactly(4)
350 .collect::<String>()
351 .validate(|digits, span, emit| {
352 char::from_u32(u32::from_str_radix(&digits, 16).unwrap()).unwrap_or_else(|| {
353 emit(Cheap::expected_input_found(span, None, None));
354 '\u{FFFD}' })
356 }),
357 )),
358 )))
359}
360
361fn digits(count: usize) -> impl Parser<char, Vec<char>, Error = Cheap<char>> {
362 filter(|c: &char| c.is_ascii_digit())
363 .repeated()
364 .exactly(count)
365}
366
367fn end_expr() -> impl Parser<char, (), Error = Cheap<char>> {
368 choice((
369 end(),
370 one_of(",)]}\t >").ignored(),
371 newline(),
372 just("..").ignored(),
373 ))
374 .rewind()
375}
376
377impl Token {
378 pub fn range(bind_left: bool, bind_right: bool) -> Self {
379 Token::Range {
380 bind_left,
381 bind_right,
382 }
383 }
384}
385
386#[allow(clippy::derived_hash_with_manual_eq)]
391impl std::hash::Hash for Token {
392 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
393 core::mem::discriminant(self).hash(state);
394 }
395}
396
// Marker impl asserting that the derived `PartialEq` is a full equivalence relation.
// NOTE(review): `Token` can carry float literals, and a NaN would break reflexivity.
// The number parser here only parses digit text, so NaN looks unreachable — confirm.
impl std::cmp::Eq for Token {}
398
/// A line that starts with a backslash continues the previous line: the lexer emits
/// no `NewLine` token for the wrap.
///
/// The whitespace inside the raw input strings is significant, because the asserted
/// spans are character offsets into those strings.
#[test]
fn test_line_wrap() {
    use insta::assert_debug_snapshot;

    // A plain wrap: newline, indentation, backslash.
    assert_debug_snapshot!(lexer().parse(r"5 +
    \ 3 "
        ).unwrap(), @r###"
    [
        (
            Literal(
                Integer(
                    5,
                ),
            ),
            0..1,
        ),
        (
            Control(
                '+',
            ),
            2..3,
        ),
        (
            Literal(
                Integer(
                    3,
                ),
            ),
            10..11,
        ),
    ]
    "###);

    // Comment-only lines between the wrapped line and the backslash are skipped too.
    assert_debug_snapshot!(lexer().parse(r"5 +
# comment
   # comment with whitespace
  \ 3 "
        ).unwrap(), @r###"
    [
        (
            Literal(
                Integer(
                    5,
                ),
            ),
            0..1,
        ),
        (
            Control(
                '+',
            ),
            2..3,
        ),
        (
            Literal(
                Integer(
                    3,
                ),
            ),
            47..48,
        ),
    ]
    "###);
}
465
/// Exercises `quoted_string` with different delimiter lengths, with and without escaping.
#[test]
fn quotes() {
    use insta::assert_snapshot;

    // Any odd number of quotes delimits a string.
    assert_snapshot!(quoted_string(false).parse(r#"'aoeu'"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''aoeu'''"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''''aoeu'''''"#).unwrap(), @"aoeu");
    assert_snapshot!(quoted_string(false).parse(r#"'''''''aoeu'''''''"#).unwrap(), @"aoeu");

    // An even number of quotes is read as an empty string.
    assert_snapshot!(quoted_string(false).parse(r#"''aoeu''"#).unwrap(), @"");

    // Without escaping, backslashes and inner quotes are kept verbatim.
    assert_snapshot!(quoted_string(false).parse(r#""""\"hello\""""#).unwrap(), @r###"\"hello\"###);

    // With escaping, `\"` is decoded to `"` inside a triple-quoted string.
    assert_snapshot!(quoted_string(true).parse(r#""""\"hello\"""""#).unwrap(), @r###""hello""###);

    // Escaped quotes also work inside single-delimiter strings, for both quote kinds.
    assert_snapshot!(quoted_string(true).parse(r#""\"hello\"""#).unwrap(), @r###""hello""###);
    assert_snapshot!(quoted_string(true).parse(r"'\'hello\''").unwrap(), @"'hello'");

    // Two quotes form an empty string.
    assert_snapshot!(quoted_string(true).parse(r#"''"#).unwrap(), @"");

    // Empty input is not a string.
    quoted_string(false).parse(r#""#).unwrap_err();

    // Six quotes (an even run) are also read as an empty string.
    assert_snapshot!(quoted_string(true).parse(r#"''''''"#).unwrap(), @"");
}