// prql-compiler 0.4.2

// Implicit whitespace between tokens: a space or a tab. Newlines are not part of
// this rule; the rules below refer to NEWLINE explicitly wherever one is allowed.
WHITESPACE = _{
    " "
    | "\t"
}

// TODO: maybe pass comments to AST (and potentially put them into SQL comments)
// A comment starts at `#` and runs up to (but does not include) the next NEWLINE.
COMMENT = _{
    "#"
    ~ (!NEWLINE ~ ANY)*
}

// Entry rule. In order: optional leading newlines, an optional `prql` header,
// any number of function / variable definitions, an optional main pipeline, and
// then the end of input.
statements = _{
    SOI
    ~ NEWLINE*
    ~ query_def?
    ~ (func_def | var_def)*
    ~ pipeline_stmt?
    ~ EOI
}

// The header: the `prql` keyword, zero or more named args, and at least one
// terminating newline.
query_def = {
    "prql"
    ~ named_arg*
    ~ NEWLINE+
}

// A function definition: `func <name> <params> -> <body>`. It is terminated by
// one or more newlines, or by the end of input (which is checked, not consumed).
func_def = {
    "func"
    ~ func_def_name
    ~ func_def_params
    ~ "->"
    ~ expr_call
    ~ (NEWLINE+ | &EOI)
}

// The function's name, optionally followed by a `type_def`.
func_def_name = {
    ident_part
    ~ type_def?
}
// Zero or more parameters.
func_def_params = {
    func_def_param*
}
// One parameter: a name, an optional `type_def`, and an optional `: expr`
// (presumably the parameter's default value -- confirm against the AST builder).
func_def_param = {
    ident_part
    ~ type_def?
    ~ (":" ~ expr)?
}
// `<term>` or `<term | term | ...>`.
type_def = {
    "<"
    ~ type_term
    ~ ("|" ~ type_term)*
    ~ ">"
}
// A name which may itself carry a nested `type_def`.
type_term = {
    ident_part
    ~ type_def?
}

// `let <name> = <expr_call>`, terminated by newline(s) or by the end of input
// (checked, not consumed).
var_def = {
    "let"
    ~ ident_part
    ~ "="
    ~ expr_call
    ~ (NEWLINE+ | &EOI)
}

// A pipeline used as a statement, with the same terminator as the definitions.
pipeline_stmt = {
    pipeline
    ~ (NEWLINE+ | &EOI)
}

// An ident is a sequence of word-like terms, separated by `.`. Where a term is
// surrounded by backticks it is taken as-is, including any periods it contains.
// `e.*` is allowed, but a bare `*` is not, since it would conflict with multiply
// in some cases: `ident_star` is only accepted after a `.`.
// An ident may not begin with an operator.
ident = ${
    !operator
    ~ (ident_plain | ident_backticks)
    ~ (
        "."
        ~ (ident_plain | ident_backticks | ident_star)
    )*
}
// A single name, without any `.`-separated path.
ident_part = ${
    ident_plain
    | ident_backticks
}
// A normal ident starts with an ASCII letter, `$` or `_`, and continues with
// ASCII letters, digits or `_`.
ident_plain = {
    (ASCII_ALPHA | "$" | "_")
    ~ (ASCII_ALPHANUMERIC | "_")*
}
// Any text surrounded by backticks. The opening backtick is pushed onto the
// stack, so the PEEK inside `string_inner` refers to the backtick; the rule
// itself is silent, so only the `string_inner` pair is produced.
ident_backticks = _{
    PUSH("`")
    ~ (!NEWLINE ~ string_inner)*
    ~ POP
}
// `*` has its own non-silent rule so that it is still captured as a pair.
// NOTE(review): the previous comment here referred to a rule named
// `ident_part_next`, which is not defined in the grammar visible here.
ident_star = {
    "*"
}

// Pipeline steps are separated either by one or more newlines or by `|`.
pipe = _{
    NEWLINE+
    | "|"
}
// One or more `expr_call`s joined by `pipe`, with optional leading whitespace.
pipeline = {
    WHITESPACE*
    ~ expr_call
    ~ (pipe ~ expr_call)*
}

// A function call: a function name (`ident`) followed by one or more arguments.
// The rule is compound-atomic (`$`), so whitespace is matched explicitly here
// rather than implicitly:
// - At least one WHITESPACE is required after the name. This prevents an
//   s-string such as s"string" from being matched as a call to `s`.
// - An argument may not start with an `operator_binary`, so that `a - b` can't
//   parse as `a` applied to `-b`. `select ![a]` is still allowed, because `!` on
//   its own is only a unary operator (`!=` is the binary one).
// - Each argument is a `named_arg`, an `alias` or an `expr`, tried in that order,
//   and may be followed by optional whitespace.
func_call = ${ ident ~ WHITESPACE+ ~ (!operator_binary ~ (named_arg | alias | expr) ~ WHITESPACE*)+ }

// `name: expr`. A second `:` directly after the first is rejected.
named_arg = !{
    ident_part
    ~ ":"
    ~ !":"
    ~ expr
}
// alias needs to be distinct from assign, so that in `join s=salaries [==id]`,
// `s=salaries` is parsed separately from `[==id]`: an alias takes an expr as its
// rvalue, but not a function call, whereas an assign takes an `expr_call`.
// In both rules `!"="` stops `==` from being read as `=` followed by `=`.
alias = !{
    ident_part
    ~ "="
    ~ !"="
    ~ expr
}
assign = !{
    ident_part
    ~ "="
    ~ !"="
    ~ expr_call
}

// Either a function call or a plain expression; the function call is tried first.
expr_call = _{
    func_call
    | expr
}

// Binary expressions, from the outermost level (logical) down to the innermost
// (multiplication). Each level is one operand of the next level down, optionally
// followed by an operator and a right-hand side. Every level except
// `expr_compare` recurses into itself on the right; `expr_compare` takes a
// single `expr_add` on each side.
expr = !{
    expr_coalesce
    ~ (operator_logical ~ expr)?
}
expr_coalesce = {
    expr_compare
    ~ (operator_coalesce ~ expr_coalesce)?
}
expr_compare = {
    expr_add
    ~ (operator_compare ~ expr_add)?
}
expr_add = {
    expr_mul
    ~ (operator_add ~ expr_add)?
}
expr_mul = {
    term
    ~ (operator_mul ~ expr_mul)?
}

// The operands of an expression. Alternatives are tried top to bottom, so the
// order matters (e.g. `range` and `literal` are attempted before `ident`).
term = _{
    switch
    | s_string
    | f_string
    | range
    | literal
    | ident
    | nested_pipeline
    | expr_unary
    | list
    | jinja
}
// A unary operator applied to a nested pipeline, an ident or a list.
expr_unary = {
    operator_unary
    ~ (nested_pipeline | ident | list)
}
// Literal values; a literal may also be wrapped in parentheses.
literal = _{
    value_and_unit
    | number
    | boolean
    | null
    | string
    | timestamp
    | date
    | time
    | "(" ~ literal ~ ")"
}
// `assign | pipeline` based on discussion in #648
// A bracketed, comma-separated list of items. Newlines are allowed before each
// item and before the closing bracket; a trailing comma is allowed; the list may
// be empty.
list = {
    "["
    ~ (
        NEWLINE*
        ~ (assign | pipeline)
        ~ ("," ~ NEWLINE* ~ (assign | pipeline))*
        ~ ","?
    )?
    ~ NEWLINE*
    ~ "]"
}
// A parenthesised, optional pipeline, with whitespace / newlines allowed just
// inside each parenthesis.
nested_pipeline = {
    "("
    ~ (WHITESPACE | NEWLINE)*
    ~ pipeline?
    ~ (WHITESPACE | NEWLINE)*
    ~ ")"
}

// We haven't implemented escapes — I think we can mostly pass those through to
// SQL, but there may be things we're missing.
// https://pest.rs/book/examples/rust/literals.html

// We need to have a non-silent rule which contains the quotes
// — `string` in this case — because of
// https://github.com/pest-parser/pest/issues/583. Then when converting to AST,
// we only keep the `string_inner` and discard the `string` given it contains
// the quotes.
//
// TODO: I'm still a bit unclear how preceding and trailing spaces are working
// -- it seems that inner spaces are included without an atomic operator (or
// with `ANY`), but preceding & trailing spaces require both `ANY` _and_ an
// atomic operator. We have some rudimentary tests for these.

// A single `"` or `'`.
single_quote = _{
    "\""
    | "'"
}
// Three or more of the same quote character.
multi_quote = _{
    "\""{3,}
    | "'"{3,}
}
// The opening quote is pushed onto the stack so that the same text can be
// required as the closing quote. The multi-quote form is tried first.
opening_quote = _{
    PUSH(multi_quote)
    | PUSH(single_quote)
}
// PEEK refers to the opening quote; `"` or `'` or multiple quotes.
// One or more characters, stopping before the next occurrence of that quote.
string_inner = {
    (!PEEK ~ ANY)+
}
// Either three or more quotes, or just one. Currently both of those can be
// multiline. The contents may be empty.
string = ${
    opening_quote
    ~ string_inner?
    ~ POP
}

// We can use underscores within numbers
number_char = _{ ASCII_DIGIT | "_" }
// Numbers need to start and end with a digit. We implement the end by matching
// either a number_char that's not the final char, or a final char that's a digit.
// Reading the rule left to right: a leading digit; then, optionally, a run of
// number_chars each of which is followed by another number_char, closed by a
// digit which is not followed by a number_char. So `1`, `10` and `1_000` match
// in full, whereas for `1_` only the leading `1` is matched.
number_part = _{ ( ASCII_DIGIT ) ~ (( number_char ~ &number_char )* ~ ( ASCII_DIGIT ~ !number_char ))? }
// An optional sign (`operator_add`), an integer part, and an optional fractional
// part after a `.`. The rule is compound-atomic, so no whitespace is allowed
// between these pieces.
number = ${ operator_add? ~ number_part ~ ("." ~ number_part)? }

// The keywords `true` and `false`.
// `literal` is tried before `ident` in `term`, so without a guard an identifier
// which merely starts with a keyword (e.g. `true_positive`) would have its
// prefix matched as a boolean, leaving `_positive` to be parsed as a separate
// term. The negative lookahead rejects the keyword when it is directly followed
// by a character that can continue a plain ident (see `ident_plain`). The
// lookahead consumes nothing, so the matched text is still exactly `true` or
// `false`.
boolean = ${ ("true" | "false") ~ !(ASCII_ALPHANUMERIC | "_") }

// The keyword `null`, guarded in the same way so that an identifier such as
// `nullable_col` is parsed as an ident rather than as `null` followed by
// `able_col`.
null = ${ "null" ~ !(ASCII_ALPHANUMERIC | "_") }

// `start..end`. The rule is compound-atomic, so no whitespace is allowed around
// the `..`.
range = ${
    range_edge
    ~ ".."
    ~ range_edge
}
// Either edge may be omitted; when present it is a literal.
range_edge = ${
    literal?
}

// Any operator; the unary alternatives are tried before the binary ones.
operator = _{
    operator_unary
    | operator_binary
}
operator_binary = _{
    operator_mul
    | operator_add
    | operator_compare
    | operator_logical
    | operator_coalesce
}
// Note that `==` is accepted as a unary operator too (as in `[==id]`).
operator_unary = ${
    "-"
    | "+"
    | "!"
    | "=="
}
operator_mul = ${
    "*"
    | "/"
    | "%"
}
operator_add = ${
    "+"
    | "-"
}
// The two-character operators are listed before `>` and `<`, so that `>=` is
// not matched as `>` alone.
operator_compare = ${
    "=="
    | "!="
    | ">="
    | "<="
    | ">"
    | "<"
}
// `and` / `or` must be followed by whitespace, which is checked but not consumed.
operator_logical = ${
    ("and" | "or")
    ~ &WHITESPACE
}
operator_coalesce = ${
    "??"
}

// If we have lots more string prefixes then we could just have a type
// `prefixed` string and parse in the parser, but manageable for now.
// Both forms are a one-letter prefix, an opening quote (pushed onto the stack by
// `opening_quote`), the interpolated contents, and the matching closing quote.
s_string = ${
    "s"
    ~ opening_quote
    ~ interpolate_string_inner
    ~ POP
}
f_string = ${
    "f"
    ~ opening_quote
    ~ interpolate_string_inner
    ~ POP
}
// Any mix of literal text, `{{...}}` double-bracket sections and `{pipeline}`
// interpolations; the alternatives are tried in that order.
interpolate_string_inner = _{
    (
        interpolate_string_inner_literal
        | interpolate_double_bracket
        | ("{" ~ pipeline ~ "}")
    )*
}
// We want to strip the outer `{}` of `{{}}`, so we make a silent rule and then
// an inner non-silent rule.
interpolate_double_bracket = _{
    "{"
    ~ interpolate_double_bracket_literal
    ~ "}"
}
interpolate_double_bracket_literal = {
    "{"
    ~ (!"}}" ~ ANY)+
    ~ "}"
}
// Literal text: one or more characters up to the closing quote (PEEK) or a `{`.
interpolate_string_inner_literal = {
    (!(PEEK | "{") ~ ANY)+
}

// The unit names accepted directly after a number.
unit = {
    "microseconds"
    | "milliseconds"
    | "seconds"
    | "minutes"
    | "hours"
    | "days"
    | "weeks"
    | "months"
    | "years"
}
// A number immediately followed by a unit. The rule is compound-atomic, so no
// whitespace is allowed between the two.
value_and_unit = ${
    number
    ~ unit
}

// Date, time and timestamp literals all start with `@`. Each one requires an
// `end_expr` directly after the value; the `end_expr` is checked but not consumed.
date = ${ "@" ~ date_inner ~ &end_expr }
time = ${ "@" ~ time_inner ~ &end_expr }
timestamp = ${ "@" ~ timestamp_inner ~ &end_expr }
// We use the `inner` types as containing the data that we want to retain in the AST.
// A date is four digits, `-`, two digits, `-`, two digits.
date_inner = ${ ASCII_DIGIT{4} ~ "-" ~ ASCII_DIGIT{2} ~ "-" ~ ASCII_DIGIT{2} }
// Times are liberally defined atm, we could make this more robust.
// As written: two digits; then any number of groups, each a `:` or `.` followed
// by zero or more digits; then optionally either a `+` / `-` followed by any mix
// of digits and `:`, or a `Z`.
time_inner = ${ ASCII_DIGIT{2} ~ (( ":" | "." ) ~ ASCII_DIGIT* )* ~ ((( "+" | "-" ) ~ (ASCII_DIGIT | ":" )*) | "Z")? }
// A timestamp is a date and a time joined by `T`.
timestamp_inner = ${ date_inner ~ "T" ~ time_inner }

// We can use this when we want to ensure that something is ending, like a date,
// so that `@20-01-0` isn't treated like a time `@20-01` `-` (minus) `0`.
// (Not sure whether `..` should be here or in the items that allow it; feel
// free to demote it to those items if `end_expr` is used somewhere where it's
// not supported)
end_expr = _{
    WHITESPACE
    | ","
    | ")"
    | "]"
    | EOI
    | NEWLINE
    | ".."
}

// We pass text between `{{` and `}}` through, so dbt can use Jinja.
jinja = {
    "{{"
    ~ (!"}}" ~ ANY)*
    ~ "}}"
}

// `switch [ case, case, ... ]`. Newlines are allowed before each case and before
// the closing bracket; a trailing comma is allowed; the case list may be empty.
switch = {
    "switch"
    ~ "["
    ~ (
        NEWLINE*
        ~ switch_case
        ~ ("," ~ NEWLINE* ~ switch_case)*
        ~ ","?
    )?
    ~ NEWLINE*
    ~ "]"
}
// One case: `<expr_call> -> <expr_call>`.
switch_case = {
    expr_call
    ~ "->"
    ~ expr_call
}