// saola-schema-ast 0.1.0

// ######################################
// Readme
// ######################################
// You will notice the pattern ( X ~ Y | X )
// instead of X ~ Y?. This is because we do not
// want the concat operator ~ between X and Y
// to consume any whitespace after X, if Y is not present.

// This is how PEG grammars work:
// https://pest.rs/book/grammars/peg.html

// This is the basic syntax of Pest grammar files:
// https://pest.rs/book/grammars/syntax.html#cheat-sheet

// ######################################
// Schema - the root of all rules
// ######################################
// Entry rule. Consumes the whole input (SOI .. EOI) as a sequence of
// top-level items. The alternatives are tried in the order listed, so
// CATCH_ALL, being last, only takes lines that nothing else could match.
schema = {
    SOI
    ~ (
        model_declaration
        | enum_declaration
        | config_block
        | type_alias
        | arbitrary_block
        | comment_block
        | empty_lines
        | CATCH_ALL
    )*
    ~ EOI
}

// ######################################
// Model and composite types
// ######################################

// At the syntax level, models and composite types are the same.
// Views go through this rule too: the leading keyword (MODEL_KEYWORD,
// TYPE_KEYWORD or VIEW_KEYWORD) is the only syntactic difference between them.
// Shape: <keyword> <identifier> { <model_contents> }
model_declaration = {
    (MODEL_KEYWORD | TYPE_KEYWORD | VIEW_KEYWORD)
    ~ identifier
    ~ BLOCK_OPEN
    ~ model_contents
    ~ BLOCK_CLOSE
    }

// One field line inside a model/type/view block: a name, an optional legacy
// colon, an optional type, any number of field attributes and an optional
// trailing comment. The line must end with NEWLINE.
// NOTE(review): field_type is optional, so a bare name parses as a field;
// presumably this lets a later stage report the missing type -- confirm.
field_declaration = {
    identifier
    ~ LEGACY_COLON?
    ~ field_type?
    ~ field_attribute*
    ~ trailing_comment?
    ~ NEWLINE
    }

// Body of a model/type/view block: any mix of field lines, block attributes
// (each on its own line), comments and blank lines. BLOCK_LEVEL_CATCH_ALL is
// tried last and takes any other line that does not start with the closing brace.
model_contents = {
    (
        field_declaration
        | (block_attribute ~ NEWLINE)
        | comment_block
        | empty_lines
        | BLOCK_LEVEL_CATCH_ALL
    )*
}

// ######################################
// Field Type
// ######################################

// PEG choice is ordered: the first alternative that matches wins and the
// remaining ones are not tried. Order is therefore very important here: the
// suffixed forms (`T[]?`, `T[]`, `T?`, `T!`) must be listed before the bare
// base_type, otherwise base_type would match first and leave the suffix unconsumed.
field_type = { unsupported_optional_list_type | list_type | optional_type | legacy_required_type | legacy_list_type | base_type  }

// `Unsupported("...")` wrapping a string literal.
unsupported_type = { "Unsupported(" ~ string_literal ~ ")" }
// The un-suffixed type: either an Unsupported(...) form or a plain identifier.
base_type = { unsupported_type | identifier } // Called base type to avoid clashing with the Rust keyword `type`
// `T[]?` -- tried first in field_type so it is not split into `T[]` plus a stray `?`.
unsupported_optional_list_type = { base_type ~ "[]" ~ "?" }
// `T[]`
list_type = { base_type ~ "[]" }
// `T?`
optional_type = { base_type ~ "?" }
// `T!`
legacy_required_type = { base_type ~ "!" }
// `[T]`
legacy_list_type = { "[" ~ base_type ~ "]" }

// ######################################
// Type Alias
// ######################################
// `type <identifier> = <base_type> <field_attribute>*`
// Starts with the same TYPE_KEYWORD as the block form in model_declaration;
// the "=" after the identifier (instead of a block) is what selects this rule.
type_alias = { TYPE_KEYWORD ~ identifier ~ "=" ~ base_type ~ field_attribute*  }

// ######################################
// Configuration blocks
// ######################################
// A `datasource <identifier> { ... }` or `generator <identifier> { ... }` block.
// Both keywords share this rule; the body is parsed by config_contents.
config_block = {
    (DATASOURCE_KEYWORD | GENERATOR_KEYWORD)
    ~ identifier
    ~ BLOCK_OPEN
    ~ config_contents
    ~ BLOCK_CLOSE
    }

// One `key = value` entry. The expression is optional, so `key =` with nothing
// after it still parses; a comment may follow on the same line.
key_value = {
    identifier
    ~ "="
    ~ expression?
    ~ trailing_comment?
}

// Body of a datasource/generator block: key/value entries (one per line),
// comments and blank lines. BLOCK_LEVEL_CATCH_ALL is tried last and takes any
// other line that does not start with the closing brace.
config_contents = {
    (
        (key_value ~ NEWLINE)
        | comment_block
        | empty_lines
        | BLOCK_LEVEL_CATCH_ALL
    )*
}

// A block definition without a keyword: an identifier followed directly by a
// block. This is not valid; the rule only acts as a catch so that the parser
// can display a nice error. The body is skipped up to the first BLOCK_CLOSE.
arbitrary_block = { identifier ~ BLOCK_OPEN ~ ((!BLOCK_CLOSE ~ ANY) | NEWLINE)* ~ BLOCK_CLOSE }

// ######################################
// Enum
// ######################################
// `enum <identifier> { <enum_contents> }`
enum_declaration = {
    ENUM_KEYWORD
    ~ identifier
    ~ BLOCK_OPEN
    ~ enum_contents
    ~ BLOCK_CLOSE
    }

// One enum value line: a name, any number of field attributes, an optional
// trailing comment, terminated by NEWLINE.
enum_value_declaration = {
    identifier
    ~ field_attribute*
    ~ trailing_comment?
    ~ NEWLINE
}

// Body of an enum block: value lines, block attributes (each on its own line),
// comments and blank lines. BLOCK_LEVEL_CATCH_ALL is tried last and takes any
// other line that does not start with the closing brace.
enum_contents = {
    (
        enum_value_declaration
        | (block_attribute ~ NEWLINE)
        | comment_block
        | empty_lines
        | BLOCK_LEVEL_CATCH_ALL
    )*
}

// ######################################
// Attributes
// ######################################
// `@@path(args)` -- used on its own line inside model and enum blocks.
// It carries its own optional trailing comment.
block_attribute = { "@@" ~ path ~ arguments_list? ~ trailing_comment? }
// `@path(args)` -- attached to a field, enum value or type alias. It has no
// trailing comment of its own; field and enum value lines supply one themselves.
field_attribute = { "@" ~ path ~ arguments_list? }

// ######################################
// Arguments
// ######################################
// Parenthesised, comma-separated arguments. The list may be empty and may end
// with a trailing comma, which is captured as its own trailing_comma token.
arguments_list = { "(" ~ (argument ~ ("," ~ argument)*)? ~ trailing_comma? ~ ")" }
// Silent rule (`_`): only the chosen alternative appears in the parse tree.
// named_argument must be tried before empty_argument: both start with
// `identifier ~ ":"`, so the shorter one would otherwise match first and
// leave the value behind.
argument = _{ named_argument | empty_argument | expression }
// `name:` with no value after the colon.
empty_argument = { identifier ~ ":" }
// `name: <expression>`
named_argument = { identifier ~ ":" ~ expression }
// Atomic (`@`) so the comma is matched as a single token.
trailing_comma = @{ "," }

// ######################################
// Comments and Documentation Comments
// ######################################
// One or more consecutive comments, each optionally followed by a NEWLINE.
// Compound-atomic (`$`): no implicit whitespace is skipped between parts, but
// the inner comment rules still produce their own tokens.
comment_block = ${ ((doc_comment | comment | multi_line_comment) ~ NEWLINE?)+ }
// A single comment at the end of a line. Also compound-atomic (`$`).
trailing_comment = ${ doc_comment | comment | multi_line_comment }
// `/// ...` up to the end of the line. Leading whitespace is matched explicitly
// because the `$` rules above, through which this rule is reached, do not skip it.
doc_comment = { WHITESPACE* ~ "///" ~ doc_content }
// `// ...` up to the end of the line. The negative lookahead keeps `///` from
// being matched here.
comment = { WHITESPACE* ~ (!"///") ~ "//" ~ doc_content }
// Everything up to, but not including, the next NEWLINE. May be empty.
doc_content = @{ (!NEWLINE ~ ANY)* }
// `/* ... */`, which may span several lines. Ends at the first `*/`.
multi_line_comment = {
    WHITESPACE*
    ~ "/*"
    ~ (!"*/" ~ ANY)*  // Match any characters until the closing */
    ~ "*/"
}

// ######################################
// shared building blocks
// ######################################
// Any Unicode letter or an ASCII digit.
unicode_alphanumeric = { LETTER | ASCII_DIGIT }
// Atomic. Starts with a letter or digit (so a leading digit is accepted),
// followed by any number of letters, digits, "_" or "-".
identifier = @{ unicode_alphanumeric ~ ( "_" | "-" | unicode_alphanumeric)* }
// Atomic. An identifier followed by "."-separated segments, e.g. `a.b.c`.
// Because `path?` is optional after each ".", a dangling dot such as `a.` is
// also accepted.
// NOTE(review): presumably intentional, to tolerate incomplete input -- confirm.
path = @{ identifier ~ ("." ~ path?)* }


// Implicit whitespace skipped between the parts of non-atomic rules.
// Newlines are deliberately not included: they are significant and matched
// explicitly through NEWLINE.
WHITESPACE = _{ SPACE_SEPARATOR | "\t" } // tabs are also whitespace
// Any line ending. "\r\n" is listed before "\r" so that a CRLF pair is
// consumed as one newline rather than as a lone "\r".
NEWLINE = _{ "\n" | "\r\n" | "\r" }
// One or more lines that contain nothing but whitespace. Atomic, so the
// whitespace is matched explicitly.
empty_lines = @{ (WHITESPACE* ~ NEWLINE)+ }

// Opening brace plus the rest of that line, including its NEWLINE.
// the any part is to not crash on comments next to an open block, see test `parse_comments_without_crasing_or_loosing_info`
BLOCK_OPEN = { "{" ~ (!NEWLINE ~ ANY)* ~ NEWLINE }
// Closing brace only; it does not consume anything after it.
BLOCK_CLOSE = { "}" }

// Keywords that introduce top-level declarations.
//
// Each keyword is atomic (`@`) and ends with a negative lookahead on identifier
// characters, so it only matches as a whole word. Every keyword is used as
// `KEYWORD ~ identifier`, and the implicit whitespace inserted by `~` may be
// empty; with a bare literal, `modelFoo {` would parse as a model named `Foo`
// and `enumColor {` as an enum named `Color`. With the lookahead such input no
// longer matches the keyword and is picked up by arbitrary_block / CATCH_ALL.
//
// The lookahead consumes nothing, so the keyword token still spans exactly the
// keyword text. The rule must be atomic: in a non-atomic rule, whitespace
// would be skipped before the lookahead and `model Foo` would be rejected.
ENUM_KEYWORD = @{ "enum" ~ !("_" | "-" | unicode_alphanumeric) }
MODEL_KEYWORD = @{ "model" ~ !("_" | "-" | unicode_alphanumeric) }
TYPE_KEYWORD = @{ "type" ~ !("_" | "-" | unicode_alphanumeric) }
VIEW_KEYWORD = @{ "view" ~ !("_" | "-" | unicode_alphanumeric) }
GENERATOR_KEYWORD = @{ "generator" ~ !("_" | "-" | unicode_alphanumeric) }
DATASOURCE_KEYWORD = @{ "datasource" ~ !("_" | "-" | unicode_alphanumeric) }
// Optional ":" between a field name and its type (see field_declaration).
LEGACY_COLON = { ":" }

// Fallback: the rest of the current line (at least one character), plus its
// NEWLINE if there is one. It is listed last in `schema`, so it only takes
// lines that no other rule could match.
CATCH_ALL = { (!NEWLINE ~ ANY)+ ~ NEWLINE? }
// Same fallback for use inside blocks; refuses to start at the closing brace
// so that it cannot swallow the end of the enclosing block.
BLOCK_LEVEL_CATCH_ALL = { !BLOCK_CLOSE ~ CATCH_ALL }

// ######################################
// Expressions & Functions
// ######################################
// `path(args)` -- a path immediately followed by an argument list.
function_call = { path ~ arguments_list }
// `[expr, expr, ...]`; may be empty. No trailing comma is allowed here.
array_expression = { "[" ~ (expression ~ ( "," ~ expression )*)? ~ "]" }
// Ordered choice. function_call comes before path because both start with a
// path: the other way round, path would match first and leave the argument
// list unconsumed. numeric_literal also comes before path because identifier
// accepts leading digits.
expression = { function_call | array_expression | numeric_literal | string_literal | path }

// ######################################
// Literals / Values
// ######################################

// Atomic. An optional leading minus, one or more digits, and an optional
// fractional part that needs at least one digit after the dot.
// Accepts `1`, `-1`, `1.5`; does not accept `.5` or `1.`.
numeric_literal = @{
    "-"?
    ~ ASCII_DIGIT+
    ~ ("." ~ ASCII_DIGIT+)?
}

// String literals. The grammar is modeled on JSON strings: double-quoted,
// backslash escapes, and no raw ASCII control characters inside.
// Note that it is more permissive than JSON in one respect: string_escape
// accepts a backslash followed by ANY character, not only JSON's fixed set of
// escapes.
// NOTE(review): presumably escape validation happens after parsing -- confirm.
// References:
// - https://datatracker.ietf.org/doc/html/rfc8259
// - https://www.json.org/json-en.html
ASCII_CONTROL_CHARACTER = _{ '\u{0000}'..'\u{001F}' }
// A backslash plus the character after it, so `\"` does not end the string.
string_escape = _{ "\\" ~ ANY }
// Everything between the quotes: escapes, or any character that is neither a
// double quote nor an ASCII control character. May be empty.
string_content = @{ (string_escape | !("\"" | ASCII_CONTROL_CHARACTER) ~ ANY)* }
// Compound-atomic (`$`): no implicit whitespace is skipped inside the quotes,
// and string_content is still emitted as an inner token.
string_literal = ${ "\"" ~ string_content ~ "\"" }