flowlog-build 0.3.0

// =============================================================================
// FlowLog GRAMMAR DEFINITION
// =============================================================================
// This grammar defines the syntax for FlowLog, a Datalog language engine
// for declarative data processing and rule-based queries.

// Main grammar entry point
main_grammar = {
    SOI ~
    (declaration | type_alias_decl | extern_fn | include_directive | input_directive | output_directive | printsize_directive | comp_decl | init_decl | fixpoint_block | loop_block | fact | rule)* ~
    EOI
}

// =============================================================================
// BASIC ELEMENTS (IDENTIFIERS, LITERALS, TYPES)
// =============================================================================

// Identifier pattern - must start with letter or underscore, 
// can contain letters, numbers, underscores
// Valid examples: x, _temp, myVar123, _private_var
// Invalid examples: 1x, x-yz (hyphen not allowed)
identifier = @{ "_"? ~ ASCII_ALPHA+ ~ (ASCII_ALPHANUMERIC | "_")* }

// Variable reference - variables used in rules and queries
variable = { identifier }

// Attribute name - names of relation attributes (columns)
attribute_name = { identifier }

// Relation name as it appears at a *defining* site (`.decl`). Always
// a single identifier — dots are reserved for inliner-produced names.
relation_name = @{ identifier }

// Relation reference at a *use* site (atom, head, directive target,
// loop condition, iterative list). Accepts the dotted form so user
// code can refer to inliner-produced relations like `c.Holds`.
relation_ref = @{ identifier ~ ("." ~ identifier)* }

// Type at a `.decl` attribute. `data_type` must come first so reserved
// keywords (`number`, `symbol`, …) win over the catch-all identifier.
type_ref = { data_type | alias_name }

// A user-declared type name at a *use* site. Resolved against the
// [`TypeRegistry`] after parsing — includes top-level `.type` aliases
// and per-instance prefixed types registered by the component inliner
// (`a.cfg.context`, etc.). Dotted forms support comp-type-parameter
// references like `.decl R(c: cfg.Context)`.
alias_name = @{ identifier ~ ("." ~ identifier)* }

// Data type specification
data_type = { integer8_type | integer16_type | integer32_type | integer64_type
            | uinteger8_type | uinteger16_type | uinteger32_type | uinteger64_type
            | float32_type | float64_type
            | string_type | bool_type }

// 8-bit signed integer data type
integer8_type = { "int8" }

// 16-bit signed integer data type
integer16_type = { "int16" }

// 32-bit signed integer data type ("number" is an alias for "int32")
integer32_type = { "int32" | "number" }

// 64-bit signed integer data type
integer64_type = { "int64" }

// 8-bit unsigned integer data type
uinteger8_type = { "uint8" }

// 16-bit unsigned integer data type
uinteger16_type = { "uint16" }

// 32-bit unsigned integer data type ("unsigned" is an alias for "uint32")
uinteger32_type = { "uint32" | "unsigned" }

// 64-bit unsigned integer data type
uinteger64_type = { "uint64" }

// 32-bit floating point data type ("float" is an alias for "f32")
float32_type = { "float" | "f32" }

// 64-bit floating point data type
float64_type = { "f64" }

// Text data type for strings ("symbol" is an alias for "string")
string_type = { "string" | "symbol" }

// Boolean data type
bool_type = { "bool" }

// Constant values
constant = { boolean | float | integer | string } 

// Float with optional sign, must contain a decimal point
// Examples: 1.5, -3.14, +0.0
float = @{ ("+" | "-")? ~ ASCII_DIGIT+ ~ "." ~ ASCII_DIGIT+ }

// Integer with optional sign
// Examples: 42, -17, +100
integer = @{ ("+" | "-")? ~ ASCII_DIGIT+ }

// String enclosed in double quotes
// Example: "hello world"
string = { "\"" ~ (!"\"" ~ ANY)* ~ "\"" }

// Placeholder for unused variables (underscore)
// Used when you don't need to reference a variable
placeholder = { "_" } 

// Boolean literal values
boolean = { ("True" | "False") ~ !(ASCII_ALPHANUMERIC | "_") } 

// =============================================================================
// RELATION DECLARATIONS
// =============================================================================
// Relations are the core data structures in FlowLog. They can be:
// - EDB (Extensional Database): Input relations with external data
// - IDB (Intentional Database): Computed relations defined by rules

// Declaration only defines the relation schema. I/O directives are
// separate and can appear anywhere in the program. Optional trailing
// `overridable` is only meaningful inside a `.comp` body.
declaration = {
    ".decl" ~ relation_name ~ "(" ~ attributes_decl? ~ ")" ~ overridable_kw?
}

// Lets a subcomponent's `.override` replace this relation's rules.
overridable_kw = @{ "overridable" ~ !(ASCII_ALPHANUMERIC | "_") }

// Input directive for EDB relations - separate from declaration
// Example: .input Relation(IO="file", filename="data.csv", delimiter=",")
input_directive = {
    ".input" ~ relation_ref ~ "(" ~ io_params? ~ ")"
}

// Output directive for relations
// Example: .output Relation
// Example: .output Relation(delimiter="|")
output_directive = {
    ".output" ~ relation_ref ~ ("(" ~ io_params? ~ ")")?
}

// Shared key=value parameter list used by both input and output directives
io_params = { io_param ~ ("," ~ io_param)* }

// Individual parameter
io_param = { parameter_name ~ "=" ~ parameter_value }

// Parameter name (IO, filename, delimiter, etc.)
parameter_name = { identifier }

// Parameter value (quoted string)
parameter_value = { string }

// Print size directive for relations - separate from declaration
// Example: .printsize Relation
printsize_directive = {
    ".printsize" ~ relation_ref
}

// Include directive — pull another .dl file into the current program.
// Path is resolved relative to the including file's directory.
// Example: .include "lib/base.dl"
include_directive = { ".include" ~ string }

// Comma-separated list of attribute declarations
// Example: src: number, dst: number, weight: string
attributes_decl = { attribute_decl ~ ("," ~ attribute_decl)* }

// Single attribute with name and type. Type is either a built-in primitive
// or a `.type`-declared alias name (resolved transitively at parse time).
// Example: "src: number" or "src: NodeId" where `NodeId` is a `.type` alias.
attribute_decl = { attribute_name ~ ":" ~ type_ref }

// =============================================================================
// USER-DEFINED EXTERNAL FUNCTION DECLARATIONS
// =============================================================================
//   .extern fn   name(param1: type1, ...) -> ret_type

extern_fn = {
    ".extern" ~ "fn" ~ identifier ~
    "(" ~ extern_fn_params? ~ ")" ~
    "->" ~ data_type
}

extern_fn_params = { extern_fn_param ~ ("," ~ extern_fn_param)* }

extern_fn_param = { identifier ~ ":" ~ data_type }

// =============================================================================
// ARITHMETIC EXPRESSIONS
// =============================================================================
// Mathematical expressions with basic operators

// Simple arithmetic without precedence
// Note: "x + y * z" is parsed left-to-right as "(x + y) * z"
arithmetic_expr = { factor ~ (arithmetic_op ~ factor)* }

// `as_cast` first so `as` wins over a UDF named `as` (reserved keyword).
factor = { as_cast | builtin_fn_call | fn_call_expr | constant | variable }

// `as(factor, TargetType)` — explicit cast, runtime no-op. Inner is a
// single `factor` (not `arithmetic_expr`) so the typechecker can lower
// `as(...)` away after validation; for `as(x + 1, T)`, bind first.
as_cast = { as_kw ~ "(" ~ factor ~ "," ~ type_ref ~ ")" }
as_kw = _{ "as" ~ !(ASCII_ALPHANUMERIC | "_") }

// Arithmetic operators
arithmetic_op = { 
    plus |    // +
    minus |   // -
    times |   // *
    divide |  // /
    modulo |  // %
    cat       // string concatenation
}

// Individual arithmetic operators
plus = { "+" }
minus = { "-" }
times = { "*" }
divide = { "/" }
modulo = { "%" }
cat = { "cat" }

// =============================================================================
// BUILT-IN FUNCTIONS
// =============================================================================
// Reserved keyword call sites lowered inline by the codegen — they are
// NOT user functions and share no infrastructure with `.extern fn`

builtin_fn_call = {
    builtin_op ~ "(" ~ (arithmetic_expr ~ ("," ~ arithmetic_expr)*)? ~ ")"
}

builtin_op = {
    strlen_op | substr_op | ord_op | contains_op | to_string_op | to_number_op
}

strlen_op    = @{ "strlen"    ~ !(ASCII_ALPHANUMERIC | "_") }
substr_op    = @{ "substr"    ~ !(ASCII_ALPHANUMERIC | "_") }
ord_op       = @{ "ord"       ~ !(ASCII_ALPHANUMERIC | "_") }
contains_op  = @{ "contains"  ~ !(ASCII_ALPHANUMERIC | "_") }
to_string_op = @{ "to_string" ~ !(ASCII_ALPHANUMERIC | "_") }
to_number_op = @{ "to_number" ~ !(ASCII_ALPHANUMERIC | "_") }

// =============================================================================
// COMPARISON EXPRESSIONS  
// =============================================================================
// Binary comparisons between arithmetic expressions

// Binary comparison between arithmetic expressions
// Examples: x + y > z + w, count(x) = 5
compare_expr = { arithmetic_expr ~ compare_op ~ arithmetic_expr }

// Comparison operators
compare_op = { 
    equal |               // =
    not_equal |           // !=
    greater_equal_than |  // >=
    greater_than |        // >
    less_equal_than |     // <=
    less_than             // <
}

// Individual comparison operators
equal = { "=" } 
not_equal = { "!=" }
greater_equal_than = { ">=" }
greater_than = { ">" }
less_equal_than = { "<=" }
less_than = { "<" }

// =============================================================================
// AGGREGATE EXPRESSIONS
// =============================================================================
// Aggregation functions for computing statistics over groups

// Aggregation operation on a value
// Examples: count(x), avg(x + y), sum(weight)
aggregate_expr = { 
    aggregate_op ~ 
    "(" ~ 
    arithmetic_expr ~
    ")" 
} 

// Supported aggregate operators
aggregate_op = { count | average | sum | min | max }
count = { "count" | "COUNT" }       // Count of elements
average = { "average" | "AVG" }     // Average value
sum = { "sum" | "SUM" }             // Sum of values
min = { "min" | "MIN" }             // Minimum value
max = { "max" | "MAX" }             // Maximum value

// =============================================================================
// ATOMS (RELATION REFERENCES)
// =============================================================================
// Atoms represent relation instances with specific arguments

// Relation with arguments
// Example: edge(x, y), person("Alice", 25)
atom = {
    relation_ref ~
    "(" ~
    (atom_arg ~ ("," ~ atom_arg)*)? ~
    ")"
}

// Negative atom for negative conditions
// Example: !edge(x, y)
negative_atom = { "!" ~ atom }

// Arguments in atoms can be constants, variables, or placeholders
atom_arg = { constant | variable | placeholder } 

// =============================================================================
// RULES SECTION
// =============================================================================
// Rules define how output relations are computed from input relations
// Format: head :- body.
// Example: reachable(x, z) :- edge(x, y), reachable(y, z).

// A fact is a ground tuple inserted directly into a relation.
// Example: edge(1, 2).
fact = { head ~ "." }

// Rule with optional multi-head and multi-body (Souffle-style).
// Commas separate alternative heads, semicolons seperate alternative bodies:
//   h1, h2 :- b1; b2.  →  h1:-b1. h1:-b2. h2:-b1. h2:-b2.
rule = { rule_heads ~ ":-" ~ rule_bodies ~ "." }

// One or more commas-separated heads
rule_heads = { head ~ ("," ~ head)* }

// One or more semicolon-separated bodies (disjunction)
rule_bodies = { predicates ~ (";" ~ predicates)* }

// Left side of rule (before :-)
// Example: reachable(x, z), stats(count(x))
head = {
    relation_ref ~
    "(" ~
    (head_arg ~ ("," ~ head_arg)*)? ~
    ")"
}

// Can be aggregate functions or arithmetic expressions (UDF calls are parsed as factors within arithmetic)
head_arg = { aggregate_expr | arithmetic_expr }

// User-defined function call in rule head
// Example: my_udf(x, y + 1)
fn_call_expr = {
    identifier ~
    "(" ~
    (arithmetic_expr ~ ("," ~ arithmetic_expr)*)? ~
    ")"
}

// Comma-separated list of predicates in the rule body
predicates = { predicate ~ ("," ~ predicate)* }

// Types of predicates allowed in rule bodies
// Note: compare_expr must come before atom because compare_expr can start with
// fn_call_expr (e.g., "cost(x) > 5") which atom would greedily consume.
predicate = {
    compare_expr |  // Comparison: x > y, cost(x) > 5
    atom |          // Positive atom: edge(x, y)
    negative_atom | // Negative atom: !edge(x, y)
    disjunction_group | // Nested disjunction: (A ; B)
    fn_call_expr    // UDF predicate: is_valid(x + 1, y)
}

// Parenthesised disjunction inside a body: `(A ; B)`.
disjunction_group = { "(" ~ rule_bodies ~ ")" }


// =============================================================================
// LOOP BLOCK / FIXPOINT BLOCK
// =============================================================================
// A fixpoint block evaluates rules until no new tuples are derived (delta
// becomes empty).  A loop block evaluates rules with bounded/conditional
// iteration.  Both act as hard evaluation barriers — the stratifier cannot
// move rules across their boundaries.
//
// Syntax:
//   fixpoint { rules... }
//   loop <condition> { rules... }
//
// The condition is composed of at most one `while` clause and one `until`
// clause, in either order:
//
//   while { iter_expr }  — keep looping while the iteration constraint holds.
//       iter_expr is one or more "@it op N" sub-conditions (joined by and/or)
//       that describe allowed iteration windows.
//         @it <= 6                  → run iterations 0..=6
//         @it >= 5 and @it <= 10   → run iterations 5..=10
//         @it < 5 or @it > 10      → run iterations 0..=4 and 11..
//
//   until { rel_expr }   — halt when the listed bool relation(s) become true.
//       rel_expr is one or more nullary relation names joined by and/or.
//         until { Done }            → stop when Done holds
//         until { Done1 or Done2 }  → stop when either holds
//
//   Combining while and until:
//     while W until U          — stop as soon as EITHER condition triggers (min)
//     while W or until U       — stop only when BOTH conditions trigger (max)
//
// Relations using replacement (iterative) semantics are declared inside the
// block with `.iterative <relation_name>` directives.  This is scoped per-block:
// the same relation can be iterative in one block and accumulative in another.
//
// Examples:
//   fixpoint { ... }
//   fixpoint { .iterative X  ... }
//   loop while { @it <= 10 } { ... }
//   loop until { Done } { .iterative X  ... }
//   loop while { @it <= 100 } until { Done } { ... }
//   loop while { @it <= 100 } or until { Done } { ... }
//   loop until { Done } while { @it >= 5 and @it <= 10 } { ... }

fixpoint_block = {
    "fixpoint" ~ "{" ~ iterative_directive* ~ rule* ~ "}"
}

loop_block = {
    "loop" ~ loop_condition ~ "{" ~ iterative_directive* ~ rule* ~ "}"
}

// .iterative <relation_name> — marks a relation as using replacement (new_from)
// semantics within this block.  Unlisted relations default to accumulative (new).
// Scoped per-block: the same relation can be iterative in one block and
// accumulative in another.
iterative_directive = { ".iterative" ~ relation_ref }

// At most one while clause and one until clause, in either order.
// If both are present without "or", stop when EITHER fires (min / and semantics).
// If joined by "or", stop when BOTH fire (max / or semantics).
loop_condition = {
    loop_while ~ (loop_or? ~ loop_until)?
  | loop_until ~ (loop_or? ~ loop_while)?
}

// while { iter_expr } — loop while iteration is in the specified window(s).
loop_while = { "while" ~ "{" ~ loop_iter_expr ~ "}" }

// until { rel_expr } — stop when the listed relation(s) become true.
loop_until = { "until" ~ "{" ~ loop_stop_group ~ "}" }

// One or more bool relations joined by and/or inside the until clause.
loop_stop_group = { loop_bool_relation ~ (loop_connective ~ loop_bool_relation)* }

// One or more "@it op N" sub-conditions joined by and/or.
// Resolved at parse time into a list of allowed iteration windows Vec<(u16,u16)>.
loop_iter_expr = {
    "@it" ~ loop_iter_compare_op ~ integer
    ~ (loop_connective ~ "@it" ~ loop_iter_compare_op ~ integer)*
}

// Comparison operators for iteration sub-conditions.
// Longer tokens must come before shorter ones to avoid premature match.
loop_iter_compare_op = { "<=" | ">=" | "==" | "<" | ">" }

// A single nullary (boolean) relation name used in an until clause.
loop_bool_relation = { relation_ref }

// Boolean connective joining clauses or sub-conditions.
loop_connective = { loop_and | loop_or }
loop_and        = @{ "and" ~ !(ASCII_ALPHANUMERIC | "_") }
loop_or         = @{ "or"  ~ !(ASCII_ALPHANUMERIC | "_") }

// =============================================================================
// TYPE DECLARATIONS (.type)
// =============================================================================
//   .type NodeId  = number    alias (transparent equivalence)
//   .type UserId <: number    subtype (distinct compile-time identity)
//
// Both forms are global, position-independent, and resolved at parse
// time to a primitive root. Subtypes are zero-cost: only the type-
// checker distinguishes them.

type_alias_decl = { ".type" ~ identifier ~ type_decl_op ~ type_ref }
type_decl_op = { subtype_op | alias_op }
subtype_op = { "<:" }
alias_op = { "=" }

// =============================================================================
// COMPONENT DECLARATIONS (.comp / .init)
// =============================================================================
//
//   .comp Container<T> {
//     .decl Holds(x: T)
//     Holds(x) :- Source(x).
//   }
//   .init c = Container<symbol>
//
// `.comp` declares a parameterized template; `.init` instantiates it
// under a name (e.g. `c`). The component inliner pass eliminates both
// before typechecking, leaving only prefixed primitive forms
// (`.decl c.holds(x: symbol)` etc.).

comp_decl = {
    ".comp" ~ identifier ~ comp_type_params? ~ comp_supertype? ~ "{" ~ comp_body_item* ~ "}"
}

// `<T>` or `<T, U>` — names of types this component is parameterized by.
comp_type_params = { "<" ~ identifier ~ ("," ~ identifier)* ~ ">" }

// `: BaseComp<args>` — inherit and splice another component's body.
comp_supertype = { ":" ~ identifier ~ comp_type_args? }

// `<symbol, NodeId>` — concrete type arguments. `type_ref` so dotted
// forms like `cfg.Context` work.
comp_type_args = { "<" ~ type_ref ~ ("," ~ type_ref)* ~ ">" }

// `.init c = Container<symbol>`
init_decl = { ".init" ~ identifier ~ "=" ~ identifier ~ comp_type_args? }

// Items allowed inside a component body. Mirrors the top-level grammar
// minus fixpoint/loop blocks and include directives (which are
// resolved at file-load time), plus `override_directive` which is
// comp-body-only.
comp_body_item = {
    declaration | type_alias_decl | input_directive | output_directive
    | printsize_directive | rule | fact | comp_decl | init_decl
    | override_directive
}

// Drop inherited rules and facts for `Name`; the parent's `.decl Name`
// (which must be `overridable`) is kept.
override_directive = { ".override" ~ identifier }

// =============================================================================
// WHITESPACE AND COMMENTS
// =============================================================================
// Rules for handling whitespace and comments (ignored during parsing)

// Whitespace and comments are ignored during parsing
WHITESPACE = _{ " " | "\t" | NEWLINE }

// Comment starts with # or // and continues until end of line
// Examples: # This is a comment, // Another comment
COMMENT = _{ ("#" | "//") ~ (!NEWLINE ~ ANY)* }