// velesdb-core 1.6.0
//
// High-performance vector database engine written in Rust
// Documentation
// VelesQL Grammar - SQL-like query language for VelesDB
// Version 0.2.0 - WITH clause support

// Implicit trivia. Both rules are silent (`_`), so they produce no pairs.
// Whitespace: space, tab, carriage return, or line feed.
WHITESPACE = _{
    " " | "\t" | "\r" | "\n"
}
// Line comment: `--` followed by everything up to (not including) the next "\n".
// NOTE(review): a node pattern immediately followed by `--` or `-->` (for
// example `(a)-->(b)`) looks like it would be skipped as a comment — confirm
// whether bracket-less relationships are meant to be supported.
COMMENT = _{
    "--" ~ (!"\n" ~ ANY)*
}

// Main entry point: exactly one statement covering the whole input (SOI..EOI),
// with an optional trailing ";".
// Accepted statements: MATCH, SELECT (optionally compound), TRAIN, INSERT, UPDATE.
query = {
    SOI ~
    (
        match_query |
        compound_query |
        train_stmt |
        insert_stmt |
        update_stmt
    ) ~
    ";"? ~
    EOI
}

// Graph pattern-matching query (EPIC-045 US-001).
// Shape: MATCH pattern [WHERE condition] RETURN items [ORDER BY ...] [LIMIT n]
// RETURN is mandatory; WHERE, ORDER BY and LIMIT are optional.
match_query = {
    ^"MATCH" ~
    graph_pattern ~
    where_clause? ~ return_clause ~
    order_by_clause? ~ limit_clause?
}

// Graph pattern: a node followed by zero or more (relationship, node) pairs,
// e.g. (node)-[rel]->(node)
graph_pattern = {
    node_pattern ~ (relationship_pattern ~ node_pattern)*
}

// Node: parentheses around an optional spec. Every part of the spec
// (alias, labels, property map) is itself optional.
node_pattern = {
    "(" ~ node_spec? ~ ")"
}
node_spec = {
    node_alias? ~ node_labels? ~ node_properties?
}

// Alias: letter or "_" first, then letters, digits or "_" (atomic).
node_alias = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")*
}

// One or more labels, each introduced by ":" (e.g. :Person:Employee).
node_labels = {
    ":" ~ label_name ~ (":" ~ label_name)*
}
label_name = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")*
}

// Property map: { key: value, ... } with at least one entry.
node_properties = {
    "{" ~ property_list ~ "}"
}
property_list = {
    property ~ ("," ~ property)*
}
property = {
    identifier ~ ":" ~ property_value
}
// `float` is listed before `integer` so "1.5" is not cut short at "1".
property_value = {
    string |
    float |
    integer |
    boolean |
    null_value |
    parameter
}

// Relationship between two nodes, e.g. -[r:TYPE*1..3]->
// Alternatives are tried in this order: incoming, outgoing, undirected.
relationship_pattern = {
    rel_incoming |
    rel_outgoing |
    rel_undirected
}
rel_incoming = {
    "<-" ~ rel_spec? ~ "-"
}
rel_outgoing = {
    "-" ~ rel_spec? ~ "->"
}
rel_undirected = {
    "-" ~ rel_spec? ~ "-"
}

// Bracketed details; every component inside the brackets is optional.
rel_spec = {
    "[" ~ rel_details? ~ "]"
}
rel_details = {
    rel_alias? ~ rel_types? ~ rel_range? ~ node_properties?
}
rel_alias = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")*
}

// One or more relationship types separated by "|" (e.g. :KNOWS|LIKES).
rel_types = {
    ":" ~ rel_type_name ~ ("|" ~ rel_type_name)*
}
rel_type_name = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")*
}

// Range after "*": nothing, `n..m`, `n..`, `..m`, or a single integer.
rel_range = {
    "*" ~ range_spec?
}
range_spec = {
    range_bound ~ ".." ~ range_bound? |
    ".." ~ range_bound |
    integer
}
range_bound = @{
    ASCII_DIGIT+
}

// RETURN clause of a MATCH query: comma-separated items, each with an
// optional `AS alias`.
return_clause = {
    ^"RETURN" ~ return_item_list
}
return_item_list = {
    return_item ~ ("," ~ return_item)*
}
return_item = {
    return_expr ~ (^"AS" ~ identifier)?
}
// `property_access` (a.b) is tried before a bare `identifier` so the dotted
// form is not cut short at the first name.
return_expr = {
    similarity_return |
    property_access |
    identifier |
    "*"
}
// Zero-argument similarity()
similarity_return = {
    ^"similarity" ~ "(" ~ ")"
}
// alias.property, atomic: no whitespace is allowed around the dot.
property_access = @{
    identifier ~ "." ~ identifier
}

// Compound query: one SELECT, optionally combined with a second SELECT through
// a single set operator.
compound_query = {
    select_stmt ~ (set_operator ~ select_stmt)?
}
// `UNION ALL` is listed before plain `UNION` so the longer form wins.
set_operator = {
    ^"UNION" ~ ^"ALL" |
    ^"UNION" |
    ^"INTERSECT" |
    ^"EXCEPT"
}

// INSERT statement: INSERT INTO table (col1, col2) VALUES (v1, v2)
// The column list and the value list must each hold at least one entry; the
// grammar does not check that the two lists have the same length.
insert_stmt = {
    ^"INSERT" ~ ^"INTO" ~
    identifier ~
    "(" ~ identifier ~ ("," ~ identifier)* ~ ")" ~
    ^"VALUES" ~
    "(" ~ value ~ ("," ~ value)* ~ ")"
}

// UPDATE statement: UPDATE table SET col1 = v1, col2 = v2 [WHERE ...]
// At least one assignment is required; WHERE is optional.
update_stmt = {
    ^"UPDATE" ~
    identifier ~
    ^"SET" ~
    assignment ~ ("," ~ assignment)* ~
    where_clause?
}
// Single `column = value` pair.
assignment = {
    identifier ~ "=" ~ value
}

// TRAIN statement: TRAIN QUANTIZER ON collection WITH (params)
// The WITH (...) option list is mandatory here.
train_stmt = {
    ^"TRAIN" ~ ^"QUANTIZER" ~ ^"ON" ~
    identifier ~
    with_clause
}

// SELECT statement. After the mandatory `SELECT ... FROM ...` head, the clauses
// must appear in exactly this order, each one optional (JOIN may repeat):
//   JOIN*, WHERE, GROUP BY, HAVING, ORDER BY, LIMIT, OFFSET, WITH, USING FUSION
// EPIC-052 US-001: Added DISTINCT support
// EPIC-052 US-003: Added FROM alias support for Self-JOIN
select_stmt = {
    ^"SELECT" ~ distinct_modifier? ~ select_list ~
    ^"FROM" ~ from_clause ~
    join_clause* ~
    where_clause? ~
    group_by_clause? ~
    having_clause? ~
    order_by_clause? ~
    limit_clause? ~
    offset_clause? ~
    with_clause? ~
    using_fusion_clause?
}

// FROM target with optional alias (EPIC-052 US-003: Self-JOIN support)
// Accepted forms: `FROM table` and `FROM table AS alias`.
// The bare form `FROM table alias` (no AS) is intentionally NOT supported,
// to avoid ambiguity with JOIN keywords.
from_clause = {
    identifier ~ from_alias?
}
from_alias = {
    ^"AS" ~ identifier
}

// DISTINCT modifier (EPIC-052 US-001)
// Atomic, with a trailing word-boundary check. Without the check, a column
// whose name merely starts with "distinct" was split in two:
// `SELECT distinct_count FROM t` parsed as DISTINCT + column `_count`.
// The negative lookahead consumes nothing, so the emitted pair still spans
// exactly the keyword.
distinct_modifier = @{ ^"DISTINCT" ~ !(ASCII_ALPHANUMERIC | "_") }

// Statement-level USING FUSION clause for hybrid search (EPIC-040 US-005)
// Shape: USING FUSION [ (key = value, ...) ]
using_fusion_clause = {
    ^"USING" ~ ^"FUSION" ~ fusion_options?
}
fusion_options = {
    "(" ~ fusion_option_list ~ ")"
}
fusion_option_list = {
    fusion_option ~ ("," ~ fusion_option)*
}
fusion_option = {
    identifier ~ "=" ~ fusion_value
}
// Option values: string or number (`float` tried before `integer`).
fusion_value = {
    string |
    float |
    integer
}

// GROUP BY clause (EPIC-017 US-003, EPIC-052 US-005: nested fields support)
group_by_clause = {
    ^"GROUP" ~ ^"BY" ~ group_by_list
}
group_by_list = {
    group_by_column ~ ("," ~ group_by_column)*
}
// A grouping key is a simple identifier (quoted forms included) or a dotted
// nested path.
group_by_column = {
    identifier ~ ("." ~ identifier)*
}

// HAVING clause for filtering groups (EPIC-017 US-006)
// A flat chain of terms joined by AND / OR; the grammar defines no grouping or
// precedence between the two operators.
having_clause = {
    ^"HAVING" ~ having_condition
}
having_condition = {
    having_term ~ (having_logical_op ~ having_term)*
}
// BUG-6 FIX: kept as a named rule so pest emits a token for each AND/OR.
having_logical_op = {
    ^"AND" | ^"OR"
}
// One comparison: aggregate(...) <op> value
having_term = {
    aggregate_function ~ compare_op ~ value
}

// JOIN clause for cross-store queries (EPIC-031 US-004, extended EPIC-040 US-003)
// Shape: [join_type] JOIN table [AS alias] (ON a.x = b.y | USING (col, ...))
join_clause = {
    join_type? ~ ^"JOIN" ~ identifier ~ alias_clause? ~ join_spec
}
// LEFT / RIGHT / FULL may be followed by an optional OUTER; INNER stands alone.
join_type = {
    (^"LEFT" ~ ^"OUTER"?) |
    (^"RIGHT" ~ ^"OUTER"?) |
    (^"FULL" ~ ^"OUTER"?) |
    ^"INNER"
}
join_spec = {
    on_clause | using_clause
}
on_clause = {
    ^"ON" ~ join_condition
}
using_clause = {
    ^"USING" ~ "(" ~ identifier ~ ("," ~ identifier)* ~ ")"
}
alias_clause = {
    ^"AS" ~ identifier
}
// Only equality between two qualified columns is accepted.
join_condition = {
    column_ref ~ "=" ~ column_ref
}
// table.column, atomic: no whitespace is allowed around the dot.
column_ref = @{
    identifier ~ "." ~ identifier
}

// ORDER BY clause (EPIC-040 US-002: supports columns, aggregates, similarity)
order_by_clause = {
    ^"ORDER" ~ ^"BY" ~ order_by_item ~ ("," ~ order_by_item)*
}
order_by_item = {
    order_by_expr ~ sort_direction?
}
// The two-argument similarity form is tried before the zero-argument form.
order_by_expr = {
    order_by_similarity |
    order_by_similarity_bare |
    aggregate_function |
    property_access |
    identifier
}
// similarity(field, vector)
order_by_similarity = {
    ^"similarity" ~ "(" ~ similarity_field ~ "," ~ vector_value ~ ")"
}
// similarity() zero-arg in ORDER BY: uses pre-computed search score
order_by_similarity_bare = {
    ^"similarity" ~ "(" ~ ")"
}
sort_direction = {
    ^"DESC" | ^"ASC"
}

// WITH clause for query-time configuration overrides
// Shape: WITH (key = value, ...) with at least one option.
with_clause = {
    ^"WITH" ~ "(" ~ with_option_list ~ ")"
}
with_option_list = {
    with_option ~ ("," ~ with_option)*
}
with_option = {
    identifier ~ "=" ~ with_value
}
// Literal values are tried first; a bare identifier is the last resort.
with_value = {
    string |
    float |
    integer |
    boolean |
    identifier
}

// Select list: either a lone `*` or a comma-separated list of items
// (columns and/or aggregations for GROUP BY).
select_list = {
    "*" | select_item_list
}

// Mixed select items: columns, aggregations, similarity(), and qualified wildcards
select_item_list = {
    select_item ~ ("," ~ select_item)*
}
// A plain `column` is tried last, after the more specific forms.
select_item = {
    similarity_select |
    aggregation_item |
    qualified_wildcard |
    column
}

// similarity() zero-arg in SELECT: SELECT similarity() [AS alias]
similarity_select = {
    ^"similarity" ~ "(" ~ ")" ~ (^"AS" ~ identifier)?
}

// Qualified wildcard: SELECT alias.* (e.g., SELECT ctx.*)
qualified_wildcard = {
    identifier ~ "." ~ "*"
}

// Aggregate functions: COUNT, SUM, AVG, MIN, MAX
// In a select list an aggregate may carry an `AS alias`.
aggregation_item = {
    aggregate_function ~ (^"AS" ~ identifier)?
}
aggregate_function = {
    aggregate_type ~ "(" ~ aggregate_arg ~ ")"
}
aggregate_type = {
    ^"COUNT" |
    ^"SUM" |
    ^"AVG" |
    ^"MIN" |
    ^"MAX"
}
// Argument is `*` or a (possibly nested) column name.
aggregate_arg = {
    "*" | column_name
}

// Plain column with optional `AS alias`.
column = {
    column_name ~ (^"AS" ~ identifier)?
}
// EPIC-052 US-005: Support nested field paths like metadata.source or profile.address.city
// Atomic: no whitespace is allowed around the dots.
column_name = @{
    identifier ~ ("." ~ identifier)*
}

// WHERE clause
where_clause = {
    ^"WHERE" ~ or_expr
}

// Boolean structure with precedence (OR < AND < primary): an OR-chain of
// AND-chains of primary predicates.
or_expr = {
    and_expr ~ (^"OR" ~ and_expr)*
}
and_expr = {
    primary_expr ~ (^"AND" ~ primary_expr)*
}
// Column reference used by WHERE predicates: identifier or dotted path.
// Non-atomic, so each path segment is emitted as its own `identifier` pair.
where_column = {
    identifier ~ ("." ~ identifier)*
}

// Primary WHERE predicate. PEG choice is ordered: the first alternative that
// matches wins.
//
// `not_expr` is deliberately tried LAST. `^"NOT"` carries no word boundary and
// pest permits zero whitespace between sequence elements, so when `not_expr`
// was tried early, a column whose name merely starts with "not"
// (`notes = 'x'`, `not_deleted = true`) was parsed as NOT (`es = 'x'`).
// Trying the column-based predicates first lets the full identifier be read
// as a column. Genuine negations still work: for `NOT x = 1` or `NOT (...)`
// the column-based alternatives read `NOT` as a column name, find no operator
// after it, fail, and fall through to `not_expr`.
primary_expr = {
    "(" ~ or_expr ~ ")" |
    graph_match_expr |
    similarity_expr |
    vector_fused_search |
    sparse_vector_search |
    vector_search |
    match_expr |
    in_expr |
    between_expr |
    like_expr |
    is_null_expr |
    compare_expr |
    not_expr
}

// Logical negation of a single primary predicate: NOT <primary_expr>
not_expr = { ^"NOT" ~ primary_expr }

// Graph predicate inside SELECT WHERE clause:
// WHERE ... AND MATCH (a)-[:REL]->(b)
graph_match_expr = {
    ^"MATCH" ~ graph_pattern
}

// Similarity predicate: similarity(field, vector) op threshold
// Used in hybrid graph-vector queries
// Note: threshold accepts both float (0.8) and integer (1) for user convenience
similarity_expr = {
    ^"similarity" ~
    "(" ~ similarity_field ~ "," ~ vector_value ~ ")" ~
    compare_op ~
    numeric_threshold
}
numeric_threshold = {
    float | integer
}
// Field name: letter or "_" first, then letters, digits, "_" or "." (atomic).
similarity_field = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_" | ".")*
}

// Sparse vector search: vector SPARSE_NEAR sparse_value [USING 'index-name']
sparse_vector_search = {
    ^"vector" ~ ^"SPARSE_NEAR" ~
    sparse_value ~
    (^"USING" ~ string)?
}

// Sparse value: inline literal {12: 0.8, 45: 0.3} or bind parameter $sv
sparse_value = {
    sparse_literal | parameter
}
// At least one `index: weight` entry is required.
sparse_literal = {
    "{" ~ sparse_entry ~ ("," ~ sparse_entry)* ~ "}"
}
// The index is an `integer`; the weight must be a `float` (with a decimal point).
sparse_entry = {
    integer ~ ":" ~ float
}

// Vector search: vector NEAR vector_value
// Note: Distance metric is defined at collection creation, not per-query
vector_search = {
    ^"vector" ~ ^"NEAR" ~ vector_value
}

// Multi-vector fusion search: vector NEAR_FUSED [v1, v2, ...] USING FUSION 'strategy' (params)
// The trailing USING FUSION part is optional.
vector_fused_search = {
    ^"vector" ~ ^"NEAR_FUSED" ~ vector_array ~ fusion_clause?
}
// Bracketed list holding at least one vector value.
vector_array = {
    "[" ~ vector_value ~ ("," ~ vector_value)* ~ "]"
}
fusion_clause = {
    ^"USING" ~ ^"FUSION" ~ fusion_strategy ~ fusion_params?
}
// Strategy name, written as a quoted string.
fusion_strategy = {
    string
}
fusion_params = {
    "(" ~ fusion_param_list ~ ")"
}
fusion_param_list = {
    fusion_param ~ ("," ~ fusion_param)*
}
fusion_param = {
    identifier ~ "=" ~ fusion_param_value
}
// Numeric only (`float` tried before `integer`).
fusion_param_value = {
    float | integer
}

// A vector is an inline literal or a bind parameter.
vector_value = {
    vector_literal | parameter
}
// Every component must be a `float`, i.e. written with a decimal point.
vector_literal = {
    "[" ~ float ~ ("," ~ float)* ~ "]"
}

// Full-text search: column MATCH 'query'
match_expr = {
    where_column ~ ^"MATCH" ~ string
}

// IN expression: column IN (value, ...) with at least one value
in_expr = {
    where_column ~ ^"IN" ~ "(" ~ value_list ~ ")"
}
value_list = {
    value ~ ("," ~ value)*
}

// BETWEEN expression: column BETWEEN value AND value
between_expr = {
    where_column ~ ^"BETWEEN" ~ value ~ ^"AND" ~ value
}

// LIKE / ILIKE expression: column LIKE 'pattern' or column ILIKE 'pattern'
like_expr = {
    where_column ~ like_op ~ string
}
like_op = {
    ^"ILIKE" | ^"LIKE"
}

// IS NULL / IS NOT NULL
is_null_expr = {
    where_column ~ ^"IS" ~ not_kw? ~ ^"NULL"
}
// Named rule, so the presence of NOT shows up as its own pair.
not_kw = {
    ^"NOT"
}

// Comparison: column op value
compare_expr = {
    where_column ~ compare_op ~ value
}
// Two-character operators come first so ">=" is not read as ">" followed by "=".
compare_op = {
    ">=" |
    "<=" |
    "<>" |
    "!=" |
    "=" |
    ">" |
    "<"
}

// LIMIT and OFFSET, each followed by one `integer` literal.
// NOTE(review): `integer` permits a leading "-", so the grammar itself accepts
// negative counts — presumably rejected later; confirm against the consumer.
limit_clause = {
    ^"LIMIT" ~ integer
}
offset_clause = {
    ^"OFFSET" ~ integer
}

// Values - EPIC-038: Temporal, EPIC-039: Subquery
// Structured forms (subquery, temporal) are tried before plain literals, and
// `float` before `integer`.
value = {
    subquery_expr |
    temporal_expr |
    float |
    integer |
    string |
    boolean |
    null_value |
    parameter
}

// Scalar subquery expression (EPIC-039)
// A parenthesised, reduced SELECT: no alias, JOIN, ORDER BY, OFFSET, WITH or
// USING FUSION.
subquery_expr = {
    "(" ~
    ^"SELECT" ~ select_list ~
    ^"FROM" ~ identifier ~
    where_clause? ~
    group_by_clause? ~
    having_clause? ~
    limit_clause? ~
    ")"
}
// Bind parameter: "$" immediately followed by an identifier (atomic).
parameter = @{ "$" ~ identifier }
// NULL literal (case-insensitive).
null_value = { ^"NULL" }
// TRUE / FALSE literal (case-insensitive).
// Atomic, with a trailing word-boundary check. Where `boolean` is tried before
// `identifier` (as in `with_value`), a name such as `true_x` was consumed as
// `true`, after which the enclosing rule failed on the leftover `_x` instead of
// falling through to `identifier`. The lookahead consumes nothing, so the
// emitted pair still spans exactly the keyword.
boolean = @{ (^"TRUE" | ^"FALSE") ~ !(ASCII_ALPHANUMERIC | "_") }

// Temporal expressions (EPIC-038)
// Arithmetic is tried first so `NOW() - INTERVAL '...'` is not cut short at
// `NOW()`.
temporal_expr = {
    temporal_arithmetic |
    now_function |
    interval_expr
}
// Exactly one binary operation between two temporal operands.
temporal_arithmetic = {
    (now_function | interval_expr) ~
    temporal_op ~
    (now_function | interval_expr)
}
temporal_op = {
    "+" | "-"
}
// NOW()
now_function = {
    ^"NOW" ~ "(" ~ ")"
}
// INTERVAL '<text>'; the string contents are not validated by the grammar.
interval_expr = {
    ^"INTERVAL" ~ string
}

// Literals (all atomic: no implicit whitespace inside a token)
// Single-quoted string; there is no escape for an embedded single quote.
string = @{
    "'" ~ (!"'" ~ ANY)* ~ "'"
}
// Optional sign followed by one or more digits.
integer = @{
    "-"? ~ ASCII_DIGIT+
}
// Optional sign, digits, ".", digits; digits are required on both sides of the
// point and there is no exponent form.
float = @{
    "-"? ~ ASCII_DIGIT+ ~ "." ~ ASCII_DIGIT+
}

// Identifiers - EPIC-044 US-005: Support quoted identifiers for reserved keywords
// Accepted forms: regular identifiers, backtick-quoted (`select`), double-quoted ("from")
// The quoted forms are tried first.
identifier = {
    quoted_identifier | regular_identifier
}
// Letter or "_" first, then letters, digits or "_".
regular_identifier = @{
    (ASCII_ALPHA | "_") ~ (ASCII_ALPHANUMERIC | "_")*
}

// Quoted identifiers for escaping reserved keywords
// Backtick style: `select`, `from`, `order`
// The content must be non-empty and cannot contain a backtick.
backtick_identifier = @{
    "`" ~ backtick_inner ~ "`"
}
backtick_inner = @{
    (!"`" ~ ANY)+
}

// Double-quote style (SQL standard): "select", "from", "order"
// Supports escaped quotes: "col""name" -> col"name
// The content may be empty.
doublequote_identifier = @{
    "\"" ~ doublequote_inner ~ "\""
}
doublequote_inner = @{
    (doublequote_escape | !("\"") ~ ANY)*
}
// A doubled quote stands for one literal quote character.
doublequote_escape = @{
    "\"\""
}

quoted_identifier = {
    backtick_identifier | doublequote_identifier
}