laurus 0.9.0

Unified search library for lexical, vector, and semantic retrieval
Documentation
// Query Grammar for Pest Parser
// This grammar supports the full query syntax

// Implicit whitespace: space, tab, line feed and carriage return.
// Declared silent (`_`), so it never shows up in the parse tree.
WHITESPACE = _{
    " "
  | "\t"
  | "\n"
  | "\r"
}

// Entry rule: a single boolean query that must span the entire input
// (anchored by the built-in SOI / EOI markers).
query = {
    SOI ~ boolean_query ~ EOI
}

// One clause followed by any number of further clauses, each optionally
// introduced by an explicit AND / OR. Clauses that simply follow one another
// carry no operator in the parse tree; how they are combined is decided by
// whatever consumes the tree, not by this grammar.
//
// The operator-less form is kept as its own alternative instead of being
// folded into `boolean_op? ~ clause`: when text spelled like an operator is
// not followed by a clause (a trailing word `and`, or a field named `or` in
// `or:x`), the choice backtracks and parses that text as an ordinary clause.
boolean_query = {
    clause ~ (
        boolean_op ~ clause
      | clause
    )*
}

// A clause is a sub-clause with an optional occurrence prefix:
//   +x   required
//   -x   prohibited
//    x   no prefix
clause = {
    required_clause
  | prohibited_clause
  | sub_clause
}

required_clause   = { "+" ~ sub_clause }
prohibited_clause = { "-" ~ sub_clause }

// Parenthesised group, `field:value`, or a bare value -- tried in that order.
sub_clause = {
    grouped_query
  | field_query
  | term_query
}

// Parenthesised sub-query with an optional trailing boost, e.g. (a OR b)^2
grouped_query = {
    "(" ~ boolean_query ~ ")" ~ boost?
}

// Explicit boolean operator, matched case-insensitively.
//
// The rule is atomic and ends with a negative lookahead so the keyword is
// only recognised when it is not the prefix of a longer token. Without that
// guard `juice orange` is read as `juice OR ange` and `foo android` as
// `foo AND roid`. The lookahead lists the characters that can continue a
// term or wildcard pattern: letters, numbers, `_`, `-`, a backslash escape,
// `?` and `*`. A lookahead consumes nothing, so the text matched by this rule
// is still exactly the keyword.
boolean_op = @{ (^"AND" | ^"OR") ~ !(LETTER | NUMBER | "_" | "-" | "\\" | "?" | "*") }

// Field-scoped query: <field>:<value>
field_query = {
    field ~ ":" ~ field_value
}

// Every value form the grammar understands. PEG choice is ordered, so the
// forms with distinctive syntax come first and `simple_term` is the fallback:
// tried earlier, it would consume only the leading term of a fuzzy query
// (`roam~2`) or of a wildcard pattern (`te?t`).
field_value = {
    geo3d_query
  | geo_query
  | range_query
  | phrase_query
  | fuzzy_term
  | wildcard_term
  | simple_term
}

// 3D geo queries over ECEF Cartesian coordinates (meters):
//   field:geo3d_distance(x, y, z, distance_m)
//   field:geo3d_bbox(min_x, min_y, min_z, max_x, max_y, max_z)
//   field:geo3d_nearest(x, y, z, k)
// Function names are case-insensitive. Every argument except `k` is a
// `signed_float`, so a negative distance is syntactically valid here; any
// range validation has to happen outside the grammar.
geo3d_query = {
    geo3d_distance
  | geo3d_bbox
  | geo3d_nearest
}

geo3d_distance = {
    ^"geo3d_distance" ~ "("
        ~ signed_float ~ ","    // x
        ~ signed_float ~ ","    // y
        ~ signed_float ~ ","    // z
        ~ signed_float          // distance_m
    ~ ")"
}

geo3d_bbox = {
    ^"geo3d_bbox" ~ "("
        ~ signed_float ~ ","    // min_x
        ~ signed_float ~ ","    // min_y
        ~ signed_float ~ ","    // min_z
        ~ signed_float ~ ","    // max_x
        ~ signed_float ~ ","    // max_y
        ~ signed_float          // max_z
    ~ ")"
}

geo3d_nearest = {
    ^"geo3d_nearest" ~ "("
        ~ signed_float ~ ","    // x
        ~ signed_float ~ ","    // y
        ~ signed_float ~ ","    // z
        ~ unsigned_int          // k
    ~ ")"
}

// 2D geo queries (latitude / longitude in degrees, distance in km):
//   field:geo_distance(lat, lon, distance_km)
//   field:geo_bbox(min_lat, min_lon, max_lat, max_lon)
// Function names are case-insensitive; all arguments are `signed_float`.
geo_query = {
    geo_distance
  | geo_bbox
}

geo_distance = {
    ^"geo_distance" ~ "("
        ~ signed_float ~ ","    // lat
        ~ signed_float ~ ","    // lon
        ~ signed_float          // distance_km
    ~ ")"
}

geo_bbox = {
    ^"geo_bbox" ~ "("
        ~ signed_float ~ ","    // min_lat
        ~ signed_float ~ ","    // min_lon
        ~ signed_float ~ ","    // max_lat
        ~ signed_float          // max_lon
    ~ ")"
}

// Decimal number used for geo arguments. Atomic: no whitespace inside.
// Accepts 12, -3.5, 1e6, 6.3E-2; a leading `+` or a bare `.5` is not accepted.
signed_float = @{
    "-"?                                       // optional sign
    ~ ASCII_DIGIT+                             // integer part (required)
    ~ ("." ~ ASCII_DIGIT+)?                    // optional fraction
    ~ (^"e" ~ ("+" | "-")? ~ ASCII_DIGIT+)?    // optional exponent, e or E
}

// Non-negative integer (one or more ASCII digits).
unsigned_int = @{
    ASCII_DIGIT+
}

// Range queries: [100 TO 500] (square brackets) or {A TO Z} (curly braces).
// The `TO` separator is matched case-insensitively and both bounds are
// mandatory. The opening and closing bracket must be of the same kind.
range_query = { range_inclusive | range_exclusive }
range_inclusive = { "[" ~ range_value ~ ^"TO" ~ range_value ~ "]" }
range_exclusive = { "{" ~ range_value ~ ^"TO" ~ range_value ~ "}" }

// A bound is `*`, a double-quoted string, or a bare value.
// NOTE(review): `*` presumably denotes an open-ended bound -- confirm against
// the code that consumes the parse tree.
range_value = { "*" | quoted_string | unquoted_value }

// Bare bound: every character up to whitespace or a closing bracket.
// There is deliberately no `!"TO"` guard in the loop. Such a guard is tested
// before every character, so it cuts off any bound that merely contains the
// upper-case letters TO (`TOKYO`, `AUTO`, `PHOTO`) and makes a query such as
// [AUTO TO ZOO] unparsable. Whitespace already terminates the bound, which
// means the separator has to be whitespace-delimited: [1 TO 5], not [1TO5].
unquoted_value = @{ (!WHITESPACE ~ !"]" ~ !"}" ~ ANY)+ }

// Phrase queries: "hello world", optionally followed by a proximity and/or a
// boost suffix (in that order): "hello world"~10, "hello world"~10^2
phrase_query = {
    "\"" ~ phrase_content ~ "\"" ~ proximity? ~ boost?
}

// Raw text up to the next double quote (may be empty). There is no escape
// sequence for an embedded quote. Atomic, so whitespace between words is part
// of the matched text.
phrase_content = @{
    (!"\"" ~ ANY)*
}

// Proximity suffix: `~` followed by an unsigned integer.
proximity = {
    "~" ~ number
}

// Fuzzy queries: term~ or term~2, optionally boosted (term~2^3).
fuzzy_term = {
    term ~ "~" ~ fuzziness? ~ boost?
}

// Optional unsigned integer after the `~`.
// NOTE(review): presumably the maximum edit distance -- confirm against the
// code that consumes the parse tree.
fuzziness = {
    number
}

// Wildcard queries: te?t or test*, optionally boosted.
// Literal characters come from the Unicode `LETTER` / `NUMBER` general
// categories plus `_`, so CJK / Cyrillic / Greek patterns work alongside ASCII.
// Unlike plain terms, `-` is not accepted inside a wildcard pattern.
wildcard_term = {
    wildcard_pattern ~ boost?
}

// Optional literal prefix, then at least one wildcard, then any mix of
// literals and further wildcards. Atomic: no whitespace inside a pattern.
wildcard_pattern = @{
    (LETTER | NUMBER | "_")*
    ~ wildcard_char
    ~ (LETTER | NUMBER | "_" | wildcard_char)*
}

// The two wildcard metacharacters.
wildcard_char = {
    "?"
  | "*"
}

// Plain term with an optional boost suffix, e.g. hello or hello^2
simple_term = {
    term ~ boost?
}

// Value written without a `field:` prefix. It accepts exactly what
// `field_value` accepts, so ranges, phrases, geo functions etc. are all
// syntactically valid without a field name.
term_query = {
    field_value
}

// Boost suffix: ^4 or ^0.5
boost = {
    "^" ~ boost_value
}

// Digits with an optional fractional part. No sign and no exponent; the
// integer part is required, so `^.5` is not accepted. Atomic.
boost_value = @{
    number ~ ("." ~ number)?
}

// Field name. ASCII-only on purpose, so field identifiers stay portable.
// First character: ASCII letter or `_`. Following characters: ASCII letters,
// ASCII digits, `_` or `.` (so dotted names such as `a.b` are one field).
field = @{
    (ASCII_ALPHA | "_")
    ~ (ASCII_ALPHANUMERIC | "_" | ".")*
}

// Term (word)
// `LETTER | NUMBER` is Unicode-aware (Letter / Number general categories),
// so CJK / Cyrillic / Greek / accented Latin etc. all parse as bare
// terms without requiring quoting. Recognition of the AND / OR keywords is
// the job of `boolean_op`; nothing in the rules below excludes those words.
//
// NOTE(review): every input `unescaped_term` accepts is also accepted by
// `escaped_char+` (because `normal_char` covers the same character set), and
// PEG choice is ordered, so the second alternative of `term` is never reached.
term = @{ escaped_char+ | unescaped_term }
// One or more letters, numbers, `_` or `-`.
unescaped_term = @{ (LETTER | NUMBER | "_" | "-")+ }
// `~` binds tighter than `|`: this is (backslash followed by a special
// character) OR a single normal character.
escaped_char = @{ "\\" ~ special_char | normal_char }
// Characters that may follow a backslash inside a term.
special_char = { "+" | "-" | "!" | "(" | ")" | "{" | "}" | "[" | "]" | "^" | "\"" | "~" | "*" | "?" | ":" | "\\" | "/" }
// A single unescaped term character.
normal_char = @{ LETTER | NUMBER | "_" | "-" }

// Double-quoted string (used for range bounds).
quoted_string = {
    "\"" ~ quoted_content ~ "\""
}

// Raw text up to the next double quote (may be empty); there is no escape
// sequence for an embedded quote. Atomic.
quoted_content = @{
    (!"\"" ~ ANY)*
}

// Unsigned integer literal: one or more ASCII digits. Atomic.
number = @{
    ASCII_DIGIT+
}