hermes-core 1.4.20

// Hermes Schema Definition Language (SDL) Grammar

// Grammar root: the whole input, from start to end, must consist of at least
// one index definition.
file = {
    SOI ~ index_def+ ~ EOI
}

// An index block: the `index` keyword, the index name, and a brace-delimited
// body holding any mix of field, default-fields, and query-router entries.
// The body may be empty.
index_def = {
    "index" ~ identifier ~ "{"
    ~ (field_def | default_fields_def | query_router_def)*
    ~ "}"
}

// Default fields: a bracketed, comma-separated list of one or more identifiers.
// Example:
//   default_fields: [title, body]
default_fields_def = {
    "default_fields" ~ ":"
    ~ "[" ~ identifier ~ ("," ~ identifier)* ~ "]"
}

// Query router: routes queries to a specific field based on a regex.
// The block must contain at least one property; the grammar does not limit
// how often, or in which order, properties appear.
// Example:
//   query_router {
//       pattern: r"10\.\d{4,}/[^\s]+"
//       substitution: "doi://{0}"
//       target_field: uris
//       mode: exclusive
//   }
query_router_def = {
    "query_router" ~ "{"
    ~ query_router_prop+
    ~ "}"
}

// A single property line inside a query_router block.
query_router_prop = {
    query_router_pattern
    | query_router_substitution
    | query_router_target
    | query_router_mode
}

// pattern: <regex string, raw or quoted>
query_router_pattern = {
    "pattern" ~ ":" ~ regex_string
}

// substitution: <quoted string>
query_router_substitution = {
    "substitution" ~ ":" ~ quoted_string
}

// target_field: <field identifier>
query_router_target = {
    "target_field" ~ ":" ~ identifier
}

// mode: <routing mode keyword>
query_router_mode = {
    "mode" ~ ":" ~ routing_mode
}

// Routing mode keyword:
//   exclusive  - only the target field
//   additional - the target field plus the default fields
routing_mode = {
    "exclusive" | "additional"
}

// Pattern literal: either a raw string (r"...") or an ordinary quoted string.
regex_string = {
    raw_string | quoted_string
}

// Raw string: r"..." whose body runs up to the first double quote. A backslash
// has no special meaning here, so the body cannot contain a double quote.
raw_string = @{
    "r\"" ~ raw_string_inner ~ "\""
}
raw_string_inner = @{
    (!"\"" ~ ANY)*
}

// Quoted string: "..." whose body is any mix of plain characters (anything
// except a double quote or a backslash) and backslash-plus-any-character pairs.
quoted_string = @{
    "\"" ~ string_inner ~ "\""
}
string_inner = @{
    ((!("\"" | "\\") ~ ANY) | ("\\" ~ ANY))*
}

// Field definition: `field <name>: <type>`, then at most one angle-bracket
// block (sparse-vector index size, dense-vector dimension, or tokenizer name),
// then an optional attribute list. The grammar itself does not tie the kind of
// angle-bracket block to the field type.
// Examples:
//   field title: text [indexed, stored]
//   field body: text<en_stem> [indexed]
//   field name: text<default> [indexed, stored]
//   field embedding: dense_vector<768> [indexed<rabitq>]
//   field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>]
//   field embedding: dense_vector<768> [indexed<scann, num_clusters: 1024, nprobe: 64, mrl_dim: 256>]
//   field embedding: dense_vector<768> [indexed<flat>]  # brute-force, no ANN index
field_def = {
    "field" ~ identifier ~ ":" ~ field_type
    ~ (sparse_vector_config | dense_vector_config | tokenizer_spec)?
    ~ attributes?
}

// Field types, one line per type with its accepted aliases.
// NOTE: alternatives are tried in order and the first match wins, so whenever
// one spelling is a prefix of another, the longer one must be listed first
// ("string" before "str", "integer" before "int"). Otherwise the shorter
// spelling matches, the leftover characters break the rest of the field
// definition, and the longer alias can never be parsed.
field_type = {
    "text" | "string" | "str" |
    "u64" | "uint" | "unsigned" |
    "i64" | "integer" | "int" |
    "f64" | "float" | "double" |
    "bytes" | "binary" | "blob" |
    "json" |
    "sparse_vector" |
    "dense_vector" | "vector"
}

// Sparse vector configuration: the angle brackets carry only the positional
// index size. Quantization and weight_threshold are written inside the
// indexed<...> attribute instead.
// Examples:
//   field embedding: sparse_vector [indexed, stored]
//   field embedding: sparse_vector<u16> [indexed]
//   field embedding: sparse_vector<u32> [indexed<quantization: uint8, weight_threshold: 0.1>]
sparse_vector_config = {
    "<" ~ index_size_spec ~ ">"
}

// Index size keyword.
index_size_spec = {
    "u16" | "u32"
}

// Quantization keyword: long and short spellings of the same four names.
quantization_spec = {
    "float32" | "float16" | "uint8" | "uint4"
    | "f32" | "f16" | "u8" | "u4"
}

// Weight threshold: unsigned decimal number with an optional fractional part
// (digits are required on both sides of the dot when the dot is present).
weight_threshold_spec = @{
    ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT+)?
}

// Dense vector configuration: the angle brackets carry only the dimension.
// Every index-related parameter (index type, num_clusters, nprobe, mrl_dim,
// build_threshold) is written inside indexed<...>.
// Examples:
//   field embedding: dense_vector<768> [indexed]                           # default RaBitQ
//   field embedding: dense_vector<768> [indexed<rabitq>]                   # explicit RaBitQ
//   field embedding: dense_vector<768> [indexed<ivf_rabitq>]               # IVF-RaBitQ (auto clusters)
//   field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>]
//   field embedding: dense_vector<768> [indexed<scann, num_clusters: 1024, nprobe: 64>]
//   field embedding: dense_vector<768> [indexed<flat>]                     # brute-force only
//   field embedding: dense_vector<768> [stored]                            # not indexed, just stored
dense_vector_config = {
    "<" ~ dense_vector_params ~ ">"
}

// The dimension may be given by keyword (`dims: 768`) or positionally (`768`).
dense_vector_params = {
    dense_vector_keyword_params
    | dense_vector_positional_params
}

// Keyword form: only `dims` exists (index params live in indexed<...>).
dense_vector_keyword_params = {
    dims_kwarg
}
dims_kwarg = {
    "dims" ~ ":" ~ dimension_spec
}

// Positional form: the bare dimension.
dense_vector_positional_params = {
    dimension_spec
}

// Dimension: one or more ASCII digits.
dimension_spec = @{
    ASCII_DIGIT+
}

// Tokenizer specification: a tokenizer name in angle brackets, e.g. <en_stem>.
tokenizer_spec = {
    "<" ~ identifier ~ ">"
}

// Attribute list: one or more comma-separated attributes in square brackets,
// e.g. [indexed, stored].
//   indexed may carry an index config: indexed<positions>, indexed<rabitq, build_threshold: 5000>
//   stored may carry the multi flag:   stored<multi> (for array fields)
attributes = {
    "[" ~ attribute ~ ("," ~ attribute)* ~ "]"
}

// The configured form of each attribute is tried before the bare keyword.
attribute = {
    indexed_with_config
    | "indexed"
    | stored_with_config
    | "stored"
}

// `stored` with the multi flag. The bare `stored` keyword is handled by the
// `attribute` rule.
// Examples:
//   stored
//   stored<multi>  # for array fields
stored_with_config = {
    "stored" ~ "<" ~ "multi" ~ ">"
}

// `indexed` with an index configuration in angle brackets. The bare `indexed`
// keyword is handled by the `attribute` rule.
// Examples:
//   indexed
//   indexed<positions>  # for phrase queries
//   indexed<flat>       # brute-force search (no ANN index)
//   indexed<rabitq>     # RaBitQ binary quantization
//   indexed<ivf_rabitq> # IVF-RaBitQ (centroids trained from data)
//   indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>
//   indexed<scann, num_clusters: 1024, nprobe: 64, mrl_dim: 256>
//   indexed<rabitq, build_threshold: 5000>  # build ANN after 5000 vectors
//   indexed<quantization: uint8, weight_threshold: 0.1>  # for sparse vectors
//   indexed<positions, quantization: uint8>  # sparse with positions
//   indexed<quantization: uint8, query<tokenizer: "model/name", weighting: idf>>  # with query config
indexed_with_config = {
    "indexed" ~ "<" ~ index_config_params ~ ">"
}

// One or more comma-separated parameters.
index_config_params = {
    index_config_param ~ ("," ~ index_config_param)*
}

// A single parameter: any of the `key: value` forms, a nested query<...>
// block, a bare position-tracking keyword, or a bare index type.
index_config_param = {
    index_type_kwarg
    | num_clusters_kwarg
    | nprobe_kwarg
    | mrl_dim_kwarg
    | build_threshold_kwarg
    | quantization_kwarg
    | weight_threshold_kwarg
    | query_config_block
    | positions_kwarg
    | index_type_spec
}
// Position tracking modes (written as a bare keyword):
//   positions      - full tracking (element ordinal + token position)
//   ordinal        - only element ordinal for multi-valued fields
//   token_position - only token position within text for phrase queries
positions_kwarg = {
    "positions" | "ordinal" | "token_position"
}

// Index type as `index: <type>`, and the set of index type keywords.
index_type_kwarg = {
    "index" ~ ":" ~ index_type_spec
}
index_type_spec = {
    "flat" | "scann" | "ivf_rabitq" | "rabitq"
}

// Numeric parameters: each value is one or more ASCII digits.
num_clusters_kwarg = {
    "num_clusters" ~ ":" ~ num_clusters_spec
}
num_clusters_spec = @{
    ASCII_DIGIT+
}

nprobe_kwarg = {
    "nprobe" ~ ":" ~ nprobe_spec
}
nprobe_spec = @{
    ASCII_DIGIT+
}

mrl_dim_kwarg = {
    "mrl_dim" ~ ":" ~ mrl_dim_spec
}
mrl_dim_spec = @{
    ASCII_DIGIT+
}

build_threshold_kwarg = {
    "build_threshold" ~ ":" ~ build_threshold_spec
}
build_threshold_spec = @{
    ASCII_DIGIT+
}

// Sparse-vector parameters.
quantization_kwarg = {
    "quantization" ~ ":" ~ quantization_spec
}
weight_threshold_kwarg = {
    "weight_threshold" ~ ":" ~ weight_threshold_spec
}

// Body of a quoted path (referenced by tokenizer_path): everything up to, but
// not including, the next double quote. May be empty.
path_chars = @{
    (!"\"" ~ ANY)*
}

// Query-time configuration for sparse vectors, nested inside indexed<...>.
// Example: query<tokenizer: "model/name", weighting: idf>
query_config_block = {
    "query" ~ "<" ~ query_config_params ~ ">"
}

// One or more comma-separated query parameters.
query_config_params = {
    query_config_param ~ ("," ~ query_config_param)*
}
query_config_param = {
    query_tokenizer_kwarg | query_weighting_kwarg
}

// tokenizer: "<path>"
query_tokenizer_kwarg = {
    "tokenizer" ~ ":" ~ tokenizer_path
}

// weighting: <weighting keyword>
query_weighting_kwarg = {
    "weighting" ~ ":" ~ weighting_spec
}
// Double-quoted tokenizer path. Compound-atomic (`$`) so that no implicit
// WHITESPACE/COMMENT skipping happens after the opening quote: as a normal
// rule, leading whitespace inside the quotes was silently dropped and a path
// starting with "#" was consumed as a comment. path_chars is still emitted as
// an inner pair.
tokenizer_path = ${ "\"" ~ path_chars ~ "\"" }
// Query weighting keyword.
weighting_spec = {
    "one" | "idf"
}

// Identifier (field, index, and tokenizer names): an ASCII letter or
// underscore, followed by any number of ASCII letters, digits, or underscores.
identifier = @{
    (ASCII_ALPHA | "_")
    ~ (ASCII_ALPHANUMERIC | "_")*
}

// Implicitly skipped input: spaces, tabs, carriage returns, and newlines.
WHITESPACE = _{
    " " | "\t" | "\r" | "\n"
}

// Line comment in schema files: from "#" up to, but not including, the newline.
COMMENT = _{
    "#" ~ (!"\n" ~ ANY)*
}