// Hermes Schema Definition Language (SDL) Grammar
// Grammar root: the entire input must be one or more index definitions,
// anchored at the start (SOI) and end (EOI) of the input.
file = {
    SOI ~
    index_def+ ~
    EOI
}
// Index block: `index <name> { ... }`.
// The body may hold any number of field, default_fields and query_router
// entries, in any order; an empty body is accepted.
index_def = {
    "index" ~ identifier ~
    "{" ~
    (
        field_def |
        default_fields_def |
        query_router_def
    )* ~
    "}"
}
// Default fields list: `default_fields: [a, b, c]`.
// At least one identifier is required and a trailing comma is not accepted.
default_fields_def = {
    "default_fields" ~ ":" ~
    "[" ~ identifier ~ ("," ~ identifier)* ~ "]"
}
// Query router block: routes queries to a specific field based on a regex.
// Example:
//   query_router {
//       pattern: r"10\.\d{4,}/[^\s]+"
//       substitution: "doi://{0}"
//       target_field: uris
//       mode: exclusive
//   }
// The block must contain at least one property. The grammar itself does not
// constrain the order of properties or forbid repeating one.
query_router_def = {
    "query_router" ~
    "{" ~ query_router_prop+ ~ "}"
}
// One `key: value` property inside a query_router block.
query_router_prop = {
    query_router_pattern | query_router_substitution |
    query_router_target | query_router_mode
}

// pattern: the regex, written as a raw string (r"...") or a quoted string
query_router_pattern = { "pattern" ~ ":" ~ regex_string }

// substitution: a quoted string, e.g. "doi://{0}"
query_router_substitution = { "substitution" ~ ":" ~ quoted_string }

// target_field: name of the field the query is routed to
query_router_target = { "target_field" ~ ":" ~ identifier }

// mode: one of the routing modes below
query_router_mode = { "mode" ~ ":" ~ routing_mode }

// Routing modes:
//   exclusive  - only the target field
//   additional - the target field plus the default fields
routing_mode = { "exclusive" | "additional" }
// Regex literal: either a raw string r"..." or an ordinary quoted string "...".
// The raw form is tried first.
regex_string = { raw_string | quoted_string }

// Raw string: r"...". A backslash has no special meaning here; the body runs
// up to the first double quote, so it cannot contain one.
raw_string = @{ "r\"" ~ raw_string_inner ~ "\"" }
raw_string_inner = @{ (!"\"" ~ ANY)* }

// Quoted string: "...". A backslash always consumes the character after it,
// so \" does not terminate the string.
quoted_string = @{ "\"" ~ string_inner ~ "\"" }
string_inner = @{
    (
        !("\"" | "\\") ~ ANY |
        "\\" ~ ANY
    )*
}
// Field declaration: `field <name>: <type>`, then an optional `<...>` parameter
// (sparse index size, dense dimension, or tokenizer name) and an optional
// attribute list. The grammar does not tie the kind of `<...>` parameter to
// the field type; the three forms are simply tried in the order listed.
// Examples:
//   field title: text [indexed, stored]
//   field body: text<en_stem> [indexed]
//   field name: text<default> [indexed, stored]
//   field embedding: dense_vector<768> [indexed<rabitq>]
//   field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>]
//   field embedding: dense_vector<768> [indexed<scann, num_clusters: 1024, nprobe: 64, mrl_dim: 256>]
//   field embedding: dense_vector<768> [indexed<flat>] # brute-force, no ANN index
field_def = {
    "field" ~ identifier ~ ":" ~ field_type ~
    (
        sparse_vector_config |
        dense_vector_config |
        tokenizer_spec
    )? ~
    attributes?
}
// Field types. Each line lists the accepted spellings of one type family.
//
// NOTE: `|` is an ordered choice that commits to the first alternative that
// matches, and string literals carry no word-boundary check. A literal that is
// a prefix of another must therefore come AFTER the longer one ("string"
// before "str", "integer" before "int"). Otherwise the short literal matches
// first and the leftover characters make the enclosing field_def fail, which
// is why "integer" previously could never be parsed.
field_type = {
    "text" | "string" | "str" |
    "u64" | "uint" | "unsigned" |
    "i64" | "integer" | "int" |
    "f64" | "float" | "double" |
    "bytes" | "binary" | "blob" |
    "json" |
    "sparse_vector" |
    "dense_vector" | "vector"
}
// Sparse vector type parameter: only the index size, given positionally.
// Quantization and weight_threshold belong in the indexed<...> attribute.
// Examples:
//   field embedding: sparse_vector [indexed, stored]
//   field embedding: sparse_vector<u16> [indexed]
//   field embedding: sparse_vector<u32> [indexed<quantization: uint8, weight_threshold: 0.1>]
sparse_vector_config = {
    "<" ~ index_size_spec ~ ">"
}

// Accepted index sizes.
index_size_spec = { "u16" | "u32" }

// Accepted quantization names: long spellings first, then short aliases.
quantization_spec = {
    "float32" | "float16" | "uint8" | "uint4" |
    "f32" | "f16" | "u8" | "u4"
}

// Unsigned decimal number with an optional fractional part, e.g. 0.1 or 2.
// A leading "." (as in .5) and a sign are not accepted.
weight_threshold_spec = @{ ASCII_DIGIT+ ~ ("." ~ ASCII_DIGIT+)? }
// Dense vector type parameter: only the dimension.
// All index-related parameters (index type, num_clusters, nprobe, mrl_dim,
// build_threshold) go in indexed<...>.
// Examples:
//   field embedding: dense_vector<768> [indexed] # default RaBitQ
//   field embedding: dense_vector<768> [indexed<rabitq>] # explicit RaBitQ
//   field embedding: dense_vector<768> [indexed<ivf_rabitq>] # IVF-RaBitQ (auto clusters)
//   field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>]
//   field embedding: dense_vector<768> [indexed<scann, num_clusters: 1024, nprobe: 64>]
//   field embedding: dense_vector<768> [indexed<flat>] # brute-force only
//   field embedding: dense_vector<768> [stored] # not indexed, just stored
dense_vector_config = {
    "<" ~ dense_vector_params ~ ">"
}

// The dimension may be given by keyword (`dims: 768`) or positionally (`768`).
dense_vector_params = {
    dense_vector_keyword_params |
    dense_vector_positional_params
}

// Keyword form: only `dims` (index parameters live in indexed<...>).
dense_vector_keyword_params = { dims_kwarg }
dims_kwarg = { "dims" ~ ":" ~ dimension_spec }

// Positional form: the bare dimension.
dense_vector_positional_params = { dimension_spec }

// Dimension: one or more ASCII digits.
dimension_spec = @{ ASCII_DIGIT+ }
// Tokenizer parameter on a field type: `<tokenizer_name>`, e.g. text<en_stem>.
tokenizer_spec = {
    "<" ~ identifier ~ ">"
}
// Attribute list in square brackets, e.g. [indexed, stored].
// At least one attribute is required; attributes are comma-separated.
//   indexed may carry an index config: indexed<positions> or
//   indexed<rabitq, build_threshold: 5000>
//   stored may carry the multi flag: stored<multi> for array fields
attributes = {
    "[" ~
    attribute ~ ("," ~ attribute)* ~
    "]"
}

// The parameterised forms are listed before the bare keywords so that
// `indexed<...>` / `stored<multi>` are not cut short at the keyword.
attribute = {
    indexed_with_config | "indexed" |
    stored_with_config | "stored"
}
// Stored attribute carrying the multi flag. The bare `stored` keyword is
// handled directly by `attribute`.
// Examples:
//   stored
//   stored<multi> # for array fields
stored_with_config = {
    "stored" ~
    "<" ~ "multi" ~ ">"
}
// Indexed attribute with an index configuration. The bare `indexed` keyword is
// handled directly by `attribute`.
// Examples:
//   indexed
//   indexed<positions> # for phrase queries
//   indexed<flat> # brute-force search (no ANN index)
//   indexed<rabitq> # RaBitQ binary quantization
//   indexed<ivf_rabitq> # IVF-RaBitQ (centroids trained from data)
//   indexed<ivf_rabitq, num_clusters: 256, nprobe: 32>
//   indexed<scann, num_clusters: 1024, nprobe: 64, mrl_dim: 256>
//   indexed<rabitq, build_threshold: 5000> # build ANN after 5000 vectors
//   indexed<quantization: uint8, weight_threshold: 0.1> # for sparse vectors
//   indexed<positions, quantization: uint8> # sparse with positions
//   indexed<quantization: uint8, query<tokenizer: "model/name", weighting: idf>> # with query config
indexed_with_config = {
    "indexed" ~
    "<" ~ index_config_params ~ ">"
}

// Comma-separated, non-empty parameter list. The grammar accepts the
// parameters in any order and in any combination.
index_config_params = {
    index_config_param ~ ("," ~ index_config_param)*
}

// One parameter: the `name: value` forms and the query<...> block are tried
// first, then the bare position-mode and index-type words.
index_config_param = {
    index_type_kwarg |
    num_clusters_kwarg |
    nprobe_kwarg |
    mrl_dim_kwarg |
    build_threshold_kwarg |
    quantization_kwarg |
    weight_threshold_kwarg |
    query_config_block |
    positions_kwarg |
    index_type_spec
}
// Position tracking mode, written as a bare word inside indexed<...>:
//   positions      - full tracking (element ordinal + token position)
//   ordinal        - only the element ordinal, for multi-valued fields
//   token_position - only the token position within the text, for phrase queries
positions_kwarg = {
    "positions" |
    "ordinal" |
    "token_position"
}
// `name: value` parameters accepted inside indexed<...>.
// Integer values are plain runs of ASCII digits (no sign, no separators).

// mrl_dim: <digits>
mrl_dim_kwarg = { "mrl_dim" ~ ":" ~ mrl_dim_spec }
mrl_dim_spec = @{ ASCII_DIGIT+ }

// index: <type> - keyword spelling of the index type, e.g. index: scann
index_type_kwarg = { "index" ~ ":" ~ index_type_spec }
// Index type names; also accepted as a bare word, e.g. indexed<flat>.
index_type_spec = { "flat" | "scann" | "ivf_rabitq" | "rabitq" }

// num_clusters: <digits>
num_clusters_kwarg = { "num_clusters" ~ ":" ~ num_clusters_spec }
num_clusters_spec = @{ ASCII_DIGIT+ }

// build_threshold: <digits>
build_threshold_kwarg = { "build_threshold" ~ ":" ~ build_threshold_spec }
build_threshold_spec = @{ ASCII_DIGIT+ }

// nprobe: <digits>
nprobe_kwarg = { "nprobe" ~ ":" ~ nprobe_spec }
// quantization: <name>, one of the quantization_spec spellings
quantization_kwarg = { "quantization" ~ ":" ~ quantization_spec }
// weight_threshold: <number>, decimal with an optional fraction
weight_threshold_kwarg = { "weight_threshold" ~ ":" ~ weight_threshold_spec }
nprobe_spec = @{ ASCII_DIGIT+ }

// Body of a quoted path: any characters except a double quote (may be empty).
path_chars = @{ (!"\"" ~ ANY)* }
// Query-time configuration for sparse vectors, nested inside indexed<...>.
// Example: query<tokenizer: "model/name", weighting: idf>
query_config_block = {
    "query" ~
    "<" ~ query_config_params ~ ">"
}

// Comma-separated, non-empty list of query parameters.
query_config_params = {
    query_config_param ~ ("," ~ query_config_param)*
}

query_config_param = {
    query_tokenizer_kwarg |
    query_weighting_kwarg
}

// tokenizer: "<path>"
query_tokenizer_kwarg = { "tokenizer" ~ ":" ~ tokenizer_path }
// weighting: one | idf
query_weighting_kwarg = { "weighting" ~ ":" ~ weighting_spec }
// Quoted tokenizer path, e.g. "model/name".
// Compound-atomic ($): implicit WHITESPACE/COMMENT skipping is disabled between
// the quotes and the path. As a normal rule, a path beginning with '#' was
// swallowed as a comment, and leading spaces were dropped while trailing
// spaces were kept. Unlike @, $ still emits the inner path_chars pair, so the
// parse tree shape is unchanged.
tokenizer_path = ${ "\"" ~ path_chars ~ "\"" }

// Accepted values for `weighting:`.
weighting_spec = { "one" | "idf" }
// Identifier (field names, index names, tokenizer names): an ASCII letter or
// underscore, followed by any number of ASCII letters, digits or underscores.
// Atomic, so no whitespace is permitted inside an identifier.
identifier = @{
    (ASCII_ALPHA | "_") ~
    (ASCII_ALPHANUMERIC | "_")*
}
// Implicit whitespace, skipped between tokens of non-atomic rules:
// space, tab, carriage return and newline.
WHITESPACE = _{
    " " | "\t" | "\r" | "\n"
}

// Line comment in schema files: from '#' up to, but not including, the newline.
COMMENT = _{
    "#" ~ (!"\n" ~ ANY)*
}