use pest::Parser;
use pest_derive::Parser;
use super::query_field_router::{QueryRouterRule, RoutingMode};
use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
use crate::Result;
use crate::error::Error;
/// SDL parser generated by `pest_derive` from the grammar file
/// `dsl/sdl/sdl.pest`.
///
/// The parsing functions in this file match on the `Rule` values produced
/// when this parser is run (see `parse_sdl`).
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
use super::schema::{BinaryDenseVectorConfig, DenseVectorConfig};
use crate::structures::{
IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
WeightQuantization,
};
/// One `field` declaration of an index, as produced by `parse_field_def`.
#[derive(Debug, Clone)]
pub struct FieldDef {
/// Field name as written in the SDL source.
pub name: String,
/// Field type resolved from the SDL type keyword (see `parse_field_type`).
pub field_type: FieldType,
/// Whether the field is indexed. `true` when the declaration has no
/// attribute list; always `true` for a primary-key field.
pub indexed: bool,
/// Whether the field is stored. `true` when the declaration has no
/// attribute list.
pub stored: bool,
/// Tokenizer name given with the field type, if any. `to_schema` uses
/// `"simple"` for text fields when this is `None`.
pub tokenizer: Option<String>,
/// Whether the field is multi-valued (set by the `stored_with_config`
/// attribute form).
pub multi: bool,
/// Position mode taken from the index configuration. When `None`,
/// `to_schema` uses `Ordinal` for multi-valued vector fields and sets
/// nothing otherwise.
pub positions: Option<super::schema::PositionMode>,
/// Sparse-vector settings, if any were given or derived from the index
/// configuration.
pub sparse_vector_config: Option<SparseVectorConfig>,
/// Dense-vector settings. `to_schema` panics if this is `None` for a
/// `DenseVector` field.
pub dense_vector_config: Option<DenseVectorConfig>,
/// Binary dense-vector settings. `to_schema` panics if this is `None` for
/// a `BinaryDenseVector` field.
pub binary_dense_vector_config: Option<BinaryDenseVectorConfig>,
/// `fast` attribute; always `true` for a primary-key field.
pub fast: bool,
/// `primary` attribute. `parse_index_def` allows at most one such field
/// per index and requires it to be a single-valued text field.
pub primary: bool,
/// `reorder` attribute, forwarded to `SchemaBuilder::set_reorder`.
pub reorder: bool,
}
/// One `index` block parsed from SDL source (see `parse_index_def`).
#[derive(Debug, Clone)]
pub struct IndexDef {
/// Index name as written in the SDL source.
pub name: String,
/// Field declarations in the order they appear in the index block.
pub fields: Vec<FieldDef>,
/// Names listed in the index's default-fields declaration; empty when the
/// index has none. Forwarded to the schema builder only when non-empty.
pub default_fields: Vec<String>,
/// Query router rules, one per `query_router` block, in source order.
pub query_routers: Vec<QueryRouterRule>,
}
impl IndexDef {
pub fn to_schema(&self) -> Schema {
let mut builder = SchemaBuilder::default();
for field in &self.fields {
let f = match field.field_type {
FieldType::Text => {
let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
builder.add_text_field_with_tokenizer(
&field.name,
field.indexed,
field.stored,
tokenizer,
)
}
FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
FieldType::Json => builder.add_json_field(&field.name, field.stored),
FieldType::SparseVector => {
if let Some(config) = &field.sparse_vector_config {
builder.add_sparse_vector_field_with_config(
&field.name,
field.indexed,
field.stored,
config.clone(),
)
} else {
builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
}
}
FieldType::DenseVector => {
let config = field
.dense_vector_config
.as_ref()
.expect("DenseVector field requires dimension to be specified");
builder.add_dense_vector_field_with_config(
&field.name,
field.indexed,
field.stored,
config.clone(),
)
}
FieldType::BinaryDenseVector => {
let config = field
.binary_dense_vector_config
.as_ref()
.expect("BinaryDenseVector field requires dimension to be specified");
builder.add_binary_dense_vector_field_with_config(
&field.name,
field.indexed,
field.stored,
config.clone(),
)
}
};
if field.multi {
builder.set_multi(f, true);
}
if field.fast {
builder.set_fast(f, true);
}
if field.primary {
builder.set_primary_key(f);
}
if field.reorder {
builder.set_reorder(f, true);
}
let positions = field.positions.or({
if field.multi
&& matches!(
field.field_type,
FieldType::SparseVector
| FieldType::DenseVector
| FieldType::BinaryDenseVector
)
{
Some(super::schema::PositionMode::Ordinal)
} else {
None
}
});
if let Some(mode) = positions {
builder.set_positions(f, mode);
}
}
if !self.default_fields.is_empty() {
builder.set_default_fields(self.default_fields.clone());
}
if !self.query_routers.is_empty() {
builder.set_query_routers(self.query_routers.clone());
}
builder.build()
}
pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
if self.query_routers.is_empty() {
return Ok(None);
}
super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
.map(Some)
.map_err(Error::Schema)
}
}
fn parse_field_type(type_str: &str) -> Result<FieldType> {
match type_str {
"text" | "string" | "str" => Ok(FieldType::Text),
"u64" | "uint" | "unsigned" => Ok(FieldType::U64),
"i64" | "int" | "integer" => Ok(FieldType::I64),
"f64" | "float" | "double" => Ok(FieldType::F64),
"bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
"json" => Ok(FieldType::Json),
"sparse_vector" => Ok(FieldType::SparseVector),
"dense_vector" | "vector" => Ok(FieldType::DenseVector),
"binary_dense_vector" | "binary_vector" => Ok(FieldType::BinaryDenseVector),
_ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
}
}
/// Settings collected from the parameter list of an `indexed_with_config`
/// attribute (see `parse_index_config`).
///
/// Every field is `None` until the matching parameter is seen. The values are
/// later copied into a dense or sparse vector config by
/// `apply_index_config_to_dense_vector` / `apply_index_config_to_sparse_vector`.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
// --- applied to dense vector configs ---
/// Vector index type (`flat`, `rabitq`, `ivf_rabitq` or `scann`).
index_type: Option<super::schema::VectorIndexType>,
num_clusters: Option<usize>,
nprobe: Option<usize>,
build_threshold: Option<usize>,
// --- applied to sparse vector configs ---
/// Sparse index format (`bmp` or `maxscore`).
sparse_format: Option<SparseFormat>,
/// Weight quantization for sparse vectors.
quantization: Option<WeightQuantization>,
weight_threshold: Option<f32>,
/// Rounded up to the next power of two when applied.
block_size: Option<usize>,
/// Clamped to `[0.0, 1.0]` when applied.
pruning: Option<f32>,
min_terms: Option<usize>,
// --- query-time settings from a `query_config_block`, applied to the
// sparse vector's `SparseQueryConfig` ---
query_tokenizer: Option<String>,
query_weighting: Option<QueryWeighting>,
query_weight_threshold: Option<f32>,
query_max_dims: Option<usize>,
query_pruning: Option<f32>,
query_min_query_dims: Option<usize>,
// --- further sparse vector settings ---
dims: Option<u32>,
max_weight: Option<f32>,
/// Position mode; copied to `FieldDef::positions` by `parse_field_def`.
positions: Option<super::schema::PositionMode>,
}
/// Flags and index configuration extracted from a field's attribute list by
/// `parse_attributes`. Every flag is `false` unless its attribute is present.
struct ParsedAttributes {
indexed: bool,
stored: bool,
/// Set together with `stored` by the `stored_with_config` attribute form.
multi: bool,
fast: bool,
primary: bool,
reorder: bool,
/// Parameters of the `indexed_with_config` attribute form, if it was used.
index_config: Option<IndexConfig>,
}
/// Reads a field's attribute list into a [`ParsedAttributes`].
///
/// Each `attribute` node is handled in one of two ways:
/// * if it contains an `indexed_with_config` or `stored_with_config` child,
///   the first such child decides the outcome (`indexed` plus its parsed
///   configuration, or `stored` plus `multi`);
/// * otherwise the attribute text is matched against the bare keywords
///   `indexed`, `stored`, `fast`, `primary` and `reorder`; anything else is
///   ignored.
fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> ParsedAttributes {
    let mut parsed = ParsedAttributes {
        indexed: false,
        stored: false,
        multi: false,
        fast: false,
        primary: false,
        reorder: false,
        index_config: None,
    };

    let attributes = pair
        .into_inner()
        .filter(|node| node.as_rule() == Rule::attribute);

    for attribute in attributes {
        // Capture the text first: it borrows from the input, not from the
        // node, so it stays valid after the node is consumed below.
        let keyword = attribute.as_str();
        let configured = attribute.into_inner().find(|child| {
            matches!(
                child.as_rule(),
                Rule::indexed_with_config | Rule::stored_with_config
            )
        });

        match configured {
            Some(child) if child.as_rule() == Rule::indexed_with_config => {
                parsed.indexed = true;
                parsed.index_config = Some(parse_index_config(child));
            }
            Some(_) => {
                parsed.stored = true;
                parsed.multi = true;
            }
            None => match keyword {
                "indexed" => parsed.indexed = true,
                "stored" => parsed.stored = true,
                "fast" => parsed.fast = true,
                "primary" => parsed.primary = true,
                "reorder" => parsed.reorder = true,
                _ => {}
            },
        }
    }

    parsed
}
/// Collects every parameter of an `indexed_with_config` node into an
/// [`IndexConfig`].
///
/// Walks `index_config_params` -> `index_config_param` -> leaf and hands each
/// leaf, in source order, to `parse_single_index_config_param`.
fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
    let mut config = IndexConfig::default();

    pair.into_inner()
        .filter(|node| node.as_rule() == Rule::index_config_params)
        .flat_map(|params| params.into_inner())
        .filter(|node| node.as_rule() == Rule::index_config_param)
        .flat_map(|param| param.into_inner())
        .for_each(|leaf| parse_single_index_config_param(&mut config, leaf));

    config
}
fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
use super::schema::VectorIndexType;
match p.as_rule() {
Rule::index_type_spec => {
config.index_type = Some(match p.as_str() {
"flat" => VectorIndexType::Flat,
"rabitq" => VectorIndexType::RaBitQ,
"ivf_rabitq" => VectorIndexType::IvfRaBitQ,
"scann" => VectorIndexType::ScaNN,
_ => VectorIndexType::RaBitQ,
});
}
Rule::index_type_kwarg => {
if let Some(t) = p.into_inner().next() {
config.index_type = Some(match t.as_str() {
"flat" => VectorIndexType::Flat,
"rabitq" => VectorIndexType::RaBitQ,
"ivf_rabitq" => VectorIndexType::IvfRaBitQ,
"scann" => VectorIndexType::ScaNN,
_ => VectorIndexType::RaBitQ,
});
}
}
Rule::num_clusters_kwarg => {
if let Some(n) = p.into_inner().next() {
config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!(
"Invalid num_clusters value '{}', using default 256",
n.as_str()
);
256
}));
}
}
Rule::build_threshold_kwarg => {
if let Some(n) = p.into_inner().next() {
config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!(
"Invalid build_threshold value '{}', using default 10000",
n.as_str()
);
10000
}));
}
}
Rule::nprobe_kwarg => {
if let Some(n) = p.into_inner().next() {
config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
32
}));
}
}
Rule::quantization_kwarg => {
if let Some(q) = p.into_inner().next() {
config.quantization = Some(match q.as_str() {
"float32" | "f32" => WeightQuantization::Float32,
"float16" | "f16" => WeightQuantization::Float16,
"uint8" | "u8" => WeightQuantization::UInt8,
"uint4" | "u4" => WeightQuantization::UInt4,
_ => WeightQuantization::default(),
});
}
}
Rule::weight_threshold_kwarg => {
if let Some(t) = p.into_inner().next() {
config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
log::warn!(
"Invalid weight_threshold value '{}', using default 0.0",
t.as_str()
);
0.0
}));
}
}
Rule::block_size_kwarg => {
if let Some(n) = p.into_inner().next() {
config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!(
"Invalid block_size value '{}', using default 128",
n.as_str()
);
128
}));
}
}
Rule::pruning_kwarg => {
if let Some(f) = p.into_inner().next() {
config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
1.0
}));
}
}
Rule::min_terms_kwarg => {
if let Some(n) = p.into_inner().next() {
config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
4
}));
}
}
Rule::sparse_format_kwarg => {
if let Some(f) = p.into_inner().next() {
config.sparse_format = Some(match f.as_str() {
"bmp" => SparseFormat::Bmp,
"maxscore" => SparseFormat::MaxScore,
_ => SparseFormat::default(),
});
}
}
Rule::sparse_dims_kwarg => {
if let Some(n) = p.into_inner().next() {
config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
105879
}));
}
}
Rule::sparse_max_weight_kwarg => {
if let Some(f) = p.into_inner().next() {
config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
log::warn!(
"Invalid max_weight value '{}', using default 5.0",
f.as_str()
);
5.0
}));
}
}
Rule::query_config_block => {
parse_query_config_block(config, p);
}
Rule::positions_kwarg => {
use super::schema::PositionMode;
config.positions = Some(match p.as_str() {
"ordinal" => PositionMode::Ordinal,
"token_position" => PositionMode::TokenPosition,
_ => PositionMode::Full, });
}
_ => {}
}
}
/// Reads the query-time parameters of a `query_config_block` into `config`.
///
/// Walks `query_config_params` -> `query_config_param` -> keyword node.
/// Numeric values that fail to parse log a warning and fall back to a fixed
/// value; an unrecognised weighting name falls back to `QueryWeighting::One`.
/// Keyword nodes without a value, and nodes of any other rule, are ignored.
fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
    let kwargs = pair
        .into_inner()
        .filter(|node| node.as_rule() == Rule::query_config_params)
        .flat_map(|params| params.into_inner())
        .filter(|node| node.as_rule() == Rule::query_config_param)
        .flat_map(|param| param.into_inner());

    for kwarg in kwargs {
        let rule = kwarg.as_rule();
        let Some(value) = kwarg.into_inner().next() else {
            continue;
        };
        let raw = value.as_str();

        match rule {
            Rule::query_tokenizer_kwarg => {
                // The tokenizer path is nested one level below the value.
                if let Some(path) = value.into_inner().next() {
                    config.query_tokenizer = Some(path.as_str().to_string());
                }
            }
            Rule::query_weighting_kwarg => {
                config.query_weighting = Some(match raw {
                    "idf" => QueryWeighting::Idf,
                    "idf_file" => QueryWeighting::IdfFile,
                    // "one" and any unrecognised name
                    _ => QueryWeighting::One,
                });
            }
            Rule::query_weight_threshold_kwarg => {
                let threshold = match raw.parse::<f32>() {
                    Ok(parsed) => parsed,
                    Err(_) => {
                        log::warn!("Invalid query weight_threshold '{}', using 0.0", raw);
                        0.0
                    }
                };
                config.query_weight_threshold = Some(threshold);
            }
            Rule::query_max_dims_kwarg => {
                let max_dims = match raw.parse::<usize>() {
                    Ok(parsed) => parsed,
                    Err(_) => {
                        log::warn!("Invalid query max_dims '{}', using 0", raw);
                        0
                    }
                };
                config.query_max_dims = Some(max_dims);
            }
            Rule::query_pruning_kwarg => {
                let pruning = match raw.parse::<f32>() {
                    Ok(parsed) => parsed,
                    Err(_) => {
                        log::warn!("Invalid query pruning '{}', using 1.0", raw);
                        1.0
                    }
                };
                config.query_pruning = Some(pruning);
            }
            Rule::query_min_query_dims_kwarg => {
                let min_dims = match raw.parse::<usize>() {
                    Ok(parsed) => parsed,
                    Err(_) => {
                        log::warn!("Invalid query min_query_dims '{}', using 4", raw);
                        4
                    }
                };
                config.query_min_query_dims = Some(min_dims);
            }
            _ => {}
        }
    }
}
fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
let mut inner = pair.into_inner();
let name = inner
.next()
.ok_or_else(|| Error::Schema("Missing field name".to_string()))?
.as_str()
.to_string();
let field_type_str = inner
.next()
.ok_or_else(|| Error::Schema("Missing field type".to_string()))?
.as_str();
let field_type = parse_field_type(field_type_str)?;
let mut tokenizer = None;
let mut sparse_vector_config = None;
let mut dense_vector_config = None;
let mut binary_dense_vector_config = None;
let mut indexed = true;
let mut stored = true;
let mut multi = false;
let mut fast = false;
let mut primary = false;
let mut reorder = false;
let mut index_config: Option<IndexConfig> = None;
for item in inner {
match item.as_rule() {
Rule::tokenizer_spec => {
if let Some(tok_name) = item.into_inner().next() {
tokenizer = Some(tok_name.as_str().to_string());
}
}
Rule::sparse_vector_config => {
sparse_vector_config = Some(parse_sparse_vector_config(item));
}
Rule::dense_vector_config => {
dense_vector_config = Some(parse_dense_vector_config(item));
}
Rule::binary_dense_vector_config => {
let dim: usize = item
.into_inner()
.next()
.map(|d| d.as_str().parse().unwrap_or(0))
.unwrap_or(0);
if dim == 0 || !dim.is_multiple_of(8) {
return Err(Error::Schema(format!(
"BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
)));
}
binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
}
Rule::attributes => {
let attrs = parse_attributes(item);
indexed = attrs.indexed;
stored = attrs.stored;
multi = attrs.multi;
fast = attrs.fast;
primary = attrs.primary;
reorder = attrs.reorder;
index_config = attrs.index_config;
}
_ => {}
}
}
if field_type == FieldType::BinaryDenseVector
&& binary_dense_vector_config.is_none()
&& let Some(ref dv_config) = dense_vector_config
{
let dim = dv_config.dim;
if dim == 0 || !dim.is_multiple_of(8) {
return Err(Error::Schema(format!(
"BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
)));
}
binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
dense_vector_config = None;
}
if primary {
fast = true;
indexed = true;
}
let mut positions = None;
if let Some(idx_cfg) = index_config {
positions = idx_cfg.positions;
if let Some(ref mut dv_config) = dense_vector_config {
apply_index_config_to_dense_vector(dv_config, idx_cfg);
} else if field_type == FieldType::SparseVector {
let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
apply_index_config_to_sparse_vector(sv_config, idx_cfg);
}
}
Ok(FieldDef {
name,
field_type,
indexed,
stored,
tokenizer,
multi,
positions,
sparse_vector_config,
dense_vector_config,
binary_dense_vector_config,
fast,
primary,
reorder,
})
}
fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
if let Some(index_type) = idx_cfg.index_type {
config.index_type = index_type;
}
if idx_cfg.num_clusters.is_some() {
config.num_clusters = idx_cfg.num_clusters;
}
if let Some(nprobe) = idx_cfg.nprobe {
config.nprobe = nprobe;
}
if idx_cfg.build_threshold.is_some() {
config.build_threshold = idx_cfg.build_threshold;
}
}
/// Builds a [`SparseVectorConfig`] from a `sparse_vector_config` node.
///
/// Only the index size is read from the node (`u16` or `u32`; the last
/// `index_size_spec` wins, anything else yields `IndexSize::default()`). All
/// other settings start from the fixed values below.
fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
    let index_size = pair
        .into_inner()
        .filter(|node| node.as_rule() == Rule::index_size_spec)
        .last()
        .map_or_else(IndexSize::default, |spec| match spec.as_str() {
            "u16" => IndexSize::U16,
            "u32" => IndexSize::U32,
            _ => IndexSize::default(),
        });

    SparseVectorConfig {
        format: SparseFormat::default(),
        index_size,
        weight_quantization: WeightQuantization::default(),
        weight_threshold: 0.0,
        block_size: 128,
        bmp_block_size: 64,
        max_bmp_grid_bytes: 0,
        bmp_superblock_size: 64,
        pruning: None,
        query_config: None,
        dims: None,
        max_weight: None,
        min_terms: 4,
    }
}
fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
if let Some(f) = idx_cfg.sparse_format {
config.format = f;
}
if let Some(q) = idx_cfg.quantization {
config.weight_quantization = q;
}
if let Some(t) = idx_cfg.weight_threshold {
config.weight_threshold = t;
}
if let Some(bs) = idx_cfg.block_size {
let adjusted = bs.next_power_of_two();
if adjusted != bs {
log::warn!(
"block_size {} adjusted to next power of two: {}",
bs,
adjusted
);
}
config.block_size = adjusted;
}
if let Some(p) = idx_cfg.pruning {
let clamped = p.clamp(0.0, 1.0);
if (clamped - p).abs() > f32::EPSILON {
log::warn!(
"pruning {} clamped to valid range [0.0, 1.0]: {}",
p,
clamped
);
}
config.pruning = Some(clamped);
}
if let Some(mt) = idx_cfg.min_terms {
config.min_terms = mt;
}
if let Some(d) = idx_cfg.dims {
config.dims = Some(d);
}
if let Some(mw) = idx_cfg.max_weight {
config.max_weight = Some(mw);
}
if idx_cfg.query_tokenizer.is_some()
|| idx_cfg.query_weighting.is_some()
|| idx_cfg.query_weight_threshold.is_some()
|| idx_cfg.query_max_dims.is_some()
|| idx_cfg.query_pruning.is_some()
|| idx_cfg.query_min_query_dims.is_some()
{
let query_config = config
.query_config
.get_or_insert(SparseQueryConfig::default());
if let Some(tokenizer) = idx_cfg.query_tokenizer {
query_config.tokenizer = Some(tokenizer);
}
if let Some(weighting) = idx_cfg.query_weighting {
query_config.weighting = weighting;
}
if let Some(t) = idx_cfg.query_weight_threshold {
query_config.weight_threshold = t;
}
if let Some(d) = idx_cfg.query_max_dims {
query_config.max_query_dims = Some(d);
}
if let Some(p) = idx_cfg.query_pruning {
query_config.pruning = Some(p);
}
if let Some(m) = idx_cfg.query_min_query_dims {
query_config.min_query_dims = m;
}
}
}
/// Builds a [`DenseVectorConfig`] from a `dense_vector_config` node.
///
/// Both parameter forms are accepted: keyword (`dims_kwarg`) and positional
/// (`dimension_spec`), each optionally with a `quant_type_spec`. A dimension
/// that is absent or fails to parse is left as / becomes `0`; quantization
/// defaults to `F32`.
fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
    let mut dim: usize = 0;
    let mut quantization = DenseVectorQuantization::F32;

    let groups = pair
        .into_inner()
        .filter(|node| node.as_rule() == Rule::dense_vector_params)
        .flat_map(|params| params.into_inner());

    for group in groups {
        let keyword_form = match group.as_rule() {
            Rule::dense_vector_keyword_params => true,
            Rule::dense_vector_positional_params => false,
            _ => continue,
        };

        for item in group.into_inner() {
            match (keyword_form, item.as_rule()) {
                (_, Rule::quant_type_spec) => {
                    quantization = parse_quant_type(item.as_str());
                }
                (true, Rule::dims_kwarg) => {
                    // The number is the keyword node's first child.
                    if let Some(value) = item.into_inner().next() {
                        dim = value.as_str().parse().unwrap_or(0);
                    }
                }
                (false, Rule::dimension_spec) => {
                    dim = item.as_str().parse().unwrap_or(0);
                }
                _ => {}
            }
        }
    }

    DenseVectorConfig::new(dim).with_quantization(quantization)
}
/// Maps a quantization name to a [`DenseVectorQuantization`].
///
/// Surrounding whitespace is ignored. `f16` selects `F16`, `uint8` / `u8`
/// select `UInt8`, and every other input selects `F32`.
fn parse_quant_type(s: &str) -> DenseVectorQuantization {
    let name = s.trim();
    if name == "f16" {
        DenseVectorQuantization::F16
    } else if name == "uint8" || name == "u8" {
        DenseVectorQuantization::UInt8
    } else {
        DenseVectorQuantization::F32
    }
}
/// Returns the text of every child of a `default_fields_def` node, in source
/// order.
fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
    let mut names = Vec::new();
    for entry in pair.into_inner() {
        names.push(entry.as_str().to_owned());
    }
    names
}
fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
let mut pattern = String::new();
let mut substitution = String::new();
let mut target_field = String::new();
let mut mode = RoutingMode::Additional;
for prop in pair.into_inner() {
if prop.as_rule() != Rule::query_router_prop {
continue;
}
for inner in prop.into_inner() {
match inner.as_rule() {
Rule::query_router_pattern => {
if let Some(regex_str) = inner.into_inner().next() {
pattern = parse_string_value(regex_str);
}
}
Rule::query_router_substitution => {
if let Some(quoted) = inner.into_inner().next() {
substitution = parse_string_value(quoted);
}
}
Rule::query_router_target => {
if let Some(ident) = inner.into_inner().next() {
target_field = ident.as_str().to_string();
}
}
Rule::query_router_mode => {
if let Some(mode_val) = inner.into_inner().next() {
mode = match mode_val.as_str() {
"exclusive" => RoutingMode::Exclusive,
"additional" => RoutingMode::Additional,
_ => RoutingMode::Additional,
};
}
}
_ => {}
}
}
}
if pattern.is_empty() {
return Err(Error::Schema("query_router missing 'pattern'".to_string()));
}
if substitution.is_empty() {
return Err(Error::Schema(
"query_router missing 'substitution'".to_string(),
));
}
if target_field.is_empty() {
return Err(Error::Schema(
"query_router missing 'target_field'".to_string(),
));
}
Ok(QueryRouterRule {
pattern,
substitution,
target_field,
mode,
})
}
/// Decodes the string held by a `regex_string`, `raw_string` or
/// `quoted_string` node; any other node yields its text unchanged.
///
/// * `regex_string` delegates to its first child, or yields its own text when
///   it has none.
/// * `raw_string` drops the two-character opener and the closing delimiter
///   and returns the rest verbatim.
/// * `quoted_string` drops the surrounding quotes and decodes the escapes
///   `\n`, `\t`, `\"` and `\\`. Any other backslash sequence is kept as
///   written.
///
/// Escapes are decoded in a single left-to-right pass so that an escaped
/// backslash is never re-read as the start of another escape (`\\n` decodes
/// to a backslash followed by `n`, not to a backslash and a newline).
/// Text too short to carry its delimiters is returned unchanged instead of
/// panicking.
fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
    let text = pair.as_str();
    match pair.as_rule() {
        Rule::regex_string => match pair.into_inner().next() {
            Some(inner) => parse_string_value(inner),
            None => text.to_string(),
        },
        Rule::raw_string => text
            .get(2..text.len().saturating_sub(1))
            .unwrap_or(text)
            .to_string(),
        Rule::quoted_string => {
            let body = text.get(1..text.len().saturating_sub(1)).unwrap_or(text);
            let mut decoded = String::with_capacity(body.len());
            let mut chars = body.chars();
            while let Some(ch) = chars.next() {
                if ch != '\\' {
                    decoded.push(ch);
                    continue;
                }
                match chars.next() {
                    Some('n') => decoded.push('\n'),
                    Some('t') => decoded.push('\t'),
                    Some('"') => decoded.push('"'),
                    Some('\\') => decoded.push('\\'),
                    // Unknown escape: keep both characters as written.
                    Some(other) => {
                        decoded.push('\\');
                        decoded.push(other);
                    }
                    // Lone trailing backslash.
                    None => decoded.push('\\'),
                }
            }
            decoded
        }
        _ => text.to_string(),
    }
}
fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
let mut inner = pair.into_inner();
let name = inner
.next()
.ok_or_else(|| Error::Schema("Missing index name".to_string()))?
.as_str()
.to_string();
let mut fields = Vec::new();
let mut default_fields = Vec::new();
let mut query_routers = Vec::new();
for item in inner {
match item.as_rule() {
Rule::field_def => {
fields.push(parse_field_def(item)?);
}
Rule::default_fields_def => {
default_fields = parse_default_fields_def(item);
}
Rule::query_router_def => {
query_routers.push(parse_query_router_def(item)?);
}
_ => {}
}
}
let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
if primary_fields.len() > 1 {
return Err(Error::Schema(format!(
"Index '{}' has {} primary key fields, but at most one is allowed",
name,
primary_fields.len()
)));
}
if let Some(pk) = primary_fields.first() {
if pk.field_type != FieldType::Text {
return Err(Error::Schema(format!(
"Primary key field '{}' must be of type text, got {:?}",
pk.name, pk.field_type
)));
}
if pk.multi {
return Err(Error::Schema(format!(
"Primary key field '{}' cannot be multi-valued",
pk.name
)));
}
}
Ok(IndexDef {
name,
fields,
default_fields,
query_routers,
})
}
pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
let pairs = SdlParser::parse(Rule::file, input)
.map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
let mut indexes = Vec::new();
for pair in pairs {
if pair.as_rule() == Rule::file {
for inner in pair.into_inner() {
if inner.as_rule() == Rule::index_def {
indexes.push(parse_index_def(inner)?);
}
}
}
}
Ok(indexes)
}
pub fn parse_single_index(input: &str) -> Result<IndexDef> {
let indexes = parse_sdl(input)?;
if indexes.is_empty() {
return Err(Error::Schema("No index definition found".to_string()));
}
if indexes.len() > 1 {
return Err(Error::Schema(
"Multiple index definitions found, expected one".to_string(),
));
}
Ok(indexes.into_iter().next().unwrap())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_parse_simple_schema() {
let sdl = r#"
index articles {
field title: text [indexed, stored]
field body: text [indexed]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes.len(), 1);
let index = &indexes[0];
assert_eq!(index.name, "articles");
assert_eq!(index.fields.len(), 2);
assert_eq!(index.fields[0].name, "title");
assert!(matches!(index.fields[0].field_type, FieldType::Text));
assert!(index.fields[0].indexed);
assert!(index.fields[0].stored);
assert_eq!(index.fields[1].name, "body");
assert!(matches!(index.fields[1].field_type, FieldType::Text));
assert!(index.fields[1].indexed);
assert!(!index.fields[1].stored);
}
#[test]
fn test_parse_all_field_types() {
let sdl = r#"
index test {
field text_field: text [indexed, stored]
field u64_field: u64 [indexed, stored]
field i64_field: i64 [indexed, stored]
field f64_field: f64 [indexed, stored]
field bytes_field: bytes [stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let index = &indexes[0];
assert!(matches!(index.fields[0].field_type, FieldType::Text));
assert!(matches!(index.fields[1].field_type, FieldType::U64));
assert!(matches!(index.fields[2].field_type, FieldType::I64));
assert!(matches!(index.fields[3].field_type, FieldType::F64));
assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
}
#[test]
fn test_parse_with_comments() {
let sdl = r#"
# This is a comment
index articles {
# Title field
field title: text [indexed, stored]
field body: text [indexed] # inline comment not supported yet
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes[0].fields.len(), 2);
}
#[test]
fn test_parse_type_aliases() {
let sdl = r#"
index test {
field a: string [indexed]
field b: int [indexed]
field c: uint [indexed]
field d: float [indexed]
field e: binary [stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let index = &indexes[0];
assert!(matches!(index.fields[0].field_type, FieldType::Text));
assert!(matches!(index.fields[1].field_type, FieldType::I64));
assert!(matches!(index.fields[2].field_type, FieldType::U64));
assert!(matches!(index.fields[3].field_type, FieldType::F64));
assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
}
#[test]
fn test_to_schema() {
let sdl = r#"
index articles {
field title: text [indexed, stored]
field views: u64 [indexed, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let schema = indexes[0].to_schema();
assert!(schema.get_field("title").is_some());
assert!(schema.get_field("views").is_some());
assert!(schema.get_field("nonexistent").is_none());
}
#[test]
fn test_default_attributes() {
let sdl = r#"
index test {
field title: text
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let field = &indexes[0].fields[0];
assert!(field.indexed);
assert!(field.stored);
}
#[test]
fn test_multiple_indexes() {
let sdl = r#"
index articles {
field title: text [indexed, stored]
}
index users {
field name: text [indexed, stored]
field email: text [indexed, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes.len(), 2);
assert_eq!(indexes[0].name, "articles");
assert_eq!(indexes[1].name, "users");
}
#[test]
fn test_tokenizer_spec() {
let sdl = r#"
index articles {
field title: text<en_stem> [indexed, stored]
field body: text<simple> [indexed]
field author: text [indexed, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let index = &indexes[0];
assert_eq!(index.fields[0].name, "title");
assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
assert_eq!(index.fields[1].name, "body");
assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
assert_eq!(index.fields[2].name, "author");
assert_eq!(index.fields[2].tokenizer, None); }
#[test]
fn test_tokenizer_in_schema() {
let sdl = r#"
index articles {
field title: text<german> [indexed, stored]
field body: text<en_stem> [indexed]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let schema = indexes[0].to_schema();
let title_field = schema.get_field("title").unwrap();
let title_entry = schema.get_field_entry(title_field).unwrap();
assert_eq!(title_entry.tokenizer, Some("german".to_string()));
let body_field = schema.get_field("body").unwrap();
let body_entry = schema.get_field_entry(body_field).unwrap();
assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
}
#[test]
fn test_query_router_basic() {
let sdl = r#"
index documents {
field title: text [indexed, stored]
field uri: text [indexed, stored]
query_router {
pattern: "10\\.\\d{4,}/[^\\s]+"
substitution: "doi://{0}"
target_field: uris
mode: exclusive
}
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let index = &indexes[0];
assert_eq!(index.query_routers.len(), 1);
let router = &index.query_routers[0];
assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
assert_eq!(router.substitution, "doi://{0}");
assert_eq!(router.target_field, "uris");
assert_eq!(router.mode, RoutingMode::Exclusive);
}
#[test]
fn test_query_router_raw_string() {
let sdl = r#"
index documents {
field uris: text [indexed, stored]
query_router {
pattern: r"^pmid:(\d+)$"
substitution: "pubmed://{1}"
target_field: uris
mode: additional
}
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let router = &indexes[0].query_routers[0];
assert_eq!(router.pattern, r"^pmid:(\d+)$");
assert_eq!(router.substitution, "pubmed://{1}");
assert_eq!(router.mode, RoutingMode::Additional);
}
#[test]
fn test_multiple_query_routers() {
let sdl = r#"
index documents {
field uris: text [indexed, stored]
query_router {
pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
substitution: "doi://{1}"
target_field: uris
mode: exclusive
}
query_router {
pattern: r"^pmid:(\d+)$"
substitution: "pubmed://{1}"
target_field: uris
mode: exclusive
}
query_router {
pattern: r"^arxiv:(\d+\.\d+)$"
substitution: "arxiv://{1}"
target_field: uris
mode: additional
}
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes[0].query_routers.len(), 3);
}
#[test]
fn test_query_router_default_mode() {
let sdl = r#"
index documents {
field uris: text [indexed, stored]
query_router {
pattern: r"test"
substitution: "{0}"
target_field: uris
}
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
}
#[test]
fn test_multi_attribute() {
let sdl = r#"
index documents {
field uris: text [indexed, stored<multi>]
field title: text [indexed, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes.len(), 1);
let fields = &indexes[0].fields;
assert_eq!(fields.len(), 2);
assert_eq!(fields[0].name, "uris");
assert!(fields[0].multi, "uris field should have multi=true");
assert_eq!(fields[1].name, "title");
assert!(!fields[1].multi, "title field should have multi=false");
let schema = indexes[0].to_schema();
let uris_field = schema.get_field("uris").unwrap();
let title_field = schema.get_field("title").unwrap();
assert!(schema.get_field_entry(uris_field).unwrap().multi);
assert!(!schema.get_field_entry(title_field).unwrap().multi);
}
#[test]
fn test_sparse_vector_field() {
let sdl = r#"
index documents {
field embedding: sparse_vector [indexed, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes.len(), 1);
assert_eq!(indexes[0].fields.len(), 1);
assert_eq!(indexes[0].fields[0].name, "embedding");
assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
assert!(indexes[0].fields[0].sparse_vector_config.is_none());
}
#[test]
fn test_sparse_vector_with_config() {
let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
field dense: sparse_vector<u32> [indexed<quantization: float32>]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes[0].fields.len(), 2);
let f1 = &indexes[0].fields[0];
assert_eq!(f1.name, "embedding");
let config1 = f1.sparse_vector_config.as_ref().unwrap();
assert_eq!(config1.index_size, IndexSize::U16);
assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
let f2 = &indexes[0].fields[1];
assert_eq!(f2.name, "dense");
let config2 = f2.sparse_vector_config.as_ref().unwrap();
assert_eq!(config2.index_size, IndexSize::U32);
assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
}
#[test]
fn test_sparse_vector_with_weight_threshold() {
let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
assert_eq!(indexes[0].fields.len(), 2);
let f1 = &indexes[0].fields[0];
assert_eq!(f1.name, "embedding");
let config1 = f1.sparse_vector_config.as_ref().unwrap();
assert_eq!(config1.index_size, IndexSize::U16);
assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
assert!((config1.weight_threshold - 0.1).abs() < 0.001);
let f2 = &indexes[0].fields[1];
assert_eq!(f2.name, "embedding2");
let config2 = f2.sparse_vector_config.as_ref().unwrap();
assert_eq!(config2.index_size, IndexSize::U32);
assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
assert!((config2.weight_threshold - 0.05).abs() < 0.001);
}
#[test]
fn test_sparse_vector_with_pruning() {
let sdl = r#"
index documents {
field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
}
"#;
let indexes = parse_sdl(sdl).unwrap();
let f = &indexes[0].fields[0];
assert_eq!(f.name, "embedding");
let config = f.sparse_vector_config.as_ref().unwrap();
assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
assert_eq!(config.pruning, Some(0.1));
}
/// `dense_vector<768>` must parse into a single dense vector field whose
/// config reports a dimension of 768.
#[test]
fn test_dense_vector_field() {
    let sdl = r#"
index documents {
field embedding: dense_vector<768> [indexed, stored]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);

    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 1);

    let field = &fields[0];
    assert_eq!(field.name, "embedding");
    assert_eq!(field.field_type, FieldType::DenseVector);

    let dense = field.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense.dim, 768);
}
/// The `vector<N>` spelling must produce the same field type as
/// `dense_vector<N>` and carry the dimension through.
#[test]
fn test_dense_vector_alias() {
    let sdl = r#"
index documents {
field embedding: vector<1536> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    assert_eq!(field.field_type, FieldType::DenseVector);

    let dense = field.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense.dim, 1536);
}
/// `num_clusters` given without `nprobe` must be recorded, and the test
/// expects `nprobe` to come out as 32 in that case.
#[test]
fn test_dense_vector_with_num_clusters() {
    let sdl = r#"
index documents {
field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);

    let field = &parsed[0].fields[0];
    assert_eq!(field.name, "embedding");
    assert_eq!(field.field_type, FieldType::DenseVector);

    let dense = field.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(256));
    // No `nprobe` in the SDL above.
    assert_eq!(dense.nprobe, 32);
}
/// Both `num_clusters` and `nprobe` given explicitly must be carried into
/// the dense vector config.
#[test]
fn test_dense_vector_with_num_clusters_and_nprobe() {
    let sdl = r#"
index documents {
field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(512));
    assert_eq!(dense.nprobe, 64);
}
/// The keyword form `dense_vector<dims: N>` must set the dimension and
/// leave `num_clusters` unset when none is given.
#[test]
fn test_dense_vector_keyword_syntax() {
    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 1536> [indexed, stored]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert!(dense.num_clusters.is_none());
}
/// Keyword `dims:` combined with explicit `num_clusters` and `nprobe` must
/// populate all three values on the config.
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
/// Keyword `dims:` with only `num_clusters` given: the test expects
/// `nprobe` to come out as 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(128));
    // No `nprobe` in the SDL above.
    assert_eq!(dense.nprobe, 32);
}
/// `indexed<scann, ...>` must select the ScaNN index type and keep the
/// cluster and probe settings.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
/// `indexed<ivf_rabitq, ...>` must select the IVF-RaBitQ index type and
/// keep the requested cluster count.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense.num_clusters, Some(512));
}
/// `indexed<rabitq>` must select the RaBitQ index type and leave
/// `num_clusters` unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768> [indexed<rabitq>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
    assert!(dense.num_clusters.is_none());
}
/// `indexed<flat>` must select the flat index type.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768> [indexed<flat>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::Flat);
}
/// A bare `indexed` attribute names no index type; the test expects the
/// resulting config to report RaBitQ.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
/// `dense_vector<768, f16>` must select F16 quantization; with a bare
/// `indexed` the test expects the index type to be RaBitQ.
#[test]
fn test_dense_vector_f16_quantization() {
    use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};

    let sdl = r#"
index documents {
field embedding: dense_vector<768, f16> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.quantization, DenseVectorQuantization::F16);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
/// `dense_vector<1024, uint8>` must select UInt8 quantization.
#[test]
fn test_dense_vector_uint8_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let sdl = r#"
index documents {
field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1024);
    assert_eq!(dense.quantization, DenseVectorQuantization::UInt8);
}
/// The short spelling `u8` must map to the same UInt8 quantization as
/// `uint8`.
#[test]
fn test_dense_vector_u8_alias() {
    use crate::dsl::schema::DenseVectorQuantization;

    let sdl = r#"
index documents {
field embedding: dense_vector<512, u8> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 512);
    assert_eq!(dense.quantization, DenseVectorQuantization::UInt8);
}
/// With no quantization named in the type, the test expects the config to
/// report F32.
#[test]
fn test_dense_vector_default_f32_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let sdl = r#"
index documents {
field embedding: dense_vector<768> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.quantization, DenseVectorQuantization::F32);
}
/// Keyword `dims:` followed by a quantization name must set both the
/// dimension and the quantization.
#[test]
fn test_dense_vector_keyword_with_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let sdl = r#"
index documents {
field embedding: dense_vector<dims: 768, f16> [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.quantization, DenseVectorQuantization::F16);
}
/// `json` fields must parse with or without an attribute list, and the
/// built schema entry for a stored JSON field must be stored but not indexed.
#[test]
fn test_json_field_type() {
    let sdl = r#"
index documents {
field title: text [indexed, stored]
field metadata: json [stored]
field extra: json
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 3);

    // `metadata` has an explicit `[stored]` attribute.
    let metadata = &index.fields[1];
    assert_eq!(metadata.name, "metadata");
    assert!(matches!(metadata.field_type, FieldType::Json));
    assert!(metadata.stored);

    // `extra` has no attribute list at all.
    let extra = &index.fields[2];
    assert_eq!(extra.name, "extra");
    assert!(matches!(extra.field_type, FieldType::Json));

    // Same expectations on the schema built from the definition.
    let schema = index.to_schema();
    let metadata_handle = schema.get_field("metadata").unwrap();
    let metadata_entry = schema.get_field_entry(metadata_handle).unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
/// A nested `query<tokenizer: ..., weighting: idf>` block must be parsed
/// into the sparse vector's query config and survive `to_schema()`.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
}
"#;
    let expected_tokenizer = Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct");

    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 1);

    let field = &index.fields[0];
    assert_eq!(field.name, "embedding");
    assert!(matches!(field.field_type, FieldType::SparseVector));

    // Checks on the parsed definition.
    let sparse = field.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.index_size, IndexSize::U16);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.tokenizer.as_deref(), expected_tokenizer);
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // Checks on the schema built from the definition.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.tokenizer.as_deref(), expected_tokenizer);
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
/// `query<weighting: one>` with no tokenizer must yield a query config with
/// `QueryWeighting::One` and no tokenizer.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
index documents {
field embedding: sparse_vector [indexed<query<weighting: one>>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let sparse = field.sparse_vector_config.as_ref().unwrap();
    let query = sparse.query_config.as_ref().unwrap();

    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
/// `weighting: idf_file` must map to `QueryWeighting::IdfFile` both on the
/// parsed definition and on the schema built from it.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];

    // Parsed definition.
    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

    // Schema built from the definition.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
/// Query-time `weight_threshold`, `max_dims` and `pruning` must be parsed
/// into the query config and preserved by `to_schema()`.
#[test]
fn test_sparse_vector_query_config_pruning_params() {
    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];

    // Parsed definition.
    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);
    assert!((parsed_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(parsed_query.max_query_dims, Some(25));
    assert!((parsed_query.pruning.unwrap() - 0.2).abs() < 0.001);

    // Schema built from the definition.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert!((schema_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(schema_query.max_query_dims, Some(25));
    assert!((schema_query.pruning.unwrap() - 0.2).abs() < 0.001);
}
/// `format: maxscore` must map to `SparseFormat::MaxScore` on the parsed
/// definition and on the schema built from it.
#[test]
fn test_sparse_vector_format_maxscore() {
    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];

    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.format, SparseFormat::MaxScore);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);

    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    assert_eq!(schema_sparse.format, SparseFormat::MaxScore);
}
/// `format: bmp` must map to `SparseFormat::Bmp`.
#[test]
fn test_sparse_vector_format_bmp() {
    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let field = &parsed[0].fields[0];
    let sparse = field.sparse_vector_config.as_ref().unwrap();

    assert_eq!(sparse.format, SparseFormat::Bmp);
}
/// The `fast` attribute must be recognised on f64, text, u64 and i64
/// fields, stay off where it is absent, and be reflected in the schema.
#[test]
fn test_fast_attribute() {
    let sdl = r#"
index products {
field name: text [indexed, stored]
field price: f64 [indexed, fast]
field category: text [indexed, stored, fast]
field count: u64 [fast]
field score: i64 [indexed, stored, fast]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);

    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 5);

    // `name` is declared without `fast`.
    assert!(!fields[0].fast);

    let price = &fields[1];
    assert!(price.fast);
    assert!(matches!(price.field_type, FieldType::F64));

    let category = &fields[2];
    assert!(category.fast);
    assert!(matches!(category.field_type, FieldType::Text));

    let count = &fields[3];
    assert!(count.fast);
    assert!(matches!(count.field_type, FieldType::U64));

    let score = &fields[4];
    assert!(score.fast);
    assert!(matches!(score.field_type, FieldType::I64));

    // The flag must be visible on the built schema as well.
    let schema = index.to_schema();
    let price_handle = schema.get_field("price").unwrap();
    let price_entry = schema.get_field_entry(price_handle).unwrap();
    assert!(price_entry.fast);

    let category_handle = schema.get_field("category").unwrap();
    let category_entry = schema.get_field_entry(category_handle).unwrap();
    assert!(category_entry.fast);

    let name_handle = schema.get_field("name").unwrap();
    let name_entry = schema.get_field_entry(name_handle).unwrap();
    assert!(!name_entry.fast);
}
/// A `primary` field must also come out as `fast` and `indexed`, and the
/// built schema must expose it as the primary field.
#[test]
fn test_primary_attribute() {
    let sdl = r#"
index documents {
field id: text [primary, stored]
field title: text [indexed, stored]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    assert_eq!(parsed.len(), 1);

    let index = &parsed[0];
    assert_eq!(index.fields.len(), 2);

    // Parsed definition: only `id` is primary.
    let id_def = &index.fields[0];
    assert!(id_def.primary, "id should be primary");
    assert!(id_def.fast, "primary implies fast");
    assert!(id_def.indexed, "primary implies indexed");
    let title_def = &index.fields[1];
    assert!(!title_def.primary);

    // Built schema: the same flags on the entries.
    let schema = index.to_schema();
    let id_handle = schema.get_field("id").unwrap();
    let id_entry = schema.get_field_entry(id_handle).unwrap();
    assert!(id_entry.primary_key);
    assert!(id_entry.fast);
    assert!(id_entry.indexed);

    let title_handle = schema.get_field("title").unwrap();
    let title_entry = schema.get_field_entry(title_handle).unwrap();
    assert!(!title_entry.primary_key);

    assert_eq!(schema.primary_field(), Some(id_handle));
}
/// `primary` must combine with `indexed`, `stored` and an explicit
/// tokenizer without losing any of them.
#[test]
fn test_primary_with_other_attributes() {
    let sdl = r#"
index documents {
field id: text<simple> [primary, indexed, stored]
field body: text [indexed]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let id_def = &parsed[0].fields[0];

    assert!(id_def.primary);
    assert!(id_def.indexed);
    assert!(id_def.stored);
    assert!(id_def.fast);
    assert_eq!(id_def.tokenizer, Some("simple".to_string()));
}
/// Declaring two `primary` fields must be rejected with an error that
/// mentions "primary key".
#[test]
fn test_primary_only_one_allowed() {
    let sdl = r#"
index documents {
field id: text [primary]
field alt_id: text [primary]
}
"#;
    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("primary key"),
        "Error should mention primary key: {}",
        message
    );
}
/// A `primary` attribute on a non-text field must be rejected with an
/// error that mentions "text".
#[test]
fn test_primary_must_be_text() {
    let sdl = r#"
index documents {
field id: u64 [primary]
}
"#;
    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("text"),
        "Error should mention text type: {}",
        message
    );
}
/// Combining `primary` with `stored<multi>` must be rejected with an error
/// that mentions "multi".
#[test]
fn test_primary_cannot_be_multi() {
    let sdl = r#"
index documents {
field id: text [primary, stored<multi>]
}
"#;
    let outcome = parse_sdl(sdl);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("multi"),
        "Error should mention multi: {}",
        message
    );
}
/// An index without any `primary` field must build a schema whose
/// `primary_field()` is `None`.
#[test]
fn test_no_primary_field() {
    let sdl = r#"
index documents {
field title: text [indexed, stored]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    let schema = index.to_schema();

    assert!(schema.primary_field().is_none());
}
/// The `reorder` attribute must be set only on the field that declares it,
/// both on the parsed definition and on the built schema.
#[test]
fn test_reorder_attribute() {
    let sdl = r#"
index documents {
field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>, reorder]
field embedding2: sparse_vector [indexed<format: bmp>]
}
"#;
    let parsed = parse_sdl(sdl).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 2);

    // Parsed definition.
    let with_reorder = &index.fields[0];
    let without_reorder = &index.fields[1];
    assert!(with_reorder.reorder);
    assert!(!without_reorder.reorder);

    // Built schema.
    let schema = index.to_schema();
    let first_handle = schema.get_field("embedding").unwrap();
    let first_entry = schema.get_field_entry(first_handle).unwrap();
    assert!(first_entry.reorder);

    let second_handle = schema.get_field("embedding2").unwrap();
    let second_entry = schema.get_field_entry(second_handle).unwrap();
    assert!(!second_entry.reorder);
}
}