1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Pest parser for the schema definition language (SDL).
///
/// The `Parser` derive generates the parsing implementation, together with the
/// `Rule` enum used throughout this file, from the grammar file named in the
/// `grammar` attribute below.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60 WeightQuantization,
61};
62
/// A single field declaration as read from an SDL index block.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Value type of the field, resolved from the type keyword.
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is given;
    /// also forced to `true` for the primary key.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is given.
    pub stored: bool,
    /// Tokenizer name from the field's tokenizer spec, if one was written.
    /// `IndexDef::to_schema` falls back to `"simple"` for text fields.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued (set by the parameterised `stored`
    /// attribute).
    pub multi: bool,
    /// Position mode taken from the `indexed<...>` configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings; present when the field has a sparse vector
    /// config or when a sparse vector field carries an `indexed<...>` config.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings; present when the field has a dense vector config.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Whether the `fast` attribute is set; forced to `true` for the primary key.
    pub fast: bool,
    /// Whether the field is declared as the primary key.
    pub primary: bool,
}
85
/// One parsed `index` block: its name, fields and index-level settings.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source.
    pub name: String,
    /// Field declarations in the order they appear in the block.
    pub fields: Vec<FieldDef>,
    /// Names listed in the block's default-fields declaration; empty when the
    /// block has none.
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
95
96impl IndexDef {
97 pub fn to_schema(&self) -> Schema {
99 let mut builder = SchemaBuilder::default();
100
101 for field in &self.fields {
102 let f = match field.field_type {
103 FieldType::Text => {
104 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
105 builder.add_text_field_with_tokenizer(
106 &field.name,
107 field.indexed,
108 field.stored,
109 tokenizer,
110 )
111 }
112 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
113 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
114 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
115 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
116 FieldType::Json => builder.add_json_field(&field.name, field.stored),
117 FieldType::SparseVector => {
118 if let Some(config) = &field.sparse_vector_config {
119 builder.add_sparse_vector_field_with_config(
120 &field.name,
121 field.indexed,
122 field.stored,
123 config.clone(),
124 )
125 } else {
126 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
127 }
128 }
129 FieldType::DenseVector => {
130 let config = field
132 .dense_vector_config
133 .as_ref()
134 .expect("DenseVector field requires dimension to be specified");
135 builder.add_dense_vector_field_with_config(
136 &field.name,
137 field.indexed,
138 field.stored,
139 config.clone(),
140 )
141 }
142 };
143 if field.multi {
144 builder.set_multi(f, true);
145 }
146 if field.fast {
147 builder.set_fast(f, true);
148 }
149 if field.primary {
150 builder.set_primary_key(f);
151 }
152 let positions = field.positions.or({
154 if field.multi
156 && matches!(
157 field.field_type,
158 FieldType::SparseVector | FieldType::DenseVector
159 )
160 {
161 Some(super::schema::PositionMode::Ordinal)
162 } else {
163 None
164 }
165 });
166 if let Some(mode) = positions {
167 builder.set_positions(f, mode);
168 }
169 }
170
171 if !self.default_fields.is_empty() {
173 builder.set_default_fields(self.default_fields.clone());
174 }
175
176 if !self.query_routers.is_empty() {
178 builder.set_query_routers(self.query_routers.clone());
179 }
180
181 builder.build()
182 }
183
184 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
189 if self.query_routers.is_empty() {
190 return Ok(None);
191 }
192
193 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
194 .map(Some)
195 .map_err(Error::Schema)
196 }
197}
198
199fn parse_field_type(type_str: &str) -> Result<FieldType> {
201 match type_str {
202 "text" | "string" | "str" => Ok(FieldType::Text),
203 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
204 "i64" | "int" | "integer" => Ok(FieldType::I64),
205 "f64" | "float" | "double" => Ok(FieldType::F64),
206 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
207 "json" => Ok(FieldType::Json),
208 "sparse_vector" => Ok(FieldType::SparseVector),
209 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
210 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
211 }
212}
213
/// Settings collected from an `indexed<...>` attribute.
///
/// Every entry is optional; `None` means the setting was not written. The
/// values are later copied onto a dense or sparse vector config, and
/// `positions` onto the field definition.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Settings applied to a dense vector config.
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    build_threshold: Option<usize>,
    // Settings applied to a sparse vector config.
    sparse_format: Option<SparseFormat>,
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    block_size: Option<usize>,
    pruning: Option<f32>,
    min_terms: Option<usize>,
    // Settings from the nested query config block; applied to the sparse
    // vector's query config.
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    query_weight_threshold: Option<f32>,
    query_max_dims: Option<usize>,
    query_pruning: Option<f32>,
    query_min_query_dims: Option<usize>,
    // Further sparse vector settings.
    dims: Option<u32>,
    max_weight: Option<f32>,
    // Position mode; copied to the field definition.
    positions: Option<super::schema::PositionMode>,
}
241
242fn parse_attributes(
247 pair: pest::iterators::Pair<Rule>,
248) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
249 let mut indexed = false;
250 let mut stored = false;
251 let mut multi = false;
252 let mut fast = false;
253 let mut primary = false;
254 let mut index_config = None;
255
256 for attr in pair.into_inner() {
257 if attr.as_rule() == Rule::attribute {
258 let mut found_config = false;
260 for inner in attr.clone().into_inner() {
261 match inner.as_rule() {
262 Rule::indexed_with_config => {
263 indexed = true;
264 index_config = Some(parse_index_config(inner));
265 found_config = true;
266 break;
267 }
268 Rule::stored_with_config => {
269 stored = true;
270 multi = true; found_config = true;
272 break;
273 }
274 _ => {}
275 }
276 }
277 if !found_config {
278 match attr.as_str() {
280 "indexed" => indexed = true,
281 "stored" => stored = true,
282 "fast" => fast = true,
283 "primary" => primary = true,
284 _ => {}
285 }
286 }
287 }
288 }
289
290 (indexed, stored, multi, fast, primary, index_config)
291}
292
293fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
295 let mut config = IndexConfig::default();
296
297 for inner in pair.into_inner() {
302 if inner.as_rule() == Rule::index_config_params {
303 for param in inner.into_inner() {
304 if param.as_rule() == Rule::index_config_param {
305 for p in param.into_inner() {
306 parse_single_index_config_param(&mut config, p);
307 }
308 }
309 }
310 }
311 }
312
313 config
314}
315
316fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
318 use super::schema::VectorIndexType;
319
320 match p.as_rule() {
321 Rule::index_type_spec => {
322 config.index_type = Some(match p.as_str() {
323 "flat" => VectorIndexType::Flat,
324 "rabitq" => VectorIndexType::RaBitQ,
325 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
326 "scann" => VectorIndexType::ScaNN,
327 _ => VectorIndexType::RaBitQ,
328 });
329 }
330 Rule::index_type_kwarg => {
331 if let Some(t) = p.into_inner().next() {
333 config.index_type = Some(match t.as_str() {
334 "flat" => VectorIndexType::Flat,
335 "rabitq" => VectorIndexType::RaBitQ,
336 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
337 "scann" => VectorIndexType::ScaNN,
338 _ => VectorIndexType::RaBitQ,
339 });
340 }
341 }
342 Rule::num_clusters_kwarg => {
343 if let Some(n) = p.into_inner().next() {
345 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
346 log::warn!(
347 "Invalid num_clusters value '{}', using default 256",
348 n.as_str()
349 );
350 256
351 }));
352 }
353 }
354 Rule::build_threshold_kwarg => {
355 if let Some(n) = p.into_inner().next() {
357 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
358 log::warn!(
359 "Invalid build_threshold value '{}', using default 10000",
360 n.as_str()
361 );
362 10000
363 }));
364 }
365 }
366 Rule::nprobe_kwarg => {
367 if let Some(n) = p.into_inner().next() {
369 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
370 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
371 32
372 }));
373 }
374 }
375 Rule::quantization_kwarg => {
376 if let Some(q) = p.into_inner().next() {
378 config.quantization = Some(match q.as_str() {
379 "float32" | "f32" => WeightQuantization::Float32,
380 "float16" | "f16" => WeightQuantization::Float16,
381 "uint8" | "u8" => WeightQuantization::UInt8,
382 "uint4" | "u4" => WeightQuantization::UInt4,
383 _ => WeightQuantization::default(),
384 });
385 }
386 }
387 Rule::weight_threshold_kwarg => {
388 if let Some(t) = p.into_inner().next() {
390 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
391 log::warn!(
392 "Invalid weight_threshold value '{}', using default 0.0",
393 t.as_str()
394 );
395 0.0
396 }));
397 }
398 }
399 Rule::block_size_kwarg => {
400 if let Some(n) = p.into_inner().next() {
402 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
403 log::warn!(
404 "Invalid block_size value '{}', using default 128",
405 n.as_str()
406 );
407 128
408 }));
409 }
410 }
411 Rule::pruning_kwarg => {
412 if let Some(f) = p.into_inner().next() {
414 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
415 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
416 1.0
417 }));
418 }
419 }
420 Rule::min_terms_kwarg => {
421 if let Some(n) = p.into_inner().next() {
422 config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
423 log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
424 4
425 }));
426 }
427 }
428 Rule::sparse_format_kwarg => {
429 if let Some(f) = p.into_inner().next() {
431 config.sparse_format = Some(match f.as_str() {
432 "bmp" => SparseFormat::Bmp,
433 "maxscore" => SparseFormat::MaxScore,
434 _ => SparseFormat::default(),
435 });
436 }
437 }
438 Rule::sparse_dims_kwarg => {
439 if let Some(n) = p.into_inner().next() {
440 config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
441 log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
442 105879
443 }));
444 }
445 }
446 Rule::sparse_max_weight_kwarg => {
447 if let Some(f) = p.into_inner().next() {
448 config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
449 log::warn!(
450 "Invalid max_weight value '{}', using default 5.0",
451 f.as_str()
452 );
453 5.0
454 }));
455 }
456 }
457 Rule::query_config_block => {
458 parse_query_config_block(config, p);
460 }
461 Rule::positions_kwarg => {
462 use super::schema::PositionMode;
464 config.positions = Some(match p.as_str() {
465 "ordinal" => PositionMode::Ordinal,
466 "token_position" => PositionMode::TokenPosition,
467 _ => PositionMode::Full, });
469 }
470 _ => {}
471 }
472}
473
474fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
476 for inner in pair.into_inner() {
477 if inner.as_rule() == Rule::query_config_params {
478 for param in inner.into_inner() {
479 if param.as_rule() == Rule::query_config_param {
480 for p in param.into_inner() {
481 match p.as_rule() {
482 Rule::query_tokenizer_kwarg => {
483 if let Some(path) = p.into_inner().next()
485 && let Some(inner_path) = path.into_inner().next()
486 {
487 config.query_tokenizer = Some(inner_path.as_str().to_string());
488 }
489 }
490 Rule::query_weighting_kwarg => {
491 if let Some(w) = p.into_inner().next() {
493 config.query_weighting = Some(match w.as_str() {
494 "one" => QueryWeighting::One,
495 "idf" => QueryWeighting::Idf,
496 "idf_file" => QueryWeighting::IdfFile,
497 _ => QueryWeighting::One,
498 });
499 }
500 }
501 Rule::query_weight_threshold_kwarg => {
502 if let Some(t) = p.into_inner().next() {
503 config.query_weight_threshold =
504 Some(t.as_str().parse().unwrap_or_else(|_| {
505 log::warn!(
506 "Invalid query weight_threshold '{}', using 0.0",
507 t.as_str()
508 );
509 0.0
510 }));
511 }
512 }
513 Rule::query_max_dims_kwarg => {
514 if let Some(t) = p.into_inner().next() {
515 config.query_max_dims =
516 Some(t.as_str().parse().unwrap_or_else(|_| {
517 log::warn!(
518 "Invalid query max_dims '{}', using 0",
519 t.as_str()
520 );
521 0
522 }));
523 }
524 }
525 Rule::query_pruning_kwarg => {
526 if let Some(t) = p.into_inner().next() {
527 config.query_pruning =
528 Some(t.as_str().parse().unwrap_or_else(|_| {
529 log::warn!(
530 "Invalid query pruning '{}', using 1.0",
531 t.as_str()
532 );
533 1.0
534 }));
535 }
536 }
537 Rule::query_min_query_dims_kwarg => {
538 if let Some(t) = p.into_inner().next() {
539 config.query_min_query_dims =
540 Some(t.as_str().parse().unwrap_or_else(|_| {
541 log::warn!(
542 "Invalid query min_query_dims '{}', using 4",
543 t.as_str()
544 );
545 4
546 }));
547 }
548 }
549 _ => {}
550 }
551 }
552 }
553 }
554 }
555 }
556}
557
558fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
560 let mut inner = pair.into_inner();
561
562 let name = inner
563 .next()
564 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
565 .as_str()
566 .to_string();
567
568 let field_type_str = inner
569 .next()
570 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
571 .as_str();
572
573 let field_type = parse_field_type(field_type_str)?;
574
575 let mut tokenizer = None;
577 let mut sparse_vector_config = None;
578 let mut dense_vector_config = None;
579 let mut indexed = true;
580 let mut stored = true;
581 let mut multi = false;
582 let mut fast = false;
583 let mut primary = false;
584 let mut index_config: Option<IndexConfig> = None;
585
586 for item in inner {
587 match item.as_rule() {
588 Rule::tokenizer_spec => {
589 if let Some(tok_name) = item.into_inner().next() {
591 tokenizer = Some(tok_name.as_str().to_string());
592 }
593 }
594 Rule::sparse_vector_config => {
595 sparse_vector_config = Some(parse_sparse_vector_config(item));
597 }
598 Rule::dense_vector_config => {
599 dense_vector_config = Some(parse_dense_vector_config(item));
601 }
602 Rule::attributes => {
603 let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
604 indexed = idx;
605 stored = sto;
606 multi = mul;
607 fast = fst;
608 primary = pri;
609 index_config = idx_cfg;
610 }
611 _ => {}
612 }
613 }
614
615 if primary {
617 fast = true;
618 indexed = true;
619 }
620
621 let mut positions = None;
623 if let Some(idx_cfg) = index_config {
624 positions = idx_cfg.positions;
625 if let Some(ref mut dv_config) = dense_vector_config {
626 apply_index_config_to_dense_vector(dv_config, idx_cfg);
627 } else if field_type == FieldType::SparseVector {
628 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
630 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
631 }
632 }
633
634 Ok(FieldDef {
635 name,
636 field_type,
637 indexed,
638 stored,
639 tokenizer,
640 multi,
641 positions,
642 sparse_vector_config,
643 dense_vector_config,
644 fast,
645 primary,
646 })
647}
648
649fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
651 if let Some(index_type) = idx_cfg.index_type {
653 config.index_type = index_type;
654 }
655
656 if idx_cfg.num_clusters.is_some() {
658 config.num_clusters = idx_cfg.num_clusters;
659 }
660
661 if let Some(nprobe) = idx_cfg.nprobe {
663 config.nprobe = nprobe;
664 }
665
666 if idx_cfg.build_threshold.is_some() {
668 config.build_threshold = idx_cfg.build_threshold;
669 }
670}
671
672fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
675 let mut index_size = IndexSize::default();
676
677 for inner in pair.into_inner() {
679 if inner.as_rule() == Rule::index_size_spec {
680 index_size = match inner.as_str() {
681 "u16" => IndexSize::U16,
682 "u32" => IndexSize::U32,
683 _ => IndexSize::default(),
684 };
685 }
686 }
687
688 SparseVectorConfig {
689 format: SparseFormat::default(),
690 index_size,
691 weight_quantization: WeightQuantization::default(),
692 weight_threshold: 0.0,
693 block_size: 128,
694 bmp_block_size: 64,
695 max_bmp_grid_bytes: 0,
696 bmp_superblock_size: 64,
697 pruning: None,
698 query_config: None,
699 dims: None,
700 max_weight: None,
701 min_terms: 4,
702 }
703}
704
705fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
707 if let Some(f) = idx_cfg.sparse_format {
708 config.format = f;
709 }
710 if let Some(q) = idx_cfg.quantization {
711 config.weight_quantization = q;
712 }
713 if let Some(t) = idx_cfg.weight_threshold {
714 config.weight_threshold = t;
715 }
716 if let Some(bs) = idx_cfg.block_size {
717 let adjusted = bs.next_power_of_two();
718 if adjusted != bs {
719 log::warn!(
720 "block_size {} adjusted to next power of two: {}",
721 bs,
722 adjusted
723 );
724 }
725 config.block_size = adjusted;
726 }
727 if let Some(p) = idx_cfg.pruning {
728 let clamped = p.clamp(0.0, 1.0);
729 if (clamped - p).abs() > f32::EPSILON {
730 log::warn!(
731 "pruning {} clamped to valid range [0.0, 1.0]: {}",
732 p,
733 clamped
734 );
735 }
736 config.pruning = Some(clamped);
737 }
738 if let Some(mt) = idx_cfg.min_terms {
739 config.min_terms = mt;
740 }
741 if let Some(d) = idx_cfg.dims {
742 config.dims = Some(d);
743 }
744 if let Some(mw) = idx_cfg.max_weight {
745 config.max_weight = Some(mw);
746 }
747 if idx_cfg.query_tokenizer.is_some()
749 || idx_cfg.query_weighting.is_some()
750 || idx_cfg.query_weight_threshold.is_some()
751 || idx_cfg.query_max_dims.is_some()
752 || idx_cfg.query_pruning.is_some()
753 || idx_cfg.query_min_query_dims.is_some()
754 {
755 let query_config = config
756 .query_config
757 .get_or_insert(SparseQueryConfig::default());
758 if let Some(tokenizer) = idx_cfg.query_tokenizer {
759 query_config.tokenizer = Some(tokenizer);
760 }
761 if let Some(weighting) = idx_cfg.query_weighting {
762 query_config.weighting = weighting;
763 }
764 if let Some(t) = idx_cfg.query_weight_threshold {
765 query_config.weight_threshold = t;
766 }
767 if let Some(d) = idx_cfg.query_max_dims {
768 query_config.max_query_dims = Some(d);
769 }
770 if let Some(p) = idx_cfg.query_pruning {
771 query_config.pruning = Some(p);
772 }
773 if let Some(m) = idx_cfg.query_min_query_dims {
774 query_config.min_query_dims = m;
775 }
776 }
777}
778
779fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
782 let mut dim: usize = 0;
783 let mut quantization = DenseVectorQuantization::F32;
784
785 for params in pair.into_inner() {
787 if params.as_rule() == Rule::dense_vector_params {
788 for inner in params.into_inner() {
789 match inner.as_rule() {
790 Rule::dense_vector_keyword_params => {
791 for kwarg in inner.into_inner() {
792 match kwarg.as_rule() {
793 Rule::dims_kwarg => {
794 if let Some(d) = kwarg.into_inner().next() {
795 dim = d.as_str().parse().unwrap_or(0);
796 }
797 }
798 Rule::quant_type_spec => {
799 quantization = parse_quant_type(kwarg.as_str());
800 }
801 _ => {}
802 }
803 }
804 }
805 Rule::dense_vector_positional_params => {
806 for item in inner.into_inner() {
807 match item.as_rule() {
808 Rule::dimension_spec => {
809 dim = item.as_str().parse().unwrap_or(0);
810 }
811 Rule::quant_type_spec => {
812 quantization = parse_quant_type(item.as_str());
813 }
814 _ => {}
815 }
816 }
817 }
818 _ => {}
819 }
820 }
821 }
822 }
823
824 DenseVectorConfig::new(dim).with_quantization(quantization)
825}
826
827fn parse_quant_type(s: &str) -> DenseVectorQuantization {
828 match s.trim() {
829 "f16" => DenseVectorQuantization::F16,
830 "uint8" | "u8" => DenseVectorQuantization::UInt8,
831 _ => DenseVectorQuantization::F32,
832 }
833}
834
835fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
837 pair.into_inner().map(|p| p.as_str().to_string()).collect()
838}
839
840fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
842 let mut pattern = String::new();
843 let mut substitution = String::new();
844 let mut target_field = String::new();
845 let mut mode = RoutingMode::Additional;
846
847 for prop in pair.into_inner() {
848 if prop.as_rule() != Rule::query_router_prop {
849 continue;
850 }
851
852 for inner in prop.into_inner() {
853 match inner.as_rule() {
854 Rule::query_router_pattern => {
855 if let Some(regex_str) = inner.into_inner().next() {
856 pattern = parse_string_value(regex_str);
857 }
858 }
859 Rule::query_router_substitution => {
860 if let Some(quoted) = inner.into_inner().next() {
861 substitution = parse_string_value(quoted);
862 }
863 }
864 Rule::query_router_target => {
865 if let Some(ident) = inner.into_inner().next() {
866 target_field = ident.as_str().to_string();
867 }
868 }
869 Rule::query_router_mode => {
870 if let Some(mode_val) = inner.into_inner().next() {
871 mode = match mode_val.as_str() {
872 "exclusive" => RoutingMode::Exclusive,
873 "additional" => RoutingMode::Additional,
874 _ => RoutingMode::Additional,
875 };
876 }
877 }
878 _ => {}
879 }
880 }
881 }
882
883 if pattern.is_empty() {
884 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
885 }
886 if substitution.is_empty() {
887 return Err(Error::Schema(
888 "query_router missing 'substitution'".to_string(),
889 ));
890 }
891 if target_field.is_empty() {
892 return Err(Error::Schema(
893 "query_router missing 'target_field'".to_string(),
894 ));
895 }
896
897 Ok(QueryRouterRule {
898 pattern,
899 substitution,
900 target_field,
901 mode,
902 })
903}
904
905fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
907 let s = pair.as_str();
908 match pair.as_rule() {
909 Rule::regex_string => {
910 if let Some(inner) = pair.into_inner().next() {
912 parse_string_value(inner)
913 } else {
914 s.to_string()
915 }
916 }
917 Rule::raw_string => {
918 s[2..s.len() - 1].to_string()
920 }
921 Rule::quoted_string => {
922 let inner = &s[1..s.len() - 1];
924 inner
926 .replace("\\n", "\n")
927 .replace("\\t", "\t")
928 .replace("\\\"", "\"")
929 .replace("\\\\", "\\")
930 }
931 _ => s.to_string(),
932 }
933}
934
935fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
937 let mut inner = pair.into_inner();
938
939 let name = inner
940 .next()
941 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
942 .as_str()
943 .to_string();
944
945 let mut fields = Vec::new();
946 let mut default_fields = Vec::new();
947 let mut query_routers = Vec::new();
948
949 for item in inner {
950 match item.as_rule() {
951 Rule::field_def => {
952 fields.push(parse_field_def(item)?);
953 }
954 Rule::default_fields_def => {
955 default_fields = parse_default_fields_def(item);
956 }
957 Rule::query_router_def => {
958 query_routers.push(parse_query_router_def(item)?);
959 }
960 _ => {}
961 }
962 }
963
964 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
966 if primary_fields.len() > 1 {
967 return Err(Error::Schema(format!(
968 "Index '{}' has {} primary key fields, but at most one is allowed",
969 name,
970 primary_fields.len()
971 )));
972 }
973 if let Some(pk) = primary_fields.first() {
974 if pk.field_type != FieldType::Text {
975 return Err(Error::Schema(format!(
976 "Primary key field '{}' must be of type text, got {:?}",
977 pk.name, pk.field_type
978 )));
979 }
980 if pk.multi {
981 return Err(Error::Schema(format!(
982 "Primary key field '{}' cannot be multi-valued",
983 pk.name
984 )));
985 }
986 }
987
988 Ok(IndexDef {
989 name,
990 fields,
991 default_fields,
992 query_routers,
993 })
994}
995
996pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
998 let pairs = SdlParser::parse(Rule::file, input)
999 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1000
1001 let mut indexes = Vec::new();
1002
1003 for pair in pairs {
1004 if pair.as_rule() == Rule::file {
1005 for inner in pair.into_inner() {
1006 if inner.as_rule() == Rule::index_def {
1007 indexes.push(parse_index_def(inner)?);
1008 }
1009 }
1010 }
1011 }
1012
1013 Ok(indexes)
1014}
1015
1016pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1018 let indexes = parse_sdl(input)?;
1019
1020 if indexes.is_empty() {
1021 return Err(Error::Schema("No index definition found".to_string()));
1022 }
1023
1024 if indexes.len() > 1 {
1025 return Err(Error::Schema(
1026 "Multiple index definitions found, expected one".to_string(),
1027 ));
1028 }
1029
1030 Ok(indexes.into_iter().next().unwrap())
1031}
1032
1033#[cfg(test)]
1034mod tests {
1035 use super::*;
1036
1037 #[test]
1038 fn test_parse_simple_schema() {
1039 let sdl = r#"
1040 index articles {
1041 field title: text [indexed, stored]
1042 field body: text [indexed]
1043 }
1044 "#;
1045
1046 let indexes = parse_sdl(sdl).unwrap();
1047 assert_eq!(indexes.len(), 1);
1048
1049 let index = &indexes[0];
1050 assert_eq!(index.name, "articles");
1051 assert_eq!(index.fields.len(), 2);
1052
1053 assert_eq!(index.fields[0].name, "title");
1054 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1055 assert!(index.fields[0].indexed);
1056 assert!(index.fields[0].stored);
1057
1058 assert_eq!(index.fields[1].name, "body");
1059 assert!(matches!(index.fields[1].field_type, FieldType::Text));
1060 assert!(index.fields[1].indexed);
1061 assert!(!index.fields[1].stored);
1062 }
1063
1064 #[test]
1065 fn test_parse_all_field_types() {
1066 let sdl = r#"
1067 index test {
1068 field text_field: text [indexed, stored]
1069 field u64_field: u64 [indexed, stored]
1070 field i64_field: i64 [indexed, stored]
1071 field f64_field: f64 [indexed, stored]
1072 field bytes_field: bytes [stored]
1073 }
1074 "#;
1075
1076 let indexes = parse_sdl(sdl).unwrap();
1077 let index = &indexes[0];
1078
1079 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1080 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1081 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1082 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1083 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1084 }
1085
1086 #[test]
1087 fn test_parse_with_comments() {
1088 let sdl = r#"
1089 # This is a comment
1090 index articles {
1091 # Title field
1092 field title: text [indexed, stored]
1093 field body: text [indexed] # inline comment not supported yet
1094 }
1095 "#;
1096
1097 let indexes = parse_sdl(sdl).unwrap();
1098 assert_eq!(indexes[0].fields.len(), 2);
1099 }
1100
1101 #[test]
1102 fn test_parse_type_aliases() {
1103 let sdl = r#"
1104 index test {
1105 field a: string [indexed]
1106 field b: int [indexed]
1107 field c: uint [indexed]
1108 field d: float [indexed]
1109 field e: binary [stored]
1110 }
1111 "#;
1112
1113 let indexes = parse_sdl(sdl).unwrap();
1114 let index = &indexes[0];
1115
1116 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1117 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1118 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1119 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1120 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1121 }
1122
1123 #[test]
1124 fn test_to_schema() {
1125 let sdl = r#"
1126 index articles {
1127 field title: text [indexed, stored]
1128 field views: u64 [indexed, stored]
1129 }
1130 "#;
1131
1132 let indexes = parse_sdl(sdl).unwrap();
1133 let schema = indexes[0].to_schema();
1134
1135 assert!(schema.get_field("title").is_some());
1136 assert!(schema.get_field("views").is_some());
1137 assert!(schema.get_field("nonexistent").is_none());
1138 }
1139
1140 #[test]
1141 fn test_default_attributes() {
1142 let sdl = r#"
1143 index test {
1144 field title: text
1145 }
1146 "#;
1147
1148 let indexes = parse_sdl(sdl).unwrap();
1149 let field = &indexes[0].fields[0];
1150
1151 assert!(field.indexed);
1153 assert!(field.stored);
1154 }
1155
1156 #[test]
1157 fn test_multiple_indexes() {
1158 let sdl = r#"
1159 index articles {
1160 field title: text [indexed, stored]
1161 }
1162
1163 index users {
1164 field name: text [indexed, stored]
1165 field email: text [indexed, stored]
1166 }
1167 "#;
1168
1169 let indexes = parse_sdl(sdl).unwrap();
1170 assert_eq!(indexes.len(), 2);
1171 assert_eq!(indexes[0].name, "articles");
1172 assert_eq!(indexes[1].name, "users");
1173 }
1174
1175 #[test]
1176 fn test_tokenizer_spec() {
1177 let sdl = r#"
1178 index articles {
1179 field title: text<en_stem> [indexed, stored]
1180 field body: text<simple> [indexed]
1181 field author: text [indexed, stored]
1182 }
1183 "#;
1184
1185 let indexes = parse_sdl(sdl).unwrap();
1186 let index = &indexes[0];
1187
1188 assert_eq!(index.fields[0].name, "title");
1189 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1190
1191 assert_eq!(index.fields[1].name, "body");
1192 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1193
1194 assert_eq!(index.fields[2].name, "author");
1195 assert_eq!(index.fields[2].tokenizer, None); }
1197
1198 #[test]
1199 fn test_tokenizer_in_schema() {
1200 let sdl = r#"
1201 index articles {
1202 field title: text<german> [indexed, stored]
1203 field body: text<en_stem> [indexed]
1204 }
1205 "#;
1206
1207 let indexes = parse_sdl(sdl).unwrap();
1208 let schema = indexes[0].to_schema();
1209
1210 let title_field = schema.get_field("title").unwrap();
1211 let title_entry = schema.get_field_entry(title_field).unwrap();
1212 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1213
1214 let body_field = schema.get_field("body").unwrap();
1215 let body_entry = schema.get_field_entry(body_field).unwrap();
1216 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1217 }
1218
1219 #[test]
1220 fn test_query_router_basic() {
1221 let sdl = r#"
1222 index documents {
1223 field title: text [indexed, stored]
1224 field uri: text [indexed, stored]
1225
1226 query_router {
1227 pattern: "10\\.\\d{4,}/[^\\s]+"
1228 substitution: "doi://{0}"
1229 target_field: uris
1230 mode: exclusive
1231 }
1232 }
1233 "#;
1234
1235 let indexes = parse_sdl(sdl).unwrap();
1236 let index = &indexes[0];
1237
1238 assert_eq!(index.query_routers.len(), 1);
1239 let router = &index.query_routers[0];
1240 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1241 assert_eq!(router.substitution, "doi://{0}");
1242 assert_eq!(router.target_field, "uris");
1243 assert_eq!(router.mode, RoutingMode::Exclusive);
1244 }
1245
1246 #[test]
1247 fn test_query_router_raw_string() {
1248 let sdl = r#"
1249 index documents {
1250 field uris: text [indexed, stored]
1251
1252 query_router {
1253 pattern: r"^pmid:(\d+)$"
1254 substitution: "pubmed://{1}"
1255 target_field: uris
1256 mode: additional
1257 }
1258 }
1259 "#;
1260
1261 let indexes = parse_sdl(sdl).unwrap();
1262 let router = &indexes[0].query_routers[0];
1263
1264 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1265 assert_eq!(router.substitution, "pubmed://{1}");
1266 assert_eq!(router.mode, RoutingMode::Additional);
1267 }
1268
1269 #[test]
1270 fn test_multiple_query_routers() {
1271 let sdl = r#"
1272 index documents {
1273 field uris: text [indexed, stored]
1274
1275 query_router {
1276 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1277 substitution: "doi://{1}"
1278 target_field: uris
1279 mode: exclusive
1280 }
1281
1282 query_router {
1283 pattern: r"^pmid:(\d+)$"
1284 substitution: "pubmed://{1}"
1285 target_field: uris
1286 mode: exclusive
1287 }
1288
1289 query_router {
1290 pattern: r"^arxiv:(\d+\.\d+)$"
1291 substitution: "arxiv://{1}"
1292 target_field: uris
1293 mode: additional
1294 }
1295 }
1296 "#;
1297
1298 let indexes = parse_sdl(sdl).unwrap();
1299 assert_eq!(indexes[0].query_routers.len(), 3);
1300 }
1301
1302 #[test]
1303 fn test_query_router_default_mode() {
1304 let sdl = r#"
1305 index documents {
1306 field uris: text [indexed, stored]
1307
1308 query_router {
1309 pattern: r"test"
1310 substitution: "{0}"
1311 target_field: uris
1312 }
1313 }
1314 "#;
1315
1316 let indexes = parse_sdl(sdl).unwrap();
1317 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1319 }
1320
1321 #[test]
1322 fn test_multi_attribute() {
1323 let sdl = r#"
1324 index documents {
1325 field uris: text [indexed, stored<multi>]
1326 field title: text [indexed, stored]
1327 }
1328 "#;
1329
1330 let indexes = parse_sdl(sdl).unwrap();
1331 assert_eq!(indexes.len(), 1);
1332
1333 let fields = &indexes[0].fields;
1334 assert_eq!(fields.len(), 2);
1335
1336 assert_eq!(fields[0].name, "uris");
1338 assert!(fields[0].multi, "uris field should have multi=true");
1339
1340 assert_eq!(fields[1].name, "title");
1342 assert!(!fields[1].multi, "title field should have multi=false");
1343
1344 let schema = indexes[0].to_schema();
1346 let uris_field = schema.get_field("uris").unwrap();
1347 let title_field = schema.get_field("title").unwrap();
1348
1349 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1350 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1351 }
1352
1353 #[test]
1354 fn test_sparse_vector_field() {
1355 let sdl = r#"
1356 index documents {
1357 field embedding: sparse_vector [indexed, stored]
1358 }
1359 "#;
1360
1361 let indexes = parse_sdl(sdl).unwrap();
1362 assert_eq!(indexes.len(), 1);
1363 assert_eq!(indexes[0].fields.len(), 1);
1364 assert_eq!(indexes[0].fields[0].name, "embedding");
1365 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1366 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1367 }
1368
1369 #[test]
1370 fn test_sparse_vector_with_config() {
1371 let sdl = r#"
1372 index documents {
1373 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1374 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1375 }
1376 "#;
1377
1378 let indexes = parse_sdl(sdl).unwrap();
1379 assert_eq!(indexes[0].fields.len(), 2);
1380
1381 let f1 = &indexes[0].fields[0];
1383 assert_eq!(f1.name, "embedding");
1384 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1385 assert_eq!(config1.index_size, IndexSize::U16);
1386 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1387
1388 let f2 = &indexes[0].fields[1];
1390 assert_eq!(f2.name, "dense");
1391 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1392 assert_eq!(config2.index_size, IndexSize::U32);
1393 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1394 }
1395
1396 #[test]
1397 fn test_sparse_vector_with_weight_threshold() {
1398 let sdl = r#"
1399 index documents {
1400 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1401 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1402 }
1403 "#;
1404
1405 let indexes = parse_sdl(sdl).unwrap();
1406 assert_eq!(indexes[0].fields.len(), 2);
1407
1408 let f1 = &indexes[0].fields[0];
1410 assert_eq!(f1.name, "embedding");
1411 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1412 assert_eq!(config1.index_size, IndexSize::U16);
1413 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1414 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1415
1416 let f2 = &indexes[0].fields[1];
1418 assert_eq!(f2.name, "embedding2");
1419 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1420 assert_eq!(config2.index_size, IndexSize::U32);
1421 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1422 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1423 }
1424
1425 #[test]
1426 fn test_sparse_vector_with_pruning() {
1427 let sdl = r#"
1428 index documents {
1429 field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1430 }
1431 "#;
1432
1433 let indexes = parse_sdl(sdl).unwrap();
1434 let f = &indexes[0].fields[0];
1435 assert_eq!(f.name, "embedding");
1436 let config = f.sparse_vector_config.as_ref().unwrap();
1437 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1438 assert_eq!(config.pruning, Some(0.1));
1439 }
1440
1441 #[test]
1442 fn test_dense_vector_field() {
1443 let sdl = r#"
1444 index documents {
1445 field embedding: dense_vector<768> [indexed, stored]
1446 }
1447 "#;
1448
1449 let indexes = parse_sdl(sdl).unwrap();
1450 assert_eq!(indexes.len(), 1);
1451 assert_eq!(indexes[0].fields.len(), 1);
1452
1453 let f = &indexes[0].fields[0];
1454 assert_eq!(f.name, "embedding");
1455 assert_eq!(f.field_type, FieldType::DenseVector);
1456
1457 let config = f.dense_vector_config.as_ref().unwrap();
1458 assert_eq!(config.dim, 768);
1459 }
1460
1461 #[test]
1462 fn test_dense_vector_alias() {
1463 let sdl = r#"
1464 index documents {
1465 field embedding: vector<1536> [indexed]
1466 }
1467 "#;
1468
1469 let indexes = parse_sdl(sdl).unwrap();
1470 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1471 assert_eq!(
1472 indexes[0].fields[0]
1473 .dense_vector_config
1474 .as_ref()
1475 .unwrap()
1476 .dim,
1477 1536
1478 );
1479 }
1480
1481 #[test]
1482 fn test_dense_vector_with_num_clusters() {
1483 let sdl = r#"
1484 index documents {
1485 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1486 }
1487 "#;
1488
1489 let indexes = parse_sdl(sdl).unwrap();
1490 assert_eq!(indexes.len(), 1);
1491
1492 let f = &indexes[0].fields[0];
1493 assert_eq!(f.name, "embedding");
1494 assert_eq!(f.field_type, FieldType::DenseVector);
1495
1496 let config = f.dense_vector_config.as_ref().unwrap();
1497 assert_eq!(config.dim, 768);
1498 assert_eq!(config.num_clusters, Some(256));
1499 assert_eq!(config.nprobe, 32); }
1501
1502 #[test]
1503 fn test_dense_vector_with_num_clusters_and_nprobe() {
1504 let sdl = r#"
1505 index documents {
1506 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1507 }
1508 "#;
1509
1510 let indexes = parse_sdl(sdl).unwrap();
1511 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1512
1513 assert_eq!(config.dim, 1536);
1514 assert_eq!(config.num_clusters, Some(512));
1515 assert_eq!(config.nprobe, 64);
1516 }
1517
1518 #[test]
1519 fn test_dense_vector_keyword_syntax() {
1520 let sdl = r#"
1521 index documents {
1522 field embedding: dense_vector<dims: 1536> [indexed, stored]
1523 }
1524 "#;
1525
1526 let indexes = parse_sdl(sdl).unwrap();
1527 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1528
1529 assert_eq!(config.dim, 1536);
1530 assert!(config.num_clusters.is_none());
1531 }
1532
1533 #[test]
1534 fn test_dense_vector_keyword_syntax_full() {
1535 let sdl = r#"
1536 index documents {
1537 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1538 }
1539 "#;
1540
1541 let indexes = parse_sdl(sdl).unwrap();
1542 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1543
1544 assert_eq!(config.dim, 1536);
1545 assert_eq!(config.num_clusters, Some(256));
1546 assert_eq!(config.nprobe, 64);
1547 }
1548
1549 #[test]
1550 fn test_dense_vector_keyword_syntax_partial() {
1551 let sdl = r#"
1552 index documents {
1553 field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1554 }
1555 "#;
1556
1557 let indexes = parse_sdl(sdl).unwrap();
1558 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1559
1560 assert_eq!(config.dim, 768);
1561 assert_eq!(config.num_clusters, Some(128));
1562 assert_eq!(config.nprobe, 32); }
1564
1565 #[test]
1566 fn test_dense_vector_scann_index() {
1567 use crate::dsl::schema::VectorIndexType;
1568
1569 let sdl = r#"
1570 index documents {
1571 field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1572 }
1573 "#;
1574
1575 let indexes = parse_sdl(sdl).unwrap();
1576 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1577
1578 assert_eq!(config.dim, 768);
1579 assert_eq!(config.index_type, VectorIndexType::ScaNN);
1580 assert_eq!(config.num_clusters, Some(256));
1581 assert_eq!(config.nprobe, 64);
1582 }
1583
1584 #[test]
1585 fn test_dense_vector_ivf_rabitq_index() {
1586 use crate::dsl::schema::VectorIndexType;
1587
1588 let sdl = r#"
1589 index documents {
1590 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1591 }
1592 "#;
1593
1594 let indexes = parse_sdl(sdl).unwrap();
1595 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1596
1597 assert_eq!(config.dim, 1536);
1598 assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1599 assert_eq!(config.num_clusters, Some(512));
1600 }
1601
1602 #[test]
1603 fn test_dense_vector_rabitq_no_clusters() {
1604 use crate::dsl::schema::VectorIndexType;
1605
1606 let sdl = r#"
1607 index documents {
1608 field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1609 }
1610 "#;
1611
1612 let indexes = parse_sdl(sdl).unwrap();
1613 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1614
1615 assert_eq!(config.dim, 768);
1616 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1617 assert!(config.num_clusters.is_none());
1618 }
1619
1620 #[test]
1621 fn test_dense_vector_flat_index() {
1622 use crate::dsl::schema::VectorIndexType;
1623
1624 let sdl = r#"
1625 index documents {
1626 field embedding: dense_vector<dims: 768> [indexed<flat>]
1627 }
1628 "#;
1629
1630 let indexes = parse_sdl(sdl).unwrap();
1631 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1632
1633 assert_eq!(config.dim, 768);
1634 assert_eq!(config.index_type, VectorIndexType::Flat);
1635 }
1636
1637 #[test]
1638 fn test_dense_vector_default_index_type() {
1639 use crate::dsl::schema::VectorIndexType;
1640
1641 let sdl = r#"
1643 index documents {
1644 field embedding: dense_vector<dims: 768> [indexed]
1645 }
1646 "#;
1647
1648 let indexes = parse_sdl(sdl).unwrap();
1649 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1650
1651 assert_eq!(config.dim, 768);
1652 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1653 }
1654
1655 #[test]
1656 fn test_dense_vector_f16_quantization() {
1657 use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1658
1659 let sdl = r#"
1660 index documents {
1661 field embedding: dense_vector<768, f16> [indexed]
1662 }
1663 "#;
1664
1665 let indexes = parse_sdl(sdl).unwrap();
1666 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1667
1668 assert_eq!(config.dim, 768);
1669 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1670 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1671 }
1672
1673 #[test]
1674 fn test_dense_vector_uint8_quantization() {
1675 use crate::dsl::schema::DenseVectorQuantization;
1676
1677 let sdl = r#"
1678 index documents {
1679 field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1680 }
1681 "#;
1682
1683 let indexes = parse_sdl(sdl).unwrap();
1684 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1685
1686 assert_eq!(config.dim, 1024);
1687 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1688 }
1689
1690 #[test]
1691 fn test_dense_vector_u8_alias() {
1692 use crate::dsl::schema::DenseVectorQuantization;
1693
1694 let sdl = r#"
1695 index documents {
1696 field embedding: dense_vector<512, u8> [indexed]
1697 }
1698 "#;
1699
1700 let indexes = parse_sdl(sdl).unwrap();
1701 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1702
1703 assert_eq!(config.dim, 512);
1704 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1705 }
1706
1707 #[test]
1708 fn test_dense_vector_default_f32_quantization() {
1709 use crate::dsl::schema::DenseVectorQuantization;
1710
1711 let sdl = r#"
1713 index documents {
1714 field embedding: dense_vector<768> [indexed]
1715 }
1716 "#;
1717
1718 let indexes = parse_sdl(sdl).unwrap();
1719 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1720
1721 assert_eq!(config.dim, 768);
1722 assert_eq!(config.quantization, DenseVectorQuantization::F32);
1723 }
1724
1725 #[test]
1726 fn test_dense_vector_keyword_with_quantization() {
1727 use crate::dsl::schema::DenseVectorQuantization;
1728
1729 let sdl = r#"
1730 index documents {
1731 field embedding: dense_vector<dims: 768, f16> [indexed]
1732 }
1733 "#;
1734
1735 let indexes = parse_sdl(sdl).unwrap();
1736 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1737
1738 assert_eq!(config.dim, 768);
1739 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1740 }
1741
1742 #[test]
1743 fn test_json_field_type() {
1744 let sdl = r#"
1745 index documents {
1746 field title: text [indexed, stored]
1747 field metadata: json [stored]
1748 field extra: json
1749 }
1750 "#;
1751
1752 let indexes = parse_sdl(sdl).unwrap();
1753 let index = &indexes[0];
1754
1755 assert_eq!(index.fields.len(), 3);
1756
1757 assert_eq!(index.fields[1].name, "metadata");
1759 assert!(matches!(index.fields[1].field_type, FieldType::Json));
1760 assert!(index.fields[1].stored);
1761 assert_eq!(index.fields[2].name, "extra");
1765 assert!(matches!(index.fields[2].field_type, FieldType::Json));
1766
1767 let schema = index.to_schema();
1769 let metadata_field = schema.get_field("metadata").unwrap();
1770 let entry = schema.get_field_entry(metadata_field).unwrap();
1771 assert_eq!(entry.field_type, FieldType::Json);
1772 assert!(!entry.indexed); assert!(entry.stored);
1774 }
1775
1776 #[test]
1777 fn test_sparse_vector_query_config() {
1778 use crate::structures::QueryWeighting;
1779
1780 let sdl = r#"
1781 index documents {
1782 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1783 }
1784 "#;
1785
1786 let indexes = parse_sdl(sdl).unwrap();
1787 let index = &indexes[0];
1788
1789 assert_eq!(index.fields.len(), 1);
1790 assert_eq!(index.fields[0].name, "embedding");
1791 assert!(matches!(
1792 index.fields[0].field_type,
1793 FieldType::SparseVector
1794 ));
1795
1796 let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1797 assert_eq!(config.index_size, IndexSize::U16);
1798 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1799
1800 let query_config = config.query_config.as_ref().unwrap();
1802 assert_eq!(
1803 query_config.tokenizer.as_deref(),
1804 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1805 );
1806 assert_eq!(query_config.weighting, QueryWeighting::Idf);
1807
1808 let schema = index.to_schema();
1810 let embedding_field = schema.get_field("embedding").unwrap();
1811 let entry = schema.get_field_entry(embedding_field).unwrap();
1812 let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1813 let qc = sv_config.query_config.as_ref().unwrap();
1814 assert_eq!(
1815 qc.tokenizer.as_deref(),
1816 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1817 );
1818 assert_eq!(qc.weighting, QueryWeighting::Idf);
1819 }
1820
1821 #[test]
1822 fn test_sparse_vector_query_config_weighting_one() {
1823 use crate::structures::QueryWeighting;
1824
1825 let sdl = r#"
1826 index documents {
1827 field embedding: sparse_vector [indexed<query<weighting: one>>]
1828 }
1829 "#;
1830
1831 let indexes = parse_sdl(sdl).unwrap();
1832 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1833
1834 let query_config = config.query_config.as_ref().unwrap();
1835 assert!(query_config.tokenizer.is_none());
1836 assert_eq!(query_config.weighting, QueryWeighting::One);
1837 }
1838
1839 #[test]
1840 fn test_sparse_vector_query_config_weighting_idf_file() {
1841 use crate::structures::QueryWeighting;
1842
1843 let sdl = r#"
1844 index documents {
1845 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1846 }
1847 "#;
1848
1849 let indexes = parse_sdl(sdl).unwrap();
1850 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1851
1852 let query_config = config.query_config.as_ref().unwrap();
1853 assert_eq!(
1854 query_config.tokenizer.as_deref(),
1855 Some("opensearch-neural-sparse-encoding-v1")
1856 );
1857 assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1858
1859 let schema = indexes[0].to_schema();
1861 let field = schema.get_field("embedding").unwrap();
1862 let entry = schema.get_field_entry(field).unwrap();
1863 let sc = entry.sparse_vector_config.as_ref().unwrap();
1864 let qc = sc.query_config.as_ref().unwrap();
1865 assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1866 }
1867
1868 #[test]
1869 fn test_sparse_vector_query_config_pruning_params() {
1870 let sdl = r#"
1871 index documents {
1872 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1873 }
1874 "#;
1875
1876 let indexes = parse_sdl(sdl).unwrap();
1877 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1878
1879 let qc = config.query_config.as_ref().unwrap();
1880 assert_eq!(qc.weighting, QueryWeighting::Idf);
1881 assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1882 assert_eq!(qc.max_query_dims, Some(25));
1883 assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1884
1885 let schema = indexes[0].to_schema();
1887 let field = schema.get_field("embedding").unwrap();
1888 let entry = schema.get_field_entry(field).unwrap();
1889 let sc = entry.sparse_vector_config.as_ref().unwrap();
1890 let rqc = sc.query_config.as_ref().unwrap();
1891 assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1892 assert_eq!(rqc.max_query_dims, Some(25));
1893 assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1894 }
1895
1896 #[test]
1897 fn test_sparse_vector_format_maxscore() {
1898 let sdl = r#"
1899 index documents {
1900 field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1901 }
1902 "#;
1903
1904 let indexes = parse_sdl(sdl).unwrap();
1905 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1906 assert_eq!(config.format, SparseFormat::MaxScore);
1907 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1908
1909 let schema = indexes[0].to_schema();
1911 let field = schema.get_field("embedding").unwrap();
1912 let entry = schema.get_field_entry(field).unwrap();
1913 let sc = entry.sparse_vector_config.as_ref().unwrap();
1914 assert_eq!(sc.format, SparseFormat::MaxScore);
1915 }
1916
1917 #[test]
1918 fn test_sparse_vector_format_bmp() {
1919 let sdl = r#"
1920 index documents {
1921 field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1922 }
1923 "#;
1924
1925 let indexes = parse_sdl(sdl).unwrap();
1926 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1927 assert_eq!(config.format, SparseFormat::Bmp);
1928 }
1929
1930 #[test]
1931 fn test_fast_attribute() {
1932 let sdl = r#"
1933 index products {
1934 field name: text [indexed, stored]
1935 field price: f64 [indexed, fast]
1936 field category: text [indexed, stored, fast]
1937 field count: u64 [fast]
1938 field score: i64 [indexed, stored, fast]
1939 }
1940 "#;
1941
1942 let indexes = parse_sdl(sdl).unwrap();
1943 assert_eq!(indexes.len(), 1);
1944 let index = &indexes[0];
1945 assert_eq!(index.fields.len(), 5);
1946
1947 assert!(!index.fields[0].fast);
1949 assert!(index.fields[1].fast);
1951 assert!(matches!(index.fields[1].field_type, FieldType::F64));
1952 assert!(index.fields[2].fast);
1954 assert!(matches!(index.fields[2].field_type, FieldType::Text));
1955 assert!(index.fields[3].fast);
1957 assert!(matches!(index.fields[3].field_type, FieldType::U64));
1958 assert!(index.fields[4].fast);
1960 assert!(matches!(index.fields[4].field_type, FieldType::I64));
1961
1962 let schema = index.to_schema();
1964 let price_field = schema.get_field("price").unwrap();
1965 assert!(schema.get_field_entry(price_field).unwrap().fast);
1966
1967 let category_field = schema.get_field("category").unwrap();
1968 assert!(schema.get_field_entry(category_field).unwrap().fast);
1969
1970 let name_field = schema.get_field("name").unwrap();
1971 assert!(!schema.get_field_entry(name_field).unwrap().fast);
1972 }
1973
1974 #[test]
1975 fn test_primary_attribute() {
1976 let sdl = r#"
1977 index documents {
1978 field id: text [primary, stored]
1979 field title: text [indexed, stored]
1980 }
1981 "#;
1982
1983 let indexes = parse_sdl(sdl).unwrap();
1984 assert_eq!(indexes.len(), 1);
1985 let index = &indexes[0];
1986 assert_eq!(index.fields.len(), 2);
1987
1988 let id_field = &index.fields[0];
1990 assert!(id_field.primary, "id should be primary");
1991 assert!(id_field.fast, "primary implies fast");
1992 assert!(id_field.indexed, "primary implies indexed");
1993
1994 assert!(!index.fields[1].primary);
1996
1997 let schema = index.to_schema();
1999 let id = schema.get_field("id").unwrap();
2000 let id_entry = schema.get_field_entry(id).unwrap();
2001 assert!(id_entry.primary_key);
2002 assert!(id_entry.fast);
2003 assert!(id_entry.indexed);
2004
2005 let title = schema.get_field("title").unwrap();
2006 assert!(!schema.get_field_entry(title).unwrap().primary_key);
2007
2008 assert_eq!(schema.primary_field(), Some(id));
2010 }
2011
2012 #[test]
2013 fn test_primary_with_other_attributes() {
2014 let sdl = r#"
2015 index documents {
2016 field id: text<simple> [primary, indexed, stored]
2017 field body: text [indexed]
2018 }
2019 "#;
2020
2021 let indexes = parse_sdl(sdl).unwrap();
2022 let id_field = &indexes[0].fields[0];
2023 assert!(id_field.primary);
2024 assert!(id_field.indexed);
2025 assert!(id_field.stored);
2026 assert!(id_field.fast);
2027 assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2028 }
2029
2030 #[test]
2031 fn test_primary_only_one_allowed() {
2032 let sdl = r#"
2033 index documents {
2034 field id: text [primary]
2035 field alt_id: text [primary]
2036 }
2037 "#;
2038
2039 let result = parse_sdl(sdl);
2040 assert!(result.is_err());
2041 let err = result.unwrap_err().to_string();
2042 assert!(
2043 err.contains("primary key"),
2044 "Error should mention primary key: {}",
2045 err
2046 );
2047 }
2048
2049 #[test]
2050 fn test_primary_must_be_text() {
2051 let sdl = r#"
2052 index documents {
2053 field id: u64 [primary]
2054 }
2055 "#;
2056
2057 let result = parse_sdl(sdl);
2058 assert!(result.is_err());
2059 let err = result.unwrap_err().to_string();
2060 assert!(
2061 err.contains("text"),
2062 "Error should mention text type: {}",
2063 err
2064 );
2065 }
2066
2067 #[test]
2068 fn test_primary_cannot_be_multi() {
2069 let sdl = r#"
2070 index documents {
2071 field id: text [primary, stored<multi>]
2072 }
2073 "#;
2074
2075 let result = parse_sdl(sdl);
2076 assert!(result.is_err());
2077 let err = result.unwrap_err().to_string();
2078 assert!(err.contains("multi"), "Error should mention multi: {}", err);
2079 }
2080
2081 #[test]
2082 fn test_no_primary_field() {
2083 let sdl = r#"
2085 index documents {
2086 field title: text [indexed, stored]
2087 }
2088 "#;
2089
2090 let indexes = parse_sdl(sdl).unwrap();
2091 let schema = indexes[0].to_schema();
2092 assert!(schema.primary_field().is_none());
2093 }
2094}