1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Parser for the schema definition language (SDL).
///
/// The parser implementation and the `Rule` enum referenced throughout this
/// file are generated by `pest_derive` from the grammar file named in the
/// `grammar` attribute below.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
/// One field declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL.
    pub name: String,
    /// Field type resolved from the SDL type keyword (see `parse_field_type`).
    pub field_type: FieldType,
    /// Whether the field is indexed.
    pub indexed: bool,
    /// Whether the field is stored.
    pub stored: bool,
    /// Tokenizer name taken from the tokenizer spec, if one was given.
    /// `IndexDef::to_schema` falls back to `"simple"` for text fields when unset.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued (set by the `stored<...>` attribute form).
    pub multi: bool,
    /// Explicit position mode from the index configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings; only populated for sparse vector fields that
    /// carry a size spec or index configuration.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings; `IndexDef::to_schema` requires this for dense vector fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Value of the `fast` attribute; forced to `true` for primary key fields.
    pub fast: bool,
    /// Whether the field is the primary key.
    pub primary: bool,
}
84
/// One `index` definition parsed from SDL: its name, fields, default
/// fields and query routing rules.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL.
    pub name: String,
    /// Field declarations in declaration order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration (empty when absent).
    pub default_fields: Vec<String>,
    /// Query router rules in declaration order (empty when absent).
    pub query_routers: Vec<QueryRouterRule>,
}
94
95impl IndexDef {
96 pub fn to_schema(&self) -> Schema {
98 let mut builder = SchemaBuilder::default();
99
100 for field in &self.fields {
101 let f = match field.field_type {
102 FieldType::Text => {
103 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
104 builder.add_text_field_with_tokenizer(
105 &field.name,
106 field.indexed,
107 field.stored,
108 tokenizer,
109 )
110 }
111 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
112 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
113 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
114 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
115 FieldType::Json => builder.add_json_field(&field.name, field.stored),
116 FieldType::SparseVector => {
117 if let Some(config) = &field.sparse_vector_config {
118 builder.add_sparse_vector_field_with_config(
119 &field.name,
120 field.indexed,
121 field.stored,
122 config.clone(),
123 )
124 } else {
125 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
126 }
127 }
128 FieldType::DenseVector => {
129 let config = field
131 .dense_vector_config
132 .as_ref()
133 .expect("DenseVector field requires dimension to be specified");
134 builder.add_dense_vector_field_with_config(
135 &field.name,
136 field.indexed,
137 field.stored,
138 config.clone(),
139 )
140 }
141 };
142 if field.multi {
143 builder.set_multi(f, true);
144 }
145 if field.fast {
146 builder.set_fast(f, true);
147 }
148 if field.primary {
149 builder.set_primary_key(f);
150 }
151 let positions = field.positions.or({
153 if field.multi
155 && matches!(
156 field.field_type,
157 FieldType::SparseVector | FieldType::DenseVector
158 )
159 {
160 Some(super::schema::PositionMode::Ordinal)
161 } else {
162 None
163 }
164 });
165 if let Some(mode) = positions {
166 builder.set_positions(f, mode);
167 }
168 }
169
170 if !self.default_fields.is_empty() {
172 builder.set_default_fields(self.default_fields.clone());
173 }
174
175 if !self.query_routers.is_empty() {
177 builder.set_query_routers(self.query_routers.clone());
178 }
179
180 builder.build()
181 }
182
183 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
188 if self.query_routers.is_empty() {
189 return Ok(None);
190 }
191
192 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
193 .map(Some)
194 .map_err(Error::Schema)
195 }
196}
197
198fn parse_field_type(type_str: &str) -> Result<FieldType> {
200 match type_str {
201 "text" | "string" | "str" => Ok(FieldType::Text),
202 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
203 "i64" | "int" | "integer" => Ok(FieldType::I64),
204 "f64" | "float" | "double" => Ok(FieldType::F64),
205 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
206 "json" => Ok(FieldType::Json),
207 "sparse_vector" => Ok(FieldType::SparseVector),
208 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
209 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
210 }
211}
212
/// Raw settings collected from an `indexed<...>` attribute.
///
/// Every value is optional; `parse_field_def` later applies the ones that
/// are set to the dense or sparse vector configuration of the field.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // --- Applied to dense vector fields ---
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    build_threshold: Option<usize>,
    // --- Applied to sparse vector fields ---
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    // Rounded up to the next power of two when applied.
    block_size: Option<usize>,
    // Clamped to [0.0, 1.0] when applied.
    pruning: Option<f32>,
    // --- Query-time settings for sparse vector fields ---
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    query_weight_threshold: Option<f32>,
    query_max_dims: Option<usize>,
    query_pruning: Option<f32>,
    // Copied to `FieldDef::positions` regardless of field type.
    positions: Option<super::schema::PositionMode>,
}
234
235fn parse_attributes(
240 pair: pest::iterators::Pair<Rule>,
241) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
242 let mut indexed = false;
243 let mut stored = false;
244 let mut multi = false;
245 let mut fast = false;
246 let mut primary = false;
247 let mut index_config = None;
248
249 for attr in pair.into_inner() {
250 if attr.as_rule() == Rule::attribute {
251 let mut found_config = false;
253 for inner in attr.clone().into_inner() {
254 match inner.as_rule() {
255 Rule::indexed_with_config => {
256 indexed = true;
257 index_config = Some(parse_index_config(inner));
258 found_config = true;
259 break;
260 }
261 Rule::stored_with_config => {
262 stored = true;
263 multi = true; found_config = true;
265 break;
266 }
267 _ => {}
268 }
269 }
270 if !found_config {
271 match attr.as_str() {
273 "indexed" => indexed = true,
274 "stored" => stored = true,
275 "fast" => fast = true,
276 "primary" => primary = true,
277 _ => {}
278 }
279 }
280 }
281 }
282
283 (indexed, stored, multi, fast, primary, index_config)
284}
285
286fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
288 let mut config = IndexConfig::default();
289
290 for inner in pair.into_inner() {
295 if inner.as_rule() == Rule::index_config_params {
296 for param in inner.into_inner() {
297 if param.as_rule() == Rule::index_config_param {
298 for p in param.into_inner() {
299 parse_single_index_config_param(&mut config, p);
300 }
301 }
302 }
303 }
304 }
305
306 config
307}
308
309fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
311 use super::schema::VectorIndexType;
312
313 match p.as_rule() {
314 Rule::index_type_spec => {
315 config.index_type = Some(match p.as_str() {
316 "flat" => VectorIndexType::Flat,
317 "rabitq" => VectorIndexType::RaBitQ,
318 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
319 "scann" => VectorIndexType::ScaNN,
320 _ => VectorIndexType::RaBitQ,
321 });
322 }
323 Rule::index_type_kwarg => {
324 if let Some(t) = p.into_inner().next() {
326 config.index_type = Some(match t.as_str() {
327 "flat" => VectorIndexType::Flat,
328 "rabitq" => VectorIndexType::RaBitQ,
329 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
330 "scann" => VectorIndexType::ScaNN,
331 _ => VectorIndexType::RaBitQ,
332 });
333 }
334 }
335 Rule::num_clusters_kwarg => {
336 if let Some(n) = p.into_inner().next() {
338 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
339 log::warn!(
340 "Invalid num_clusters value '{}', using default 256",
341 n.as_str()
342 );
343 256
344 }));
345 }
346 }
347 Rule::build_threshold_kwarg => {
348 if let Some(n) = p.into_inner().next() {
350 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
351 log::warn!(
352 "Invalid build_threshold value '{}', using default 10000",
353 n.as_str()
354 );
355 10000
356 }));
357 }
358 }
359 Rule::nprobe_kwarg => {
360 if let Some(n) = p.into_inner().next() {
362 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
363 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
364 32
365 }));
366 }
367 }
368 Rule::quantization_kwarg => {
369 if let Some(q) = p.into_inner().next() {
371 config.quantization = Some(match q.as_str() {
372 "float32" | "f32" => WeightQuantization::Float32,
373 "float16" | "f16" => WeightQuantization::Float16,
374 "uint8" | "u8" => WeightQuantization::UInt8,
375 "uint4" | "u4" => WeightQuantization::UInt4,
376 _ => WeightQuantization::default(),
377 });
378 }
379 }
380 Rule::weight_threshold_kwarg => {
381 if let Some(t) = p.into_inner().next() {
383 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
384 log::warn!(
385 "Invalid weight_threshold value '{}', using default 0.0",
386 t.as_str()
387 );
388 0.0
389 }));
390 }
391 }
392 Rule::block_size_kwarg => {
393 if let Some(n) = p.into_inner().next() {
395 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
396 log::warn!(
397 "Invalid block_size value '{}', using default 128",
398 n.as_str()
399 );
400 128
401 }));
402 }
403 }
404 Rule::pruning_kwarg => {
405 if let Some(f) = p.into_inner().next() {
407 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
408 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
409 1.0
410 }));
411 }
412 }
413 Rule::query_config_block => {
414 parse_query_config_block(config, p);
416 }
417 Rule::positions_kwarg => {
418 use super::schema::PositionMode;
420 config.positions = Some(match p.as_str() {
421 "ordinal" => PositionMode::Ordinal,
422 "token_position" => PositionMode::TokenPosition,
423 _ => PositionMode::Full, });
425 }
426 _ => {}
427 }
428}
429
430fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
432 for inner in pair.into_inner() {
433 if inner.as_rule() == Rule::query_config_params {
434 for param in inner.into_inner() {
435 if param.as_rule() == Rule::query_config_param {
436 for p in param.into_inner() {
437 match p.as_rule() {
438 Rule::query_tokenizer_kwarg => {
439 if let Some(path) = p.into_inner().next()
441 && let Some(inner_path) = path.into_inner().next()
442 {
443 config.query_tokenizer = Some(inner_path.as_str().to_string());
444 }
445 }
446 Rule::query_weighting_kwarg => {
447 if let Some(w) = p.into_inner().next() {
449 config.query_weighting = Some(match w.as_str() {
450 "one" => QueryWeighting::One,
451 "idf" => QueryWeighting::Idf,
452 "idf_file" => QueryWeighting::IdfFile,
453 _ => QueryWeighting::One,
454 });
455 }
456 }
457 Rule::query_weight_threshold_kwarg => {
458 if let Some(t) = p.into_inner().next() {
459 config.query_weight_threshold =
460 Some(t.as_str().parse().unwrap_or_else(|_| {
461 log::warn!(
462 "Invalid query weight_threshold '{}', using 0.0",
463 t.as_str()
464 );
465 0.0
466 }));
467 }
468 }
469 Rule::query_max_dims_kwarg => {
470 if let Some(t) = p.into_inner().next() {
471 config.query_max_dims =
472 Some(t.as_str().parse().unwrap_or_else(|_| {
473 log::warn!(
474 "Invalid query max_dims '{}', using 0",
475 t.as_str()
476 );
477 0
478 }));
479 }
480 }
481 Rule::query_pruning_kwarg => {
482 if let Some(t) = p.into_inner().next() {
483 config.query_pruning =
484 Some(t.as_str().parse().unwrap_or_else(|_| {
485 log::warn!(
486 "Invalid query pruning '{}', using 1.0",
487 t.as_str()
488 );
489 1.0
490 }));
491 }
492 }
493 _ => {}
494 }
495 }
496 }
497 }
498 }
499 }
500}
501
502fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
504 let mut inner = pair.into_inner();
505
506 let name = inner
507 .next()
508 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
509 .as_str()
510 .to_string();
511
512 let field_type_str = inner
513 .next()
514 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
515 .as_str();
516
517 let field_type = parse_field_type(field_type_str)?;
518
519 let mut tokenizer = None;
521 let mut sparse_vector_config = None;
522 let mut dense_vector_config = None;
523 let mut indexed = true;
524 let mut stored = true;
525 let mut multi = false;
526 let mut fast = false;
527 let mut primary = false;
528 let mut index_config: Option<IndexConfig> = None;
529
530 for item in inner {
531 match item.as_rule() {
532 Rule::tokenizer_spec => {
533 if let Some(tok_name) = item.into_inner().next() {
535 tokenizer = Some(tok_name.as_str().to_string());
536 }
537 }
538 Rule::sparse_vector_config => {
539 sparse_vector_config = Some(parse_sparse_vector_config(item));
541 }
542 Rule::dense_vector_config => {
543 dense_vector_config = Some(parse_dense_vector_config(item));
545 }
546 Rule::attributes => {
547 let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
548 indexed = idx;
549 stored = sto;
550 multi = mul;
551 fast = fst;
552 primary = pri;
553 index_config = idx_cfg;
554 }
555 _ => {}
556 }
557 }
558
559 if primary {
561 fast = true;
562 indexed = true;
563 }
564
565 let mut positions = None;
567 if let Some(idx_cfg) = index_config {
568 positions = idx_cfg.positions;
569 if let Some(ref mut dv_config) = dense_vector_config {
570 apply_index_config_to_dense_vector(dv_config, idx_cfg);
571 } else if field_type == FieldType::SparseVector {
572 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
574 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
575 }
576 }
577
578 Ok(FieldDef {
579 name,
580 field_type,
581 indexed,
582 stored,
583 tokenizer,
584 multi,
585 positions,
586 sparse_vector_config,
587 dense_vector_config,
588 fast,
589 primary,
590 })
591}
592
593fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
595 if let Some(index_type) = idx_cfg.index_type {
597 config.index_type = index_type;
598 }
599
600 if idx_cfg.num_clusters.is_some() {
602 config.num_clusters = idx_cfg.num_clusters;
603 }
604
605 if let Some(nprobe) = idx_cfg.nprobe {
607 config.nprobe = nprobe;
608 }
609
610 if idx_cfg.build_threshold.is_some() {
612 config.build_threshold = idx_cfg.build_threshold;
613 }
614}
615
616fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
619 let mut index_size = IndexSize::default();
620
621 for inner in pair.into_inner() {
623 if inner.as_rule() == Rule::index_size_spec {
624 index_size = match inner.as_str() {
625 "u16" => IndexSize::U16,
626 "u32" => IndexSize::U32,
627 _ => IndexSize::default(),
628 };
629 }
630 }
631
632 SparseVectorConfig {
633 format: crate::structures::SparseFormat::Bmp,
634 index_size,
635 weight_quantization: WeightQuantization::default(),
636 weight_threshold: 0.0,
637 block_size: 128,
638 bmp_block_size: 64,
639 max_bmp_grid_bytes: 0,
640 bmp_superblock_size: 64,
641 pruning: None,
642 query_config: None,
643 }
644}
645
646fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
648 if let Some(q) = idx_cfg.quantization {
649 config.weight_quantization = q;
650 }
651 if let Some(t) = idx_cfg.weight_threshold {
652 config.weight_threshold = t;
653 }
654 if let Some(bs) = idx_cfg.block_size {
655 let adjusted = bs.next_power_of_two();
656 if adjusted != bs {
657 log::warn!(
658 "block_size {} adjusted to next power of two: {}",
659 bs,
660 adjusted
661 );
662 }
663 config.block_size = adjusted;
664 }
665 if let Some(p) = idx_cfg.pruning {
666 let clamped = p.clamp(0.0, 1.0);
667 if (clamped - p).abs() > f32::EPSILON {
668 log::warn!(
669 "pruning {} clamped to valid range [0.0, 1.0]: {}",
670 p,
671 clamped
672 );
673 }
674 config.pruning = Some(clamped);
675 }
676 if idx_cfg.query_tokenizer.is_some()
678 || idx_cfg.query_weighting.is_some()
679 || idx_cfg.query_weight_threshold.is_some()
680 || idx_cfg.query_max_dims.is_some()
681 || idx_cfg.query_pruning.is_some()
682 {
683 let query_config = config
684 .query_config
685 .get_or_insert(SparseQueryConfig::default());
686 if let Some(tokenizer) = idx_cfg.query_tokenizer {
687 query_config.tokenizer = Some(tokenizer);
688 }
689 if let Some(weighting) = idx_cfg.query_weighting {
690 query_config.weighting = weighting;
691 }
692 if let Some(t) = idx_cfg.query_weight_threshold {
693 query_config.weight_threshold = t;
694 }
695 if let Some(d) = idx_cfg.query_max_dims {
696 query_config.max_query_dims = Some(d);
697 }
698 if let Some(p) = idx_cfg.query_pruning {
699 query_config.pruning = Some(p);
700 }
701 }
702}
703
704fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
707 let mut dim: usize = 0;
708 let mut quantization = DenseVectorQuantization::F32;
709
710 for params in pair.into_inner() {
712 if params.as_rule() == Rule::dense_vector_params {
713 for inner in params.into_inner() {
714 match inner.as_rule() {
715 Rule::dense_vector_keyword_params => {
716 for kwarg in inner.into_inner() {
717 match kwarg.as_rule() {
718 Rule::dims_kwarg => {
719 if let Some(d) = kwarg.into_inner().next() {
720 dim = d.as_str().parse().unwrap_or(0);
721 }
722 }
723 Rule::quant_type_spec => {
724 quantization = parse_quant_type(kwarg.as_str());
725 }
726 _ => {}
727 }
728 }
729 }
730 Rule::dense_vector_positional_params => {
731 for item in inner.into_inner() {
732 match item.as_rule() {
733 Rule::dimension_spec => {
734 dim = item.as_str().parse().unwrap_or(0);
735 }
736 Rule::quant_type_spec => {
737 quantization = parse_quant_type(item.as_str());
738 }
739 _ => {}
740 }
741 }
742 }
743 _ => {}
744 }
745 }
746 }
747 }
748
749 DenseVectorConfig::new(dim).with_quantization(quantization)
750}
751
752fn parse_quant_type(s: &str) -> DenseVectorQuantization {
753 match s.trim() {
754 "f16" => DenseVectorQuantization::F16,
755 "uint8" | "u8" => DenseVectorQuantization::UInt8,
756 _ => DenseVectorQuantization::F32,
757 }
758}
759
760fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
762 pair.into_inner().map(|p| p.as_str().to_string()).collect()
763}
764
765fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
767 let mut pattern = String::new();
768 let mut substitution = String::new();
769 let mut target_field = String::new();
770 let mut mode = RoutingMode::Additional;
771
772 for prop in pair.into_inner() {
773 if prop.as_rule() != Rule::query_router_prop {
774 continue;
775 }
776
777 for inner in prop.into_inner() {
778 match inner.as_rule() {
779 Rule::query_router_pattern => {
780 if let Some(regex_str) = inner.into_inner().next() {
781 pattern = parse_string_value(regex_str);
782 }
783 }
784 Rule::query_router_substitution => {
785 if let Some(quoted) = inner.into_inner().next() {
786 substitution = parse_string_value(quoted);
787 }
788 }
789 Rule::query_router_target => {
790 if let Some(ident) = inner.into_inner().next() {
791 target_field = ident.as_str().to_string();
792 }
793 }
794 Rule::query_router_mode => {
795 if let Some(mode_val) = inner.into_inner().next() {
796 mode = match mode_val.as_str() {
797 "exclusive" => RoutingMode::Exclusive,
798 "additional" => RoutingMode::Additional,
799 _ => RoutingMode::Additional,
800 };
801 }
802 }
803 _ => {}
804 }
805 }
806 }
807
808 if pattern.is_empty() {
809 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
810 }
811 if substitution.is_empty() {
812 return Err(Error::Schema(
813 "query_router missing 'substitution'".to_string(),
814 ));
815 }
816 if target_field.is_empty() {
817 return Err(Error::Schema(
818 "query_router missing 'target_field'".to_string(),
819 ));
820 }
821
822 Ok(QueryRouterRule {
823 pattern,
824 substitution,
825 target_field,
826 mode,
827 })
828}
829
830fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
832 let s = pair.as_str();
833 match pair.as_rule() {
834 Rule::regex_string => {
835 if let Some(inner) = pair.into_inner().next() {
837 parse_string_value(inner)
838 } else {
839 s.to_string()
840 }
841 }
842 Rule::raw_string => {
843 s[2..s.len() - 1].to_string()
845 }
846 Rule::quoted_string => {
847 let inner = &s[1..s.len() - 1];
849 inner
851 .replace("\\n", "\n")
852 .replace("\\t", "\t")
853 .replace("\\\"", "\"")
854 .replace("\\\\", "\\")
855 }
856 _ => s.to_string(),
857 }
858}
859
860fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
862 let mut inner = pair.into_inner();
863
864 let name = inner
865 .next()
866 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
867 .as_str()
868 .to_string();
869
870 let mut fields = Vec::new();
871 let mut default_fields = Vec::new();
872 let mut query_routers = Vec::new();
873
874 for item in inner {
875 match item.as_rule() {
876 Rule::field_def => {
877 fields.push(parse_field_def(item)?);
878 }
879 Rule::default_fields_def => {
880 default_fields = parse_default_fields_def(item);
881 }
882 Rule::query_router_def => {
883 query_routers.push(parse_query_router_def(item)?);
884 }
885 _ => {}
886 }
887 }
888
889 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
891 if primary_fields.len() > 1 {
892 return Err(Error::Schema(format!(
893 "Index '{}' has {} primary key fields, but at most one is allowed",
894 name,
895 primary_fields.len()
896 )));
897 }
898 if let Some(pk) = primary_fields.first() {
899 if pk.field_type != FieldType::Text {
900 return Err(Error::Schema(format!(
901 "Primary key field '{}' must be of type text, got {:?}",
902 pk.name, pk.field_type
903 )));
904 }
905 if pk.multi {
906 return Err(Error::Schema(format!(
907 "Primary key field '{}' cannot be multi-valued",
908 pk.name
909 )));
910 }
911 }
912
913 Ok(IndexDef {
914 name,
915 fields,
916 default_fields,
917 query_routers,
918 })
919}
920
921pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
923 let pairs = SdlParser::parse(Rule::file, input)
924 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
925
926 let mut indexes = Vec::new();
927
928 for pair in pairs {
929 if pair.as_rule() == Rule::file {
930 for inner in pair.into_inner() {
931 if inner.as_rule() == Rule::index_def {
932 indexes.push(parse_index_def(inner)?);
933 }
934 }
935 }
936 }
937
938 Ok(indexes)
939}
940
941pub fn parse_single_index(input: &str) -> Result<IndexDef> {
943 let indexes = parse_sdl(input)?;
944
945 if indexes.is_empty() {
946 return Err(Error::Schema("No index definition found".to_string()));
947 }
948
949 if indexes.len() > 1 {
950 return Err(Error::Schema(
951 "Multiple index definitions found, expected one".to_string(),
952 ));
953 }
954
955 Ok(indexes.into_iter().next().unwrap())
956}
957
958#[cfg(test)]
959mod tests {
960 use super::*;
961
962 #[test]
963 fn test_parse_simple_schema() {
964 let sdl = r#"
965 index articles {
966 field title: text [indexed, stored]
967 field body: text [indexed]
968 }
969 "#;
970
971 let indexes = parse_sdl(sdl).unwrap();
972 assert_eq!(indexes.len(), 1);
973
974 let index = &indexes[0];
975 assert_eq!(index.name, "articles");
976 assert_eq!(index.fields.len(), 2);
977
978 assert_eq!(index.fields[0].name, "title");
979 assert!(matches!(index.fields[0].field_type, FieldType::Text));
980 assert!(index.fields[0].indexed);
981 assert!(index.fields[0].stored);
982
983 assert_eq!(index.fields[1].name, "body");
984 assert!(matches!(index.fields[1].field_type, FieldType::Text));
985 assert!(index.fields[1].indexed);
986 assert!(!index.fields[1].stored);
987 }
988
989 #[test]
990 fn test_parse_all_field_types() {
991 let sdl = r#"
992 index test {
993 field text_field: text [indexed, stored]
994 field u64_field: u64 [indexed, stored]
995 field i64_field: i64 [indexed, stored]
996 field f64_field: f64 [indexed, stored]
997 field bytes_field: bytes [stored]
998 }
999 "#;
1000
1001 let indexes = parse_sdl(sdl).unwrap();
1002 let index = &indexes[0];
1003
1004 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1005 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1006 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1007 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1008 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1009 }
1010
1011 #[test]
1012 fn test_parse_with_comments() {
1013 let sdl = r#"
1014 # This is a comment
1015 index articles {
1016 # Title field
1017 field title: text [indexed, stored]
1018 field body: text [indexed] # inline comment not supported yet
1019 }
1020 "#;
1021
1022 let indexes = parse_sdl(sdl).unwrap();
1023 assert_eq!(indexes[0].fields.len(), 2);
1024 }
1025
1026 #[test]
1027 fn test_parse_type_aliases() {
1028 let sdl = r#"
1029 index test {
1030 field a: string [indexed]
1031 field b: int [indexed]
1032 field c: uint [indexed]
1033 field d: float [indexed]
1034 field e: binary [stored]
1035 }
1036 "#;
1037
1038 let indexes = parse_sdl(sdl).unwrap();
1039 let index = &indexes[0];
1040
1041 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1042 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1043 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1044 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1045 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1046 }
1047
1048 #[test]
1049 fn test_to_schema() {
1050 let sdl = r#"
1051 index articles {
1052 field title: text [indexed, stored]
1053 field views: u64 [indexed, stored]
1054 }
1055 "#;
1056
1057 let indexes = parse_sdl(sdl).unwrap();
1058 let schema = indexes[0].to_schema();
1059
1060 assert!(schema.get_field("title").is_some());
1061 assert!(schema.get_field("views").is_some());
1062 assert!(schema.get_field("nonexistent").is_none());
1063 }
1064
1065 #[test]
1066 fn test_default_attributes() {
1067 let sdl = r#"
1068 index test {
1069 field title: text
1070 }
1071 "#;
1072
1073 let indexes = parse_sdl(sdl).unwrap();
1074 let field = &indexes[0].fields[0];
1075
1076 assert!(field.indexed);
1078 assert!(field.stored);
1079 }
1080
1081 #[test]
1082 fn test_multiple_indexes() {
1083 let sdl = r#"
1084 index articles {
1085 field title: text [indexed, stored]
1086 }
1087
1088 index users {
1089 field name: text [indexed, stored]
1090 field email: text [indexed, stored]
1091 }
1092 "#;
1093
1094 let indexes = parse_sdl(sdl).unwrap();
1095 assert_eq!(indexes.len(), 2);
1096 assert_eq!(indexes[0].name, "articles");
1097 assert_eq!(indexes[1].name, "users");
1098 }
1099
1100 #[test]
1101 fn test_tokenizer_spec() {
1102 let sdl = r#"
1103 index articles {
1104 field title: text<en_stem> [indexed, stored]
1105 field body: text<simple> [indexed]
1106 field author: text [indexed, stored]
1107 }
1108 "#;
1109
1110 let indexes = parse_sdl(sdl).unwrap();
1111 let index = &indexes[0];
1112
1113 assert_eq!(index.fields[0].name, "title");
1114 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1115
1116 assert_eq!(index.fields[1].name, "body");
1117 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1118
1119 assert_eq!(index.fields[2].name, "author");
1120 assert_eq!(index.fields[2].tokenizer, None); }
1122
1123 #[test]
1124 fn test_tokenizer_in_schema() {
1125 let sdl = r#"
1126 index articles {
1127 field title: text<german> [indexed, stored]
1128 field body: text<en_stem> [indexed]
1129 }
1130 "#;
1131
1132 let indexes = parse_sdl(sdl).unwrap();
1133 let schema = indexes[0].to_schema();
1134
1135 let title_field = schema.get_field("title").unwrap();
1136 let title_entry = schema.get_field_entry(title_field).unwrap();
1137 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1138
1139 let body_field = schema.get_field("body").unwrap();
1140 let body_entry = schema.get_field_entry(body_field).unwrap();
1141 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1142 }
1143
1144 #[test]
1145 fn test_query_router_basic() {
1146 let sdl = r#"
1147 index documents {
1148 field title: text [indexed, stored]
1149 field uri: text [indexed, stored]
1150
1151 query_router {
1152 pattern: "10\\.\\d{4,}/[^\\s]+"
1153 substitution: "doi://{0}"
1154 target_field: uris
1155 mode: exclusive
1156 }
1157 }
1158 "#;
1159
1160 let indexes = parse_sdl(sdl).unwrap();
1161 let index = &indexes[0];
1162
1163 assert_eq!(index.query_routers.len(), 1);
1164 let router = &index.query_routers[0];
1165 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1166 assert_eq!(router.substitution, "doi://{0}");
1167 assert_eq!(router.target_field, "uris");
1168 assert_eq!(router.mode, RoutingMode::Exclusive);
1169 }
1170
1171 #[test]
1172 fn test_query_router_raw_string() {
1173 let sdl = r#"
1174 index documents {
1175 field uris: text [indexed, stored]
1176
1177 query_router {
1178 pattern: r"^pmid:(\d+)$"
1179 substitution: "pubmed://{1}"
1180 target_field: uris
1181 mode: additional
1182 }
1183 }
1184 "#;
1185
1186 let indexes = parse_sdl(sdl).unwrap();
1187 let router = &indexes[0].query_routers[0];
1188
1189 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1190 assert_eq!(router.substitution, "pubmed://{1}");
1191 assert_eq!(router.mode, RoutingMode::Additional);
1192 }
1193
1194 #[test]
1195 fn test_multiple_query_routers() {
1196 let sdl = r#"
1197 index documents {
1198 field uris: text [indexed, stored]
1199
1200 query_router {
1201 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1202 substitution: "doi://{1}"
1203 target_field: uris
1204 mode: exclusive
1205 }
1206
1207 query_router {
1208 pattern: r"^pmid:(\d+)$"
1209 substitution: "pubmed://{1}"
1210 target_field: uris
1211 mode: exclusive
1212 }
1213
1214 query_router {
1215 pattern: r"^arxiv:(\d+\.\d+)$"
1216 substitution: "arxiv://{1}"
1217 target_field: uris
1218 mode: additional
1219 }
1220 }
1221 "#;
1222
1223 let indexes = parse_sdl(sdl).unwrap();
1224 assert_eq!(indexes[0].query_routers.len(), 3);
1225 }
1226
1227 #[test]
1228 fn test_query_router_default_mode() {
1229 let sdl = r#"
1230 index documents {
1231 field uris: text [indexed, stored]
1232
1233 query_router {
1234 pattern: r"test"
1235 substitution: "{0}"
1236 target_field: uris
1237 }
1238 }
1239 "#;
1240
1241 let indexes = parse_sdl(sdl).unwrap();
1242 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1244 }
1245
1246 #[test]
1247 fn test_multi_attribute() {
1248 let sdl = r#"
1249 index documents {
1250 field uris: text [indexed, stored<multi>]
1251 field title: text [indexed, stored]
1252 }
1253 "#;
1254
1255 let indexes = parse_sdl(sdl).unwrap();
1256 assert_eq!(indexes.len(), 1);
1257
1258 let fields = &indexes[0].fields;
1259 assert_eq!(fields.len(), 2);
1260
1261 assert_eq!(fields[0].name, "uris");
1263 assert!(fields[0].multi, "uris field should have multi=true");
1264
1265 assert_eq!(fields[1].name, "title");
1267 assert!(!fields[1].multi, "title field should have multi=false");
1268
1269 let schema = indexes[0].to_schema();
1271 let uris_field = schema.get_field("uris").unwrap();
1272 let title_field = schema.get_field("title").unwrap();
1273
1274 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1275 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1276 }
1277
1278 #[test]
1279 fn test_sparse_vector_field() {
1280 let sdl = r#"
1281 index documents {
1282 field embedding: sparse_vector [indexed, stored]
1283 }
1284 "#;
1285
1286 let indexes = parse_sdl(sdl).unwrap();
1287 assert_eq!(indexes.len(), 1);
1288 assert_eq!(indexes[0].fields.len(), 1);
1289 assert_eq!(indexes[0].fields[0].name, "embedding");
1290 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1291 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1292 }
1293
/// Checks that the `<u16>` / `<u32>` type argument and the `quantization` index
/// option end up in each field's sparse vector config.
#[test]
fn test_sparse_vector_with_config() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
            field dense: sparse_vector<u32> [indexed<quantization: float32>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    // First declaration: u16 index size, uint8 weights.
    let first = &fields[0];
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);

    // Second declaration: u32 index size, float32 weights.
    let second = &fields[1];
    assert_eq!(second.name, "dense");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float32);
}
1320
/// Checks that `weight_threshold` inside `indexed<...>` is parsed alongside the
/// quantization option for both declared sparse vector fields.
#[test]
fn test_sparse_vector_with_weight_threshold() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
            field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    let first = &fields[0];
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);
    // Thresholds are compared with a tolerance rather than exactly.
    let first_delta = (first_cfg.weight_threshold - 0.1).abs();
    assert!(first_delta < 0.001);

    let second = &fields[1];
    assert_eq!(second.name, "embedding2");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float16);
    let second_delta = (second_cfg.weight_threshold - 0.05).abs();
    assert!(second_delta < 0.001);
}
1349
/// Checks that a `pruning` index option on a sparse vector field is stored in
/// the field's config as `Some(0.1)`.
#[test]
fn test_sparse_vector_with_pruning() {
    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");

    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);
    assert_eq!(sparse_cfg.pruning, Some(0.1));
}
1365
/// Checks that `dense_vector<768>` parses as a dense vector field whose config
/// records a dimension of 768.
#[test]
fn test_dense_vector_field() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 1);

    let embedding = &fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
}
1385
/// Checks that the `vector<N>` spelling is accepted and yields a dense vector
/// field with the given dimension.
#[test]
fn test_dense_vector_alias() {
    let source = r#"
        index documents {
            field embedding: vector<1536> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 1536);
}
1405
/// Checks that `num_clusters` is parsed from `indexed<ivf_rabitq, ...>` and that
/// `nprobe` is 32 when the SDL does not specify it.
#[test]
fn test_dense_vector_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    // `nprobe` was not given in the SDL above.
    assert_eq!(dense_cfg.nprobe, 32);
}
1426
/// Checks that explicit `num_clusters` and `nprobe` index options are both
/// carried into the dense vector config.
#[test]
fn test_dense_vector_with_num_clusters_and_nprobe() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.num_clusters, Some(512));
    assert_eq!(dense_cfg.nprobe, 64);
}
1442
/// Checks the keyword form `dense_vector<dims: N>`: the dimension is parsed and
/// no cluster count is set when none is given.
#[test]
fn test_dense_vector_keyword_syntax() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.num_clusters, None);
}
1457
/// Checks the keyword form `dense_vector<dims: N>` combined with a fully
/// specified `indexed<ivf_rabitq, num_clusters, nprobe>` attribute.
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    assert_eq!(dense_cfg.nprobe, 64);
}
1473
/// Checks the keyword form with only `num_clusters` given: the cluster count is
/// parsed and `nprobe` is asserted to be 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.num_clusters, Some(128));
    // `nprobe` was not given in the SDL above.
    assert_eq!(dense_cfg.nprobe, 32);
}
1489
/// Checks that `indexed<scann, ...>` selects `VectorIndexType::ScaNN` and keeps
/// the accompanying `num_clusters` / `nprobe` values.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    assert_eq!(dense_cfg.nprobe, 64);
}
1508
/// Checks that `indexed<ivf_rabitq, ...>` selects `VectorIndexType::IvfRaBitQ`
/// and keeps the given cluster count.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense_cfg.num_clusters, Some(512));
}
1526
/// Checks that `indexed<rabitq>` selects `VectorIndexType::RaBitQ` and leaves
/// the cluster count unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
    assert_eq!(dense_cfg.num_clusters, None);
}
1544
/// Checks that `indexed<flat>` selects `VectorIndexType::Flat`.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::Flat);
}
1561
/// Checks that a dense vector with a bare `indexed` attribute (no index type
/// named) ends up with `VectorIndexType::RaBitQ`.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
}
1579
/// Checks that `dense_vector<768, f16>` records F16 quantization while the
/// index type for a bare `indexed` attribute is still `RaBitQ`.
#[test]
fn test_dense_vector_f16_quantization() {
    use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};

    let source = r#"
        index documents {
            field embedding: dense_vector<768, f16> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F16);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
}
1597
/// Checks that `dense_vector<1024, uint8>` records UInt8 quantization.
#[test]
fn test_dense_vector_uint8_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1024);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::UInt8);
}
1614
/// Checks that the `u8` spelling maps to the same `UInt8` quantization as
/// `uint8`.
#[test]
fn test_dense_vector_u8_alias() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<512, u8> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 512);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::UInt8);
}
1631
/// Checks that a dense vector declared without a quantization argument is
/// reported with `DenseVectorQuantization::F32`.
#[test]
fn test_dense_vector_default_f32_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F32);
}
1649
/// Checks that the keyword form `dense_vector<dims: N, f16>` accepts a
/// quantization argument after the dimension.
#[test]
fn test_dense_vector_keyword_with_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768, f16> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F16);
}
1666
/// Checks that `json` fields parse with and without an attribute list, and
/// that the built schema reports the `[stored]` JSON field as stored but not
/// indexed.
#[test]
fn test_json_field_type() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index_def = &parsed[0];
    let fields = &index_def.fields;

    assert_eq!(fields.len(), 3);

    // `metadata` is declared with `[stored]`.
    let metadata = &fields[1];
    assert_eq!(metadata.name, "metadata");
    assert_eq!(metadata.field_type, FieldType::Json);
    assert!(metadata.stored);

    // `extra` is declared with no attribute list at all.
    let extra = &fields[2];
    assert_eq!(extra.name, "extra");
    assert_eq!(extra.field_type, FieldType::Json);

    // The same properties must be visible on the built schema.
    let schema = index_def.to_schema();
    let metadata_entry = schema
        .get_field_entry(schema.get_field("metadata").unwrap())
        .unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
1700
/// Checks that a nested `query<tokenizer: ..., weighting: idf>` block is parsed
/// into the sparse vector's query config, and that the same tokenizer and
/// weighting are present on the schema built from the definition.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index_def = &parsed[0];

    assert_eq!(index_def.fields.len(), 1);
    let embedding = &index_def.fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::SparseVector);

    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.index_size, IndexSize::U16);
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings as seen on the parsed definition.
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // Query-time settings as seen on the built schema.
    let schema = index_def.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(
        schema_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1745
/// Checks that `query<weighting: one>` without a tokenizer yields
/// `QueryWeighting::One` and leaves the tokenizer unset.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();

    let query_cfg = sparse_cfg.query_config.as_ref().unwrap();
    assert!(query_cfg.tokenizer.is_none());
    assert_eq!(query_cfg.weighting, QueryWeighting::One);
}
1763
/// Checks that `weighting: idf_file` maps to `QueryWeighting::IdfFile`, both on
/// the parsed definition and on the schema built from it.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index_def = &parsed[0];
    let sparse_cfg = index_def.fields[0].sparse_vector_config.as_ref().unwrap();

    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

    // The weighting must also be present on the built schema.
    let schema = index_def.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
1792
/// Checks that the query-level `weight_threshold`, `max_dims` and `pruning`
/// options are parsed, and that the same values are present on the schema
/// built from the definition.
#[test]
fn test_sparse_vector_query_config_pruning_params() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index_def = &parsed[0];
    let sparse_cfg = index_def.fields[0].sparse_vector_config.as_ref().unwrap();

    // Values on the parsed definition; floats are compared with a tolerance.
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);
    assert!((parsed_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(parsed_query.max_query_dims, Some(25));
    assert!((parsed_query.pruning.unwrap() - 0.2).abs() < 0.001);

    // Values on the built schema.
    let schema = index_def.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert!((schema_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(schema_query.max_query_dims, Some(25));
    assert!((schema_query.pruning.unwrap() - 0.2).abs() < 0.001);
}
1820
/// Checks that the `fast` attribute is recognised on f64, text, u64 and i64
/// fields, is absent where not declared, and is visible on the built schema.
#[test]
fn test_fast_attribute() {
    let source = r#"
        index products {
            field name: text [indexed, stored]
            field price: f64 [indexed, fast]
            field category: text [indexed, stored, fast]
            field count: u64 [fast]
            field score: i64 [indexed, stored, fast]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    let index_def = &parsed[0];
    assert_eq!(index_def.fields.len(), 5);

    // `name` is the only field declared without `fast`.
    assert!(!index_def.fields[0].fast);

    // The remaining four fields are all `fast`, in declaration order.
    let expected_types = [
        FieldType::F64,
        FieldType::Text,
        FieldType::U64,
        FieldType::I64,
    ];
    for (field, expected_type) in index_def.fields[1..].iter().zip(expected_types) {
        assert!(field.fast);
        assert_eq!(field.field_type, expected_type);
    }

    // The flag must also be visible on the built schema.
    let schema = index_def.to_schema();
    for (field_name, expect_fast) in [("price", true), ("category", true), ("name", false)] {
        let handle = schema.get_field(field_name).unwrap();
        assert_eq!(schema.get_field_entry(handle).unwrap().fast, expect_fast);
    }
}
1864
/// Checks that `[primary]` marks a field as primary, fast and indexed on the
/// parsed definition, and that the built schema exposes it as the primary key.
#[test]
fn test_primary_attribute() {
    let source = r#"
        index documents {
            field id: text [primary, stored]
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    let index_def = &parsed[0];
    assert_eq!(index_def.fields.len(), 2);

    // `id` is declared with only `primary, stored`, yet is asserted fast and indexed.
    let primary_def = &index_def.fields[0];
    assert!(primary_def.primary, "id should be primary");
    assert!(primary_def.fast, "primary implies fast");
    assert!(primary_def.indexed, "primary implies indexed");

    let title_def = &index_def.fields[1];
    assert!(!title_def.primary);

    // The same flags must be visible on the built schema.
    let schema = index_def.to_schema();
    let id_handle = schema.get_field("id").unwrap();
    let id_entry = schema.get_field_entry(id_handle).unwrap();
    assert!(id_entry.primary_key);
    assert!(id_entry.fast);
    assert!(id_entry.indexed);

    let title_handle = schema.get_field("title").unwrap();
    let title_entry = schema.get_field_entry(title_handle).unwrap();
    assert!(!title_entry.primary_key);

    assert_eq!(schema.primary_field(), Some(id_handle));
}
1902
/// Checks that `primary` can be combined with `indexed`, `stored` and an
/// explicit tokenizer on the same field.
#[test]
fn test_primary_with_other_attributes() {
    let source = r#"
        index documents {
            field id: text<simple> [primary, indexed, stored]
            field body: text [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let primary_def = &parsed[0].fields[0];

    assert!(primary_def.primary);
    assert!(primary_def.indexed);
    assert!(primary_def.stored);
    assert!(primary_def.fast);
    assert_eq!(primary_def.tokenizer.as_deref(), Some("simple"));
}
1920
/// Checks that declaring two `primary` fields in one index is rejected with an
/// error whose text mentions "primary key".
#[test]
fn test_primary_only_one_allowed() {
    let source = r#"
        index documents {
            field id: text [primary]
            field alt_id: text [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_primary_key = message.contains("primary key");
    assert!(
        mentions_primary_key,
        "Error should mention primary key: {}",
        message
    );
}
1939
/// Checks that `primary` on a `u64` field is rejected with an error whose text
/// mentions "text".
#[test]
fn test_primary_must_be_text() {
    let source = r#"
        index documents {
            field id: u64 [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_text = message.contains("text");
    assert!(
        mentions_text,
        "Error should mention text type: {}",
        message
    );
}
1957
/// Checks that combining `primary` with `stored<multi>` is rejected with an
/// error whose text mentions "multi".
#[test]
fn test_primary_cannot_be_multi() {
    let source = r#"
        index documents {
            field id: text [primary, stored<multi>]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_multi = message.contains("multi");
    assert!(mentions_multi, "Error should mention multi: {}", message);
}
1971
/// Checks that an index with no `primary` field builds a schema whose
/// `primary_field()` is `None`.
#[test]
fn test_no_primary_field() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let schema = parsed[0].to_schema();
    assert_eq!(schema.primary_field(), None);
}
1985}