1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
56#[derive(Parser)]
57#[grammar = "dsl/sdl/sdl.pest"]
58pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
65#[derive(Debug, Clone)]
67pub struct FieldDef {
68 pub name: String,
69 pub field_type: FieldType,
70 pub indexed: bool,
71 pub stored: bool,
72 pub tokenizer: Option<String>,
74 pub multi: bool,
76 pub positions: Option<super::schema::PositionMode>,
78 pub sparse_vector_config: Option<SparseVectorConfig>,
80 pub dense_vector_config: Option<DenseVectorConfig>,
82}
83
84#[derive(Debug, Clone)]
86pub struct IndexDef {
87 pub name: String,
88 pub fields: Vec<FieldDef>,
89 pub default_fields: Vec<String>,
90 pub query_routers: Vec<QueryRouterRule>,
92}
93
94impl IndexDef {
95 pub fn to_schema(&self) -> Schema {
97 let mut builder = SchemaBuilder::default();
98
99 for field in &self.fields {
100 let f = match field.field_type {
101 FieldType::Text => {
102 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103 builder.add_text_field_with_tokenizer(
104 &field.name,
105 field.indexed,
106 field.stored,
107 tokenizer,
108 )
109 }
110 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114 FieldType::Json => builder.add_json_field(&field.name, field.stored),
115 FieldType::SparseVector => {
116 if let Some(config) = &field.sparse_vector_config {
117 builder.add_sparse_vector_field_with_config(
118 &field.name,
119 field.indexed,
120 field.stored,
121 config.clone(),
122 )
123 } else {
124 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125 }
126 }
127 FieldType::DenseVector => {
128 let config = field
130 .dense_vector_config
131 .as_ref()
132 .expect("DenseVector field requires dimension to be specified");
133 builder.add_dense_vector_field_with_config(
134 &field.name,
135 field.indexed,
136 field.stored,
137 config.clone(),
138 )
139 }
140 };
141 if field.multi {
142 builder.set_multi(f, true);
143 }
144 let positions = field.positions.or({
146 if field.multi
148 && matches!(
149 field.field_type,
150 FieldType::SparseVector | FieldType::DenseVector
151 )
152 {
153 Some(super::schema::PositionMode::Ordinal)
154 } else {
155 None
156 }
157 });
158 if let Some(mode) = positions {
159 builder.set_positions(f, mode);
160 }
161 }
162
163 if !self.default_fields.is_empty() {
165 builder.set_default_fields(self.default_fields.clone());
166 }
167
168 if !self.query_routers.is_empty() {
170 builder.set_query_routers(self.query_routers.clone());
171 }
172
173 builder.build()
174 }
175
176 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181 if self.query_routers.is_empty() {
182 return Ok(None);
183 }
184
185 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186 .map(Some)
187 .map_err(Error::Schema)
188 }
189}
190
191fn parse_field_type(type_str: &str) -> Result<FieldType> {
193 match type_str {
194 "text" | "string" | "str" => Ok(FieldType::Text),
195 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196 "i64" | "int" | "integer" => Ok(FieldType::I64),
197 "f64" | "float" | "double" => Ok(FieldType::F64),
198 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199 "json" => Ok(FieldType::Json),
200 "sparse_vector" => Ok(FieldType::SparseVector),
201 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203 }
204}
205
206#[derive(Debug, Clone, Default)]
208struct IndexConfig {
209 index_type: Option<super::schema::VectorIndexType>,
210 num_clusters: Option<usize>,
211 nprobe: Option<usize>,
212 mrl_dim: Option<usize>,
213 build_threshold: Option<usize>,
214 quantization: Option<WeightQuantization>,
216 weight_threshold: Option<f32>,
217 query_tokenizer: Option<String>,
219 query_weighting: Option<QueryWeighting>,
220 positions: Option<super::schema::PositionMode>,
222}
223
224fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
229 let mut indexed = false;
230 let mut stored = false;
231 let mut multi = false;
232 let mut index_config = None;
233
234 for attr in pair.into_inner() {
235 if attr.as_rule() == Rule::attribute {
236 let mut found_config = false;
238 for inner in attr.clone().into_inner() {
239 match inner.as_rule() {
240 Rule::indexed_with_config => {
241 indexed = true;
242 index_config = Some(parse_index_config(inner));
243 found_config = true;
244 break;
245 }
246 Rule::stored_with_config => {
247 stored = true;
248 multi = true; found_config = true;
250 break;
251 }
252 _ => {}
253 }
254 }
255 if !found_config {
256 match attr.as_str() {
258 "indexed" => indexed = true,
259 "stored" => stored = true,
260 _ => {}
261 }
262 }
263 }
264 }
265
266 (indexed, stored, multi, index_config)
267}
268
269fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
271 let mut config = IndexConfig::default();
272
273 for inner in pair.into_inner() {
278 if inner.as_rule() == Rule::index_config_params {
279 for param in inner.into_inner() {
280 if param.as_rule() == Rule::index_config_param {
281 for p in param.into_inner() {
282 parse_single_index_config_param(&mut config, p);
283 }
284 }
285 }
286 }
287 }
288
289 config
290}
291
292fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
294 use super::schema::VectorIndexType;
295
296 match p.as_rule() {
297 Rule::index_type_spec => {
298 config.index_type = Some(match p.as_str() {
299 "flat" => VectorIndexType::Flat,
300 "rabitq" => VectorIndexType::RaBitQ,
301 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
302 "scann" => VectorIndexType::ScaNN,
303 _ => VectorIndexType::RaBitQ,
304 });
305 }
306 Rule::index_type_kwarg => {
307 if let Some(t) = p.into_inner().next() {
309 config.index_type = Some(match t.as_str() {
310 "flat" => VectorIndexType::Flat,
311 "rabitq" => VectorIndexType::RaBitQ,
312 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
313 "scann" => VectorIndexType::ScaNN,
314 _ => VectorIndexType::RaBitQ,
315 });
316 }
317 }
318 Rule::num_clusters_kwarg => {
319 if let Some(n) = p.into_inner().next() {
321 config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
322 }
323 }
324 Rule::build_threshold_kwarg => {
325 if let Some(n) = p.into_inner().next() {
327 config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
328 }
329 }
330 Rule::nprobe_kwarg => {
331 if let Some(n) = p.into_inner().next() {
333 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
334 }
335 }
336 Rule::mrl_dim_kwarg => {
337 if let Some(n) = p.into_inner().next() {
339 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
340 }
341 }
342 Rule::quantization_kwarg => {
343 if let Some(q) = p.into_inner().next() {
345 config.quantization = Some(match q.as_str() {
346 "float32" | "f32" => WeightQuantization::Float32,
347 "float16" | "f16" => WeightQuantization::Float16,
348 "uint8" | "u8" => WeightQuantization::UInt8,
349 "uint4" | "u4" => WeightQuantization::UInt4,
350 _ => WeightQuantization::default(),
351 });
352 }
353 }
354 Rule::weight_threshold_kwarg => {
355 if let Some(t) = p.into_inner().next() {
357 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
358 }
359 }
360 Rule::query_config_block => {
361 parse_query_config_block(config, p);
363 }
364 Rule::positions_kwarg => {
365 use super::schema::PositionMode;
367 config.positions = Some(match p.as_str() {
368 "ordinal" => PositionMode::Ordinal,
369 "token_position" => PositionMode::TokenPosition,
370 _ => PositionMode::Full, });
372 }
373 _ => {}
374 }
375}
376
377fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
379 for inner in pair.into_inner() {
380 if inner.as_rule() == Rule::query_config_params {
381 for param in inner.into_inner() {
382 if param.as_rule() == Rule::query_config_param {
383 for p in param.into_inner() {
384 match p.as_rule() {
385 Rule::query_tokenizer_kwarg => {
386 if let Some(path) = p.into_inner().next()
388 && let Some(inner_path) = path.into_inner().next()
389 {
390 config.query_tokenizer = Some(inner_path.as_str().to_string());
391 }
392 }
393 Rule::query_weighting_kwarg => {
394 if let Some(w) = p.into_inner().next() {
396 config.query_weighting = Some(match w.as_str() {
397 "one" => QueryWeighting::One,
398 "idf" => QueryWeighting::Idf,
399 _ => QueryWeighting::One,
400 });
401 }
402 }
403 _ => {}
404 }
405 }
406 }
407 }
408 }
409 }
410}
411
412fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
414 let mut inner = pair.into_inner();
415
416 let name = inner
417 .next()
418 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
419 .as_str()
420 .to_string();
421
422 let field_type_str = inner
423 .next()
424 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
425 .as_str();
426
427 let field_type = parse_field_type(field_type_str)?;
428
429 let mut tokenizer = None;
431 let mut sparse_vector_config = None;
432 let mut dense_vector_config = None;
433 let mut indexed = true;
434 let mut stored = true;
435 let mut multi = false;
436 let mut index_config: Option<IndexConfig> = None;
437
438 for item in inner {
439 match item.as_rule() {
440 Rule::tokenizer_spec => {
441 if let Some(tok_name) = item.into_inner().next() {
443 tokenizer = Some(tok_name.as_str().to_string());
444 }
445 }
446 Rule::sparse_vector_config => {
447 sparse_vector_config = Some(parse_sparse_vector_config(item));
449 }
450 Rule::dense_vector_config => {
451 dense_vector_config = Some(parse_dense_vector_config(item));
453 }
454 Rule::attributes => {
455 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
456 indexed = idx;
457 stored = sto;
458 multi = mul;
459 index_config = idx_cfg;
460 }
461 _ => {}
462 }
463 }
464
465 let mut positions = None;
467 if let Some(idx_cfg) = index_config {
468 positions = idx_cfg.positions;
469 if let Some(ref mut dv_config) = dense_vector_config {
470 apply_index_config_to_dense_vector(dv_config, idx_cfg);
471 } else if field_type == FieldType::SparseVector {
472 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
474 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
475 }
476 }
477
478 Ok(FieldDef {
479 name,
480 field_type,
481 indexed,
482 stored,
483 tokenizer,
484 multi,
485 positions,
486 sparse_vector_config,
487 dense_vector_config,
488 })
489}
490
491fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
493 if let Some(index_type) = idx_cfg.index_type {
495 config.index_type = index_type;
496 }
497
498 if idx_cfg.num_clusters.is_some() {
500 config.num_clusters = idx_cfg.num_clusters;
501 }
502
503 if let Some(nprobe) = idx_cfg.nprobe {
505 config.nprobe = nprobe;
506 }
507
508 if idx_cfg.mrl_dim.is_some() {
510 config.mrl_dim = idx_cfg.mrl_dim;
511 }
512
513 if idx_cfg.build_threshold.is_some() {
515 config.build_threshold = idx_cfg.build_threshold;
516 }
517}
518
519fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
522 let mut index_size = IndexSize::default();
523
524 for inner in pair.into_inner() {
526 if inner.as_rule() == Rule::index_size_spec {
527 index_size = match inner.as_str() {
528 "u16" => IndexSize::U16,
529 "u32" => IndexSize::U32,
530 _ => IndexSize::default(),
531 };
532 }
533 }
534
535 SparseVectorConfig {
536 index_size,
537 weight_quantization: WeightQuantization::default(),
538 weight_threshold: 0.0,
539 posting_list_pruning: None,
540 query_config: None,
541 }
542}
543
544fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
546 if let Some(q) = idx_cfg.quantization {
547 config.weight_quantization = q;
548 }
549 if let Some(t) = idx_cfg.weight_threshold {
550 config.weight_threshold = t;
551 }
552 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
554 let query_config = config
555 .query_config
556 .get_or_insert(SparseQueryConfig::default());
557 if let Some(tokenizer) = idx_cfg.query_tokenizer {
558 query_config.tokenizer = Some(tokenizer);
559 }
560 if let Some(weighting) = idx_cfg.query_weighting {
561 query_config.weighting = weighting;
562 }
563 }
564}
565
566fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
569 let mut dim: usize = 0;
570
571 for params in pair.into_inner() {
573 if params.as_rule() == Rule::dense_vector_params {
574 for inner in params.into_inner() {
575 match inner.as_rule() {
576 Rule::dense_vector_keyword_params => {
577 for kwarg in inner.into_inner() {
579 if kwarg.as_rule() == Rule::dims_kwarg
580 && let Some(d) = kwarg.into_inner().next()
581 {
582 dim = d.as_str().parse().unwrap_or(0);
583 }
584 }
585 }
586 Rule::dense_vector_positional_params => {
587 if let Some(dim_pair) = inner.into_inner().next() {
589 dim = dim_pair.as_str().parse().unwrap_or(0);
590 }
591 }
592 _ => {}
593 }
594 }
595 }
596 }
597
598 DenseVectorConfig::new(dim)
599}
600
601fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
603 pair.into_inner().map(|p| p.as_str().to_string()).collect()
604}
605
606fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
608 let mut pattern = String::new();
609 let mut substitution = String::new();
610 let mut target_field = String::new();
611 let mut mode = RoutingMode::Additional;
612
613 for prop in pair.into_inner() {
614 if prop.as_rule() != Rule::query_router_prop {
615 continue;
616 }
617
618 for inner in prop.into_inner() {
619 match inner.as_rule() {
620 Rule::query_router_pattern => {
621 if let Some(regex_str) = inner.into_inner().next() {
622 pattern = parse_string_value(regex_str);
623 }
624 }
625 Rule::query_router_substitution => {
626 if let Some(quoted) = inner.into_inner().next() {
627 substitution = parse_string_value(quoted);
628 }
629 }
630 Rule::query_router_target => {
631 if let Some(ident) = inner.into_inner().next() {
632 target_field = ident.as_str().to_string();
633 }
634 }
635 Rule::query_router_mode => {
636 if let Some(mode_val) = inner.into_inner().next() {
637 mode = match mode_val.as_str() {
638 "exclusive" => RoutingMode::Exclusive,
639 "additional" => RoutingMode::Additional,
640 _ => RoutingMode::Additional,
641 };
642 }
643 }
644 _ => {}
645 }
646 }
647 }
648
649 if pattern.is_empty() {
650 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
651 }
652 if substitution.is_empty() {
653 return Err(Error::Schema(
654 "query_router missing 'substitution'".to_string(),
655 ));
656 }
657 if target_field.is_empty() {
658 return Err(Error::Schema(
659 "query_router missing 'target_field'".to_string(),
660 ));
661 }
662
663 Ok(QueryRouterRule {
664 pattern,
665 substitution,
666 target_field,
667 mode,
668 })
669}
670
671fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
673 let s = pair.as_str();
674 match pair.as_rule() {
675 Rule::regex_string => {
676 if let Some(inner) = pair.into_inner().next() {
678 parse_string_value(inner)
679 } else {
680 s.to_string()
681 }
682 }
683 Rule::raw_string => {
684 s[2..s.len() - 1].to_string()
686 }
687 Rule::quoted_string => {
688 let inner = &s[1..s.len() - 1];
690 inner
692 .replace("\\n", "\n")
693 .replace("\\t", "\t")
694 .replace("\\\"", "\"")
695 .replace("\\\\", "\\")
696 }
697 _ => s.to_string(),
698 }
699}
700
701fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
703 let mut inner = pair.into_inner();
704
705 let name = inner
706 .next()
707 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
708 .as_str()
709 .to_string();
710
711 let mut fields = Vec::new();
712 let mut default_fields = Vec::new();
713 let mut query_routers = Vec::new();
714
715 for item in inner {
716 match item.as_rule() {
717 Rule::field_def => {
718 fields.push(parse_field_def(item)?);
719 }
720 Rule::default_fields_def => {
721 default_fields = parse_default_fields_def(item);
722 }
723 Rule::query_router_def => {
724 query_routers.push(parse_query_router_def(item)?);
725 }
726 _ => {}
727 }
728 }
729
730 Ok(IndexDef {
731 name,
732 fields,
733 default_fields,
734 query_routers,
735 })
736}
737
738pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
740 let pairs = SdlParser::parse(Rule::file, input)
741 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
742
743 let mut indexes = Vec::new();
744
745 for pair in pairs {
746 if pair.as_rule() == Rule::file {
747 for inner in pair.into_inner() {
748 if inner.as_rule() == Rule::index_def {
749 indexes.push(parse_index_def(inner)?);
750 }
751 }
752 }
753 }
754
755 Ok(indexes)
756}
757
758pub fn parse_single_index(input: &str) -> Result<IndexDef> {
760 let indexes = parse_sdl(input)?;
761
762 if indexes.is_empty() {
763 return Err(Error::Schema("No index definition found".to_string()));
764 }
765
766 if indexes.len() > 1 {
767 return Err(Error::Schema(
768 "Multiple index definitions found, expected one".to_string(),
769 ));
770 }
771
772 Ok(indexes.into_iter().next().unwrap())
773}
774
775#[cfg(test)]
776mod tests {
777 use super::*;
778
/// A two-field index parses with the expected names, types and attribute flags.
#[test]
fn test_parse_simple_schema() {
    let parsed = parse_sdl(
        r#"
        index articles {
            field title: text [indexed, stored]
            field body: text [indexed]
        }
        "#,
    )
    .unwrap();
    assert_eq!(parsed.len(), 1);

    let articles = &parsed[0];
    assert_eq!(articles.name, "articles");
    assert_eq!(articles.fields.len(), 2);

    let title = &articles.fields[0];
    assert_eq!(title.name, "title");
    assert!(matches!(title.field_type, FieldType::Text));
    assert!(title.indexed);
    assert!(title.stored);

    let body = &articles.fields[1];
    assert_eq!(body.name, "body");
    assert!(matches!(body.field_type, FieldType::Text));
    assert!(body.indexed);
    assert!(!body.stored);
}
805
/// The canonical scalar type keywords map to their `FieldType` variants.
#[test]
fn test_parse_all_field_types() {
    let source = r#"
        index test {
            field text_field: text [indexed, stored]
            field u64_field: u64 [indexed, stored]
            field i64_field: i64 [indexed, stored]
            field f64_field: f64 [indexed, stored]
            field bytes_field: bytes [stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;

    let expected = [
        FieldType::Text,
        FieldType::U64,
        FieldType::I64,
        FieldType::F64,
        FieldType::Bytes,
    ];
    for (position, want) in expected.iter().enumerate() {
        assert_eq!(&fields[position].field_type, want);
    }
}
827
/// `#` comments, on their own line or trailing a field, do not affect parsing.
#[test]
fn test_parse_with_comments() {
    let source = r#"
        # This is a comment
        index articles {
            # Title field
            field title: text [indexed, stored]
            field body: text [indexed] # inline comment not supported yet
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field_count = parsed[0].fields.len();
    assert_eq!(field_count, 2);
}
842
/// Type aliases (`string`, `int`, `uint`, `float`, `binary`) resolve to the same variants.
#[test]
fn test_parse_type_aliases() {
    let source = r#"
        index test {
            field a: string [indexed]
            field b: int [indexed]
            field c: uint [indexed]
            field d: float [indexed]
            field e: binary [stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;

    let expected = [
        FieldType::Text,
        FieldType::I64,
        FieldType::U64,
        FieldType::F64,
        FieldType::Bytes,
    ];
    for (position, want) in expected.iter().enumerate() {
        assert_eq!(&fields[position].field_type, want);
    }
}
864
/// Declared fields are present in the built schema; undeclared ones are not.
#[test]
fn test_to_schema() {
    let source = r#"
        index articles {
            field title: text [indexed, stored]
            field views: u64 [indexed, stored]
        }
    "#;

    let schema = parse_sdl(source).unwrap()[0].to_schema();

    for declared in ["title", "views"] {
        assert!(schema.get_field(declared).is_some());
    }
    assert!(schema.get_field("nonexistent").is_none());
}
881
/// A field without an attribute list is both indexed and stored.
#[test]
fn test_default_attributes() {
    let source = r#"
        index test {
            field title: text
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let title = &parsed[0].fields[0];

    assert!(title.indexed);
    assert!(title.stored);
}
897
/// Several index blocks in one document are returned in declaration order.
#[test]
fn test_multiple_indexes() {
    let source = r#"
        index articles {
            field title: text [indexed, stored]
        }

        index users {
            field name: text [indexed, stored]
            field email: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 2);

    let (first, second) = (&parsed[0], &parsed[1]);
    assert_eq!(first.name, "articles");
    assert_eq!(second.name, "users");
}
916
/// `text<name>` records the tokenizer name; plain `text` records none.
#[test]
fn test_tokenizer_spec() {
    let source = r#"
        index articles {
            field title: text<en_stem> [indexed, stored]
            field body: text<default> [indexed]
            field author: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;

    let title = &fields[0];
    assert_eq!(title.name, "title");
    assert_eq!(title.tokenizer, Some("en_stem".to_string()));

    let body = &fields[1];
    assert_eq!(body.name, "body");
    assert_eq!(body.tokenizer, Some("default".to_string()));

    let author = &fields[2];
    assert_eq!(author.name, "author");
    assert_eq!(author.tokenizer, None);
}
939
/// Tokenizer names from the SDL end up on the schema's field entries.
#[test]
fn test_tokenizer_in_schema() {
    let source = r#"
        index articles {
            field title: text<german> [indexed, stored]
            field body: text<en_stem> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let schema = parsed[0].to_schema();

    let title = schema.get_field("title").unwrap();
    let title_tokenizer = &schema.get_field_entry(title).unwrap().tokenizer;
    assert_eq!(*title_tokenizer, Some("german".to_string()));

    let body = schema.get_field("body").unwrap();
    let body_tokenizer = &schema.get_field_entry(body).unwrap().tokenizer;
    assert_eq!(*body_tokenizer, Some("en_stem".to_string()));
}
960
/// A quoted-string router pattern is unescaped and all four properties are captured.
#[test]
fn test_query_router_basic() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field uri: text [indexed, stored]

            query_router {
                pattern: "10\\.\\d{4,}/[^\\s]+"
                substitution: "doi://{0}"
                target_field: uris
                mode: exclusive
            }
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let routers = &parsed[0].query_routers;
    assert_eq!(routers.len(), 1);

    let rule = &routers[0];
    assert_eq!(rule.pattern, r"10\.\d{4,}/[^\s]+");
    assert_eq!(rule.substitution, "doi://{0}");
    assert_eq!(rule.target_field, "uris");
    assert_eq!(rule.mode, RoutingMode::Exclusive);
}
987
/// A raw-string (`r"..."`) router pattern is taken verbatim.
#[test]
fn test_query_router_raw_string() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored]

            query_router {
                pattern: r"^pmid:(\d+)$"
                substitution: "pubmed://{1}"
                target_field: uris
                mode: additional
            }
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let rule = &parsed[0].query_routers[0];

    assert_eq!(rule.pattern, r"^pmid:(\d+)$");
    assert_eq!(rule.substitution, "pubmed://{1}");
    assert_eq!(rule.mode, RoutingMode::Additional);
}
1010
/// Every `query_router` block in an index produces one rule.
#[test]
fn test_multiple_query_routers() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored]

            query_router {
                pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                substitution: "doi://{1}"
                target_field: uris
                mode: exclusive
            }

            query_router {
                pattern: r"^pmid:(\d+)$"
                substitution: "pubmed://{1}"
                target_field: uris
                mode: exclusive
            }

            query_router {
                pattern: r"^arxiv:(\d+\.\d+)$"
                substitution: "arxiv://{1}"
                target_field: uris
                mode: additional
            }
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let router_count = parsed[0].query_routers.len();
    assert_eq!(router_count, 3);
}
1043
/// A router without a `mode` property uses `RoutingMode::Additional`.
#[test]
fn test_query_router_default_mode() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored]

            query_router {
                pattern: r"test"
                substitution: "{0}"
                target_field: uris
            }
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let rule = &parsed[0].query_routers[0];
    assert_eq!(rule.mode, RoutingMode::Additional);
}
1062
/// `stored<multi>` marks a field as multi-valued, both in the definition and in the schema.
#[test]
fn test_multi_attribute() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored<multi>]
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let defs = &parsed[0].fields;
    assert_eq!(defs.len(), 2);

    let (uris, title) = (&defs[0], &defs[1]);
    assert_eq!(uris.name, "uris");
    assert!(uris.multi, "uris field should have multi=true");

    assert_eq!(title.name, "title");
    assert!(!title.multi, "title field should have multi=false");

    // The flag must survive the conversion to a schema.
    let schema = parsed[0].to_schema();
    let uris_handle = schema.get_field("uris").unwrap();
    let title_handle = schema.get_field("title").unwrap();

    assert!(schema.get_field_entry(uris_handle).unwrap().multi);
    assert!(!schema.get_field_entry(title_handle).unwrap().multi);
}
1094
/// A bare `sparse_vector` field parses without any sparse vector config.
#[test]
fn test_sparse_vector_field() {
    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].fields.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::SparseVector);
    assert!(embedding.sparse_vector_config.is_none());
}
1110
/// Index size comes from `<u16|u32>` and quantization from `indexed<quantization: ...>`.
#[test]
fn test_sparse_vector_with_config() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
            field dense: sparse_vector<u32> [indexed<quantization: float32>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    let first = &fields[0];
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);

    let second = &fields[1];
    assert_eq!(second.name, "dense");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float32);
}
1137
/// `weight_threshold` is parsed alongside quantization for sparse vectors.
#[test]
fn test_sparse_vector_with_weight_threshold() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
            field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    let first = &fields[0];
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);
    assert!((first_cfg.weight_threshold - 0.1).abs() < 0.001);

    let second = &fields[1];
    assert_eq!(second.name, "embedding2");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float16);
    assert!((second_cfg.weight_threshold - 0.05).abs() < 0.001);
}
1166
/// `dense_vector<768>` yields a dense vector field with dimension 768.
#[test]
fn test_dense_vector_field() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0].fields.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense.dim, 768);
}
1186
/// `vector<N>` is an alias for `dense_vector<N>`.
#[test]
fn test_dense_vector_alias() {
    let source = r#"
        index documents {
            field embedding: vector<1536> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];

    assert_eq!(embedding.field_type, FieldType::DenseVector);
    let dim = embedding.dense_vector_config.as_ref().unwrap().dim;
    assert_eq!(dim, 1536);
}
1206
/// `num_clusters` is applied; `nprobe` stays at 32 when it is not given.
#[test]
fn test_dense_vector_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 32);
}
1227
/// Both `num_clusters` and `nprobe` are applied when given together.
#[test]
fn test_dense_vector_with_num_clusters_and_nprobe() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(512));
    assert_eq!(dense.nprobe, 64);
}
1243
/// The keyword form `dense_vector<dims: N>` sets the dimension and nothing else.
#[test]
fn test_dense_vector_keyword_syntax() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert!(dense.num_clusters.is_none());
}
1258
/// Keyword dimension combined with a fully specified index configuration.
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1274
/// Keyword dimension with only `num_clusters` given; `nprobe` stays at 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(128));
    assert_eq!(dense.nprobe, 32);
}
1290
/// `indexed<scann, ...>` selects the ScaNN index type together with its options.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1309
/// Checks that `indexed<ivf_rabitq, ...>` selects the `IvfRaBitQ` index type
/// and records the requested cluster count.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    // The `ivf_rabitq` keyword must map onto the `IvfRaBitQ` variant.
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense.num_clusters, Some(512));
}
1327
/// Checks that a bare `indexed<rabitq>` selects the `RaBitQ` index type and
/// leaves the cluster count unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
    // No `num_clusters` argument was written in the SDL.
    assert!(dense.num_clusters.is_none());
}
1345
/// Checks that `indexed<flat>` selects the `Flat` index type.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    // The `flat` keyword must map onto the `Flat` variant.
    assert_eq!(dense.index_type, VectorIndexType::Flat);
}
1362
/// Checks which index type is chosen when `[indexed]` is written without any
/// `<...>` arguments.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    // With no explicit index kind, the test expects `RaBitQ`.
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
1380
/// Checks that `indexed<mrl_dim: N>` is stored on the config and that
/// `index_dim()` then reports that value instead of the full dimension.
#[test]
fn test_dense_vector_mrl_dim() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    // The declared dimension stays at 1536 ...
    assert_eq!(dense.dim, 1536);
    // ... while the `mrl_dim` argument is what `index_dim()` returns.
    assert_eq!(dense.mrl_dim, Some(256));
    assert_eq!(dense.index_dim(), 256);
}
1397
/// Checks that `mrl_dim` can be combined with an index kind, `num_clusters`
/// and `nprobe` inside a single `indexed<...>` attribute.
#[test]
fn test_dense_vector_mrl_dim_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    // Dimension-related values.
    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, Some(128));
    assert_eq!(dense.index_dim(), 128);
    // Clustering-related values from the same attribute.
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1416
/// Checks that, without an `mrl_dim` argument, `mrl_dim` is `None` and
/// `index_dim()` reports the full declared dimension.
#[test]
fn test_dense_vector_no_mrl_dim() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, None);
    // With no `mrl_dim`, the index dimension equals the declared one.
    assert_eq!(dense.index_dim(), 768);
}
1433
/// Verifies that `json` fields parse both with and without an attribute list,
/// and that a stored JSON field survives conversion to a `Schema`.
///
/// Field types are compared with `assert_eq!` throughout so that a failure
/// prints the variant that was actually produced.
#[test]
fn test_json_field_type() {
    let sdl = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let index = &indexes[0];

    assert_eq!(index.fields.len(), 3);

    // `metadata` carries an explicit `[stored]` attribute.
    assert_eq!(index.fields[1].name, "metadata");
    assert_eq!(index.fields[1].field_type, FieldType::Json);
    assert!(index.fields[1].stored);

    // `extra` is declared with no attribute list at all.
    assert_eq!(index.fields[2].name, "extra");
    assert_eq!(index.fields[2].field_type, FieldType::Json);

    // Round-trip through the schema builder and inspect the resulting entry.
    let schema = index.to_schema();
    let metadata_field = schema.get_field("metadata").unwrap();
    let entry = schema.get_field_entry(metadata_field).unwrap();
    assert_eq!(entry.field_type, FieldType::Json);
    // `to_schema` passes only the `stored` flag for JSON fields, so the test
    // expects the entry to be not indexed.
    assert!(!entry.indexed);
    assert!(entry.stored);
}
1467
/// Parses a sparse vector field whose `indexed<...>` attribute nests a
/// `query<...>` block, then checks the query settings twice: on the parsed
/// field definition and on the entry produced by `to_schema`.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    // Tokenizer name expected back from both the definition and the schema.
    const EXPECTED_TOKENIZER: &str = "Alibaba-NLP/gte-Qwen2-1.5B-instruct";

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];

    // Shape of the parsed definition.
    assert_eq!(index.fields.len(), 1);
    let embedding = &index.fields[0];
    assert_eq!(embedding.name, "embedding");
    assert!(matches!(embedding.field_type, FieldType::SparseVector));

    // Index-time settings: `<u16>` and `quantization: uint8`.
    let sparse = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.index_size, IndexSize::U16);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings from the nested `query<...>` block.
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.tokenizer.as_deref(), Some(EXPECTED_TOKENIZER));
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same query settings must still be present after schema conversion.
    let schema = index.to_schema();
    let field_handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(field_handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.tokenizer.as_deref(), Some(EXPECTED_TOKENIZER));
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1512
/// Checks a `query<weighting: one>` block that omits the tokenizer: the
/// tokenizer must be absent and the weighting must be `One`.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse = embedding.sparse_vector_config.as_ref().unwrap();

    let query = sparse.query_config.as_ref().unwrap();
    // No `tokenizer:` argument appears in the SDL.
    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1530}