1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Parser for the schema definition language (SDL).
///
/// The parsing code and the accompanying `Rule` enum are generated by
/// `pest_derive` from the grammar file `dsl/sdl/sdl.pest`.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// A single `field` declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Resolved field type (aliases such as `string` or `int` are already mapped).
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is present.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is present.
    pub stored: bool,
    /// Tokenizer name from a tokenizer spec (e.g. `text<en_stem>`), if one was given.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued (set by a configured `stored<...>` attribute).
    pub multi: bool,
    /// Position mode taken from the `indexed<...>` configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse-vector settings, present when a sparse config or index config was parsed.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense-vector settings; `IndexDef::to_schema` requires this for dense-vector fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// A complete `index` definition parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration; empty when none was given.
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95 pub fn to_schema(&self) -> Schema {
97 let mut builder = SchemaBuilder::default();
98
99 for field in &self.fields {
100 let f = match field.field_type {
101 FieldType::Text => {
102 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103 builder.add_text_field_with_tokenizer(
104 &field.name,
105 field.indexed,
106 field.stored,
107 tokenizer,
108 )
109 }
110 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114 FieldType::Json => builder.add_json_field(&field.name, field.stored),
115 FieldType::SparseVector => {
116 if let Some(config) = &field.sparse_vector_config {
117 builder.add_sparse_vector_field_with_config(
118 &field.name,
119 field.indexed,
120 field.stored,
121 config.clone(),
122 )
123 } else {
124 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125 }
126 }
127 FieldType::DenseVector => {
128 let config = field
130 .dense_vector_config
131 .as_ref()
132 .expect("DenseVector field requires dimension to be specified");
133 builder.add_dense_vector_field_with_config(
134 &field.name,
135 field.indexed,
136 field.stored,
137 config.clone(),
138 )
139 }
140 };
141 if field.multi {
142 builder.set_multi(f, true);
143 }
144 let positions = field.positions.or({
146 if field.multi
148 && matches!(
149 field.field_type,
150 FieldType::SparseVector | FieldType::DenseVector
151 )
152 {
153 Some(super::schema::PositionMode::Ordinal)
154 } else {
155 None
156 }
157 });
158 if let Some(mode) = positions {
159 builder.set_positions(f, mode);
160 }
161 }
162
163 if !self.default_fields.is_empty() {
165 builder.set_default_fields(self.default_fields.clone());
166 }
167
168 if !self.query_routers.is_empty() {
170 builder.set_query_routers(self.query_routers.clone());
171 }
172
173 builder.build()
174 }
175
176 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181 if self.query_routers.is_empty() {
182 return Ok(None);
183 }
184
185 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186 .map(Some)
187 .map_err(Error::Schema)
188 }
189}
190
191fn parse_field_type(type_str: &str) -> Result<FieldType> {
193 match type_str {
194 "text" | "string" | "str" => Ok(FieldType::Text),
195 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196 "i64" | "int" | "integer" => Ok(FieldType::I64),
197 "f64" | "float" | "double" => Ok(FieldType::F64),
198 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199 "json" => Ok(FieldType::Json),
200 "sparse_vector" => Ok(FieldType::SparseVector),
201 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203 }
204}
205
/// Options collected from an `indexed<...>` attribute.
///
/// Every option is `None` until the corresponding parameter is seen. The values
/// are later copied onto a dense or sparse vector config, and `positions` is
/// copied onto the field definition.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Applied by `apply_index_config_to_dense_vector`.
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    mrl_dim: Option<usize>,
    build_threshold: Option<usize>,
    // Applied by `apply_index_config_to_sparse_vector`.
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    block_size: Option<usize>,
    // Query-time settings parsed from a query config block (sparse vectors).
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    // Position mode requested for the field.
    positions: Option<super::schema::PositionMode>,
}
224
225fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
230 let mut indexed = false;
231 let mut stored = false;
232 let mut multi = false;
233 let mut index_config = None;
234
235 for attr in pair.into_inner() {
236 if attr.as_rule() == Rule::attribute {
237 let mut found_config = false;
239 for inner in attr.clone().into_inner() {
240 match inner.as_rule() {
241 Rule::indexed_with_config => {
242 indexed = true;
243 index_config = Some(parse_index_config(inner));
244 found_config = true;
245 break;
246 }
247 Rule::stored_with_config => {
248 stored = true;
249 multi = true; found_config = true;
251 break;
252 }
253 _ => {}
254 }
255 }
256 if !found_config {
257 match attr.as_str() {
259 "indexed" => indexed = true,
260 "stored" => stored = true,
261 _ => {}
262 }
263 }
264 }
265 }
266
267 (indexed, stored, multi, index_config)
268}
269
270fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
272 let mut config = IndexConfig::default();
273
274 for inner in pair.into_inner() {
279 if inner.as_rule() == Rule::index_config_params {
280 for param in inner.into_inner() {
281 if param.as_rule() == Rule::index_config_param {
282 for p in param.into_inner() {
283 parse_single_index_config_param(&mut config, p);
284 }
285 }
286 }
287 }
288 }
289
290 config
291}
292
293fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
295 use super::schema::VectorIndexType;
296
297 match p.as_rule() {
298 Rule::index_type_spec => {
299 config.index_type = Some(match p.as_str() {
300 "flat" => VectorIndexType::Flat,
301 "rabitq" => VectorIndexType::RaBitQ,
302 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
303 "scann" => VectorIndexType::ScaNN,
304 _ => VectorIndexType::RaBitQ,
305 });
306 }
307 Rule::index_type_kwarg => {
308 if let Some(t) = p.into_inner().next() {
310 config.index_type = Some(match t.as_str() {
311 "flat" => VectorIndexType::Flat,
312 "rabitq" => VectorIndexType::RaBitQ,
313 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
314 "scann" => VectorIndexType::ScaNN,
315 _ => VectorIndexType::RaBitQ,
316 });
317 }
318 }
319 Rule::num_clusters_kwarg => {
320 if let Some(n) = p.into_inner().next() {
322 config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
323 }
324 }
325 Rule::build_threshold_kwarg => {
326 if let Some(n) = p.into_inner().next() {
328 config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
329 }
330 }
331 Rule::nprobe_kwarg => {
332 if let Some(n) = p.into_inner().next() {
334 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
335 }
336 }
337 Rule::mrl_dim_kwarg => {
338 if let Some(n) = p.into_inner().next() {
340 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
341 }
342 }
343 Rule::quantization_kwarg => {
344 if let Some(q) = p.into_inner().next() {
346 config.quantization = Some(match q.as_str() {
347 "float32" | "f32" => WeightQuantization::Float32,
348 "float16" | "f16" => WeightQuantization::Float16,
349 "uint8" | "u8" => WeightQuantization::UInt8,
350 "uint4" | "u4" => WeightQuantization::UInt4,
351 _ => WeightQuantization::default(),
352 });
353 }
354 }
355 Rule::weight_threshold_kwarg => {
356 if let Some(t) = p.into_inner().next() {
358 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
359 }
360 }
361 Rule::block_size_kwarg => {
362 if let Some(n) = p.into_inner().next() {
364 config.block_size = Some(n.as_str().parse().unwrap_or(128));
365 }
366 }
367 Rule::query_config_block => {
368 parse_query_config_block(config, p);
370 }
371 Rule::positions_kwarg => {
372 use super::schema::PositionMode;
374 config.positions = Some(match p.as_str() {
375 "ordinal" => PositionMode::Ordinal,
376 "token_position" => PositionMode::TokenPosition,
377 _ => PositionMode::Full, });
379 }
380 _ => {}
381 }
382}
383
384fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
386 for inner in pair.into_inner() {
387 if inner.as_rule() == Rule::query_config_params {
388 for param in inner.into_inner() {
389 if param.as_rule() == Rule::query_config_param {
390 for p in param.into_inner() {
391 match p.as_rule() {
392 Rule::query_tokenizer_kwarg => {
393 if let Some(path) = p.into_inner().next()
395 && let Some(inner_path) = path.into_inner().next()
396 {
397 config.query_tokenizer = Some(inner_path.as_str().to_string());
398 }
399 }
400 Rule::query_weighting_kwarg => {
401 if let Some(w) = p.into_inner().next() {
403 config.query_weighting = Some(match w.as_str() {
404 "one" => QueryWeighting::One,
405 "idf" => QueryWeighting::Idf,
406 _ => QueryWeighting::One,
407 });
408 }
409 }
410 _ => {}
411 }
412 }
413 }
414 }
415 }
416 }
417}
418
419fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
421 let mut inner = pair.into_inner();
422
423 let name = inner
424 .next()
425 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
426 .as_str()
427 .to_string();
428
429 let field_type_str = inner
430 .next()
431 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
432 .as_str();
433
434 let field_type = parse_field_type(field_type_str)?;
435
436 let mut tokenizer = None;
438 let mut sparse_vector_config = None;
439 let mut dense_vector_config = None;
440 let mut indexed = true;
441 let mut stored = true;
442 let mut multi = false;
443 let mut index_config: Option<IndexConfig> = None;
444
445 for item in inner {
446 match item.as_rule() {
447 Rule::tokenizer_spec => {
448 if let Some(tok_name) = item.into_inner().next() {
450 tokenizer = Some(tok_name.as_str().to_string());
451 }
452 }
453 Rule::sparse_vector_config => {
454 sparse_vector_config = Some(parse_sparse_vector_config(item));
456 }
457 Rule::dense_vector_config => {
458 dense_vector_config = Some(parse_dense_vector_config(item));
460 }
461 Rule::attributes => {
462 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
463 indexed = idx;
464 stored = sto;
465 multi = mul;
466 index_config = idx_cfg;
467 }
468 _ => {}
469 }
470 }
471
472 let mut positions = None;
474 if let Some(idx_cfg) = index_config {
475 positions = idx_cfg.positions;
476 if let Some(ref mut dv_config) = dense_vector_config {
477 apply_index_config_to_dense_vector(dv_config, idx_cfg);
478 } else if field_type == FieldType::SparseVector {
479 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
481 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
482 }
483 }
484
485 Ok(FieldDef {
486 name,
487 field_type,
488 indexed,
489 stored,
490 tokenizer,
491 multi,
492 positions,
493 sparse_vector_config,
494 dense_vector_config,
495 })
496}
497
498fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
500 if let Some(index_type) = idx_cfg.index_type {
502 config.index_type = index_type;
503 }
504
505 if idx_cfg.num_clusters.is_some() {
507 config.num_clusters = idx_cfg.num_clusters;
508 }
509
510 if let Some(nprobe) = idx_cfg.nprobe {
512 config.nprobe = nprobe;
513 }
514
515 if idx_cfg.mrl_dim.is_some() {
517 config.mrl_dim = idx_cfg.mrl_dim;
518 }
519
520 if idx_cfg.build_threshold.is_some() {
522 config.build_threshold = idx_cfg.build_threshold;
523 }
524}
525
526fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
529 let mut index_size = IndexSize::default();
530
531 for inner in pair.into_inner() {
533 if inner.as_rule() == Rule::index_size_spec {
534 index_size = match inner.as_str() {
535 "u16" => IndexSize::U16,
536 "u32" => IndexSize::U32,
537 _ => IndexSize::default(),
538 };
539 }
540 }
541
542 SparseVectorConfig {
543 index_size,
544 weight_quantization: WeightQuantization::default(),
545 weight_threshold: 0.0,
546 block_size: 128,
547 posting_list_pruning: None,
548 query_config: None,
549 }
550}
551
552fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
554 if let Some(q) = idx_cfg.quantization {
555 config.weight_quantization = q;
556 }
557 if let Some(t) = idx_cfg.weight_threshold {
558 config.weight_threshold = t;
559 }
560 if let Some(bs) = idx_cfg.block_size {
561 config.block_size = bs.next_power_of_two();
562 }
563 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
565 let query_config = config
566 .query_config
567 .get_or_insert(SparseQueryConfig::default());
568 if let Some(tokenizer) = idx_cfg.query_tokenizer {
569 query_config.tokenizer = Some(tokenizer);
570 }
571 if let Some(weighting) = idx_cfg.query_weighting {
572 query_config.weighting = weighting;
573 }
574 }
575}
576
577fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
580 let mut dim: usize = 0;
581
582 for params in pair.into_inner() {
584 if params.as_rule() == Rule::dense_vector_params {
585 for inner in params.into_inner() {
586 match inner.as_rule() {
587 Rule::dense_vector_keyword_params => {
588 for kwarg in inner.into_inner() {
590 if kwarg.as_rule() == Rule::dims_kwarg
591 && let Some(d) = kwarg.into_inner().next()
592 {
593 dim = d.as_str().parse().unwrap_or(0);
594 }
595 }
596 }
597 Rule::dense_vector_positional_params => {
598 if let Some(dim_pair) = inner.into_inner().next() {
600 dim = dim_pair.as_str().parse().unwrap_or(0);
601 }
602 }
603 _ => {}
604 }
605 }
606 }
607 }
608
609 DenseVectorConfig::new(dim)
610}
611
612fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
614 pair.into_inner().map(|p| p.as_str().to_string()).collect()
615}
616
617fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
619 let mut pattern = String::new();
620 let mut substitution = String::new();
621 let mut target_field = String::new();
622 let mut mode = RoutingMode::Additional;
623
624 for prop in pair.into_inner() {
625 if prop.as_rule() != Rule::query_router_prop {
626 continue;
627 }
628
629 for inner in prop.into_inner() {
630 match inner.as_rule() {
631 Rule::query_router_pattern => {
632 if let Some(regex_str) = inner.into_inner().next() {
633 pattern = parse_string_value(regex_str);
634 }
635 }
636 Rule::query_router_substitution => {
637 if let Some(quoted) = inner.into_inner().next() {
638 substitution = parse_string_value(quoted);
639 }
640 }
641 Rule::query_router_target => {
642 if let Some(ident) = inner.into_inner().next() {
643 target_field = ident.as_str().to_string();
644 }
645 }
646 Rule::query_router_mode => {
647 if let Some(mode_val) = inner.into_inner().next() {
648 mode = match mode_val.as_str() {
649 "exclusive" => RoutingMode::Exclusive,
650 "additional" => RoutingMode::Additional,
651 _ => RoutingMode::Additional,
652 };
653 }
654 }
655 _ => {}
656 }
657 }
658 }
659
660 if pattern.is_empty() {
661 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
662 }
663 if substitution.is_empty() {
664 return Err(Error::Schema(
665 "query_router missing 'substitution'".to_string(),
666 ));
667 }
668 if target_field.is_empty() {
669 return Err(Error::Schema(
670 "query_router missing 'target_field'".to_string(),
671 ));
672 }
673
674 Ok(QueryRouterRule {
675 pattern,
676 substitution,
677 target_field,
678 mode,
679 })
680}
681
682fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
684 let s = pair.as_str();
685 match pair.as_rule() {
686 Rule::regex_string => {
687 if let Some(inner) = pair.into_inner().next() {
689 parse_string_value(inner)
690 } else {
691 s.to_string()
692 }
693 }
694 Rule::raw_string => {
695 s[2..s.len() - 1].to_string()
697 }
698 Rule::quoted_string => {
699 let inner = &s[1..s.len() - 1];
701 inner
703 .replace("\\n", "\n")
704 .replace("\\t", "\t")
705 .replace("\\\"", "\"")
706 .replace("\\\\", "\\")
707 }
708 _ => s.to_string(),
709 }
710}
711
712fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
714 let mut inner = pair.into_inner();
715
716 let name = inner
717 .next()
718 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
719 .as_str()
720 .to_string();
721
722 let mut fields = Vec::new();
723 let mut default_fields = Vec::new();
724 let mut query_routers = Vec::new();
725
726 for item in inner {
727 match item.as_rule() {
728 Rule::field_def => {
729 fields.push(parse_field_def(item)?);
730 }
731 Rule::default_fields_def => {
732 default_fields = parse_default_fields_def(item);
733 }
734 Rule::query_router_def => {
735 query_routers.push(parse_query_router_def(item)?);
736 }
737 _ => {}
738 }
739 }
740
741 Ok(IndexDef {
742 name,
743 fields,
744 default_fields,
745 query_routers,
746 })
747}
748
749pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
751 let pairs = SdlParser::parse(Rule::file, input)
752 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
753
754 let mut indexes = Vec::new();
755
756 for pair in pairs {
757 if pair.as_rule() == Rule::file {
758 for inner in pair.into_inner() {
759 if inner.as_rule() == Rule::index_def {
760 indexes.push(parse_index_def(inner)?);
761 }
762 }
763 }
764 }
765
766 Ok(indexes)
767}
768
769pub fn parse_single_index(input: &str) -> Result<IndexDef> {
771 let indexes = parse_sdl(input)?;
772
773 if indexes.is_empty() {
774 return Err(Error::Schema("No index definition found".to_string()));
775 }
776
777 if indexes.len() > 1 {
778 return Err(Error::Schema(
779 "Multiple index definitions found, expected one".to_string(),
780 ));
781 }
782
783 Ok(indexes.into_iter().next().unwrap())
784}
785
786#[cfg(test)]
787mod tests {
788 use super::*;
789
790 #[test]
791 fn test_parse_simple_schema() {
792 let sdl = r#"
793 index articles {
794 field title: text [indexed, stored]
795 field body: text [indexed]
796 }
797 "#;
798
799 let indexes = parse_sdl(sdl).unwrap();
800 assert_eq!(indexes.len(), 1);
801
802 let index = &indexes[0];
803 assert_eq!(index.name, "articles");
804 assert_eq!(index.fields.len(), 2);
805
806 assert_eq!(index.fields[0].name, "title");
807 assert!(matches!(index.fields[0].field_type, FieldType::Text));
808 assert!(index.fields[0].indexed);
809 assert!(index.fields[0].stored);
810
811 assert_eq!(index.fields[1].name, "body");
812 assert!(matches!(index.fields[1].field_type, FieldType::Text));
813 assert!(index.fields[1].indexed);
814 assert!(!index.fields[1].stored);
815 }
816
817 #[test]
818 fn test_parse_all_field_types() {
819 let sdl = r#"
820 index test {
821 field text_field: text [indexed, stored]
822 field u64_field: u64 [indexed, stored]
823 field i64_field: i64 [indexed, stored]
824 field f64_field: f64 [indexed, stored]
825 field bytes_field: bytes [stored]
826 }
827 "#;
828
829 let indexes = parse_sdl(sdl).unwrap();
830 let index = &indexes[0];
831
832 assert!(matches!(index.fields[0].field_type, FieldType::Text));
833 assert!(matches!(index.fields[1].field_type, FieldType::U64));
834 assert!(matches!(index.fields[2].field_type, FieldType::I64));
835 assert!(matches!(index.fields[3].field_type, FieldType::F64));
836 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
837 }
838
839 #[test]
840 fn test_parse_with_comments() {
841 let sdl = r#"
842 # This is a comment
843 index articles {
844 # Title field
845 field title: text [indexed, stored]
846 field body: text [indexed] # inline comment not supported yet
847 }
848 "#;
849
850 let indexes = parse_sdl(sdl).unwrap();
851 assert_eq!(indexes[0].fields.len(), 2);
852 }
853
854 #[test]
855 fn test_parse_type_aliases() {
856 let sdl = r#"
857 index test {
858 field a: string [indexed]
859 field b: int [indexed]
860 field c: uint [indexed]
861 field d: float [indexed]
862 field e: binary [stored]
863 }
864 "#;
865
866 let indexes = parse_sdl(sdl).unwrap();
867 let index = &indexes[0];
868
869 assert!(matches!(index.fields[0].field_type, FieldType::Text));
870 assert!(matches!(index.fields[1].field_type, FieldType::I64));
871 assert!(matches!(index.fields[2].field_type, FieldType::U64));
872 assert!(matches!(index.fields[3].field_type, FieldType::F64));
873 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
874 }
875
876 #[test]
877 fn test_to_schema() {
878 let sdl = r#"
879 index articles {
880 field title: text [indexed, stored]
881 field views: u64 [indexed, stored]
882 }
883 "#;
884
885 let indexes = parse_sdl(sdl).unwrap();
886 let schema = indexes[0].to_schema();
887
888 assert!(schema.get_field("title").is_some());
889 assert!(schema.get_field("views").is_some());
890 assert!(schema.get_field("nonexistent").is_none());
891 }
892
893 #[test]
894 fn test_default_attributes() {
895 let sdl = r#"
896 index test {
897 field title: text
898 }
899 "#;
900
901 let indexes = parse_sdl(sdl).unwrap();
902 let field = &indexes[0].fields[0];
903
904 assert!(field.indexed);
906 assert!(field.stored);
907 }
908
909 #[test]
910 fn test_multiple_indexes() {
911 let sdl = r#"
912 index articles {
913 field title: text [indexed, stored]
914 }
915
916 index users {
917 field name: text [indexed, stored]
918 field email: text [indexed, stored]
919 }
920 "#;
921
922 let indexes = parse_sdl(sdl).unwrap();
923 assert_eq!(indexes.len(), 2);
924 assert_eq!(indexes[0].name, "articles");
925 assert_eq!(indexes[1].name, "users");
926 }
927
928 #[test]
929 fn test_tokenizer_spec() {
930 let sdl = r#"
931 index articles {
932 field title: text<en_stem> [indexed, stored]
933 field body: text<default> [indexed]
934 field author: text [indexed, stored]
935 }
936 "#;
937
938 let indexes = parse_sdl(sdl).unwrap();
939 let index = &indexes[0];
940
941 assert_eq!(index.fields[0].name, "title");
942 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
943
944 assert_eq!(index.fields[1].name, "body");
945 assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
946
947 assert_eq!(index.fields[2].name, "author");
948 assert_eq!(index.fields[2].tokenizer, None); }
950
951 #[test]
952 fn test_tokenizer_in_schema() {
953 let sdl = r#"
954 index articles {
955 field title: text<german> [indexed, stored]
956 field body: text<en_stem> [indexed]
957 }
958 "#;
959
960 let indexes = parse_sdl(sdl).unwrap();
961 let schema = indexes[0].to_schema();
962
963 let title_field = schema.get_field("title").unwrap();
964 let title_entry = schema.get_field_entry(title_field).unwrap();
965 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
966
967 let body_field = schema.get_field("body").unwrap();
968 let body_entry = schema.get_field_entry(body_field).unwrap();
969 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
970 }
971
972 #[test]
973 fn test_query_router_basic() {
974 let sdl = r#"
975 index documents {
976 field title: text [indexed, stored]
977 field uri: text [indexed, stored]
978
979 query_router {
980 pattern: "10\\.\\d{4,}/[^\\s]+"
981 substitution: "doi://{0}"
982 target_field: uris
983 mode: exclusive
984 }
985 }
986 "#;
987
988 let indexes = parse_sdl(sdl).unwrap();
989 let index = &indexes[0];
990
991 assert_eq!(index.query_routers.len(), 1);
992 let router = &index.query_routers[0];
993 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
994 assert_eq!(router.substitution, "doi://{0}");
995 assert_eq!(router.target_field, "uris");
996 assert_eq!(router.mode, RoutingMode::Exclusive);
997 }
998
999 #[test]
1000 fn test_query_router_raw_string() {
1001 let sdl = r#"
1002 index documents {
1003 field uris: text [indexed, stored]
1004
1005 query_router {
1006 pattern: r"^pmid:(\d+)$"
1007 substitution: "pubmed://{1}"
1008 target_field: uris
1009 mode: additional
1010 }
1011 }
1012 "#;
1013
1014 let indexes = parse_sdl(sdl).unwrap();
1015 let router = &indexes[0].query_routers[0];
1016
1017 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1018 assert_eq!(router.substitution, "pubmed://{1}");
1019 assert_eq!(router.mode, RoutingMode::Additional);
1020 }
1021
1022 #[test]
1023 fn test_multiple_query_routers() {
1024 let sdl = r#"
1025 index documents {
1026 field uris: text [indexed, stored]
1027
1028 query_router {
1029 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1030 substitution: "doi://{1}"
1031 target_field: uris
1032 mode: exclusive
1033 }
1034
1035 query_router {
1036 pattern: r"^pmid:(\d+)$"
1037 substitution: "pubmed://{1}"
1038 target_field: uris
1039 mode: exclusive
1040 }
1041
1042 query_router {
1043 pattern: r"^arxiv:(\d+\.\d+)$"
1044 substitution: "arxiv://{1}"
1045 target_field: uris
1046 mode: additional
1047 }
1048 }
1049 "#;
1050
1051 let indexes = parse_sdl(sdl).unwrap();
1052 assert_eq!(indexes[0].query_routers.len(), 3);
1053 }
1054
1055 #[test]
1056 fn test_query_router_default_mode() {
1057 let sdl = r#"
1058 index documents {
1059 field uris: text [indexed, stored]
1060
1061 query_router {
1062 pattern: r"test"
1063 substitution: "{0}"
1064 target_field: uris
1065 }
1066 }
1067 "#;
1068
1069 let indexes = parse_sdl(sdl).unwrap();
1070 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1072 }
1073
1074 #[test]
1075 fn test_multi_attribute() {
1076 let sdl = r#"
1077 index documents {
1078 field uris: text [indexed, stored<multi>]
1079 field title: text [indexed, stored]
1080 }
1081 "#;
1082
1083 let indexes = parse_sdl(sdl).unwrap();
1084 assert_eq!(indexes.len(), 1);
1085
1086 let fields = &indexes[0].fields;
1087 assert_eq!(fields.len(), 2);
1088
1089 assert_eq!(fields[0].name, "uris");
1091 assert!(fields[0].multi, "uris field should have multi=true");
1092
1093 assert_eq!(fields[1].name, "title");
1095 assert!(!fields[1].multi, "title field should have multi=false");
1096
1097 let schema = indexes[0].to_schema();
1099 let uris_field = schema.get_field("uris").unwrap();
1100 let title_field = schema.get_field("title").unwrap();
1101
1102 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1103 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1104 }
1105
1106 #[test]
1107 fn test_sparse_vector_field() {
1108 let sdl = r#"
1109 index documents {
1110 field embedding: sparse_vector [indexed, stored]
1111 }
1112 "#;
1113
1114 let indexes = parse_sdl(sdl).unwrap();
1115 assert_eq!(indexes.len(), 1);
1116 assert_eq!(indexes[0].fields.len(), 1);
1117 assert_eq!(indexes[0].fields[0].name, "embedding");
1118 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1119 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1120 }
1121
1122 #[test]
1123 fn test_sparse_vector_with_config() {
1124 let sdl = r#"
1125 index documents {
1126 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1127 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1128 }
1129 "#;
1130
1131 let indexes = parse_sdl(sdl).unwrap();
1132 assert_eq!(indexes[0].fields.len(), 2);
1133
1134 let f1 = &indexes[0].fields[0];
1136 assert_eq!(f1.name, "embedding");
1137 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1138 assert_eq!(config1.index_size, IndexSize::U16);
1139 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1140
1141 let f2 = &indexes[0].fields[1];
1143 assert_eq!(f2.name, "dense");
1144 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1145 assert_eq!(config2.index_size, IndexSize::U32);
1146 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1147 }
1148
1149 #[test]
1150 fn test_sparse_vector_with_weight_threshold() {
1151 let sdl = r#"
1152 index documents {
1153 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1154 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1155 }
1156 "#;
1157
1158 let indexes = parse_sdl(sdl).unwrap();
1159 assert_eq!(indexes[0].fields.len(), 2);
1160
1161 let f1 = &indexes[0].fields[0];
1163 assert_eq!(f1.name, "embedding");
1164 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1165 assert_eq!(config1.index_size, IndexSize::U16);
1166 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1167 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1168
1169 let f2 = &indexes[0].fields[1];
1171 assert_eq!(f2.name, "embedding2");
1172 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1173 assert_eq!(config2.index_size, IndexSize::U32);
1174 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1175 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1176 }
1177
1178 #[test]
1179 fn test_dense_vector_field() {
1180 let sdl = r#"
1181 index documents {
1182 field embedding: dense_vector<768> [indexed, stored]
1183 }
1184 "#;
1185
1186 let indexes = parse_sdl(sdl).unwrap();
1187 assert_eq!(indexes.len(), 1);
1188 assert_eq!(indexes[0].fields.len(), 1);
1189
1190 let f = &indexes[0].fields[0];
1191 assert_eq!(f.name, "embedding");
1192 assert_eq!(f.field_type, FieldType::DenseVector);
1193
1194 let config = f.dense_vector_config.as_ref().unwrap();
1195 assert_eq!(config.dim, 768);
1196 }
1197
1198 #[test]
1199 fn test_dense_vector_alias() {
1200 let sdl = r#"
1201 index documents {
1202 field embedding: vector<1536> [indexed]
1203 }
1204 "#;
1205
1206 let indexes = parse_sdl(sdl).unwrap();
1207 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1208 assert_eq!(
1209 indexes[0].fields[0]
1210 .dense_vector_config
1211 .as_ref()
1212 .unwrap()
1213 .dim,
1214 1536
1215 );
1216 }
1217
1218 #[test]
1219 fn test_dense_vector_with_num_clusters() {
1220 let sdl = r#"
1221 index documents {
1222 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1223 }
1224 "#;
1225
1226 let indexes = parse_sdl(sdl).unwrap();
1227 assert_eq!(indexes.len(), 1);
1228
1229 let f = &indexes[0].fields[0];
1230 assert_eq!(f.name, "embedding");
1231 assert_eq!(f.field_type, FieldType::DenseVector);
1232
1233 let config = f.dense_vector_config.as_ref().unwrap();
1234 assert_eq!(config.dim, 768);
1235 assert_eq!(config.num_clusters, Some(256));
1236 assert_eq!(config.nprobe, 32); }
1238
1239 #[test]
1240 fn test_dense_vector_with_num_clusters_and_nprobe() {
1241 let sdl = r#"
1242 index documents {
1243 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1244 }
1245 "#;
1246
1247 let indexes = parse_sdl(sdl).unwrap();
1248 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1249
1250 assert_eq!(config.dim, 1536);
1251 assert_eq!(config.num_clusters, Some(512));
1252 assert_eq!(config.nprobe, 64);
1253 }
1254
1255 #[test]
1256 fn test_dense_vector_keyword_syntax() {
1257 let sdl = r#"
1258 index documents {
1259 field embedding: dense_vector<dims: 1536> [indexed, stored]
1260 }
1261 "#;
1262
1263 let indexes = parse_sdl(sdl).unwrap();
1264 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1265
1266 assert_eq!(config.dim, 1536);
1267 assert!(config.num_clusters.is_none());
1268 }
1269
1270 #[test]
1271 fn test_dense_vector_keyword_syntax_full() {
1272 let sdl = r#"
1273 index documents {
1274 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1275 }
1276 "#;
1277
1278 let indexes = parse_sdl(sdl).unwrap();
1279 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1280
1281 assert_eq!(config.dim, 1536);
1282 assert_eq!(config.num_clusters, Some(256));
1283 assert_eq!(config.nprobe, 64);
1284 }
1285
1286 #[test]
1287 fn test_dense_vector_keyword_syntax_partial() {
1288 let sdl = r#"
1289 index documents {
1290 field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1291 }
1292 "#;
1293
1294 let indexes = parse_sdl(sdl).unwrap();
1295 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1296
1297 assert_eq!(config.dim, 768);
1298 assert_eq!(config.num_clusters, Some(128));
1299 assert_eq!(config.nprobe, 32); }
1301
1302 #[test]
1303 fn test_dense_vector_scann_index() {
1304 use crate::dsl::schema::VectorIndexType;
1305
1306 let sdl = r#"
1307 index documents {
1308 field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1309 }
1310 "#;
1311
1312 let indexes = parse_sdl(sdl).unwrap();
1313 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1314
1315 assert_eq!(config.dim, 768);
1316 assert_eq!(config.index_type, VectorIndexType::ScaNN);
1317 assert_eq!(config.num_clusters, Some(256));
1318 assert_eq!(config.nprobe, 64);
1319 }
1320
1321 #[test]
1322 fn test_dense_vector_ivf_rabitq_index() {
1323 use crate::dsl::schema::VectorIndexType;
1324
1325 let sdl = r#"
1326 index documents {
1327 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1328 }
1329 "#;
1330
1331 let indexes = parse_sdl(sdl).unwrap();
1332 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1333
1334 assert_eq!(config.dim, 1536);
1335 assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1336 assert_eq!(config.num_clusters, Some(512));
1337 }
1338
1339 #[test]
1340 fn test_dense_vector_rabitq_no_clusters() {
1341 use crate::dsl::schema::VectorIndexType;
1342
1343 let sdl = r#"
1344 index documents {
1345 field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1346 }
1347 "#;
1348
1349 let indexes = parse_sdl(sdl).unwrap();
1350 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1351
1352 assert_eq!(config.dim, 768);
1353 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1354 assert!(config.num_clusters.is_none());
1355 }
1356
1357 #[test]
1358 fn test_dense_vector_flat_index() {
1359 use crate::dsl::schema::VectorIndexType;
1360
1361 let sdl = r#"
1362 index documents {
1363 field embedding: dense_vector<dims: 768> [indexed<flat>]
1364 }
1365 "#;
1366
1367 let indexes = parse_sdl(sdl).unwrap();
1368 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1369
1370 assert_eq!(config.dim, 768);
1371 assert_eq!(config.index_type, VectorIndexType::Flat);
1372 }
1373
1374 #[test]
1375 fn test_dense_vector_default_index_type() {
1376 use crate::dsl::schema::VectorIndexType;
1377
1378 let sdl = r#"
1380 index documents {
1381 field embedding: dense_vector<dims: 768> [indexed]
1382 }
1383 "#;
1384
1385 let indexes = parse_sdl(sdl).unwrap();
1386 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1387
1388 assert_eq!(config.dim, 768);
1389 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1390 }
1391
1392 #[test]
1393 fn test_dense_vector_mrl_dim() {
1394 let sdl = r#"
1396 index documents {
1397 field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
1398 }
1399 "#;
1400
1401 let indexes = parse_sdl(sdl).unwrap();
1402 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1403
1404 assert_eq!(config.dim, 1536);
1405 assert_eq!(config.mrl_dim, Some(256));
1406 assert_eq!(config.index_dim(), 256);
1407 }
1408
1409 #[test]
1410 fn test_dense_vector_mrl_dim_with_num_clusters() {
1411 let sdl = r#"
1413 index documents {
1414 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
1415 }
1416 "#;
1417
1418 let indexes = parse_sdl(sdl).unwrap();
1419 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1420
1421 assert_eq!(config.dim, 768);
1422 assert_eq!(config.mrl_dim, Some(128));
1423 assert_eq!(config.index_dim(), 128);
1424 assert_eq!(config.num_clusters, Some(256));
1425 assert_eq!(config.nprobe, 64);
1426 }
1427
1428 #[test]
1429 fn test_dense_vector_no_mrl_dim() {
1430 let sdl = r#"
1432 index documents {
1433 field embedding: dense_vector<dims: 768> [indexed]
1434 }
1435 "#;
1436
1437 let indexes = parse_sdl(sdl).unwrap();
1438 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1439
1440 assert_eq!(config.dim, 768);
1441 assert_eq!(config.mrl_dim, None);
1442 assert_eq!(config.index_dim(), 768);
1443 }
1444
1445 #[test]
1446 fn test_json_field_type() {
1447 let sdl = r#"
1448 index documents {
1449 field title: text [indexed, stored]
1450 field metadata: json [stored]
1451 field extra: json
1452 }
1453 "#;
1454
1455 let indexes = parse_sdl(sdl).unwrap();
1456 let index = &indexes[0];
1457
1458 assert_eq!(index.fields.len(), 3);
1459
1460 assert_eq!(index.fields[1].name, "metadata");
1462 assert!(matches!(index.fields[1].field_type, FieldType::Json));
1463 assert!(index.fields[1].stored);
1464 assert_eq!(index.fields[2].name, "extra");
1468 assert!(matches!(index.fields[2].field_type, FieldType::Json));
1469
1470 let schema = index.to_schema();
1472 let metadata_field = schema.get_field("metadata").unwrap();
1473 let entry = schema.get_field_entry(metadata_field).unwrap();
1474 assert_eq!(entry.field_type, FieldType::Json);
1475 assert!(!entry.indexed); assert!(entry.stored);
1477 }
1478
1479 #[test]
1480 fn test_sparse_vector_query_config() {
1481 use crate::structures::QueryWeighting;
1482
1483 let sdl = r#"
1484 index documents {
1485 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1486 }
1487 "#;
1488
1489 let indexes = parse_sdl(sdl).unwrap();
1490 let index = &indexes[0];
1491
1492 assert_eq!(index.fields.len(), 1);
1493 assert_eq!(index.fields[0].name, "embedding");
1494 assert!(matches!(
1495 index.fields[0].field_type,
1496 FieldType::SparseVector
1497 ));
1498
1499 let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1500 assert_eq!(config.index_size, IndexSize::U16);
1501 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1502
1503 let query_config = config.query_config.as_ref().unwrap();
1505 assert_eq!(
1506 query_config.tokenizer.as_deref(),
1507 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1508 );
1509 assert_eq!(query_config.weighting, QueryWeighting::Idf);
1510
1511 let schema = index.to_schema();
1513 let embedding_field = schema.get_field("embedding").unwrap();
1514 let entry = schema.get_field_entry(embedding_field).unwrap();
1515 let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1516 let qc = sv_config.query_config.as_ref().unwrap();
1517 assert_eq!(
1518 qc.tokenizer.as_deref(),
1519 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1520 );
1521 assert_eq!(qc.weighting, QueryWeighting::Idf);
1522 }
1523
1524 #[test]
1525 fn test_sparse_vector_query_config_weighting_one() {
1526 use crate::structures::QueryWeighting;
1527
1528 let sdl = r#"
1529 index documents {
1530 field embedding: sparse_vector [indexed<query<weighting: one>>]
1531 }
1532 "#;
1533
1534 let indexes = parse_sdl(sdl).unwrap();
1535 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1536
1537 let query_config = config.query_config.as_ref().unwrap();
1538 assert!(query_config.tokenizer.is_none());
1539 assert_eq!(query_config.weighting, QueryWeighting::One);
1540 }
1541}