1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Parser for the schema definition language (SDL).
///
/// The implementation is derived by `pest_derive` from the grammar file
/// `dsl/sdl/sdl.pest`; the `Rule` values matched throughout this module
/// come from that grammar.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// A single field declaration parsed from an `index` block.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Field type resolved from the type keyword (see `parse_field_type`).
    pub field_type: FieldType,
    /// Value of the `indexed` flag handed to the schema builder.
    pub indexed: bool,
    /// Value of the `stored` flag handed to the schema builder.
    pub stored: bool,
    /// Tokenizer name from a `<name>` spec on the type, if any; text fields
    /// fall back to `"default"` when this is `None`.
    pub tokenizer: Option<String>,
    /// Set when the `multi` attribute is present; forwarded to
    /// `SchemaBuilder::set_multi`.
    pub multi: bool,
    /// Sparse-vector settings, when a sparse config or sparse index options
    /// were given.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense-vector settings (dimension and index options), when given.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
81
/// A complete `index` block parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names collected from a default-fields declaration; empty when none
    /// was present.
    pub default_fields: Vec<String>,
    /// One rule per `query_router { ... }` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
91
92impl IndexDef {
93 pub fn to_schema(&self) -> Schema {
95 let mut builder = SchemaBuilder::default();
96
97 for field in &self.fields {
98 let f = match field.field_type {
99 FieldType::Text => {
100 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
101 builder.add_text_field_with_tokenizer(
102 &field.name,
103 field.indexed,
104 field.stored,
105 tokenizer,
106 )
107 }
108 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
109 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
110 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
111 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
112 FieldType::Json => builder.add_json_field(&field.name, field.stored),
113 FieldType::SparseVector => {
114 if let Some(config) = &field.sparse_vector_config {
115 builder.add_sparse_vector_field_with_config(
116 &field.name,
117 field.indexed,
118 field.stored,
119 config.clone(),
120 )
121 } else {
122 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
123 }
124 }
125 FieldType::DenseVector => {
126 let config = field
128 .dense_vector_config
129 .as_ref()
130 .expect("DenseVector field requires dimension to be specified");
131 builder.add_dense_vector_field_with_config(
132 &field.name,
133 field.indexed,
134 field.stored,
135 config.clone(),
136 )
137 }
138 };
139 if field.multi {
140 builder.set_multi(f, true);
141 }
142 }
143
144 if !self.default_fields.is_empty() {
146 builder.set_default_fields(self.default_fields.clone());
147 }
148
149 if !self.query_routers.is_empty() {
151 builder.set_query_routers(self.query_routers.clone());
152 }
153
154 builder.build()
155 }
156
157 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
162 if self.query_routers.is_empty() {
163 return Ok(None);
164 }
165
166 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
167 .map(Some)
168 .map_err(Error::Schema)
169 }
170}
171
172fn parse_field_type(type_str: &str) -> Result<FieldType> {
174 match type_str {
175 "text" | "string" | "str" => Ok(FieldType::Text),
176 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
177 "i64" | "int" | "integer" => Ok(FieldType::I64),
178 "f64" | "float" | "double" => Ok(FieldType::F64),
179 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
180 "json" => Ok(FieldType::Json),
181 "sparse_vector" => Ok(FieldType::SparseVector),
182 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
183 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
184 }
185}
186
/// Options collected from an `indexed<...>` attribute before they are applied
/// to a dense or sparse vector configuration.
///
/// Every option is `None` until the corresponding parameter is seen.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type selected by keyword (`scann` / `rabitq`).
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of the `centroids` parameter; applied as the dense config's
    /// `coarse_centroids_path`.
    centroids_path: Option<String>,
    /// Value of the `codebook` parameter; applied as the dense config's
    /// `pq_codebook_path` for ScaNN.
    codebook_path: Option<String>,
    /// Value of the `nprobe` parameter.
    nprobe: Option<usize>,
    /// Value of the `mrl_dim` parameter.
    mrl_dim: Option<usize>,
    /// Sparse-vector weight quantization.
    quantization: Option<WeightQuantization>,
    /// Sparse-vector weight threshold.
    weight_threshold: Option<f32>,
    /// Tokenizer from the query config block (sparse vectors).
    query_tokenizer: Option<String>,
    /// Weighting from the query config block (sparse vectors).
    query_weighting: Option<QueryWeighting>,
}
202
203fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
206 let mut indexed = false;
207 let mut stored = false;
208 let mut multi = false;
209 let mut index_config = None;
210
211 for attr in pair.into_inner() {
212 if attr.as_rule() == Rule::attribute {
213 let mut found_indexed_with_config = false;
216 for inner in attr.clone().into_inner() {
217 if inner.as_rule() == Rule::indexed_with_config {
218 indexed = true;
219 index_config = Some(parse_index_config(inner));
220 found_indexed_with_config = true;
221 break;
222 }
223 }
224 if !found_indexed_with_config {
225 match attr.as_str() {
227 "indexed" => indexed = true,
228 "stored" => stored = true,
229 "multi" => multi = true,
230 _ => {}
231 }
232 }
233 }
234 }
235
236 (indexed, stored, multi, index_config)
237}
238
239fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
241 let mut config = IndexConfig::default();
242
243 for inner in pair.into_inner() {
248 if inner.as_rule() == Rule::index_config_params {
249 for param in inner.into_inner() {
250 if param.as_rule() == Rule::index_config_param {
251 for p in param.into_inner() {
252 parse_single_index_config_param(&mut config, p);
253 }
254 }
255 }
256 }
257 }
258
259 config
260}
261
262fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
264 use super::schema::VectorIndexType;
265
266 match p.as_rule() {
267 Rule::index_type_spec => {
268 config.index_type = Some(match p.as_str() {
269 "scann" => VectorIndexType::ScaNN,
270 "rabitq" => VectorIndexType::IvfRaBitQ,
271 _ => VectorIndexType::IvfRaBitQ,
272 });
273 }
274 Rule::index_type_kwarg => {
275 if let Some(t) = p.into_inner().next() {
277 config.index_type = Some(match t.as_str() {
278 "scann" => VectorIndexType::ScaNN,
279 "rabitq" => VectorIndexType::IvfRaBitQ,
280 _ => VectorIndexType::IvfRaBitQ,
281 });
282 }
283 }
284 Rule::centroids_kwarg => {
285 if let Some(path) = p.into_inner().next()
288 && let Some(inner_path) = path.into_inner().next()
289 {
290 config.centroids_path = Some(inner_path.as_str().to_string());
291 }
292 }
293 Rule::codebook_kwarg => {
294 if let Some(path) = p.into_inner().next()
296 && let Some(inner_path) = path.into_inner().next()
297 {
298 config.codebook_path = Some(inner_path.as_str().to_string());
299 }
300 }
301 Rule::nprobe_kwarg => {
302 if let Some(n) = p.into_inner().next() {
304 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
305 }
306 }
307 Rule::mrl_dim_kwarg => {
308 if let Some(n) = p.into_inner().next() {
310 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
311 }
312 }
313 Rule::quantization_kwarg => {
314 if let Some(q) = p.into_inner().next() {
316 config.quantization = Some(match q.as_str() {
317 "float32" | "f32" => WeightQuantization::Float32,
318 "float16" | "f16" => WeightQuantization::Float16,
319 "uint8" | "u8" => WeightQuantization::UInt8,
320 "uint4" | "u4" => WeightQuantization::UInt4,
321 _ => WeightQuantization::default(),
322 });
323 }
324 }
325 Rule::weight_threshold_kwarg => {
326 if let Some(t) = p.into_inner().next() {
328 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
329 }
330 }
331 Rule::query_config_block => {
332 parse_query_config_block(config, p);
334 }
335 _ => {}
336 }
337}
338
339fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
341 for inner in pair.into_inner() {
342 if inner.as_rule() == Rule::query_config_params {
343 for param in inner.into_inner() {
344 if param.as_rule() == Rule::query_config_param {
345 for p in param.into_inner() {
346 match p.as_rule() {
347 Rule::query_tokenizer_kwarg => {
348 if let Some(path) = p.into_inner().next()
350 && let Some(inner_path) = path.into_inner().next()
351 {
352 config.query_tokenizer = Some(inner_path.as_str().to_string());
353 }
354 }
355 Rule::query_weighting_kwarg => {
356 if let Some(w) = p.into_inner().next() {
358 config.query_weighting = Some(match w.as_str() {
359 "one" => QueryWeighting::One,
360 "idf" => QueryWeighting::Idf,
361 _ => QueryWeighting::One,
362 });
363 }
364 }
365 _ => {}
366 }
367 }
368 }
369 }
370 }
371 }
372}
373
374fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
376 let mut inner = pair.into_inner();
377
378 let name = inner
379 .next()
380 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
381 .as_str()
382 .to_string();
383
384 let field_type_str = inner
385 .next()
386 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
387 .as_str();
388
389 let field_type = parse_field_type(field_type_str)?;
390
391 let mut tokenizer = None;
393 let mut sparse_vector_config = None;
394 let mut dense_vector_config = None;
395 let mut indexed = true;
396 let mut stored = true;
397 let mut multi = false;
398 let mut index_config: Option<IndexConfig> = None;
399
400 for item in inner {
401 match item.as_rule() {
402 Rule::tokenizer_spec => {
403 if let Some(tok_name) = item.into_inner().next() {
405 tokenizer = Some(tok_name.as_str().to_string());
406 }
407 }
408 Rule::sparse_vector_config => {
409 sparse_vector_config = Some(parse_sparse_vector_config(item));
411 }
412 Rule::dense_vector_config => {
413 dense_vector_config = Some(parse_dense_vector_config(item));
415 }
416 Rule::attributes => {
417 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
418 indexed = idx;
419 stored = sto;
420 multi = mul;
421 index_config = idx_cfg;
422 }
423 _ => {}
424 }
425 }
426
427 if let Some(idx_cfg) = index_config {
429 if let Some(ref mut dv_config) = dense_vector_config {
430 apply_index_config_to_dense_vector(dv_config, idx_cfg);
431 } else if field_type == FieldType::SparseVector {
432 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
434 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
435 }
436 }
437
438 Ok(FieldDef {
439 name,
440 field_type,
441 indexed,
442 stored,
443 tokenizer,
444 multi,
445 sparse_vector_config,
446 dense_vector_config,
447 })
448}
449
450fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
452 use super::schema::VectorIndexType;
453
454 let nprobe = idx_cfg.nprobe.unwrap_or(32);
455
456 match idx_cfg.index_type {
457 Some(VectorIndexType::ScaNN) => {
458 config.index_type = VectorIndexType::ScaNN;
459 config.coarse_centroids_path = idx_cfg.centroids_path;
460 config.pq_codebook_path = idx_cfg.codebook_path;
461 config.nprobe = nprobe;
462 }
463 Some(VectorIndexType::IvfRaBitQ) => {
464 config.index_type = VectorIndexType::IvfRaBitQ;
465 config.coarse_centroids_path = idx_cfg.centroids_path;
466 config.nprobe = nprobe;
467 }
468 Some(VectorIndexType::RaBitQ) | None => {
469 if idx_cfg.centroids_path.is_some() {
471 config.index_type = VectorIndexType::IvfRaBitQ;
472 config.coarse_centroids_path = idx_cfg.centroids_path;
473 config.nprobe = nprobe;
474 }
475 }
477 }
478
479 if idx_cfg.mrl_dim.is_some() {
481 config.mrl_dim = idx_cfg.mrl_dim;
482 }
483}
484
485fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
488 let mut index_size = IndexSize::default();
489
490 for inner in pair.into_inner() {
492 if inner.as_rule() == Rule::index_size_spec {
493 index_size = match inner.as_str() {
494 "u16" => IndexSize::U16,
495 "u32" => IndexSize::U32,
496 _ => IndexSize::default(),
497 };
498 }
499 }
500
501 SparseVectorConfig {
502 index_size,
503 weight_quantization: WeightQuantization::default(),
504 weight_threshold: 0.0,
505 posting_list_pruning: None,
506 query_config: None,
507 }
508}
509
510fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
512 if let Some(q) = idx_cfg.quantization {
513 config.weight_quantization = q;
514 }
515 if let Some(t) = idx_cfg.weight_threshold {
516 config.weight_threshold = t;
517 }
518 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
520 let query_config = config
521 .query_config
522 .get_or_insert(SparseQueryConfig::default());
523 if let Some(tokenizer) = idx_cfg.query_tokenizer {
524 query_config.tokenizer = Some(tokenizer);
525 }
526 if let Some(weighting) = idx_cfg.query_weighting {
527 query_config.weighting = weighting;
528 }
529 }
530}
531
532fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
535 let mut dim: usize = 0;
536
537 for params in pair.into_inner() {
539 if params.as_rule() == Rule::dense_vector_params {
540 for inner in params.into_inner() {
541 match inner.as_rule() {
542 Rule::dense_vector_keyword_params => {
543 for kwarg in inner.into_inner() {
545 if kwarg.as_rule() == Rule::dims_kwarg
546 && let Some(d) = kwarg.into_inner().next()
547 {
548 dim = d.as_str().parse().unwrap_or(0);
549 }
550 }
551 }
552 Rule::dense_vector_positional_params => {
553 if let Some(dim_pair) = inner.into_inner().next() {
555 dim = dim_pair.as_str().parse().unwrap_or(0);
556 }
557 }
558 _ => {}
559 }
560 }
561 }
562 }
563
564 DenseVectorConfig::new(dim)
565}
566
567fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
569 pair.into_inner().map(|p| p.as_str().to_string()).collect()
570}
571
572fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
574 let mut pattern = String::new();
575 let mut substitution = String::new();
576 let mut target_field = String::new();
577 let mut mode = RoutingMode::Additional;
578
579 for prop in pair.into_inner() {
580 if prop.as_rule() != Rule::query_router_prop {
581 continue;
582 }
583
584 for inner in prop.into_inner() {
585 match inner.as_rule() {
586 Rule::query_router_pattern => {
587 if let Some(regex_str) = inner.into_inner().next() {
588 pattern = parse_string_value(regex_str);
589 }
590 }
591 Rule::query_router_substitution => {
592 if let Some(quoted) = inner.into_inner().next() {
593 substitution = parse_string_value(quoted);
594 }
595 }
596 Rule::query_router_target => {
597 if let Some(ident) = inner.into_inner().next() {
598 target_field = ident.as_str().to_string();
599 }
600 }
601 Rule::query_router_mode => {
602 if let Some(mode_val) = inner.into_inner().next() {
603 mode = match mode_val.as_str() {
604 "exclusive" => RoutingMode::Exclusive,
605 "additional" => RoutingMode::Additional,
606 _ => RoutingMode::Additional,
607 };
608 }
609 }
610 _ => {}
611 }
612 }
613 }
614
615 if pattern.is_empty() {
616 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
617 }
618 if substitution.is_empty() {
619 return Err(Error::Schema(
620 "query_router missing 'substitution'".to_string(),
621 ));
622 }
623 if target_field.is_empty() {
624 return Err(Error::Schema(
625 "query_router missing 'target_field'".to_string(),
626 ));
627 }
628
629 Ok(QueryRouterRule {
630 pattern,
631 substitution,
632 target_field,
633 mode,
634 })
635}
636
637fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
639 let s = pair.as_str();
640 match pair.as_rule() {
641 Rule::regex_string => {
642 if let Some(inner) = pair.into_inner().next() {
644 parse_string_value(inner)
645 } else {
646 s.to_string()
647 }
648 }
649 Rule::raw_string => {
650 s[2..s.len() - 1].to_string()
652 }
653 Rule::quoted_string => {
654 let inner = &s[1..s.len() - 1];
656 inner
658 .replace("\\n", "\n")
659 .replace("\\t", "\t")
660 .replace("\\\"", "\"")
661 .replace("\\\\", "\\")
662 }
663 _ => s.to_string(),
664 }
665}
666
667fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
669 let mut inner = pair.into_inner();
670
671 let name = inner
672 .next()
673 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
674 .as_str()
675 .to_string();
676
677 let mut fields = Vec::new();
678 let mut default_fields = Vec::new();
679 let mut query_routers = Vec::new();
680
681 for item in inner {
682 match item.as_rule() {
683 Rule::field_def => {
684 fields.push(parse_field_def(item)?);
685 }
686 Rule::default_fields_def => {
687 default_fields = parse_default_fields_def(item);
688 }
689 Rule::query_router_def => {
690 query_routers.push(parse_query_router_def(item)?);
691 }
692 _ => {}
693 }
694 }
695
696 Ok(IndexDef {
697 name,
698 fields,
699 default_fields,
700 query_routers,
701 })
702}
703
704pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
706 let pairs = SdlParser::parse(Rule::file, input)
707 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
708
709 let mut indexes = Vec::new();
710
711 for pair in pairs {
712 if pair.as_rule() == Rule::file {
713 for inner in pair.into_inner() {
714 if inner.as_rule() == Rule::index_def {
715 indexes.push(parse_index_def(inner)?);
716 }
717 }
718 }
719 }
720
721 Ok(indexes)
722}
723
724pub fn parse_single_index(input: &str) -> Result<IndexDef> {
726 let indexes = parse_sdl(input)?;
727
728 if indexes.is_empty() {
729 return Err(Error::Schema("No index definition found".to_string()));
730 }
731
732 if indexes.len() > 1 {
733 return Err(Error::Schema(
734 "Multiple index definitions found, expected one".to_string(),
735 ));
736 }
737
738 Ok(indexes.into_iter().next().unwrap())
739}
740
741#[cfg(test)]
742mod tests {
743 use super::*;
744
745 #[test]
746 fn test_parse_simple_schema() {
747 let sdl = r#"
748 index articles {
749 field title: text [indexed, stored]
750 field body: text [indexed]
751 }
752 "#;
753
754 let indexes = parse_sdl(sdl).unwrap();
755 assert_eq!(indexes.len(), 1);
756
757 let index = &indexes[0];
758 assert_eq!(index.name, "articles");
759 assert_eq!(index.fields.len(), 2);
760
761 assert_eq!(index.fields[0].name, "title");
762 assert!(matches!(index.fields[0].field_type, FieldType::Text));
763 assert!(index.fields[0].indexed);
764 assert!(index.fields[0].stored);
765
766 assert_eq!(index.fields[1].name, "body");
767 assert!(matches!(index.fields[1].field_type, FieldType::Text));
768 assert!(index.fields[1].indexed);
769 assert!(!index.fields[1].stored);
770 }
771
772 #[test]
773 fn test_parse_all_field_types() {
774 let sdl = r#"
775 index test {
776 field text_field: text [indexed, stored]
777 field u64_field: u64 [indexed, stored]
778 field i64_field: i64 [indexed, stored]
779 field f64_field: f64 [indexed, stored]
780 field bytes_field: bytes [stored]
781 }
782 "#;
783
784 let indexes = parse_sdl(sdl).unwrap();
785 let index = &indexes[0];
786
787 assert!(matches!(index.fields[0].field_type, FieldType::Text));
788 assert!(matches!(index.fields[1].field_type, FieldType::U64));
789 assert!(matches!(index.fields[2].field_type, FieldType::I64));
790 assert!(matches!(index.fields[3].field_type, FieldType::F64));
791 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
792 }
793
794 #[test]
795 fn test_parse_with_comments() {
796 let sdl = r#"
797 # This is a comment
798 index articles {
799 # Title field
800 field title: text [indexed, stored]
801 field body: text [indexed] # inline comment not supported yet
802 }
803 "#;
804
805 let indexes = parse_sdl(sdl).unwrap();
806 assert_eq!(indexes[0].fields.len(), 2);
807 }
808
809 #[test]
810 fn test_parse_type_aliases() {
811 let sdl = r#"
812 index test {
813 field a: string [indexed]
814 field b: int [indexed]
815 field c: uint [indexed]
816 field d: float [indexed]
817 field e: binary [stored]
818 }
819 "#;
820
821 let indexes = parse_sdl(sdl).unwrap();
822 let index = &indexes[0];
823
824 assert!(matches!(index.fields[0].field_type, FieldType::Text));
825 assert!(matches!(index.fields[1].field_type, FieldType::I64));
826 assert!(matches!(index.fields[2].field_type, FieldType::U64));
827 assert!(matches!(index.fields[3].field_type, FieldType::F64));
828 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
829 }
830
831 #[test]
832 fn test_to_schema() {
833 let sdl = r#"
834 index articles {
835 field title: text [indexed, stored]
836 field views: u64 [indexed, stored]
837 }
838 "#;
839
840 let indexes = parse_sdl(sdl).unwrap();
841 let schema = indexes[0].to_schema();
842
843 assert!(schema.get_field("title").is_some());
844 assert!(schema.get_field("views").is_some());
845 assert!(schema.get_field("nonexistent").is_none());
846 }
847
848 #[test]
849 fn test_default_attributes() {
850 let sdl = r#"
851 index test {
852 field title: text
853 }
854 "#;
855
856 let indexes = parse_sdl(sdl).unwrap();
857 let field = &indexes[0].fields[0];
858
859 assert!(field.indexed);
861 assert!(field.stored);
862 }
863
864 #[test]
865 fn test_multiple_indexes() {
866 let sdl = r#"
867 index articles {
868 field title: text [indexed, stored]
869 }
870
871 index users {
872 field name: text [indexed, stored]
873 field email: text [indexed, stored]
874 }
875 "#;
876
877 let indexes = parse_sdl(sdl).unwrap();
878 assert_eq!(indexes.len(), 2);
879 assert_eq!(indexes[0].name, "articles");
880 assert_eq!(indexes[1].name, "users");
881 }
882
883 #[test]
884 fn test_tokenizer_spec() {
885 let sdl = r#"
886 index articles {
887 field title: text<en_stem> [indexed, stored]
888 field body: text<default> [indexed]
889 field author: text [indexed, stored]
890 }
891 "#;
892
893 let indexes = parse_sdl(sdl).unwrap();
894 let index = &indexes[0];
895
896 assert_eq!(index.fields[0].name, "title");
897 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
898
899 assert_eq!(index.fields[1].name, "body");
900 assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
901
902 assert_eq!(index.fields[2].name, "author");
903 assert_eq!(index.fields[2].tokenizer, None); }
905
906 #[test]
907 fn test_tokenizer_in_schema() {
908 let sdl = r#"
909 index articles {
910 field title: text<german> [indexed, stored]
911 field body: text<en_stem> [indexed]
912 }
913 "#;
914
915 let indexes = parse_sdl(sdl).unwrap();
916 let schema = indexes[0].to_schema();
917
918 let title_field = schema.get_field("title").unwrap();
919 let title_entry = schema.get_field_entry(title_field).unwrap();
920 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
921
922 let body_field = schema.get_field("body").unwrap();
923 let body_entry = schema.get_field_entry(body_field).unwrap();
924 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
925 }
926
927 #[test]
928 fn test_query_router_basic() {
929 let sdl = r#"
930 index documents {
931 field title: text [indexed, stored]
932 field uri: text [indexed, stored]
933
934 query_router {
935 pattern: "10\\.\\d{4,}/[^\\s]+"
936 substitution: "doi://{0}"
937 target_field: uris
938 mode: exclusive
939 }
940 }
941 "#;
942
943 let indexes = parse_sdl(sdl).unwrap();
944 let index = &indexes[0];
945
946 assert_eq!(index.query_routers.len(), 1);
947 let router = &index.query_routers[0];
948 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
949 assert_eq!(router.substitution, "doi://{0}");
950 assert_eq!(router.target_field, "uris");
951 assert_eq!(router.mode, RoutingMode::Exclusive);
952 }
953
954 #[test]
955 fn test_query_router_raw_string() {
956 let sdl = r#"
957 index documents {
958 field uris: text [indexed, stored]
959
960 query_router {
961 pattern: r"^pmid:(\d+)$"
962 substitution: "pubmed://{1}"
963 target_field: uris
964 mode: additional
965 }
966 }
967 "#;
968
969 let indexes = parse_sdl(sdl).unwrap();
970 let router = &indexes[0].query_routers[0];
971
972 assert_eq!(router.pattern, r"^pmid:(\d+)$");
973 assert_eq!(router.substitution, "pubmed://{1}");
974 assert_eq!(router.mode, RoutingMode::Additional);
975 }
976
977 #[test]
978 fn test_multiple_query_routers() {
979 let sdl = r#"
980 index documents {
981 field uris: text [indexed, stored]
982
983 query_router {
984 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
985 substitution: "doi://{1}"
986 target_field: uris
987 mode: exclusive
988 }
989
990 query_router {
991 pattern: r"^pmid:(\d+)$"
992 substitution: "pubmed://{1}"
993 target_field: uris
994 mode: exclusive
995 }
996
997 query_router {
998 pattern: r"^arxiv:(\d+\.\d+)$"
999 substitution: "arxiv://{1}"
1000 target_field: uris
1001 mode: additional
1002 }
1003 }
1004 "#;
1005
1006 let indexes = parse_sdl(sdl).unwrap();
1007 assert_eq!(indexes[0].query_routers.len(), 3);
1008 }
1009
1010 #[test]
1011 fn test_query_router_default_mode() {
1012 let sdl = r#"
1013 index documents {
1014 field uris: text [indexed, stored]
1015
1016 query_router {
1017 pattern: r"test"
1018 substitution: "{0}"
1019 target_field: uris
1020 }
1021 }
1022 "#;
1023
1024 let indexes = parse_sdl(sdl).unwrap();
1025 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1027 }
1028
1029 #[test]
1030 fn test_multi_attribute() {
1031 let sdl = r#"
1032 index documents {
1033 field uris: text [indexed, stored, multi]
1034 field title: text [indexed, stored]
1035 }
1036 "#;
1037
1038 let indexes = parse_sdl(sdl).unwrap();
1039 assert_eq!(indexes.len(), 1);
1040
1041 let fields = &indexes[0].fields;
1042 assert_eq!(fields.len(), 2);
1043
1044 assert_eq!(fields[0].name, "uris");
1046 assert!(fields[0].multi, "uris field should have multi=true");
1047
1048 assert_eq!(fields[1].name, "title");
1050 assert!(!fields[1].multi, "title field should have multi=false");
1051
1052 let schema = indexes[0].to_schema();
1054 let uris_field = schema.get_field("uris").unwrap();
1055 let title_field = schema.get_field("title").unwrap();
1056
1057 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1058 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1059 }
1060
1061 #[test]
1062 fn test_sparse_vector_field() {
1063 let sdl = r#"
1064 index documents {
1065 field embedding: sparse_vector [indexed, stored]
1066 }
1067 "#;
1068
1069 let indexes = parse_sdl(sdl).unwrap();
1070 assert_eq!(indexes.len(), 1);
1071 assert_eq!(indexes[0].fields.len(), 1);
1072 assert_eq!(indexes[0].fields[0].name, "embedding");
1073 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1074 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1075 }
1076
1077 #[test]
1078 fn test_sparse_vector_with_config() {
1079 let sdl = r#"
1080 index documents {
1081 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1082 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1083 }
1084 "#;
1085
1086 let indexes = parse_sdl(sdl).unwrap();
1087 assert_eq!(indexes[0].fields.len(), 2);
1088
1089 let f1 = &indexes[0].fields[0];
1091 assert_eq!(f1.name, "embedding");
1092 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1093 assert_eq!(config1.index_size, IndexSize::U16);
1094 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1095
1096 let f2 = &indexes[0].fields[1];
1098 assert_eq!(f2.name, "dense");
1099 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1100 assert_eq!(config2.index_size, IndexSize::U32);
1101 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1102 }
1103
1104 #[test]
1105 fn test_sparse_vector_with_weight_threshold() {
1106 let sdl = r#"
1107 index documents {
1108 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1109 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1110 }
1111 "#;
1112
1113 let indexes = parse_sdl(sdl).unwrap();
1114 assert_eq!(indexes[0].fields.len(), 2);
1115
1116 let f1 = &indexes[0].fields[0];
1118 assert_eq!(f1.name, "embedding");
1119 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1120 assert_eq!(config1.index_size, IndexSize::U16);
1121 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1122 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1123
1124 let f2 = &indexes[0].fields[1];
1126 assert_eq!(f2.name, "embedding2");
1127 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1128 assert_eq!(config2.index_size, IndexSize::U32);
1129 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1130 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1131 }
1132
1133 #[test]
1134 fn test_dense_vector_field() {
1135 let sdl = r#"
1136 index documents {
1137 field embedding: dense_vector<768> [indexed, stored]
1138 }
1139 "#;
1140
1141 let indexes = parse_sdl(sdl).unwrap();
1142 assert_eq!(indexes.len(), 1);
1143 assert_eq!(indexes[0].fields.len(), 1);
1144
1145 let f = &indexes[0].fields[0];
1146 assert_eq!(f.name, "embedding");
1147 assert_eq!(f.field_type, FieldType::DenseVector);
1148
1149 let config = f.dense_vector_config.as_ref().unwrap();
1150 assert_eq!(config.dim, 768);
1151 }
1152
1153 #[test]
1154 fn test_dense_vector_alias() {
1155 let sdl = r#"
1156 index documents {
1157 field embedding: vector<1536> [indexed]
1158 }
1159 "#;
1160
1161 let indexes = parse_sdl(sdl).unwrap();
1162 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1163 assert_eq!(
1164 indexes[0].fields[0]
1165 .dense_vector_config
1166 .as_ref()
1167 .unwrap()
1168 .dim,
1169 1536
1170 );
1171 }
1172
1173 #[test]
1174 fn test_dense_vector_with_centroids() {
1175 let sdl = r#"
1176 index documents {
1177 field embedding: dense_vector<768> [indexed<centroids: "centroids.bin">, stored]
1178 }
1179 "#;
1180
1181 let indexes = parse_sdl(sdl).unwrap();
1182 assert_eq!(indexes.len(), 1);
1183
1184 let f = &indexes[0].fields[0];
1185 assert_eq!(f.name, "embedding");
1186 assert_eq!(f.field_type, FieldType::DenseVector);
1187
1188 let config = f.dense_vector_config.as_ref().unwrap();
1189 assert_eq!(config.dim, 768);
1190 assert_eq!(
1191 config.coarse_centroids_path.as_deref(),
1192 Some("centroids.bin")
1193 );
1194 assert_eq!(config.nprobe, 32); }
1196
1197 #[test]
1198 fn test_dense_vector_with_centroids_and_nprobe() {
1199 let sdl = r#"
1200 index documents {
1201 field embedding: dense_vector<1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1202 }
1203 "#;
1204
1205 let indexes = parse_sdl(sdl).unwrap();
1206 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1207
1208 assert_eq!(config.dim, 1536);
1209 assert_eq!(
1210 config.coarse_centroids_path.as_deref(),
1211 Some("/path/to/centroids.bin")
1212 );
1213 assert_eq!(config.nprobe, 64);
1214 }
1215
1216 #[test]
1217 fn test_dense_vector_keyword_syntax() {
1218 let sdl = r#"
1219 index documents {
1220 field embedding: dense_vector<dims: 1536> [indexed, stored]
1221 }
1222 "#;
1223
1224 let indexes = parse_sdl(sdl).unwrap();
1225 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1226
1227 assert_eq!(config.dim, 1536);
1228 assert!(config.coarse_centroids_path.is_none());
1229 }
1230
1231 #[test]
1232 fn test_dense_vector_keyword_syntax_full() {
1233 let sdl = r#"
1234 index documents {
1235 field embedding: dense_vector<dims: 1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1236 }
1237 "#;
1238
1239 let indexes = parse_sdl(sdl).unwrap();
1240 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1241
1242 assert_eq!(config.dim, 1536);
1243 assert_eq!(
1244 config.coarse_centroids_path.as_deref(),
1245 Some("/path/to/centroids.bin")
1246 );
1247 assert_eq!(config.nprobe, 64);
1248 }
1249
1250 #[test]
1251 fn test_dense_vector_keyword_syntax_partial() {
1252 let sdl = r#"
1253 index documents {
1254 field embedding: dense_vector<dims: 768> [indexed<centroids: "centroids.bin">]
1255 }
1256 "#;
1257
1258 let indexes = parse_sdl(sdl).unwrap();
1259 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1260
1261 assert_eq!(config.dim, 768);
1262 assert_eq!(
1263 config.coarse_centroids_path.as_deref(),
1264 Some("centroids.bin")
1265 );
1266 assert_eq!(config.nprobe, 32); }
1268
/// `indexed<scann, ...>` must select the ScaNN index type and keep the
/// centroids path, PQ codebook path and `nprobe` given in the SDL.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, centroids: "centroids.bin", codebook: "pq_codebook.bin", nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);

    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));

    let codebook = dense.pq_codebook_path.as_deref();
    assert_eq!(codebook, Some("pq_codebook.bin"));

    assert_eq!(dense.nprobe, 64);
}
1291
/// `indexed<rabitq, centroids: "...">` must yield the `IvfRaBitQ` index type
/// with the centroids path set and no PQ codebook path.
#[test]
fn test_dense_vector_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<rabitq, centroids: "centroids.bin">]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);

    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));

    assert!(dense.pq_codebook_path.is_none());
}
1313
/// A bare `indexed<rabitq>` must still yield the `IvfRaBitQ` index type,
/// with no centroids path recorded.
#[test]
fn test_dense_vector_rabitq_no_centroids() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert!(dense.coarse_centroids_path.is_none());
}
1331
/// With a plain `[indexed]` attribute and no index type named in the SDL,
/// the parsed config is expected to report `VectorIndexType::RaBitQ`.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    // No index type is named inside the attribute list.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
1349
/// `indexed<mrl_dim: N>` must be stored as `mrl_dim` and reported by
/// `index_dim()`, while `dim` keeps the declared vector size.
#[test]
fn test_dense_vector_mrl_dim() {
    // Positional dimension form: `dense_vector<1536>`.
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.mrl_dim, Some(256));
    assert_eq!(dense.index_dim(), 256);
}
1366
/// `mrl_dim` combined with `centroids` and `nprobe` in one `indexed<...>`
/// list: every option must land in the parsed config.
#[test]
fn test_dense_vector_mrl_dim_with_centroids() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<centroids: "centroids.bin", nprobe: 64, mrl_dim: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, Some(128));
    assert_eq!(dense.index_dim(), 128);

    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));

    assert_eq!(dense.nprobe, 64);
}
1388
/// Without an `mrl_dim` option, `mrl_dim` must be `None` and `index_dim()`
/// must equal the declared vector dimension.
#[test]
fn test_dense_vector_no_mrl_dim() {
    // No `mrl_dim` appears in the attribute list.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, None);
    assert_eq!(dense.index_dim(), 768);
}
1405
/// `json` fields, with and without an attribute list, must parse as
/// `FieldType::Json`; the built schema entry for the stored one must be
/// stored and not indexed.
#[test]
fn test_json_field_type() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let documents = &parsed[0];

    assert_eq!(documents.fields.len(), 3);

    // Second field: JSON with an explicit `[stored]` attribute.
    let metadata = &documents.fields[1];
    assert_eq!(metadata.name, "metadata");
    assert!(matches!(metadata.field_type, FieldType::Json));
    assert!(metadata.stored);

    // Third field: JSON declared without any attribute list.
    let extra = &documents.fields[2];
    assert_eq!(extra.name, "extra");
    assert!(matches!(extra.field_type, FieldType::Json));

    // The same properties must hold after conversion to a schema.
    let schema = documents.to_schema();
    let metadata_handle = schema.get_field("metadata").unwrap();
    let metadata_entry = schema.get_field_entry(metadata_handle).unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
1439
/// A sparse vector field with a nested `query<tokenizer: ..., weighting: ...>`
/// block must expose that query config on the parsed field, and the same
/// values must still be present on the schema built from it.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let documents = &parsed[0];

    assert_eq!(documents.fields.len(), 1);

    let embedding = &documents.fields[0];
    assert_eq!(embedding.name, "embedding");
    assert!(matches!(embedding.field_type, FieldType::SparseVector));

    let sparse = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.index_size, IndexSize::U16);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings as seen directly on the parsed field definition.
    let parsed_query = sparse.query_config.as_ref().unwrap();
    let parsed_tokenizer = parsed_query.tokenizer.as_deref();
    assert_eq!(parsed_tokenizer, Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct"));
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same settings, read back from the schema entry.
    let schema = documents.to_schema();
    let embedding_handle = schema.get_field("embedding").unwrap();
    let embedding_entry = schema.get_field_entry(embedding_handle).unwrap();
    let schema_sparse = embedding_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    let schema_tokenizer = schema_query.tokenizer.as_deref();
    assert_eq!(schema_tokenizer, Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct"));
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1484
/// A `query<weighting: one>` block without a tokenizer must give a query
/// config with `QueryWeighting::One` and no tokenizer.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse = embedding.sparse_vector_config.as_ref().unwrap();

    let query = sparse.query_config.as_ref().unwrap();
    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1502}