1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Parser for the schema definition language (SDL).
///
/// The parser implementation and the `Rule` enum used throughout this file
/// are generated by `pest_derive` from the grammar file named in the
/// `#[grammar]` attribute below.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// A single `field` declaration as parsed from SDL.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name exactly as written in the SDL source.
    pub name: String,
    /// Field type resolved from the SDL type keyword (see `parse_field_type`).
    pub field_type: FieldType,
    /// Whether the field is indexed. `parse_field_def` defaults this to `true`
    /// when the field has no attribute list.
    pub indexed: bool,
    /// Whether the field is stored. `parse_field_def` defaults this to `true`
    /// when the field has no attribute list.
    pub stored: bool,
    /// Tokenizer name from a tokenizer spec, if one was given. For text fields
    /// `IndexDef::to_schema` falls back to `"default"` when this is `None`.
    pub tokenizer: Option<String>,
    /// Set by `parse_attributes` when the `stored` attribute carries a config.
    pub multi: bool,
    /// Position mode taken from the `indexed<...>` config, if any.
    /// `IndexDef::to_schema` substitutes `Ordinal` for multi-valued
    /// sparse/dense vector fields when this is `None`.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings; only populated for sparse vector fields that
    /// specify an index size and/or an `indexed<...>` config.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings (dimension, index type, ...). `IndexDef::to_schema`
    /// requires this to be `Some` for `DenseVector` fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// A single `index` block as parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name exactly as written in the SDL source.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration; empty when absent.
    /// If the declaration appears more than once, the last one wins
    /// (see `parse_index_def`).
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95 pub fn to_schema(&self) -> Schema {
97 let mut builder = SchemaBuilder::default();
98
99 for field in &self.fields {
100 let f = match field.field_type {
101 FieldType::Text => {
102 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103 builder.add_text_field_with_tokenizer(
104 &field.name,
105 field.indexed,
106 field.stored,
107 tokenizer,
108 )
109 }
110 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114 FieldType::Json => builder.add_json_field(&field.name, field.stored),
115 FieldType::SparseVector => {
116 if let Some(config) = &field.sparse_vector_config {
117 builder.add_sparse_vector_field_with_config(
118 &field.name,
119 field.indexed,
120 field.stored,
121 config.clone(),
122 )
123 } else {
124 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125 }
126 }
127 FieldType::DenseVector => {
128 let config = field
130 .dense_vector_config
131 .as_ref()
132 .expect("DenseVector field requires dimension to be specified");
133 builder.add_dense_vector_field_with_config(
134 &field.name,
135 field.indexed,
136 field.stored,
137 config.clone(),
138 )
139 }
140 };
141 if field.multi {
142 builder.set_multi(f, true);
143 }
144 let positions = field.positions.or({
146 if field.multi
148 && matches!(
149 field.field_type,
150 FieldType::SparseVector | FieldType::DenseVector
151 )
152 {
153 Some(super::schema::PositionMode::Ordinal)
154 } else {
155 None
156 }
157 });
158 if let Some(mode) = positions {
159 builder.set_positions(f, mode);
160 }
161 }
162
163 if !self.default_fields.is_empty() {
165 builder.set_default_fields(self.default_fields.clone());
166 }
167
168 if !self.query_routers.is_empty() {
170 builder.set_query_routers(self.query_routers.clone());
171 }
172
173 builder.build()
174 }
175
176 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181 if self.query_routers.is_empty() {
182 return Ok(None);
183 }
184
185 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186 .map(Some)
187 .map_err(Error::Schema)
188 }
189}
190
191fn parse_field_type(type_str: &str) -> Result<FieldType> {
193 match type_str {
194 "text" | "string" | "str" => Ok(FieldType::Text),
195 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196 "i64" | "int" | "integer" => Ok(FieldType::I64),
197 "f64" | "float" | "double" => Ok(FieldType::F64),
198 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199 "json" => Ok(FieldType::Json),
200 "sparse_vector" => Ok(FieldType::SparseVector),
201 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203 }
204}
205
/// Options collected from an `indexed<...>` attribute.
///
/// Every member starts as `None` and is only set when the corresponding
/// parameter appears in the SDL (see `parse_single_index_config_param`).
/// The values are later merged into a dense or sparse vector config by
/// `apply_index_config_to_dense_vector` / `apply_index_config_to_sparse_vector`.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type, given positionally or as a keyword argument.
    index_type: Option<super::schema::VectorIndexType>,
    /// Text of the `centroids` parameter (dense vectors).
    centroids_path: Option<String>,
    /// Text of the `codebook` parameter (dense vectors).
    codebook_path: Option<String>,
    /// Value of the `nprobe` parameter (dense vectors).
    nprobe: Option<usize>,
    /// Value of the MRL dimension parameter (dense vectors).
    mrl_dim: Option<usize>,
    /// Weight quantization (sparse vectors).
    quantization: Option<WeightQuantization>,
    /// Weight threshold (sparse vectors).
    weight_threshold: Option<f32>,
    /// Query-time tokenizer from the query config block (sparse vectors).
    query_tokenizer: Option<String>,
    /// Query-time weighting from the query config block (sparse vectors).
    query_weighting: Option<QueryWeighting>,
    /// Position tracking mode requested for the field.
    positions: Option<super::schema::PositionMode>,
}
223
224fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
229 let mut indexed = false;
230 let mut stored = false;
231 let mut multi = false;
232 let mut index_config = None;
233
234 for attr in pair.into_inner() {
235 if attr.as_rule() == Rule::attribute {
236 let mut found_config = false;
238 for inner in attr.clone().into_inner() {
239 match inner.as_rule() {
240 Rule::indexed_with_config => {
241 indexed = true;
242 index_config = Some(parse_index_config(inner));
243 found_config = true;
244 break;
245 }
246 Rule::stored_with_config => {
247 stored = true;
248 multi = true; found_config = true;
250 break;
251 }
252 _ => {}
253 }
254 }
255 if !found_config {
256 match attr.as_str() {
258 "indexed" => indexed = true,
259 "stored" => stored = true,
260 _ => {}
261 }
262 }
263 }
264 }
265
266 (indexed, stored, multi, index_config)
267}
268
269fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
271 let mut config = IndexConfig::default();
272
273 for inner in pair.into_inner() {
278 if inner.as_rule() == Rule::index_config_params {
279 for param in inner.into_inner() {
280 if param.as_rule() == Rule::index_config_param {
281 for p in param.into_inner() {
282 parse_single_index_config_param(&mut config, p);
283 }
284 }
285 }
286 }
287 }
288
289 config
290}
291
292fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
294 use super::schema::VectorIndexType;
295
296 match p.as_rule() {
297 Rule::index_type_spec => {
298 config.index_type = Some(match p.as_str() {
299 "scann" => VectorIndexType::ScaNN,
300 "rabitq" => VectorIndexType::IvfRaBitQ,
301 _ => VectorIndexType::IvfRaBitQ,
302 });
303 }
304 Rule::index_type_kwarg => {
305 if let Some(t) = p.into_inner().next() {
307 config.index_type = Some(match t.as_str() {
308 "scann" => VectorIndexType::ScaNN,
309 "rabitq" => VectorIndexType::IvfRaBitQ,
310 _ => VectorIndexType::IvfRaBitQ,
311 });
312 }
313 }
314 Rule::centroids_kwarg => {
315 if let Some(path) = p.into_inner().next()
318 && let Some(inner_path) = path.into_inner().next()
319 {
320 config.centroids_path = Some(inner_path.as_str().to_string());
321 }
322 }
323 Rule::codebook_kwarg => {
324 if let Some(path) = p.into_inner().next()
326 && let Some(inner_path) = path.into_inner().next()
327 {
328 config.codebook_path = Some(inner_path.as_str().to_string());
329 }
330 }
331 Rule::nprobe_kwarg => {
332 if let Some(n) = p.into_inner().next() {
334 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
335 }
336 }
337 Rule::mrl_dim_kwarg => {
338 if let Some(n) = p.into_inner().next() {
340 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
341 }
342 }
343 Rule::quantization_kwarg => {
344 if let Some(q) = p.into_inner().next() {
346 config.quantization = Some(match q.as_str() {
347 "float32" | "f32" => WeightQuantization::Float32,
348 "float16" | "f16" => WeightQuantization::Float16,
349 "uint8" | "u8" => WeightQuantization::UInt8,
350 "uint4" | "u4" => WeightQuantization::UInt4,
351 _ => WeightQuantization::default(),
352 });
353 }
354 }
355 Rule::weight_threshold_kwarg => {
356 if let Some(t) = p.into_inner().next() {
358 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
359 }
360 }
361 Rule::query_config_block => {
362 parse_query_config_block(config, p);
364 }
365 Rule::positions_kwarg => {
366 use super::schema::PositionMode;
368 config.positions = Some(match p.as_str() {
369 "ordinal" => PositionMode::Ordinal,
370 "token_position" => PositionMode::TokenPosition,
371 _ => PositionMode::Full, });
373 }
374 _ => {}
375 }
376}
377
378fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
380 for inner in pair.into_inner() {
381 if inner.as_rule() == Rule::query_config_params {
382 for param in inner.into_inner() {
383 if param.as_rule() == Rule::query_config_param {
384 for p in param.into_inner() {
385 match p.as_rule() {
386 Rule::query_tokenizer_kwarg => {
387 if let Some(path) = p.into_inner().next()
389 && let Some(inner_path) = path.into_inner().next()
390 {
391 config.query_tokenizer = Some(inner_path.as_str().to_string());
392 }
393 }
394 Rule::query_weighting_kwarg => {
395 if let Some(w) = p.into_inner().next() {
397 config.query_weighting = Some(match w.as_str() {
398 "one" => QueryWeighting::One,
399 "idf" => QueryWeighting::Idf,
400 _ => QueryWeighting::One,
401 });
402 }
403 }
404 _ => {}
405 }
406 }
407 }
408 }
409 }
410 }
411}
412
413fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
415 let mut inner = pair.into_inner();
416
417 let name = inner
418 .next()
419 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
420 .as_str()
421 .to_string();
422
423 let field_type_str = inner
424 .next()
425 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
426 .as_str();
427
428 let field_type = parse_field_type(field_type_str)?;
429
430 let mut tokenizer = None;
432 let mut sparse_vector_config = None;
433 let mut dense_vector_config = None;
434 let mut indexed = true;
435 let mut stored = true;
436 let mut multi = false;
437 let mut index_config: Option<IndexConfig> = None;
438
439 for item in inner {
440 match item.as_rule() {
441 Rule::tokenizer_spec => {
442 if let Some(tok_name) = item.into_inner().next() {
444 tokenizer = Some(tok_name.as_str().to_string());
445 }
446 }
447 Rule::sparse_vector_config => {
448 sparse_vector_config = Some(parse_sparse_vector_config(item));
450 }
451 Rule::dense_vector_config => {
452 dense_vector_config = Some(parse_dense_vector_config(item));
454 }
455 Rule::attributes => {
456 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
457 indexed = idx;
458 stored = sto;
459 multi = mul;
460 index_config = idx_cfg;
461 }
462 _ => {}
463 }
464 }
465
466 let mut positions = None;
468 if let Some(idx_cfg) = index_config {
469 positions = idx_cfg.positions;
470 if let Some(ref mut dv_config) = dense_vector_config {
471 apply_index_config_to_dense_vector(dv_config, idx_cfg);
472 } else if field_type == FieldType::SparseVector {
473 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
475 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
476 }
477 }
478
479 Ok(FieldDef {
480 name,
481 field_type,
482 indexed,
483 stored,
484 tokenizer,
485 multi,
486 positions,
487 sparse_vector_config,
488 dense_vector_config,
489 })
490}
491
492fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
494 use super::schema::VectorIndexType;
495
496 let nprobe = idx_cfg.nprobe.unwrap_or(32);
497
498 match idx_cfg.index_type {
499 Some(VectorIndexType::ScaNN) => {
500 config.index_type = VectorIndexType::ScaNN;
501 config.coarse_centroids_path = idx_cfg.centroids_path;
502 config.pq_codebook_path = idx_cfg.codebook_path;
503 config.nprobe = nprobe;
504 }
505 Some(VectorIndexType::IvfRaBitQ) => {
506 config.index_type = VectorIndexType::IvfRaBitQ;
507 config.coarse_centroids_path = idx_cfg.centroids_path;
508 config.nprobe = nprobe;
509 }
510 Some(VectorIndexType::RaBitQ) | None => {
511 if idx_cfg.centroids_path.is_some() {
513 config.index_type = VectorIndexType::IvfRaBitQ;
514 config.coarse_centroids_path = idx_cfg.centroids_path;
515 config.nprobe = nprobe;
516 }
517 }
519 }
520
521 if idx_cfg.mrl_dim.is_some() {
523 config.mrl_dim = idx_cfg.mrl_dim;
524 }
525}
526
527fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
530 let mut index_size = IndexSize::default();
531
532 for inner in pair.into_inner() {
534 if inner.as_rule() == Rule::index_size_spec {
535 index_size = match inner.as_str() {
536 "u16" => IndexSize::U16,
537 "u32" => IndexSize::U32,
538 _ => IndexSize::default(),
539 };
540 }
541 }
542
543 SparseVectorConfig {
544 index_size,
545 weight_quantization: WeightQuantization::default(),
546 weight_threshold: 0.0,
547 posting_list_pruning: None,
548 query_config: None,
549 }
550}
551
552fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
554 if let Some(q) = idx_cfg.quantization {
555 config.weight_quantization = q;
556 }
557 if let Some(t) = idx_cfg.weight_threshold {
558 config.weight_threshold = t;
559 }
560 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
562 let query_config = config
563 .query_config
564 .get_or_insert(SparseQueryConfig::default());
565 if let Some(tokenizer) = idx_cfg.query_tokenizer {
566 query_config.tokenizer = Some(tokenizer);
567 }
568 if let Some(weighting) = idx_cfg.query_weighting {
569 query_config.weighting = weighting;
570 }
571 }
572}
573
574fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
577 let mut dim: usize = 0;
578
579 for params in pair.into_inner() {
581 if params.as_rule() == Rule::dense_vector_params {
582 for inner in params.into_inner() {
583 match inner.as_rule() {
584 Rule::dense_vector_keyword_params => {
585 for kwarg in inner.into_inner() {
587 if kwarg.as_rule() == Rule::dims_kwarg
588 && let Some(d) = kwarg.into_inner().next()
589 {
590 dim = d.as_str().parse().unwrap_or(0);
591 }
592 }
593 }
594 Rule::dense_vector_positional_params => {
595 if let Some(dim_pair) = inner.into_inner().next() {
597 dim = dim_pair.as_str().parse().unwrap_or(0);
598 }
599 }
600 _ => {}
601 }
602 }
603 }
604 }
605
606 DenseVectorConfig::new(dim)
607}
608
609fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
611 pair.into_inner().map(|p| p.as_str().to_string()).collect()
612}
613
614fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
616 let mut pattern = String::new();
617 let mut substitution = String::new();
618 let mut target_field = String::new();
619 let mut mode = RoutingMode::Additional;
620
621 for prop in pair.into_inner() {
622 if prop.as_rule() != Rule::query_router_prop {
623 continue;
624 }
625
626 for inner in prop.into_inner() {
627 match inner.as_rule() {
628 Rule::query_router_pattern => {
629 if let Some(regex_str) = inner.into_inner().next() {
630 pattern = parse_string_value(regex_str);
631 }
632 }
633 Rule::query_router_substitution => {
634 if let Some(quoted) = inner.into_inner().next() {
635 substitution = parse_string_value(quoted);
636 }
637 }
638 Rule::query_router_target => {
639 if let Some(ident) = inner.into_inner().next() {
640 target_field = ident.as_str().to_string();
641 }
642 }
643 Rule::query_router_mode => {
644 if let Some(mode_val) = inner.into_inner().next() {
645 mode = match mode_val.as_str() {
646 "exclusive" => RoutingMode::Exclusive,
647 "additional" => RoutingMode::Additional,
648 _ => RoutingMode::Additional,
649 };
650 }
651 }
652 _ => {}
653 }
654 }
655 }
656
657 if pattern.is_empty() {
658 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
659 }
660 if substitution.is_empty() {
661 return Err(Error::Schema(
662 "query_router missing 'substitution'".to_string(),
663 ));
664 }
665 if target_field.is_empty() {
666 return Err(Error::Schema(
667 "query_router missing 'target_field'".to_string(),
668 ));
669 }
670
671 Ok(QueryRouterRule {
672 pattern,
673 substitution,
674 target_field,
675 mode,
676 })
677}
678
679fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
681 let s = pair.as_str();
682 match pair.as_rule() {
683 Rule::regex_string => {
684 if let Some(inner) = pair.into_inner().next() {
686 parse_string_value(inner)
687 } else {
688 s.to_string()
689 }
690 }
691 Rule::raw_string => {
692 s[2..s.len() - 1].to_string()
694 }
695 Rule::quoted_string => {
696 let inner = &s[1..s.len() - 1];
698 inner
700 .replace("\\n", "\n")
701 .replace("\\t", "\t")
702 .replace("\\\"", "\"")
703 .replace("\\\\", "\\")
704 }
705 _ => s.to_string(),
706 }
707}
708
709fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
711 let mut inner = pair.into_inner();
712
713 let name = inner
714 .next()
715 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
716 .as_str()
717 .to_string();
718
719 let mut fields = Vec::new();
720 let mut default_fields = Vec::new();
721 let mut query_routers = Vec::new();
722
723 for item in inner {
724 match item.as_rule() {
725 Rule::field_def => {
726 fields.push(parse_field_def(item)?);
727 }
728 Rule::default_fields_def => {
729 default_fields = parse_default_fields_def(item);
730 }
731 Rule::query_router_def => {
732 query_routers.push(parse_query_router_def(item)?);
733 }
734 _ => {}
735 }
736 }
737
738 Ok(IndexDef {
739 name,
740 fields,
741 default_fields,
742 query_routers,
743 })
744}
745
746pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
748 let pairs = SdlParser::parse(Rule::file, input)
749 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
750
751 let mut indexes = Vec::new();
752
753 for pair in pairs {
754 if pair.as_rule() == Rule::file {
755 for inner in pair.into_inner() {
756 if inner.as_rule() == Rule::index_def {
757 indexes.push(parse_index_def(inner)?);
758 }
759 }
760 }
761 }
762
763 Ok(indexes)
764}
765
766pub fn parse_single_index(input: &str) -> Result<IndexDef> {
768 let indexes = parse_sdl(input)?;
769
770 if indexes.is_empty() {
771 return Err(Error::Schema("No index definition found".to_string()));
772 }
773
774 if indexes.len() > 1 {
775 return Err(Error::Schema(
776 "Multiple index definitions found, expected one".to_string(),
777 ));
778 }
779
780 Ok(indexes.into_iter().next().unwrap())
781}
782
783#[cfg(test)]
784mod tests {
785 use super::*;
786
787 #[test]
788 fn test_parse_simple_schema() {
789 let sdl = r#"
790 index articles {
791 field title: text [indexed, stored]
792 field body: text [indexed]
793 }
794 "#;
795
796 let indexes = parse_sdl(sdl).unwrap();
797 assert_eq!(indexes.len(), 1);
798
799 let index = &indexes[0];
800 assert_eq!(index.name, "articles");
801 assert_eq!(index.fields.len(), 2);
802
803 assert_eq!(index.fields[0].name, "title");
804 assert!(matches!(index.fields[0].field_type, FieldType::Text));
805 assert!(index.fields[0].indexed);
806 assert!(index.fields[0].stored);
807
808 assert_eq!(index.fields[1].name, "body");
809 assert!(matches!(index.fields[1].field_type, FieldType::Text));
810 assert!(index.fields[1].indexed);
811 assert!(!index.fields[1].stored);
812 }
813
814 #[test]
815 fn test_parse_all_field_types() {
816 let sdl = r#"
817 index test {
818 field text_field: text [indexed, stored]
819 field u64_field: u64 [indexed, stored]
820 field i64_field: i64 [indexed, stored]
821 field f64_field: f64 [indexed, stored]
822 field bytes_field: bytes [stored]
823 }
824 "#;
825
826 let indexes = parse_sdl(sdl).unwrap();
827 let index = &indexes[0];
828
829 assert!(matches!(index.fields[0].field_type, FieldType::Text));
830 assert!(matches!(index.fields[1].field_type, FieldType::U64));
831 assert!(matches!(index.fields[2].field_type, FieldType::I64));
832 assert!(matches!(index.fields[3].field_type, FieldType::F64));
833 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
834 }
835
836 #[test]
837 fn test_parse_with_comments() {
838 let sdl = r#"
839 # This is a comment
840 index articles {
841 # Title field
842 field title: text [indexed, stored]
843 field body: text [indexed] # inline comment not supported yet
844 }
845 "#;
846
847 let indexes = parse_sdl(sdl).unwrap();
848 assert_eq!(indexes[0].fields.len(), 2);
849 }
850
851 #[test]
852 fn test_parse_type_aliases() {
853 let sdl = r#"
854 index test {
855 field a: string [indexed]
856 field b: int [indexed]
857 field c: uint [indexed]
858 field d: float [indexed]
859 field e: binary [stored]
860 }
861 "#;
862
863 let indexes = parse_sdl(sdl).unwrap();
864 let index = &indexes[0];
865
866 assert!(matches!(index.fields[0].field_type, FieldType::Text));
867 assert!(matches!(index.fields[1].field_type, FieldType::I64));
868 assert!(matches!(index.fields[2].field_type, FieldType::U64));
869 assert!(matches!(index.fields[3].field_type, FieldType::F64));
870 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
871 }
872
873 #[test]
874 fn test_to_schema() {
875 let sdl = r#"
876 index articles {
877 field title: text [indexed, stored]
878 field views: u64 [indexed, stored]
879 }
880 "#;
881
882 let indexes = parse_sdl(sdl).unwrap();
883 let schema = indexes[0].to_schema();
884
885 assert!(schema.get_field("title").is_some());
886 assert!(schema.get_field("views").is_some());
887 assert!(schema.get_field("nonexistent").is_none());
888 }
889
890 #[test]
891 fn test_default_attributes() {
892 let sdl = r#"
893 index test {
894 field title: text
895 }
896 "#;
897
898 let indexes = parse_sdl(sdl).unwrap();
899 let field = &indexes[0].fields[0];
900
901 assert!(field.indexed);
903 assert!(field.stored);
904 }
905
906 #[test]
907 fn test_multiple_indexes() {
908 let sdl = r#"
909 index articles {
910 field title: text [indexed, stored]
911 }
912
913 index users {
914 field name: text [indexed, stored]
915 field email: text [indexed, stored]
916 }
917 "#;
918
919 let indexes = parse_sdl(sdl).unwrap();
920 assert_eq!(indexes.len(), 2);
921 assert_eq!(indexes[0].name, "articles");
922 assert_eq!(indexes[1].name, "users");
923 }
924
925 #[test]
926 fn test_tokenizer_spec() {
927 let sdl = r#"
928 index articles {
929 field title: text<en_stem> [indexed, stored]
930 field body: text<default> [indexed]
931 field author: text [indexed, stored]
932 }
933 "#;
934
935 let indexes = parse_sdl(sdl).unwrap();
936 let index = &indexes[0];
937
938 assert_eq!(index.fields[0].name, "title");
939 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
940
941 assert_eq!(index.fields[1].name, "body");
942 assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
943
944 assert_eq!(index.fields[2].name, "author");
945 assert_eq!(index.fields[2].tokenizer, None); }
947
948 #[test]
949 fn test_tokenizer_in_schema() {
950 let sdl = r#"
951 index articles {
952 field title: text<german> [indexed, stored]
953 field body: text<en_stem> [indexed]
954 }
955 "#;
956
957 let indexes = parse_sdl(sdl).unwrap();
958 let schema = indexes[0].to_schema();
959
960 let title_field = schema.get_field("title").unwrap();
961 let title_entry = schema.get_field_entry(title_field).unwrap();
962 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
963
964 let body_field = schema.get_field("body").unwrap();
965 let body_entry = schema.get_field_entry(body_field).unwrap();
966 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
967 }
968
969 #[test]
970 fn test_query_router_basic() {
971 let sdl = r#"
972 index documents {
973 field title: text [indexed, stored]
974 field uri: text [indexed, stored]
975
976 query_router {
977 pattern: "10\\.\\d{4,}/[^\\s]+"
978 substitution: "doi://{0}"
979 target_field: uris
980 mode: exclusive
981 }
982 }
983 "#;
984
985 let indexes = parse_sdl(sdl).unwrap();
986 let index = &indexes[0];
987
988 assert_eq!(index.query_routers.len(), 1);
989 let router = &index.query_routers[0];
990 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
991 assert_eq!(router.substitution, "doi://{0}");
992 assert_eq!(router.target_field, "uris");
993 assert_eq!(router.mode, RoutingMode::Exclusive);
994 }
995
996 #[test]
997 fn test_query_router_raw_string() {
998 let sdl = r#"
999 index documents {
1000 field uris: text [indexed, stored]
1001
1002 query_router {
1003 pattern: r"^pmid:(\d+)$"
1004 substitution: "pubmed://{1}"
1005 target_field: uris
1006 mode: additional
1007 }
1008 }
1009 "#;
1010
1011 let indexes = parse_sdl(sdl).unwrap();
1012 let router = &indexes[0].query_routers[0];
1013
1014 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1015 assert_eq!(router.substitution, "pubmed://{1}");
1016 assert_eq!(router.mode, RoutingMode::Additional);
1017 }
1018
1019 #[test]
1020 fn test_multiple_query_routers() {
1021 let sdl = r#"
1022 index documents {
1023 field uris: text [indexed, stored]
1024
1025 query_router {
1026 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1027 substitution: "doi://{1}"
1028 target_field: uris
1029 mode: exclusive
1030 }
1031
1032 query_router {
1033 pattern: r"^pmid:(\d+)$"
1034 substitution: "pubmed://{1}"
1035 target_field: uris
1036 mode: exclusive
1037 }
1038
1039 query_router {
1040 pattern: r"^arxiv:(\d+\.\d+)$"
1041 substitution: "arxiv://{1}"
1042 target_field: uris
1043 mode: additional
1044 }
1045 }
1046 "#;
1047
1048 let indexes = parse_sdl(sdl).unwrap();
1049 assert_eq!(indexes[0].query_routers.len(), 3);
1050 }
1051
1052 #[test]
1053 fn test_query_router_default_mode() {
1054 let sdl = r#"
1055 index documents {
1056 field uris: text [indexed, stored]
1057
1058 query_router {
1059 pattern: r"test"
1060 substitution: "{0}"
1061 target_field: uris
1062 }
1063 }
1064 "#;
1065
1066 let indexes = parse_sdl(sdl).unwrap();
1067 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1069 }
1070
1071 #[test]
1072 fn test_multi_attribute() {
1073 let sdl = r#"
1074 index documents {
1075 field uris: text [indexed, stored<multi>]
1076 field title: text [indexed, stored]
1077 }
1078 "#;
1079
1080 let indexes = parse_sdl(sdl).unwrap();
1081 assert_eq!(indexes.len(), 1);
1082
1083 let fields = &indexes[0].fields;
1084 assert_eq!(fields.len(), 2);
1085
1086 assert_eq!(fields[0].name, "uris");
1088 assert!(fields[0].multi, "uris field should have multi=true");
1089
1090 assert_eq!(fields[1].name, "title");
1092 assert!(!fields[1].multi, "title field should have multi=false");
1093
1094 let schema = indexes[0].to_schema();
1096 let uris_field = schema.get_field("uris").unwrap();
1097 let title_field = schema.get_field("title").unwrap();
1098
1099 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1100 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1101 }
1102
1103 #[test]
1104 fn test_sparse_vector_field() {
1105 let sdl = r#"
1106 index documents {
1107 field embedding: sparse_vector [indexed, stored]
1108 }
1109 "#;
1110
1111 let indexes = parse_sdl(sdl).unwrap();
1112 assert_eq!(indexes.len(), 1);
1113 assert_eq!(indexes[0].fields.len(), 1);
1114 assert_eq!(indexes[0].fields[0].name, "embedding");
1115 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1116 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1117 }
1118
1119 #[test]
1120 fn test_sparse_vector_with_config() {
1121 let sdl = r#"
1122 index documents {
1123 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1124 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1125 }
1126 "#;
1127
1128 let indexes = parse_sdl(sdl).unwrap();
1129 assert_eq!(indexes[0].fields.len(), 2);
1130
1131 let f1 = &indexes[0].fields[0];
1133 assert_eq!(f1.name, "embedding");
1134 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1135 assert_eq!(config1.index_size, IndexSize::U16);
1136 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1137
1138 let f2 = &indexes[0].fields[1];
1140 assert_eq!(f2.name, "dense");
1141 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1142 assert_eq!(config2.index_size, IndexSize::U32);
1143 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1144 }
1145
1146 #[test]
1147 fn test_sparse_vector_with_weight_threshold() {
1148 let sdl = r#"
1149 index documents {
1150 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1151 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1152 }
1153 "#;
1154
1155 let indexes = parse_sdl(sdl).unwrap();
1156 assert_eq!(indexes[0].fields.len(), 2);
1157
1158 let f1 = &indexes[0].fields[0];
1160 assert_eq!(f1.name, "embedding");
1161 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1162 assert_eq!(config1.index_size, IndexSize::U16);
1163 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1164 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1165
1166 let f2 = &indexes[0].fields[1];
1168 assert_eq!(f2.name, "embedding2");
1169 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1170 assert_eq!(config2.index_size, IndexSize::U32);
1171 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1172 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1173 }
1174
1175 #[test]
1176 fn test_dense_vector_field() {
1177 let sdl = r#"
1178 index documents {
1179 field embedding: dense_vector<768> [indexed, stored]
1180 }
1181 "#;
1182
1183 let indexes = parse_sdl(sdl).unwrap();
1184 assert_eq!(indexes.len(), 1);
1185 assert_eq!(indexes[0].fields.len(), 1);
1186
1187 let f = &indexes[0].fields[0];
1188 assert_eq!(f.name, "embedding");
1189 assert_eq!(f.field_type, FieldType::DenseVector);
1190
1191 let config = f.dense_vector_config.as_ref().unwrap();
1192 assert_eq!(config.dim, 768);
1193 }
1194
1195 #[test]
1196 fn test_dense_vector_alias() {
1197 let sdl = r#"
1198 index documents {
1199 field embedding: vector<1536> [indexed]
1200 }
1201 "#;
1202
1203 let indexes = parse_sdl(sdl).unwrap();
1204 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1205 assert_eq!(
1206 indexes[0].fields[0]
1207 .dense_vector_config
1208 .as_ref()
1209 .unwrap()
1210 .dim,
1211 1536
1212 );
1213 }
1214
1215 #[test]
1216 fn test_dense_vector_with_centroids() {
1217 let sdl = r#"
1218 index documents {
1219 field embedding: dense_vector<768> [indexed<centroids: "centroids.bin">, stored]
1220 }
1221 "#;
1222
1223 let indexes = parse_sdl(sdl).unwrap();
1224 assert_eq!(indexes.len(), 1);
1225
1226 let f = &indexes[0].fields[0];
1227 assert_eq!(f.name, "embedding");
1228 assert_eq!(f.field_type, FieldType::DenseVector);
1229
1230 let config = f.dense_vector_config.as_ref().unwrap();
1231 assert_eq!(config.dim, 768);
1232 assert_eq!(
1233 config.coarse_centroids_path.as_deref(),
1234 Some("centroids.bin")
1235 );
1236 assert_eq!(config.nprobe, 32); }
1238
1239 #[test]
1240 fn test_dense_vector_with_centroids_and_nprobe() {
1241 let sdl = r#"
1242 index documents {
1243 field embedding: dense_vector<1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1244 }
1245 "#;
1246
1247 let indexes = parse_sdl(sdl).unwrap();
1248 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1249
1250 assert_eq!(config.dim, 1536);
1251 assert_eq!(
1252 config.coarse_centroids_path.as_deref(),
1253 Some("/path/to/centroids.bin")
1254 );
1255 assert_eq!(config.nprobe, 64);
1256 }
1257
1258 #[test]
1259 fn test_dense_vector_keyword_syntax() {
1260 let sdl = r#"
1261 index documents {
1262 field embedding: dense_vector<dims: 1536> [indexed, stored]
1263 }
1264 "#;
1265
1266 let indexes = parse_sdl(sdl).unwrap();
1267 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1268
1269 assert_eq!(config.dim, 1536);
1270 assert!(config.coarse_centroids_path.is_none());
1271 }
1272
1273 #[test]
1274 fn test_dense_vector_keyword_syntax_full() {
1275 let sdl = r#"
1276 index documents {
1277 field embedding: dense_vector<dims: 1536> [indexed<centroids: "/path/to/centroids.bin", nprobe: 64>]
1278 }
1279 "#;
1280
1281 let indexes = parse_sdl(sdl).unwrap();
1282 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1283
1284 assert_eq!(config.dim, 1536);
1285 assert_eq!(
1286 config.coarse_centroids_path.as_deref(),
1287 Some("/path/to/centroids.bin")
1288 );
1289 assert_eq!(config.nprobe, 64);
1290 }
1291
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    // Only `centroids` is supplied; `nprobe` is left out of the SDL.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<centroids: "centroids.bin">]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);

    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));

    // With `nprobe` omitted the test expects the value 32.
    assert_eq!(dense.nprobe, 32);
}
1310
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    // `scann` index selector with centroids, a PQ codebook and an explicit nprobe.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, centroids: "centroids.bin", codebook: "pq_codebook.bin", nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);

    // Both file paths are carried through verbatim.
    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));
    let codebook = dense.pq_codebook_path.as_deref();
    assert_eq!(codebook, Some("pq_codebook.bin"));

    assert_eq!(dense.nprobe, 64);
}
1333
#[test]
fn test_dense_vector_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    // `rabitq` index selector with centroids but without a codebook.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<rabitq, centroids: "centroids.bin">]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0]
        .dense_vector_config
        .as_ref()
        .unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);

    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));

    // No `codebook` option was written, so none is expected in the config.
    assert!(dense.pq_codebook_path.is_none());
}
1355
#[test]
fn test_dense_vector_rabitq_no_centroids() {
    use crate::dsl::schema::VectorIndexType;

    // `rabitq` selector on its own, with no further index options.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert!(dense.coarse_centroids_path.is_none());
}
1373
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    // Bare `indexed` with no selector: the test expects `RaBitQ` as the index type.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
1391
#[test]
fn test_dense_vector_mrl_dim() {
    // `mrl_dim` smaller than the declared vector dimension.
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense = parsed[0].fields[0]
        .dense_vector_config
        .as_ref()
        .unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.mrl_dim, Some(256));
    // `index_dim()` is expected to report the `mrl_dim` value here.
    assert_eq!(dense.index_dim(), 256);
}
1408
#[test]
fn test_dense_vector_mrl_dim_with_centroids() {
    // `mrl_dim` combined with the `centroids` and `nprobe` options.
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<centroids: "centroids.bin", nprobe: 64, mrl_dim: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    // Dimension handling.
    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, Some(128));
    assert_eq!(dense.index_dim(), 128);

    // Remaining index options.
    let centroids = dense.coarse_centroids_path.as_deref();
    assert_eq!(centroids, Some("centroids.bin"));
    assert_eq!(dense.nprobe, 64);
}
1430
#[test]
fn test_dense_vector_no_mrl_dim() {
    // No `mrl_dim` option: `index_dim()` is expected to equal the full dimension.
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, None);
    assert_eq!(dense.index_dim(), 768);
}
1447
#[test]
fn test_json_field_type() {
    // One text field followed by two JSON fields, the second with no attributes.
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];

    assert_eq!(index.fields.len(), 3);

    // `metadata` is a stored JSON field.
    let metadata = &index.fields[1];
    assert_eq!(metadata.name, "metadata");
    assert!(matches!(metadata.field_type, FieldType::Json));
    assert!(metadata.stored);

    // `extra` is declared without an attribute list and is still JSON.
    let extra = &index.fields[2];
    assert_eq!(extra.name, "extra");
    assert!(matches!(extra.field_type, FieldType::Json));

    // After conversion to a schema, `metadata` is expected to be stored but not indexed.
    let schema = index.to_schema();
    let metadata_field = schema.get_field("metadata").unwrap();
    let entry = schema.get_field_entry(metadata_field).unwrap();
    assert_eq!(entry.field_type, FieldType::Json);
    assert!(!entry.indexed);
    assert!(entry.stored);
}
1481
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    // Tokenizer name written in the SDL below; checked after parsing and
    // again after conversion to a schema.
    const TOKENIZER: &str = "Alibaba-NLP/gte-Qwen2-1.5B-instruct";

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];

    assert_eq!(index.fields.len(), 1);
    let field = &index.fields[0];
    assert_eq!(field.name, "embedding");
    assert!(matches!(field.field_type, FieldType::SparseVector));

    // Index-time settings.
    let sparse = field.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse.index_size, IndexSize::U16);
    assert_eq!(sparse.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings from the nested `query<...>` section.
    let parsed_query = sparse.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.tokenizer.as_deref(), Some(TOKENIZER));
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same query settings are expected on the schema entry built from the index.
    let schema = index.to_schema();
    let embedding_field = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(embedding_field).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.tokenizer.as_deref(), Some(TOKENIZER));
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1526
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    // `query<...>` section that sets only `weighting: one` and no tokenizer.
    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let sparse = field.sparse_vector_config.as_ref().unwrap();
    let query = sparse.query_config.as_ref().unwrap();

    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1544}