1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Pest parser for the schema definition language (SDL).
///
/// The grammar is loaded from `dsl/sdl/sdl.pest`. The derive also generates the
/// `Rule` enum that the parsing functions in this module match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// A single `field` declaration parsed from SDL.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Field type resolved from the SDL type keyword (or one of its aliases).
    pub field_type: FieldType,
    /// Whether the field is indexed. Defaults to `true` when the field has no
    /// attribute list at all.
    pub indexed: bool,
    /// Whether the field is stored. Defaults to `true` when the field has no
    /// attribute list at all.
    pub stored: bool,
    /// Tokenizer name given in the SDL, if any. `to_schema` falls back to
    /// `"default"` for text fields when this is `None`.
    pub tokenizer: Option<String>,
    /// Multi-value flag; set when the `stored<...>` attribute form is used
    /// (e.g. `stored<multi>`).
    pub multi: bool,
    /// Position mode requested through the `indexed<...>` configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings, present when the SDL supplied a sparse vector
    /// spec or an `indexed<...>` configuration on a sparse vector field.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings. `to_schema` panics for a `DenseVector` field
    /// when this is `None`.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// A single `index` block parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source.
    pub name: String,
    /// Field declarations, in the order they appear in the SDL.
    pub fields: Vec<FieldDef>,
    /// Field names listed by the index's default-fields declaration; empty when
    /// none was given.
    pub default_fields: Vec<String>,
    /// Rules parsed from `query_router { ... }` blocks, in declaration order.
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95 pub fn to_schema(&self) -> Schema {
97 let mut builder = SchemaBuilder::default();
98
99 for field in &self.fields {
100 let f = match field.field_type {
101 FieldType::Text => {
102 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103 builder.add_text_field_with_tokenizer(
104 &field.name,
105 field.indexed,
106 field.stored,
107 tokenizer,
108 )
109 }
110 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114 FieldType::Json => builder.add_json_field(&field.name, field.stored),
115 FieldType::SparseVector => {
116 if let Some(config) = &field.sparse_vector_config {
117 builder.add_sparse_vector_field_with_config(
118 &field.name,
119 field.indexed,
120 field.stored,
121 config.clone(),
122 )
123 } else {
124 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125 }
126 }
127 FieldType::DenseVector => {
128 let config = field
130 .dense_vector_config
131 .as_ref()
132 .expect("DenseVector field requires dimension to be specified");
133 builder.add_dense_vector_field_with_config(
134 &field.name,
135 field.indexed,
136 field.stored,
137 config.clone(),
138 )
139 }
140 };
141 if field.multi {
142 builder.set_multi(f, true);
143 }
144 let positions = field.positions.or({
146 if field.multi
148 && matches!(
149 field.field_type,
150 FieldType::SparseVector | FieldType::DenseVector
151 )
152 {
153 Some(super::schema::PositionMode::Ordinal)
154 } else {
155 None
156 }
157 });
158 if let Some(mode) = positions {
159 builder.set_positions(f, mode);
160 }
161 }
162
163 if !self.default_fields.is_empty() {
165 builder.set_default_fields(self.default_fields.clone());
166 }
167
168 if !self.query_routers.is_empty() {
170 builder.set_query_routers(self.query_routers.clone());
171 }
172
173 builder.build()
174 }
175
176 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181 if self.query_routers.is_empty() {
182 return Ok(None);
183 }
184
185 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186 .map(Some)
187 .map_err(Error::Schema)
188 }
189}
190
191fn parse_field_type(type_str: &str) -> Result<FieldType> {
193 match type_str {
194 "text" | "string" | "str" => Ok(FieldType::Text),
195 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196 "i64" | "int" | "integer" => Ok(FieldType::I64),
197 "f64" | "float" | "double" => Ok(FieldType::F64),
198 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199 "json" => Ok(FieldType::Json),
200 "sparse_vector" => Ok(FieldType::SparseVector),
201 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203 }
204}
205
/// Options collected from an `indexed<...>` attribute.
///
/// Every option is `None` unless the SDL set it explicitly. The values are later
/// merged into a dense or sparse vector config by the `apply_index_config_to_*`
/// functions; `positions` is copied onto the field definition itself.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type (`flat`, `rabitq`, `ivf_rabitq`, `scann`).
    index_type: Option<super::schema::VectorIndexType>,
    /// Value of the `num_clusters_kwarg` rule (applied to dense vector configs).
    num_clusters: Option<usize>,
    /// Value of the `nprobe_kwarg` rule (applied to dense vector configs).
    nprobe: Option<usize>,
    /// Value of the `mrl_dim_kwarg` rule (applied to dense vector configs).
    mrl_dim: Option<usize>,
    /// Value of the `build_threshold_kwarg` rule (applied to dense vector configs).
    build_threshold: Option<usize>,
    /// Weight quantization (applied to sparse vector configs).
    quantization: Option<WeightQuantization>,
    /// Value of the `weight_threshold_kwarg` rule (applied to sparse vector configs).
    weight_threshold: Option<f32>,
    /// Requested block size; rounded up to a power of two when applied to a
    /// sparse vector config.
    block_size: Option<usize>,
    /// Requested pruning value; clamped to `[0.0, 1.0]` when applied to a
    /// sparse vector config.
    posting_list_pruning: Option<f32>,
    /// Tokenizer named inside the query config block.
    query_tokenizer: Option<String>,
    /// Query weighting (`one` or `idf`) named inside the query config block.
    query_weighting: Option<QueryWeighting>,
    /// Position mode requested for the field.
    positions: Option<super::schema::PositionMode>,
}
225
226fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
231 let mut indexed = false;
232 let mut stored = false;
233 let mut multi = false;
234 let mut index_config = None;
235
236 for attr in pair.into_inner() {
237 if attr.as_rule() == Rule::attribute {
238 let mut found_config = false;
240 for inner in attr.clone().into_inner() {
241 match inner.as_rule() {
242 Rule::indexed_with_config => {
243 indexed = true;
244 index_config = Some(parse_index_config(inner));
245 found_config = true;
246 break;
247 }
248 Rule::stored_with_config => {
249 stored = true;
250 multi = true; found_config = true;
252 break;
253 }
254 _ => {}
255 }
256 }
257 if !found_config {
258 match attr.as_str() {
260 "indexed" => indexed = true,
261 "stored" => stored = true,
262 _ => {}
263 }
264 }
265 }
266 }
267
268 (indexed, stored, multi, index_config)
269}
270
271fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
273 let mut config = IndexConfig::default();
274
275 for inner in pair.into_inner() {
280 if inner.as_rule() == Rule::index_config_params {
281 for param in inner.into_inner() {
282 if param.as_rule() == Rule::index_config_param {
283 for p in param.into_inner() {
284 parse_single_index_config_param(&mut config, p);
285 }
286 }
287 }
288 }
289 }
290
291 config
292}
293
294fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
296 use super::schema::VectorIndexType;
297
298 match p.as_rule() {
299 Rule::index_type_spec => {
300 config.index_type = Some(match p.as_str() {
301 "flat" => VectorIndexType::Flat,
302 "rabitq" => VectorIndexType::RaBitQ,
303 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
304 "scann" => VectorIndexType::ScaNN,
305 _ => VectorIndexType::RaBitQ,
306 });
307 }
308 Rule::index_type_kwarg => {
309 if let Some(t) = p.into_inner().next() {
311 config.index_type = Some(match t.as_str() {
312 "flat" => VectorIndexType::Flat,
313 "rabitq" => VectorIndexType::RaBitQ,
314 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
315 "scann" => VectorIndexType::ScaNN,
316 _ => VectorIndexType::RaBitQ,
317 });
318 }
319 }
320 Rule::num_clusters_kwarg => {
321 if let Some(n) = p.into_inner().next() {
323 config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
324 }
325 }
326 Rule::build_threshold_kwarg => {
327 if let Some(n) = p.into_inner().next() {
329 config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
330 }
331 }
332 Rule::nprobe_kwarg => {
333 if let Some(n) = p.into_inner().next() {
335 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
336 }
337 }
338 Rule::mrl_dim_kwarg => {
339 if let Some(n) = p.into_inner().next() {
341 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
342 }
343 }
344 Rule::quantization_kwarg => {
345 if let Some(q) = p.into_inner().next() {
347 config.quantization = Some(match q.as_str() {
348 "float32" | "f32" => WeightQuantization::Float32,
349 "float16" | "f16" => WeightQuantization::Float16,
350 "uint8" | "u8" => WeightQuantization::UInt8,
351 "uint4" | "u4" => WeightQuantization::UInt4,
352 _ => WeightQuantization::default(),
353 });
354 }
355 }
356 Rule::weight_threshold_kwarg => {
357 if let Some(t) = p.into_inner().next() {
359 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
360 }
361 }
362 Rule::block_size_kwarg => {
363 if let Some(n) = p.into_inner().next() {
365 config.block_size = Some(n.as_str().parse().unwrap_or(128));
366 }
367 }
368 Rule::pruning_kwarg => {
369 if let Some(f) = p.into_inner().next() {
371 config.posting_list_pruning = Some(f.as_str().parse().unwrap_or(1.0));
372 }
373 }
374 Rule::query_config_block => {
375 parse_query_config_block(config, p);
377 }
378 Rule::positions_kwarg => {
379 use super::schema::PositionMode;
381 config.positions = Some(match p.as_str() {
382 "ordinal" => PositionMode::Ordinal,
383 "token_position" => PositionMode::TokenPosition,
384 _ => PositionMode::Full, });
386 }
387 _ => {}
388 }
389}
390
391fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
393 for inner in pair.into_inner() {
394 if inner.as_rule() == Rule::query_config_params {
395 for param in inner.into_inner() {
396 if param.as_rule() == Rule::query_config_param {
397 for p in param.into_inner() {
398 match p.as_rule() {
399 Rule::query_tokenizer_kwarg => {
400 if let Some(path) = p.into_inner().next()
402 && let Some(inner_path) = path.into_inner().next()
403 {
404 config.query_tokenizer = Some(inner_path.as_str().to_string());
405 }
406 }
407 Rule::query_weighting_kwarg => {
408 if let Some(w) = p.into_inner().next() {
410 config.query_weighting = Some(match w.as_str() {
411 "one" => QueryWeighting::One,
412 "idf" => QueryWeighting::Idf,
413 _ => QueryWeighting::One,
414 });
415 }
416 }
417 _ => {}
418 }
419 }
420 }
421 }
422 }
423 }
424}
425
426fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
428 let mut inner = pair.into_inner();
429
430 let name = inner
431 .next()
432 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
433 .as_str()
434 .to_string();
435
436 let field_type_str = inner
437 .next()
438 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
439 .as_str();
440
441 let field_type = parse_field_type(field_type_str)?;
442
443 let mut tokenizer = None;
445 let mut sparse_vector_config = None;
446 let mut dense_vector_config = None;
447 let mut indexed = true;
448 let mut stored = true;
449 let mut multi = false;
450 let mut index_config: Option<IndexConfig> = None;
451
452 for item in inner {
453 match item.as_rule() {
454 Rule::tokenizer_spec => {
455 if let Some(tok_name) = item.into_inner().next() {
457 tokenizer = Some(tok_name.as_str().to_string());
458 }
459 }
460 Rule::sparse_vector_config => {
461 sparse_vector_config = Some(parse_sparse_vector_config(item));
463 }
464 Rule::dense_vector_config => {
465 dense_vector_config = Some(parse_dense_vector_config(item));
467 }
468 Rule::attributes => {
469 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
470 indexed = idx;
471 stored = sto;
472 multi = mul;
473 index_config = idx_cfg;
474 }
475 _ => {}
476 }
477 }
478
479 let mut positions = None;
481 if let Some(idx_cfg) = index_config {
482 positions = idx_cfg.positions;
483 if let Some(ref mut dv_config) = dense_vector_config {
484 apply_index_config_to_dense_vector(dv_config, idx_cfg);
485 } else if field_type == FieldType::SparseVector {
486 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
488 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
489 }
490 }
491
492 Ok(FieldDef {
493 name,
494 field_type,
495 indexed,
496 stored,
497 tokenizer,
498 multi,
499 positions,
500 sparse_vector_config,
501 dense_vector_config,
502 })
503}
504
505fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
507 if let Some(index_type) = idx_cfg.index_type {
509 config.index_type = index_type;
510 }
511
512 if idx_cfg.num_clusters.is_some() {
514 config.num_clusters = idx_cfg.num_clusters;
515 }
516
517 if let Some(nprobe) = idx_cfg.nprobe {
519 config.nprobe = nprobe;
520 }
521
522 if idx_cfg.mrl_dim.is_some() {
524 config.mrl_dim = idx_cfg.mrl_dim;
525 }
526
527 if idx_cfg.build_threshold.is_some() {
529 config.build_threshold = idx_cfg.build_threshold;
530 }
531}
532
533fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
536 let mut index_size = IndexSize::default();
537
538 for inner in pair.into_inner() {
540 if inner.as_rule() == Rule::index_size_spec {
541 index_size = match inner.as_str() {
542 "u16" => IndexSize::U16,
543 "u32" => IndexSize::U32,
544 _ => IndexSize::default(),
545 };
546 }
547 }
548
549 SparseVectorConfig {
550 index_size,
551 weight_quantization: WeightQuantization::default(),
552 weight_threshold: 0.0,
553 block_size: 128,
554 posting_list_pruning: None,
555 query_config: None,
556 }
557}
558
559fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
561 if let Some(q) = idx_cfg.quantization {
562 config.weight_quantization = q;
563 }
564 if let Some(t) = idx_cfg.weight_threshold {
565 config.weight_threshold = t;
566 }
567 if let Some(bs) = idx_cfg.block_size {
568 let adjusted = bs.next_power_of_two();
569 if adjusted != bs {
570 log::warn!(
571 "block_size {} adjusted to next power of two: {}",
572 bs,
573 adjusted
574 );
575 }
576 config.block_size = adjusted;
577 }
578 if let Some(p) = idx_cfg.posting_list_pruning {
579 let clamped = p.clamp(0.0, 1.0);
580 if (clamped - p).abs() > f32::EPSILON {
581 log::warn!(
582 "pruning {} clamped to valid range [0.0, 1.0]: {}",
583 p,
584 clamped
585 );
586 }
587 config.posting_list_pruning = Some(clamped);
588 }
589 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
591 let query_config = config
592 .query_config
593 .get_or_insert(SparseQueryConfig::default());
594 if let Some(tokenizer) = idx_cfg.query_tokenizer {
595 query_config.tokenizer = Some(tokenizer);
596 }
597 if let Some(weighting) = idx_cfg.query_weighting {
598 query_config.weighting = weighting;
599 }
600 }
601}
602
603fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
606 let mut dim: usize = 0;
607
608 for params in pair.into_inner() {
610 if params.as_rule() == Rule::dense_vector_params {
611 for inner in params.into_inner() {
612 match inner.as_rule() {
613 Rule::dense_vector_keyword_params => {
614 for kwarg in inner.into_inner() {
616 if kwarg.as_rule() == Rule::dims_kwarg
617 && let Some(d) = kwarg.into_inner().next()
618 {
619 dim = d.as_str().parse().unwrap_or(0);
620 }
621 }
622 }
623 Rule::dense_vector_positional_params => {
624 if let Some(dim_pair) = inner.into_inner().next() {
626 dim = dim_pair.as_str().parse().unwrap_or(0);
627 }
628 }
629 _ => {}
630 }
631 }
632 }
633 }
634
635 DenseVectorConfig::new(dim)
636}
637
638fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
640 pair.into_inner().map(|p| p.as_str().to_string()).collect()
641}
642
643fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
645 let mut pattern = String::new();
646 let mut substitution = String::new();
647 let mut target_field = String::new();
648 let mut mode = RoutingMode::Additional;
649
650 for prop in pair.into_inner() {
651 if prop.as_rule() != Rule::query_router_prop {
652 continue;
653 }
654
655 for inner in prop.into_inner() {
656 match inner.as_rule() {
657 Rule::query_router_pattern => {
658 if let Some(regex_str) = inner.into_inner().next() {
659 pattern = parse_string_value(regex_str);
660 }
661 }
662 Rule::query_router_substitution => {
663 if let Some(quoted) = inner.into_inner().next() {
664 substitution = parse_string_value(quoted);
665 }
666 }
667 Rule::query_router_target => {
668 if let Some(ident) = inner.into_inner().next() {
669 target_field = ident.as_str().to_string();
670 }
671 }
672 Rule::query_router_mode => {
673 if let Some(mode_val) = inner.into_inner().next() {
674 mode = match mode_val.as_str() {
675 "exclusive" => RoutingMode::Exclusive,
676 "additional" => RoutingMode::Additional,
677 _ => RoutingMode::Additional,
678 };
679 }
680 }
681 _ => {}
682 }
683 }
684 }
685
686 if pattern.is_empty() {
687 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
688 }
689 if substitution.is_empty() {
690 return Err(Error::Schema(
691 "query_router missing 'substitution'".to_string(),
692 ));
693 }
694 if target_field.is_empty() {
695 return Err(Error::Schema(
696 "query_router missing 'target_field'".to_string(),
697 ));
698 }
699
700 Ok(QueryRouterRule {
701 pattern,
702 substitution,
703 target_field,
704 mode,
705 })
706}
707
708fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
710 let s = pair.as_str();
711 match pair.as_rule() {
712 Rule::regex_string => {
713 if let Some(inner) = pair.into_inner().next() {
715 parse_string_value(inner)
716 } else {
717 s.to_string()
718 }
719 }
720 Rule::raw_string => {
721 s[2..s.len() - 1].to_string()
723 }
724 Rule::quoted_string => {
725 let inner = &s[1..s.len() - 1];
727 inner
729 .replace("\\n", "\n")
730 .replace("\\t", "\t")
731 .replace("\\\"", "\"")
732 .replace("\\\\", "\\")
733 }
734 _ => s.to_string(),
735 }
736}
737
738fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
740 let mut inner = pair.into_inner();
741
742 let name = inner
743 .next()
744 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
745 .as_str()
746 .to_string();
747
748 let mut fields = Vec::new();
749 let mut default_fields = Vec::new();
750 let mut query_routers = Vec::new();
751
752 for item in inner {
753 match item.as_rule() {
754 Rule::field_def => {
755 fields.push(parse_field_def(item)?);
756 }
757 Rule::default_fields_def => {
758 default_fields = parse_default_fields_def(item);
759 }
760 Rule::query_router_def => {
761 query_routers.push(parse_query_router_def(item)?);
762 }
763 _ => {}
764 }
765 }
766
767 Ok(IndexDef {
768 name,
769 fields,
770 default_fields,
771 query_routers,
772 })
773}
774
775pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
777 let pairs = SdlParser::parse(Rule::file, input)
778 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
779
780 let mut indexes = Vec::new();
781
782 for pair in pairs {
783 if pair.as_rule() == Rule::file {
784 for inner in pair.into_inner() {
785 if inner.as_rule() == Rule::index_def {
786 indexes.push(parse_index_def(inner)?);
787 }
788 }
789 }
790 }
791
792 Ok(indexes)
793}
794
795pub fn parse_single_index(input: &str) -> Result<IndexDef> {
797 let indexes = parse_sdl(input)?;
798
799 if indexes.is_empty() {
800 return Err(Error::Schema("No index definition found".to_string()));
801 }
802
803 if indexes.len() > 1 {
804 return Err(Error::Schema(
805 "Multiple index definitions found, expected one".to_string(),
806 ));
807 }
808
809 Ok(indexes.into_iter().next().unwrap())
810}
811
812#[cfg(test)]
813mod tests {
814 use super::*;
815
816 #[test]
817 fn test_parse_simple_schema() {
818 let sdl = r#"
819 index articles {
820 field title: text [indexed, stored]
821 field body: text [indexed]
822 }
823 "#;
824
825 let indexes = parse_sdl(sdl).unwrap();
826 assert_eq!(indexes.len(), 1);
827
828 let index = &indexes[0];
829 assert_eq!(index.name, "articles");
830 assert_eq!(index.fields.len(), 2);
831
832 assert_eq!(index.fields[0].name, "title");
833 assert!(matches!(index.fields[0].field_type, FieldType::Text));
834 assert!(index.fields[0].indexed);
835 assert!(index.fields[0].stored);
836
837 assert_eq!(index.fields[1].name, "body");
838 assert!(matches!(index.fields[1].field_type, FieldType::Text));
839 assert!(index.fields[1].indexed);
840 assert!(!index.fields[1].stored);
841 }
842
843 #[test]
844 fn test_parse_all_field_types() {
845 let sdl = r#"
846 index test {
847 field text_field: text [indexed, stored]
848 field u64_field: u64 [indexed, stored]
849 field i64_field: i64 [indexed, stored]
850 field f64_field: f64 [indexed, stored]
851 field bytes_field: bytes [stored]
852 }
853 "#;
854
855 let indexes = parse_sdl(sdl).unwrap();
856 let index = &indexes[0];
857
858 assert!(matches!(index.fields[0].field_type, FieldType::Text));
859 assert!(matches!(index.fields[1].field_type, FieldType::U64));
860 assert!(matches!(index.fields[2].field_type, FieldType::I64));
861 assert!(matches!(index.fields[3].field_type, FieldType::F64));
862 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
863 }
864
865 #[test]
866 fn test_parse_with_comments() {
867 let sdl = r#"
868 # This is a comment
869 index articles {
870 # Title field
871 field title: text [indexed, stored]
872 field body: text [indexed] # inline comment not supported yet
873 }
874 "#;
875
876 let indexes = parse_sdl(sdl).unwrap();
877 assert_eq!(indexes[0].fields.len(), 2);
878 }
879
880 #[test]
881 fn test_parse_type_aliases() {
882 let sdl = r#"
883 index test {
884 field a: string [indexed]
885 field b: int [indexed]
886 field c: uint [indexed]
887 field d: float [indexed]
888 field e: binary [stored]
889 }
890 "#;
891
892 let indexes = parse_sdl(sdl).unwrap();
893 let index = &indexes[0];
894
895 assert!(matches!(index.fields[0].field_type, FieldType::Text));
896 assert!(matches!(index.fields[1].field_type, FieldType::I64));
897 assert!(matches!(index.fields[2].field_type, FieldType::U64));
898 assert!(matches!(index.fields[3].field_type, FieldType::F64));
899 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
900 }
901
902 #[test]
903 fn test_to_schema() {
904 let sdl = r#"
905 index articles {
906 field title: text [indexed, stored]
907 field views: u64 [indexed, stored]
908 }
909 "#;
910
911 let indexes = parse_sdl(sdl).unwrap();
912 let schema = indexes[0].to_schema();
913
914 assert!(schema.get_field("title").is_some());
915 assert!(schema.get_field("views").is_some());
916 assert!(schema.get_field("nonexistent").is_none());
917 }
918
919 #[test]
920 fn test_default_attributes() {
921 let sdl = r#"
922 index test {
923 field title: text
924 }
925 "#;
926
927 let indexes = parse_sdl(sdl).unwrap();
928 let field = &indexes[0].fields[0];
929
930 assert!(field.indexed);
932 assert!(field.stored);
933 }
934
935 #[test]
936 fn test_multiple_indexes() {
937 let sdl = r#"
938 index articles {
939 field title: text [indexed, stored]
940 }
941
942 index users {
943 field name: text [indexed, stored]
944 field email: text [indexed, stored]
945 }
946 "#;
947
948 let indexes = parse_sdl(sdl).unwrap();
949 assert_eq!(indexes.len(), 2);
950 assert_eq!(indexes[0].name, "articles");
951 assert_eq!(indexes[1].name, "users");
952 }
953
954 #[test]
955 fn test_tokenizer_spec() {
956 let sdl = r#"
957 index articles {
958 field title: text<en_stem> [indexed, stored]
959 field body: text<default> [indexed]
960 field author: text [indexed, stored]
961 }
962 "#;
963
964 let indexes = parse_sdl(sdl).unwrap();
965 let index = &indexes[0];
966
967 assert_eq!(index.fields[0].name, "title");
968 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
969
970 assert_eq!(index.fields[1].name, "body");
971 assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
972
973 assert_eq!(index.fields[2].name, "author");
974 assert_eq!(index.fields[2].tokenizer, None); }
976
977 #[test]
978 fn test_tokenizer_in_schema() {
979 let sdl = r#"
980 index articles {
981 field title: text<german> [indexed, stored]
982 field body: text<en_stem> [indexed]
983 }
984 "#;
985
986 let indexes = parse_sdl(sdl).unwrap();
987 let schema = indexes[0].to_schema();
988
989 let title_field = schema.get_field("title").unwrap();
990 let title_entry = schema.get_field_entry(title_field).unwrap();
991 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
992
993 let body_field = schema.get_field("body").unwrap();
994 let body_entry = schema.get_field_entry(body_field).unwrap();
995 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
996 }
997
998 #[test]
999 fn test_query_router_basic() {
1000 let sdl = r#"
1001 index documents {
1002 field title: text [indexed, stored]
1003 field uri: text [indexed, stored]
1004
1005 query_router {
1006 pattern: "10\\.\\d{4,}/[^\\s]+"
1007 substitution: "doi://{0}"
1008 target_field: uris
1009 mode: exclusive
1010 }
1011 }
1012 "#;
1013
1014 let indexes = parse_sdl(sdl).unwrap();
1015 let index = &indexes[0];
1016
1017 assert_eq!(index.query_routers.len(), 1);
1018 let router = &index.query_routers[0];
1019 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1020 assert_eq!(router.substitution, "doi://{0}");
1021 assert_eq!(router.target_field, "uris");
1022 assert_eq!(router.mode, RoutingMode::Exclusive);
1023 }
1024
1025 #[test]
1026 fn test_query_router_raw_string() {
1027 let sdl = r#"
1028 index documents {
1029 field uris: text [indexed, stored]
1030
1031 query_router {
1032 pattern: r"^pmid:(\d+)$"
1033 substitution: "pubmed://{1}"
1034 target_field: uris
1035 mode: additional
1036 }
1037 }
1038 "#;
1039
1040 let indexes = parse_sdl(sdl).unwrap();
1041 let router = &indexes[0].query_routers[0];
1042
1043 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1044 assert_eq!(router.substitution, "pubmed://{1}");
1045 assert_eq!(router.mode, RoutingMode::Additional);
1046 }
1047
1048 #[test]
1049 fn test_multiple_query_routers() {
1050 let sdl = r#"
1051 index documents {
1052 field uris: text [indexed, stored]
1053
1054 query_router {
1055 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1056 substitution: "doi://{1}"
1057 target_field: uris
1058 mode: exclusive
1059 }
1060
1061 query_router {
1062 pattern: r"^pmid:(\d+)$"
1063 substitution: "pubmed://{1}"
1064 target_field: uris
1065 mode: exclusive
1066 }
1067
1068 query_router {
1069 pattern: r"^arxiv:(\d+\.\d+)$"
1070 substitution: "arxiv://{1}"
1071 target_field: uris
1072 mode: additional
1073 }
1074 }
1075 "#;
1076
1077 let indexes = parse_sdl(sdl).unwrap();
1078 assert_eq!(indexes[0].query_routers.len(), 3);
1079 }
1080
1081 #[test]
1082 fn test_query_router_default_mode() {
1083 let sdl = r#"
1084 index documents {
1085 field uris: text [indexed, stored]
1086
1087 query_router {
1088 pattern: r"test"
1089 substitution: "{0}"
1090 target_field: uris
1091 }
1092 }
1093 "#;
1094
1095 let indexes = parse_sdl(sdl).unwrap();
1096 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1098 }
1099
1100 #[test]
1101 fn test_multi_attribute() {
1102 let sdl = r#"
1103 index documents {
1104 field uris: text [indexed, stored<multi>]
1105 field title: text [indexed, stored]
1106 }
1107 "#;
1108
1109 let indexes = parse_sdl(sdl).unwrap();
1110 assert_eq!(indexes.len(), 1);
1111
1112 let fields = &indexes[0].fields;
1113 assert_eq!(fields.len(), 2);
1114
1115 assert_eq!(fields[0].name, "uris");
1117 assert!(fields[0].multi, "uris field should have multi=true");
1118
1119 assert_eq!(fields[1].name, "title");
1121 assert!(!fields[1].multi, "title field should have multi=false");
1122
1123 let schema = indexes[0].to_schema();
1125 let uris_field = schema.get_field("uris").unwrap();
1126 let title_field = schema.get_field("title").unwrap();
1127
1128 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1129 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1130 }
1131
1132 #[test]
1133 fn test_sparse_vector_field() {
1134 let sdl = r#"
1135 index documents {
1136 field embedding: sparse_vector [indexed, stored]
1137 }
1138 "#;
1139
1140 let indexes = parse_sdl(sdl).unwrap();
1141 assert_eq!(indexes.len(), 1);
1142 assert_eq!(indexes[0].fields.len(), 1);
1143 assert_eq!(indexes[0].fields[0].name, "embedding");
1144 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1145 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1146 }
1147
1148 #[test]
1149 fn test_sparse_vector_with_config() {
1150 let sdl = r#"
1151 index documents {
1152 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1153 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1154 }
1155 "#;
1156
1157 let indexes = parse_sdl(sdl).unwrap();
1158 assert_eq!(indexes[0].fields.len(), 2);
1159
1160 let f1 = &indexes[0].fields[0];
1162 assert_eq!(f1.name, "embedding");
1163 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1164 assert_eq!(config1.index_size, IndexSize::U16);
1165 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1166
1167 let f2 = &indexes[0].fields[1];
1169 assert_eq!(f2.name, "dense");
1170 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1171 assert_eq!(config2.index_size, IndexSize::U32);
1172 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1173 }
1174
1175 #[test]
1176 fn test_sparse_vector_with_weight_threshold() {
1177 let sdl = r#"
1178 index documents {
1179 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1180 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1181 }
1182 "#;
1183
1184 let indexes = parse_sdl(sdl).unwrap();
1185 assert_eq!(indexes[0].fields.len(), 2);
1186
1187 let f1 = &indexes[0].fields[0];
1189 assert_eq!(f1.name, "embedding");
1190 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1191 assert_eq!(config1.index_size, IndexSize::U16);
1192 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1193 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1194
1195 let f2 = &indexes[0].fields[1];
1197 assert_eq!(f2.name, "embedding2");
1198 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1199 assert_eq!(config2.index_size, IndexSize::U32);
1200 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1201 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1202 }
1203
1204 #[test]
1205 fn test_sparse_vector_with_pruning() {
1206 let sdl = r#"
1207 index documents {
1208 field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1209 }
1210 "#;
1211
1212 let indexes = parse_sdl(sdl).unwrap();
1213 let f = &indexes[0].fields[0];
1214 assert_eq!(f.name, "embedding");
1215 let config = f.sparse_vector_config.as_ref().unwrap();
1216 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1217 assert_eq!(config.posting_list_pruning, Some(0.1));
1218 }
1219
1220 #[test]
1221 fn test_dense_vector_field() {
1222 let sdl = r#"
1223 index documents {
1224 field embedding: dense_vector<768> [indexed, stored]
1225 }
1226 "#;
1227
1228 let indexes = parse_sdl(sdl).unwrap();
1229 assert_eq!(indexes.len(), 1);
1230 assert_eq!(indexes[0].fields.len(), 1);
1231
1232 let f = &indexes[0].fields[0];
1233 assert_eq!(f.name, "embedding");
1234 assert_eq!(f.field_type, FieldType::DenseVector);
1235
1236 let config = f.dense_vector_config.as_ref().unwrap();
1237 assert_eq!(config.dim, 768);
1238 }
1239
1240 #[test]
1241 fn test_dense_vector_alias() {
1242 let sdl = r#"
1243 index documents {
1244 field embedding: vector<1536> [indexed]
1245 }
1246 "#;
1247
1248 let indexes = parse_sdl(sdl).unwrap();
1249 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1250 assert_eq!(
1251 indexes[0].fields[0]
1252 .dense_vector_config
1253 .as_ref()
1254 .unwrap()
1255 .dim,
1256 1536
1257 );
1258 }
1259
1260 #[test]
1261 fn test_dense_vector_with_num_clusters() {
1262 let sdl = r#"
1263 index documents {
1264 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1265 }
1266 "#;
1267
1268 let indexes = parse_sdl(sdl).unwrap();
1269 assert_eq!(indexes.len(), 1);
1270
1271 let f = &indexes[0].fields[0];
1272 assert_eq!(f.name, "embedding");
1273 assert_eq!(f.field_type, FieldType::DenseVector);
1274
1275 let config = f.dense_vector_config.as_ref().unwrap();
1276 assert_eq!(config.dim, 768);
1277 assert_eq!(config.num_clusters, Some(256));
1278 assert_eq!(config.nprobe, 32); }
1280
1281 #[test]
1282 fn test_dense_vector_with_num_clusters_and_nprobe() {
1283 let sdl = r#"
1284 index documents {
1285 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1286 }
1287 "#;
1288
1289 let indexes = parse_sdl(sdl).unwrap();
1290 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1291
1292 assert_eq!(config.dim, 1536);
1293 assert_eq!(config.num_clusters, Some(512));
1294 assert_eq!(config.nprobe, 64);
1295 }
1296
#[test]
/// The keyword form `dense_vector<dims: N>` sets the dimension; with a bare
/// `indexed` attribute no cluster count is expected.
fn test_dense_vector_keyword_syntax() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, None);
}
1311
#[test]
/// Keyword dimension syntax combined with a fully specified
/// `indexed<ivf_rabitq, num_clusters, nprobe>` attribute.
fn test_dense_vector_keyword_syntax_full() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let first_index = &parsed[0];
    let dense = first_index.fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1327
#[test]
/// Keyword dimension syntax with only `num_clusters` supplied; `nprobe` is
/// expected to come back as 32.
fn test_dense_vector_keyword_syntax_partial() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(128));
    // `nprobe` was not specified in the SDL above.
    assert_eq!(dense.nprobe, 32);
}
1343
#[test]
/// `indexed<scann, …>` is expected to select `VectorIndexType::ScaNN` and keep
/// the supplied clustering options.
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1362
#[test]
/// `indexed<ivf_rabitq, …>` is expected to select `VectorIndexType::IvfRaBitQ`
/// and keep the supplied cluster count.
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense.num_clusters, Some(512));
}
1380
#[test]
/// `indexed<rabitq>` is expected to select `VectorIndexType::RaBitQ` and leave
/// the cluster count unset.
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
    assert_eq!(dense.num_clusters, None);
}
1398
#[test]
/// `indexed<flat>` is expected to select `VectorIndexType::Flat`.
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::Flat);
}
1415
#[test]
/// A bare `[indexed]` attribute names no index type; the resulting config is
/// expected to use `VectorIndexType::RaBitQ`.
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
1433
#[test]
/// `indexed<mrl_dim: N>` records `mrl_dim`, and `index_dim()` is expected to
/// report that value instead of the full dimension.
fn test_dense_vector_mrl_dim() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.mrl_dim, Some(256));
    assert_eq!(dense.index_dim(), 256);
}
1450
#[test]
/// `mrl_dim` may be combined with the clustering options in one `indexed<…>`
/// attribute; every option is expected to land in the config.
fn test_dense_vector_mrl_dim_with_num_clusters() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    // Dimension-related settings.
    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, Some(128));
    assert_eq!(dense.index_dim(), 128);

    // Clustering settings.
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1469
#[test]
/// Without an `mrl_dim` option the config holds no `mrl_dim`, and
/// `index_dim()` is expected to equal the declared dimension.
fn test_dense_vector_no_mrl_dim() {
    let sdl = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert!(dense.mrl_dim.is_none());
    assert_eq!(dense.index_dim(), 768);
}
1486
#[test]
/// Parses `json` fields (with and without an attribute list) and checks that
/// the schema built by `to_schema` keeps the `metadata` field as a stored,
/// non-indexed JSON entry.
fn test_json_field_type() {
    let sdl = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let index = &indexes[0];

    assert_eq!(index.fields.len(), 3);

    // `metadata`: JSON field declared with `[stored]`.
    // `assert_eq!` (rather than `assert!(matches!(..))`) prints the actual
    // field type on failure and matches the other field-type checks here.
    assert_eq!(index.fields[1].name, "metadata");
    assert_eq!(index.fields[1].field_type, FieldType::Json);
    assert!(index.fields[1].stored);

    // `extra`: JSON field declared without any attribute list.
    assert_eq!(index.fields[2].name, "extra");
    assert_eq!(index.fields[2].field_type, FieldType::Json);

    // Round-trip through the schema builder and inspect the `metadata` entry.
    let schema = index.to_schema();
    let metadata_field = schema.get_field("metadata").unwrap();
    let entry = schema.get_field_entry(metadata_field).unwrap();
    assert_eq!(entry.field_type, FieldType::Json);
    assert!(!entry.indexed);
    assert!(entry.stored);
}
1520
#[test]
/// Parses a sparse-vector field whose `indexed<…>` attribute carries a nested
/// `query<tokenizer, weighting>` section, and checks that the query config is
/// present both on the parsed field and on the schema entry from `to_schema`.
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let index = &indexes[0];

    assert_eq!(index.fields.len(), 1);
    assert_eq!(index.fields[0].name, "embedding");
    // `assert_eq!` (rather than `assert!(matches!(..))`) prints the actual
    // field type on failure and matches the dense-vector tests in this module.
    assert_eq!(index.fields[0].field_type, FieldType::SparseVector);

    // Index-side settings taken from `sparse_vector<u16>` and `quantization: uint8`.
    let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
    assert_eq!(config.index_size, IndexSize::U16);
    assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

    // Query-side settings taken from the nested `query<…>` section.
    let query_config = config.query_config.as_ref().unwrap();
    assert_eq!(
        query_config.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(query_config.weighting, QueryWeighting::Idf);

    // The same query config must survive conversion into a `Schema`.
    let schema = index.to_schema();
    let embedding_field = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(embedding_field).unwrap();
    let sv_config = entry.sparse_vector_config.as_ref().unwrap();
    let qc = sv_config.query_config.as_ref().unwrap();
    assert_eq!(
        qc.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(qc.weighting, QueryWeighting::Idf);
}
1565
#[test]
/// A `query<weighting: one>` section without a tokenizer is expected to yield
/// `QueryWeighting::One` and no tokenizer.
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(sdl).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse = embedding.sparse_vector_config.as_ref().unwrap();

    let query = sparse.query_config.as_ref().unwrap();
    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1583}