1use pest::Parser;
49use pest_derive::Parser;
50
51use super::query_field_router::{QueryRouterRule, RoutingMode};
52use super::schema::{FieldType, Schema, SchemaBuilder};
53use crate::Result;
54use crate::error::Error;
55
/// Pest-backed parser for the schema definition language (SDL).
///
/// The grammar lives in `dsl/sdl/sdl.pest`. The `Parser` derive generates the
/// `Rule` enum that every parsing helper in this file matches on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
59
60use super::schema::DenseVectorConfig;
61use crate::structures::{
62 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
63};
64
/// A single `field` declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Field type resolved from the SDL type keyword (or one of its aliases).
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is given.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is given.
    pub stored: bool,
    /// Tokenizer name from a tokenizer spec, if any. `to_schema` falls back to
    /// `"default"` for text fields when this is `None`.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued (set by the configured `stored<...>`
    /// attribute form).
    pub multi: bool,
    /// Position mode taken from the `indexed<...>` configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse-vector settings, present when a sparse vector spec or an index
    /// configuration was given for a sparse vector field.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense-vector settings, present when a dense vector spec was given.
    pub dense_vector_config: Option<DenseVectorConfig>,
}
83
/// A complete `index` block parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields declaration (empty if absent).
    pub default_fields: Vec<String>,
    /// Rules collected from every `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
93
94impl IndexDef {
95 pub fn to_schema(&self) -> Schema {
97 let mut builder = SchemaBuilder::default();
98
99 for field in &self.fields {
100 let f = match field.field_type {
101 FieldType::Text => {
102 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
103 builder.add_text_field_with_tokenizer(
104 &field.name,
105 field.indexed,
106 field.stored,
107 tokenizer,
108 )
109 }
110 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
111 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
112 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
113 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
114 FieldType::Json => builder.add_json_field(&field.name, field.stored),
115 FieldType::SparseVector => {
116 if let Some(config) = &field.sparse_vector_config {
117 builder.add_sparse_vector_field_with_config(
118 &field.name,
119 field.indexed,
120 field.stored,
121 config.clone(),
122 )
123 } else {
124 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
125 }
126 }
127 FieldType::DenseVector => {
128 let config = field
130 .dense_vector_config
131 .as_ref()
132 .expect("DenseVector field requires dimension to be specified");
133 builder.add_dense_vector_field_with_config(
134 &field.name,
135 field.indexed,
136 field.stored,
137 config.clone(),
138 )
139 }
140 };
141 if field.multi {
142 builder.set_multi(f, true);
143 }
144 let positions = field.positions.or({
146 if field.multi
148 && matches!(
149 field.field_type,
150 FieldType::SparseVector | FieldType::DenseVector
151 )
152 {
153 Some(super::schema::PositionMode::Ordinal)
154 } else {
155 None
156 }
157 });
158 if let Some(mode) = positions {
159 builder.set_positions(f, mode);
160 }
161 }
162
163 if !self.default_fields.is_empty() {
165 builder.set_default_fields(self.default_fields.clone());
166 }
167
168 if !self.query_routers.is_empty() {
170 builder.set_query_routers(self.query_routers.clone());
171 }
172
173 builder.build()
174 }
175
176 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
181 if self.query_routers.is_empty() {
182 return Ok(None);
183 }
184
185 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
186 .map(Some)
187 .map_err(Error::Schema)
188 }
189}
190
191fn parse_field_type(type_str: &str) -> Result<FieldType> {
193 match type_str {
194 "text" | "string" | "str" => Ok(FieldType::Text),
195 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
196 "i64" | "int" | "integer" => Ok(FieldType::I64),
197 "f64" | "float" | "double" => Ok(FieldType::F64),
198 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
199 "json" => Ok(FieldType::Json),
200 "sparse_vector" => Ok(FieldType::SparseVector),
201 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
202 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
203 }
204}
205
/// Options collected from an `indexed<...>` attribute.
///
/// Every option is `None` unless the SDL set it explicitly. The values are
/// later copied into a dense or sparse vector configuration by
/// `apply_index_config_to_dense_vector` / `apply_index_config_to_sparse_vector`.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // Options copied into `DenseVectorConfig`.
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    mrl_dim: Option<usize>,
    build_threshold: Option<usize>,
    // Options copied into `SparseVectorConfig`.
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    block_size: Option<usize>,
    posting_list_pruning: Option<f32>,
    // Options copied into the sparse vector's query configuration.
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    // Position mode; copied onto `FieldDef::positions` by `parse_field_def`.
    positions: Option<super::schema::PositionMode>,
}
225
226fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool, Option<IndexConfig>) {
231 let mut indexed = false;
232 let mut stored = false;
233 let mut multi = false;
234 let mut index_config = None;
235
236 for attr in pair.into_inner() {
237 if attr.as_rule() == Rule::attribute {
238 let mut found_config = false;
240 for inner in attr.clone().into_inner() {
241 match inner.as_rule() {
242 Rule::indexed_with_config => {
243 indexed = true;
244 index_config = Some(parse_index_config(inner));
245 found_config = true;
246 break;
247 }
248 Rule::stored_with_config => {
249 stored = true;
250 multi = true; found_config = true;
252 break;
253 }
254 _ => {}
255 }
256 }
257 if !found_config {
258 match attr.as_str() {
260 "indexed" => indexed = true,
261 "stored" => stored = true,
262 _ => {}
263 }
264 }
265 }
266 }
267
268 (indexed, stored, multi, index_config)
269}
270
271fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
273 let mut config = IndexConfig::default();
274
275 for inner in pair.into_inner() {
280 if inner.as_rule() == Rule::index_config_params {
281 for param in inner.into_inner() {
282 if param.as_rule() == Rule::index_config_param {
283 for p in param.into_inner() {
284 parse_single_index_config_param(&mut config, p);
285 }
286 }
287 }
288 }
289 }
290
291 config
292}
293
294fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
296 use super::schema::VectorIndexType;
297
298 match p.as_rule() {
299 Rule::index_type_spec => {
300 config.index_type = Some(match p.as_str() {
301 "flat" => VectorIndexType::Flat,
302 "rabitq" => VectorIndexType::RaBitQ,
303 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
304 "scann" => VectorIndexType::ScaNN,
305 _ => VectorIndexType::RaBitQ,
306 });
307 }
308 Rule::index_type_kwarg => {
309 if let Some(t) = p.into_inner().next() {
311 config.index_type = Some(match t.as_str() {
312 "flat" => VectorIndexType::Flat,
313 "rabitq" => VectorIndexType::RaBitQ,
314 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
315 "scann" => VectorIndexType::ScaNN,
316 _ => VectorIndexType::RaBitQ,
317 });
318 }
319 }
320 Rule::num_clusters_kwarg => {
321 if let Some(n) = p.into_inner().next() {
323 config.num_clusters = Some(n.as_str().parse().unwrap_or(256));
324 }
325 }
326 Rule::build_threshold_kwarg => {
327 if let Some(n) = p.into_inner().next() {
329 config.build_threshold = Some(n.as_str().parse().unwrap_or(10000));
330 }
331 }
332 Rule::nprobe_kwarg => {
333 if let Some(n) = p.into_inner().next() {
335 config.nprobe = Some(n.as_str().parse().unwrap_or(32));
336 }
337 }
338 Rule::mrl_dim_kwarg => {
339 if let Some(n) = p.into_inner().next() {
341 config.mrl_dim = Some(n.as_str().parse().unwrap_or(0));
342 }
343 }
344 Rule::quantization_kwarg => {
345 if let Some(q) = p.into_inner().next() {
347 config.quantization = Some(match q.as_str() {
348 "float32" | "f32" => WeightQuantization::Float32,
349 "float16" | "f16" => WeightQuantization::Float16,
350 "uint8" | "u8" => WeightQuantization::UInt8,
351 "uint4" | "u4" => WeightQuantization::UInt4,
352 _ => WeightQuantization::default(),
353 });
354 }
355 }
356 Rule::weight_threshold_kwarg => {
357 if let Some(t) = p.into_inner().next() {
359 config.weight_threshold = Some(t.as_str().parse().unwrap_or(0.0));
360 }
361 }
362 Rule::block_size_kwarg => {
363 if let Some(n) = p.into_inner().next() {
365 config.block_size = Some(n.as_str().parse().unwrap_or(128));
366 }
367 }
368 Rule::pruning_kwarg => {
369 if let Some(f) = p.into_inner().next() {
371 config.posting_list_pruning = Some(f.as_str().parse().unwrap_or(1.0));
372 }
373 }
374 Rule::query_config_block => {
375 parse_query_config_block(config, p);
377 }
378 Rule::positions_kwarg => {
379 use super::schema::PositionMode;
381 config.positions = Some(match p.as_str() {
382 "ordinal" => PositionMode::Ordinal,
383 "token_position" => PositionMode::TokenPosition,
384 _ => PositionMode::Full, });
386 }
387 _ => {}
388 }
389}
390
391fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
393 for inner in pair.into_inner() {
394 if inner.as_rule() == Rule::query_config_params {
395 for param in inner.into_inner() {
396 if param.as_rule() == Rule::query_config_param {
397 for p in param.into_inner() {
398 match p.as_rule() {
399 Rule::query_tokenizer_kwarg => {
400 if let Some(path) = p.into_inner().next()
402 && let Some(inner_path) = path.into_inner().next()
403 {
404 config.query_tokenizer = Some(inner_path.as_str().to_string());
405 }
406 }
407 Rule::query_weighting_kwarg => {
408 if let Some(w) = p.into_inner().next() {
410 config.query_weighting = Some(match w.as_str() {
411 "one" => QueryWeighting::One,
412 "idf" => QueryWeighting::Idf,
413 "idf_file" => QueryWeighting::IdfFile,
414 _ => QueryWeighting::One,
415 });
416 }
417 }
418 _ => {}
419 }
420 }
421 }
422 }
423 }
424 }
425}
426
427fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
429 let mut inner = pair.into_inner();
430
431 let name = inner
432 .next()
433 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
434 .as_str()
435 .to_string();
436
437 let field_type_str = inner
438 .next()
439 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
440 .as_str();
441
442 let field_type = parse_field_type(field_type_str)?;
443
444 let mut tokenizer = None;
446 let mut sparse_vector_config = None;
447 let mut dense_vector_config = None;
448 let mut indexed = true;
449 let mut stored = true;
450 let mut multi = false;
451 let mut index_config: Option<IndexConfig> = None;
452
453 for item in inner {
454 match item.as_rule() {
455 Rule::tokenizer_spec => {
456 if let Some(tok_name) = item.into_inner().next() {
458 tokenizer = Some(tok_name.as_str().to_string());
459 }
460 }
461 Rule::sparse_vector_config => {
462 sparse_vector_config = Some(parse_sparse_vector_config(item));
464 }
465 Rule::dense_vector_config => {
466 dense_vector_config = Some(parse_dense_vector_config(item));
468 }
469 Rule::attributes => {
470 let (idx, sto, mul, idx_cfg) = parse_attributes(item);
471 indexed = idx;
472 stored = sto;
473 multi = mul;
474 index_config = idx_cfg;
475 }
476 _ => {}
477 }
478 }
479
480 let mut positions = None;
482 if let Some(idx_cfg) = index_config {
483 positions = idx_cfg.positions;
484 if let Some(ref mut dv_config) = dense_vector_config {
485 apply_index_config_to_dense_vector(dv_config, idx_cfg);
486 } else if field_type == FieldType::SparseVector {
487 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
489 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
490 }
491 }
492
493 Ok(FieldDef {
494 name,
495 field_type,
496 indexed,
497 stored,
498 tokenizer,
499 multi,
500 positions,
501 sparse_vector_config,
502 dense_vector_config,
503 })
504}
505
506fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
508 if let Some(index_type) = idx_cfg.index_type {
510 config.index_type = index_type;
511 }
512
513 if idx_cfg.num_clusters.is_some() {
515 config.num_clusters = idx_cfg.num_clusters;
516 }
517
518 if let Some(nprobe) = idx_cfg.nprobe {
520 config.nprobe = nprobe;
521 }
522
523 if idx_cfg.mrl_dim.is_some() {
525 config.mrl_dim = idx_cfg.mrl_dim;
526 }
527
528 if idx_cfg.build_threshold.is_some() {
530 config.build_threshold = idx_cfg.build_threshold;
531 }
532}
533
534fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
537 let mut index_size = IndexSize::default();
538
539 for inner in pair.into_inner() {
541 if inner.as_rule() == Rule::index_size_spec {
542 index_size = match inner.as_str() {
543 "u16" => IndexSize::U16,
544 "u32" => IndexSize::U32,
545 _ => IndexSize::default(),
546 };
547 }
548 }
549
550 SparseVectorConfig {
551 index_size,
552 weight_quantization: WeightQuantization::default(),
553 weight_threshold: 0.0,
554 block_size: 128,
555 posting_list_pruning: None,
556 query_config: None,
557 }
558}
559
560fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
562 if let Some(q) = idx_cfg.quantization {
563 config.weight_quantization = q;
564 }
565 if let Some(t) = idx_cfg.weight_threshold {
566 config.weight_threshold = t;
567 }
568 if let Some(bs) = idx_cfg.block_size {
569 let adjusted = bs.next_power_of_two();
570 if adjusted != bs {
571 log::warn!(
572 "block_size {} adjusted to next power of two: {}",
573 bs,
574 adjusted
575 );
576 }
577 config.block_size = adjusted;
578 }
579 if let Some(p) = idx_cfg.posting_list_pruning {
580 let clamped = p.clamp(0.0, 1.0);
581 if (clamped - p).abs() > f32::EPSILON {
582 log::warn!(
583 "pruning {} clamped to valid range [0.0, 1.0]: {}",
584 p,
585 clamped
586 );
587 }
588 config.posting_list_pruning = Some(clamped);
589 }
590 if idx_cfg.query_tokenizer.is_some() || idx_cfg.query_weighting.is_some() {
592 let query_config = config
593 .query_config
594 .get_or_insert(SparseQueryConfig::default());
595 if let Some(tokenizer) = idx_cfg.query_tokenizer {
596 query_config.tokenizer = Some(tokenizer);
597 }
598 if let Some(weighting) = idx_cfg.query_weighting {
599 query_config.weighting = weighting;
600 }
601 }
602}
603
604fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
607 let mut dim: usize = 0;
608
609 for params in pair.into_inner() {
611 if params.as_rule() == Rule::dense_vector_params {
612 for inner in params.into_inner() {
613 match inner.as_rule() {
614 Rule::dense_vector_keyword_params => {
615 for kwarg in inner.into_inner() {
617 if kwarg.as_rule() == Rule::dims_kwarg
618 && let Some(d) = kwarg.into_inner().next()
619 {
620 dim = d.as_str().parse().unwrap_or(0);
621 }
622 }
623 }
624 Rule::dense_vector_positional_params => {
625 if let Some(dim_pair) = inner.into_inner().next() {
627 dim = dim_pair.as_str().parse().unwrap_or(0);
628 }
629 }
630 _ => {}
631 }
632 }
633 }
634 }
635
636 DenseVectorConfig::new(dim)
637}
638
639fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
641 pair.into_inner().map(|p| p.as_str().to_string()).collect()
642}
643
644fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
646 let mut pattern = String::new();
647 let mut substitution = String::new();
648 let mut target_field = String::new();
649 let mut mode = RoutingMode::Additional;
650
651 for prop in pair.into_inner() {
652 if prop.as_rule() != Rule::query_router_prop {
653 continue;
654 }
655
656 for inner in prop.into_inner() {
657 match inner.as_rule() {
658 Rule::query_router_pattern => {
659 if let Some(regex_str) = inner.into_inner().next() {
660 pattern = parse_string_value(regex_str);
661 }
662 }
663 Rule::query_router_substitution => {
664 if let Some(quoted) = inner.into_inner().next() {
665 substitution = parse_string_value(quoted);
666 }
667 }
668 Rule::query_router_target => {
669 if let Some(ident) = inner.into_inner().next() {
670 target_field = ident.as_str().to_string();
671 }
672 }
673 Rule::query_router_mode => {
674 if let Some(mode_val) = inner.into_inner().next() {
675 mode = match mode_val.as_str() {
676 "exclusive" => RoutingMode::Exclusive,
677 "additional" => RoutingMode::Additional,
678 _ => RoutingMode::Additional,
679 };
680 }
681 }
682 _ => {}
683 }
684 }
685 }
686
687 if pattern.is_empty() {
688 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
689 }
690 if substitution.is_empty() {
691 return Err(Error::Schema(
692 "query_router missing 'substitution'".to_string(),
693 ));
694 }
695 if target_field.is_empty() {
696 return Err(Error::Schema(
697 "query_router missing 'target_field'".to_string(),
698 ));
699 }
700
701 Ok(QueryRouterRule {
702 pattern,
703 substitution,
704 target_field,
705 mode,
706 })
707}
708
709fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
711 let s = pair.as_str();
712 match pair.as_rule() {
713 Rule::regex_string => {
714 if let Some(inner) = pair.into_inner().next() {
716 parse_string_value(inner)
717 } else {
718 s.to_string()
719 }
720 }
721 Rule::raw_string => {
722 s[2..s.len() - 1].to_string()
724 }
725 Rule::quoted_string => {
726 let inner = &s[1..s.len() - 1];
728 inner
730 .replace("\\n", "\n")
731 .replace("\\t", "\t")
732 .replace("\\\"", "\"")
733 .replace("\\\\", "\\")
734 }
735 _ => s.to_string(),
736 }
737}
738
739fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
741 let mut inner = pair.into_inner();
742
743 let name = inner
744 .next()
745 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
746 .as_str()
747 .to_string();
748
749 let mut fields = Vec::new();
750 let mut default_fields = Vec::new();
751 let mut query_routers = Vec::new();
752
753 for item in inner {
754 match item.as_rule() {
755 Rule::field_def => {
756 fields.push(parse_field_def(item)?);
757 }
758 Rule::default_fields_def => {
759 default_fields = parse_default_fields_def(item);
760 }
761 Rule::query_router_def => {
762 query_routers.push(parse_query_router_def(item)?);
763 }
764 _ => {}
765 }
766 }
767
768 Ok(IndexDef {
769 name,
770 fields,
771 default_fields,
772 query_routers,
773 })
774}
775
776pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
778 let pairs = SdlParser::parse(Rule::file, input)
779 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
780
781 let mut indexes = Vec::new();
782
783 for pair in pairs {
784 if pair.as_rule() == Rule::file {
785 for inner in pair.into_inner() {
786 if inner.as_rule() == Rule::index_def {
787 indexes.push(parse_index_def(inner)?);
788 }
789 }
790 }
791 }
792
793 Ok(indexes)
794}
795
796pub fn parse_single_index(input: &str) -> Result<IndexDef> {
798 let indexes = parse_sdl(input)?;
799
800 if indexes.is_empty() {
801 return Err(Error::Schema("No index definition found".to_string()));
802 }
803
804 if indexes.len() > 1 {
805 return Err(Error::Schema(
806 "Multiple index definitions found, expected one".to_string(),
807 ));
808 }
809
810 Ok(indexes.into_iter().next().unwrap())
811}
812
813#[cfg(test)]
814mod tests {
815 use super::*;
816
817 #[test]
818 fn test_parse_simple_schema() {
819 let sdl = r#"
820 index articles {
821 field title: text [indexed, stored]
822 field body: text [indexed]
823 }
824 "#;
825
826 let indexes = parse_sdl(sdl).unwrap();
827 assert_eq!(indexes.len(), 1);
828
829 let index = &indexes[0];
830 assert_eq!(index.name, "articles");
831 assert_eq!(index.fields.len(), 2);
832
833 assert_eq!(index.fields[0].name, "title");
834 assert!(matches!(index.fields[0].field_type, FieldType::Text));
835 assert!(index.fields[0].indexed);
836 assert!(index.fields[0].stored);
837
838 assert_eq!(index.fields[1].name, "body");
839 assert!(matches!(index.fields[1].field_type, FieldType::Text));
840 assert!(index.fields[1].indexed);
841 assert!(!index.fields[1].stored);
842 }
843
844 #[test]
845 fn test_parse_all_field_types() {
846 let sdl = r#"
847 index test {
848 field text_field: text [indexed, stored]
849 field u64_field: u64 [indexed, stored]
850 field i64_field: i64 [indexed, stored]
851 field f64_field: f64 [indexed, stored]
852 field bytes_field: bytes [stored]
853 }
854 "#;
855
856 let indexes = parse_sdl(sdl).unwrap();
857 let index = &indexes[0];
858
859 assert!(matches!(index.fields[0].field_type, FieldType::Text));
860 assert!(matches!(index.fields[1].field_type, FieldType::U64));
861 assert!(matches!(index.fields[2].field_type, FieldType::I64));
862 assert!(matches!(index.fields[3].field_type, FieldType::F64));
863 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
864 }
865
866 #[test]
867 fn test_parse_with_comments() {
868 let sdl = r#"
869 # This is a comment
870 index articles {
871 # Title field
872 field title: text [indexed, stored]
873 field body: text [indexed] # inline comment not supported yet
874 }
875 "#;
876
877 let indexes = parse_sdl(sdl).unwrap();
878 assert_eq!(indexes[0].fields.len(), 2);
879 }
880
881 #[test]
882 fn test_parse_type_aliases() {
883 let sdl = r#"
884 index test {
885 field a: string [indexed]
886 field b: int [indexed]
887 field c: uint [indexed]
888 field d: float [indexed]
889 field e: binary [stored]
890 }
891 "#;
892
893 let indexes = parse_sdl(sdl).unwrap();
894 let index = &indexes[0];
895
896 assert!(matches!(index.fields[0].field_type, FieldType::Text));
897 assert!(matches!(index.fields[1].field_type, FieldType::I64));
898 assert!(matches!(index.fields[2].field_type, FieldType::U64));
899 assert!(matches!(index.fields[3].field_type, FieldType::F64));
900 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
901 }
902
903 #[test]
904 fn test_to_schema() {
905 let sdl = r#"
906 index articles {
907 field title: text [indexed, stored]
908 field views: u64 [indexed, stored]
909 }
910 "#;
911
912 let indexes = parse_sdl(sdl).unwrap();
913 let schema = indexes[0].to_schema();
914
915 assert!(schema.get_field("title").is_some());
916 assert!(schema.get_field("views").is_some());
917 assert!(schema.get_field("nonexistent").is_none());
918 }
919
920 #[test]
921 fn test_default_attributes() {
922 let sdl = r#"
923 index test {
924 field title: text
925 }
926 "#;
927
928 let indexes = parse_sdl(sdl).unwrap();
929 let field = &indexes[0].fields[0];
930
931 assert!(field.indexed);
933 assert!(field.stored);
934 }
935
936 #[test]
937 fn test_multiple_indexes() {
938 let sdl = r#"
939 index articles {
940 field title: text [indexed, stored]
941 }
942
943 index users {
944 field name: text [indexed, stored]
945 field email: text [indexed, stored]
946 }
947 "#;
948
949 let indexes = parse_sdl(sdl).unwrap();
950 assert_eq!(indexes.len(), 2);
951 assert_eq!(indexes[0].name, "articles");
952 assert_eq!(indexes[1].name, "users");
953 }
954
955 #[test]
956 fn test_tokenizer_spec() {
957 let sdl = r#"
958 index articles {
959 field title: text<en_stem> [indexed, stored]
960 field body: text<default> [indexed]
961 field author: text [indexed, stored]
962 }
963 "#;
964
965 let indexes = parse_sdl(sdl).unwrap();
966 let index = &indexes[0];
967
968 assert_eq!(index.fields[0].name, "title");
969 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
970
971 assert_eq!(index.fields[1].name, "body");
972 assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));
973
974 assert_eq!(index.fields[2].name, "author");
975 assert_eq!(index.fields[2].tokenizer, None); }
977
978 #[test]
979 fn test_tokenizer_in_schema() {
980 let sdl = r#"
981 index articles {
982 field title: text<german> [indexed, stored]
983 field body: text<en_stem> [indexed]
984 }
985 "#;
986
987 let indexes = parse_sdl(sdl).unwrap();
988 let schema = indexes[0].to_schema();
989
990 let title_field = schema.get_field("title").unwrap();
991 let title_entry = schema.get_field_entry(title_field).unwrap();
992 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
993
994 let body_field = schema.get_field("body").unwrap();
995 let body_entry = schema.get_field_entry(body_field).unwrap();
996 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
997 }
998
999 #[test]
1000 fn test_query_router_basic() {
1001 let sdl = r#"
1002 index documents {
1003 field title: text [indexed, stored]
1004 field uri: text [indexed, stored]
1005
1006 query_router {
1007 pattern: "10\\.\\d{4,}/[^\\s]+"
1008 substitution: "doi://{0}"
1009 target_field: uris
1010 mode: exclusive
1011 }
1012 }
1013 "#;
1014
1015 let indexes = parse_sdl(sdl).unwrap();
1016 let index = &indexes[0];
1017
1018 assert_eq!(index.query_routers.len(), 1);
1019 let router = &index.query_routers[0];
1020 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1021 assert_eq!(router.substitution, "doi://{0}");
1022 assert_eq!(router.target_field, "uris");
1023 assert_eq!(router.mode, RoutingMode::Exclusive);
1024 }
1025
1026 #[test]
1027 fn test_query_router_raw_string() {
1028 let sdl = r#"
1029 index documents {
1030 field uris: text [indexed, stored]
1031
1032 query_router {
1033 pattern: r"^pmid:(\d+)$"
1034 substitution: "pubmed://{1}"
1035 target_field: uris
1036 mode: additional
1037 }
1038 }
1039 "#;
1040
1041 let indexes = parse_sdl(sdl).unwrap();
1042 let router = &indexes[0].query_routers[0];
1043
1044 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1045 assert_eq!(router.substitution, "pubmed://{1}");
1046 assert_eq!(router.mode, RoutingMode::Additional);
1047 }
1048
1049 #[test]
1050 fn test_multiple_query_routers() {
1051 let sdl = r#"
1052 index documents {
1053 field uris: text [indexed, stored]
1054
1055 query_router {
1056 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1057 substitution: "doi://{1}"
1058 target_field: uris
1059 mode: exclusive
1060 }
1061
1062 query_router {
1063 pattern: r"^pmid:(\d+)$"
1064 substitution: "pubmed://{1}"
1065 target_field: uris
1066 mode: exclusive
1067 }
1068
1069 query_router {
1070 pattern: r"^arxiv:(\d+\.\d+)$"
1071 substitution: "arxiv://{1}"
1072 target_field: uris
1073 mode: additional
1074 }
1075 }
1076 "#;
1077
1078 let indexes = parse_sdl(sdl).unwrap();
1079 assert_eq!(indexes[0].query_routers.len(), 3);
1080 }
1081
1082 #[test]
1083 fn test_query_router_default_mode() {
1084 let sdl = r#"
1085 index documents {
1086 field uris: text [indexed, stored]
1087
1088 query_router {
1089 pattern: r"test"
1090 substitution: "{0}"
1091 target_field: uris
1092 }
1093 }
1094 "#;
1095
1096 let indexes = parse_sdl(sdl).unwrap();
1097 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1099 }
1100
1101 #[test]
1102 fn test_multi_attribute() {
1103 let sdl = r#"
1104 index documents {
1105 field uris: text [indexed, stored<multi>]
1106 field title: text [indexed, stored]
1107 }
1108 "#;
1109
1110 let indexes = parse_sdl(sdl).unwrap();
1111 assert_eq!(indexes.len(), 1);
1112
1113 let fields = &indexes[0].fields;
1114 assert_eq!(fields.len(), 2);
1115
1116 assert_eq!(fields[0].name, "uris");
1118 assert!(fields[0].multi, "uris field should have multi=true");
1119
1120 assert_eq!(fields[1].name, "title");
1122 assert!(!fields[1].multi, "title field should have multi=false");
1123
1124 let schema = indexes[0].to_schema();
1126 let uris_field = schema.get_field("uris").unwrap();
1127 let title_field = schema.get_field("title").unwrap();
1128
1129 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1130 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1131 }
1132
1133 #[test]
1134 fn test_sparse_vector_field() {
1135 let sdl = r#"
1136 index documents {
1137 field embedding: sparse_vector [indexed, stored]
1138 }
1139 "#;
1140
1141 let indexes = parse_sdl(sdl).unwrap();
1142 assert_eq!(indexes.len(), 1);
1143 assert_eq!(indexes[0].fields.len(), 1);
1144 assert_eq!(indexes[0].fields[0].name, "embedding");
1145 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1146 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1147 }
1148
1149 #[test]
1150 fn test_sparse_vector_with_config() {
1151 let sdl = r#"
1152 index documents {
1153 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1154 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1155 }
1156 "#;
1157
1158 let indexes = parse_sdl(sdl).unwrap();
1159 assert_eq!(indexes[0].fields.len(), 2);
1160
1161 let f1 = &indexes[0].fields[0];
1163 assert_eq!(f1.name, "embedding");
1164 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1165 assert_eq!(config1.index_size, IndexSize::U16);
1166 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1167
1168 let f2 = &indexes[0].fields[1];
1170 assert_eq!(f2.name, "dense");
1171 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1172 assert_eq!(config2.index_size, IndexSize::U32);
1173 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1174 }
1175
1176 #[test]
1177 fn test_sparse_vector_with_weight_threshold() {
1178 let sdl = r#"
1179 index documents {
1180 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1181 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1182 }
1183 "#;
1184
1185 let indexes = parse_sdl(sdl).unwrap();
1186 assert_eq!(indexes[0].fields.len(), 2);
1187
1188 let f1 = &indexes[0].fields[0];
1190 assert_eq!(f1.name, "embedding");
1191 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1192 assert_eq!(config1.index_size, IndexSize::U16);
1193 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1194 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1195
1196 let f2 = &indexes[0].fields[1];
1198 assert_eq!(f2.name, "embedding2");
1199 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1200 assert_eq!(config2.index_size, IndexSize::U32);
1201 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1202 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1203 }
1204
1205 #[test]
1206 fn test_sparse_vector_with_pruning() {
1207 let sdl = r#"
1208 index documents {
1209 field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1210 }
1211 "#;
1212
1213 let indexes = parse_sdl(sdl).unwrap();
1214 let f = &indexes[0].fields[0];
1215 assert_eq!(f.name, "embedding");
1216 let config = f.sparse_vector_config.as_ref().unwrap();
1217 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1218 assert_eq!(config.posting_list_pruning, Some(0.1));
1219 }
1220
1221 #[test]
1222 fn test_dense_vector_field() {
1223 let sdl = r#"
1224 index documents {
1225 field embedding: dense_vector<768> [indexed, stored]
1226 }
1227 "#;
1228
1229 let indexes = parse_sdl(sdl).unwrap();
1230 assert_eq!(indexes.len(), 1);
1231 assert_eq!(indexes[0].fields.len(), 1);
1232
1233 let f = &indexes[0].fields[0];
1234 assert_eq!(f.name, "embedding");
1235 assert_eq!(f.field_type, FieldType::DenseVector);
1236
1237 let config = f.dense_vector_config.as_ref().unwrap();
1238 assert_eq!(config.dim, 768);
1239 }
1240
1241 #[test]
1242 fn test_dense_vector_alias() {
1243 let sdl = r#"
1244 index documents {
1245 field embedding: vector<1536> [indexed]
1246 }
1247 "#;
1248
1249 let indexes = parse_sdl(sdl).unwrap();
1250 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1251 assert_eq!(
1252 indexes[0].fields[0]
1253 .dense_vector_config
1254 .as_ref()
1255 .unwrap()
1256 .dim,
1257 1536
1258 );
1259 }
1260
1261 #[test]
1262 fn test_dense_vector_with_num_clusters() {
1263 let sdl = r#"
1264 index documents {
1265 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1266 }
1267 "#;
1268
1269 let indexes = parse_sdl(sdl).unwrap();
1270 assert_eq!(indexes.len(), 1);
1271
1272 let f = &indexes[0].fields[0];
1273 assert_eq!(f.name, "embedding");
1274 assert_eq!(f.field_type, FieldType::DenseVector);
1275
1276 let config = f.dense_vector_config.as_ref().unwrap();
1277 assert_eq!(config.dim, 768);
1278 assert_eq!(config.num_clusters, Some(256));
1279 assert_eq!(config.nprobe, 32); }
1281
1282 #[test]
1283 fn test_dense_vector_with_num_clusters_and_nprobe() {
1284 let sdl = r#"
1285 index documents {
1286 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1287 }
1288 "#;
1289
1290 let indexes = parse_sdl(sdl).unwrap();
1291 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1292
1293 assert_eq!(config.dim, 1536);
1294 assert_eq!(config.num_clusters, Some(512));
1295 assert_eq!(config.nprobe, 64);
1296 }
1297
/// Keyword form `dense_vector<dims: N>` with a bare `indexed` attribute:
/// the dimension comes from `dims` and no cluster count is recorded.
#[test]
fn test_dense_vector_keyword_syntax() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert!(dense.num_clusters.is_none());
}
1312
/// Keyword form `dense_vector<dims: N>` combined with a fully specified
/// `indexed<ivf_rabitq, num_clusters, nprobe>` attribute.
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1328
/// Keyword form `dense_vector<dims: N>` where only `num_clusters` is given:
/// `nprobe` is expected to be 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.num_clusters, Some(128));
    // `nprobe` is not given in the SDL above.
    assert_eq!(dense.nprobe, 32);
}
1344
/// `indexed<scann, ...>` must select `VectorIndexType::ScaNN` and keep the
/// accompanying `num_clusters` / `nprobe` values.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1363
/// `indexed<ivf_rabitq, num_clusters: N>` must select
/// `VectorIndexType::IvfRaBitQ` and record the cluster count.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense.num_clusters, Some(512));
}
1381
/// `indexed<rabitq>` with no further options must select
/// `VectorIndexType::RaBitQ` and leave `num_clusters` unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
    assert!(dense.num_clusters.is_none());
}
1399
/// `indexed<flat>` must select `VectorIndexType::Flat`.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::Flat);
}
1416
/// A bare `indexed` attribute (no index type named) is expected to yield
/// `VectorIndexType::RaBitQ`.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.index_type, VectorIndexType::RaBitQ);
}
1434
/// `indexed<mrl_dim: M>` must be stored as `mrl_dim`, and `index_dim()` is
/// expected to report `M` rather than the declared vector dimension.
#[test]
fn test_dense_vector_mrl_dim() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<mrl_dim: 256>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 1536);
    assert_eq!(dense.mrl_dim, Some(256));
    assert_eq!(dense.index_dim(), 256);
}
1451
/// `mrl_dim` combined with an index type, `num_clusters` and `nprobe` in the
/// same `indexed<...>` attribute: every option must be captured.
#[test]
fn test_dense_vector_mrl_dim_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64, mrl_dim: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, Some(128));
    assert_eq!(dense.index_dim(), 128);
    assert_eq!(dense.num_clusters, Some(256));
    assert_eq!(dense.nprobe, 64);
}
1470
/// Without an `mrl_dim` option, `mrl_dim` must be `None` and `index_dim()`
/// is expected to equal the declared vector dimension.
#[test]
fn test_dense_vector_no_mrl_dim() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let dense = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense.dim, 768);
    assert_eq!(dense.mrl_dim, None);
    assert_eq!(dense.index_dim(), 768);
}
1487
/// Parses an index that mixes a text field with two `json` fields (one with
/// `[stored]`, one with no attributes) and checks both the parsed field
/// definitions and the schema entry built by `to_schema()`.
#[test]
fn test_json_field_type() {
    let sdl = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let index = &indexes[0];

    assert_eq!(index.fields.len(), 3);

    // `assert_eq!` (not `assert!(matches!(..))`) so a failure prints the
    // field type that was actually parsed.
    let metadata = &index.fields[1];
    assert_eq!(metadata.name, "metadata");
    assert_eq!(metadata.field_type, FieldType::Json);
    assert!(metadata.stored);

    let extra = &index.fields[2];
    assert_eq!(extra.name, "extra");
    assert_eq!(extra.field_type, FieldType::Json);

    // The schema entry for the json field is expected to be stored but not indexed.
    let schema = index.to_schema();
    let metadata_field = schema.get_field("metadata").unwrap();
    let entry = schema.get_field_entry(metadata_field).unwrap();
    assert_eq!(entry.field_type, FieldType::Json);
    assert!(!entry.indexed);
    assert!(entry.stored);
}
1521
/// Parses a sparse vector field whose `indexed<...>` attribute carries a
/// quantization setting and a nested `query<tokenizer, weighting>` block,
/// then checks that the query config is present both on the parsed field and
/// on the schema entry produced by `to_schema()`.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let sdl = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let index = &indexes[0];

    assert_eq!(index.fields.len(), 1);
    let field = &index.fields[0];
    assert_eq!(field.name, "embedding");
    // `assert_eq!` (not `assert!(matches!(..))`) so a failure prints the
    // field type that was actually parsed.
    assert_eq!(field.field_type, FieldType::SparseVector);

    let config = field.sparse_vector_config.as_ref().unwrap();
    assert_eq!(config.index_size, IndexSize::U16);
    assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings as parsed from the nested `query<...>` block.
    let query_config = config.query_config.as_ref().unwrap();
    assert_eq!(
        query_config.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(query_config.weighting, QueryWeighting::Idf);

    // The same settings must be carried over into the built schema.
    let schema = index.to_schema();
    let embedding_field = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(embedding_field).unwrap();
    let sv_config = entry.sparse_vector_config.as_ref().unwrap();
    let qc = sv_config.query_config.as_ref().unwrap();
    assert_eq!(
        qc.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(qc.weighting, QueryWeighting::Idf);
}
1566
/// A `query<weighting: one>` block with no tokenizer must yield
/// `QueryWeighting::One` and leave the tokenizer unset.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let sparse = field.sparse_vector_config.as_ref().unwrap();
    let query = sparse.query_config.as_ref().unwrap();

    assert!(query.tokenizer.is_none());
    assert_eq!(query.weighting, QueryWeighting::One);
}
1584
/// `weighting: idf_file` together with a tokenizer name must parse to
/// `QueryWeighting::IdfFile`, and that weighting must still be present on the
/// schema entry produced by `to_schema()`.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];

    // Parsed field definition.
    let sparse = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let query = sparse.query_config.as_ref().unwrap();
    assert_eq!(
        query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(query.weighting, QueryWeighting::IdfFile);

    // Schema built from the parsed definition.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
1613}