1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Pest parser for SDL index definitions.
///
/// The implementation is generated by `pest_derive` from the grammar file
/// named in the `grammar` attribute. The generated `Rule` enum is what the
/// parsing functions in this module match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseQueryConfig, SparseVectorConfig, WeightQuantization,
60};
61
/// A single field declaration parsed from an index definition.
///
/// Produced by `parse_field_def` and consumed by [`IndexDef::to_schema`].
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the definition.
    pub name: String,
    /// Field type resolved from the type keyword (see `parse_field_type`).
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is given.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is given.
    pub stored: bool,
    /// Tokenizer name from the tokenizer spec, if any. `to_schema` falls back
    /// to `"simple"` for text fields when this is `None`.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued.
    pub multi: bool,
    /// Position mode taken from the index configuration, if one was given.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings, when present.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings. `to_schema` requires this for dense vector fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Whether the `fast` attribute applies. Forced to `true` for primary keys.
    pub fast: bool,
    /// Whether this field is the primary key of its index.
    pub primary: bool,
}
84
/// A parsed index definition: its name, fields and query-related settings.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the definition.
    pub name: String,
    /// Field declarations in the order they were parsed.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration; empty when absent.
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block.
    pub query_routers: Vec<QueryRouterRule>,
}
94
95impl IndexDef {
96 pub fn to_schema(&self) -> Schema {
98 let mut builder = SchemaBuilder::default();
99
100 for field in &self.fields {
101 let f = match field.field_type {
102 FieldType::Text => {
103 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
104 builder.add_text_field_with_tokenizer(
105 &field.name,
106 field.indexed,
107 field.stored,
108 tokenizer,
109 )
110 }
111 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
112 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
113 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
114 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
115 FieldType::Json => builder.add_json_field(&field.name, field.stored),
116 FieldType::SparseVector => {
117 if let Some(config) = &field.sparse_vector_config {
118 builder.add_sparse_vector_field_with_config(
119 &field.name,
120 field.indexed,
121 field.stored,
122 config.clone(),
123 )
124 } else {
125 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
126 }
127 }
128 FieldType::DenseVector => {
129 let config = field
131 .dense_vector_config
132 .as_ref()
133 .expect("DenseVector field requires dimension to be specified");
134 builder.add_dense_vector_field_with_config(
135 &field.name,
136 field.indexed,
137 field.stored,
138 config.clone(),
139 )
140 }
141 };
142 if field.multi {
143 builder.set_multi(f, true);
144 }
145 if field.fast {
146 builder.set_fast(f, true);
147 }
148 if field.primary {
149 builder.set_primary_key(f);
150 }
151 let positions = field.positions.or({
153 if field.multi
155 && matches!(
156 field.field_type,
157 FieldType::SparseVector | FieldType::DenseVector
158 )
159 {
160 Some(super::schema::PositionMode::Ordinal)
161 } else {
162 None
163 }
164 });
165 if let Some(mode) = positions {
166 builder.set_positions(f, mode);
167 }
168 }
169
170 if !self.default_fields.is_empty() {
172 builder.set_default_fields(self.default_fields.clone());
173 }
174
175 if !self.query_routers.is_empty() {
177 builder.set_query_routers(self.query_routers.clone());
178 }
179
180 builder.build()
181 }
182
183 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
188 if self.query_routers.is_empty() {
189 return Ok(None);
190 }
191
192 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
193 .map(Some)
194 .map_err(Error::Schema)
195 }
196}
197
198fn parse_field_type(type_str: &str) -> Result<FieldType> {
200 match type_str {
201 "text" | "string" | "str" => Ok(FieldType::Text),
202 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
203 "i64" | "int" | "integer" => Ok(FieldType::I64),
204 "f64" | "float" | "double" => Ok(FieldType::F64),
205 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
206 "json" => Ok(FieldType::Json),
207 "sparse_vector" => Ok(FieldType::SparseVector),
208 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
209 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
210 }
211}
212
/// Options collected from an `indexed<...>` attribute.
///
/// Every field is `None` unless the corresponding option appeared. The values
/// are later applied to a dense or sparse vector configuration by
/// `apply_index_config_to_dense_vector` / `apply_index_config_to_sparse_vector`.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type (applied to dense vector fields).
    index_type: Option<super::schema::VectorIndexType>,
    /// Number of clusters (applied to dense vector fields).
    num_clusters: Option<usize>,
    /// `nprobe` value (applied to dense vector fields).
    nprobe: Option<usize>,
    /// Build threshold (applied to dense vector fields).
    build_threshold: Option<usize>,
    /// Weight quantization (applied to sparse vector fields).
    quantization: Option<WeightQuantization>,
    /// Weight threshold (applied to sparse vector fields).
    weight_threshold: Option<f32>,
    /// Block size (applied to sparse vector fields).
    block_size: Option<usize>,
    /// Pruning value (applied to sparse vector fields).
    pruning: Option<f32>,
    /// Tokenizer from the query configuration block.
    query_tokenizer: Option<String>,
    /// Weighting from the query configuration block.
    query_weighting: Option<QueryWeighting>,
    /// Weight threshold from the query configuration block.
    query_weight_threshold: Option<f32>,
    /// Maximum number of query dimensions from the query configuration block.
    query_max_dims: Option<usize>,
    /// Pruning value from the query configuration block.
    query_pruning: Option<f32>,
    /// Position mode; copied into `FieldDef::positions` by `parse_field_def`.
    positions: Option<super::schema::PositionMode>,
}
234
235fn parse_attributes(
240 pair: pest::iterators::Pair<Rule>,
241) -> (bool, bool, bool, bool, bool, Option<IndexConfig>) {
242 let mut indexed = false;
243 let mut stored = false;
244 let mut multi = false;
245 let mut fast = false;
246 let mut primary = false;
247 let mut index_config = None;
248
249 for attr in pair.into_inner() {
250 if attr.as_rule() == Rule::attribute {
251 let mut found_config = false;
253 for inner in attr.clone().into_inner() {
254 match inner.as_rule() {
255 Rule::indexed_with_config => {
256 indexed = true;
257 index_config = Some(parse_index_config(inner));
258 found_config = true;
259 break;
260 }
261 Rule::stored_with_config => {
262 stored = true;
263 multi = true; found_config = true;
265 break;
266 }
267 _ => {}
268 }
269 }
270 if !found_config {
271 match attr.as_str() {
273 "indexed" => indexed = true,
274 "stored" => stored = true,
275 "fast" => fast = true,
276 "primary" => primary = true,
277 _ => {}
278 }
279 }
280 }
281 }
282
283 (indexed, stored, multi, fast, primary, index_config)
284}
285
286fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
288 let mut config = IndexConfig::default();
289
290 for inner in pair.into_inner() {
295 if inner.as_rule() == Rule::index_config_params {
296 for param in inner.into_inner() {
297 if param.as_rule() == Rule::index_config_param {
298 for p in param.into_inner() {
299 parse_single_index_config_param(&mut config, p);
300 }
301 }
302 }
303 }
304 }
305
306 config
307}
308
309fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
311 use super::schema::VectorIndexType;
312
313 match p.as_rule() {
314 Rule::index_type_spec => {
315 config.index_type = Some(match p.as_str() {
316 "flat" => VectorIndexType::Flat,
317 "rabitq" => VectorIndexType::RaBitQ,
318 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
319 "scann" => VectorIndexType::ScaNN,
320 _ => VectorIndexType::RaBitQ,
321 });
322 }
323 Rule::index_type_kwarg => {
324 if let Some(t) = p.into_inner().next() {
326 config.index_type = Some(match t.as_str() {
327 "flat" => VectorIndexType::Flat,
328 "rabitq" => VectorIndexType::RaBitQ,
329 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
330 "scann" => VectorIndexType::ScaNN,
331 _ => VectorIndexType::RaBitQ,
332 });
333 }
334 }
335 Rule::num_clusters_kwarg => {
336 if let Some(n) = p.into_inner().next() {
338 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
339 log::warn!(
340 "Invalid num_clusters value '{}', using default 256",
341 n.as_str()
342 );
343 256
344 }));
345 }
346 }
347 Rule::build_threshold_kwarg => {
348 if let Some(n) = p.into_inner().next() {
350 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
351 log::warn!(
352 "Invalid build_threshold value '{}', using default 10000",
353 n.as_str()
354 );
355 10000
356 }));
357 }
358 }
359 Rule::nprobe_kwarg => {
360 if let Some(n) = p.into_inner().next() {
362 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
363 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
364 32
365 }));
366 }
367 }
368 Rule::quantization_kwarg => {
369 if let Some(q) = p.into_inner().next() {
371 config.quantization = Some(match q.as_str() {
372 "float32" | "f32" => WeightQuantization::Float32,
373 "float16" | "f16" => WeightQuantization::Float16,
374 "uint8" | "u8" => WeightQuantization::UInt8,
375 "uint4" | "u4" => WeightQuantization::UInt4,
376 _ => WeightQuantization::default(),
377 });
378 }
379 }
380 Rule::weight_threshold_kwarg => {
381 if let Some(t) = p.into_inner().next() {
383 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
384 log::warn!(
385 "Invalid weight_threshold value '{}', using default 0.0",
386 t.as_str()
387 );
388 0.0
389 }));
390 }
391 }
392 Rule::block_size_kwarg => {
393 if let Some(n) = p.into_inner().next() {
395 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
396 log::warn!(
397 "Invalid block_size value '{}', using default 128",
398 n.as_str()
399 );
400 128
401 }));
402 }
403 }
404 Rule::pruning_kwarg => {
405 if let Some(f) = p.into_inner().next() {
407 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
408 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
409 1.0
410 }));
411 }
412 }
413 Rule::query_config_block => {
414 parse_query_config_block(config, p);
416 }
417 Rule::positions_kwarg => {
418 use super::schema::PositionMode;
420 config.positions = Some(match p.as_str() {
421 "ordinal" => PositionMode::Ordinal,
422 "token_position" => PositionMode::TokenPosition,
423 _ => PositionMode::Full, });
425 }
426 _ => {}
427 }
428}
429
430fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
432 for inner in pair.into_inner() {
433 if inner.as_rule() == Rule::query_config_params {
434 for param in inner.into_inner() {
435 if param.as_rule() == Rule::query_config_param {
436 for p in param.into_inner() {
437 match p.as_rule() {
438 Rule::query_tokenizer_kwarg => {
439 if let Some(path) = p.into_inner().next()
441 && let Some(inner_path) = path.into_inner().next()
442 {
443 config.query_tokenizer = Some(inner_path.as_str().to_string());
444 }
445 }
446 Rule::query_weighting_kwarg => {
447 if let Some(w) = p.into_inner().next() {
449 config.query_weighting = Some(match w.as_str() {
450 "one" => QueryWeighting::One,
451 "idf" => QueryWeighting::Idf,
452 "idf_file" => QueryWeighting::IdfFile,
453 _ => QueryWeighting::One,
454 });
455 }
456 }
457 Rule::query_weight_threshold_kwarg => {
458 if let Some(t) = p.into_inner().next() {
459 config.query_weight_threshold =
460 Some(t.as_str().parse().unwrap_or_else(|_| {
461 log::warn!(
462 "Invalid query weight_threshold '{}', using 0.0",
463 t.as_str()
464 );
465 0.0
466 }));
467 }
468 }
469 Rule::query_max_dims_kwarg => {
470 if let Some(t) = p.into_inner().next() {
471 config.query_max_dims =
472 Some(t.as_str().parse().unwrap_or_else(|_| {
473 log::warn!(
474 "Invalid query max_dims '{}', using 0",
475 t.as_str()
476 );
477 0
478 }));
479 }
480 }
481 Rule::query_pruning_kwarg => {
482 if let Some(t) = p.into_inner().next() {
483 config.query_pruning =
484 Some(t.as_str().parse().unwrap_or_else(|_| {
485 log::warn!(
486 "Invalid query pruning '{}', using 1.0",
487 t.as_str()
488 );
489 1.0
490 }));
491 }
492 }
493 _ => {}
494 }
495 }
496 }
497 }
498 }
499 }
500}
501
502fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
504 let mut inner = pair.into_inner();
505
506 let name = inner
507 .next()
508 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
509 .as_str()
510 .to_string();
511
512 let field_type_str = inner
513 .next()
514 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
515 .as_str();
516
517 let field_type = parse_field_type(field_type_str)?;
518
519 let mut tokenizer = None;
521 let mut sparse_vector_config = None;
522 let mut dense_vector_config = None;
523 let mut indexed = true;
524 let mut stored = true;
525 let mut multi = false;
526 let mut fast = false;
527 let mut primary = false;
528 let mut index_config: Option<IndexConfig> = None;
529
530 for item in inner {
531 match item.as_rule() {
532 Rule::tokenizer_spec => {
533 if let Some(tok_name) = item.into_inner().next() {
535 tokenizer = Some(tok_name.as_str().to_string());
536 }
537 }
538 Rule::sparse_vector_config => {
539 sparse_vector_config = Some(parse_sparse_vector_config(item));
541 }
542 Rule::dense_vector_config => {
543 dense_vector_config = Some(parse_dense_vector_config(item));
545 }
546 Rule::attributes => {
547 let (idx, sto, mul, fst, pri, idx_cfg) = parse_attributes(item);
548 indexed = idx;
549 stored = sto;
550 multi = mul;
551 fast = fst;
552 primary = pri;
553 index_config = idx_cfg;
554 }
555 _ => {}
556 }
557 }
558
559 if primary {
561 fast = true;
562 indexed = true;
563 }
564
565 let mut positions = None;
567 if let Some(idx_cfg) = index_config {
568 positions = idx_cfg.positions;
569 if let Some(ref mut dv_config) = dense_vector_config {
570 apply_index_config_to_dense_vector(dv_config, idx_cfg);
571 } else if field_type == FieldType::SparseVector {
572 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
574 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
575 }
576 }
577
578 Ok(FieldDef {
579 name,
580 field_type,
581 indexed,
582 stored,
583 tokenizer,
584 multi,
585 positions,
586 sparse_vector_config,
587 dense_vector_config,
588 fast,
589 primary,
590 })
591}
592
593fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
595 if let Some(index_type) = idx_cfg.index_type {
597 config.index_type = index_type;
598 }
599
600 if idx_cfg.num_clusters.is_some() {
602 config.num_clusters = idx_cfg.num_clusters;
603 }
604
605 if let Some(nprobe) = idx_cfg.nprobe {
607 config.nprobe = nprobe;
608 }
609
610 if idx_cfg.build_threshold.is_some() {
612 config.build_threshold = idx_cfg.build_threshold;
613 }
614}
615
616fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
619 let mut index_size = IndexSize::default();
620
621 for inner in pair.into_inner() {
623 if inner.as_rule() == Rule::index_size_spec {
624 index_size = match inner.as_str() {
625 "u16" => IndexSize::U16,
626 "u32" => IndexSize::U32,
627 _ => IndexSize::default(),
628 };
629 }
630 }
631
632 SparseVectorConfig {
633 index_size,
634 weight_quantization: WeightQuantization::default(),
635 weight_threshold: 0.0,
636 block_size: 128,
637 pruning: None,
638 query_config: None,
639 }
640}
641
642fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
644 if let Some(q) = idx_cfg.quantization {
645 config.weight_quantization = q;
646 }
647 if let Some(t) = idx_cfg.weight_threshold {
648 config.weight_threshold = t;
649 }
650 if let Some(bs) = idx_cfg.block_size {
651 let adjusted = bs.next_power_of_two();
652 if adjusted != bs {
653 log::warn!(
654 "block_size {} adjusted to next power of two: {}",
655 bs,
656 adjusted
657 );
658 }
659 config.block_size = adjusted;
660 }
661 if let Some(p) = idx_cfg.pruning {
662 let clamped = p.clamp(0.0, 1.0);
663 if (clamped - p).abs() > f32::EPSILON {
664 log::warn!(
665 "pruning {} clamped to valid range [0.0, 1.0]: {}",
666 p,
667 clamped
668 );
669 }
670 config.pruning = Some(clamped);
671 }
672 if idx_cfg.query_tokenizer.is_some()
674 || idx_cfg.query_weighting.is_some()
675 || idx_cfg.query_weight_threshold.is_some()
676 || idx_cfg.query_max_dims.is_some()
677 || idx_cfg.query_pruning.is_some()
678 {
679 let query_config = config
680 .query_config
681 .get_or_insert(SparseQueryConfig::default());
682 if let Some(tokenizer) = idx_cfg.query_tokenizer {
683 query_config.tokenizer = Some(tokenizer);
684 }
685 if let Some(weighting) = idx_cfg.query_weighting {
686 query_config.weighting = weighting;
687 }
688 if let Some(t) = idx_cfg.query_weight_threshold {
689 query_config.weight_threshold = t;
690 }
691 if let Some(d) = idx_cfg.query_max_dims {
692 query_config.max_query_dims = Some(d);
693 }
694 if let Some(p) = idx_cfg.query_pruning {
695 query_config.pruning = Some(p);
696 }
697 }
698}
699
700fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
703 let mut dim: usize = 0;
704 let mut quantization = DenseVectorQuantization::F32;
705
706 for params in pair.into_inner() {
708 if params.as_rule() == Rule::dense_vector_params {
709 for inner in params.into_inner() {
710 match inner.as_rule() {
711 Rule::dense_vector_keyword_params => {
712 for kwarg in inner.into_inner() {
713 match kwarg.as_rule() {
714 Rule::dims_kwarg => {
715 if let Some(d) = kwarg.into_inner().next() {
716 dim = d.as_str().parse().unwrap_or(0);
717 }
718 }
719 Rule::quant_type_spec => {
720 quantization = parse_quant_type(kwarg.as_str());
721 }
722 _ => {}
723 }
724 }
725 }
726 Rule::dense_vector_positional_params => {
727 for item in inner.into_inner() {
728 match item.as_rule() {
729 Rule::dimension_spec => {
730 dim = item.as_str().parse().unwrap_or(0);
731 }
732 Rule::quant_type_spec => {
733 quantization = parse_quant_type(item.as_str());
734 }
735 _ => {}
736 }
737 }
738 }
739 _ => {}
740 }
741 }
742 }
743 }
744
745 DenseVectorConfig::new(dim).with_quantization(quantization)
746}
747
748fn parse_quant_type(s: &str) -> DenseVectorQuantization {
749 match s.trim() {
750 "f16" => DenseVectorQuantization::F16,
751 "uint8" | "u8" => DenseVectorQuantization::UInt8,
752 _ => DenseVectorQuantization::F32,
753 }
754}
755
756fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
758 pair.into_inner().map(|p| p.as_str().to_string()).collect()
759}
760
761fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
763 let mut pattern = String::new();
764 let mut substitution = String::new();
765 let mut target_field = String::new();
766 let mut mode = RoutingMode::Additional;
767
768 for prop in pair.into_inner() {
769 if prop.as_rule() != Rule::query_router_prop {
770 continue;
771 }
772
773 for inner in prop.into_inner() {
774 match inner.as_rule() {
775 Rule::query_router_pattern => {
776 if let Some(regex_str) = inner.into_inner().next() {
777 pattern = parse_string_value(regex_str);
778 }
779 }
780 Rule::query_router_substitution => {
781 if let Some(quoted) = inner.into_inner().next() {
782 substitution = parse_string_value(quoted);
783 }
784 }
785 Rule::query_router_target => {
786 if let Some(ident) = inner.into_inner().next() {
787 target_field = ident.as_str().to_string();
788 }
789 }
790 Rule::query_router_mode => {
791 if let Some(mode_val) = inner.into_inner().next() {
792 mode = match mode_val.as_str() {
793 "exclusive" => RoutingMode::Exclusive,
794 "additional" => RoutingMode::Additional,
795 _ => RoutingMode::Additional,
796 };
797 }
798 }
799 _ => {}
800 }
801 }
802 }
803
804 if pattern.is_empty() {
805 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
806 }
807 if substitution.is_empty() {
808 return Err(Error::Schema(
809 "query_router missing 'substitution'".to_string(),
810 ));
811 }
812 if target_field.is_empty() {
813 return Err(Error::Schema(
814 "query_router missing 'target_field'".to_string(),
815 ));
816 }
817
818 Ok(QueryRouterRule {
819 pattern,
820 substitution,
821 target_field,
822 mode,
823 })
824}
825
826fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
828 let s = pair.as_str();
829 match pair.as_rule() {
830 Rule::regex_string => {
831 if let Some(inner) = pair.into_inner().next() {
833 parse_string_value(inner)
834 } else {
835 s.to_string()
836 }
837 }
838 Rule::raw_string => {
839 s[2..s.len() - 1].to_string()
841 }
842 Rule::quoted_string => {
843 let inner = &s[1..s.len() - 1];
845 inner
847 .replace("\\n", "\n")
848 .replace("\\t", "\t")
849 .replace("\\\"", "\"")
850 .replace("\\\\", "\\")
851 }
852 _ => s.to_string(),
853 }
854}
855
856fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
858 let mut inner = pair.into_inner();
859
860 let name = inner
861 .next()
862 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
863 .as_str()
864 .to_string();
865
866 let mut fields = Vec::new();
867 let mut default_fields = Vec::new();
868 let mut query_routers = Vec::new();
869
870 for item in inner {
871 match item.as_rule() {
872 Rule::field_def => {
873 fields.push(parse_field_def(item)?);
874 }
875 Rule::default_fields_def => {
876 default_fields = parse_default_fields_def(item);
877 }
878 Rule::query_router_def => {
879 query_routers.push(parse_query_router_def(item)?);
880 }
881 _ => {}
882 }
883 }
884
885 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
887 if primary_fields.len() > 1 {
888 return Err(Error::Schema(format!(
889 "Index '{}' has {} primary key fields, but at most one is allowed",
890 name,
891 primary_fields.len()
892 )));
893 }
894 if let Some(pk) = primary_fields.first() {
895 if pk.field_type != FieldType::Text {
896 return Err(Error::Schema(format!(
897 "Primary key field '{}' must be of type text, got {:?}",
898 pk.name, pk.field_type
899 )));
900 }
901 if pk.multi {
902 return Err(Error::Schema(format!(
903 "Primary key field '{}' cannot be multi-valued",
904 pk.name
905 )));
906 }
907 }
908
909 Ok(IndexDef {
910 name,
911 fields,
912 default_fields,
913 query_routers,
914 })
915}
916
917pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
919 let pairs = SdlParser::parse(Rule::file, input)
920 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
921
922 let mut indexes = Vec::new();
923
924 for pair in pairs {
925 if pair.as_rule() == Rule::file {
926 for inner in pair.into_inner() {
927 if inner.as_rule() == Rule::index_def {
928 indexes.push(parse_index_def(inner)?);
929 }
930 }
931 }
932 }
933
934 Ok(indexes)
935}
936
937pub fn parse_single_index(input: &str) -> Result<IndexDef> {
939 let indexes = parse_sdl(input)?;
940
941 if indexes.is_empty() {
942 return Err(Error::Schema("No index definition found".to_string()));
943 }
944
945 if indexes.len() > 1 {
946 return Err(Error::Schema(
947 "Multiple index definitions found, expected one".to_string(),
948 ));
949 }
950
951 Ok(indexes.into_iter().next().unwrap())
952}
953
954#[cfg(test)]
955mod tests {
956 use super::*;
957
958 #[test]
959 fn test_parse_simple_schema() {
960 let sdl = r#"
961 index articles {
962 field title: text [indexed, stored]
963 field body: text [indexed]
964 }
965 "#;
966
967 let indexes = parse_sdl(sdl).unwrap();
968 assert_eq!(indexes.len(), 1);
969
970 let index = &indexes[0];
971 assert_eq!(index.name, "articles");
972 assert_eq!(index.fields.len(), 2);
973
974 assert_eq!(index.fields[0].name, "title");
975 assert!(matches!(index.fields[0].field_type, FieldType::Text));
976 assert!(index.fields[0].indexed);
977 assert!(index.fields[0].stored);
978
979 assert_eq!(index.fields[1].name, "body");
980 assert!(matches!(index.fields[1].field_type, FieldType::Text));
981 assert!(index.fields[1].indexed);
982 assert!(!index.fields[1].stored);
983 }
984
985 #[test]
986 fn test_parse_all_field_types() {
987 let sdl = r#"
988 index test {
989 field text_field: text [indexed, stored]
990 field u64_field: u64 [indexed, stored]
991 field i64_field: i64 [indexed, stored]
992 field f64_field: f64 [indexed, stored]
993 field bytes_field: bytes [stored]
994 }
995 "#;
996
997 let indexes = parse_sdl(sdl).unwrap();
998 let index = &indexes[0];
999
1000 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1001 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1002 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1003 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1004 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1005 }
1006
1007 #[test]
1008 fn test_parse_with_comments() {
1009 let sdl = r#"
1010 # This is a comment
1011 index articles {
1012 # Title field
1013 field title: text [indexed, stored]
1014 field body: text [indexed] # inline comment not supported yet
1015 }
1016 "#;
1017
1018 let indexes = parse_sdl(sdl).unwrap();
1019 assert_eq!(indexes[0].fields.len(), 2);
1020 }
1021
1022 #[test]
1023 fn test_parse_type_aliases() {
1024 let sdl = r#"
1025 index test {
1026 field a: string [indexed]
1027 field b: int [indexed]
1028 field c: uint [indexed]
1029 field d: float [indexed]
1030 field e: binary [stored]
1031 }
1032 "#;
1033
1034 let indexes = parse_sdl(sdl).unwrap();
1035 let index = &indexes[0];
1036
1037 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1038 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1039 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1040 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1041 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1042 }
1043
1044 #[test]
1045 fn test_to_schema() {
1046 let sdl = r#"
1047 index articles {
1048 field title: text [indexed, stored]
1049 field views: u64 [indexed, stored]
1050 }
1051 "#;
1052
1053 let indexes = parse_sdl(sdl).unwrap();
1054 let schema = indexes[0].to_schema();
1055
1056 assert!(schema.get_field("title").is_some());
1057 assert!(schema.get_field("views").is_some());
1058 assert!(schema.get_field("nonexistent").is_none());
1059 }
1060
1061 #[test]
1062 fn test_default_attributes() {
1063 let sdl = r#"
1064 index test {
1065 field title: text
1066 }
1067 "#;
1068
1069 let indexes = parse_sdl(sdl).unwrap();
1070 let field = &indexes[0].fields[0];
1071
1072 assert!(field.indexed);
1074 assert!(field.stored);
1075 }
1076
1077 #[test]
1078 fn test_multiple_indexes() {
1079 let sdl = r#"
1080 index articles {
1081 field title: text [indexed, stored]
1082 }
1083
1084 index users {
1085 field name: text [indexed, stored]
1086 field email: text [indexed, stored]
1087 }
1088 "#;
1089
1090 let indexes = parse_sdl(sdl).unwrap();
1091 assert_eq!(indexes.len(), 2);
1092 assert_eq!(indexes[0].name, "articles");
1093 assert_eq!(indexes[1].name, "users");
1094 }
1095
1096 #[test]
1097 fn test_tokenizer_spec() {
1098 let sdl = r#"
1099 index articles {
1100 field title: text<en_stem> [indexed, stored]
1101 field body: text<simple> [indexed]
1102 field author: text [indexed, stored]
1103 }
1104 "#;
1105
1106 let indexes = parse_sdl(sdl).unwrap();
1107 let index = &indexes[0];
1108
1109 assert_eq!(index.fields[0].name, "title");
1110 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1111
1112 assert_eq!(index.fields[1].name, "body");
1113 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1114
1115 assert_eq!(index.fields[2].name, "author");
1116 assert_eq!(index.fields[2].tokenizer, None); }
1118
1119 #[test]
1120 fn test_tokenizer_in_schema() {
1121 let sdl = r#"
1122 index articles {
1123 field title: text<german> [indexed, stored]
1124 field body: text<en_stem> [indexed]
1125 }
1126 "#;
1127
1128 let indexes = parse_sdl(sdl).unwrap();
1129 let schema = indexes[0].to_schema();
1130
1131 let title_field = schema.get_field("title").unwrap();
1132 let title_entry = schema.get_field_entry(title_field).unwrap();
1133 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1134
1135 let body_field = schema.get_field("body").unwrap();
1136 let body_entry = schema.get_field_entry(body_field).unwrap();
1137 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1138 }
1139
1140 #[test]
1141 fn test_query_router_basic() {
1142 let sdl = r#"
1143 index documents {
1144 field title: text [indexed, stored]
1145 field uri: text [indexed, stored]
1146
1147 query_router {
1148 pattern: "10\\.\\d{4,}/[^\\s]+"
1149 substitution: "doi://{0}"
1150 target_field: uris
1151 mode: exclusive
1152 }
1153 }
1154 "#;
1155
1156 let indexes = parse_sdl(sdl).unwrap();
1157 let index = &indexes[0];
1158
1159 assert_eq!(index.query_routers.len(), 1);
1160 let router = &index.query_routers[0];
1161 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1162 assert_eq!(router.substitution, "doi://{0}");
1163 assert_eq!(router.target_field, "uris");
1164 assert_eq!(router.mode, RoutingMode::Exclusive);
1165 }
1166
1167 #[test]
1168 fn test_query_router_raw_string() {
1169 let sdl = r#"
1170 index documents {
1171 field uris: text [indexed, stored]
1172
1173 query_router {
1174 pattern: r"^pmid:(\d+)$"
1175 substitution: "pubmed://{1}"
1176 target_field: uris
1177 mode: additional
1178 }
1179 }
1180 "#;
1181
1182 let indexes = parse_sdl(sdl).unwrap();
1183 let router = &indexes[0].query_routers[0];
1184
1185 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1186 assert_eq!(router.substitution, "pubmed://{1}");
1187 assert_eq!(router.mode, RoutingMode::Additional);
1188 }
1189
1190 #[test]
1191 fn test_multiple_query_routers() {
1192 let sdl = r#"
1193 index documents {
1194 field uris: text [indexed, stored]
1195
1196 query_router {
1197 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1198 substitution: "doi://{1}"
1199 target_field: uris
1200 mode: exclusive
1201 }
1202
1203 query_router {
1204 pattern: r"^pmid:(\d+)$"
1205 substitution: "pubmed://{1}"
1206 target_field: uris
1207 mode: exclusive
1208 }
1209
1210 query_router {
1211 pattern: r"^arxiv:(\d+\.\d+)$"
1212 substitution: "arxiv://{1}"
1213 target_field: uris
1214 mode: additional
1215 }
1216 }
1217 "#;
1218
1219 let indexes = parse_sdl(sdl).unwrap();
1220 assert_eq!(indexes[0].query_routers.len(), 3);
1221 }
1222
1223 #[test]
1224 fn test_query_router_default_mode() {
1225 let sdl = r#"
1226 index documents {
1227 field uris: text [indexed, stored]
1228
1229 query_router {
1230 pattern: r"test"
1231 substitution: "{0}"
1232 target_field: uris
1233 }
1234 }
1235 "#;
1236
1237 let indexes = parse_sdl(sdl).unwrap();
1238 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1240 }
1241
1242 #[test]
1243 fn test_multi_attribute() {
1244 let sdl = r#"
1245 index documents {
1246 field uris: text [indexed, stored<multi>]
1247 field title: text [indexed, stored]
1248 }
1249 "#;
1250
1251 let indexes = parse_sdl(sdl).unwrap();
1252 assert_eq!(indexes.len(), 1);
1253
1254 let fields = &indexes[0].fields;
1255 assert_eq!(fields.len(), 2);
1256
1257 assert_eq!(fields[0].name, "uris");
1259 assert!(fields[0].multi, "uris field should have multi=true");
1260
1261 assert_eq!(fields[1].name, "title");
1263 assert!(!fields[1].multi, "title field should have multi=false");
1264
1265 let schema = indexes[0].to_schema();
1267 let uris_field = schema.get_field("uris").unwrap();
1268 let title_field = schema.get_field("title").unwrap();
1269
1270 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1271 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1272 }
1273
1274 #[test]
1275 fn test_sparse_vector_field() {
1276 let sdl = r#"
1277 index documents {
1278 field embedding: sparse_vector [indexed, stored]
1279 }
1280 "#;
1281
1282 let indexes = parse_sdl(sdl).unwrap();
1283 assert_eq!(indexes.len(), 1);
1284 assert_eq!(indexes[0].fields.len(), 1);
1285 assert_eq!(indexes[0].fields[0].name, "embedding");
1286 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1287 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1288 }
1289
1290 #[test]
1291 fn test_sparse_vector_with_config() {
1292 let sdl = r#"
1293 index documents {
1294 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1295 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1296 }
1297 "#;
1298
1299 let indexes = parse_sdl(sdl).unwrap();
1300 assert_eq!(indexes[0].fields.len(), 2);
1301
1302 let f1 = &indexes[0].fields[0];
1304 assert_eq!(f1.name, "embedding");
1305 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1306 assert_eq!(config1.index_size, IndexSize::U16);
1307 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1308
1309 let f2 = &indexes[0].fields[1];
1311 assert_eq!(f2.name, "dense");
1312 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1313 assert_eq!(config2.index_size, IndexSize::U32);
1314 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1315 }
1316
/// `weight_threshold` inside `indexed<...>` is parsed alongside the index
/// size and quantization for each sparse-vector field.
#[test]
fn test_sparse_vector_with_weight_threshold() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
            field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);
    let (first, second) = (&fields[0], &fields[1]);

    // First field: u16 indices, uint8 weights, threshold 0.1.
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);
    assert!((first_cfg.weight_threshold - 0.1).abs() < 0.001);

    // Second field: u32 indices, float16 weights, threshold 0.05.
    assert_eq!(second.name, "embedding2");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float16);
    assert!((second_cfg.weight_threshold - 0.05).abs() < 0.001);
}
1345
/// `pruning` given inside `indexed<...>` is stored on the sparse-vector
/// configuration.
///
/// The parsed value is checked with a small tolerance instead of exact float
/// equality, in line with how the other tests in this module compare parsed
/// floating-point options such as `weight_threshold`.
#[test]
fn test_sparse_vector_with_pruning() {
    let sdl = r#"
        index documents {
            field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
        }
    "#;

    let indexes = parse_sdl(sdl).unwrap();
    let f = &indexes[0].fields[0];
    assert_eq!(f.name, "embedding");

    let config = f.sparse_vector_config.as_ref().unwrap();
    assert_eq!(config.weight_quantization, WeightQuantization::UInt8);

    // The option must be present; its value only needs to be close to 0.1.
    let pruning = config.pruning.expect("pruning should be set from the SDL");
    assert!(
        (pruning - 0.1).abs() < 0.001,
        "unexpected pruning value: {}",
        pruning
    );
}
1361
/// `dense_vector<768>` parses to a dense-vector field with dimension 768.
#[test]
fn test_dense_vector_field() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 1);

    let embedding = &fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
}
1381
/// The `vector<N>` spelling produces the same dense-vector field type as
/// `dense_vector<N>`.
#[test]
fn test_dense_vector_alias() {
    let source = r#"
        index documents {
            field embedding: vector<1536> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    assert_eq!(field.field_type, FieldType::DenseVector);

    let dense_cfg = field.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 1536);
}
1401
/// `num_clusters` inside `indexed<ivf_rabitq, ...>` is captured; `nprobe` is
/// not given in the SDL and is expected to come out as 32.
#[test]
fn test_dense_vector_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    // No explicit `nprobe` in the SDL above.
    assert_eq!(dense_cfg.nprobe, 32);
}
1422
/// Both `num_clusters` and `nprobe` are taken from the `indexed<...>` options.
#[test]
fn test_dense_vector_with_num_clusters_and_nprobe() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 1536);
    assert_eq!(cfg.num_clusters, Some(512));
    assert_eq!(cfg.nprobe, 64);
}
1438
/// The keyword form `dense_vector<dims: N>` sets the dimension; with no
/// index options, `num_clusters` stays unset.
#[test]
fn test_dense_vector_keyword_syntax() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.num_clusters, None);
}
1453
/// Keyword dimension syntax combined with a full set of IVF options
/// (`num_clusters` and `nprobe`).
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let vector_cfg = definitions[0].fields[0]
        .dense_vector_config
        .as_ref()
        .unwrap();

    assert_eq!(vector_cfg.dim, 1536);
    assert_eq!(vector_cfg.num_clusters, Some(256));
    assert_eq!(vector_cfg.nprobe, 64);
}
1469
/// Keyword dimension syntax with only `num_clusters` given; `nprobe` is
/// absent from the SDL and is expected to come out as 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let field = &definitions[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 768);
    assert_eq!(cfg.num_clusters, Some(128));
    // No explicit `nprobe` in the SDL above.
    assert_eq!(cfg.nprobe, 32);
}
1485
/// `indexed<scann, ...>` selects `VectorIndexType::ScaNN` and keeps the
/// cluster/probe options.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    assert_eq!(dense_cfg.nprobe, 64);
}
1504
/// `indexed<ivf_rabitq, ...>` selects `VectorIndexType::IvfRaBitQ`.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let field = &definitions[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 1536);
    assert_eq!(cfg.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(cfg.num_clusters, Some(512));
}
1522
/// `indexed<rabitq>` selects `VectorIndexType::RaBitQ` and leaves
/// `num_clusters` unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let first_index = &parsed[0];
    let vector_cfg = first_index.fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(vector_cfg.dim, 768);
    assert_eq!(vector_cfg.index_type, VectorIndexType::RaBitQ);
    assert!(vector_cfg.num_clusters.is_none());
}
1540
/// `indexed<flat>` selects `VectorIndexType::Flat`.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let embedding = &definitions[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::Flat);
}
1557
/// A plain `[indexed]` attribute names no index type; the resulting
/// configuration is expected to use `VectorIndexType::RaBitQ`.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 768);
    assert_eq!(cfg.index_type, VectorIndexType::RaBitQ);
}
1575
/// `dense_vector<768, f16>` selects F16 quantization; with plain `[indexed]`
/// the index type is expected to be `RaBitQ`.
#[test]
fn test_dense_vector_f16_quantization() {
    use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};

    let source = r#"
        index documents {
            field embedding: dense_vector<768, f16> [indexed]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let embedding = &definitions[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F16);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
}
1593
/// `dense_vector<1024, uint8>` selects UInt8 quantization.
#[test]
fn test_dense_vector_uint8_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 1024);
    assert_eq!(cfg.quantization, DenseVectorQuantization::UInt8);
}
1610
/// The `u8` spelling maps to the same UInt8 quantization as `uint8`.
#[test]
fn test_dense_vector_u8_alias() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<512, u8> [indexed]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let first_index = &definitions[0];
    let vector_cfg = first_index.fields[0].dense_vector_config.as_ref().unwrap();

    assert_eq!(vector_cfg.dim, 512);
    assert_eq!(vector_cfg.quantization, DenseVectorQuantization::UInt8);
}
1627
/// With no quantization named in `dense_vector<768>`, the configuration is
/// expected to report F32.
#[test]
fn test_dense_vector_default_f32_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F32);
}
1645
/// Keyword dimension syntax (`dims: N`) can be followed by a quantization
/// name.
#[test]
fn test_dense_vector_keyword_with_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768, f16> [indexed]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let field = &definitions[0].fields[0];
    let cfg = field.dense_vector_config.as_ref().unwrap();

    assert_eq!(cfg.dim, 768);
    assert_eq!(cfg.quantization, DenseVectorQuantization::F16);
}
1662
/// `json` fields parse with or without an attribute list, and the schema
/// entry built for a `[stored]` JSON field is stored but not indexed.
#[test]
fn test_json_field_type() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 3);

    // JSON field declared with `[stored]`.
    let metadata = &fields[1];
    assert_eq!(metadata.name, "metadata");
    assert_eq!(metadata.field_type, FieldType::Json);
    assert!(metadata.stored);

    // JSON field declared without any attribute list.
    let extra = &fields[2];
    assert_eq!(extra.name, "extra");
    assert_eq!(extra.field_type, FieldType::Json);

    // The same properties must hold on the generated schema entry.
    let schema = index.to_schema();
    let metadata_handle = schema.get_field("metadata").unwrap();
    let metadata_entry = schema.get_field_entry(metadata_handle).unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
1696
/// A nested `query<tokenizer: ..., weighting: idf>` block is parsed into the
/// sparse-vector query configuration and carried through `to_schema`.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    assert_eq!(index.fields.len(), 1);

    let embedding = &index.fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::SparseVector);

    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.index_size, IndexSize::U16);
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings on the parsed definition.
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same settings must be present on the generated schema entry.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_query = schema_entry
        .sparse_vector_config
        .as_ref()
        .unwrap()
        .query_config
        .as_ref()
        .unwrap();
    assert_eq!(
        schema_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1741
/// `query<weighting: one>` without a tokenizer yields `QueryWeighting::One`
/// and leaves the tokenizer unset.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let field = &parsed[0].fields[0];
    let sparse_cfg = field.sparse_vector_config.as_ref().unwrap();
    let query_cfg = sparse_cfg.query_config.as_ref().unwrap();

    assert!(query_cfg.tokenizer.is_none());
    assert_eq!(query_cfg.weighting, QueryWeighting::One);
}
1759
/// `weighting: idf_file` maps to `QueryWeighting::IdfFile`, both on the
/// parsed definition and on the schema built from it.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let index = &definitions[0];

    // Parsed definition.
    let sparse_cfg = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

    // Generated schema entry.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
1788
/// Query-level `weight_threshold`, `max_dims` and `pruning` are parsed and
/// also survive the conversion to a schema.
#[test]
fn test_sparse_vector_query_config_pruning_params() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
        }
    "#;

    let definitions = parse_sdl(source).unwrap();
    let index = &definitions[0];

    // Values on the parsed definition.
    let sparse_cfg = index.fields[0].sparse_vector_config.as_ref().unwrap();
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);
    assert!((parsed_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(parsed_query.max_query_dims, Some(25));
    assert!((parsed_query.pruning.unwrap() - 0.2).abs() < 0.001);

    // Values on the generated schema entry.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert!((schema_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(schema_query.max_query_dims, Some(25));
    assert!((schema_query.pruning.unwrap() - 0.2).abs() < 0.001);
}
1816
/// The `fast` attribute is recorded per field for text and numeric types and
/// is reflected in the schema entries produced by `to_schema`.
#[test]
fn test_fast_attribute() {
    let source = r#"
        index products {
            field name: text [indexed, stored]
            field price: f64 [indexed, fast]
            field category: text [indexed, stored, fast]
            field count: u64 [fast]
            field score: i64 [indexed, stored, fast]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 5);

    // `name` is the only field declared without `fast`.
    assert!(!fields[0].fast);

    // The remaining fields are all `fast`; check their types in declaration order.
    let fast_types = [
        FieldType::F64,
        FieldType::Text,
        FieldType::U64,
        FieldType::I64,
    ];
    for (field, expected_type) in fields[1..].iter().zip(fast_types.iter()) {
        assert!(field.fast);
        assert_eq!(field.field_type, *expected_type);
    }

    // The flag must also be visible on the generated schema entries.
    let schema = index.to_schema();
    let schema_fast = |name: &str| {
        let handle = schema.get_field(name).unwrap();
        schema.get_field_entry(handle).unwrap().fast
    };
    assert!(schema_fast("price"));
    assert!(schema_fast("category"));
    assert!(!schema_fast("name"));
}
1860
/// A `[primary]` field is marked primary and also comes out `fast` and
/// `indexed`; the schema reports it as the primary field.
#[test]
fn test_primary_attribute() {
    let source = r#"
        index documents {
            field id: text [primary, stored]
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 2);

    // Parsed definition: only `id` is primary, and `primary` pulls in fast + indexed.
    let id_def = &fields[0];
    assert!(id_def.primary, "id should be primary");
    assert!(id_def.fast, "primary implies fast");
    assert!(id_def.indexed, "primary implies indexed");
    assert!(!fields[1].primary);

    // Generated schema: same flags on the `id` entry, none on `title`.
    let schema = index.to_schema();
    let id_handle = schema.get_field("id").unwrap();
    let id_entry = schema.get_field_entry(id_handle).unwrap();
    assert!(id_entry.primary_key);
    assert!(id_entry.fast);
    assert!(id_entry.indexed);

    let title_handle = schema.get_field("title").unwrap();
    let title_entry = schema.get_field_entry(title_handle).unwrap();
    assert!(!title_entry.primary_key);

    assert_eq!(schema.primary_field(), Some(id_handle));
}
1898
/// `primary` can be combined with `indexed`, `stored` and an explicit
/// tokenizer; the field still comes out `fast`.
#[test]
fn test_primary_with_other_attributes() {
    let source = r#"
        index documents {
            field id: text<simple> [primary, indexed, stored]
            field body: text [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let id_def = &parsed[0].fields[0];

    assert!(id_def.primary);
    assert!(id_def.indexed);
    assert!(id_def.stored);
    assert!(id_def.fast);
    assert_eq!(id_def.tokenizer, Some("simple".to_string()));
}
1916
/// Declaring two `[primary]` fields in one index is rejected with an error
/// that mentions "primary key".
#[test]
fn test_primary_only_one_allowed() {
    let source = r#"
        index documents {
            field id: text [primary]
            field alt_id: text [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_primary_key = message.contains("primary key");
    assert!(
        mentions_primary_key,
        "Error should mention primary key: {}",
        message
    );
}
1935
/// A `[primary]` attribute on a `u64` field is rejected with an error that
/// mentions "text".
#[test]
fn test_primary_must_be_text() {
    let source = r#"
        index documents {
            field id: u64 [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    assert!(
        message.contains("text"),
        "Error should mention text type: {}",
        message
    );
}
1953
/// Combining `primary` with `stored<multi>` is rejected with an error that
/// mentions "multi".
#[test]
fn test_primary_cannot_be_multi() {
    let source = r#"
        index documents {
            field id: text [primary, stored<multi>]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_multi = message.contains("multi");
    assert!(mentions_multi, "Error should mention multi: {}", message);
}
1967
/// An index with no `[primary]` field produces a schema whose
/// `primary_field()` is `None`.
#[test]
fn test_no_primary_field() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let schema = index.to_schema();

    assert!(schema.primary_field().is_none());
}
1981}