1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Pest-generated parser for the schema definition language (SDL).
///
/// The grammar is read from `dsl/sdl/sdl.pest`; the derive also produces the
/// `Rule` enum that the parsing functions in this file match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60 WeightQuantization,
61};
62
/// One `field` declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL.
    pub name: String,
    /// Resolved field type (aliases such as `string` or `int` are already normalized).
    pub field_type: FieldType,
    /// Whether the field is indexed.
    pub indexed: bool,
    /// Whether the field is stored.
    pub stored: bool,
    /// Tokenizer named in the SDL, if any; text fields fall back to `"simple"`
    /// when the schema is built.
    pub tokenizer: Option<String>,
    /// Whether the field is multi-valued (set by the configured `stored<...>` attribute form).
    pub multi: bool,
    /// Position mode taken from the index configuration, if one was given.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse-vector settings, present when the SDL supplied any for this field.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense-vector settings; required by `IndexDef::to_schema` for dense vector fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Whether the `fast` attribute was set (also forced on for primary keys by the parser).
    pub fast: bool,
    /// Whether this field is the primary key.
    pub primary: bool,
    /// Whether the `simhash` attribute was set (the parser only accepts it on sparse vectors).
    pub simhash: bool,
}
87
/// One `index` definition parsed from SDL: its name, fields and query settings.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL.
    pub name: String,
    /// Field definitions in declaration order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration; empty when none was given.
    pub default_fields: Vec<String>,
    /// Query router rules in declaration order; empty when none were declared.
    pub query_routers: Vec<QueryRouterRule>,
}
97
98impl IndexDef {
99 pub fn to_schema(&self) -> Schema {
101 let mut builder = SchemaBuilder::default();
102
103 for field in &self.fields {
104 let f = match field.field_type {
105 FieldType::Text => {
106 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
107 builder.add_text_field_with_tokenizer(
108 &field.name,
109 field.indexed,
110 field.stored,
111 tokenizer,
112 )
113 }
114 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
115 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
116 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
117 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
118 FieldType::Json => builder.add_json_field(&field.name, field.stored),
119 FieldType::SparseVector => {
120 if let Some(config) = &field.sparse_vector_config {
121 builder.add_sparse_vector_field_with_config(
122 &field.name,
123 field.indexed,
124 field.stored,
125 config.clone(),
126 )
127 } else {
128 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
129 }
130 }
131 FieldType::DenseVector => {
132 let config = field
134 .dense_vector_config
135 .as_ref()
136 .expect("DenseVector field requires dimension to be specified");
137 builder.add_dense_vector_field_with_config(
138 &field.name,
139 field.indexed,
140 field.stored,
141 config.clone(),
142 )
143 }
144 };
145 if field.multi {
146 builder.set_multi(f, true);
147 }
148 if field.fast {
149 builder.set_fast(f, true);
150 }
151 if field.primary {
152 builder.set_primary_key(f);
153 }
154 if field.simhash {
155 builder.set_simhash(f, true);
156 }
157 let positions = field.positions.or({
159 if field.multi
161 && matches!(
162 field.field_type,
163 FieldType::SparseVector | FieldType::DenseVector
164 )
165 {
166 Some(super::schema::PositionMode::Ordinal)
167 } else {
168 None
169 }
170 });
171 if let Some(mode) = positions {
172 builder.set_positions(f, mode);
173 }
174 }
175
176 if !self.default_fields.is_empty() {
178 builder.set_default_fields(self.default_fields.clone());
179 }
180
181 if !self.query_routers.is_empty() {
183 builder.set_query_routers(self.query_routers.clone());
184 }
185
186 builder.build()
187 }
188
189 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
194 if self.query_routers.is_empty() {
195 return Ok(None);
196 }
197
198 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
199 .map(Some)
200 .map_err(Error::Schema)
201 }
202}
203
204fn parse_field_type(type_str: &str) -> Result<FieldType> {
206 match type_str {
207 "text" | "string" | "str" => Ok(FieldType::Text),
208 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
209 "i64" | "int" | "integer" => Ok(FieldType::I64),
210 "f64" | "float" | "double" => Ok(FieldType::F64),
211 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
212 "json" => Ok(FieldType::Json),
213 "sparse_vector" => Ok(FieldType::SparseVector),
214 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
215 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
216 }
217}
218
/// Options collected from a configured `indexed<...>` attribute.
///
/// Every option is `None` until the SDL sets it. After parsing, the values are
/// copied onto the field's dense or sparse vector configuration.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    /// Vector index type (applied to dense vector fields).
    index_type: Option<super::schema::VectorIndexType>,
    /// `num_clusters` option (applied to dense vector fields).
    num_clusters: Option<usize>,
    /// `nprobe` option (applied to dense vector fields).
    nprobe: Option<usize>,
    /// `build_threshold` option (applied to dense vector fields).
    build_threshold: Option<usize>,
    /// Sparse index format (applied to sparse vector fields).
    sparse_format: Option<SparseFormat>,
    /// Weight quantization (applied to sparse vector fields).
    quantization: Option<WeightQuantization>,
    /// `weight_threshold` option (applied to sparse vector fields).
    weight_threshold: Option<f32>,
    /// `block_size` option; rounded up to a power of two when applied.
    block_size: Option<usize>,
    /// `pruning` option; clamped to `[0.0, 1.0]` when applied.
    pruning: Option<f32>,
    /// `min_terms` option (applied to sparse vector fields).
    min_terms: Option<usize>,
    /// Query-time tokenizer from the nested query configuration block.
    query_tokenizer: Option<String>,
    /// Query-time weighting from the nested query configuration block.
    query_weighting: Option<QueryWeighting>,
    /// Query-time `weight_threshold` from the nested query configuration block.
    query_weight_threshold: Option<f32>,
    /// Query-time `max_dims` from the nested query configuration block.
    query_max_dims: Option<usize>,
    /// Query-time `pruning` from the nested query configuration block.
    query_pruning: Option<f32>,
    /// Query-time `min_query_dims` from the nested query configuration block.
    query_min_query_dims: Option<usize>,
    /// `dims` option for sparse vector fields.
    dims: Option<u32>,
    /// `max_weight` option for sparse vector fields.
    max_weight: Option<f32>,
    /// Position mode requested for the field.
    positions: Option<super::schema::PositionMode>,
}
246
247fn parse_attributes(
252 pair: pest::iterators::Pair<Rule>,
253) -> (bool, bool, bool, bool, bool, bool, Option<IndexConfig>) {
254 let mut indexed = false;
255 let mut stored = false;
256 let mut multi = false;
257 let mut fast = false;
258 let mut primary = false;
259 let mut simhash = false;
260 let mut index_config = None;
261
262 for attr in pair.into_inner() {
263 if attr.as_rule() == Rule::attribute {
264 let mut found_config = false;
266 for inner in attr.clone().into_inner() {
267 match inner.as_rule() {
268 Rule::indexed_with_config => {
269 indexed = true;
270 index_config = Some(parse_index_config(inner));
271 found_config = true;
272 break;
273 }
274 Rule::stored_with_config => {
275 stored = true;
276 multi = true; found_config = true;
278 break;
279 }
280 _ => {}
281 }
282 }
283 if !found_config {
284 match attr.as_str() {
286 "indexed" => indexed = true,
287 "stored" => stored = true,
288 "fast" => fast = true,
289 "primary" => primary = true,
290 "simhash" => {
291 simhash = true;
292 }
293 _ => {}
294 }
295 }
296 }
297 }
298
299 (indexed, stored, multi, fast, primary, simhash, index_config)
300}
301
302fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
304 let mut config = IndexConfig::default();
305
306 for inner in pair.into_inner() {
311 if inner.as_rule() == Rule::index_config_params {
312 for param in inner.into_inner() {
313 if param.as_rule() == Rule::index_config_param {
314 for p in param.into_inner() {
315 parse_single_index_config_param(&mut config, p);
316 }
317 }
318 }
319 }
320 }
321
322 config
323}
324
325fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
327 use super::schema::VectorIndexType;
328
329 match p.as_rule() {
330 Rule::index_type_spec => {
331 config.index_type = Some(match p.as_str() {
332 "flat" => VectorIndexType::Flat,
333 "rabitq" => VectorIndexType::RaBitQ,
334 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
335 "scann" => VectorIndexType::ScaNN,
336 _ => VectorIndexType::RaBitQ,
337 });
338 }
339 Rule::index_type_kwarg => {
340 if let Some(t) = p.into_inner().next() {
342 config.index_type = Some(match t.as_str() {
343 "flat" => VectorIndexType::Flat,
344 "rabitq" => VectorIndexType::RaBitQ,
345 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
346 "scann" => VectorIndexType::ScaNN,
347 _ => VectorIndexType::RaBitQ,
348 });
349 }
350 }
351 Rule::num_clusters_kwarg => {
352 if let Some(n) = p.into_inner().next() {
354 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
355 log::warn!(
356 "Invalid num_clusters value '{}', using default 256",
357 n.as_str()
358 );
359 256
360 }));
361 }
362 }
363 Rule::build_threshold_kwarg => {
364 if let Some(n) = p.into_inner().next() {
366 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
367 log::warn!(
368 "Invalid build_threshold value '{}', using default 10000",
369 n.as_str()
370 );
371 10000
372 }));
373 }
374 }
375 Rule::nprobe_kwarg => {
376 if let Some(n) = p.into_inner().next() {
378 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
379 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
380 32
381 }));
382 }
383 }
384 Rule::quantization_kwarg => {
385 if let Some(q) = p.into_inner().next() {
387 config.quantization = Some(match q.as_str() {
388 "float32" | "f32" => WeightQuantization::Float32,
389 "float16" | "f16" => WeightQuantization::Float16,
390 "uint8" | "u8" => WeightQuantization::UInt8,
391 "uint4" | "u4" => WeightQuantization::UInt4,
392 _ => WeightQuantization::default(),
393 });
394 }
395 }
396 Rule::weight_threshold_kwarg => {
397 if let Some(t) = p.into_inner().next() {
399 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
400 log::warn!(
401 "Invalid weight_threshold value '{}', using default 0.0",
402 t.as_str()
403 );
404 0.0
405 }));
406 }
407 }
408 Rule::block_size_kwarg => {
409 if let Some(n) = p.into_inner().next() {
411 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
412 log::warn!(
413 "Invalid block_size value '{}', using default 128",
414 n.as_str()
415 );
416 128
417 }));
418 }
419 }
420 Rule::pruning_kwarg => {
421 if let Some(f) = p.into_inner().next() {
423 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
424 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
425 1.0
426 }));
427 }
428 }
429 Rule::min_terms_kwarg => {
430 if let Some(n) = p.into_inner().next() {
431 config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
432 log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
433 4
434 }));
435 }
436 }
437 Rule::sparse_format_kwarg => {
438 if let Some(f) = p.into_inner().next() {
440 config.sparse_format = Some(match f.as_str() {
441 "bmp" => SparseFormat::Bmp,
442 "maxscore" => SparseFormat::MaxScore,
443 _ => SparseFormat::default(),
444 });
445 }
446 }
447 Rule::sparse_dims_kwarg => {
448 if let Some(n) = p.into_inner().next() {
449 config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
450 log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
451 105879
452 }));
453 }
454 }
455 Rule::sparse_max_weight_kwarg => {
456 if let Some(f) = p.into_inner().next() {
457 config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
458 log::warn!(
459 "Invalid max_weight value '{}', using default 5.0",
460 f.as_str()
461 );
462 5.0
463 }));
464 }
465 }
466 Rule::query_config_block => {
467 parse_query_config_block(config, p);
469 }
470 Rule::positions_kwarg => {
471 use super::schema::PositionMode;
473 config.positions = Some(match p.as_str() {
474 "ordinal" => PositionMode::Ordinal,
475 "token_position" => PositionMode::TokenPosition,
476 _ => PositionMode::Full, });
478 }
479 _ => {}
480 }
481}
482
483fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
485 for inner in pair.into_inner() {
486 if inner.as_rule() == Rule::query_config_params {
487 for param in inner.into_inner() {
488 if param.as_rule() == Rule::query_config_param {
489 for p in param.into_inner() {
490 match p.as_rule() {
491 Rule::query_tokenizer_kwarg => {
492 if let Some(path) = p.into_inner().next()
494 && let Some(inner_path) = path.into_inner().next()
495 {
496 config.query_tokenizer = Some(inner_path.as_str().to_string());
497 }
498 }
499 Rule::query_weighting_kwarg => {
500 if let Some(w) = p.into_inner().next() {
502 config.query_weighting = Some(match w.as_str() {
503 "one" => QueryWeighting::One,
504 "idf" => QueryWeighting::Idf,
505 "idf_file" => QueryWeighting::IdfFile,
506 _ => QueryWeighting::One,
507 });
508 }
509 }
510 Rule::query_weight_threshold_kwarg => {
511 if let Some(t) = p.into_inner().next() {
512 config.query_weight_threshold =
513 Some(t.as_str().parse().unwrap_or_else(|_| {
514 log::warn!(
515 "Invalid query weight_threshold '{}', using 0.0",
516 t.as_str()
517 );
518 0.0
519 }));
520 }
521 }
522 Rule::query_max_dims_kwarg => {
523 if let Some(t) = p.into_inner().next() {
524 config.query_max_dims =
525 Some(t.as_str().parse().unwrap_or_else(|_| {
526 log::warn!(
527 "Invalid query max_dims '{}', using 0",
528 t.as_str()
529 );
530 0
531 }));
532 }
533 }
534 Rule::query_pruning_kwarg => {
535 if let Some(t) = p.into_inner().next() {
536 config.query_pruning =
537 Some(t.as_str().parse().unwrap_or_else(|_| {
538 log::warn!(
539 "Invalid query pruning '{}', using 1.0",
540 t.as_str()
541 );
542 1.0
543 }));
544 }
545 }
546 Rule::query_min_query_dims_kwarg => {
547 if let Some(t) = p.into_inner().next() {
548 config.query_min_query_dims =
549 Some(t.as_str().parse().unwrap_or_else(|_| {
550 log::warn!(
551 "Invalid query min_query_dims '{}', using 4",
552 t.as_str()
553 );
554 4
555 }));
556 }
557 }
558 _ => {}
559 }
560 }
561 }
562 }
563 }
564 }
565}
566
567fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
569 let mut inner = pair.into_inner();
570
571 let name = inner
572 .next()
573 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
574 .as_str()
575 .to_string();
576
577 let field_type_str = inner
578 .next()
579 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
580 .as_str();
581
582 let field_type = parse_field_type(field_type_str)?;
583
584 let mut tokenizer = None;
586 let mut sparse_vector_config = None;
587 let mut dense_vector_config = None;
588 let mut indexed = true;
589 let mut stored = true;
590 let mut multi = false;
591 let mut fast = false;
592 let mut primary = false;
593 let mut simhash = false;
594 let mut index_config: Option<IndexConfig> = None;
595
596 for item in inner {
597 match item.as_rule() {
598 Rule::tokenizer_spec => {
599 if let Some(tok_name) = item.into_inner().next() {
601 tokenizer = Some(tok_name.as_str().to_string());
602 }
603 }
604 Rule::sparse_vector_config => {
605 sparse_vector_config = Some(parse_sparse_vector_config(item));
607 }
608 Rule::dense_vector_config => {
609 dense_vector_config = Some(parse_dense_vector_config(item));
611 }
612 Rule::attributes => {
613 let (idx, sto, mul, fst, pri, sim, idx_cfg) = parse_attributes(item);
614 indexed = idx;
615 stored = sto;
616 multi = mul;
617 fast = fst;
618 primary = pri;
619 simhash = sim;
620 index_config = idx_cfg;
621 }
622 _ => {}
623 }
624 }
625
626 if primary {
628 fast = true;
629 indexed = true;
630 }
631
632 if simhash && field_type != FieldType::SparseVector {
634 return Err(Error::Schema(format!(
635 "simhash attribute on field '{}' requires type sparse_vector, got {:?}",
636 name, field_type
637 )));
638 }
639
640 let mut positions = None;
642 if let Some(idx_cfg) = index_config {
643 positions = idx_cfg.positions;
644 if let Some(ref mut dv_config) = dense_vector_config {
645 apply_index_config_to_dense_vector(dv_config, idx_cfg);
646 } else if field_type == FieldType::SparseVector {
647 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
649 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
650 }
651 }
652
653 Ok(FieldDef {
654 name,
655 field_type,
656 indexed,
657 stored,
658 tokenizer,
659 multi,
660 positions,
661 sparse_vector_config,
662 dense_vector_config,
663 fast,
664 primary,
665 simhash,
666 })
667}
668
669fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
671 if let Some(index_type) = idx_cfg.index_type {
673 config.index_type = index_type;
674 }
675
676 if idx_cfg.num_clusters.is_some() {
678 config.num_clusters = idx_cfg.num_clusters;
679 }
680
681 if let Some(nprobe) = idx_cfg.nprobe {
683 config.nprobe = nprobe;
684 }
685
686 if idx_cfg.build_threshold.is_some() {
688 config.build_threshold = idx_cfg.build_threshold;
689 }
690}
691
692fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
695 let mut index_size = IndexSize::default();
696
697 for inner in pair.into_inner() {
699 if inner.as_rule() == Rule::index_size_spec {
700 index_size = match inner.as_str() {
701 "u16" => IndexSize::U16,
702 "u32" => IndexSize::U32,
703 _ => IndexSize::default(),
704 };
705 }
706 }
707
708 SparseVectorConfig {
709 format: SparseFormat::default(),
710 index_size,
711 weight_quantization: WeightQuantization::default(),
712 weight_threshold: 0.0,
713 block_size: 128,
714 bmp_block_size: 64,
715 max_bmp_grid_bytes: 0,
716 bmp_superblock_size: 64,
717 pruning: None,
718 query_config: None,
719 dims: None,
720 max_weight: None,
721 min_terms: 4,
722 }
723}
724
725fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
727 if let Some(f) = idx_cfg.sparse_format {
728 config.format = f;
729 }
730 if let Some(q) = idx_cfg.quantization {
731 config.weight_quantization = q;
732 }
733 if let Some(t) = idx_cfg.weight_threshold {
734 config.weight_threshold = t;
735 }
736 if let Some(bs) = idx_cfg.block_size {
737 let adjusted = bs.next_power_of_two();
738 if adjusted != bs {
739 log::warn!(
740 "block_size {} adjusted to next power of two: {}",
741 bs,
742 adjusted
743 );
744 }
745 config.block_size = adjusted;
746 }
747 if let Some(p) = idx_cfg.pruning {
748 let clamped = p.clamp(0.0, 1.0);
749 if (clamped - p).abs() > f32::EPSILON {
750 log::warn!(
751 "pruning {} clamped to valid range [0.0, 1.0]: {}",
752 p,
753 clamped
754 );
755 }
756 config.pruning = Some(clamped);
757 }
758 if let Some(mt) = idx_cfg.min_terms {
759 config.min_terms = mt;
760 }
761 if let Some(d) = idx_cfg.dims {
762 config.dims = Some(d);
763 }
764 if let Some(mw) = idx_cfg.max_weight {
765 config.max_weight = Some(mw);
766 }
767 if idx_cfg.query_tokenizer.is_some()
769 || idx_cfg.query_weighting.is_some()
770 || idx_cfg.query_weight_threshold.is_some()
771 || idx_cfg.query_max_dims.is_some()
772 || idx_cfg.query_pruning.is_some()
773 || idx_cfg.query_min_query_dims.is_some()
774 {
775 let query_config = config
776 .query_config
777 .get_or_insert(SparseQueryConfig::default());
778 if let Some(tokenizer) = idx_cfg.query_tokenizer {
779 query_config.tokenizer = Some(tokenizer);
780 }
781 if let Some(weighting) = idx_cfg.query_weighting {
782 query_config.weighting = weighting;
783 }
784 if let Some(t) = idx_cfg.query_weight_threshold {
785 query_config.weight_threshold = t;
786 }
787 if let Some(d) = idx_cfg.query_max_dims {
788 query_config.max_query_dims = Some(d);
789 }
790 if let Some(p) = idx_cfg.query_pruning {
791 query_config.pruning = Some(p);
792 }
793 if let Some(m) = idx_cfg.query_min_query_dims {
794 query_config.min_query_dims = m;
795 }
796 }
797}
798
799fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
802 let mut dim: usize = 0;
803 let mut quantization = DenseVectorQuantization::F32;
804
805 for params in pair.into_inner() {
807 if params.as_rule() == Rule::dense_vector_params {
808 for inner in params.into_inner() {
809 match inner.as_rule() {
810 Rule::dense_vector_keyword_params => {
811 for kwarg in inner.into_inner() {
812 match kwarg.as_rule() {
813 Rule::dims_kwarg => {
814 if let Some(d) = kwarg.into_inner().next() {
815 dim = d.as_str().parse().unwrap_or(0);
816 }
817 }
818 Rule::quant_type_spec => {
819 quantization = parse_quant_type(kwarg.as_str());
820 }
821 _ => {}
822 }
823 }
824 }
825 Rule::dense_vector_positional_params => {
826 for item in inner.into_inner() {
827 match item.as_rule() {
828 Rule::dimension_spec => {
829 dim = item.as_str().parse().unwrap_or(0);
830 }
831 Rule::quant_type_spec => {
832 quantization = parse_quant_type(item.as_str());
833 }
834 _ => {}
835 }
836 }
837 }
838 _ => {}
839 }
840 }
841 }
842 }
843
844 DenseVectorConfig::new(dim).with_quantization(quantization)
845}
846
847fn parse_quant_type(s: &str) -> DenseVectorQuantization {
848 match s.trim() {
849 "f16" => DenseVectorQuantization::F16,
850 "uint8" | "u8" => DenseVectorQuantization::UInt8,
851 _ => DenseVectorQuantization::F32,
852 }
853}
854
855fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
857 pair.into_inner().map(|p| p.as_str().to_string()).collect()
858}
859
860fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
862 let mut pattern = String::new();
863 let mut substitution = String::new();
864 let mut target_field = String::new();
865 let mut mode = RoutingMode::Additional;
866
867 for prop in pair.into_inner() {
868 if prop.as_rule() != Rule::query_router_prop {
869 continue;
870 }
871
872 for inner in prop.into_inner() {
873 match inner.as_rule() {
874 Rule::query_router_pattern => {
875 if let Some(regex_str) = inner.into_inner().next() {
876 pattern = parse_string_value(regex_str);
877 }
878 }
879 Rule::query_router_substitution => {
880 if let Some(quoted) = inner.into_inner().next() {
881 substitution = parse_string_value(quoted);
882 }
883 }
884 Rule::query_router_target => {
885 if let Some(ident) = inner.into_inner().next() {
886 target_field = ident.as_str().to_string();
887 }
888 }
889 Rule::query_router_mode => {
890 if let Some(mode_val) = inner.into_inner().next() {
891 mode = match mode_val.as_str() {
892 "exclusive" => RoutingMode::Exclusive,
893 "additional" => RoutingMode::Additional,
894 _ => RoutingMode::Additional,
895 };
896 }
897 }
898 _ => {}
899 }
900 }
901 }
902
903 if pattern.is_empty() {
904 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
905 }
906 if substitution.is_empty() {
907 return Err(Error::Schema(
908 "query_router missing 'substitution'".to_string(),
909 ));
910 }
911 if target_field.is_empty() {
912 return Err(Error::Schema(
913 "query_router missing 'target_field'".to_string(),
914 ));
915 }
916
917 Ok(QueryRouterRule {
918 pattern,
919 substitution,
920 target_field,
921 mode,
922 })
923}
924
925fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
927 let s = pair.as_str();
928 match pair.as_rule() {
929 Rule::regex_string => {
930 if let Some(inner) = pair.into_inner().next() {
932 parse_string_value(inner)
933 } else {
934 s.to_string()
935 }
936 }
937 Rule::raw_string => {
938 s[2..s.len() - 1].to_string()
940 }
941 Rule::quoted_string => {
942 let inner = &s[1..s.len() - 1];
944 inner
946 .replace("\\n", "\n")
947 .replace("\\t", "\t")
948 .replace("\\\"", "\"")
949 .replace("\\\\", "\\")
950 }
951 _ => s.to_string(),
952 }
953}
954
955fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
957 let mut inner = pair.into_inner();
958
959 let name = inner
960 .next()
961 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
962 .as_str()
963 .to_string();
964
965 let mut fields = Vec::new();
966 let mut default_fields = Vec::new();
967 let mut query_routers = Vec::new();
968
969 for item in inner {
970 match item.as_rule() {
971 Rule::field_def => {
972 fields.push(parse_field_def(item)?);
973 }
974 Rule::default_fields_def => {
975 default_fields = parse_default_fields_def(item);
976 }
977 Rule::query_router_def => {
978 query_routers.push(parse_query_router_def(item)?);
979 }
980 _ => {}
981 }
982 }
983
984 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
986 if primary_fields.len() > 1 {
987 return Err(Error::Schema(format!(
988 "Index '{}' has {} primary key fields, but at most one is allowed",
989 name,
990 primary_fields.len()
991 )));
992 }
993 if let Some(pk) = primary_fields.first() {
994 if pk.field_type != FieldType::Text {
995 return Err(Error::Schema(format!(
996 "Primary key field '{}' must be of type text, got {:?}",
997 pk.name, pk.field_type
998 )));
999 }
1000 if pk.multi {
1001 return Err(Error::Schema(format!(
1002 "Primary key field '{}' cannot be multi-valued",
1003 pk.name
1004 )));
1005 }
1006 }
1007
1008 Ok(IndexDef {
1009 name,
1010 fields,
1011 default_fields,
1012 query_routers,
1013 })
1014}
1015
1016pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
1018 let pairs = SdlParser::parse(Rule::file, input)
1019 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1020
1021 let mut indexes = Vec::new();
1022
1023 for pair in pairs {
1024 if pair.as_rule() == Rule::file {
1025 for inner in pair.into_inner() {
1026 if inner.as_rule() == Rule::index_def {
1027 indexes.push(parse_index_def(inner)?);
1028 }
1029 }
1030 }
1031 }
1032
1033 Ok(indexes)
1034}
1035
1036pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1038 let indexes = parse_sdl(input)?;
1039
1040 if indexes.is_empty() {
1041 return Err(Error::Schema("No index definition found".to_string()));
1042 }
1043
1044 if indexes.len() > 1 {
1045 return Err(Error::Schema(
1046 "Multiple index definitions found, expected one".to_string(),
1047 ));
1048 }
1049
1050 Ok(indexes.into_iter().next().unwrap())
1051}
1052
1053#[cfg(test)]
1054mod tests {
1055 use super::*;
1056
1057 #[test]
1058 fn test_parse_simple_schema() {
1059 let sdl = r#"
1060 index articles {
1061 field title: text [indexed, stored]
1062 field body: text [indexed]
1063 }
1064 "#;
1065
1066 let indexes = parse_sdl(sdl).unwrap();
1067 assert_eq!(indexes.len(), 1);
1068
1069 let index = &indexes[0];
1070 assert_eq!(index.name, "articles");
1071 assert_eq!(index.fields.len(), 2);
1072
1073 assert_eq!(index.fields[0].name, "title");
1074 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1075 assert!(index.fields[0].indexed);
1076 assert!(index.fields[0].stored);
1077
1078 assert_eq!(index.fields[1].name, "body");
1079 assert!(matches!(index.fields[1].field_type, FieldType::Text));
1080 assert!(index.fields[1].indexed);
1081 assert!(!index.fields[1].stored);
1082 }
1083
1084 #[test]
1085 fn test_parse_all_field_types() {
1086 let sdl = r#"
1087 index test {
1088 field text_field: text [indexed, stored]
1089 field u64_field: u64 [indexed, stored]
1090 field i64_field: i64 [indexed, stored]
1091 field f64_field: f64 [indexed, stored]
1092 field bytes_field: bytes [stored]
1093 }
1094 "#;
1095
1096 let indexes = parse_sdl(sdl).unwrap();
1097 let index = &indexes[0];
1098
1099 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1100 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1101 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1102 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1103 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1104 }
1105
1106 #[test]
1107 fn test_parse_with_comments() {
1108 let sdl = r#"
1109 # This is a comment
1110 index articles {
1111 # Title field
1112 field title: text [indexed, stored]
1113 field body: text [indexed] # inline comment not supported yet
1114 }
1115 "#;
1116
1117 let indexes = parse_sdl(sdl).unwrap();
1118 assert_eq!(indexes[0].fields.len(), 2);
1119 }
1120
1121 #[test]
1122 fn test_parse_type_aliases() {
1123 let sdl = r#"
1124 index test {
1125 field a: string [indexed]
1126 field b: int [indexed]
1127 field c: uint [indexed]
1128 field d: float [indexed]
1129 field e: binary [stored]
1130 }
1131 "#;
1132
1133 let indexes = parse_sdl(sdl).unwrap();
1134 let index = &indexes[0];
1135
1136 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1137 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1138 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1139 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1140 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1141 }
1142
1143 #[test]
1144 fn test_to_schema() {
1145 let sdl = r#"
1146 index articles {
1147 field title: text [indexed, stored]
1148 field views: u64 [indexed, stored]
1149 }
1150 "#;
1151
1152 let indexes = parse_sdl(sdl).unwrap();
1153 let schema = indexes[0].to_schema();
1154
1155 assert!(schema.get_field("title").is_some());
1156 assert!(schema.get_field("views").is_some());
1157 assert!(schema.get_field("nonexistent").is_none());
1158 }
1159
1160 #[test]
1161 fn test_default_attributes() {
1162 let sdl = r#"
1163 index test {
1164 field title: text
1165 }
1166 "#;
1167
1168 let indexes = parse_sdl(sdl).unwrap();
1169 let field = &indexes[0].fields[0];
1170
1171 assert!(field.indexed);
1173 assert!(field.stored);
1174 }
1175
1176 #[test]
1177 fn test_multiple_indexes() {
1178 let sdl = r#"
1179 index articles {
1180 field title: text [indexed, stored]
1181 }
1182
1183 index users {
1184 field name: text [indexed, stored]
1185 field email: text [indexed, stored]
1186 }
1187 "#;
1188
1189 let indexes = parse_sdl(sdl).unwrap();
1190 assert_eq!(indexes.len(), 2);
1191 assert_eq!(indexes[0].name, "articles");
1192 assert_eq!(indexes[1].name, "users");
1193 }
1194
1195 #[test]
1196 fn test_tokenizer_spec() {
1197 let sdl = r#"
1198 index articles {
1199 field title: text<en_stem> [indexed, stored]
1200 field body: text<simple> [indexed]
1201 field author: text [indexed, stored]
1202 }
1203 "#;
1204
1205 let indexes = parse_sdl(sdl).unwrap();
1206 let index = &indexes[0];
1207
1208 assert_eq!(index.fields[0].name, "title");
1209 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1210
1211 assert_eq!(index.fields[1].name, "body");
1212 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1213
1214 assert_eq!(index.fields[2].name, "author");
1215 assert_eq!(index.fields[2].tokenizer, None); }
1217
1218 #[test]
1219 fn test_tokenizer_in_schema() {
1220 let sdl = r#"
1221 index articles {
1222 field title: text<german> [indexed, stored]
1223 field body: text<en_stem> [indexed]
1224 }
1225 "#;
1226
1227 let indexes = parse_sdl(sdl).unwrap();
1228 let schema = indexes[0].to_schema();
1229
1230 let title_field = schema.get_field("title").unwrap();
1231 let title_entry = schema.get_field_entry(title_field).unwrap();
1232 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1233
1234 let body_field = schema.get_field("body").unwrap();
1235 let body_entry = schema.get_field_entry(body_field).unwrap();
1236 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1237 }
1238
1239 #[test]
1240 fn test_query_router_basic() {
1241 let sdl = r#"
1242 index documents {
1243 field title: text [indexed, stored]
1244 field uri: text [indexed, stored]
1245
1246 query_router {
1247 pattern: "10\\.\\d{4,}/[^\\s]+"
1248 substitution: "doi://{0}"
1249 target_field: uris
1250 mode: exclusive
1251 }
1252 }
1253 "#;
1254
1255 let indexes = parse_sdl(sdl).unwrap();
1256 let index = &indexes[0];
1257
1258 assert_eq!(index.query_routers.len(), 1);
1259 let router = &index.query_routers[0];
1260 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1261 assert_eq!(router.substitution, "doi://{0}");
1262 assert_eq!(router.target_field, "uris");
1263 assert_eq!(router.mode, RoutingMode::Exclusive);
1264 }
1265
1266 #[test]
1267 fn test_query_router_raw_string() {
1268 let sdl = r#"
1269 index documents {
1270 field uris: text [indexed, stored]
1271
1272 query_router {
1273 pattern: r"^pmid:(\d+)$"
1274 substitution: "pubmed://{1}"
1275 target_field: uris
1276 mode: additional
1277 }
1278 }
1279 "#;
1280
1281 let indexes = parse_sdl(sdl).unwrap();
1282 let router = &indexes[0].query_routers[0];
1283
1284 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1285 assert_eq!(router.substitution, "pubmed://{1}");
1286 assert_eq!(router.mode, RoutingMode::Additional);
1287 }
1288
1289 #[test]
1290 fn test_multiple_query_routers() {
1291 let sdl = r#"
1292 index documents {
1293 field uris: text [indexed, stored]
1294
1295 query_router {
1296 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1297 substitution: "doi://{1}"
1298 target_field: uris
1299 mode: exclusive
1300 }
1301
1302 query_router {
1303 pattern: r"^pmid:(\d+)$"
1304 substitution: "pubmed://{1}"
1305 target_field: uris
1306 mode: exclusive
1307 }
1308
1309 query_router {
1310 pattern: r"^arxiv:(\d+\.\d+)$"
1311 substitution: "arxiv://{1}"
1312 target_field: uris
1313 mode: additional
1314 }
1315 }
1316 "#;
1317
1318 let indexes = parse_sdl(sdl).unwrap();
1319 assert_eq!(indexes[0].query_routers.len(), 3);
1320 }
1321
/// A `query_router` block without a `mode` entry is expected to come back
/// with `RoutingMode::Additional`.
#[test]
fn test_query_router_default_mode() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored]

            query_router {
                pattern: r"test"
                substitution: "{0}"
                target_field: uris
            }
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let rule = &parsed[0].query_routers[0];
    assert_eq!(rule.mode, RoutingMode::Additional);
}
1340
/// `stored<multi>` must set the `multi` flag on the parsed field and on the
/// schema entry built from it; a plain `stored` field must leave it unset.
#[test]
fn test_multi_attribute() {
    let source = r#"
        index documents {
            field uris: text [indexed, stored<multi>]
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 2);

    let uris = &fields[0];
    assert_eq!(uris.name, "uris");
    assert!(uris.multi, "uris field should have multi=true");

    let title = &fields[1];
    assert_eq!(title.name, "title");
    assert!(!title.multi, "title field should have multi=false");

    // The flag has to survive the conversion into a schema as well.
    let schema = index.to_schema();
    let uris_entry = schema
        .get_field_entry(schema.get_field("uris").unwrap())
        .unwrap();
    let title_entry = schema
        .get_field_entry(schema.get_field("title").unwrap())
        .unwrap();

    assert!(uris_entry.multi);
    assert!(!title_entry.multi);
}
1372
/// A bare `sparse_vector` field (no type parameter, plain `indexed`) parses
/// to `FieldType::SparseVector` with no sparse-vector config attached.
#[test]
fn test_sparse_vector_field() {
    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 1);

    let embedding = &fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::SparseVector);
    assert!(embedding.sparse_vector_config.is_none());
}
1388
/// The `<u16>` / `<u32>` type parameter and the `quantization` option of a
/// sparse vector must end up in `index_size` and `weight_quantization`.
#[test]
fn test_sparse_vector_with_config() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
            field dense: sparse_vector<u32> [indexed<quantization: float32>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    // One row per declared field, in declaration order.
    let expectations = [
        ("embedding", IndexSize::U16, WeightQuantization::UInt8),
        ("dense", IndexSize::U32, WeightQuantization::Float32),
    ];

    for (field, (name, index_size, quantization)) in fields.iter().zip(expectations) {
        assert_eq!(field.name, name);
        let config = field.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config.index_size, index_size);
        assert_eq!(config.weight_quantization, quantization);
    }
}
1415
/// `weight_threshold` given next to `quantization` inside `indexed<...>`
/// must be stored on the sparse-vector config (compared with a tolerance).
#[test]
fn test_sparse_vector_with_weight_threshold() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
            field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 2);

    let first = &fields[0];
    assert_eq!(first.name, "embedding");
    let first_cfg = first.sparse_vector_config.as_ref().unwrap();
    assert_eq!(first_cfg.index_size, IndexSize::U16);
    assert_eq!(first_cfg.weight_quantization, WeightQuantization::UInt8);
    assert!((first_cfg.weight_threshold - 0.1).abs() < 0.001);

    let second = &fields[1];
    assert_eq!(second.name, "embedding2");
    let second_cfg = second.sparse_vector_config.as_ref().unwrap();
    assert_eq!(second_cfg.index_size, IndexSize::U32);
    assert_eq!(second_cfg.weight_quantization, WeightQuantization::Float16);
    assert!((second_cfg.weight_threshold - 0.05).abs() < 0.001);
}
1444
/// The `pruning` option inside `indexed<...>` must be carried into the
/// sparse-vector config as `Some(value)`.
#[test]
fn test_sparse_vector_with_pruning() {
    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");

    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);
    assert_eq!(sparse_cfg.pruning, Some(0.1));
}
1460
/// `dense_vector<768>` parses to a `DenseVector` field whose config reports
/// a dimension of 768.
#[test]
fn test_dense_vector_field() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let fields = &parsed[0].fields;
    assert_eq!(fields.len(), 1);

    let embedding = &fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
}
1480
/// The `vector<N>` spelling must behave like `dense_vector<N>`: same field
/// type, and the dimension is taken from the type parameter.
#[test]
fn test_dense_vector_alias() {
    let source = r#"
        index documents {
            field embedding: vector<1536> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 1536);
}
1500
/// `indexed<ivf_rabitq, num_clusters: 256>` must record the cluster count;
/// with no `nprobe` in the SDL the test expects `nprobe` to be 32.
#[test]
fn test_dense_vector_with_num_clusters() {
    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);

    let embedding = &parsed[0].fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::DenseVector);

    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();
    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    // `nprobe` is absent from the SDL above.
    assert_eq!(dense_cfg.nprobe, 32);
}
1521
/// Both `num_clusters` and `nprobe` given explicitly must be reflected in
/// the dense-vector config, next to the dimension.
#[test]
fn test_dense_vector_with_num_clusters_and_nprobe() {
    let source = r#"
        index documents {
            field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    // Compared as one (dim, num_clusters, nprobe) triple.
    assert_eq!(
        (dense_cfg.dim, dense_cfg.num_clusters, dense_cfg.nprobe),
        (1536, Some(512), 64)
    );
}
1537
/// The keyword form `dense_vector<dims: N>` sets the dimension and, with a
/// plain `indexed`, leaves `num_clusters` unset.
#[test]
fn test_dense_vector_keyword_syntax() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.num_clusters, None);
}
1552
/// Keyword `dims:` combined with a fully specified `ivf_rabitq` index
/// (`num_clusters` and `nprobe`) must populate all three config values.
#[test]
fn test_dense_vector_keyword_syntax_full() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let dense_cfg = parsed[0].fields[0]
        .dense_vector_config
        .as_ref()
        .unwrap();

    // Compared as one (dim, num_clusters, nprobe) triple.
    assert_eq!(
        (dense_cfg.dim, dense_cfg.num_clusters, dense_cfg.nprobe),
        (1536, Some(256), 64)
    );
}
1568
/// Keyword `dims:` with only `num_clusters` given: the cluster count is
/// recorded and the test expects `nprobe` to be 32.
#[test]
fn test_dense_vector_keyword_syntax_partial() {
    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.num_clusters, Some(128));
    // `nprobe` is absent from the SDL above.
    assert_eq!(dense_cfg.nprobe, 32);
}
1584
/// `indexed<scann, ...>` must select `VectorIndexType::ScaNN` and keep the
/// explicit `num_clusters` / `nprobe` values.
#[test]
fn test_dense_vector_scann_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::ScaNN);
    assert_eq!(dense_cfg.num_clusters, Some(256));
    assert_eq!(dense_cfg.nprobe, 64);
}
1603
/// `indexed<ivf_rabitq, num_clusters: 512>` must select
/// `VectorIndexType::IvfRaBitQ` and record the cluster count.
#[test]
fn test_dense_vector_ivf_rabitq_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1536);
    assert_eq!(dense_cfg.index_type, VectorIndexType::IvfRaBitQ);
    assert_eq!(dense_cfg.num_clusters, Some(512));
}
1621
/// `indexed<rabitq>` with no further options must select
/// `VectorIndexType::RaBitQ` and leave `num_clusters` unset.
#[test]
fn test_dense_vector_rabitq_no_clusters() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
    assert_eq!(dense_cfg.num_clusters, None);
}
1639
/// `indexed<flat>` must select `VectorIndexType::Flat`.
#[test]
fn test_dense_vector_flat_index() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed<flat>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::Flat);
}
1656
/// A dense vector marked with plain `indexed` (no index type named) is
/// expected to report `VectorIndexType::RaBitQ`.
#[test]
fn test_dense_vector_default_index_type() {
    use crate::dsl::schema::VectorIndexType;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
}
1674
/// `dense_vector<768, f16>` must set F16 quantization; with plain `indexed`
/// the index type is still expected to be `RaBitQ`.
#[test]
fn test_dense_vector_f16_quantization() {
    use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};

    let source = r#"
        index documents {
            field embedding: dense_vector<768, f16> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F16);
    assert_eq!(dense_cfg.index_type, VectorIndexType::RaBitQ);
}
1692
/// `dense_vector<1024, uint8>` must set `DenseVectorQuantization::UInt8`.
#[test]
fn test_dense_vector_uint8_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 1024);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::UInt8);
}
1709
/// The short spelling `u8` must map to `DenseVectorQuantization::UInt8`,
/// the same variant the `uint8` spelling yields.
#[test]
fn test_dense_vector_u8_alias() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<512, u8> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 512);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::UInt8);
}
1726
/// With no quantization named in the type parameters, the config is
/// expected to report `DenseVectorQuantization::F32`.
#[test]
fn test_dense_vector_default_f32_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<768> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F32);
}
1744
/// The keyword form `dims: N` may be followed by a quantization name; both
/// values must reach the dense-vector config.
#[test]
fn test_dense_vector_keyword_with_quantization() {
    use crate::dsl::schema::DenseVectorQuantization;

    let source = r#"
        index documents {
            field embedding: dense_vector<dims: 768, f16> [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let dense_cfg = embedding.dense_vector_config.as_ref().unwrap();

    assert_eq!(dense_cfg.dim, 768);
    assert_eq!(dense_cfg.quantization, DenseVectorQuantization::F16);
}
1761
/// `json` fields parse with and without an attribute list; the schema entry
/// for a `[stored]` JSON field is stored but not indexed.
#[test]
fn test_json_field_type() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
            field metadata: json [stored]
            field extra: json
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let fields = &index.fields;

    assert_eq!(fields.len(), 3);

    let metadata = &fields[1];
    assert_eq!(metadata.name, "metadata");
    assert_eq!(metadata.field_type, FieldType::Json);
    assert!(metadata.stored);

    // Declared without any attribute list.
    let extra = &fields[2];
    assert_eq!(extra.name, "extra");
    assert_eq!(extra.field_type, FieldType::Json);

    let schema = index.to_schema();
    let metadata_entry = schema
        .get_field_entry(schema.get_field("metadata").unwrap())
        .unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);
}
1795
/// A nested `query<tokenizer: ..., weighting: idf>` section must be parsed
/// into the sparse-vector query config and survive `to_schema()`.
#[test]
fn test_sparse_vector_query_config() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];

    assert_eq!(index.fields.len(), 1);
    let embedding = &index.fields[0];
    assert_eq!(embedding.name, "embedding");
    assert_eq!(embedding.field_type, FieldType::SparseVector);

    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.index_size, IndexSize::U16);
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);

    // Query-time settings as seen on the parsed field definition.
    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);

    // The same settings as seen on the schema entry built from it.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(
        schema_query.tokenizer.as_deref(),
        Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
    );
    assert_eq!(schema_query.weighting, QueryWeighting::Idf);
}
1840
/// `query<weighting: one>` without a tokenizer must yield
/// `QueryWeighting::One` and leave the tokenizer unset.
#[test]
fn test_sparse_vector_query_config_weighting_one() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector [indexed<query<weighting: one>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();

    let query_cfg = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(query_cfg.tokenizer.as_deref(), None);
    assert_eq!(query_cfg.weighting, QueryWeighting::One);
}
1858
/// `weighting: idf_file` must map to `QueryWeighting::IdfFile`, both on the
/// parsed field and on the schema entry produced by `to_schema()`.
#[test]
fn test_sparse_vector_query_config_weighting_idf_file() {
    use crate::structures::QueryWeighting;

    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let sparse_cfg = index.fields[0].sparse_vector_config.as_ref().unwrap();

    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(
        parsed_query.tokenizer.as_deref(),
        Some("opensearch-neural-sparse-encoding-v1")
    );
    assert_eq!(parsed_query.weighting, QueryWeighting::IdfFile);

    // Same check against the schema built from the definition.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert_eq!(schema_query.weighting, QueryWeighting::IdfFile);
}
1887
/// Query-level `weight_threshold`, `max_dims` and `pruning` must be parsed
/// into the query config and preserved by `to_schema()`. The SDL key
/// `max_dims` is read back through the `max_query_dims` field.
#[test]
fn test_sparse_vector_query_config_pruning_params() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let sparse_cfg = index.fields[0].sparse_vector_config.as_ref().unwrap();

    let parsed_query = sparse_cfg.query_config.as_ref().unwrap();
    assert_eq!(parsed_query.weighting, QueryWeighting::Idf);
    assert!((parsed_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(parsed_query.max_query_dims, Some(25));
    assert!((parsed_query.pruning.unwrap() - 0.2).abs() < 0.001);

    // The values must also be present on the schema entry.
    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    let schema_query = schema_sparse.query_config.as_ref().unwrap();
    assert!((schema_query.weight_threshold - 0.03).abs() < 0.001);
    assert_eq!(schema_query.max_query_dims, Some(25));
    assert!((schema_query.pruning.unwrap() - 0.2).abs() < 0.001);
}
1915
/// `format: maxscore` must select `SparseFormat::MaxScore` on the parsed
/// field and on the schema entry.
#[test]
fn test_sparse_vector_format_maxscore() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let sparse_cfg = index.fields[0].sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.format, SparseFormat::MaxScore);
    assert_eq!(sparse_cfg.weight_quantization, WeightQuantization::UInt8);

    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    let schema_sparse = schema_entry.sparse_vector_config.as_ref().unwrap();
    assert_eq!(schema_sparse.format, SparseFormat::MaxScore);
}
1936
/// `format: bmp` must select `SparseFormat::Bmp`.
#[test]
fn test_sparse_vector_format_bmp() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let embedding = &parsed[0].fields[0];
    let sparse_cfg = embedding.sparse_vector_config.as_ref().unwrap();
    assert_eq!(sparse_cfg.format, SparseFormat::Bmp);
}
1949
/// The `fast` attribute must be accepted on f64, text, u64 and i64 fields,
/// stay off when not requested, and propagate to the schema entries.
#[test]
fn test_fast_attribute() {
    let source = r#"
        index products {
            field name: text [indexed, stored]
            field price: f64 [indexed, fast]
            field category: text [indexed, stored, fast]
            field count: u64 [fast]
            field score: i64 [indexed, stored, fast]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 5);

    // `name` is the only field declared without `fast`.
    assert!(!fields[0].fast);

    // Remaining fields, in declaration order, are all `fast`.
    let fast_types = [
        FieldType::F64,
        FieldType::Text,
        FieldType::U64,
        FieldType::I64,
    ];
    for (field, expected_type) in fields[1..].iter().zip(fast_types) {
        assert!(field.fast);
        assert_eq!(field.field_type, expected_type);
    }

    let schema = index.to_schema();
    for (name, expect_fast) in [("price", true), ("category", true), ("name", false)] {
        let handle = schema.get_field(name).unwrap();
        assert_eq!(schema.get_field_entry(handle).unwrap().fast, expect_fast);
    }
}
1993
/// `[primary, stored]` must mark the field as primary and also turn on
/// `fast` and `indexed`; the schema must expose it via `primary_field()`.
#[test]
fn test_primary_attribute() {
    let source = r#"
        index documents {
            field id: text [primary, stored]
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    assert_eq!(parsed.len(), 1);
    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 2);

    let id_def = &fields[0];
    assert!(id_def.primary, "id should be primary");
    assert!(id_def.fast, "primary implies fast");
    assert!(id_def.indexed, "primary implies indexed");

    let title_def = &fields[1];
    assert!(!title_def.primary);

    let schema = index.to_schema();
    let id_handle = schema.get_field("id").unwrap();
    let id_entry = schema.get_field_entry(id_handle).unwrap();
    assert!(id_entry.primary_key);
    assert!(id_entry.fast);
    assert!(id_entry.indexed);

    let title_handle = schema.get_field("title").unwrap();
    let title_entry = schema.get_field_entry(title_handle).unwrap();
    assert!(!title_entry.primary_key);

    assert_eq!(schema.primary_field(), Some(id_handle));
}
2031
/// `primary` combined with `indexed`, `stored` and a `text<simple>`
/// tokenizer must keep every flag and the tokenizer name.
#[test]
fn test_primary_with_other_attributes() {
    let source = r#"
        index documents {
            field id: text<simple> [primary, indexed, stored]
            field body: text [indexed]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let id_def = &parsed[0].fields[0];

    assert!(id_def.primary);
    assert!(id_def.indexed);
    assert!(id_def.stored);
    assert!(id_def.fast);
    assert_eq!(id_def.tokenizer.as_deref(), Some("simple"));
}
2049
/// Two `primary` fields in one index must be rejected with an error whose
/// text contains "primary key".
#[test]
fn test_primary_only_one_allowed() {
    let source = r#"
        index documents {
            field id: text [primary]
            field alt_id: text [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_primary_key = message.contains("primary key");
    assert!(
        mentions_primary_key,
        "Error should mention primary key: {}",
        message
    );
}
2068
/// `primary` on a `u64` field must be rejected with an error whose text
/// contains "text".
#[test]
fn test_primary_must_be_text() {
    let source = r#"
        index documents {
            field id: u64 [primary]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_text = message.contains("text");
    assert!(
        mentions_text,
        "Error should mention text type: {}",
        message
    );
}
2086
/// `primary` combined with `stored<multi>` must be rejected with an error
/// whose text contains "multi".
#[test]
fn test_primary_cannot_be_multi() {
    let source = r#"
        index documents {
            field id: text [primary, stored<multi>]
        }
    "#;

    let outcome = parse_sdl(source);
    assert!(outcome.is_err());

    let message = outcome.unwrap_err().to_string();
    let mentions_multi = message.contains("multi");
    assert!(mentions_multi, "Error should mention multi: {}", message);
}
2100
/// An index that declares no `primary` field must produce a schema whose
/// `primary_field()` is `None`.
#[test]
fn test_no_primary_field() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let schema = parsed[0].to_schema();
    assert_eq!(schema.primary_field(), None);
}
2114
/// `simhash` on a sparse vector must set the `simhash` flag (without
/// turning on `fast`) and carry over to the schema entry.
#[test]
fn test_simhash_attribute_sparse_vector() {
    let source = r#"
        index documents {
            field embedding: sparse_vector<u32> [indexed<format: bmp, dims: 105879>, simhash]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let index = &parsed[0];
    let fields = &index.fields;
    assert_eq!(fields.len(), 1);

    let embedding = &fields[0];
    assert_eq!(embedding.name, "embedding");
    assert!(embedding.simhash);
    assert!(!embedding.fast, "simhash no longer implies fast");
    assert_eq!(embedding.field_type, FieldType::SparseVector);

    let schema = index.to_schema();
    let handle = schema.get_field("embedding").unwrap();
    let schema_entry = schema.get_field_entry(handle).unwrap();
    assert!(schema_entry.simhash);
}
2139
/// `simhash` on a non-sparse-vector field must be rejected. For the `u64`
/// case the error text must contain "sparse_vector"; for the `text` case
/// only the failure itself is checked.
#[test]
fn test_simhash_must_be_sparse_vector() {
    let u64_source = r#"
        index documents {
            field simhash: u64 [simhash]
        }
    "#;

    let u64_outcome = parse_sdl(u64_source);
    assert!(u64_outcome.is_err());

    let message = u64_outcome.unwrap_err().to_string();
    let mentions_sparse_vector = message.contains("sparse_vector");
    assert!(
        mentions_sparse_vector,
        "Error should mention sparse_vector: {}",
        message
    );

    let text_source = r#"
        index documents {
            field simhash: text [simhash]
        }
    "#;
    let text_outcome = parse_sdl(text_source);
    assert!(text_outcome.is_err());
}
2167
/// `simhash` may be set on two sparse-vector fields of the same index.
#[test]
fn test_simhash_multiple_fields() {
    let source = r#"
        index documents {
            field embed1: sparse_vector<u32> [indexed<format: bmp>, simhash]
            field embed2: sparse_vector<u32> [indexed<format: bmp>, simhash]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let fields = &parsed[0].fields;

    let first = &fields[0];
    assert!(first.simhash);
    let second = &fields[1];
    assert!(second.simhash);
}
2182
/// When no field declares `simhash`, no schema entry may have the flag set.
#[test]
fn test_no_simhash_field() {
    let source = r#"
        index documents {
            field title: text [indexed, stored]
        }
    "#;

    let parsed = parse_sdl(source).unwrap();
    let schema = parsed[0].to_schema();

    let any_simhash = schema.fields().any(|(_, entry)| entry.simhash);
    assert!(!any_simhash);
}
2196}