1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Parser for the SDL text format.
///
/// The implementation is generated by `pest_derive` from the grammar file
/// `dsl/sdl/sdl.pest`; the same derive produces the `Rule` enum that the
/// parsing functions in this file match on.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::DenseVectorConfig;
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60 WeightQuantization,
61};
62
/// A single field declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Field type resolved from the SDL type keyword by `parse_field_type`.
    pub field_type: FieldType,
    /// Whether the field is indexed. Defaults to `true` when the field has no
    /// attribute list, and is forced to `true` for primary-key fields.
    pub indexed: bool,
    /// Whether the field is stored. Defaults to `true` when the field has no
    /// attribute list.
    pub stored: bool,
    /// Tokenizer name taken from a `tokenizer_spec`, if one was given.
    /// `IndexDef::to_schema` falls back to `"simple"` for text fields.
    pub tokenizer: Option<String>,
    /// Multi-valued flag; set when the attribute list contains a
    /// `stored_with_config` attribute.
    pub multi: bool,
    /// Position mode taken from the `indexed<...>` configuration, if any.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse-vector settings; present when the type carried a sparse-vector
    /// config or when index options were applied to a sparse-vector field.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense-vector settings parsed from the type's parameters, if any.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// `fast` attribute; forced to `true` for primary-key fields.
    pub fast: bool,
    /// `primary` attribute (primary-key marker).
    pub primary: bool,
    /// `reorder` attribute.
    pub reorder: bool,
}
87
/// A complete `index` definition parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source.
    pub name: String,
    /// Field declarations, in the order they appear in the source.
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields declaration (empty if none).
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
97
98impl IndexDef {
99 pub fn to_schema(&self) -> Schema {
101 let mut builder = SchemaBuilder::default();
102
103 for field in &self.fields {
104 let f = match field.field_type {
105 FieldType::Text => {
106 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
107 builder.add_text_field_with_tokenizer(
108 &field.name,
109 field.indexed,
110 field.stored,
111 tokenizer,
112 )
113 }
114 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
115 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
116 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
117 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
118 FieldType::Json => builder.add_json_field(&field.name, field.stored),
119 FieldType::SparseVector => {
120 if let Some(config) = &field.sparse_vector_config {
121 builder.add_sparse_vector_field_with_config(
122 &field.name,
123 field.indexed,
124 field.stored,
125 config.clone(),
126 )
127 } else {
128 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
129 }
130 }
131 FieldType::DenseVector => {
132 let config = field
134 .dense_vector_config
135 .as_ref()
136 .expect("DenseVector field requires dimension to be specified");
137 builder.add_dense_vector_field_with_config(
138 &field.name,
139 field.indexed,
140 field.stored,
141 config.clone(),
142 )
143 }
144 };
145 if field.multi {
146 builder.set_multi(f, true);
147 }
148 if field.fast {
149 builder.set_fast(f, true);
150 }
151 if field.primary {
152 builder.set_primary_key(f);
153 }
154 if field.reorder {
155 builder.set_reorder(f, true);
156 }
157 let positions = field.positions.or({
159 if field.multi
161 && matches!(
162 field.field_type,
163 FieldType::SparseVector | FieldType::DenseVector
164 )
165 {
166 Some(super::schema::PositionMode::Ordinal)
167 } else {
168 None
169 }
170 });
171 if let Some(mode) = positions {
172 builder.set_positions(f, mode);
173 }
174 }
175
176 if !self.default_fields.is_empty() {
178 builder.set_default_fields(self.default_fields.clone());
179 }
180
181 if !self.query_routers.is_empty() {
183 builder.set_query_routers(self.query_routers.clone());
184 }
185
186 builder.build()
187 }
188
189 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
194 if self.query_routers.is_empty() {
195 return Ok(None);
196 }
197
198 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
199 .map(Some)
200 .map_err(Error::Schema)
201 }
202}
203
204fn parse_field_type(type_str: &str) -> Result<FieldType> {
206 match type_str {
207 "text" | "string" | "str" => Ok(FieldType::Text),
208 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
209 "i64" | "int" | "integer" => Ok(FieldType::I64),
210 "f64" | "float" | "double" => Ok(FieldType::F64),
211 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
212 "json" => Ok(FieldType::Json),
213 "sparse_vector" => Ok(FieldType::SparseVector),
214 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
215 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
216 }
217}
218
/// Options collected from an `indexed<...>` attribute.
///
/// Every field is optional; `None` means the option was not written and the
/// target configuration keeps its existing value. The values are later copied
/// into a dense-vector or sparse-vector configuration by the
/// `apply_index_config_to_*` functions.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // --- applied to dense-vector fields ---
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    build_threshold: Option<usize>,
    // --- applied to sparse-vector fields ---
    sparse_format: Option<SparseFormat>,
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    block_size: Option<usize>,
    pruning: Option<f32>,
    min_terms: Option<usize>,
    // --- query-time settings, copied into the sparse query config ---
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    query_weight_threshold: Option<f32>,
    query_max_dims: Option<usize>,
    query_pruning: Option<f32>,
    query_min_query_dims: Option<usize>,
    // --- further sparse-vector settings ---
    dims: Option<u32>,
    max_weight: Option<f32>,
    // Position mode; copied onto the field definition itself.
    positions: Option<super::schema::PositionMode>,
}
246
/// Result of parsing a field's attribute list.
///
/// All flags start out `false` and are switched on by the attributes that are
/// present; `index_config` is set only for an `indexed<...>` attribute.
struct ParsedAttributes {
    indexed: bool,
    stored: bool,
    // Set when a `stored_with_config` attribute is present.
    multi: bool,
    fast: bool,
    primary: bool,
    reorder: bool,
    index_config: Option<IndexConfig>,
}
257
258fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> ParsedAttributes {
260 let mut attrs = ParsedAttributes {
261 indexed: false,
262 stored: false,
263 multi: false,
264 fast: false,
265 primary: false,
266 reorder: false,
267 index_config: None,
268 };
269
270 for attr in pair.into_inner() {
271 if attr.as_rule() == Rule::attribute {
272 let mut found_config = false;
273 for inner in attr.clone().into_inner() {
274 match inner.as_rule() {
275 Rule::indexed_with_config => {
276 attrs.indexed = true;
277 attrs.index_config = Some(parse_index_config(inner));
278 found_config = true;
279 break;
280 }
281 Rule::stored_with_config => {
282 attrs.stored = true;
283 attrs.multi = true; found_config = true;
285 break;
286 }
287 _ => {}
288 }
289 }
290 if !found_config {
291 match attr.as_str() {
292 "indexed" => attrs.indexed = true,
293 "stored" => attrs.stored = true,
294 "fast" => attrs.fast = true,
295 "primary" => attrs.primary = true,
296 "reorder" => attrs.reorder = true,
297 _ => {}
298 }
299 }
300 }
301 }
302
303 attrs
304}
305
306fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
308 let mut config = IndexConfig::default();
309
310 for inner in pair.into_inner() {
315 if inner.as_rule() == Rule::index_config_params {
316 for param in inner.into_inner() {
317 if param.as_rule() == Rule::index_config_param {
318 for p in param.into_inner() {
319 parse_single_index_config_param(&mut config, p);
320 }
321 }
322 }
323 }
324 }
325
326 config
327}
328
329fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
331 use super::schema::VectorIndexType;
332
333 match p.as_rule() {
334 Rule::index_type_spec => {
335 config.index_type = Some(match p.as_str() {
336 "flat" => VectorIndexType::Flat,
337 "rabitq" => VectorIndexType::RaBitQ,
338 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
339 "scann" => VectorIndexType::ScaNN,
340 _ => VectorIndexType::RaBitQ,
341 });
342 }
343 Rule::index_type_kwarg => {
344 if let Some(t) = p.into_inner().next() {
346 config.index_type = Some(match t.as_str() {
347 "flat" => VectorIndexType::Flat,
348 "rabitq" => VectorIndexType::RaBitQ,
349 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
350 "scann" => VectorIndexType::ScaNN,
351 _ => VectorIndexType::RaBitQ,
352 });
353 }
354 }
355 Rule::num_clusters_kwarg => {
356 if let Some(n) = p.into_inner().next() {
358 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
359 log::warn!(
360 "Invalid num_clusters value '{}', using default 256",
361 n.as_str()
362 );
363 256
364 }));
365 }
366 }
367 Rule::build_threshold_kwarg => {
368 if let Some(n) = p.into_inner().next() {
370 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
371 log::warn!(
372 "Invalid build_threshold value '{}', using default 10000",
373 n.as_str()
374 );
375 10000
376 }));
377 }
378 }
379 Rule::nprobe_kwarg => {
380 if let Some(n) = p.into_inner().next() {
382 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
383 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
384 32
385 }));
386 }
387 }
388 Rule::quantization_kwarg => {
389 if let Some(q) = p.into_inner().next() {
391 config.quantization = Some(match q.as_str() {
392 "float32" | "f32" => WeightQuantization::Float32,
393 "float16" | "f16" => WeightQuantization::Float16,
394 "uint8" | "u8" => WeightQuantization::UInt8,
395 "uint4" | "u4" => WeightQuantization::UInt4,
396 _ => WeightQuantization::default(),
397 });
398 }
399 }
400 Rule::weight_threshold_kwarg => {
401 if let Some(t) = p.into_inner().next() {
403 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
404 log::warn!(
405 "Invalid weight_threshold value '{}', using default 0.0",
406 t.as_str()
407 );
408 0.0
409 }));
410 }
411 }
412 Rule::block_size_kwarg => {
413 if let Some(n) = p.into_inner().next() {
415 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
416 log::warn!(
417 "Invalid block_size value '{}', using default 128",
418 n.as_str()
419 );
420 128
421 }));
422 }
423 }
424 Rule::pruning_kwarg => {
425 if let Some(f) = p.into_inner().next() {
427 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
428 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
429 1.0
430 }));
431 }
432 }
433 Rule::min_terms_kwarg => {
434 if let Some(n) = p.into_inner().next() {
435 config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
436 log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
437 4
438 }));
439 }
440 }
441 Rule::sparse_format_kwarg => {
442 if let Some(f) = p.into_inner().next() {
444 config.sparse_format = Some(match f.as_str() {
445 "bmp" => SparseFormat::Bmp,
446 "maxscore" => SparseFormat::MaxScore,
447 _ => SparseFormat::default(),
448 });
449 }
450 }
451 Rule::sparse_dims_kwarg => {
452 if let Some(n) = p.into_inner().next() {
453 config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
454 log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
455 105879
456 }));
457 }
458 }
459 Rule::sparse_max_weight_kwarg => {
460 if let Some(f) = p.into_inner().next() {
461 config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
462 log::warn!(
463 "Invalid max_weight value '{}', using default 5.0",
464 f.as_str()
465 );
466 5.0
467 }));
468 }
469 }
470 Rule::query_config_block => {
471 parse_query_config_block(config, p);
473 }
474 Rule::positions_kwarg => {
475 use super::schema::PositionMode;
477 config.positions = Some(match p.as_str() {
478 "ordinal" => PositionMode::Ordinal,
479 "token_position" => PositionMode::TokenPosition,
480 _ => PositionMode::Full, });
482 }
483 _ => {}
484 }
485}
486
487fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
489 for inner in pair.into_inner() {
490 if inner.as_rule() == Rule::query_config_params {
491 for param in inner.into_inner() {
492 if param.as_rule() == Rule::query_config_param {
493 for p in param.into_inner() {
494 match p.as_rule() {
495 Rule::query_tokenizer_kwarg => {
496 if let Some(path) = p.into_inner().next()
498 && let Some(inner_path) = path.into_inner().next()
499 {
500 config.query_tokenizer = Some(inner_path.as_str().to_string());
501 }
502 }
503 Rule::query_weighting_kwarg => {
504 if let Some(w) = p.into_inner().next() {
506 config.query_weighting = Some(match w.as_str() {
507 "one" => QueryWeighting::One,
508 "idf" => QueryWeighting::Idf,
509 "idf_file" => QueryWeighting::IdfFile,
510 _ => QueryWeighting::One,
511 });
512 }
513 }
514 Rule::query_weight_threshold_kwarg => {
515 if let Some(t) = p.into_inner().next() {
516 config.query_weight_threshold =
517 Some(t.as_str().parse().unwrap_or_else(|_| {
518 log::warn!(
519 "Invalid query weight_threshold '{}', using 0.0",
520 t.as_str()
521 );
522 0.0
523 }));
524 }
525 }
526 Rule::query_max_dims_kwarg => {
527 if let Some(t) = p.into_inner().next() {
528 config.query_max_dims =
529 Some(t.as_str().parse().unwrap_or_else(|_| {
530 log::warn!(
531 "Invalid query max_dims '{}', using 0",
532 t.as_str()
533 );
534 0
535 }));
536 }
537 }
538 Rule::query_pruning_kwarg => {
539 if let Some(t) = p.into_inner().next() {
540 config.query_pruning =
541 Some(t.as_str().parse().unwrap_or_else(|_| {
542 log::warn!(
543 "Invalid query pruning '{}', using 1.0",
544 t.as_str()
545 );
546 1.0
547 }));
548 }
549 }
550 Rule::query_min_query_dims_kwarg => {
551 if let Some(t) = p.into_inner().next() {
552 config.query_min_query_dims =
553 Some(t.as_str().parse().unwrap_or_else(|_| {
554 log::warn!(
555 "Invalid query min_query_dims '{}', using 4",
556 t.as_str()
557 );
558 4
559 }));
560 }
561 }
562 _ => {}
563 }
564 }
565 }
566 }
567 }
568 }
569}
570
571fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
573 let mut inner = pair.into_inner();
574
575 let name = inner
576 .next()
577 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
578 .as_str()
579 .to_string();
580
581 let field_type_str = inner
582 .next()
583 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
584 .as_str();
585
586 let field_type = parse_field_type(field_type_str)?;
587
588 let mut tokenizer = None;
590 let mut sparse_vector_config = None;
591 let mut dense_vector_config = None;
592 let mut indexed = true;
593 let mut stored = true;
594 let mut multi = false;
595 let mut fast = false;
596 let mut primary = false;
597 let mut reorder = false;
598 let mut index_config: Option<IndexConfig> = None;
599
600 for item in inner {
601 match item.as_rule() {
602 Rule::tokenizer_spec => {
603 if let Some(tok_name) = item.into_inner().next() {
605 tokenizer = Some(tok_name.as_str().to_string());
606 }
607 }
608 Rule::sparse_vector_config => {
609 sparse_vector_config = Some(parse_sparse_vector_config(item));
611 }
612 Rule::dense_vector_config => {
613 dense_vector_config = Some(parse_dense_vector_config(item));
615 }
616 Rule::attributes => {
617 let attrs = parse_attributes(item);
618 indexed = attrs.indexed;
619 stored = attrs.stored;
620 multi = attrs.multi;
621 fast = attrs.fast;
622 primary = attrs.primary;
623 reorder = attrs.reorder;
624 index_config = attrs.index_config;
625 }
626 _ => {}
627 }
628 }
629
630 if primary {
632 fast = true;
633 indexed = true;
634 }
635
636 let mut positions = None;
638 if let Some(idx_cfg) = index_config {
639 positions = idx_cfg.positions;
640 if let Some(ref mut dv_config) = dense_vector_config {
641 apply_index_config_to_dense_vector(dv_config, idx_cfg);
642 } else if field_type == FieldType::SparseVector {
643 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
645 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
646 }
647 }
648
649 Ok(FieldDef {
650 name,
651 field_type,
652 indexed,
653 stored,
654 tokenizer,
655 multi,
656 positions,
657 sparse_vector_config,
658 dense_vector_config,
659 fast,
660 primary,
661 reorder,
662 })
663}
664
665fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
667 if let Some(index_type) = idx_cfg.index_type {
669 config.index_type = index_type;
670 }
671
672 if idx_cfg.num_clusters.is_some() {
674 config.num_clusters = idx_cfg.num_clusters;
675 }
676
677 if let Some(nprobe) = idx_cfg.nprobe {
679 config.nprobe = nprobe;
680 }
681
682 if idx_cfg.build_threshold.is_some() {
684 config.build_threshold = idx_cfg.build_threshold;
685 }
686}
687
688fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
691 let mut index_size = IndexSize::default();
692
693 for inner in pair.into_inner() {
695 if inner.as_rule() == Rule::index_size_spec {
696 index_size = match inner.as_str() {
697 "u16" => IndexSize::U16,
698 "u32" => IndexSize::U32,
699 _ => IndexSize::default(),
700 };
701 }
702 }
703
704 SparseVectorConfig {
705 format: SparseFormat::default(),
706 index_size,
707 weight_quantization: WeightQuantization::default(),
708 weight_threshold: 0.0,
709 block_size: 128,
710 bmp_block_size: 64,
711 max_bmp_grid_bytes: 0,
712 bmp_superblock_size: 64,
713 pruning: None,
714 query_config: None,
715 dims: None,
716 max_weight: None,
717 min_terms: 4,
718 }
719}
720
721fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
723 if let Some(f) = idx_cfg.sparse_format {
724 config.format = f;
725 }
726 if let Some(q) = idx_cfg.quantization {
727 config.weight_quantization = q;
728 }
729 if let Some(t) = idx_cfg.weight_threshold {
730 config.weight_threshold = t;
731 }
732 if let Some(bs) = idx_cfg.block_size {
733 let adjusted = bs.next_power_of_two();
734 if adjusted != bs {
735 log::warn!(
736 "block_size {} adjusted to next power of two: {}",
737 bs,
738 adjusted
739 );
740 }
741 config.block_size = adjusted;
742 }
743 if let Some(p) = idx_cfg.pruning {
744 let clamped = p.clamp(0.0, 1.0);
745 if (clamped - p).abs() > f32::EPSILON {
746 log::warn!(
747 "pruning {} clamped to valid range [0.0, 1.0]: {}",
748 p,
749 clamped
750 );
751 }
752 config.pruning = Some(clamped);
753 }
754 if let Some(mt) = idx_cfg.min_terms {
755 config.min_terms = mt;
756 }
757 if let Some(d) = idx_cfg.dims {
758 config.dims = Some(d);
759 }
760 if let Some(mw) = idx_cfg.max_weight {
761 config.max_weight = Some(mw);
762 }
763 if idx_cfg.query_tokenizer.is_some()
765 || idx_cfg.query_weighting.is_some()
766 || idx_cfg.query_weight_threshold.is_some()
767 || idx_cfg.query_max_dims.is_some()
768 || idx_cfg.query_pruning.is_some()
769 || idx_cfg.query_min_query_dims.is_some()
770 {
771 let query_config = config
772 .query_config
773 .get_or_insert(SparseQueryConfig::default());
774 if let Some(tokenizer) = idx_cfg.query_tokenizer {
775 query_config.tokenizer = Some(tokenizer);
776 }
777 if let Some(weighting) = idx_cfg.query_weighting {
778 query_config.weighting = weighting;
779 }
780 if let Some(t) = idx_cfg.query_weight_threshold {
781 query_config.weight_threshold = t;
782 }
783 if let Some(d) = idx_cfg.query_max_dims {
784 query_config.max_query_dims = Some(d);
785 }
786 if let Some(p) = idx_cfg.query_pruning {
787 query_config.pruning = Some(p);
788 }
789 if let Some(m) = idx_cfg.query_min_query_dims {
790 query_config.min_query_dims = m;
791 }
792 }
793}
794
795fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
798 let mut dim: usize = 0;
799 let mut quantization = DenseVectorQuantization::F32;
800
801 for params in pair.into_inner() {
803 if params.as_rule() == Rule::dense_vector_params {
804 for inner in params.into_inner() {
805 match inner.as_rule() {
806 Rule::dense_vector_keyword_params => {
807 for kwarg in inner.into_inner() {
808 match kwarg.as_rule() {
809 Rule::dims_kwarg => {
810 if let Some(d) = kwarg.into_inner().next() {
811 dim = d.as_str().parse().unwrap_or(0);
812 }
813 }
814 Rule::quant_type_spec => {
815 quantization = parse_quant_type(kwarg.as_str());
816 }
817 _ => {}
818 }
819 }
820 }
821 Rule::dense_vector_positional_params => {
822 for item in inner.into_inner() {
823 match item.as_rule() {
824 Rule::dimension_spec => {
825 dim = item.as_str().parse().unwrap_or(0);
826 }
827 Rule::quant_type_spec => {
828 quantization = parse_quant_type(item.as_str());
829 }
830 _ => {}
831 }
832 }
833 }
834 _ => {}
835 }
836 }
837 }
838 }
839
840 DenseVectorConfig::new(dim).with_quantization(quantization)
841}
842
843fn parse_quant_type(s: &str) -> DenseVectorQuantization {
844 match s.trim() {
845 "f16" => DenseVectorQuantization::F16,
846 "uint8" | "u8" => DenseVectorQuantization::UInt8,
847 _ => DenseVectorQuantization::F32,
848 }
849}
850
851fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
853 pair.into_inner().map(|p| p.as_str().to_string()).collect()
854}
855
856fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
858 let mut pattern = String::new();
859 let mut substitution = String::new();
860 let mut target_field = String::new();
861 let mut mode = RoutingMode::Additional;
862
863 for prop in pair.into_inner() {
864 if prop.as_rule() != Rule::query_router_prop {
865 continue;
866 }
867
868 for inner in prop.into_inner() {
869 match inner.as_rule() {
870 Rule::query_router_pattern => {
871 if let Some(regex_str) = inner.into_inner().next() {
872 pattern = parse_string_value(regex_str);
873 }
874 }
875 Rule::query_router_substitution => {
876 if let Some(quoted) = inner.into_inner().next() {
877 substitution = parse_string_value(quoted);
878 }
879 }
880 Rule::query_router_target => {
881 if let Some(ident) = inner.into_inner().next() {
882 target_field = ident.as_str().to_string();
883 }
884 }
885 Rule::query_router_mode => {
886 if let Some(mode_val) = inner.into_inner().next() {
887 mode = match mode_val.as_str() {
888 "exclusive" => RoutingMode::Exclusive,
889 "additional" => RoutingMode::Additional,
890 _ => RoutingMode::Additional,
891 };
892 }
893 }
894 _ => {}
895 }
896 }
897 }
898
899 if pattern.is_empty() {
900 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
901 }
902 if substitution.is_empty() {
903 return Err(Error::Schema(
904 "query_router missing 'substitution'".to_string(),
905 ));
906 }
907 if target_field.is_empty() {
908 return Err(Error::Schema(
909 "query_router missing 'target_field'".to_string(),
910 ));
911 }
912
913 Ok(QueryRouterRule {
914 pattern,
915 substitution,
916 target_field,
917 mode,
918 })
919}
920
921fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
923 let s = pair.as_str();
924 match pair.as_rule() {
925 Rule::regex_string => {
926 if let Some(inner) = pair.into_inner().next() {
928 parse_string_value(inner)
929 } else {
930 s.to_string()
931 }
932 }
933 Rule::raw_string => {
934 s[2..s.len() - 1].to_string()
936 }
937 Rule::quoted_string => {
938 let inner = &s[1..s.len() - 1];
940 inner
942 .replace("\\n", "\n")
943 .replace("\\t", "\t")
944 .replace("\\\"", "\"")
945 .replace("\\\\", "\\")
946 }
947 _ => s.to_string(),
948 }
949}
950
951fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
953 let mut inner = pair.into_inner();
954
955 let name = inner
956 .next()
957 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
958 .as_str()
959 .to_string();
960
961 let mut fields = Vec::new();
962 let mut default_fields = Vec::new();
963 let mut query_routers = Vec::new();
964
965 for item in inner {
966 match item.as_rule() {
967 Rule::field_def => {
968 fields.push(parse_field_def(item)?);
969 }
970 Rule::default_fields_def => {
971 default_fields = parse_default_fields_def(item);
972 }
973 Rule::query_router_def => {
974 query_routers.push(parse_query_router_def(item)?);
975 }
976 _ => {}
977 }
978 }
979
980 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
982 if primary_fields.len() > 1 {
983 return Err(Error::Schema(format!(
984 "Index '{}' has {} primary key fields, but at most one is allowed",
985 name,
986 primary_fields.len()
987 )));
988 }
989 if let Some(pk) = primary_fields.first() {
990 if pk.field_type != FieldType::Text {
991 return Err(Error::Schema(format!(
992 "Primary key field '{}' must be of type text, got {:?}",
993 pk.name, pk.field_type
994 )));
995 }
996 if pk.multi {
997 return Err(Error::Schema(format!(
998 "Primary key field '{}' cannot be multi-valued",
999 pk.name
1000 )));
1001 }
1002 }
1003
1004 Ok(IndexDef {
1005 name,
1006 fields,
1007 default_fields,
1008 query_routers,
1009 })
1010}
1011
1012pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
1014 let pairs = SdlParser::parse(Rule::file, input)
1015 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1016
1017 let mut indexes = Vec::new();
1018
1019 for pair in pairs {
1020 if pair.as_rule() == Rule::file {
1021 for inner in pair.into_inner() {
1022 if inner.as_rule() == Rule::index_def {
1023 indexes.push(parse_index_def(inner)?);
1024 }
1025 }
1026 }
1027 }
1028
1029 Ok(indexes)
1030}
1031
1032pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1034 let indexes = parse_sdl(input)?;
1035
1036 if indexes.is_empty() {
1037 return Err(Error::Schema("No index definition found".to_string()));
1038 }
1039
1040 if indexes.len() > 1 {
1041 return Err(Error::Schema(
1042 "Multiple index definitions found, expected one".to_string(),
1043 ));
1044 }
1045
1046 Ok(indexes.into_iter().next().unwrap())
1047}
1048
1049#[cfg(test)]
1050mod tests {
1051 use super::*;
1052
1053 #[test]
1054 fn test_parse_simple_schema() {
1055 let sdl = r#"
1056 index articles {
1057 field title: text [indexed, stored]
1058 field body: text [indexed]
1059 }
1060 "#;
1061
1062 let indexes = parse_sdl(sdl).unwrap();
1063 assert_eq!(indexes.len(), 1);
1064
1065 let index = &indexes[0];
1066 assert_eq!(index.name, "articles");
1067 assert_eq!(index.fields.len(), 2);
1068
1069 assert_eq!(index.fields[0].name, "title");
1070 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1071 assert!(index.fields[0].indexed);
1072 assert!(index.fields[0].stored);
1073
1074 assert_eq!(index.fields[1].name, "body");
1075 assert!(matches!(index.fields[1].field_type, FieldType::Text));
1076 assert!(index.fields[1].indexed);
1077 assert!(!index.fields[1].stored);
1078 }
1079
1080 #[test]
1081 fn test_parse_all_field_types() {
1082 let sdl = r#"
1083 index test {
1084 field text_field: text [indexed, stored]
1085 field u64_field: u64 [indexed, stored]
1086 field i64_field: i64 [indexed, stored]
1087 field f64_field: f64 [indexed, stored]
1088 field bytes_field: bytes [stored]
1089 }
1090 "#;
1091
1092 let indexes = parse_sdl(sdl).unwrap();
1093 let index = &indexes[0];
1094
1095 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1096 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1097 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1098 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1099 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1100 }
1101
1102 #[test]
1103 fn test_parse_with_comments() {
1104 let sdl = r#"
1105 # This is a comment
1106 index articles {
1107 # Title field
1108 field title: text [indexed, stored]
1109 field body: text [indexed] # inline comment not supported yet
1110 }
1111 "#;
1112
1113 let indexes = parse_sdl(sdl).unwrap();
1114 assert_eq!(indexes[0].fields.len(), 2);
1115 }
1116
1117 #[test]
1118 fn test_parse_type_aliases() {
1119 let sdl = r#"
1120 index test {
1121 field a: string [indexed]
1122 field b: int [indexed]
1123 field c: uint [indexed]
1124 field d: float [indexed]
1125 field e: binary [stored]
1126 }
1127 "#;
1128
1129 let indexes = parse_sdl(sdl).unwrap();
1130 let index = &indexes[0];
1131
1132 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1133 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1134 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1135 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1136 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1137 }
1138
1139 #[test]
1140 fn test_to_schema() {
1141 let sdl = r#"
1142 index articles {
1143 field title: text [indexed, stored]
1144 field views: u64 [indexed, stored]
1145 }
1146 "#;
1147
1148 let indexes = parse_sdl(sdl).unwrap();
1149 let schema = indexes[0].to_schema();
1150
1151 assert!(schema.get_field("title").is_some());
1152 assert!(schema.get_field("views").is_some());
1153 assert!(schema.get_field("nonexistent").is_none());
1154 }
1155
1156 #[test]
1157 fn test_default_attributes() {
1158 let sdl = r#"
1159 index test {
1160 field title: text
1161 }
1162 "#;
1163
1164 let indexes = parse_sdl(sdl).unwrap();
1165 let field = &indexes[0].fields[0];
1166
1167 assert!(field.indexed);
1169 assert!(field.stored);
1170 }
1171
1172 #[test]
1173 fn test_multiple_indexes() {
1174 let sdl = r#"
1175 index articles {
1176 field title: text [indexed, stored]
1177 }
1178
1179 index users {
1180 field name: text [indexed, stored]
1181 field email: text [indexed, stored]
1182 }
1183 "#;
1184
1185 let indexes = parse_sdl(sdl).unwrap();
1186 assert_eq!(indexes.len(), 2);
1187 assert_eq!(indexes[0].name, "articles");
1188 assert_eq!(indexes[1].name, "users");
1189 }
1190
1191 #[test]
1192 fn test_tokenizer_spec() {
1193 let sdl = r#"
1194 index articles {
1195 field title: text<en_stem> [indexed, stored]
1196 field body: text<simple> [indexed]
1197 field author: text [indexed, stored]
1198 }
1199 "#;
1200
1201 let indexes = parse_sdl(sdl).unwrap();
1202 let index = &indexes[0];
1203
1204 assert_eq!(index.fields[0].name, "title");
1205 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1206
1207 assert_eq!(index.fields[1].name, "body");
1208 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1209
1210 assert_eq!(index.fields[2].name, "author");
1211 assert_eq!(index.fields[2].tokenizer, None); }
1213
1214 #[test]
1215 fn test_tokenizer_in_schema() {
1216 let sdl = r#"
1217 index articles {
1218 field title: text<german> [indexed, stored]
1219 field body: text<en_stem> [indexed]
1220 }
1221 "#;
1222
1223 let indexes = parse_sdl(sdl).unwrap();
1224 let schema = indexes[0].to_schema();
1225
1226 let title_field = schema.get_field("title").unwrap();
1227 let title_entry = schema.get_field_entry(title_field).unwrap();
1228 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1229
1230 let body_field = schema.get_field("body").unwrap();
1231 let body_entry = schema.get_field_entry(body_field).unwrap();
1232 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1233 }
1234
1235 #[test]
1236 fn test_query_router_basic() {
1237 let sdl = r#"
1238 index documents {
1239 field title: text [indexed, stored]
1240 field uri: text [indexed, stored]
1241
1242 query_router {
1243 pattern: "10\\.\\d{4,}/[^\\s]+"
1244 substitution: "doi://{0}"
1245 target_field: uris
1246 mode: exclusive
1247 }
1248 }
1249 "#;
1250
1251 let indexes = parse_sdl(sdl).unwrap();
1252 let index = &indexes[0];
1253
1254 assert_eq!(index.query_routers.len(), 1);
1255 let router = &index.query_routers[0];
1256 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1257 assert_eq!(router.substitution, "doi://{0}");
1258 assert_eq!(router.target_field, "uris");
1259 assert_eq!(router.mode, RoutingMode::Exclusive);
1260 }
1261
1262 #[test]
1263 fn test_query_router_raw_string() {
1264 let sdl = r#"
1265 index documents {
1266 field uris: text [indexed, stored]
1267
1268 query_router {
1269 pattern: r"^pmid:(\d+)$"
1270 substitution: "pubmed://{1}"
1271 target_field: uris
1272 mode: additional
1273 }
1274 }
1275 "#;
1276
1277 let indexes = parse_sdl(sdl).unwrap();
1278 let router = &indexes[0].query_routers[0];
1279
1280 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1281 assert_eq!(router.substitution, "pubmed://{1}");
1282 assert_eq!(router.mode, RoutingMode::Additional);
1283 }
1284
1285 #[test]
1286 fn test_multiple_query_routers() {
1287 let sdl = r#"
1288 index documents {
1289 field uris: text [indexed, stored]
1290
1291 query_router {
1292 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1293 substitution: "doi://{1}"
1294 target_field: uris
1295 mode: exclusive
1296 }
1297
1298 query_router {
1299 pattern: r"^pmid:(\d+)$"
1300 substitution: "pubmed://{1}"
1301 target_field: uris
1302 mode: exclusive
1303 }
1304
1305 query_router {
1306 pattern: r"^arxiv:(\d+\.\d+)$"
1307 substitution: "arxiv://{1}"
1308 target_field: uris
1309 mode: additional
1310 }
1311 }
1312 "#;
1313
1314 let indexes = parse_sdl(sdl).unwrap();
1315 assert_eq!(indexes[0].query_routers.len(), 3);
1316 }
1317
1318 #[test]
1319 fn test_query_router_default_mode() {
1320 let sdl = r#"
1321 index documents {
1322 field uris: text [indexed, stored]
1323
1324 query_router {
1325 pattern: r"test"
1326 substitution: "{0}"
1327 target_field: uris
1328 }
1329 }
1330 "#;
1331
1332 let indexes = parse_sdl(sdl).unwrap();
1333 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1335 }
1336
1337 #[test]
1338 fn test_multi_attribute() {
1339 let sdl = r#"
1340 index documents {
1341 field uris: text [indexed, stored<multi>]
1342 field title: text [indexed, stored]
1343 }
1344 "#;
1345
1346 let indexes = parse_sdl(sdl).unwrap();
1347 assert_eq!(indexes.len(), 1);
1348
1349 let fields = &indexes[0].fields;
1350 assert_eq!(fields.len(), 2);
1351
1352 assert_eq!(fields[0].name, "uris");
1354 assert!(fields[0].multi, "uris field should have multi=true");
1355
1356 assert_eq!(fields[1].name, "title");
1358 assert!(!fields[1].multi, "title field should have multi=false");
1359
1360 let schema = indexes[0].to_schema();
1362 let uris_field = schema.get_field("uris").unwrap();
1363 let title_field = schema.get_field("title").unwrap();
1364
1365 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1366 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1367 }
1368
1369 #[test]
1370 fn test_sparse_vector_field() {
1371 let sdl = r#"
1372 index documents {
1373 field embedding: sparse_vector [indexed, stored]
1374 }
1375 "#;
1376
1377 let indexes = parse_sdl(sdl).unwrap();
1378 assert_eq!(indexes.len(), 1);
1379 assert_eq!(indexes[0].fields.len(), 1);
1380 assert_eq!(indexes[0].fields[0].name, "embedding");
1381 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1382 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1383 }
1384
1385 #[test]
1386 fn test_sparse_vector_with_config() {
1387 let sdl = r#"
1388 index documents {
1389 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1390 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1391 }
1392 "#;
1393
1394 let indexes = parse_sdl(sdl).unwrap();
1395 assert_eq!(indexes[0].fields.len(), 2);
1396
1397 let f1 = &indexes[0].fields[0];
1399 assert_eq!(f1.name, "embedding");
1400 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1401 assert_eq!(config1.index_size, IndexSize::U16);
1402 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1403
1404 let f2 = &indexes[0].fields[1];
1406 assert_eq!(f2.name, "dense");
1407 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1408 assert_eq!(config2.index_size, IndexSize::U32);
1409 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1410 }
1411
1412 #[test]
1413 fn test_sparse_vector_with_weight_threshold() {
1414 let sdl = r#"
1415 index documents {
1416 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1417 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1418 }
1419 "#;
1420
1421 let indexes = parse_sdl(sdl).unwrap();
1422 assert_eq!(indexes[0].fields.len(), 2);
1423
1424 let f1 = &indexes[0].fields[0];
1426 assert_eq!(f1.name, "embedding");
1427 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1428 assert_eq!(config1.index_size, IndexSize::U16);
1429 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1430 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1431
1432 let f2 = &indexes[0].fields[1];
1434 assert_eq!(f2.name, "embedding2");
1435 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1436 assert_eq!(config2.index_size, IndexSize::U32);
1437 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1438 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1439 }
1440
1441 #[test]
1442 fn test_sparse_vector_with_pruning() {
1443 let sdl = r#"
1444 index documents {
1445 field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1446 }
1447 "#;
1448
1449 let indexes = parse_sdl(sdl).unwrap();
1450 let f = &indexes[0].fields[0];
1451 assert_eq!(f.name, "embedding");
1452 let config = f.sparse_vector_config.as_ref().unwrap();
1453 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1454 assert_eq!(config.pruning, Some(0.1));
1455 }
1456
1457 #[test]
1458 fn test_dense_vector_field() {
1459 let sdl = r#"
1460 index documents {
1461 field embedding: dense_vector<768> [indexed, stored]
1462 }
1463 "#;
1464
1465 let indexes = parse_sdl(sdl).unwrap();
1466 assert_eq!(indexes.len(), 1);
1467 assert_eq!(indexes[0].fields.len(), 1);
1468
1469 let f = &indexes[0].fields[0];
1470 assert_eq!(f.name, "embedding");
1471 assert_eq!(f.field_type, FieldType::DenseVector);
1472
1473 let config = f.dense_vector_config.as_ref().unwrap();
1474 assert_eq!(config.dim, 768);
1475 }
1476
1477 #[test]
1478 fn test_dense_vector_alias() {
1479 let sdl = r#"
1480 index documents {
1481 field embedding: vector<1536> [indexed]
1482 }
1483 "#;
1484
1485 let indexes = parse_sdl(sdl).unwrap();
1486 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1487 assert_eq!(
1488 indexes[0].fields[0]
1489 .dense_vector_config
1490 .as_ref()
1491 .unwrap()
1492 .dim,
1493 1536
1494 );
1495 }
1496
1497 #[test]
1498 fn test_dense_vector_with_num_clusters() {
1499 let sdl = r#"
1500 index documents {
1501 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1502 }
1503 "#;
1504
1505 let indexes = parse_sdl(sdl).unwrap();
1506 assert_eq!(indexes.len(), 1);
1507
1508 let f = &indexes[0].fields[0];
1509 assert_eq!(f.name, "embedding");
1510 assert_eq!(f.field_type, FieldType::DenseVector);
1511
1512 let config = f.dense_vector_config.as_ref().unwrap();
1513 assert_eq!(config.dim, 768);
1514 assert_eq!(config.num_clusters, Some(256));
1515 assert_eq!(config.nprobe, 32); }
1517
1518 #[test]
1519 fn test_dense_vector_with_num_clusters_and_nprobe() {
1520 let sdl = r#"
1521 index documents {
1522 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1523 }
1524 "#;
1525
1526 let indexes = parse_sdl(sdl).unwrap();
1527 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1528
1529 assert_eq!(config.dim, 1536);
1530 assert_eq!(config.num_clusters, Some(512));
1531 assert_eq!(config.nprobe, 64);
1532 }
1533
1534 #[test]
1535 fn test_dense_vector_keyword_syntax() {
1536 let sdl = r#"
1537 index documents {
1538 field embedding: dense_vector<dims: 1536> [indexed, stored]
1539 }
1540 "#;
1541
1542 let indexes = parse_sdl(sdl).unwrap();
1543 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1544
1545 assert_eq!(config.dim, 1536);
1546 assert!(config.num_clusters.is_none());
1547 }
1548
1549 #[test]
1550 fn test_dense_vector_keyword_syntax_full() {
1551 let sdl = r#"
1552 index documents {
1553 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1554 }
1555 "#;
1556
1557 let indexes = parse_sdl(sdl).unwrap();
1558 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1559
1560 assert_eq!(config.dim, 1536);
1561 assert_eq!(config.num_clusters, Some(256));
1562 assert_eq!(config.nprobe, 64);
1563 }
1564
1565 #[test]
1566 fn test_dense_vector_keyword_syntax_partial() {
1567 let sdl = r#"
1568 index documents {
1569 field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1570 }
1571 "#;
1572
1573 let indexes = parse_sdl(sdl).unwrap();
1574 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1575
1576 assert_eq!(config.dim, 768);
1577 assert_eq!(config.num_clusters, Some(128));
1578 assert_eq!(config.nprobe, 32); }
1580
1581 #[test]
1582 fn test_dense_vector_scann_index() {
1583 use crate::dsl::schema::VectorIndexType;
1584
1585 let sdl = r#"
1586 index documents {
1587 field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1588 }
1589 "#;
1590
1591 let indexes = parse_sdl(sdl).unwrap();
1592 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1593
1594 assert_eq!(config.dim, 768);
1595 assert_eq!(config.index_type, VectorIndexType::ScaNN);
1596 assert_eq!(config.num_clusters, Some(256));
1597 assert_eq!(config.nprobe, 64);
1598 }
1599
1600 #[test]
1601 fn test_dense_vector_ivf_rabitq_index() {
1602 use crate::dsl::schema::VectorIndexType;
1603
1604 let sdl = r#"
1605 index documents {
1606 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1607 }
1608 "#;
1609
1610 let indexes = parse_sdl(sdl).unwrap();
1611 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1612
1613 assert_eq!(config.dim, 1536);
1614 assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1615 assert_eq!(config.num_clusters, Some(512));
1616 }
1617
1618 #[test]
1619 fn test_dense_vector_rabitq_no_clusters() {
1620 use crate::dsl::schema::VectorIndexType;
1621
1622 let sdl = r#"
1623 index documents {
1624 field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1625 }
1626 "#;
1627
1628 let indexes = parse_sdl(sdl).unwrap();
1629 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1630
1631 assert_eq!(config.dim, 768);
1632 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1633 assert!(config.num_clusters.is_none());
1634 }
1635
1636 #[test]
1637 fn test_dense_vector_flat_index() {
1638 use crate::dsl::schema::VectorIndexType;
1639
1640 let sdl = r#"
1641 index documents {
1642 field embedding: dense_vector<dims: 768> [indexed<flat>]
1643 }
1644 "#;
1645
1646 let indexes = parse_sdl(sdl).unwrap();
1647 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1648
1649 assert_eq!(config.dim, 768);
1650 assert_eq!(config.index_type, VectorIndexType::Flat);
1651 }
1652
1653 #[test]
1654 fn test_dense_vector_default_index_type() {
1655 use crate::dsl::schema::VectorIndexType;
1656
1657 let sdl = r#"
1659 index documents {
1660 field embedding: dense_vector<dims: 768> [indexed]
1661 }
1662 "#;
1663
1664 let indexes = parse_sdl(sdl).unwrap();
1665 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1666
1667 assert_eq!(config.dim, 768);
1668 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1669 }
1670
1671 #[test]
1672 fn test_dense_vector_f16_quantization() {
1673 use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1674
1675 let sdl = r#"
1676 index documents {
1677 field embedding: dense_vector<768, f16> [indexed]
1678 }
1679 "#;
1680
1681 let indexes = parse_sdl(sdl).unwrap();
1682 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1683
1684 assert_eq!(config.dim, 768);
1685 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1686 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1687 }
1688
1689 #[test]
1690 fn test_dense_vector_uint8_quantization() {
1691 use crate::dsl::schema::DenseVectorQuantization;
1692
1693 let sdl = r#"
1694 index documents {
1695 field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1696 }
1697 "#;
1698
1699 let indexes = parse_sdl(sdl).unwrap();
1700 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1701
1702 assert_eq!(config.dim, 1024);
1703 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1704 }
1705
1706 #[test]
1707 fn test_dense_vector_u8_alias() {
1708 use crate::dsl::schema::DenseVectorQuantization;
1709
1710 let sdl = r#"
1711 index documents {
1712 field embedding: dense_vector<512, u8> [indexed]
1713 }
1714 "#;
1715
1716 let indexes = parse_sdl(sdl).unwrap();
1717 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1718
1719 assert_eq!(config.dim, 512);
1720 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1721 }
1722
1723 #[test]
1724 fn test_dense_vector_default_f32_quantization() {
1725 use crate::dsl::schema::DenseVectorQuantization;
1726
1727 let sdl = r#"
1729 index documents {
1730 field embedding: dense_vector<768> [indexed]
1731 }
1732 "#;
1733
1734 let indexes = parse_sdl(sdl).unwrap();
1735 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1736
1737 assert_eq!(config.dim, 768);
1738 assert_eq!(config.quantization, DenseVectorQuantization::F32);
1739 }
1740
1741 #[test]
1742 fn test_dense_vector_keyword_with_quantization() {
1743 use crate::dsl::schema::DenseVectorQuantization;
1744
1745 let sdl = r#"
1746 index documents {
1747 field embedding: dense_vector<dims: 768, f16> [indexed]
1748 }
1749 "#;
1750
1751 let indexes = parse_sdl(sdl).unwrap();
1752 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1753
1754 assert_eq!(config.dim, 768);
1755 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1756 }
1757
1758 #[test]
1759 fn test_json_field_type() {
1760 let sdl = r#"
1761 index documents {
1762 field title: text [indexed, stored]
1763 field metadata: json [stored]
1764 field extra: json
1765 }
1766 "#;
1767
1768 let indexes = parse_sdl(sdl).unwrap();
1769 let index = &indexes[0];
1770
1771 assert_eq!(index.fields.len(), 3);
1772
1773 assert_eq!(index.fields[1].name, "metadata");
1775 assert!(matches!(index.fields[1].field_type, FieldType::Json));
1776 assert!(index.fields[1].stored);
1777 assert_eq!(index.fields[2].name, "extra");
1781 assert!(matches!(index.fields[2].field_type, FieldType::Json));
1782
1783 let schema = index.to_schema();
1785 let metadata_field = schema.get_field("metadata").unwrap();
1786 let entry = schema.get_field_entry(metadata_field).unwrap();
1787 assert_eq!(entry.field_type, FieldType::Json);
1788 assert!(!entry.indexed); assert!(entry.stored);
1790 }
1791
1792 #[test]
1793 fn test_sparse_vector_query_config() {
1794 use crate::structures::QueryWeighting;
1795
1796 let sdl = r#"
1797 index documents {
1798 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1799 }
1800 "#;
1801
1802 let indexes = parse_sdl(sdl).unwrap();
1803 let index = &indexes[0];
1804
1805 assert_eq!(index.fields.len(), 1);
1806 assert_eq!(index.fields[0].name, "embedding");
1807 assert!(matches!(
1808 index.fields[0].field_type,
1809 FieldType::SparseVector
1810 ));
1811
1812 let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1813 assert_eq!(config.index_size, IndexSize::U16);
1814 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1815
1816 let query_config = config.query_config.as_ref().unwrap();
1818 assert_eq!(
1819 query_config.tokenizer.as_deref(),
1820 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1821 );
1822 assert_eq!(query_config.weighting, QueryWeighting::Idf);
1823
1824 let schema = index.to_schema();
1826 let embedding_field = schema.get_field("embedding").unwrap();
1827 let entry = schema.get_field_entry(embedding_field).unwrap();
1828 let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1829 let qc = sv_config.query_config.as_ref().unwrap();
1830 assert_eq!(
1831 qc.tokenizer.as_deref(),
1832 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1833 );
1834 assert_eq!(qc.weighting, QueryWeighting::Idf);
1835 }
1836
1837 #[test]
1838 fn test_sparse_vector_query_config_weighting_one() {
1839 use crate::structures::QueryWeighting;
1840
1841 let sdl = r#"
1842 index documents {
1843 field embedding: sparse_vector [indexed<query<weighting: one>>]
1844 }
1845 "#;
1846
1847 let indexes = parse_sdl(sdl).unwrap();
1848 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1849
1850 let query_config = config.query_config.as_ref().unwrap();
1851 assert!(query_config.tokenizer.is_none());
1852 assert_eq!(query_config.weighting, QueryWeighting::One);
1853 }
1854
1855 #[test]
1856 fn test_sparse_vector_query_config_weighting_idf_file() {
1857 use crate::structures::QueryWeighting;
1858
1859 let sdl = r#"
1860 index documents {
1861 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1862 }
1863 "#;
1864
1865 let indexes = parse_sdl(sdl).unwrap();
1866 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1867
1868 let query_config = config.query_config.as_ref().unwrap();
1869 assert_eq!(
1870 query_config.tokenizer.as_deref(),
1871 Some("opensearch-neural-sparse-encoding-v1")
1872 );
1873 assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1874
1875 let schema = indexes[0].to_schema();
1877 let field = schema.get_field("embedding").unwrap();
1878 let entry = schema.get_field_entry(field).unwrap();
1879 let sc = entry.sparse_vector_config.as_ref().unwrap();
1880 let qc = sc.query_config.as_ref().unwrap();
1881 assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1882 }
1883
1884 #[test]
1885 fn test_sparse_vector_query_config_pruning_params() {
1886 let sdl = r#"
1887 index documents {
1888 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1889 }
1890 "#;
1891
1892 let indexes = parse_sdl(sdl).unwrap();
1893 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1894
1895 let qc = config.query_config.as_ref().unwrap();
1896 assert_eq!(qc.weighting, QueryWeighting::Idf);
1897 assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1898 assert_eq!(qc.max_query_dims, Some(25));
1899 assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1900
1901 let schema = indexes[0].to_schema();
1903 let field = schema.get_field("embedding").unwrap();
1904 let entry = schema.get_field_entry(field).unwrap();
1905 let sc = entry.sparse_vector_config.as_ref().unwrap();
1906 let rqc = sc.query_config.as_ref().unwrap();
1907 assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1908 assert_eq!(rqc.max_query_dims, Some(25));
1909 assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1910 }
1911
1912 #[test]
1913 fn test_sparse_vector_format_maxscore() {
1914 let sdl = r#"
1915 index documents {
1916 field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1917 }
1918 "#;
1919
1920 let indexes = parse_sdl(sdl).unwrap();
1921 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1922 assert_eq!(config.format, SparseFormat::MaxScore);
1923 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1924
1925 let schema = indexes[0].to_schema();
1927 let field = schema.get_field("embedding").unwrap();
1928 let entry = schema.get_field_entry(field).unwrap();
1929 let sc = entry.sparse_vector_config.as_ref().unwrap();
1930 assert_eq!(sc.format, SparseFormat::MaxScore);
1931 }
1932
1933 #[test]
1934 fn test_sparse_vector_format_bmp() {
1935 let sdl = r#"
1936 index documents {
1937 field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1938 }
1939 "#;
1940
1941 let indexes = parse_sdl(sdl).unwrap();
1942 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1943 assert_eq!(config.format, SparseFormat::Bmp);
1944 }
1945
1946 #[test]
1947 fn test_fast_attribute() {
1948 let sdl = r#"
1949 index products {
1950 field name: text [indexed, stored]
1951 field price: f64 [indexed, fast]
1952 field category: text [indexed, stored, fast]
1953 field count: u64 [fast]
1954 field score: i64 [indexed, stored, fast]
1955 }
1956 "#;
1957
1958 let indexes = parse_sdl(sdl).unwrap();
1959 assert_eq!(indexes.len(), 1);
1960 let index = &indexes[0];
1961 assert_eq!(index.fields.len(), 5);
1962
1963 assert!(!index.fields[0].fast);
1965 assert!(index.fields[1].fast);
1967 assert!(matches!(index.fields[1].field_type, FieldType::F64));
1968 assert!(index.fields[2].fast);
1970 assert!(matches!(index.fields[2].field_type, FieldType::Text));
1971 assert!(index.fields[3].fast);
1973 assert!(matches!(index.fields[3].field_type, FieldType::U64));
1974 assert!(index.fields[4].fast);
1976 assert!(matches!(index.fields[4].field_type, FieldType::I64));
1977
1978 let schema = index.to_schema();
1980 let price_field = schema.get_field("price").unwrap();
1981 assert!(schema.get_field_entry(price_field).unwrap().fast);
1982
1983 let category_field = schema.get_field("category").unwrap();
1984 assert!(schema.get_field_entry(category_field).unwrap().fast);
1985
1986 let name_field = schema.get_field("name").unwrap();
1987 assert!(!schema.get_field_entry(name_field).unwrap().fast);
1988 }
1989
1990 #[test]
1991 fn test_primary_attribute() {
1992 let sdl = r#"
1993 index documents {
1994 field id: text [primary, stored]
1995 field title: text [indexed, stored]
1996 }
1997 "#;
1998
1999 let indexes = parse_sdl(sdl).unwrap();
2000 assert_eq!(indexes.len(), 1);
2001 let index = &indexes[0];
2002 assert_eq!(index.fields.len(), 2);
2003
2004 let id_field = &index.fields[0];
2006 assert!(id_field.primary, "id should be primary");
2007 assert!(id_field.fast, "primary implies fast");
2008 assert!(id_field.indexed, "primary implies indexed");
2009
2010 assert!(!index.fields[1].primary);
2012
2013 let schema = index.to_schema();
2015 let id = schema.get_field("id").unwrap();
2016 let id_entry = schema.get_field_entry(id).unwrap();
2017 assert!(id_entry.primary_key);
2018 assert!(id_entry.fast);
2019 assert!(id_entry.indexed);
2020
2021 let title = schema.get_field("title").unwrap();
2022 assert!(!schema.get_field_entry(title).unwrap().primary_key);
2023
2024 assert_eq!(schema.primary_field(), Some(id));
2026 }
2027
2028 #[test]
2029 fn test_primary_with_other_attributes() {
2030 let sdl = r#"
2031 index documents {
2032 field id: text<simple> [primary, indexed, stored]
2033 field body: text [indexed]
2034 }
2035 "#;
2036
2037 let indexes = parse_sdl(sdl).unwrap();
2038 let id_field = &indexes[0].fields[0];
2039 assert!(id_field.primary);
2040 assert!(id_field.indexed);
2041 assert!(id_field.stored);
2042 assert!(id_field.fast);
2043 assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2044 }
2045
2046 #[test]
2047 fn test_primary_only_one_allowed() {
2048 let sdl = r#"
2049 index documents {
2050 field id: text [primary]
2051 field alt_id: text [primary]
2052 }
2053 "#;
2054
2055 let result = parse_sdl(sdl);
2056 assert!(result.is_err());
2057 let err = result.unwrap_err().to_string();
2058 assert!(
2059 err.contains("primary key"),
2060 "Error should mention primary key: {}",
2061 err
2062 );
2063 }
2064
2065 #[test]
2066 fn test_primary_must_be_text() {
2067 let sdl = r#"
2068 index documents {
2069 field id: u64 [primary]
2070 }
2071 "#;
2072
2073 let result = parse_sdl(sdl);
2074 assert!(result.is_err());
2075 let err = result.unwrap_err().to_string();
2076 assert!(
2077 err.contains("text"),
2078 "Error should mention text type: {}",
2079 err
2080 );
2081 }
2082
2083 #[test]
2084 fn test_primary_cannot_be_multi() {
2085 let sdl = r#"
2086 index documents {
2087 field id: text [primary, stored<multi>]
2088 }
2089 "#;
2090
2091 let result = parse_sdl(sdl);
2092 assert!(result.is_err());
2093 let err = result.unwrap_err().to_string();
2094 assert!(err.contains("multi"), "Error should mention multi: {}", err);
2095 }
2096
2097 #[test]
2098 fn test_no_primary_field() {
2099 let sdl = r#"
2101 index documents {
2102 field title: text [indexed, stored]
2103 }
2104 "#;
2105
2106 let indexes = parse_sdl(sdl).unwrap();
2107 let schema = indexes[0].to_schema();
2108 assert!(schema.primary_field().is_none());
2109 }
2110
2111 #[test]
2112 fn test_reorder_attribute() {
2113 let sdl = r#"
2114 index documents {
2115 field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>, reorder]
2116 field embedding2: sparse_vector [indexed<format: bmp>]
2117 }
2118 "#;
2119
2120 let indexes = parse_sdl(sdl).unwrap();
2121 assert_eq!(indexes[0].fields.len(), 2);
2122
2123 assert!(indexes[0].fields[0].reorder);
2125 assert!(!indexes[0].fields[1].reorder);
2127
2128 let schema = indexes[0].to_schema();
2130 let f1 = schema.get_field("embedding").unwrap();
2131 assert!(schema.get_field_entry(f1).unwrap().reorder);
2132
2133 let f2 = schema.get_field("embedding2").unwrap();
2134 assert!(!schema.get_field_entry(f2).unwrap().reorder);
2135 }
2136}