1use pest::Parser;
46use pest_derive::Parser;
47
48use super::query_field_router::{QueryRouterRule, RoutingMode};
49use super::schema::{DenseVectorQuantization, FieldType, Schema, SchemaBuilder};
50use crate::Result;
51use crate::error::Error;
52
/// Parser for the schema definition language (SDL).
///
/// The parsing code and the accompanying `Rule` enum are generated by
/// `pest_derive` from the grammar file `dsl/sdl/sdl.pest`.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
56
57use super::schema::{BinaryDenseVectorConfig, DenseVectorConfig};
58use crate::structures::{
59 IndexSize, QueryWeighting, SparseFormat, SparseQueryConfig, SparseVectorConfig,
60 WeightQuantization,
61};
62
/// A single field declaration parsed from an SDL `index` block.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Resolved field type (type aliases are already normalised).
    pub field_type: FieldType,
    /// Whether the field is indexed. `true` when no attribute list is given.
    pub indexed: bool,
    /// Whether the field is stored. `true` when no attribute list is given.
    pub stored: bool,
    /// Tokenizer name from a `text<...>` spec; `to_schema` falls back to
    /// `"simple"` for text fields when this is `None`.
    pub tokenizer: Option<String>,
    /// Multi-valued flag; set when the `stored` attribute carries a config
    /// (e.g. `stored<multi>`).
    pub multi: bool,
    /// Explicit position mode taken from the index config, if any.
    /// `to_schema` uses `Ordinal` for multi-valued vector fields when unset.
    pub positions: Option<super::schema::PositionMode>,
    /// Sparse vector settings, if any were supplied or implied by the index config.
    pub sparse_vector_config: Option<SparseVectorConfig>,
    /// Dense vector settings; `to_schema` requires this for dense vector fields.
    pub dense_vector_config: Option<DenseVectorConfig>,
    /// Binary dense vector settings; `to_schema` requires this for binary dense
    /// vector fields.
    pub binary_dense_vector_config: Option<BinaryDenseVectorConfig>,
    /// `fast` attribute; forced to `true` for primary-key fields.
    pub fast: bool,
    /// `primary` attribute (primary-key marker).
    pub primary: bool,
    /// `reorder` attribute.
    pub reorder: bool,
}
89
/// A complete `index { ... }` definition parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields declaration (empty if absent).
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router { ... }` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
99
100impl IndexDef {
101 pub fn to_schema(&self) -> Schema {
103 let mut builder = SchemaBuilder::default();
104
105 for field in &self.fields {
106 let f = match field.field_type {
107 FieldType::Text => {
108 let tokenizer = field.tokenizer.as_deref().unwrap_or("simple");
109 builder.add_text_field_with_tokenizer(
110 &field.name,
111 field.indexed,
112 field.stored,
113 tokenizer,
114 )
115 }
116 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
117 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
118 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
119 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
120 FieldType::Json => builder.add_json_field(&field.name, field.stored),
121 FieldType::SparseVector => {
122 if let Some(config) = &field.sparse_vector_config {
123 builder.add_sparse_vector_field_with_config(
124 &field.name,
125 field.indexed,
126 field.stored,
127 config.clone(),
128 )
129 } else {
130 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
131 }
132 }
133 FieldType::DenseVector => {
134 let config = field
136 .dense_vector_config
137 .as_ref()
138 .expect("DenseVector field requires dimension to be specified");
139 builder.add_dense_vector_field_with_config(
140 &field.name,
141 field.indexed,
142 field.stored,
143 config.clone(),
144 )
145 }
146 FieldType::BinaryDenseVector => {
147 let config = field
148 .binary_dense_vector_config
149 .as_ref()
150 .expect("BinaryDenseVector field requires dimension to be specified");
151 builder.add_binary_dense_vector_field_with_config(
152 &field.name,
153 field.indexed,
154 field.stored,
155 config.clone(),
156 )
157 }
158 };
159 if field.multi {
160 builder.set_multi(f, true);
161 }
162 if field.fast {
163 builder.set_fast(f, true);
164 }
165 if field.primary {
166 builder.set_primary_key(f);
167 }
168 if field.reorder {
169 builder.set_reorder(f, true);
170 }
171 let positions = field.positions.or({
173 if field.multi
175 && matches!(
176 field.field_type,
177 FieldType::SparseVector
178 | FieldType::DenseVector
179 | FieldType::BinaryDenseVector
180 )
181 {
182 Some(super::schema::PositionMode::Ordinal)
183 } else {
184 None
185 }
186 });
187 if let Some(mode) = positions {
188 builder.set_positions(f, mode);
189 }
190 }
191
192 if !self.default_fields.is_empty() {
194 builder.set_default_fields(self.default_fields.clone());
195 }
196
197 if !self.query_routers.is_empty() {
199 builder.set_query_routers(self.query_routers.clone());
200 }
201
202 builder.build()
203 }
204
205 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
210 if self.query_routers.is_empty() {
211 return Ok(None);
212 }
213
214 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
215 .map(Some)
216 .map_err(Error::Schema)
217 }
218}
219
220fn parse_field_type(type_str: &str) -> Result<FieldType> {
222 match type_str {
223 "text" | "string" | "str" => Ok(FieldType::Text),
224 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
225 "i64" | "int" | "integer" => Ok(FieldType::I64),
226 "f64" | "float" | "double" => Ok(FieldType::F64),
227 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
228 "json" => Ok(FieldType::Json),
229 "sparse_vector" => Ok(FieldType::SparseVector),
230 "dense_vector" | "vector" => Ok(FieldType::DenseVector),
231 "binary_dense_vector" | "binary_vector" => Ok(FieldType::BinaryDenseVector),
232 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
233 }
234}
235
/// Raw settings collected from an `indexed<...>` attribute.
///
/// Every field is `None` until the corresponding parameter is seen. The values
/// are later copied onto a dense or sparse vector config by
/// `apply_index_config_to_dense_vector` / `apply_index_config_to_sparse_vector`.
#[derive(Debug, Clone, Default)]
struct IndexConfig {
    // --- applied to dense vector configs ---
    index_type: Option<super::schema::VectorIndexType>,
    num_clusters: Option<usize>,
    nprobe: Option<usize>,
    build_threshold: Option<usize>,
    // --- applied to sparse vector configs ---
    sparse_format: Option<SparseFormat>,
    quantization: Option<WeightQuantization>,
    weight_threshold: Option<f32>,
    block_size: Option<usize>,
    pruning: Option<f32>,
    min_terms: Option<usize>,
    // --- sparse query-time settings (from the nested query config block) ---
    query_tokenizer: Option<String>,
    query_weighting: Option<QueryWeighting>,
    query_weight_threshold: Option<f32>,
    query_max_dims: Option<usize>,
    query_pruning: Option<f32>,
    query_min_query_dims: Option<usize>,
    // --- further sparse settings ---
    dims: Option<u32>,
    max_weight: Option<f32>,
    // Position mode; copied onto the field definition by `parse_field_def`.
    positions: Option<super::schema::PositionMode>,
}
263
/// Flags and optional index settings extracted from a field's attribute list
/// by `parse_attributes`. Every flag is `false` unless its attribute appears.
struct ParsedAttributes {
    indexed: bool,
    stored: bool,
    // Set together with `stored` when the stored attribute carries a config.
    multi: bool,
    fast: bool,
    primary: bool,
    reorder: bool,
    // Present only when the indexed attribute carries a config.
    index_config: Option<IndexConfig>,
}
274
275fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> ParsedAttributes {
277 let mut attrs = ParsedAttributes {
278 indexed: false,
279 stored: false,
280 multi: false,
281 fast: false,
282 primary: false,
283 reorder: false,
284 index_config: None,
285 };
286
287 for attr in pair.into_inner() {
288 if attr.as_rule() == Rule::attribute {
289 let mut found_config = false;
290 for inner in attr.clone().into_inner() {
291 match inner.as_rule() {
292 Rule::indexed_with_config => {
293 attrs.indexed = true;
294 attrs.index_config = Some(parse_index_config(inner));
295 found_config = true;
296 break;
297 }
298 Rule::stored_with_config => {
299 attrs.stored = true;
300 attrs.multi = true; found_config = true;
302 break;
303 }
304 _ => {}
305 }
306 }
307 if !found_config {
308 match attr.as_str() {
309 "indexed" => attrs.indexed = true,
310 "stored" => attrs.stored = true,
311 "fast" => attrs.fast = true,
312 "primary" => attrs.primary = true,
313 "reorder" => attrs.reorder = true,
314 _ => {}
315 }
316 }
317 }
318 }
319
320 attrs
321}
322
323fn parse_index_config(pair: pest::iterators::Pair<Rule>) -> IndexConfig {
325 let mut config = IndexConfig::default();
326
327 for inner in pair.into_inner() {
332 if inner.as_rule() == Rule::index_config_params {
333 for param in inner.into_inner() {
334 if param.as_rule() == Rule::index_config_param {
335 for p in param.into_inner() {
336 parse_single_index_config_param(&mut config, p);
337 }
338 }
339 }
340 }
341 }
342
343 config
344}
345
346fn parse_single_index_config_param(config: &mut IndexConfig, p: pest::iterators::Pair<Rule>) {
348 use super::schema::VectorIndexType;
349
350 match p.as_rule() {
351 Rule::index_type_spec => {
352 config.index_type = Some(match p.as_str() {
353 "flat" => VectorIndexType::Flat,
354 "rabitq" => VectorIndexType::RaBitQ,
355 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
356 "scann" => VectorIndexType::ScaNN,
357 _ => VectorIndexType::RaBitQ,
358 });
359 }
360 Rule::index_type_kwarg => {
361 if let Some(t) = p.into_inner().next() {
363 config.index_type = Some(match t.as_str() {
364 "flat" => VectorIndexType::Flat,
365 "rabitq" => VectorIndexType::RaBitQ,
366 "ivf_rabitq" => VectorIndexType::IvfRaBitQ,
367 "scann" => VectorIndexType::ScaNN,
368 _ => VectorIndexType::RaBitQ,
369 });
370 }
371 }
372 Rule::num_clusters_kwarg => {
373 if let Some(n) = p.into_inner().next() {
375 config.num_clusters = Some(n.as_str().parse().unwrap_or_else(|_| {
376 log::warn!(
377 "Invalid num_clusters value '{}', using default 256",
378 n.as_str()
379 );
380 256
381 }));
382 }
383 }
384 Rule::build_threshold_kwarg => {
385 if let Some(n) = p.into_inner().next() {
387 config.build_threshold = Some(n.as_str().parse().unwrap_or_else(|_| {
388 log::warn!(
389 "Invalid build_threshold value '{}', using default 10000",
390 n.as_str()
391 );
392 10000
393 }));
394 }
395 }
396 Rule::nprobe_kwarg => {
397 if let Some(n) = p.into_inner().next() {
399 config.nprobe = Some(n.as_str().parse().unwrap_or_else(|_| {
400 log::warn!("Invalid nprobe value '{}', using default 32", n.as_str());
401 32
402 }));
403 }
404 }
405 Rule::quantization_kwarg => {
406 if let Some(q) = p.into_inner().next() {
408 config.quantization = Some(match q.as_str() {
409 "float32" | "f32" => WeightQuantization::Float32,
410 "float16" | "f16" => WeightQuantization::Float16,
411 "uint8" | "u8" => WeightQuantization::UInt8,
412 "uint4" | "u4" => WeightQuantization::UInt4,
413 _ => WeightQuantization::default(),
414 });
415 }
416 }
417 Rule::weight_threshold_kwarg => {
418 if let Some(t) = p.into_inner().next() {
420 config.weight_threshold = Some(t.as_str().parse().unwrap_or_else(|_| {
421 log::warn!(
422 "Invalid weight_threshold value '{}', using default 0.0",
423 t.as_str()
424 );
425 0.0
426 }));
427 }
428 }
429 Rule::block_size_kwarg => {
430 if let Some(n) = p.into_inner().next() {
432 config.block_size = Some(n.as_str().parse().unwrap_or_else(|_| {
433 log::warn!(
434 "Invalid block_size value '{}', using default 128",
435 n.as_str()
436 );
437 128
438 }));
439 }
440 }
441 Rule::pruning_kwarg => {
442 if let Some(f) = p.into_inner().next() {
444 config.pruning = Some(f.as_str().parse().unwrap_or_else(|_| {
445 log::warn!("Invalid pruning value '{}', using default 1.0", f.as_str());
446 1.0
447 }));
448 }
449 }
450 Rule::min_terms_kwarg => {
451 if let Some(n) = p.into_inner().next() {
452 config.min_terms = Some(n.as_str().parse().unwrap_or_else(|_| {
453 log::warn!("Invalid min_terms value '{}', using default 4", n.as_str());
454 4
455 }));
456 }
457 }
458 Rule::sparse_format_kwarg => {
459 if let Some(f) = p.into_inner().next() {
461 config.sparse_format = Some(match f.as_str() {
462 "bmp" => SparseFormat::Bmp,
463 "maxscore" => SparseFormat::MaxScore,
464 _ => SparseFormat::default(),
465 });
466 }
467 }
468 Rule::sparse_dims_kwarg => {
469 if let Some(n) = p.into_inner().next() {
470 config.dims = Some(n.as_str().parse().unwrap_or_else(|_| {
471 log::warn!("Invalid dims value '{}', using default 105879", n.as_str());
472 105879
473 }));
474 }
475 }
476 Rule::sparse_max_weight_kwarg => {
477 if let Some(f) = p.into_inner().next() {
478 config.max_weight = Some(f.as_str().parse().unwrap_or_else(|_| {
479 log::warn!(
480 "Invalid max_weight value '{}', using default 5.0",
481 f.as_str()
482 );
483 5.0
484 }));
485 }
486 }
487 Rule::query_config_block => {
488 parse_query_config_block(config, p);
490 }
491 Rule::positions_kwarg => {
492 use super::schema::PositionMode;
494 config.positions = Some(match p.as_str() {
495 "ordinal" => PositionMode::Ordinal,
496 "token_position" => PositionMode::TokenPosition,
497 _ => PositionMode::Full, });
499 }
500 _ => {}
501 }
502}
503
504fn parse_query_config_block(config: &mut IndexConfig, pair: pest::iterators::Pair<Rule>) {
506 for inner in pair.into_inner() {
507 if inner.as_rule() == Rule::query_config_params {
508 for param in inner.into_inner() {
509 if param.as_rule() == Rule::query_config_param {
510 for p in param.into_inner() {
511 match p.as_rule() {
512 Rule::query_tokenizer_kwarg => {
513 if let Some(path) = p.into_inner().next()
515 && let Some(inner_path) = path.into_inner().next()
516 {
517 config.query_tokenizer = Some(inner_path.as_str().to_string());
518 }
519 }
520 Rule::query_weighting_kwarg => {
521 if let Some(w) = p.into_inner().next() {
523 config.query_weighting = Some(match w.as_str() {
524 "one" => QueryWeighting::One,
525 "idf" => QueryWeighting::Idf,
526 "idf_file" => QueryWeighting::IdfFile,
527 _ => QueryWeighting::One,
528 });
529 }
530 }
531 Rule::query_weight_threshold_kwarg => {
532 if let Some(t) = p.into_inner().next() {
533 config.query_weight_threshold =
534 Some(t.as_str().parse().unwrap_or_else(|_| {
535 log::warn!(
536 "Invalid query weight_threshold '{}', using 0.0",
537 t.as_str()
538 );
539 0.0
540 }));
541 }
542 }
543 Rule::query_max_dims_kwarg => {
544 if let Some(t) = p.into_inner().next() {
545 config.query_max_dims =
546 Some(t.as_str().parse().unwrap_or_else(|_| {
547 log::warn!(
548 "Invalid query max_dims '{}', using 0",
549 t.as_str()
550 );
551 0
552 }));
553 }
554 }
555 Rule::query_pruning_kwarg => {
556 if let Some(t) = p.into_inner().next() {
557 config.query_pruning =
558 Some(t.as_str().parse().unwrap_or_else(|_| {
559 log::warn!(
560 "Invalid query pruning '{}', using 1.0",
561 t.as_str()
562 );
563 1.0
564 }));
565 }
566 }
567 Rule::query_min_query_dims_kwarg => {
568 if let Some(t) = p.into_inner().next() {
569 config.query_min_query_dims =
570 Some(t.as_str().parse().unwrap_or_else(|_| {
571 log::warn!(
572 "Invalid query min_query_dims '{}', using 4",
573 t.as_str()
574 );
575 4
576 }));
577 }
578 }
579 _ => {}
580 }
581 }
582 }
583 }
584 }
585 }
586}
587
588fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
590 let mut inner = pair.into_inner();
591
592 let name = inner
593 .next()
594 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
595 .as_str()
596 .to_string();
597
598 let field_type_str = inner
599 .next()
600 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
601 .as_str();
602
603 let field_type = parse_field_type(field_type_str)?;
604
605 let mut tokenizer = None;
607 let mut sparse_vector_config = None;
608 let mut dense_vector_config = None;
609 let mut binary_dense_vector_config = None;
610 let mut indexed = true;
611 let mut stored = true;
612 let mut multi = false;
613 let mut fast = false;
614 let mut primary = false;
615 let mut reorder = false;
616 let mut index_config: Option<IndexConfig> = None;
617
618 for item in inner {
619 match item.as_rule() {
620 Rule::tokenizer_spec => {
621 if let Some(tok_name) = item.into_inner().next() {
623 tokenizer = Some(tok_name.as_str().to_string());
624 }
625 }
626 Rule::sparse_vector_config => {
627 sparse_vector_config = Some(parse_sparse_vector_config(item));
629 }
630 Rule::dense_vector_config => {
631 dense_vector_config = Some(parse_dense_vector_config(item));
633 }
634 Rule::binary_dense_vector_config => {
635 let dim: usize = item
637 .into_inner()
638 .next()
639 .map(|d| d.as_str().parse().unwrap_or(0))
640 .unwrap_or(0);
641 if dim == 0 || !dim.is_multiple_of(8) {
642 return Err(Error::Schema(format!(
643 "BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
644 )));
645 }
646 binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
647 }
648 Rule::attributes => {
649 let attrs = parse_attributes(item);
650 indexed = attrs.indexed;
651 stored = attrs.stored;
652 multi = attrs.multi;
653 fast = attrs.fast;
654 primary = attrs.primary;
655 reorder = attrs.reorder;
656 index_config = attrs.index_config;
657 }
658 _ => {}
659 }
660 }
661
662 if field_type == FieldType::BinaryDenseVector
666 && binary_dense_vector_config.is_none()
667 && let Some(ref dv_config) = dense_vector_config
668 {
669 let dim = dv_config.dim;
670 if dim == 0 || !dim.is_multiple_of(8) {
671 return Err(Error::Schema(format!(
672 "BinaryDenseVector dimension must be a positive multiple of 8, got {dim}"
673 )));
674 }
675 binary_dense_vector_config = Some(BinaryDenseVectorConfig::new(dim));
676 dense_vector_config = None;
677 }
678
679 if primary {
681 fast = true;
682 indexed = true;
683 }
684
685 let mut positions = None;
687 if let Some(idx_cfg) = index_config {
688 positions = idx_cfg.positions;
689 if let Some(ref mut dv_config) = dense_vector_config {
690 apply_index_config_to_dense_vector(dv_config, idx_cfg);
691 } else if field_type == FieldType::SparseVector {
692 let sv_config = sparse_vector_config.get_or_insert(SparseVectorConfig::default());
694 apply_index_config_to_sparse_vector(sv_config, idx_cfg);
695 }
696 }
697
698 Ok(FieldDef {
699 name,
700 field_type,
701 indexed,
702 stored,
703 tokenizer,
704 multi,
705 positions,
706 sparse_vector_config,
707 dense_vector_config,
708 binary_dense_vector_config,
709 fast,
710 primary,
711 reorder,
712 })
713}
714
715fn apply_index_config_to_dense_vector(config: &mut DenseVectorConfig, idx_cfg: IndexConfig) {
717 if let Some(index_type) = idx_cfg.index_type {
719 config.index_type = index_type;
720 }
721
722 if idx_cfg.num_clusters.is_some() {
724 config.num_clusters = idx_cfg.num_clusters;
725 }
726
727 if let Some(nprobe) = idx_cfg.nprobe {
729 config.nprobe = nprobe;
730 }
731
732 if idx_cfg.build_threshold.is_some() {
734 config.build_threshold = idx_cfg.build_threshold;
735 }
736}
737
738fn parse_sparse_vector_config(pair: pest::iterators::Pair<Rule>) -> SparseVectorConfig {
741 let mut index_size = IndexSize::default();
742
743 for inner in pair.into_inner() {
745 if inner.as_rule() == Rule::index_size_spec {
746 index_size = match inner.as_str() {
747 "u16" => IndexSize::U16,
748 "u32" => IndexSize::U32,
749 _ => IndexSize::default(),
750 };
751 }
752 }
753
754 SparseVectorConfig {
755 format: SparseFormat::default(),
756 index_size,
757 weight_quantization: WeightQuantization::default(),
758 weight_threshold: 0.0,
759 block_size: 128,
760 bmp_block_size: 64,
761 max_bmp_grid_bytes: 0,
762 bmp_superblock_size: 64,
763 pruning: None,
764 query_config: None,
765 dims: None,
766 max_weight: None,
767 min_terms: 4,
768 }
769}
770
771fn apply_index_config_to_sparse_vector(config: &mut SparseVectorConfig, idx_cfg: IndexConfig) {
773 if let Some(f) = idx_cfg.sparse_format {
774 config.format = f;
775 }
776 if let Some(q) = idx_cfg.quantization {
777 config.weight_quantization = q;
778 }
779 if let Some(t) = idx_cfg.weight_threshold {
780 config.weight_threshold = t;
781 }
782 if let Some(bs) = idx_cfg.block_size {
783 let adjusted = bs.next_power_of_two();
784 if adjusted != bs {
785 log::warn!(
786 "block_size {} adjusted to next power of two: {}",
787 bs,
788 adjusted
789 );
790 }
791 config.block_size = adjusted;
792 }
793 if let Some(p) = idx_cfg.pruning {
794 let clamped = p.clamp(0.0, 1.0);
795 if (clamped - p).abs() > f32::EPSILON {
796 log::warn!(
797 "pruning {} clamped to valid range [0.0, 1.0]: {}",
798 p,
799 clamped
800 );
801 }
802 config.pruning = Some(clamped);
803 }
804 if let Some(mt) = idx_cfg.min_terms {
805 config.min_terms = mt;
806 }
807 if let Some(d) = idx_cfg.dims {
808 config.dims = Some(d);
809 }
810 if let Some(mw) = idx_cfg.max_weight {
811 config.max_weight = Some(mw);
812 }
813 if idx_cfg.query_tokenizer.is_some()
815 || idx_cfg.query_weighting.is_some()
816 || idx_cfg.query_weight_threshold.is_some()
817 || idx_cfg.query_max_dims.is_some()
818 || idx_cfg.query_pruning.is_some()
819 || idx_cfg.query_min_query_dims.is_some()
820 {
821 let query_config = config
822 .query_config
823 .get_or_insert(SparseQueryConfig::default());
824 if let Some(tokenizer) = idx_cfg.query_tokenizer {
825 query_config.tokenizer = Some(tokenizer);
826 }
827 if let Some(weighting) = idx_cfg.query_weighting {
828 query_config.weighting = weighting;
829 }
830 if let Some(t) = idx_cfg.query_weight_threshold {
831 query_config.weight_threshold = t;
832 }
833 if let Some(d) = idx_cfg.query_max_dims {
834 query_config.max_query_dims = Some(d);
835 }
836 if let Some(p) = idx_cfg.query_pruning {
837 query_config.pruning = Some(p);
838 }
839 if let Some(m) = idx_cfg.query_min_query_dims {
840 query_config.min_query_dims = m;
841 }
842 }
843}
844
845fn parse_dense_vector_config(pair: pest::iterators::Pair<Rule>) -> DenseVectorConfig {
848 let mut dim: usize = 0;
849 let mut quantization = DenseVectorQuantization::F32;
850
851 for params in pair.into_inner() {
853 if params.as_rule() == Rule::dense_vector_params {
854 for inner in params.into_inner() {
855 match inner.as_rule() {
856 Rule::dense_vector_keyword_params => {
857 for kwarg in inner.into_inner() {
858 match kwarg.as_rule() {
859 Rule::dims_kwarg => {
860 if let Some(d) = kwarg.into_inner().next() {
861 dim = d.as_str().parse().unwrap_or(0);
862 }
863 }
864 Rule::quant_type_spec => {
865 quantization = parse_quant_type(kwarg.as_str());
866 }
867 _ => {}
868 }
869 }
870 }
871 Rule::dense_vector_positional_params => {
872 for item in inner.into_inner() {
873 match item.as_rule() {
874 Rule::dimension_spec => {
875 dim = item.as_str().parse().unwrap_or(0);
876 }
877 Rule::quant_type_spec => {
878 quantization = parse_quant_type(item.as_str());
879 }
880 _ => {}
881 }
882 }
883 }
884 _ => {}
885 }
886 }
887 }
888 }
889
890 DenseVectorConfig::new(dim).with_quantization(quantization)
891}
892
893fn parse_quant_type(s: &str) -> DenseVectorQuantization {
894 match s.trim() {
895 "f16" => DenseVectorQuantization::F16,
896 "uint8" | "u8" => DenseVectorQuantization::UInt8,
897 _ => DenseVectorQuantization::F32,
898 }
899}
900
901fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
903 pair.into_inner().map(|p| p.as_str().to_string()).collect()
904}
905
906fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
908 let mut pattern = String::new();
909 let mut substitution = String::new();
910 let mut target_field = String::new();
911 let mut mode = RoutingMode::Additional;
912
913 for prop in pair.into_inner() {
914 if prop.as_rule() != Rule::query_router_prop {
915 continue;
916 }
917
918 for inner in prop.into_inner() {
919 match inner.as_rule() {
920 Rule::query_router_pattern => {
921 if let Some(regex_str) = inner.into_inner().next() {
922 pattern = parse_string_value(regex_str);
923 }
924 }
925 Rule::query_router_substitution => {
926 if let Some(quoted) = inner.into_inner().next() {
927 substitution = parse_string_value(quoted);
928 }
929 }
930 Rule::query_router_target => {
931 if let Some(ident) = inner.into_inner().next() {
932 target_field = ident.as_str().to_string();
933 }
934 }
935 Rule::query_router_mode => {
936 if let Some(mode_val) = inner.into_inner().next() {
937 mode = match mode_val.as_str() {
938 "exclusive" => RoutingMode::Exclusive,
939 "additional" => RoutingMode::Additional,
940 _ => RoutingMode::Additional,
941 };
942 }
943 }
944 _ => {}
945 }
946 }
947 }
948
949 if pattern.is_empty() {
950 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
951 }
952 if substitution.is_empty() {
953 return Err(Error::Schema(
954 "query_router missing 'substitution'".to_string(),
955 ));
956 }
957 if target_field.is_empty() {
958 return Err(Error::Schema(
959 "query_router missing 'target_field'".to_string(),
960 ));
961 }
962
963 Ok(QueryRouterRule {
964 pattern,
965 substitution,
966 target_field,
967 mode,
968 })
969}
970
971fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
973 let s = pair.as_str();
974 match pair.as_rule() {
975 Rule::regex_string => {
976 if let Some(inner) = pair.into_inner().next() {
978 parse_string_value(inner)
979 } else {
980 s.to_string()
981 }
982 }
983 Rule::raw_string => {
984 s[2..s.len() - 1].to_string()
986 }
987 Rule::quoted_string => {
988 let inner = &s[1..s.len() - 1];
990 inner
992 .replace("\\n", "\n")
993 .replace("\\t", "\t")
994 .replace("\\\"", "\"")
995 .replace("\\\\", "\\")
996 }
997 _ => s.to_string(),
998 }
999}
1000
1001fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
1003 let mut inner = pair.into_inner();
1004
1005 let name = inner
1006 .next()
1007 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
1008 .as_str()
1009 .to_string();
1010
1011 let mut fields = Vec::new();
1012 let mut default_fields = Vec::new();
1013 let mut query_routers = Vec::new();
1014
1015 for item in inner {
1016 match item.as_rule() {
1017 Rule::field_def => {
1018 fields.push(parse_field_def(item)?);
1019 }
1020 Rule::default_fields_def => {
1021 default_fields = parse_default_fields_def(item);
1022 }
1023 Rule::query_router_def => {
1024 query_routers.push(parse_query_router_def(item)?);
1025 }
1026 _ => {}
1027 }
1028 }
1029
1030 let primary_fields: Vec<&FieldDef> = fields.iter().filter(|f| f.primary).collect();
1032 if primary_fields.len() > 1 {
1033 return Err(Error::Schema(format!(
1034 "Index '{}' has {} primary key fields, but at most one is allowed",
1035 name,
1036 primary_fields.len()
1037 )));
1038 }
1039 if let Some(pk) = primary_fields.first() {
1040 if pk.field_type != FieldType::Text {
1041 return Err(Error::Schema(format!(
1042 "Primary key field '{}' must be of type text, got {:?}",
1043 pk.name, pk.field_type
1044 )));
1045 }
1046 if pk.multi {
1047 return Err(Error::Schema(format!(
1048 "Primary key field '{}' cannot be multi-valued",
1049 pk.name
1050 )));
1051 }
1052 }
1053
1054 Ok(IndexDef {
1055 name,
1056 fields,
1057 default_fields,
1058 query_routers,
1059 })
1060}
1061
1062pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
1064 let pairs = SdlParser::parse(Rule::file, input)
1065 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
1066
1067 let mut indexes = Vec::new();
1068
1069 for pair in pairs {
1070 if pair.as_rule() == Rule::file {
1071 for inner in pair.into_inner() {
1072 if inner.as_rule() == Rule::index_def {
1073 indexes.push(parse_index_def(inner)?);
1074 }
1075 }
1076 }
1077 }
1078
1079 Ok(indexes)
1080}
1081
1082pub fn parse_single_index(input: &str) -> Result<IndexDef> {
1084 let indexes = parse_sdl(input)?;
1085
1086 if indexes.is_empty() {
1087 return Err(Error::Schema("No index definition found".to_string()));
1088 }
1089
1090 if indexes.len() > 1 {
1091 return Err(Error::Schema(
1092 "Multiple index definitions found, expected one".to_string(),
1093 ));
1094 }
1095
1096 Ok(indexes.into_iter().next().unwrap())
1097}
1098
1099#[cfg(test)]
1100mod tests {
1101 use super::*;
1102
1103 #[test]
1104 fn test_parse_simple_schema() {
1105 let sdl = r#"
1106 index articles {
1107 field title: text [indexed, stored]
1108 field body: text [indexed]
1109 }
1110 "#;
1111
1112 let indexes = parse_sdl(sdl).unwrap();
1113 assert_eq!(indexes.len(), 1);
1114
1115 let index = &indexes[0];
1116 assert_eq!(index.name, "articles");
1117 assert_eq!(index.fields.len(), 2);
1118
1119 assert_eq!(index.fields[0].name, "title");
1120 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1121 assert!(index.fields[0].indexed);
1122 assert!(index.fields[0].stored);
1123
1124 assert_eq!(index.fields[1].name, "body");
1125 assert!(matches!(index.fields[1].field_type, FieldType::Text));
1126 assert!(index.fields[1].indexed);
1127 assert!(!index.fields[1].stored);
1128 }
1129
1130 #[test]
1131 fn test_parse_all_field_types() {
1132 let sdl = r#"
1133 index test {
1134 field text_field: text [indexed, stored]
1135 field u64_field: u64 [indexed, stored]
1136 field i64_field: i64 [indexed, stored]
1137 field f64_field: f64 [indexed, stored]
1138 field bytes_field: bytes [stored]
1139 }
1140 "#;
1141
1142 let indexes = parse_sdl(sdl).unwrap();
1143 let index = &indexes[0];
1144
1145 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1146 assert!(matches!(index.fields[1].field_type, FieldType::U64));
1147 assert!(matches!(index.fields[2].field_type, FieldType::I64));
1148 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1149 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1150 }
1151
1152 #[test]
1153 fn test_parse_with_comments() {
1154 let sdl = r#"
1155 # This is a comment
1156 index articles {
1157 # Title field
1158 field title: text [indexed, stored]
1159 field body: text [indexed] # inline comment not supported yet
1160 }
1161 "#;
1162
1163 let indexes = parse_sdl(sdl).unwrap();
1164 assert_eq!(indexes[0].fields.len(), 2);
1165 }
1166
1167 #[test]
1168 fn test_parse_type_aliases() {
1169 let sdl = r#"
1170 index test {
1171 field a: string [indexed]
1172 field b: int [indexed]
1173 field c: uint [indexed]
1174 field d: float [indexed]
1175 field e: binary [stored]
1176 }
1177 "#;
1178
1179 let indexes = parse_sdl(sdl).unwrap();
1180 let index = &indexes[0];
1181
1182 assert!(matches!(index.fields[0].field_type, FieldType::Text));
1183 assert!(matches!(index.fields[1].field_type, FieldType::I64));
1184 assert!(matches!(index.fields[2].field_type, FieldType::U64));
1185 assert!(matches!(index.fields[3].field_type, FieldType::F64));
1186 assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
1187 }
1188
1189 #[test]
1190 fn test_to_schema() {
1191 let sdl = r#"
1192 index articles {
1193 field title: text [indexed, stored]
1194 field views: u64 [indexed, stored]
1195 }
1196 "#;
1197
1198 let indexes = parse_sdl(sdl).unwrap();
1199 let schema = indexes[0].to_schema();
1200
1201 assert!(schema.get_field("title").is_some());
1202 assert!(schema.get_field("views").is_some());
1203 assert!(schema.get_field("nonexistent").is_none());
1204 }
1205
1206 #[test]
1207 fn test_default_attributes() {
1208 let sdl = r#"
1209 index test {
1210 field title: text
1211 }
1212 "#;
1213
1214 let indexes = parse_sdl(sdl).unwrap();
1215 let field = &indexes[0].fields[0];
1216
1217 assert!(field.indexed);
1219 assert!(field.stored);
1220 }
1221
1222 #[test]
1223 fn test_multiple_indexes() {
1224 let sdl = r#"
1225 index articles {
1226 field title: text [indexed, stored]
1227 }
1228
1229 index users {
1230 field name: text [indexed, stored]
1231 field email: text [indexed, stored]
1232 }
1233 "#;
1234
1235 let indexes = parse_sdl(sdl).unwrap();
1236 assert_eq!(indexes.len(), 2);
1237 assert_eq!(indexes[0].name, "articles");
1238 assert_eq!(indexes[1].name, "users");
1239 }
1240
1241 #[test]
1242 fn test_tokenizer_spec() {
1243 let sdl = r#"
1244 index articles {
1245 field title: text<en_stem> [indexed, stored]
1246 field body: text<simple> [indexed]
1247 field author: text [indexed, stored]
1248 }
1249 "#;
1250
1251 let indexes = parse_sdl(sdl).unwrap();
1252 let index = &indexes[0];
1253
1254 assert_eq!(index.fields[0].name, "title");
1255 assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));
1256
1257 assert_eq!(index.fields[1].name, "body");
1258 assert_eq!(index.fields[1].tokenizer, Some("simple".to_string()));
1259
1260 assert_eq!(index.fields[2].name, "author");
1261 assert_eq!(index.fields[2].tokenizer, None); }
1263
1264 #[test]
1265 fn test_tokenizer_in_schema() {
1266 let sdl = r#"
1267 index articles {
1268 field title: text<german> [indexed, stored]
1269 field body: text<en_stem> [indexed]
1270 }
1271 "#;
1272
1273 let indexes = parse_sdl(sdl).unwrap();
1274 let schema = indexes[0].to_schema();
1275
1276 let title_field = schema.get_field("title").unwrap();
1277 let title_entry = schema.get_field_entry(title_field).unwrap();
1278 assert_eq!(title_entry.tokenizer, Some("german".to_string()));
1279
1280 let body_field = schema.get_field("body").unwrap();
1281 let body_entry = schema.get_field_entry(body_field).unwrap();
1282 assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
1283 }
1284
1285 #[test]
1286 fn test_query_router_basic() {
1287 let sdl = r#"
1288 index documents {
1289 field title: text [indexed, stored]
1290 field uri: text [indexed, stored]
1291
1292 query_router {
1293 pattern: "10\\.\\d{4,}/[^\\s]+"
1294 substitution: "doi://{0}"
1295 target_field: uris
1296 mode: exclusive
1297 }
1298 }
1299 "#;
1300
1301 let indexes = parse_sdl(sdl).unwrap();
1302 let index = &indexes[0];
1303
1304 assert_eq!(index.query_routers.len(), 1);
1305 let router = &index.query_routers[0];
1306 assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
1307 assert_eq!(router.substitution, "doi://{0}");
1308 assert_eq!(router.target_field, "uris");
1309 assert_eq!(router.mode, RoutingMode::Exclusive);
1310 }
1311
1312 #[test]
1313 fn test_query_router_raw_string() {
1314 let sdl = r#"
1315 index documents {
1316 field uris: text [indexed, stored]
1317
1318 query_router {
1319 pattern: r"^pmid:(\d+)$"
1320 substitution: "pubmed://{1}"
1321 target_field: uris
1322 mode: additional
1323 }
1324 }
1325 "#;
1326
1327 let indexes = parse_sdl(sdl).unwrap();
1328 let router = &indexes[0].query_routers[0];
1329
1330 assert_eq!(router.pattern, r"^pmid:(\d+)$");
1331 assert_eq!(router.substitution, "pubmed://{1}");
1332 assert_eq!(router.mode, RoutingMode::Additional);
1333 }
1334
1335 #[test]
1336 fn test_multiple_query_routers() {
1337 let sdl = r#"
1338 index documents {
1339 field uris: text [indexed, stored]
1340
1341 query_router {
1342 pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
1343 substitution: "doi://{1}"
1344 target_field: uris
1345 mode: exclusive
1346 }
1347
1348 query_router {
1349 pattern: r"^pmid:(\d+)$"
1350 substitution: "pubmed://{1}"
1351 target_field: uris
1352 mode: exclusive
1353 }
1354
1355 query_router {
1356 pattern: r"^arxiv:(\d+\.\d+)$"
1357 substitution: "arxiv://{1}"
1358 target_field: uris
1359 mode: additional
1360 }
1361 }
1362 "#;
1363
1364 let indexes = parse_sdl(sdl).unwrap();
1365 assert_eq!(indexes[0].query_routers.len(), 3);
1366 }
1367
1368 #[test]
1369 fn test_query_router_default_mode() {
1370 let sdl = r#"
1371 index documents {
1372 field uris: text [indexed, stored]
1373
1374 query_router {
1375 pattern: r"test"
1376 substitution: "{0}"
1377 target_field: uris
1378 }
1379 }
1380 "#;
1381
1382 let indexes = parse_sdl(sdl).unwrap();
1383 assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
1385 }
1386
1387 #[test]
1388 fn test_multi_attribute() {
1389 let sdl = r#"
1390 index documents {
1391 field uris: text [indexed, stored<multi>]
1392 field title: text [indexed, stored]
1393 }
1394 "#;
1395
1396 let indexes = parse_sdl(sdl).unwrap();
1397 assert_eq!(indexes.len(), 1);
1398
1399 let fields = &indexes[0].fields;
1400 assert_eq!(fields.len(), 2);
1401
1402 assert_eq!(fields[0].name, "uris");
1404 assert!(fields[0].multi, "uris field should have multi=true");
1405
1406 assert_eq!(fields[1].name, "title");
1408 assert!(!fields[1].multi, "title field should have multi=false");
1409
1410 let schema = indexes[0].to_schema();
1412 let uris_field = schema.get_field("uris").unwrap();
1413 let title_field = schema.get_field("title").unwrap();
1414
1415 assert!(schema.get_field_entry(uris_field).unwrap().multi);
1416 assert!(!schema.get_field_entry(title_field).unwrap().multi);
1417 }
1418
1419 #[test]
1420 fn test_sparse_vector_field() {
1421 let sdl = r#"
1422 index documents {
1423 field embedding: sparse_vector [indexed, stored]
1424 }
1425 "#;
1426
1427 let indexes = parse_sdl(sdl).unwrap();
1428 assert_eq!(indexes.len(), 1);
1429 assert_eq!(indexes[0].fields.len(), 1);
1430 assert_eq!(indexes[0].fields[0].name, "embedding");
1431 assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
1432 assert!(indexes[0].fields[0].sparse_vector_config.is_none());
1433 }
1434
1435 #[test]
1436 fn test_sparse_vector_with_config() {
1437 let sdl = r#"
1438 index documents {
1439 field embedding: sparse_vector<u16> [indexed<quantization: uint8>, stored]
1440 field dense: sparse_vector<u32> [indexed<quantization: float32>]
1441 }
1442 "#;
1443
1444 let indexes = parse_sdl(sdl).unwrap();
1445 assert_eq!(indexes[0].fields.len(), 2);
1446
1447 let f1 = &indexes[0].fields[0];
1449 assert_eq!(f1.name, "embedding");
1450 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1451 assert_eq!(config1.index_size, IndexSize::U16);
1452 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1453
1454 let f2 = &indexes[0].fields[1];
1456 assert_eq!(f2.name, "dense");
1457 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1458 assert_eq!(config2.index_size, IndexSize::U32);
1459 assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
1460 }
1461
1462 #[test]
1463 fn test_sparse_vector_with_weight_threshold() {
1464 let sdl = r#"
1465 index documents {
1466 field embedding: sparse_vector<u16> [indexed<quantization: uint8, weight_threshold: 0.1>, stored]
1467 field embedding2: sparse_vector<u32> [indexed<quantization: float16, weight_threshold: 0.05>]
1468 }
1469 "#;
1470
1471 let indexes = parse_sdl(sdl).unwrap();
1472 assert_eq!(indexes[0].fields.len(), 2);
1473
1474 let f1 = &indexes[0].fields[0];
1476 assert_eq!(f1.name, "embedding");
1477 let config1 = f1.sparse_vector_config.as_ref().unwrap();
1478 assert_eq!(config1.index_size, IndexSize::U16);
1479 assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);
1480 assert!((config1.weight_threshold - 0.1).abs() < 0.001);
1481
1482 let f2 = &indexes[0].fields[1];
1484 assert_eq!(f2.name, "embedding2");
1485 let config2 = f2.sparse_vector_config.as_ref().unwrap();
1486 assert_eq!(config2.index_size, IndexSize::U32);
1487 assert_eq!(config2.weight_quantization, WeightQuantization::Float16);
1488 assert!((config2.weight_threshold - 0.05).abs() < 0.001);
1489 }
1490
1491 #[test]
1492 fn test_sparse_vector_with_pruning() {
1493 let sdl = r#"
1494 index documents {
1495 field embedding: sparse_vector [indexed<quantization: uint8, pruning: 0.1>, stored]
1496 }
1497 "#;
1498
1499 let indexes = parse_sdl(sdl).unwrap();
1500 let f = &indexes[0].fields[0];
1501 assert_eq!(f.name, "embedding");
1502 let config = f.sparse_vector_config.as_ref().unwrap();
1503 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1504 assert_eq!(config.pruning, Some(0.1));
1505 }
1506
1507 #[test]
1508 fn test_dense_vector_field() {
1509 let sdl = r#"
1510 index documents {
1511 field embedding: dense_vector<768> [indexed, stored]
1512 }
1513 "#;
1514
1515 let indexes = parse_sdl(sdl).unwrap();
1516 assert_eq!(indexes.len(), 1);
1517 assert_eq!(indexes[0].fields.len(), 1);
1518
1519 let f = &indexes[0].fields[0];
1520 assert_eq!(f.name, "embedding");
1521 assert_eq!(f.field_type, FieldType::DenseVector);
1522
1523 let config = f.dense_vector_config.as_ref().unwrap();
1524 assert_eq!(config.dim, 768);
1525 }
1526
1527 #[test]
1528 fn test_dense_vector_alias() {
1529 let sdl = r#"
1530 index documents {
1531 field embedding: vector<1536> [indexed]
1532 }
1533 "#;
1534
1535 let indexes = parse_sdl(sdl).unwrap();
1536 assert_eq!(indexes[0].fields[0].field_type, FieldType::DenseVector);
1537 assert_eq!(
1538 indexes[0].fields[0]
1539 .dense_vector_config
1540 .as_ref()
1541 .unwrap()
1542 .dim,
1543 1536
1544 );
1545 }
1546
1547 #[test]
1548 fn test_dense_vector_with_num_clusters() {
1549 let sdl = r#"
1550 index documents {
1551 field embedding: dense_vector<768> [indexed<ivf_rabitq, num_clusters: 256>, stored]
1552 }
1553 "#;
1554
1555 let indexes = parse_sdl(sdl).unwrap();
1556 assert_eq!(indexes.len(), 1);
1557
1558 let f = &indexes[0].fields[0];
1559 assert_eq!(f.name, "embedding");
1560 assert_eq!(f.field_type, FieldType::DenseVector);
1561
1562 let config = f.dense_vector_config.as_ref().unwrap();
1563 assert_eq!(config.dim, 768);
1564 assert_eq!(config.num_clusters, Some(256));
1565 assert_eq!(config.nprobe, 32); }
1567
1568 #[test]
1569 fn test_dense_vector_with_num_clusters_and_nprobe() {
1570 let sdl = r#"
1571 index documents {
1572 field embedding: dense_vector<1536> [indexed<ivf_rabitq, num_clusters: 512, nprobe: 64>]
1573 }
1574 "#;
1575
1576 let indexes = parse_sdl(sdl).unwrap();
1577 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1578
1579 assert_eq!(config.dim, 1536);
1580 assert_eq!(config.num_clusters, Some(512));
1581 assert_eq!(config.nprobe, 64);
1582 }
1583
1584 #[test]
1585 fn test_dense_vector_keyword_syntax() {
1586 let sdl = r#"
1587 index documents {
1588 field embedding: dense_vector<dims: 1536> [indexed, stored]
1589 }
1590 "#;
1591
1592 let indexes = parse_sdl(sdl).unwrap();
1593 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1594
1595 assert_eq!(config.dim, 1536);
1596 assert!(config.num_clusters.is_none());
1597 }
1598
1599 #[test]
1600 fn test_dense_vector_keyword_syntax_full() {
1601 let sdl = r#"
1602 index documents {
1603 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 256, nprobe: 64>]
1604 }
1605 "#;
1606
1607 let indexes = parse_sdl(sdl).unwrap();
1608 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1609
1610 assert_eq!(config.dim, 1536);
1611 assert_eq!(config.num_clusters, Some(256));
1612 assert_eq!(config.nprobe, 64);
1613 }
1614
1615 #[test]
1616 fn test_dense_vector_keyword_syntax_partial() {
1617 let sdl = r#"
1618 index documents {
1619 field embedding: dense_vector<dims: 768> [indexed<ivf_rabitq, num_clusters: 128>]
1620 }
1621 "#;
1622
1623 let indexes = parse_sdl(sdl).unwrap();
1624 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1625
1626 assert_eq!(config.dim, 768);
1627 assert_eq!(config.num_clusters, Some(128));
1628 assert_eq!(config.nprobe, 32); }
1630
1631 #[test]
1632 fn test_dense_vector_scann_index() {
1633 use crate::dsl::schema::VectorIndexType;
1634
1635 let sdl = r#"
1636 index documents {
1637 field embedding: dense_vector<dims: 768> [indexed<scann, num_clusters: 256, nprobe: 64>]
1638 }
1639 "#;
1640
1641 let indexes = parse_sdl(sdl).unwrap();
1642 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1643
1644 assert_eq!(config.dim, 768);
1645 assert_eq!(config.index_type, VectorIndexType::ScaNN);
1646 assert_eq!(config.num_clusters, Some(256));
1647 assert_eq!(config.nprobe, 64);
1648 }
1649
1650 #[test]
1651 fn test_dense_vector_ivf_rabitq_index() {
1652 use crate::dsl::schema::VectorIndexType;
1653
1654 let sdl = r#"
1655 index documents {
1656 field embedding: dense_vector<dims: 1536> [indexed<ivf_rabitq, num_clusters: 512>]
1657 }
1658 "#;
1659
1660 let indexes = parse_sdl(sdl).unwrap();
1661 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1662
1663 assert_eq!(config.dim, 1536);
1664 assert_eq!(config.index_type, VectorIndexType::IvfRaBitQ);
1665 assert_eq!(config.num_clusters, Some(512));
1666 }
1667
1668 #[test]
1669 fn test_dense_vector_rabitq_no_clusters() {
1670 use crate::dsl::schema::VectorIndexType;
1671
1672 let sdl = r#"
1673 index documents {
1674 field embedding: dense_vector<dims: 768> [indexed<rabitq>]
1675 }
1676 "#;
1677
1678 let indexes = parse_sdl(sdl).unwrap();
1679 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1680
1681 assert_eq!(config.dim, 768);
1682 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1683 assert!(config.num_clusters.is_none());
1684 }
1685
1686 #[test]
1687 fn test_dense_vector_flat_index() {
1688 use crate::dsl::schema::VectorIndexType;
1689
1690 let sdl = r#"
1691 index documents {
1692 field embedding: dense_vector<dims: 768> [indexed<flat>]
1693 }
1694 "#;
1695
1696 let indexes = parse_sdl(sdl).unwrap();
1697 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1698
1699 assert_eq!(config.dim, 768);
1700 assert_eq!(config.index_type, VectorIndexType::Flat);
1701 }
1702
1703 #[test]
1704 fn test_dense_vector_default_index_type() {
1705 use crate::dsl::schema::VectorIndexType;
1706
1707 let sdl = r#"
1709 index documents {
1710 field embedding: dense_vector<dims: 768> [indexed]
1711 }
1712 "#;
1713
1714 let indexes = parse_sdl(sdl).unwrap();
1715 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1716
1717 assert_eq!(config.dim, 768);
1718 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1719 }
1720
1721 #[test]
1722 fn test_dense_vector_f16_quantization() {
1723 use crate::dsl::schema::{DenseVectorQuantization, VectorIndexType};
1724
1725 let sdl = r#"
1726 index documents {
1727 field embedding: dense_vector<768, f16> [indexed]
1728 }
1729 "#;
1730
1731 let indexes = parse_sdl(sdl).unwrap();
1732 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1733
1734 assert_eq!(config.dim, 768);
1735 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1736 assert_eq!(config.index_type, VectorIndexType::RaBitQ);
1737 }
1738
1739 #[test]
1740 fn test_dense_vector_uint8_quantization() {
1741 use crate::dsl::schema::DenseVectorQuantization;
1742
1743 let sdl = r#"
1744 index documents {
1745 field embedding: dense_vector<1024, uint8> [indexed<ivf_rabitq>]
1746 }
1747 "#;
1748
1749 let indexes = parse_sdl(sdl).unwrap();
1750 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1751
1752 assert_eq!(config.dim, 1024);
1753 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1754 }
1755
1756 #[test]
1757 fn test_dense_vector_u8_alias() {
1758 use crate::dsl::schema::DenseVectorQuantization;
1759
1760 let sdl = r#"
1761 index documents {
1762 field embedding: dense_vector<512, u8> [indexed]
1763 }
1764 "#;
1765
1766 let indexes = parse_sdl(sdl).unwrap();
1767 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1768
1769 assert_eq!(config.dim, 512);
1770 assert_eq!(config.quantization, DenseVectorQuantization::UInt8);
1771 }
1772
1773 #[test]
1774 fn test_dense_vector_default_f32_quantization() {
1775 use crate::dsl::schema::DenseVectorQuantization;
1776
1777 let sdl = r#"
1779 index documents {
1780 field embedding: dense_vector<768> [indexed]
1781 }
1782 "#;
1783
1784 let indexes = parse_sdl(sdl).unwrap();
1785 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1786
1787 assert_eq!(config.dim, 768);
1788 assert_eq!(config.quantization, DenseVectorQuantization::F32);
1789 }
1790
1791 #[test]
1792 fn test_dense_vector_keyword_with_quantization() {
1793 use crate::dsl::schema::DenseVectorQuantization;
1794
1795 let sdl = r#"
1796 index documents {
1797 field embedding: dense_vector<dims: 768, f16> [indexed]
1798 }
1799 "#;
1800
1801 let indexes = parse_sdl(sdl).unwrap();
1802 let config = indexes[0].fields[0].dense_vector_config.as_ref().unwrap();
1803
1804 assert_eq!(config.dim, 768);
1805 assert_eq!(config.quantization, DenseVectorQuantization::F16);
1806 }
1807
1808 #[test]
1809 fn test_json_field_type() {
1810 let sdl = r#"
1811 index documents {
1812 field title: text [indexed, stored]
1813 field metadata: json [stored]
1814 field extra: json
1815 }
1816 "#;
1817
1818 let indexes = parse_sdl(sdl).unwrap();
1819 let index = &indexes[0];
1820
1821 assert_eq!(index.fields.len(), 3);
1822
1823 assert_eq!(index.fields[1].name, "metadata");
1825 assert!(matches!(index.fields[1].field_type, FieldType::Json));
1826 assert!(index.fields[1].stored);
1827 assert_eq!(index.fields[2].name, "extra");
1831 assert!(matches!(index.fields[2].field_type, FieldType::Json));
1832
1833 let schema = index.to_schema();
1835 let metadata_field = schema.get_field("metadata").unwrap();
1836 let entry = schema.get_field_entry(metadata_field).unwrap();
1837 assert_eq!(entry.field_type, FieldType::Json);
1838 assert!(!entry.indexed); assert!(entry.stored);
1840 }
1841
1842 #[test]
1843 fn test_sparse_vector_query_config() {
1844 use crate::structures::QueryWeighting;
1845
1846 let sdl = r#"
1847 index documents {
1848 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "Alibaba-NLP/gte-Qwen2-1.5B-instruct", weighting: idf>>]
1849 }
1850 "#;
1851
1852 let indexes = parse_sdl(sdl).unwrap();
1853 let index = &indexes[0];
1854
1855 assert_eq!(index.fields.len(), 1);
1856 assert_eq!(index.fields[0].name, "embedding");
1857 assert!(matches!(
1858 index.fields[0].field_type,
1859 FieldType::SparseVector
1860 ));
1861
1862 let config = index.fields[0].sparse_vector_config.as_ref().unwrap();
1863 assert_eq!(config.index_size, IndexSize::U16);
1864 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1865
1866 let query_config = config.query_config.as_ref().unwrap();
1868 assert_eq!(
1869 query_config.tokenizer.as_deref(),
1870 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1871 );
1872 assert_eq!(query_config.weighting, QueryWeighting::Idf);
1873
1874 let schema = index.to_schema();
1876 let embedding_field = schema.get_field("embedding").unwrap();
1877 let entry = schema.get_field_entry(embedding_field).unwrap();
1878 let sv_config = entry.sparse_vector_config.as_ref().unwrap();
1879 let qc = sv_config.query_config.as_ref().unwrap();
1880 assert_eq!(
1881 qc.tokenizer.as_deref(),
1882 Some("Alibaba-NLP/gte-Qwen2-1.5B-instruct")
1883 );
1884 assert_eq!(qc.weighting, QueryWeighting::Idf);
1885 }
1886
1887 #[test]
1888 fn test_sparse_vector_query_config_weighting_one() {
1889 use crate::structures::QueryWeighting;
1890
1891 let sdl = r#"
1892 index documents {
1893 field embedding: sparse_vector [indexed<query<weighting: one>>]
1894 }
1895 "#;
1896
1897 let indexes = parse_sdl(sdl).unwrap();
1898 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1899
1900 let query_config = config.query_config.as_ref().unwrap();
1901 assert!(query_config.tokenizer.is_none());
1902 assert_eq!(query_config.weighting, QueryWeighting::One);
1903 }
1904
1905 #[test]
1906 fn test_sparse_vector_query_config_weighting_idf_file() {
1907 use crate::structures::QueryWeighting;
1908
1909 let sdl = r#"
1910 index documents {
1911 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<tokenizer: "opensearch-neural-sparse-encoding-v1", weighting: idf_file>>]
1912 }
1913 "#;
1914
1915 let indexes = parse_sdl(sdl).unwrap();
1916 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1917
1918 let query_config = config.query_config.as_ref().unwrap();
1919 assert_eq!(
1920 query_config.tokenizer.as_deref(),
1921 Some("opensearch-neural-sparse-encoding-v1")
1922 );
1923 assert_eq!(query_config.weighting, QueryWeighting::IdfFile);
1924
1925 let schema = indexes[0].to_schema();
1927 let field = schema.get_field("embedding").unwrap();
1928 let entry = schema.get_field_entry(field).unwrap();
1929 let sc = entry.sparse_vector_config.as_ref().unwrap();
1930 let qc = sc.query_config.as_ref().unwrap();
1931 assert_eq!(qc.weighting, QueryWeighting::IdfFile);
1932 }
1933
1934 #[test]
1935 fn test_sparse_vector_query_config_pruning_params() {
1936 let sdl = r#"
1937 index documents {
1938 field embedding: sparse_vector<u16> [indexed<quantization: uint8, query<weighting: idf, weight_threshold: 0.03, max_dims: 25, pruning: 0.2>>]
1939 }
1940 "#;
1941
1942 let indexes = parse_sdl(sdl).unwrap();
1943 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1944
1945 let qc = config.query_config.as_ref().unwrap();
1946 assert_eq!(qc.weighting, QueryWeighting::Idf);
1947 assert!((qc.weight_threshold - 0.03).abs() < 0.001);
1948 assert_eq!(qc.max_query_dims, Some(25));
1949 assert!((qc.pruning.unwrap() - 0.2).abs() < 0.001);
1950
1951 let schema = indexes[0].to_schema();
1953 let field = schema.get_field("embedding").unwrap();
1954 let entry = schema.get_field_entry(field).unwrap();
1955 let sc = entry.sparse_vector_config.as_ref().unwrap();
1956 let rqc = sc.query_config.as_ref().unwrap();
1957 assert!((rqc.weight_threshold - 0.03).abs() < 0.001);
1958 assert_eq!(rqc.max_query_dims, Some(25));
1959 assert!((rqc.pruning.unwrap() - 0.2).abs() < 0.001);
1960 }
1961
1962 #[test]
1963 fn test_sparse_vector_format_maxscore() {
1964 let sdl = r#"
1965 index documents {
1966 field embedding: sparse_vector<u16> [indexed<format: maxscore, quantization: uint8>]
1967 }
1968 "#;
1969
1970 let indexes = parse_sdl(sdl).unwrap();
1971 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1972 assert_eq!(config.format, SparseFormat::MaxScore);
1973 assert_eq!(config.weight_quantization, WeightQuantization::UInt8);
1974
1975 let schema = indexes[0].to_schema();
1977 let field = schema.get_field("embedding").unwrap();
1978 let entry = schema.get_field_entry(field).unwrap();
1979 let sc = entry.sparse_vector_config.as_ref().unwrap();
1980 assert_eq!(sc.format, SparseFormat::MaxScore);
1981 }
1982
1983 #[test]
1984 fn test_sparse_vector_format_bmp() {
1985 let sdl = r#"
1986 index documents {
1987 field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>]
1988 }
1989 "#;
1990
1991 let indexes = parse_sdl(sdl).unwrap();
1992 let config = indexes[0].fields[0].sparse_vector_config.as_ref().unwrap();
1993 assert_eq!(config.format, SparseFormat::Bmp);
1994 }
1995
1996 #[test]
1997 fn test_fast_attribute() {
1998 let sdl = r#"
1999 index products {
2000 field name: text [indexed, stored]
2001 field price: f64 [indexed, fast]
2002 field category: text [indexed, stored, fast]
2003 field count: u64 [fast]
2004 field score: i64 [indexed, stored, fast]
2005 }
2006 "#;
2007
2008 let indexes = parse_sdl(sdl).unwrap();
2009 assert_eq!(indexes.len(), 1);
2010 let index = &indexes[0];
2011 assert_eq!(index.fields.len(), 5);
2012
2013 assert!(!index.fields[0].fast);
2015 assert!(index.fields[1].fast);
2017 assert!(matches!(index.fields[1].field_type, FieldType::F64));
2018 assert!(index.fields[2].fast);
2020 assert!(matches!(index.fields[2].field_type, FieldType::Text));
2021 assert!(index.fields[3].fast);
2023 assert!(matches!(index.fields[3].field_type, FieldType::U64));
2024 assert!(index.fields[4].fast);
2026 assert!(matches!(index.fields[4].field_type, FieldType::I64));
2027
2028 let schema = index.to_schema();
2030 let price_field = schema.get_field("price").unwrap();
2031 assert!(schema.get_field_entry(price_field).unwrap().fast);
2032
2033 let category_field = schema.get_field("category").unwrap();
2034 assert!(schema.get_field_entry(category_field).unwrap().fast);
2035
2036 let name_field = schema.get_field("name").unwrap();
2037 assert!(!schema.get_field_entry(name_field).unwrap().fast);
2038 }
2039
2040 #[test]
2041 fn test_primary_attribute() {
2042 let sdl = r#"
2043 index documents {
2044 field id: text [primary, stored]
2045 field title: text [indexed, stored]
2046 }
2047 "#;
2048
2049 let indexes = parse_sdl(sdl).unwrap();
2050 assert_eq!(indexes.len(), 1);
2051 let index = &indexes[0];
2052 assert_eq!(index.fields.len(), 2);
2053
2054 let id_field = &index.fields[0];
2056 assert!(id_field.primary, "id should be primary");
2057 assert!(id_field.fast, "primary implies fast");
2058 assert!(id_field.indexed, "primary implies indexed");
2059
2060 assert!(!index.fields[1].primary);
2062
2063 let schema = index.to_schema();
2065 let id = schema.get_field("id").unwrap();
2066 let id_entry = schema.get_field_entry(id).unwrap();
2067 assert!(id_entry.primary_key);
2068 assert!(id_entry.fast);
2069 assert!(id_entry.indexed);
2070
2071 let title = schema.get_field("title").unwrap();
2072 assert!(!schema.get_field_entry(title).unwrap().primary_key);
2073
2074 assert_eq!(schema.primary_field(), Some(id));
2076 }
2077
2078 #[test]
2079 fn test_primary_with_other_attributes() {
2080 let sdl = r#"
2081 index documents {
2082 field id: text<simple> [primary, indexed, stored]
2083 field body: text [indexed]
2084 }
2085 "#;
2086
2087 let indexes = parse_sdl(sdl).unwrap();
2088 let id_field = &indexes[0].fields[0];
2089 assert!(id_field.primary);
2090 assert!(id_field.indexed);
2091 assert!(id_field.stored);
2092 assert!(id_field.fast);
2093 assert_eq!(id_field.tokenizer, Some("simple".to_string()));
2094 }
2095
2096 #[test]
2097 fn test_primary_only_one_allowed() {
2098 let sdl = r#"
2099 index documents {
2100 field id: text [primary]
2101 field alt_id: text [primary]
2102 }
2103 "#;
2104
2105 let result = parse_sdl(sdl);
2106 assert!(result.is_err());
2107 let err = result.unwrap_err().to_string();
2108 assert!(
2109 err.contains("primary key"),
2110 "Error should mention primary key: {}",
2111 err
2112 );
2113 }
2114
2115 #[test]
2116 fn test_primary_must_be_text() {
2117 let sdl = r#"
2118 index documents {
2119 field id: u64 [primary]
2120 }
2121 "#;
2122
2123 let result = parse_sdl(sdl);
2124 assert!(result.is_err());
2125 let err = result.unwrap_err().to_string();
2126 assert!(
2127 err.contains("text"),
2128 "Error should mention text type: {}",
2129 err
2130 );
2131 }
2132
2133 #[test]
2134 fn test_primary_cannot_be_multi() {
2135 let sdl = r#"
2136 index documents {
2137 field id: text [primary, stored<multi>]
2138 }
2139 "#;
2140
2141 let result = parse_sdl(sdl);
2142 assert!(result.is_err());
2143 let err = result.unwrap_err().to_string();
2144 assert!(err.contains("multi"), "Error should mention multi: {}", err);
2145 }
2146
2147 #[test]
2148 fn test_no_primary_field() {
2149 let sdl = r#"
2151 index documents {
2152 field title: text [indexed, stored]
2153 }
2154 "#;
2155
2156 let indexes = parse_sdl(sdl).unwrap();
2157 let schema = indexes[0].to_schema();
2158 assert!(schema.primary_field().is_none());
2159 }
2160
2161 #[test]
2162 fn test_reorder_attribute() {
2163 let sdl = r#"
2164 index documents {
2165 field embedding: sparse_vector<u16> [indexed<format: bmp, quantization: uint8>, reorder]
2166 field embedding2: sparse_vector [indexed<format: bmp>]
2167 }
2168 "#;
2169
2170 let indexes = parse_sdl(sdl).unwrap();
2171 assert_eq!(indexes[0].fields.len(), 2);
2172
2173 assert!(indexes[0].fields[0].reorder);
2175 assert!(!indexes[0].fields[1].reorder);
2177
2178 let schema = indexes[0].to_schema();
2180 let f1 = schema.get_field("embedding").unwrap();
2181 assert!(schema.get_field_entry(f1).unwrap().reorder);
2182
2183 let f2 = schema.get_field("embedding2").unwrap();
2184 assert!(!schema.get_field_entry(f2).unwrap().reorder);
2185 }
2186}