1use pest::Parser;
34use pest_derive::Parser;
35
36use super::query_field_router::{QueryRouterRule, RoutingMode};
37use super::schema::{FieldType, Schema, SchemaBuilder};
38use crate::Result;
39use crate::error::Error;
40
/// Pest parser for the schema definition language (SDL).
///
/// The parser is derived from the grammar file `dsl/sdl/sdl.pest`; the parsing
/// functions in this file consume its `Rule` values.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
44
45use crate::structures::{IndexSize, SparseVectorConfig, WeightQuantization};
46
/// A single `field` declaration parsed from an SDL index definition.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL source.
    pub name: String,
    /// Value type resolved from the declared type keyword.
    pub field_type: FieldType,
    /// `true` when the field carries the `indexed` attribute (or has no
    /// attribute list at all, in which case it defaults to `true`).
    pub indexed: bool,
    /// `true` when the field carries the `stored` attribute (or has no
    /// attribute list at all, in which case it defaults to `true`).
    pub stored: bool,
    /// Tokenizer name taken from a tokenizer spec, if one was given.
    pub tokenizer: Option<String>,
    /// `true` when the field carries the `multi` attribute.
    pub multi: bool,
    /// Sparse-vector settings taken from a sparse-vector config spec, if one was given.
    pub sparse_vector_config: Option<SparseVectorConfig>,
}
61
/// A complete `index` definition parsed from SDL.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written in the SDL source.
    pub name: String,
    /// Field declarations, in source order.
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields declaration (empty if absent).
    pub default_fields: Vec<String>,
    /// Query router rules, one per `query_router` block, in source order.
    pub query_routers: Vec<QueryRouterRule>,
}
71
72impl IndexDef {
73 pub fn to_schema(&self) -> Schema {
75 let mut builder = SchemaBuilder::default();
76
77 for field in &self.fields {
78 let f = match field.field_type {
79 FieldType::Text => {
80 let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
81 builder.add_text_field_with_tokenizer(
82 &field.name,
83 field.indexed,
84 field.stored,
85 tokenizer,
86 )
87 }
88 FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
89 FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
90 FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
91 FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
92 FieldType::SparseVector => {
93 if let Some(config) = &field.sparse_vector_config {
94 builder.add_sparse_vector_field_with_config(
95 &field.name,
96 field.indexed,
97 field.stored,
98 *config,
99 )
100 } else {
101 builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
102 }
103 }
104 };
105 if field.multi {
106 builder.set_multi(f, true);
107 }
108 }
109
110 if !self.default_fields.is_empty() {
112 builder.set_default_fields(self.default_fields.clone());
113 }
114
115 if !self.query_routers.is_empty() {
117 builder.set_query_routers(self.query_routers.clone());
118 }
119
120 builder.build()
121 }
122
123 pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
128 if self.query_routers.is_empty() {
129 return Ok(None);
130 }
131
132 super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
133 .map(Some)
134 .map_err(Error::Schema)
135 }
136}
137
138fn parse_field_type(type_str: &str) -> Result<FieldType> {
140 match type_str {
141 "text" | "string" | "str" => Ok(FieldType::Text),
142 "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
143 "i64" | "int" | "integer" => Ok(FieldType::I64),
144 "f64" | "float" | "double" => Ok(FieldType::F64),
145 "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
146 "sparse_vector" => Ok(FieldType::SparseVector),
147 _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
148 }
149}
150
151fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool) {
154 let mut indexed = false;
155 let mut stored = false;
156 let mut multi = false;
157
158 for attr in pair.into_inner() {
159 match attr.as_str() {
160 "indexed" => indexed = true,
161 "stored" => stored = true,
162 "multi" => multi = true,
163 _ => {}
164 }
165 }
166
167 (indexed, stored, multi)
168}
169
170fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
172 let mut inner = pair.into_inner();
173
174 let name = inner
175 .next()
176 .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
177 .as_str()
178 .to_string();
179
180 let field_type_str = inner
181 .next()
182 .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
183 .as_str();
184
185 let field_type = parse_field_type(field_type_str)?;
186
187 let mut tokenizer = None;
189 let mut sparse_vector_config = None;
190 let mut indexed = true;
191 let mut stored = true;
192 let mut multi = false;
193
194 for item in inner {
195 match item.as_rule() {
196 Rule::tokenizer_spec => {
197 if let Some(tok_name) = item.into_inner().next() {
199 tokenizer = Some(tok_name.as_str().to_string());
200 }
201 }
202 Rule::sparse_vector_config => {
203 let mut config_inner = item.into_inner();
205 let index_size = if let Some(size_pair) = config_inner.next() {
206 match size_pair.as_str() {
207 "u16" => IndexSize::U16,
208 "u32" => IndexSize::U32,
209 _ => IndexSize::default(),
210 }
211 } else {
212 IndexSize::default()
213 };
214 let quantization = if let Some(quant_pair) = config_inner.next() {
215 match quant_pair.as_str() {
216 "float32" | "f32" => WeightQuantization::Float32,
217 "float16" | "f16" => WeightQuantization::Float16,
218 "uint8" | "u8" => WeightQuantization::UInt8,
219 "uint4" | "u4" => WeightQuantization::UInt4,
220 _ => WeightQuantization::default(),
221 }
222 } else {
223 WeightQuantization::default()
224 };
225 sparse_vector_config = Some(SparseVectorConfig {
226 index_size,
227 weight_quantization: quantization,
228 });
229 }
230 Rule::attributes => {
231 let (idx, sto, mul) = parse_attributes(item);
232 indexed = idx;
233 stored = sto;
234 multi = mul;
235 }
236 _ => {}
237 }
238 }
239
240 Ok(FieldDef {
241 name,
242 field_type,
243 indexed,
244 stored,
245 tokenizer,
246 multi,
247 sparse_vector_config,
248 })
249}
250
251fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
253 pair.into_inner().map(|p| p.as_str().to_string()).collect()
254}
255
256fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
258 let mut pattern = String::new();
259 let mut substitution = String::new();
260 let mut target_field = String::new();
261 let mut mode = RoutingMode::Additional;
262
263 for prop in pair.into_inner() {
264 if prop.as_rule() != Rule::query_router_prop {
265 continue;
266 }
267
268 for inner in prop.into_inner() {
269 match inner.as_rule() {
270 Rule::query_router_pattern => {
271 if let Some(regex_str) = inner.into_inner().next() {
272 pattern = parse_string_value(regex_str);
273 }
274 }
275 Rule::query_router_substitution => {
276 if let Some(quoted) = inner.into_inner().next() {
277 substitution = parse_string_value(quoted);
278 }
279 }
280 Rule::query_router_target => {
281 if let Some(ident) = inner.into_inner().next() {
282 target_field = ident.as_str().to_string();
283 }
284 }
285 Rule::query_router_mode => {
286 if let Some(mode_val) = inner.into_inner().next() {
287 mode = match mode_val.as_str() {
288 "exclusive" => RoutingMode::Exclusive,
289 "additional" => RoutingMode::Additional,
290 _ => RoutingMode::Additional,
291 };
292 }
293 }
294 _ => {}
295 }
296 }
297 }
298
299 if pattern.is_empty() {
300 return Err(Error::Schema("query_router missing 'pattern'".to_string()));
301 }
302 if substitution.is_empty() {
303 return Err(Error::Schema(
304 "query_router missing 'substitution'".to_string(),
305 ));
306 }
307 if target_field.is_empty() {
308 return Err(Error::Schema(
309 "query_router missing 'target_field'".to_string(),
310 ));
311 }
312
313 Ok(QueryRouterRule {
314 pattern,
315 substitution,
316 target_field,
317 mode,
318 })
319}
320
321fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
323 let s = pair.as_str();
324 match pair.as_rule() {
325 Rule::regex_string => {
326 if let Some(inner) = pair.into_inner().next() {
328 parse_string_value(inner)
329 } else {
330 s.to_string()
331 }
332 }
333 Rule::raw_string => {
334 s[2..s.len() - 1].to_string()
336 }
337 Rule::quoted_string => {
338 let inner = &s[1..s.len() - 1];
340 inner
342 .replace("\\n", "\n")
343 .replace("\\t", "\t")
344 .replace("\\\"", "\"")
345 .replace("\\\\", "\\")
346 }
347 _ => s.to_string(),
348 }
349}
350
351fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
353 let mut inner = pair.into_inner();
354
355 let name = inner
356 .next()
357 .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
358 .as_str()
359 .to_string();
360
361 let mut fields = Vec::new();
362 let mut default_fields = Vec::new();
363 let mut query_routers = Vec::new();
364
365 for item in inner {
366 match item.as_rule() {
367 Rule::field_def => {
368 fields.push(parse_field_def(item)?);
369 }
370 Rule::default_fields_def => {
371 default_fields = parse_default_fields_def(item);
372 }
373 Rule::query_router_def => {
374 query_routers.push(parse_query_router_def(item)?);
375 }
376 _ => {}
377 }
378 }
379
380 Ok(IndexDef {
381 name,
382 fields,
383 default_fields,
384 query_routers,
385 })
386}
387
388pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
390 let pairs = SdlParser::parse(Rule::file, input)
391 .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
392
393 let mut indexes = Vec::new();
394
395 for pair in pairs {
396 if pair.as_rule() == Rule::file {
397 for inner in pair.into_inner() {
398 if inner.as_rule() == Rule::index_def {
399 indexes.push(parse_index_def(inner)?);
400 }
401 }
402 }
403 }
404
405 Ok(indexes)
406}
407
408pub fn parse_single_index(input: &str) -> Result<IndexDef> {
410 let indexes = parse_sdl(input)?;
411
412 if indexes.is_empty() {
413 return Err(Error::Schema("No index definition found".to_string()));
414 }
415
416 if indexes.len() > 1 {
417 return Err(Error::Schema(
418 "Multiple index definitions found, expected one".to_string(),
419 ));
420 }
421
422 Ok(indexes.into_iter().next().unwrap())
423}
424
/// Unit tests for SDL parsing and schema conversion.
#[cfg(test)]
mod tests {
    use super::*;

    /// Two text fields with explicit attribute lists: checks names, types,
    /// and that an attribute list without `stored` yields `stored == false`.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert!(matches!(index.fields[1].field_type, FieldType::Text));
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }

    /// Each canonical type keyword maps to the matching `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::U64));
        assert!(matches!(index.fields[2].field_type, FieldType::I64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// `#` comments (on their own line and after a field) do not affect the
    /// number of parsed fields.
    #[test]
    fn test_parse_with_comments() {
        let sdl = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);
    }

    /// Alias keywords (`string`, `int`, `uint`, `float`, `binary`) resolve to
    /// the same variants as their canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let sdl = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::I64));
        assert!(matches!(index.fields[2].field_type, FieldType::U64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// Fields declared in SDL can be looked up by name in the built schema;
    /// undeclared names cannot.
    #[test]
    fn test_to_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field without an attribute list defaults to indexed and stored.
    #[test]
    fn test_default_attributes() {
        let sdl = r#"
            index test {
                field title: text
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let field = &indexes[0].fields[0];

        assert!(field.indexed);
        assert!(field.stored);
    }

    /// Several index blocks in one input are returned in source order.
    #[test]
    fn test_multiple_indexes() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 2);
        assert_eq!(indexes[0].name, "articles");
        assert_eq!(indexes[1].name, "users");
    }

    /// `text<name>` records the tokenizer name; plain `text` leaves it `None`.
    #[test]
    fn test_tokenizer_spec() {
        let sdl = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));

        assert_eq!(index.fields[2].name, "author");
        assert_eq!(index.fields[2].tokenizer, None);
    }

    /// Tokenizer names given in SDL are carried through to the schema's
    /// field entries.
    #[test]
    fn test_tokenizer_in_schema() {
        let sdl = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        let title_field = schema.get_field("title").unwrap();
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert_eq!(title_entry.tokenizer, Some("german".to_string()));

        let body_field = schema.get_field("body").unwrap();
        let body_entry = schema.get_field_entry(body_field).unwrap();
        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
    }

    /// A quoted pattern has its `\\` escapes decoded to single backslashes,
    /// and all four router properties are captured.
    #[test]
    fn test_query_router_basic() {
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.query_routers.len(), 1);
        let router = &index.query_routers[0];
        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(router.substitution, "doi://{0}");
        assert_eq!(router.target_field, "uris");
        assert_eq!(router.mode, RoutingMode::Exclusive);
    }

    /// A raw-string pattern (`r"..."`) is taken verbatim, backslashes included.
    #[test]
    fn test_query_router_raw_string() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let router = &indexes[0].query_routers[0];

        assert_eq!(router.pattern, r"^pmid:(\d+)$");
        assert_eq!(router.substitution, "pubmed://{1}");
        assert_eq!(router.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block in an index produces one rule.
    #[test]
    fn test_multiple_query_routers() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers.len(), 3);
    }

    /// Omitting `mode` yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// The `multi` attribute is recorded on the parsed field and carried
    /// through to the schema's field entry.
    #[test]
    fn test_multi_attribute() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored, multi]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let fields = &indexes[0].fields;
        assert_eq!(fields.len(), 2);

        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        let schema = indexes[0].to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }

    /// A bare `sparse_vector` type parses with no sparse-vector config.
    #[test]
    fn test_sparse_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);
        assert_eq!(indexes[0].fields[0].name, "embedding");
        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
    }

    /// `sparse_vector<size, quantization>` populates both config values.
    #[test]
    fn test_sparse_vector_with_config() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16, uint8> [indexed, stored]
                field dense: sparse_vector<u32, float32> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);

        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "dense");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
    }
}