hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//! }
31//! ```
32
33use pest::Parser;
34use pest_derive::Parser;
35
36use super::query_field_router::{QueryRouterRule, RoutingMode};
37use super::schema::{FieldType, Schema, SchemaBuilder};
38use crate::Result;
39use crate::error::Error;
40
/// Parser type for the SDL grammar.
///
/// The `Parser` derive generates the parsing code, together with the `Rule`
/// enum used throughout this module, from the grammar file named in the
/// `grammar` attribute.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
44
45use crate::structures::{IndexSize, SparseVectorConfig, WeightQuantization};
46
/// Parsed field definition
///
/// One `field <name>: <type> ...` entry of an index, as produced by the SDL
/// parser and consumed by [`IndexDef::to_schema`].
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name as written in the SDL
    pub name: String,
    /// Resolved field type (type aliases have already been mapped)
    pub field_type: FieldType,
    /// Whether the field carries the `indexed` attribute
    pub indexed: bool,
    /// Whether the field carries the `stored` attribute
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
    /// Configuration for sparse vector fields
    pub sparse_vector_config: Option<SparseVectorConfig>,
}
61
/// Parsed index definition
///
/// Holds everything found inside one `index <name> { ... }` block.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name as written after the `index` keyword
    pub name: String,
    /// Field definitions in declaration order
    pub fields: Vec<FieldDef>,
    /// Names listed in the default-fields definition; empty when none was given
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
71
72impl IndexDef {
73    /// Convert to a Schema
74    pub fn to_schema(&self) -> Schema {
75        let mut builder = SchemaBuilder::default();
76
77        for field in &self.fields {
78            let f = match field.field_type {
79                FieldType::Text => {
80                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
81                    builder.add_text_field_with_tokenizer(
82                        &field.name,
83                        field.indexed,
84                        field.stored,
85                        tokenizer,
86                    )
87                }
88                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
89                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
90                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
91                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
92                FieldType::SparseVector => {
93                    if let Some(config) = &field.sparse_vector_config {
94                        builder.add_sparse_vector_field_with_config(
95                            &field.name,
96                            field.indexed,
97                            field.stored,
98                            *config,
99                        )
100                    } else {
101                        builder.add_sparse_vector_field(&field.name, field.indexed, field.stored)
102                    }
103                }
104            };
105            if field.multi {
106                builder.set_multi(f, true);
107            }
108        }
109
110        // Set default fields if specified
111        if !self.default_fields.is_empty() {
112            builder.set_default_fields(self.default_fields.clone());
113        }
114
115        // Set query routers if specified
116        if !self.query_routers.is_empty() {
117            builder.set_query_routers(self.query_routers.clone());
118        }
119
120        builder.build()
121    }
122
123    /// Create a QueryFieldRouter from the query router rules
124    ///
125    /// Returns None if there are no query router rules defined.
126    /// Returns Err if any regex pattern is invalid.
127    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
128        if self.query_routers.is_empty() {
129            return Ok(None);
130        }
131
132        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
133            .map(Some)
134            .map_err(Error::Schema)
135    }
136}
137
138/// Parse field type from string
139fn parse_field_type(type_str: &str) -> Result<FieldType> {
140    match type_str {
141        "text" | "string" | "str" => Ok(FieldType::Text),
142        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
143        "i64" | "int" | "integer" => Ok(FieldType::I64),
144        "f64" | "float" | "double" => Ok(FieldType::F64),
145        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
146        "sparse_vector" => Ok(FieldType::SparseVector),
147        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
148    }
149}
150
151/// Parse attributes from pest pair
152/// Returns (indexed, stored, multi)
153fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool) {
154    let mut indexed = false;
155    let mut stored = false;
156    let mut multi = false;
157
158    for attr in pair.into_inner() {
159        match attr.as_str() {
160            "indexed" => indexed = true,
161            "stored" => stored = true,
162            "multi" => multi = true,
163            _ => {}
164        }
165    }
166
167    (indexed, stored, multi)
168}
169
170/// Parse a field definition from pest pair
171fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
172    let mut inner = pair.into_inner();
173
174    let name = inner
175        .next()
176        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
177        .as_str()
178        .to_string();
179
180    let field_type_str = inner
181        .next()
182        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
183        .as_str();
184
185    let field_type = parse_field_type(field_type_str)?;
186
187    // Parse optional tokenizer spec, sparse_vector_config, and attributes
188    let mut tokenizer = None;
189    let mut sparse_vector_config = None;
190    let mut indexed = true;
191    let mut stored = true;
192    let mut multi = false;
193
194    for item in inner {
195        match item.as_rule() {
196            Rule::tokenizer_spec => {
197                // Extract tokenizer name from <name>
198                if let Some(tok_name) = item.into_inner().next() {
199                    tokenizer = Some(tok_name.as_str().to_string());
200                }
201            }
202            Rule::sparse_vector_config => {
203                // Parse <index_size, quantization>
204                let mut config_inner = item.into_inner();
205                let index_size = if let Some(size_pair) = config_inner.next() {
206                    match size_pair.as_str() {
207                        "u16" => IndexSize::U16,
208                        "u32" => IndexSize::U32,
209                        _ => IndexSize::default(),
210                    }
211                } else {
212                    IndexSize::default()
213                };
214                let quantization = if let Some(quant_pair) = config_inner.next() {
215                    match quant_pair.as_str() {
216                        "float32" | "f32" => WeightQuantization::Float32,
217                        "float16" | "f16" => WeightQuantization::Float16,
218                        "uint8" | "u8" => WeightQuantization::UInt8,
219                        "uint4" | "u4" => WeightQuantization::UInt4,
220                        _ => WeightQuantization::default(),
221                    }
222                } else {
223                    WeightQuantization::default()
224                };
225                sparse_vector_config = Some(SparseVectorConfig {
226                    index_size,
227                    weight_quantization: quantization,
228                });
229            }
230            Rule::attributes => {
231                let (idx, sto, mul) = parse_attributes(item);
232                indexed = idx;
233                stored = sto;
234                multi = mul;
235            }
236            _ => {}
237        }
238    }
239
240    Ok(FieldDef {
241        name,
242        field_type,
243        indexed,
244        stored,
245        tokenizer,
246        multi,
247        sparse_vector_config,
248    })
249}
250
251/// Parse default_fields definition
252fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
253    pair.into_inner().map(|p| p.as_str().to_string()).collect()
254}
255
256/// Parse a query router definition
257fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
258    let mut pattern = String::new();
259    let mut substitution = String::new();
260    let mut target_field = String::new();
261    let mut mode = RoutingMode::Additional;
262
263    for prop in pair.into_inner() {
264        if prop.as_rule() != Rule::query_router_prop {
265            continue;
266        }
267
268        for inner in prop.into_inner() {
269            match inner.as_rule() {
270                Rule::query_router_pattern => {
271                    if let Some(regex_str) = inner.into_inner().next() {
272                        pattern = parse_string_value(regex_str);
273                    }
274                }
275                Rule::query_router_substitution => {
276                    if let Some(quoted) = inner.into_inner().next() {
277                        substitution = parse_string_value(quoted);
278                    }
279                }
280                Rule::query_router_target => {
281                    if let Some(ident) = inner.into_inner().next() {
282                        target_field = ident.as_str().to_string();
283                    }
284                }
285                Rule::query_router_mode => {
286                    if let Some(mode_val) = inner.into_inner().next() {
287                        mode = match mode_val.as_str() {
288                            "exclusive" => RoutingMode::Exclusive,
289                            "additional" => RoutingMode::Additional,
290                            _ => RoutingMode::Additional,
291                        };
292                    }
293                }
294                _ => {}
295            }
296        }
297    }
298
299    if pattern.is_empty() {
300        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
301    }
302    if substitution.is_empty() {
303        return Err(Error::Schema(
304            "query_router missing 'substitution'".to_string(),
305        ));
306    }
307    if target_field.is_empty() {
308        return Err(Error::Schema(
309            "query_router missing 'target_field'".to_string(),
310        ));
311    }
312
313    Ok(QueryRouterRule {
314        pattern,
315        substitution,
316        target_field,
317        mode,
318    })
319}
320
321/// Parse a string value from quoted_string, raw_string, or regex_string
322fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
323    let s = pair.as_str();
324    match pair.as_rule() {
325        Rule::regex_string => {
326            // regex_string contains either raw_string or quoted_string
327            if let Some(inner) = pair.into_inner().next() {
328                parse_string_value(inner)
329            } else {
330                s.to_string()
331            }
332        }
333        Rule::raw_string => {
334            // r"..." - strip r" prefix and " suffix
335            s[2..s.len() - 1].to_string()
336        }
337        Rule::quoted_string => {
338            // "..." - strip quotes and handle escapes
339            let inner = &s[1..s.len() - 1];
340            // Simple escape handling
341            inner
342                .replace("\\n", "\n")
343                .replace("\\t", "\t")
344                .replace("\\\"", "\"")
345                .replace("\\\\", "\\")
346        }
347        _ => s.to_string(),
348    }
349}
350
351/// Parse an index definition from pest pair
352fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
353    let mut inner = pair.into_inner();
354
355    let name = inner
356        .next()
357        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
358        .as_str()
359        .to_string();
360
361    let mut fields = Vec::new();
362    let mut default_fields = Vec::new();
363    let mut query_routers = Vec::new();
364
365    for item in inner {
366        match item.as_rule() {
367            Rule::field_def => {
368                fields.push(parse_field_def(item)?);
369            }
370            Rule::default_fields_def => {
371                default_fields = parse_default_fields_def(item);
372            }
373            Rule::query_router_def => {
374                query_routers.push(parse_query_router_def(item)?);
375            }
376            _ => {}
377        }
378    }
379
380    Ok(IndexDef {
381        name,
382        fields,
383        default_fields,
384        query_routers,
385    })
386}
387
388/// Parse SDL from a string
389pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
390    let pairs = SdlParser::parse(Rule::file, input)
391        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
392
393    let mut indexes = Vec::new();
394
395    for pair in pairs {
396        if pair.as_rule() == Rule::file {
397            for inner in pair.into_inner() {
398                if inner.as_rule() == Rule::index_def {
399                    indexes.push(parse_index_def(inner)?);
400                }
401            }
402        }
403    }
404
405    Ok(indexes)
406}
407
408/// Parse SDL and return a single index definition
409pub fn parse_single_index(input: &str) -> Result<IndexDef> {
410    let indexes = parse_sdl(input)?;
411
412    if indexes.is_empty() {
413        return Err(Error::Schema("No index definition found".to_string()));
414    }
415
416    if indexes.len() > 1 {
417        return Err(Error::Schema(
418            "Multiple index definitions found, expected one".to_string(),
419        ));
420    }
421
422    Ok(indexes.into_iter().next().unwrap())
423}
424
/// Unit tests for the SDL parser and the conversion of parsed definitions
/// into schemas.
#[cfg(test)]
mod tests {
    use super::*;

    /// Two text fields: checks the index name, field names, types and the
    /// indexed/stored flags taken from each attribute list.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert!(matches!(index.fields[1].field_type, FieldType::Text));
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }

    /// Each canonical type name (text, u64, i64, f64, bytes) maps to the
    /// matching `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::U64));
        assert!(matches!(index.fields[2].field_type, FieldType::I64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// `#` comments before the index, between fields and after a field do
    /// not prevent parsing; both fields are still found.
    ///
    /// NOTE(review): the SDL text itself says inline comments are "not
    /// supported yet", yet this test requires the parse to succeed — confirm
    /// which of the two is intended.
    #[test]
    fn test_parse_with_comments() {
        let sdl = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);
    }

    /// Type aliases (string, int, uint, float, binary) resolve to the same
    /// `FieldType` variants as their canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let sdl = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::I64));
        assert!(matches!(index.fields[2].field_type, FieldType::U64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// A schema built from a parsed index exposes the declared fields by
    /// name and reports unknown names as absent.
    #[test]
    fn test_to_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field without an attribute list is both indexed and stored.
    #[test]
    fn test_default_attributes() {
        let sdl = r#"
            index test {
                field title: text
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let field = &indexes[0].fields[0];

        // Default should be indexed and stored
        assert!(field.indexed);
        assert!(field.stored);
    }

    /// Several index blocks in one input are returned in source order.
    #[test]
    fn test_multiple_indexes() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 2);
        assert_eq!(indexes[0].name, "articles");
        assert_eq!(indexes[1].name, "users");
    }

    /// `text<name>` records the tokenizer name on the field; a plain `text`
    /// field leaves it as `None`.
    #[test]
    fn test_tokenizer_spec() {
        let sdl = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));

        assert_eq!(index.fields[2].name, "author");
        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
    }

    /// The tokenizer given in the SDL is carried through to the schema's
    /// field entries.
    #[test]
    fn test_tokenizer_in_schema() {
        let sdl = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        let title_field = schema.get_field("title").unwrap();
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert_eq!(title_entry.tokenizer, Some("german".to_string()));

        let body_field = schema.get_field("body").unwrap();
        let body_entry = schema.get_field_entry(body_field).unwrap();
        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
    }

    /// A query router with a quoted (escaped) pattern: checks that the
    /// escapes are decoded and that substitution, target and mode are read.
    #[test]
    fn test_query_router_basic() {
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.query_routers.len(), 1);
        let router = &index.query_routers[0];
        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(router.substitution, "doi://{0}");
        assert_eq!(router.target_field, "uris");
        assert_eq!(router.mode, RoutingMode::Exclusive);
    }

    /// A query router whose pattern is a raw string (`r"..."`): the pattern
    /// text is taken verbatim, without escape processing.
    #[test]
    fn test_query_router_raw_string() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let router = &indexes[0].query_routers[0];

        assert_eq!(router.pattern, r"^pmid:(\d+)$");
        assert_eq!(router.substitution, "pubmed://{1}");
        assert_eq!(router.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block inside an index is collected.
    #[test]
    fn test_multiple_query_routers() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers.len(), 3);
    }

    /// Omitting `mode` from a query router yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        // Default mode should be Additional
        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// The `multi` attribute is recorded on the parsed field and preserved
    /// on the schema's field entry.
    #[test]
    fn test_multi_attribute() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored, multi]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let fields = &indexes[0].fields;
        assert_eq!(fields.len(), 2);

        // uris should have multi=true
        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        // title should have multi=false
        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        // Verify schema conversion preserves multi attribute
        let schema = indexes[0].to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }

    /// A bare `sparse_vector` field parses to `FieldType::SparseVector` with
    /// no explicit config attached.
    #[test]
    fn test_sparse_vector_field() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);
        assert_eq!(indexes[0].fields.len(), 1);
        assert_eq!(indexes[0].fields[0].name, "embedding");
        assert_eq!(indexes[0].fields[0].field_type, FieldType::SparseVector);
        assert!(indexes[0].fields[0].sparse_vector_config.is_none());
    }

    /// `sparse_vector<index_size, quantization>` populates the field's
    /// config with the requested index size and weight quantization.
    #[test]
    fn test_sparse_vector_with_config() {
        let sdl = r#"
            index documents {
                field embedding: sparse_vector<u16, uint8> [indexed, stored]
                field dense: sparse_vector<u32, float32> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);

        // First field: u16 indices, uint8 quantization
        let f1 = &indexes[0].fields[0];
        assert_eq!(f1.name, "embedding");
        let config1 = f1.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config1.index_size, IndexSize::U16);
        assert_eq!(config1.weight_quantization, WeightQuantization::UInt8);

        // Second field: u32 indices, float32 quantization
        let f2 = &indexes[0].fields[1];
        assert_eq!(f2.name, "dense");
        let config2 = f2.sparse_vector_config.as_ref().unwrap();
        assert_eq!(config2.index_size, IndexSize::U32);
        assert_eq!(config2.weight_quantization, WeightQuantization::Float32);
    }
}