hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!     
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!     
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!     
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!     
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!     
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!     
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//! }
31//! ```
32
33use pest::Parser;
34use pest_derive::Parser;
35
36use super::query_field_router::{QueryRouterRule, RoutingMode};
37use super::schema::{FieldType, Schema, SchemaBuilder};
38use crate::Result;
39use crate::error::Error;
40
41#[derive(Parser)]
42#[grammar = "dsl/sdl/sdl.pest"]
43pub struct SdlParser;
44
45/// Parsed field definition
46#[derive(Debug, Clone)]
47pub struct FieldDef {
48    pub name: String,
49    pub field_type: FieldType,
50    pub indexed: bool,
51    pub stored: bool,
52    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
53    pub tokenizer: Option<String>,
54    /// Whether this field can have multiple values (serialized as array in JSON)
55    pub multi: bool,
56}
57
58/// Parsed index definition
59#[derive(Debug, Clone)]
60pub struct IndexDef {
61    pub name: String,
62    pub fields: Vec<FieldDef>,
63    pub default_fields: Vec<String>,
64    /// Query router rules for routing queries to specific fields
65    pub query_routers: Vec<QueryRouterRule>,
66}
67
68impl IndexDef {
69    /// Convert to a Schema
70    pub fn to_schema(&self) -> Schema {
71        let mut builder = SchemaBuilder::default();
72
73        for field in &self.fields {
74            let f = match field.field_type {
75                FieldType::Text => {
76                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
77                    builder.add_text_field_with_tokenizer(
78                        &field.name,
79                        field.indexed,
80                        field.stored,
81                        tokenizer,
82                    )
83                }
84                FieldType::U64 => {
85                    builder.add_u64_field(&field.name, field.indexed, field.stored)
86                }
87                FieldType::I64 => {
88                    builder.add_i64_field(&field.name, field.indexed, field.stored)
89                }
90                FieldType::F64 => {
91                    builder.add_f64_field(&field.name, field.indexed, field.stored)
92                }
93                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
94            };
95            if field.multi {
96                builder.set_multi(f, true);
97            }
98        }
99
100        // Set default fields if specified
101        if !self.default_fields.is_empty() {
102            builder.set_default_fields(self.default_fields.clone());
103        }
104
105        // Set query routers if specified
106        if !self.query_routers.is_empty() {
107            builder.set_query_routers(self.query_routers.clone());
108        }
109
110        builder.build()
111    }
112
113    /// Create a QueryFieldRouter from the query router rules
114    ///
115    /// Returns None if there are no query router rules defined.
116    /// Returns Err if any regex pattern is invalid.
117    pub fn to_query_router(
118        &self,
119    ) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
120        if self.query_routers.is_empty() {
121            return Ok(None);
122        }
123
124        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
125            .map(Some)
126            .map_err(|e| Error::Schema(e))
127    }
128}
129
130/// Parse field type from string
131fn parse_field_type(type_str: &str) -> Result<FieldType> {
132    match type_str {
133        "text" | "string" | "str" => Ok(FieldType::Text),
134        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
135        "i64" | "int" | "integer" => Ok(FieldType::I64),
136        "f64" | "float" | "double" => Ok(FieldType::F64),
137        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
138        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
139    }
140}
141
142/// Parse attributes from pest pair
143/// Returns (indexed, stored, multi)
144fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool) {
145    let mut indexed = false;
146    let mut stored = false;
147    let mut multi = false;
148
149    for attr in pair.into_inner() {
150        match attr.as_str() {
151            "indexed" => indexed = true,
152            "stored" => stored = true,
153            "multi" => multi = true,
154            _ => {}
155        }
156    }
157
158    (indexed, stored, multi)
159}
160
161/// Parse a field definition from pest pair
162fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
163    let mut inner = pair.into_inner();
164
165    let name = inner
166        .next()
167        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
168        .as_str()
169        .to_string();
170
171    let field_type_str = inner
172        .next()
173        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
174        .as_str();
175
176    let field_type = parse_field_type(field_type_str)?;
177
178    // Parse optional tokenizer spec and attributes
179    let mut tokenizer = None;
180    let mut indexed = true;
181    let mut stored = true;
182    let mut multi = false;
183
184    for item in inner {
185        match item.as_rule() {
186            Rule::tokenizer_spec => {
187                // Extract tokenizer name from <name>
188                if let Some(tok_name) = item.into_inner().next() {
189                    tokenizer = Some(tok_name.as_str().to_string());
190                }
191            }
192            Rule::attributes => {
193                let (idx, sto, mul) = parse_attributes(item);
194                indexed = idx;
195                stored = sto;
196                multi = mul;
197            }
198            _ => {}
199        }
200    }
201
202    Ok(FieldDef {
203        name,
204        field_type,
205        indexed,
206        stored,
207        tokenizer,
208        multi,
209    })
210}
211
212/// Parse default_fields definition
213fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
214    pair.into_inner().map(|p| p.as_str().to_string()).collect()
215}
216
217/// Parse a query router definition
218fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
219    let mut pattern = String::new();
220    let mut substitution = String::new();
221    let mut target_field = String::new();
222    let mut mode = RoutingMode::Additional;
223
224    for prop in pair.into_inner() {
225        if prop.as_rule() != Rule::query_router_prop {
226            continue;
227        }
228
229        for inner in prop.into_inner() {
230            match inner.as_rule() {
231                Rule::query_router_pattern => {
232                    if let Some(regex_str) = inner.into_inner().next() {
233                        pattern = parse_string_value(regex_str);
234                    }
235                }
236                Rule::query_router_substitution => {
237                    if let Some(quoted) = inner.into_inner().next() {
238                        substitution = parse_string_value(quoted);
239                    }
240                }
241                Rule::query_router_target => {
242                    if let Some(ident) = inner.into_inner().next() {
243                        target_field = ident.as_str().to_string();
244                    }
245                }
246                Rule::query_router_mode => {
247                    if let Some(mode_val) = inner.into_inner().next() {
248                        mode = match mode_val.as_str() {
249                            "exclusive" => RoutingMode::Exclusive,
250                            "additional" => RoutingMode::Additional,
251                            _ => RoutingMode::Additional,
252                        };
253                    }
254                }
255                _ => {}
256            }
257        }
258    }
259
260    if pattern.is_empty() {
261        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
262    }
263    if substitution.is_empty() {
264        return Err(Error::Schema(
265            "query_router missing 'substitution'".to_string(),
266        ));
267    }
268    if target_field.is_empty() {
269        return Err(Error::Schema(
270            "query_router missing 'target_field'".to_string(),
271        ));
272    }
273
274    Ok(QueryRouterRule {
275        pattern,
276        substitution,
277        target_field,
278        mode,
279    })
280}
281
282/// Parse a string value from quoted_string, raw_string, or regex_string
283fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
284    let s = pair.as_str();
285    match pair.as_rule() {
286        Rule::regex_string => {
287            // regex_string contains either raw_string or quoted_string
288            if let Some(inner) = pair.into_inner().next() {
289                parse_string_value(inner)
290            } else {
291                s.to_string()
292            }
293        }
294        Rule::raw_string => {
295            // r"..." - strip r" prefix and " suffix
296            s[2..s.len() - 1].to_string()
297        }
298        Rule::quoted_string => {
299            // "..." - strip quotes and handle escapes
300            let inner = &s[1..s.len() - 1];
301            // Simple escape handling
302            inner
303                .replace("\\n", "\n")
304                .replace("\\t", "\t")
305                .replace("\\\"", "\"")
306                .replace("\\\\", "\\")
307        }
308        _ => s.to_string(),
309    }
310}
311
312/// Parse an index definition from pest pair
313fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
314    let mut inner = pair.into_inner();
315
316    let name = inner
317        .next()
318        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
319        .as_str()
320        .to_string();
321
322    let mut fields = Vec::new();
323    let mut default_fields = Vec::new();
324    let mut query_routers = Vec::new();
325
326    for item in inner {
327        match item.as_rule() {
328            Rule::field_def => {
329                fields.push(parse_field_def(item)?);
330            }
331            Rule::default_fields_def => {
332                default_fields = parse_default_fields_def(item);
333            }
334            Rule::query_router_def => {
335                query_routers.push(parse_query_router_def(item)?);
336            }
337            _ => {}
338        }
339    }
340
341    Ok(IndexDef {
342        name,
343        fields,
344        default_fields,
345        query_routers,
346    })
347}
348
349/// Parse SDL from a string
350pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
351    let pairs = SdlParser::parse(Rule::file, input)
352        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
353
354    let mut indexes = Vec::new();
355
356    for pair in pairs {
357        if pair.as_rule() == Rule::file {
358            for inner in pair.into_inner() {
359                if inner.as_rule() == Rule::index_def {
360                    indexes.push(parse_index_def(inner)?);
361                }
362            }
363        }
364    }
365
366    Ok(indexes)
367}
368
369/// Parse SDL and return a single index definition
370pub fn parse_single_index(input: &str) -> Result<IndexDef> {
371    let indexes = parse_sdl(input)?;
372
373    if indexes.is_empty() {
374        return Err(Error::Schema("No index definition found".to_string()));
375    }
376
377    if indexes.len() > 1 {
378        return Err(Error::Schema(
379            "Multiple index definitions found, expected one".to_string(),
380        ));
381    }
382
383    Ok(indexes.into_iter().next().unwrap())
384}
385
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `sdl` and unwraps the result, so a parse error fails the calling test.
    fn parse_ok(sdl: &str) -> Vec<IndexDef> {
        parse_sdl(sdl).unwrap()
    }

    /// Explicit attribute lists decide the `indexed` / `stored` flags.
    #[test]
    fn test_parse_simple_schema() {
        let parsed = parse_ok(
            r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#,
        );
        assert_eq!(parsed.len(), 1);

        let articles = &parsed[0];
        assert_eq!(articles.name, "articles");
        assert_eq!(articles.fields.len(), 2);

        let title = &articles.fields[0];
        assert_eq!(title.name, "title");
        assert!(matches!(title.field_type, FieldType::Text));
        assert!(title.indexed);
        assert!(title.stored);

        let body = &articles.fields[1];
        assert_eq!(body.name, "body");
        assert!(matches!(body.field_type, FieldType::Text));
        assert!(body.indexed);
        assert!(!body.stored);
    }

    /// Every canonical type name maps to its `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let parsed = parse_ok(
            r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#,
        );
        let fields = &parsed[0].fields;

        assert!(matches!(fields[0].field_type, FieldType::Text));
        assert!(matches!(fields[1].field_type, FieldType::U64));
        assert!(matches!(fields[2].field_type, FieldType::I64));
        assert!(matches!(fields[3].field_type, FieldType::F64));
        assert!(matches!(fields[4].field_type, FieldType::Bytes));
    }

    /// Comment lines, and a comment trailing a field declaration, do not
    /// produce extra fields.
    #[test]
    fn test_parse_with_comments() {
        let parsed = parse_ok(
            r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#,
        );
        assert_eq!(parsed[0].fields.len(), 2);
    }

    /// Alias type names map to the same variants as the canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let parsed = parse_ok(
            r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#,
        );
        let fields = &parsed[0].fields;

        assert!(matches!(fields[0].field_type, FieldType::Text));
        assert!(matches!(fields[1].field_type, FieldType::I64));
        assert!(matches!(fields[2].field_type, FieldType::U64));
        assert!(matches!(fields[3].field_type, FieldType::F64));
        assert!(matches!(fields[4].field_type, FieldType::Bytes));
    }

    /// Declared fields can be looked up in the converted schema.
    #[test]
    fn test_to_schema() {
        let parsed = parse_ok(
            r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#,
        );
        let schema = parsed[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field without an attribute list is both indexed and stored.
    #[test]
    fn test_default_attributes() {
        let parsed = parse_ok(
            r#"
            index test {
                field title: text
            }
        "#,
        );
        let title = &parsed[0].fields[0];

        assert!(title.indexed);
        assert!(title.stored);
    }

    /// Several index blocks in one file are returned in source order.
    #[test]
    fn test_multiple_indexes() {
        let parsed = parse_ok(
            r#"
            index articles {
                field title: text [indexed, stored]
            }
            
            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#,
        );

        assert_eq!(parsed.len(), 2);
        assert_eq!(parsed[0].name, "articles");
        assert_eq!(parsed[1].name, "users");
    }

    /// `text<name>` records the tokenizer; plain `text` leaves it unset.
    #[test]
    fn test_tokenizer_spec() {
        let parsed = parse_ok(
            r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#,
        );
        let fields = &parsed[0].fields;

        assert_eq!(fields[0].name, "title");
        assert_eq!(fields[0].tokenizer, Some("en_stem".to_string()));

        assert_eq!(fields[1].name, "body");
        assert_eq!(fields[1].tokenizer, Some("default".to_string()));

        // No tokenizer was specified for this field.
        assert_eq!(fields[2].name, "author");
        assert_eq!(fields[2].tokenizer, None);
    }

    /// The tokenizer name survives conversion to a schema.
    #[test]
    fn test_tokenizer_in_schema() {
        let parsed = parse_ok(
            r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#,
        );
        let schema = parsed[0].to_schema();

        let tokenizer_of = |name: &str| {
            let field = schema.get_field(name).unwrap();
            schema.get_field_entry(field).unwrap().tokenizer.clone()
        };

        assert_eq!(tokenizer_of("title"), Some("german".to_string()));
        assert_eq!(tokenizer_of("body"), Some("en_stem".to_string()));
    }

    /// A quoted pattern has its escaped backslashes decoded.
    #[test]
    fn test_query_router_basic() {
        let parsed = parse_ok(
            r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]
                
                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#,
        );
        let routers = &parsed[0].query_routers;
        assert_eq!(routers.len(), 1);

        let rule = &routers[0];
        assert_eq!(rule.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(rule.substitution, "doi://{0}");
        assert_eq!(rule.target_field, "uris");
        assert_eq!(rule.mode, RoutingMode::Exclusive);
    }

    /// A raw-string pattern is taken verbatim.
    #[test]
    fn test_query_router_raw_string() {
        let parsed = parse_ok(
            r#"
            index documents {
                field uris: text [indexed, stored]
                
                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#,
        );
        let rule = &parsed[0].query_routers[0];

        assert_eq!(rule.pattern, r"^pmid:(\d+)$");
        assert_eq!(rule.substitution, "pubmed://{1}");
        assert_eq!(rule.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block in an index is collected.
    #[test]
    fn test_multiple_query_routers() {
        let parsed = parse_ok(
            r#"
            index documents {
                field uris: text [indexed, stored]
                
                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }
                
                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }
                
                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#,
        );
        assert_eq!(parsed[0].query_routers.len(), 3);
    }

    /// Omitting `mode` yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let parsed = parse_ok(
            r#"
            index documents {
                field uris: text [indexed, stored]
                
                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#,
        );
        assert_eq!(parsed[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// The `multi` attribute is parsed and carried over into the schema.
    #[test]
    fn test_multi_attribute() {
        let parsed = parse_ok(
            r#"
            index documents {
                field uris: text [indexed, stored, multi]
                field title: text [indexed, stored]
            }
        "#,
        );
        assert_eq!(parsed.len(), 1);

        let fields = &parsed[0].fields;
        assert_eq!(fields.len(), 2);

        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        // Schema conversion must preserve the multi flag.
        let schema = parsed[0].to_schema();
        let is_multi = |name: &str| {
            let field = schema.get_field(name).unwrap();
            schema.get_field_entry(field).unwrap().multi
        };

        assert!(is_multi("uris"));
        assert!(!is_multi("title"));
    }
}
705}