hermes_core/dsl/sdl/
mod.rs

1//! Schema Definition Language (SDL) for Hermes
2//!
3//! A simple, readable format for defining index schemas using pest parser.
4//!
5//! # Example SDL
6//!
7//! ```text
8//! # Article index schema
9//! index articles {
10//!     # Primary text field for full-text search
11//!     field title: text [indexed, stored]
12//!
13//!     # Body content - indexed but not stored (save space)
14//!     field body: text [indexed]
15//!
16//!     # Author name
17//!     field author: text [indexed, stored]
18//!
19//!     # Publication timestamp
20//!     field published_at: i64 [indexed, stored]
21//!
22//!     # View count
23//!     field views: u64 [indexed, stored]
24//!
25//!     # Rating score
26//!     field rating: f64 [indexed, stored]
27//!
28//!     # Raw content hash (not indexed, just stored)
29//!     field content_hash: bytes [stored]
30//! }
31//! ```
32
33use pest::Parser;
34use pest_derive::Parser;
35
36use super::query_field_router::{QueryRouterRule, RoutingMode};
37use super::schema::{FieldType, Schema, SchemaBuilder};
38use crate::Result;
39use crate::error::Error;
40
/// Parser type for the SDL grammar.
///
/// The implementation is produced by the `pest_derive::Parser` derive from
/// the grammar file named in the `grammar` attribute; the `Rule` values
/// matched on throughout this module come from that same grammar.
#[derive(Parser)]
#[grammar = "dsl/sdl/sdl.pest"]
pub struct SdlParser;
44
/// Parsed field definition
///
/// One value is produced by `parse_field_def` for every `field` entry of an
/// index block.
#[derive(Debug, Clone)]
pub struct FieldDef {
    /// Field name exactly as written in the SDL source
    pub name: String,
    /// Value type, resolved from the SDL type name (or one of its aliases)
    pub field_type: FieldType,
    /// Set by the `indexed` attribute; `true` when the field has no attribute list
    pub indexed: bool,
    /// Set by the `stored` attribute; `true` when the field has no attribute list
    pub stored: bool,
    /// Tokenizer name for text fields (e.g., "default", "en_stem", "german")
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    pub multi: bool,
}
57
/// Parsed index definition
///
/// One value is produced by `parse_index_def` for every `index` block in the
/// SDL input.
#[derive(Debug, Clone)]
pub struct IndexDef {
    /// Index name exactly as written in the SDL source
    pub name: String,
    /// Field definitions, in the order they appear in the index block
    pub fields: Vec<FieldDef>,
    /// Names listed in the index's default-fields definition; empty when none is given
    pub default_fields: Vec<String>,
    /// Query router rules for routing queries to specific fields
    pub query_routers: Vec<QueryRouterRule>,
}
67
68impl IndexDef {
69    /// Convert to a Schema
70    pub fn to_schema(&self) -> Schema {
71        let mut builder = SchemaBuilder::default();
72
73        for field in &self.fields {
74            let f = match field.field_type {
75                FieldType::Text => {
76                    let tokenizer = field.tokenizer.as_deref().unwrap_or("default");
77                    builder.add_text_field_with_tokenizer(
78                        &field.name,
79                        field.indexed,
80                        field.stored,
81                        tokenizer,
82                    )
83                }
84                FieldType::U64 => builder.add_u64_field(&field.name, field.indexed, field.stored),
85                FieldType::I64 => builder.add_i64_field(&field.name, field.indexed, field.stored),
86                FieldType::F64 => builder.add_f64_field(&field.name, field.indexed, field.stored),
87                FieldType::Bytes => builder.add_bytes_field(&field.name, field.stored),
88            };
89            if field.multi {
90                builder.set_multi(f, true);
91            }
92        }
93
94        // Set default fields if specified
95        if !self.default_fields.is_empty() {
96            builder.set_default_fields(self.default_fields.clone());
97        }
98
99        // Set query routers if specified
100        if !self.query_routers.is_empty() {
101            builder.set_query_routers(self.query_routers.clone());
102        }
103
104        builder.build()
105    }
106
107    /// Create a QueryFieldRouter from the query router rules
108    ///
109    /// Returns None if there are no query router rules defined.
110    /// Returns Err if any regex pattern is invalid.
111    pub fn to_query_router(&self) -> Result<Option<super::query_field_router::QueryFieldRouter>> {
112        if self.query_routers.is_empty() {
113            return Ok(None);
114        }
115
116        super::query_field_router::QueryFieldRouter::from_rules(&self.query_routers)
117            .map(Some)
118            .map_err(Error::Schema)
119    }
120}
121
122/// Parse field type from string
123fn parse_field_type(type_str: &str) -> Result<FieldType> {
124    match type_str {
125        "text" | "string" | "str" => Ok(FieldType::Text),
126        "u64" | "uint" | "unsigned" => Ok(FieldType::U64),
127        "i64" | "int" | "integer" => Ok(FieldType::I64),
128        "f64" | "float" | "double" => Ok(FieldType::F64),
129        "bytes" | "binary" | "blob" => Ok(FieldType::Bytes),
130        _ => Err(Error::Schema(format!("Unknown field type: {}", type_str))),
131    }
132}
133
134/// Parse attributes from pest pair
135/// Returns (indexed, stored, multi)
136fn parse_attributes(pair: pest::iterators::Pair<Rule>) -> (bool, bool, bool) {
137    let mut indexed = false;
138    let mut stored = false;
139    let mut multi = false;
140
141    for attr in pair.into_inner() {
142        match attr.as_str() {
143            "indexed" => indexed = true,
144            "stored" => stored = true,
145            "multi" => multi = true,
146            _ => {}
147        }
148    }
149
150    (indexed, stored, multi)
151}
152
153/// Parse a field definition from pest pair
154fn parse_field_def(pair: pest::iterators::Pair<Rule>) -> Result<FieldDef> {
155    let mut inner = pair.into_inner();
156
157    let name = inner
158        .next()
159        .ok_or_else(|| Error::Schema("Missing field name".to_string()))?
160        .as_str()
161        .to_string();
162
163    let field_type_str = inner
164        .next()
165        .ok_or_else(|| Error::Schema("Missing field type".to_string()))?
166        .as_str();
167
168    let field_type = parse_field_type(field_type_str)?;
169
170    // Parse optional tokenizer spec and attributes
171    let mut tokenizer = None;
172    let mut indexed = true;
173    let mut stored = true;
174    let mut multi = false;
175
176    for item in inner {
177        match item.as_rule() {
178            Rule::tokenizer_spec => {
179                // Extract tokenizer name from <name>
180                if let Some(tok_name) = item.into_inner().next() {
181                    tokenizer = Some(tok_name.as_str().to_string());
182                }
183            }
184            Rule::attributes => {
185                let (idx, sto, mul) = parse_attributes(item);
186                indexed = idx;
187                stored = sto;
188                multi = mul;
189            }
190            _ => {}
191        }
192    }
193
194    Ok(FieldDef {
195        name,
196        field_type,
197        indexed,
198        stored,
199        tokenizer,
200        multi,
201    })
202}
203
204/// Parse default_fields definition
205fn parse_default_fields_def(pair: pest::iterators::Pair<Rule>) -> Vec<String> {
206    pair.into_inner().map(|p| p.as_str().to_string()).collect()
207}
208
209/// Parse a query router definition
210fn parse_query_router_def(pair: pest::iterators::Pair<Rule>) -> Result<QueryRouterRule> {
211    let mut pattern = String::new();
212    let mut substitution = String::new();
213    let mut target_field = String::new();
214    let mut mode = RoutingMode::Additional;
215
216    for prop in pair.into_inner() {
217        if prop.as_rule() != Rule::query_router_prop {
218            continue;
219        }
220
221        for inner in prop.into_inner() {
222            match inner.as_rule() {
223                Rule::query_router_pattern => {
224                    if let Some(regex_str) = inner.into_inner().next() {
225                        pattern = parse_string_value(regex_str);
226                    }
227                }
228                Rule::query_router_substitution => {
229                    if let Some(quoted) = inner.into_inner().next() {
230                        substitution = parse_string_value(quoted);
231                    }
232                }
233                Rule::query_router_target => {
234                    if let Some(ident) = inner.into_inner().next() {
235                        target_field = ident.as_str().to_string();
236                    }
237                }
238                Rule::query_router_mode => {
239                    if let Some(mode_val) = inner.into_inner().next() {
240                        mode = match mode_val.as_str() {
241                            "exclusive" => RoutingMode::Exclusive,
242                            "additional" => RoutingMode::Additional,
243                            _ => RoutingMode::Additional,
244                        };
245                    }
246                }
247                _ => {}
248            }
249        }
250    }
251
252    if pattern.is_empty() {
253        return Err(Error::Schema("query_router missing 'pattern'".to_string()));
254    }
255    if substitution.is_empty() {
256        return Err(Error::Schema(
257            "query_router missing 'substitution'".to_string(),
258        ));
259    }
260    if target_field.is_empty() {
261        return Err(Error::Schema(
262            "query_router missing 'target_field'".to_string(),
263        ));
264    }
265
266    Ok(QueryRouterRule {
267        pattern,
268        substitution,
269        target_field,
270        mode,
271    })
272}
273
274/// Parse a string value from quoted_string, raw_string, or regex_string
275fn parse_string_value(pair: pest::iterators::Pair<Rule>) -> String {
276    let s = pair.as_str();
277    match pair.as_rule() {
278        Rule::regex_string => {
279            // regex_string contains either raw_string or quoted_string
280            if let Some(inner) = pair.into_inner().next() {
281                parse_string_value(inner)
282            } else {
283                s.to_string()
284            }
285        }
286        Rule::raw_string => {
287            // r"..." - strip r" prefix and " suffix
288            s[2..s.len() - 1].to_string()
289        }
290        Rule::quoted_string => {
291            // "..." - strip quotes and handle escapes
292            let inner = &s[1..s.len() - 1];
293            // Simple escape handling
294            inner
295                .replace("\\n", "\n")
296                .replace("\\t", "\t")
297                .replace("\\\"", "\"")
298                .replace("\\\\", "\\")
299        }
300        _ => s.to_string(),
301    }
302}
303
304/// Parse an index definition from pest pair
305fn parse_index_def(pair: pest::iterators::Pair<Rule>) -> Result<IndexDef> {
306    let mut inner = pair.into_inner();
307
308    let name = inner
309        .next()
310        .ok_or_else(|| Error::Schema("Missing index name".to_string()))?
311        .as_str()
312        .to_string();
313
314    let mut fields = Vec::new();
315    let mut default_fields = Vec::new();
316    let mut query_routers = Vec::new();
317
318    for item in inner {
319        match item.as_rule() {
320            Rule::field_def => {
321                fields.push(parse_field_def(item)?);
322            }
323            Rule::default_fields_def => {
324                default_fields = parse_default_fields_def(item);
325            }
326            Rule::query_router_def => {
327                query_routers.push(parse_query_router_def(item)?);
328            }
329            _ => {}
330        }
331    }
332
333    Ok(IndexDef {
334        name,
335        fields,
336        default_fields,
337        query_routers,
338    })
339}
340
341/// Parse SDL from a string
342pub fn parse_sdl(input: &str) -> Result<Vec<IndexDef>> {
343    let pairs = SdlParser::parse(Rule::file, input)
344        .map_err(|e| Error::Schema(format!("Parse error: {}", e)))?;
345
346    let mut indexes = Vec::new();
347
348    for pair in pairs {
349        if pair.as_rule() == Rule::file {
350            for inner in pair.into_inner() {
351                if inner.as_rule() == Rule::index_def {
352                    indexes.push(parse_index_def(inner)?);
353                }
354            }
355        }
356    }
357
358    Ok(indexes)
359}
360
361/// Parse SDL and return a single index definition
362pub fn parse_single_index(input: &str) -> Result<IndexDef> {
363    let indexes = parse_sdl(input)?;
364
365    if indexes.is_empty() {
366        return Err(Error::Schema("No index definition found".to_string()));
367    }
368
369    if indexes.len() > 1 {
370        return Err(Error::Schema(
371            "Multiple index definitions found, expected one".to_string(),
372        ));
373    }
374
375    Ok(indexes.into_iter().next().unwrap())
376}
377
// Unit tests for the SDL parser: each test feeds a small SDL snippet to
// `parse_sdl` and checks the resulting definitions (and, in some cases, the
// schema built from them).
#[cfg(test)]
mod tests {
    use super::*;

    /// Two text fields: one with `[indexed, stored]`, one with `[indexed]`
    /// only. Checks names, types and that `stored` is false when omitted
    /// from an explicit attribute list.
    #[test]
    fn test_parse_simple_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field body: text [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let index = &indexes[0];
        assert_eq!(index.name, "articles");
        assert_eq!(index.fields.len(), 2);

        assert_eq!(index.fields[0].name, "title");
        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(index.fields[0].indexed);
        assert!(index.fields[0].stored);

        assert_eq!(index.fields[1].name, "body");
        assert!(matches!(index.fields[1].field_type, FieldType::Text));
        assert!(index.fields[1].indexed);
        assert!(!index.fields[1].stored);
    }

    /// Each canonical type name (`text`, `u64`, `i64`, `f64`, `bytes`) maps
    /// to the matching `FieldType` variant.
    #[test]
    fn test_parse_all_field_types() {
        let sdl = r#"
            index test {
                field text_field: text [indexed, stored]
                field u64_field: u64 [indexed, stored]
                field i64_field: i64 [indexed, stored]
                field f64_field: f64 [indexed, stored]
                field bytes_field: bytes [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::U64));
        assert!(matches!(index.fields[2].field_type, FieldType::I64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// `#` comments on their own lines and after a field definition do not
    /// stop the input from parsing; both fields are returned.
    ///
    /// NOTE(review): the trailing comment inside the fixture says inline
    /// comments are "not supported yet", yet this test expects the input to
    /// parse — confirm which is intended against the grammar.
    #[test]
    fn test_parse_with_comments() {
        let sdl = r#"
            # This is a comment
            index articles {
                # Title field
                field title: text [indexed, stored]
                field body: text [indexed] # inline comment not supported yet
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].fields.len(), 2);
    }

    /// The aliases `string`, `int`, `uint`, `float` and `binary` resolve to
    /// the same variants as their canonical names.
    #[test]
    fn test_parse_type_aliases() {
        let sdl = r#"
            index test {
                field a: string [indexed]
                field b: int [indexed]
                field c: uint [indexed]
                field d: float [indexed]
                field e: binary [stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert!(matches!(index.fields[0].field_type, FieldType::Text));
        assert!(matches!(index.fields[1].field_type, FieldType::I64));
        assert!(matches!(index.fields[2].field_type, FieldType::U64));
        assert!(matches!(index.fields[3].field_type, FieldType::F64));
        assert!(matches!(index.fields[4].field_type, FieldType::Bytes));
    }

    /// `to_schema` exposes every declared field by name and nothing else.
    #[test]
    fn test_to_schema() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
                field views: u64 [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        assert!(schema.get_field("title").is_some());
        assert!(schema.get_field("views").is_some());
        assert!(schema.get_field("nonexistent").is_none());
    }

    /// A field declared without an attribute list is both indexed and stored.
    #[test]
    fn test_default_attributes() {
        let sdl = r#"
            index test {
                field title: text
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let field = &indexes[0].fields[0];

        // Default should be indexed and stored
        assert!(field.indexed);
        assert!(field.stored);
    }

    /// Several `index` blocks in one input are returned in source order.
    #[test]
    fn test_multiple_indexes() {
        let sdl = r#"
            index articles {
                field title: text [indexed, stored]
            }

            index users {
                field name: text [indexed, stored]
                field email: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 2);
        assert_eq!(indexes[0].name, "articles");
        assert_eq!(indexes[1].name, "users");
    }

    /// `text<name>` records the tokenizer name on the field definition;
    /// plain `text` leaves it as `None`.
    #[test]
    fn test_tokenizer_spec() {
        let sdl = r#"
            index articles {
                field title: text<en_stem> [indexed, stored]
                field body: text<default> [indexed]
                field author: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.fields[0].name, "title");
        assert_eq!(index.fields[0].tokenizer, Some("en_stem".to_string()));

        assert_eq!(index.fields[1].name, "body");
        assert_eq!(index.fields[1].tokenizer, Some("default".to_string()));

        assert_eq!(index.fields[2].name, "author");
        assert_eq!(index.fields[2].tokenizer, None); // No tokenizer specified
    }

    /// The tokenizer named in the SDL is carried through `to_schema` onto
    /// the schema's field entry.
    #[test]
    fn test_tokenizer_in_schema() {
        let sdl = r#"
            index articles {
                field title: text<german> [indexed, stored]
                field body: text<en_stem> [indexed]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let schema = indexes[0].to_schema();

        let title_field = schema.get_field("title").unwrap();
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert_eq!(title_entry.tokenizer, Some("german".to_string()));

        let body_field = schema.get_field("body").unwrap();
        let body_entry = schema.get_field_entry(body_field).unwrap();
        assert_eq!(body_entry.tokenizer, Some("en_stem".to_string()));
    }

    /// A `query_router` block with a quoted pattern: doubled backslashes in
    /// the SDL text are decoded to single ones, and all four properties are
    /// read.
    #[test]
    fn test_query_router_basic() {
        let sdl = r#"
            index documents {
                field title: text [indexed, stored]
                field uri: text [indexed, stored]

                query_router {
                    pattern: "10\\.\\d{4,}/[^\\s]+"
                    substitution: "doi://{0}"
                    target_field: uris
                    mode: exclusive
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let index = &indexes[0];

        assert_eq!(index.query_routers.len(), 1);
        let router = &index.query_routers[0];
        assert_eq!(router.pattern, r"10\.\d{4,}/[^\s]+");
        assert_eq!(router.substitution, "doi://{0}");
        assert_eq!(router.target_field, "uris");
        assert_eq!(router.mode, RoutingMode::Exclusive);
    }

    /// A pattern written as `r"..."` is taken verbatim, backslashes included.
    #[test]
    fn test_query_router_raw_string() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        let router = &indexes[0].query_routers[0];

        assert_eq!(router.pattern, r"^pmid:(\d+)$");
        assert_eq!(router.substitution, "pubmed://{1}");
        assert_eq!(router.mode, RoutingMode::Additional);
    }

    /// Every `query_router` block in an index is collected.
    #[test]
    fn test_multiple_query_routers() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"^doi:(10\.\d{4,}/[^\s]+)$"
                    substitution: "doi://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^pmid:(\d+)$"
                    substitution: "pubmed://{1}"
                    target_field: uris
                    mode: exclusive
                }

                query_router {
                    pattern: r"^arxiv:(\d+\.\d+)$"
                    substitution: "arxiv://{1}"
                    target_field: uris
                    mode: additional
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes[0].query_routers.len(), 3);
    }

    /// Omitting `mode` yields `RoutingMode::Additional`.
    #[test]
    fn test_query_router_default_mode() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored]

                query_router {
                    pattern: r"test"
                    substitution: "{0}"
                    target_field: uris
                }
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        // Default mode should be Additional
        assert_eq!(indexes[0].query_routers[0].mode, RoutingMode::Additional);
    }

    /// The `multi` attribute sets `FieldDef::multi` and is preserved on the
    /// schema's field entry after `to_schema`.
    #[test]
    fn test_multi_attribute() {
        let sdl = r#"
            index documents {
                field uris: text [indexed, stored, multi]
                field title: text [indexed, stored]
            }
        "#;

        let indexes = parse_sdl(sdl).unwrap();
        assert_eq!(indexes.len(), 1);

        let fields = &indexes[0].fields;
        assert_eq!(fields.len(), 2);

        // uris should have multi=true
        assert_eq!(fields[0].name, "uris");
        assert!(fields[0].multi, "uris field should have multi=true");

        // title should have multi=false
        assert_eq!(fields[1].name, "title");
        assert!(!fields[1].multi, "title field should have multi=false");

        // Verify schema conversion preserves multi attribute
        let schema = indexes[0].to_schema();
        let uris_field = schema.get_field("uris").unwrap();
        let title_field = schema.get_field("title").unwrap();

        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);
    }
}
697}