hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer  
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28}
29
30/// Field options
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct FieldEntry {
33    pub name: String,
34    pub field_type: FieldType,
35    pub indexed: bool,
36    pub stored: bool,
37    /// Name of the tokenizer to use for this field (for text fields)
38    pub tokenizer: Option<String>,
39    /// Whether this field can have multiple values (serialized as array in JSON)
40    #[serde(default)]
41    pub multi: bool,
42}
43
44use super::query_field_router::QueryRouterRule;
45
46/// Schema defining document structure
47#[derive(Debug, Clone, Default, Serialize, Deserialize)]
48pub struct Schema {
49    fields: Vec<FieldEntry>,
50    name_to_field: HashMap<String, Field>,
51    /// Default fields for query parsing (when no field is specified)
52    #[serde(default)]
53    default_fields: Vec<Field>,
54    /// Query router rules for routing queries to specific fields based on regex patterns
55    #[serde(default)]
56    query_routers: Vec<QueryRouterRule>,
57}
58
59impl Schema {
60    pub fn builder() -> SchemaBuilder {
61        SchemaBuilder::default()
62    }
63
64    pub fn get_field(&self, name: &str) -> Option<Field> {
65        self.name_to_field.get(name).copied()
66    }
67
68    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
69        self.fields.get(field.0 as usize)
70    }
71
72    pub fn get_field_name(&self, field: Field) -> Option<&str> {
73        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
74    }
75
76    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
77        self.fields
78            .iter()
79            .enumerate()
80            .map(|(i, e)| (Field(i as u32), e))
81    }
82
83    pub fn num_fields(&self) -> usize {
84        self.fields.len()
85    }
86
87    /// Get the default fields for query parsing
88    pub fn default_fields(&self) -> &[Field] {
89        &self.default_fields
90    }
91
92    /// Set default fields (used by builder)
93    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
94        self.default_fields = fields;
95    }
96
97    /// Get the query router rules
98    pub fn query_routers(&self) -> &[QueryRouterRule] {
99        &self.query_routers
100    }
101
102    /// Set query router rules
103    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
104        self.query_routers = rules;
105    }
106}
107
108/// Builder for Schema
109#[derive(Debug, Default)]
110pub struct SchemaBuilder {
111    fields: Vec<FieldEntry>,
112    default_fields: Vec<String>,
113    query_routers: Vec<QueryRouterRule>,
114}
115
116impl SchemaBuilder {
117    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
118        self.add_field_with_tokenizer(
119            name,
120            FieldType::Text,
121            indexed,
122            stored,
123            Some("default".to_string()),
124        )
125    }
126
127    pub fn add_text_field_with_tokenizer(
128        &mut self,
129        name: &str,
130        indexed: bool,
131        stored: bool,
132        tokenizer: &str,
133    ) -> Field {
134        self.add_field_with_tokenizer(
135            name,
136            FieldType::Text,
137            indexed,
138            stored,
139            Some(tokenizer.to_string()),
140        )
141    }
142
143    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
144        self.add_field(name, FieldType::U64, indexed, stored)
145    }
146
147    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
148        self.add_field(name, FieldType::I64, indexed, stored)
149    }
150
151    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
152        self.add_field(name, FieldType::F64, indexed, stored)
153    }
154
155    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
156        self.add_field(name, FieldType::Bytes, false, stored)
157    }
158
159    fn add_field(
160        &mut self,
161        name: &str,
162        field_type: FieldType,
163        indexed: bool,
164        stored: bool,
165    ) -> Field {
166        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
167    }
168
169    fn add_field_with_tokenizer(
170        &mut self,
171        name: &str,
172        field_type: FieldType,
173        indexed: bool,
174        stored: bool,
175        tokenizer: Option<String>,
176    ) -> Field {
177        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
178    }
179
180    fn add_field_full(
181        &mut self,
182        name: &str,
183        field_type: FieldType,
184        indexed: bool,
185        stored: bool,
186        tokenizer: Option<String>,
187        multi: bool,
188    ) -> Field {
189        let field = Field(self.fields.len() as u32);
190        self.fields.push(FieldEntry {
191            name: name.to_string(),
192            field_type,
193            indexed,
194            stored,
195            tokenizer,
196            multi,
197        });
198        field
199    }
200
201    /// Set the multi attribute on the last added field
202    pub fn set_multi(&mut self, field: Field, multi: bool) {
203        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
204            entry.multi = multi;
205        }
206    }
207
208    /// Set default fields by name
209    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
210        self.default_fields = field_names;
211    }
212
213    /// Set query router rules
214    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
215        self.query_routers = rules;
216    }
217
218    pub fn build(self) -> Schema {
219        let mut name_to_field = HashMap::new();
220        for (i, entry) in self.fields.iter().enumerate() {
221            name_to_field.insert(entry.name.clone(), Field(i as u32));
222        }
223
224        // Resolve default field names to Field IDs
225        let default_fields: Vec<Field> = self
226            .default_fields
227            .iter()
228            .filter_map(|name| name_to_field.get(name).copied())
229            .collect();
230
231        Schema {
232            fields: self.fields,
233            name_to_field,
234            default_fields,
235            query_routers: self.query_routers,
236        }
237    }
238}
239
240/// Value that can be stored in a field
241#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
242pub enum FieldValue {
243    #[serde(rename = "text")]
244    Text(String),
245    #[serde(rename = "u64")]
246    U64(u64),
247    #[serde(rename = "i64")]
248    I64(i64),
249    #[serde(rename = "f64")]
250    F64(f64),
251    #[serde(rename = "bytes")]
252    Bytes(Vec<u8>),
253}
254
255impl FieldValue {
256    pub fn as_text(&self) -> Option<&str> {
257        match self {
258            FieldValue::Text(s) => Some(s),
259            _ => None,
260        }
261    }
262
263    pub fn as_u64(&self) -> Option<u64> {
264        match self {
265            FieldValue::U64(v) => Some(*v),
266            _ => None,
267        }
268    }
269
270    pub fn as_i64(&self) -> Option<i64> {
271        match self {
272            FieldValue::I64(v) => Some(*v),
273            _ => None,
274        }
275    }
276
277    pub fn as_f64(&self) -> Option<f64> {
278        match self {
279            FieldValue::F64(v) => Some(*v),
280            _ => None,
281        }
282    }
283
284    pub fn as_bytes(&self) -> Option<&[u8]> {
285        match self {
286            FieldValue::Bytes(b) => Some(b),
287            _ => None,
288        }
289    }
290}
291
292/// A document to be indexed
293#[derive(Debug, Clone, Default, Serialize, Deserialize)]
294pub struct Document {
295    field_values: Vec<(Field, FieldValue)>,
296}
297
298impl Document {
299    pub fn new() -> Self {
300        Self::default()
301    }
302
303    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
304        self.field_values
305            .push((field, FieldValue::Text(value.into())));
306    }
307
308    pub fn add_u64(&mut self, field: Field, value: u64) {
309        self.field_values.push((field, FieldValue::U64(value)));
310    }
311
312    pub fn add_i64(&mut self, field: Field, value: i64) {
313        self.field_values.push((field, FieldValue::I64(value)));
314    }
315
316    pub fn add_f64(&mut self, field: Field, value: f64) {
317        self.field_values.push((field, FieldValue::F64(value)));
318    }
319
320    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
321        self.field_values.push((field, FieldValue::Bytes(value)));
322    }
323
324    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
325        self.field_values
326            .iter()
327            .find(|(f, _)| *f == field)
328            .map(|(_, v)| v)
329    }
330
331    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
332        self.field_values
333            .iter()
334            .filter(move |(f, _)| *f == field)
335            .map(|(_, v)| v)
336    }
337
338    pub fn field_values(&self) -> &[(Field, FieldValue)] {
339        &self.field_values
340    }
341
342    /// Convert document to a JSON object using field names from schema
343    ///
344    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
345    /// Other fields with multiple values are also returned as arrays.
346    /// Fields with a single value (and not marked multi) are returned as scalar values.
347    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
348        use std::collections::HashMap;
349
350        // Group values by field, keeping track of field entry for multi check
351        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
352            HashMap::new();
353
354        for (field, value) in &self.field_values {
355            if let Some(entry) = schema.get_field_entry(*field) {
356                let json_value = match value {
357                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
358                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
359                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
360                    FieldValue::F64(n) => serde_json::json!(n),
361                    FieldValue::Bytes(b) => {
362                        use base64::Engine;
363                        serde_json::Value::String(
364                            base64::engine::general_purpose::STANDARD.encode(b),
365                        )
366                    }
367                };
368                field_values_map
369                    .entry(*field)
370                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
371                    .2
372                    .push(json_value);
373            }
374        }
375
376        // Convert to JSON object, using arrays for multi fields or when multiple values exist
377        let mut map = serde_json::Map::new();
378        for (_field, (name, is_multi, values)) in field_values_map {
379            let json_value = if is_multi || values.len() > 1 {
380                serde_json::Value::Array(values)
381            } else {
382                values.into_iter().next().unwrap()
383            };
384            map.insert(name, json_value);
385        }
386
387        serde_json::Value::Object(map)
388    }
389
390    /// Create a Document from a JSON object using field names from schema
391    ///
392    /// Supports:
393    /// - String values -> Text fields
394    /// - Number values -> U64/I64/F64 fields (based on schema type)
395    /// - Array values -> Multiple values for the same field (multifields)
396    ///
397    /// Unknown fields (not in schema) are silently ignored.
398    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
399        let obj = json.as_object()?;
400        let mut doc = Document::new();
401
402        for (key, value) in obj {
403            if let Some(field) = schema.get_field(key) {
404                let field_entry = schema.get_field_entry(field)?;
405                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
406            }
407        }
408
409        Some(doc)
410    }
411
412    /// Helper to add a JSON value to a document, handling type conversion
413    fn add_json_value(
414        doc: &mut Document,
415        field: Field,
416        field_type: &FieldType,
417        value: &serde_json::Value,
418    ) {
419        match value {
420            serde_json::Value::String(s) => {
421                if matches!(field_type, FieldType::Text) {
422                    doc.add_text(field, s.clone());
423                }
424            }
425            serde_json::Value::Number(n) => {
426                match field_type {
427                    FieldType::I64 => {
428                        if let Some(i) = n.as_i64() {
429                            doc.add_i64(field, i);
430                        }
431                    }
432                    FieldType::U64 => {
433                        if let Some(u) = n.as_u64() {
434                            doc.add_u64(field, u);
435                        } else if let Some(i) = n.as_i64() {
436                            // Allow positive i64 as u64
437                            if i >= 0 {
438                                doc.add_u64(field, i as u64);
439                            }
440                        }
441                    }
442                    FieldType::F64 => {
443                        if let Some(f) = n.as_f64() {
444                            doc.add_f64(field, f);
445                        }
446                    }
447                    _ => {}
448                }
449            }
450            // Handle arrays (multifields) - add each element separately
451            serde_json::Value::Array(arr) => {
452                for item in arr {
453                    Self::add_json_value(doc, field, field_type, item);
454                }
455            }
456            _ => {}
457        }
458    }
459}
460
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect every value of `field` in `doc` as optional text, in insertion order.
    fn texts_of(doc: &Document, field: Field) -> Vec<Option<&str>> {
        doc.get_all(field).map(|value| value.as_text()).collect()
    }

    /// Collect the elements of a JSON array as optional strings.
    fn strings_of(array: &serde_json::Value) -> Vec<Option<&str>> {
        array
            .as_array()
            .unwrap()
            .iter()
            .map(|item| item.as_str())
            .collect()
    }

    #[test]
    fn test_schema_builder() {
        let mut builder = Schema::builder();
        let title = builder.add_text_field("title", true, true);
        let body = builder.add_text_field("body", true, false);
        let count = builder.add_u64_field("count", true, true);
        let schema = builder.build();

        // Every registered name resolves to the handle returned at insertion.
        for (name, expected) in [("title", title), ("body", body), ("count", count)] {
            assert_eq!(schema.get_field(name), Some(expected));
        }
        assert_eq!(schema.get_field("nonexistent"), None);
    }

    #[test]
    fn test_document() {
        let mut builder = Schema::builder();
        let title = builder.add_text_field("title", true, true);
        let count = builder.add_u64_field("count", true, true);
        let _schema = builder.build();

        let mut doc = Document::new();
        doc.add_text(title, "Hello World");
        doc.add_u64(count, 42);

        let first_title = doc.get_first(title).unwrap();
        let first_count = doc.get_first(count).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));
        assert_eq!(first_count.as_u64(), Some(42));
    }

    #[test]
    fn test_document_serialization() {
        let mut builder = Schema::builder();
        let title = builder.add_text_field("title", true, true);
        let count = builder.add_u64_field("count", true, true);
        let _schema = builder.build();

        let mut original = Document::new();
        original.add_text(title, "Hello World");
        original.add_u64(count, 42);

        // Serialize
        let encoded = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", encoded);

        // Deserialize
        let restored: Document = serde_json::from_str(&encoded).unwrap();
        assert_eq!(
            restored.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );
        let restored_title = restored.get_first(title).unwrap();
        let restored_count = restored.get_first(count).unwrap();
        assert_eq!(restored_title.as_text(), Some("Hello World"));
        assert_eq!(restored_count.as_u64(), Some(42));
    }

    #[test]
    fn test_multivalue_field() {
        let mut builder = Schema::builder();
        let uris = builder.add_text_field("uris", true, true);
        let title = builder.add_text_field("title", true, true);
        let schema = builder.build();

        // Two values for `uris`, one for `title`.
        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");

        // `get_first` yields the earliest value, `get_all` yields both in order.
        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));
        assert_eq!(texts_of(&doc, uris), vec![Some("one"), Some("two")]);

        // The field with two values is rendered as a JSON array.
        let json = doc.to_json(&schema);
        let uris_json = json.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        assert_eq!(strings_of(uris_json), vec![Some("one"), Some("two")]);

        // The field with one value stays a scalar.
        let title_json = json.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }

    #[test]
    fn test_multivalue_from_json() {
        let mut builder = Schema::builder();
        let uris = builder.add_text_field("uris", true, true);
        let title = builder.add_text_field("title", true, true);
        let schema = builder.build();

        // Input with an array value for `uris`.
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let doc = Document::from_json(&input, &schema).unwrap();

        // Both array elements arrive as separate values, in order.
        assert_eq!(texts_of(&doc, uris), vec![Some("one"), Some("two")]);

        // The scalar arrives as a single value.
        assert_eq!(
            doc.get_first(title).unwrap().as_text(),
            Some("Test Document")
        );

        // Roundtrip: `to_json` reproduces the array.
        let output = doc.to_json(&schema);
        let uris_out = output.get("uris").unwrap();
        assert_eq!(strings_of(uris_out), vec![Some("one"), Some("two")]);
    }

    #[test]
    fn test_multi_attribute_forces_array() {
        // Fields marked `multi` must serialize as arrays even with one value.
        let mut builder = Schema::builder();
        let uris = builder.add_text_field("uris", true, true);
        builder.set_multi(uris, true);
        let title = builder.add_text_field("title", true, true);
        let schema = builder.build();

        // The flag is set on `uris` only.
        assert!(schema.get_field_entry(uris).unwrap().multi);
        assert!(!schema.get_field_entry(title).unwrap().multi);

        // One value per field.
        let mut doc = Document::new();
        doc.add_text(uris, "only_one");
        doc.add_text(title, "Test Document");

        let json = doc.to_json(&schema);

        // The multi field is an array of length one.
        let uris_json = json.get("uris").unwrap();
        assert!(
            uris_json.is_array(),
            "Multi field should be array even with single value"
        );
        assert_eq!(strings_of(uris_json), vec![Some("only_one")]);

        // The non-multi field stays a scalar.
        let title_json = json.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }
}