hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34}
35
36/// Field options
37#[derive(Debug, Clone, Serialize, Deserialize)]
38pub struct FieldEntry {
39    pub name: String,
40    pub field_type: FieldType,
41    pub indexed: bool,
42    pub stored: bool,
43    /// Name of the tokenizer to use for this field (for text fields)
44    pub tokenizer: Option<String>,
45    /// Whether this field can have multiple values (serialized as array in JSON)
46    #[serde(default)]
47    pub multi: bool,
48    /// Configuration for sparse vector fields (index size, weight quantization)
49    #[serde(default, skip_serializing_if = "Option::is_none")]
50    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
51    /// Configuration for dense vector fields (dimension, quantization)
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub dense_vector_config: Option<DenseVectorConfig>,
54}
55
56/// Configuration for dense vector fields using RaBitQ or IVF-RaBitQ
57#[derive(Debug, Clone, Serialize, Deserialize)]
58pub struct DenseVectorConfig {
59    /// Dimensionality of vectors
60    pub dim: usize,
61    /// Whether to store raw vectors for re-ranking (increases storage but improves accuracy)
62    #[serde(default = "default_store_raw")]
63    pub store_raw: bool,
64    /// Path to pre-trained coarse centroids file for IVF-RaBitQ
65    /// If None, uses single-centroid RaBitQ (suitable for <100K vectors)
66    #[serde(default, skip_serializing_if = "Option::is_none")]
67    pub coarse_centroids_path: Option<String>,
68    /// Number of clusters to probe during search (default: 32)
69    #[serde(default = "default_nprobe")]
70    pub nprobe: usize,
71}
72
/// Serde default for `DenseVectorConfig::store_raw`: raw vectors are kept
/// unless the serialized config says otherwise.
fn default_store_raw() -> bool {
    const STORE_RAW_BY_DEFAULT: bool = true;
    STORE_RAW_BY_DEFAULT
}
76
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters when the
/// serialized config omits the value.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
80
81impl DenseVectorConfig {
82    pub fn new(dim: usize) -> Self {
83        Self {
84            dim,
85            store_raw: true,
86            coarse_centroids_path: None,
87            nprobe: 32,
88        }
89    }
90
91    pub fn with_ivf(dim: usize, centroids_path: String, nprobe: usize) -> Self {
92        Self {
93            dim,
94            store_raw: true,
95            coarse_centroids_path: Some(centroids_path),
96            nprobe,
97        }
98    }
99
100    pub fn without_raw(dim: usize) -> Self {
101        Self {
102            dim,
103            store_raw: false,
104            coarse_centroids_path: None,
105            nprobe: 32,
106        }
107    }
108
109    /// Check if this config uses IVF (has coarse centroids)
110    pub fn uses_ivf(&self) -> bool {
111        self.coarse_centroids_path.is_some()
112    }
113}
114
115use super::query_field_router::QueryRouterRule;
116
117/// Schema defining document structure
118#[derive(Debug, Clone, Default, Serialize, Deserialize)]
119pub struct Schema {
120    fields: Vec<FieldEntry>,
121    name_to_field: HashMap<String, Field>,
122    /// Default fields for query parsing (when no field is specified)
123    #[serde(default)]
124    default_fields: Vec<Field>,
125    /// Query router rules for routing queries to specific fields based on regex patterns
126    #[serde(default)]
127    query_routers: Vec<QueryRouterRule>,
128}
129
130impl Schema {
131    pub fn builder() -> SchemaBuilder {
132        SchemaBuilder::default()
133    }
134
135    pub fn get_field(&self, name: &str) -> Option<Field> {
136        self.name_to_field.get(name).copied()
137    }
138
139    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
140        self.fields.get(field.0 as usize)
141    }
142
143    pub fn get_field_name(&self, field: Field) -> Option<&str> {
144        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
145    }
146
147    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
148        self.fields
149            .iter()
150            .enumerate()
151            .map(|(i, e)| (Field(i as u32), e))
152    }
153
154    pub fn num_fields(&self) -> usize {
155        self.fields.len()
156    }
157
158    /// Get the default fields for query parsing
159    pub fn default_fields(&self) -> &[Field] {
160        &self.default_fields
161    }
162
163    /// Set default fields (used by builder)
164    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
165        self.default_fields = fields;
166    }
167
168    /// Get the query router rules
169    pub fn query_routers(&self) -> &[QueryRouterRule] {
170        &self.query_routers
171    }
172
173    /// Set query router rules
174    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
175        self.query_routers = rules;
176    }
177}
178
179/// Builder for Schema
180#[derive(Debug, Default)]
181pub struct SchemaBuilder {
182    fields: Vec<FieldEntry>,
183    default_fields: Vec<String>,
184    query_routers: Vec<QueryRouterRule>,
185}
186
187impl SchemaBuilder {
188    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
189        self.add_field_with_tokenizer(
190            name,
191            FieldType::Text,
192            indexed,
193            stored,
194            Some("default".to_string()),
195        )
196    }
197
198    pub fn add_text_field_with_tokenizer(
199        &mut self,
200        name: &str,
201        indexed: bool,
202        stored: bool,
203        tokenizer: &str,
204    ) -> Field {
205        self.add_field_with_tokenizer(
206            name,
207            FieldType::Text,
208            indexed,
209            stored,
210            Some(tokenizer.to_string()),
211        )
212    }
213
214    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
215        self.add_field(name, FieldType::U64, indexed, stored)
216    }
217
218    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
219        self.add_field(name, FieldType::I64, indexed, stored)
220    }
221
222    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
223        self.add_field(name, FieldType::F64, indexed, stored)
224    }
225
226    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
227        self.add_field(name, FieldType::Bytes, false, stored)
228    }
229
230    /// Add a sparse vector field with default configuration
231    ///
232    /// Sparse vectors are indexed as inverted posting lists where each dimension
233    /// becomes a "term" and documents have quantized weights for each dimension.
234    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
235        self.add_sparse_vector_field_with_config(
236            name,
237            indexed,
238            stored,
239            crate::structures::SparseVectorConfig::default(),
240        )
241    }
242
243    /// Add a sparse vector field with custom configuration
244    ///
245    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
246    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
247    pub fn add_sparse_vector_field_with_config(
248        &mut self,
249        name: &str,
250        indexed: bool,
251        stored: bool,
252        config: crate::structures::SparseVectorConfig,
253    ) -> Field {
254        let field = Field(self.fields.len() as u32);
255        self.fields.push(FieldEntry {
256            name: name.to_string(),
257            field_type: FieldType::SparseVector,
258            indexed,
259            stored,
260            tokenizer: None,
261            multi: false,
262            sparse_vector_config: Some(config),
263            dense_vector_config: None,
264        });
265        field
266    }
267
268    /// Set sparse vector configuration for an existing field
269    pub fn set_sparse_vector_config(
270        &mut self,
271        field: Field,
272        config: crate::structures::SparseVectorConfig,
273    ) {
274        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
275            entry.sparse_vector_config = Some(config);
276        }
277    }
278
279    /// Add a dense vector field with default configuration
280    ///
281    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
282    /// The dimension must be specified as it determines the quantization structure.
283    pub fn add_dense_vector_field(
284        &mut self,
285        name: &str,
286        dim: usize,
287        indexed: bool,
288        stored: bool,
289    ) -> Field {
290        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
291    }
292
293    /// Add a dense vector field with custom configuration
294    pub fn add_dense_vector_field_with_config(
295        &mut self,
296        name: &str,
297        indexed: bool,
298        stored: bool,
299        config: DenseVectorConfig,
300    ) -> Field {
301        let field = Field(self.fields.len() as u32);
302        self.fields.push(FieldEntry {
303            name: name.to_string(),
304            field_type: FieldType::DenseVector,
305            indexed,
306            stored,
307            tokenizer: None,
308            multi: false,
309            sparse_vector_config: None,
310            dense_vector_config: Some(config),
311        });
312        field
313    }
314
315    fn add_field(
316        &mut self,
317        name: &str,
318        field_type: FieldType,
319        indexed: bool,
320        stored: bool,
321    ) -> Field {
322        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
323    }
324
325    fn add_field_with_tokenizer(
326        &mut self,
327        name: &str,
328        field_type: FieldType,
329        indexed: bool,
330        stored: bool,
331        tokenizer: Option<String>,
332    ) -> Field {
333        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
334    }
335
336    fn add_field_full(
337        &mut self,
338        name: &str,
339        field_type: FieldType,
340        indexed: bool,
341        stored: bool,
342        tokenizer: Option<String>,
343        multi: bool,
344    ) -> Field {
345        let field = Field(self.fields.len() as u32);
346        self.fields.push(FieldEntry {
347            name: name.to_string(),
348            field_type,
349            indexed,
350            stored,
351            tokenizer,
352            multi,
353            sparse_vector_config: None,
354            dense_vector_config: None,
355        });
356        field
357    }
358
359    /// Set the multi attribute on the last added field
360    pub fn set_multi(&mut self, field: Field, multi: bool) {
361        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
362            entry.multi = multi;
363        }
364    }
365
366    /// Set default fields by name
367    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
368        self.default_fields = field_names;
369    }
370
371    /// Set query router rules
372    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
373        self.query_routers = rules;
374    }
375
376    pub fn build(self) -> Schema {
377        let mut name_to_field = HashMap::new();
378        for (i, entry) in self.fields.iter().enumerate() {
379            name_to_field.insert(entry.name.clone(), Field(i as u32));
380        }
381
382        // Resolve default field names to Field IDs
383        let default_fields: Vec<Field> = self
384            .default_fields
385            .iter()
386            .filter_map(|name| name_to_field.get(name).copied())
387            .collect();
388
389        Schema {
390            fields: self.fields,
391            name_to_field,
392            default_fields,
393            query_routers: self.query_routers,
394        }
395    }
396}
397
398/// Value that can be stored in a field
399#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
400pub enum FieldValue {
401    #[serde(rename = "text")]
402    Text(String),
403    #[serde(rename = "u64")]
404    U64(u64),
405    #[serde(rename = "i64")]
406    I64(i64),
407    #[serde(rename = "f64")]
408    F64(f64),
409    #[serde(rename = "bytes")]
410    Bytes(Vec<u8>),
411    /// Sparse vector: (dimension_ids, weights)
412    #[serde(rename = "sparse_vector")]
413    SparseVector { indices: Vec<u32>, values: Vec<f32> },
414    /// Dense vector: float32 values
415    #[serde(rename = "dense_vector")]
416    DenseVector(Vec<f32>),
417}
418
419impl FieldValue {
420    pub fn as_text(&self) -> Option<&str> {
421        match self {
422            FieldValue::Text(s) => Some(s),
423            _ => None,
424        }
425    }
426
427    pub fn as_u64(&self) -> Option<u64> {
428        match self {
429            FieldValue::U64(v) => Some(*v),
430            _ => None,
431        }
432    }
433
434    pub fn as_i64(&self) -> Option<i64> {
435        match self {
436            FieldValue::I64(v) => Some(*v),
437            _ => None,
438        }
439    }
440
441    pub fn as_f64(&self) -> Option<f64> {
442        match self {
443            FieldValue::F64(v) => Some(*v),
444            _ => None,
445        }
446    }
447
448    pub fn as_bytes(&self) -> Option<&[u8]> {
449        match self {
450            FieldValue::Bytes(b) => Some(b),
451            _ => None,
452        }
453    }
454
455    pub fn as_sparse_vector(&self) -> Option<(&[u32], &[f32])> {
456        match self {
457            FieldValue::SparseVector { indices, values } => Some((indices, values)),
458            _ => None,
459        }
460    }
461
462    pub fn as_dense_vector(&self) -> Option<&[f32]> {
463        match self {
464            FieldValue::DenseVector(v) => Some(v),
465            _ => None,
466        }
467    }
468}
469
470/// A document to be indexed
471#[derive(Debug, Clone, Default, Serialize, Deserialize)]
472pub struct Document {
473    field_values: Vec<(Field, FieldValue)>,
474}
475
476impl Document {
477    pub fn new() -> Self {
478        Self::default()
479    }
480
481    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
482        self.field_values
483            .push((field, FieldValue::Text(value.into())));
484    }
485
486    pub fn add_u64(&mut self, field: Field, value: u64) {
487        self.field_values.push((field, FieldValue::U64(value)));
488    }
489
490    pub fn add_i64(&mut self, field: Field, value: i64) {
491        self.field_values.push((field, FieldValue::I64(value)));
492    }
493
494    pub fn add_f64(&mut self, field: Field, value: f64) {
495        self.field_values.push((field, FieldValue::F64(value)));
496    }
497
498    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
499        self.field_values.push((field, FieldValue::Bytes(value)));
500    }
501
502    pub fn add_sparse_vector(&mut self, field: Field, indices: Vec<u32>, values: Vec<f32>) {
503        debug_assert_eq!(
504            indices.len(),
505            values.len(),
506            "Sparse vector indices and values must have same length"
507        );
508        self.field_values
509            .push((field, FieldValue::SparseVector { indices, values }));
510    }
511
512    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
513        self.field_values
514            .push((field, FieldValue::DenseVector(values)));
515    }
516
517    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
518        self.field_values
519            .iter()
520            .find(|(f, _)| *f == field)
521            .map(|(_, v)| v)
522    }
523
524    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
525        self.field_values
526            .iter()
527            .filter(move |(f, _)| *f == field)
528            .map(|(_, v)| v)
529    }
530
531    pub fn field_values(&self) -> &[(Field, FieldValue)] {
532        &self.field_values
533    }
534
535    /// Convert document to a JSON object using field names from schema
536    ///
537    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
538    /// Other fields with multiple values are also returned as arrays.
539    /// Fields with a single value (and not marked multi) are returned as scalar values.
540    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
541        use std::collections::HashMap;
542
543        // Group values by field, keeping track of field entry for multi check
544        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
545            HashMap::new();
546
547        for (field, value) in &self.field_values {
548            if let Some(entry) = schema.get_field_entry(*field) {
549                let json_value = match value {
550                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
551                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
552                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
553                    FieldValue::F64(n) => serde_json::json!(n),
554                    FieldValue::Bytes(b) => {
555                        use base64::Engine;
556                        serde_json::Value::String(
557                            base64::engine::general_purpose::STANDARD.encode(b),
558                        )
559                    }
560                    FieldValue::SparseVector { indices, values } => {
561                        serde_json::json!({
562                            "indices": indices,
563                            "values": values
564                        })
565                    }
566                    FieldValue::DenseVector(values) => {
567                        serde_json::json!(values)
568                    }
569                };
570                field_values_map
571                    .entry(*field)
572                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
573                    .2
574                    .push(json_value);
575            }
576        }
577
578        // Convert to JSON object, using arrays for multi fields or when multiple values exist
579        let mut map = serde_json::Map::new();
580        for (_field, (name, is_multi, values)) in field_values_map {
581            let json_value = if is_multi || values.len() > 1 {
582                serde_json::Value::Array(values)
583            } else {
584                values.into_iter().next().unwrap()
585            };
586            map.insert(name, json_value);
587        }
588
589        serde_json::Value::Object(map)
590    }
591
592    /// Create a Document from a JSON object using field names from schema
593    ///
594    /// Supports:
595    /// - String values -> Text fields
596    /// - Number values -> U64/I64/F64 fields (based on schema type)
597    /// - Array values -> Multiple values for the same field (multifields)
598    ///
599    /// Unknown fields (not in schema) are silently ignored.
600    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
601        let obj = json.as_object()?;
602        let mut doc = Document::new();
603
604        for (key, value) in obj {
605            if let Some(field) = schema.get_field(key) {
606                let field_entry = schema.get_field_entry(field)?;
607                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
608            }
609        }
610
611        Some(doc)
612    }
613
614    /// Helper to add a JSON value to a document, handling type conversion
615    fn add_json_value(
616        doc: &mut Document,
617        field: Field,
618        field_type: &FieldType,
619        value: &serde_json::Value,
620    ) {
621        match value {
622            serde_json::Value::String(s) => {
623                if matches!(field_type, FieldType::Text) {
624                    doc.add_text(field, s.clone());
625                }
626            }
627            serde_json::Value::Number(n) => {
628                match field_type {
629                    FieldType::I64 => {
630                        if let Some(i) = n.as_i64() {
631                            doc.add_i64(field, i);
632                        }
633                    }
634                    FieldType::U64 => {
635                        if let Some(u) = n.as_u64() {
636                            doc.add_u64(field, u);
637                        } else if let Some(i) = n.as_i64() {
638                            // Allow positive i64 as u64
639                            if i >= 0 {
640                                doc.add_u64(field, i as u64);
641                            }
642                        }
643                    }
644                    FieldType::F64 => {
645                        if let Some(f) = n.as_f64() {
646                            doc.add_f64(field, f);
647                        }
648                    }
649                    _ => {}
650                }
651            }
652            // Handle arrays (multifields) - add each element separately
653            serde_json::Value::Array(arr) => {
654                for item in arr {
655                    Self::add_json_value(doc, field, field_type, item);
656                }
657            }
658            // Handle sparse vector objects
659            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
660                if let (Some(indices_val), Some(values_val)) =
661                    (obj.get("indices"), obj.get("values"))
662                {
663                    let indices: Vec<u32> = indices_val
664                        .as_array()
665                        .map(|arr| {
666                            arr.iter()
667                                .filter_map(|v| v.as_u64().map(|n| n as u32))
668                                .collect()
669                        })
670                        .unwrap_or_default();
671                    let values: Vec<f32> = values_val
672                        .as_array()
673                        .map(|arr| {
674                            arr.iter()
675                                .filter_map(|v| v.as_f64().map(|n| n as f32))
676                                .collect()
677                        })
678                        .unwrap_or_default();
679                    if indices.len() == values.len() {
680                        doc.add_sparse_vector(field, indices, values);
681                    }
682                }
683            }
684            serde_json::Value::Object(_) => {}
685            _ => {}
686        }
687    }
688}
689
#[cfg(test)]
mod tests {
    use super::*;

    /// Field names resolve to the ids handed out by the builder.
    #[test]
    fn test_schema_builder() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", true, true);
        let body = schema_builder.add_text_field("body", true, false);
        let count = schema_builder.add_u64_field("count", true, true);
        let schema = schema_builder.build();

        let expectations = [
            ("title", Some(title)),
            ("body", Some(body)),
            ("count", Some(count)),
            ("nonexistent", None),
        ];
        for &(name, expected) in expectations.iter() {
            assert_eq!(schema.get_field(name), expected);
        }
    }

    /// Values added to a document can be read back through typed accessors.
    #[test]
    fn test_document() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", true, true);
        let count = schema_builder.add_u64_field("count", true, true);
        let _schema = schema_builder.build();

        let mut doc = Document::new();
        doc.add_text(title, "Hello World");
        doc.add_u64(count, 42);

        let stored_title = doc.get_first(title).unwrap();
        assert_eq!(stored_title.as_text(), Some("Hello World"));
        let stored_count = doc.get_first(count).unwrap();
        assert_eq!(stored_count.as_u64(), Some(42));
    }

    /// A document survives a serde JSON round trip.
    #[test]
    fn test_document_serialization() {
        let mut schema_builder = Schema::builder();
        let title = schema_builder.add_text_field("title", true, true);
        let count = schema_builder.add_u64_field("count", true, true);
        let _schema = schema_builder.build();

        let mut original = Document::new();
        original.add_text(title, "Hello World");
        original.add_u64(count, 42);

        // Serialize
        let serialized = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", serialized);

        // Deserialize
        let restored: Document = serde_json::from_str(&serialized).unwrap();
        assert_eq!(
            restored.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );
        let restored_title = restored.get_first(title).unwrap();
        assert_eq!(restored_title.as_text(), Some("Hello World"));
        let restored_count = restored.get_first(count).unwrap();
        assert_eq!(restored_count.as_u64(), Some(42));
    }

    /// Several values for one field are kept in order and exported as an array.
    #[test]
    fn test_multivalue_field() {
        let mut schema_builder = Schema::builder();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Create document with multiple values for the same field
        let mut doc = Document::new();
        doc.add_text(uris, "one");
        doc.add_text(uris, "two");
        doc.add_text(title, "Test Document");

        // Verify get_first returns the first value
        let first_uri = doc.get_first(uris).unwrap();
        assert_eq!(first_uri.as_text(), Some("one"));

        // Verify get_all returns all values
        let collected: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0].as_text(), Some("one"));
        assert_eq!(collected[1].as_text(), Some("two"));

        // Verify to_json returns array for multi-value field
        let exported = doc.to_json(&schema);
        let uris_value = exported.get("uris").unwrap();
        assert!(uris_value.is_array(), "Multi-value field should be an array");
        let uri_items = uris_value.as_array().unwrap();
        assert_eq!(uri_items.len(), 2);
        assert_eq!(uri_items[0].as_str(), Some("one"));
        assert_eq!(uri_items[1].as_str(), Some("two"));

        // Verify single-value field is NOT an array
        let title_value = exported.get("title").unwrap();
        assert!(
            title_value.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_value.as_str(), Some("Test Document"));
    }

    /// A JSON array becomes several values of the same field.
    #[test]
    fn test_multivalue_from_json() {
        let mut schema_builder = Schema::builder();
        let uris = schema_builder.add_text_field("uris", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Create JSON with array value
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        // Parse from JSON
        let doc = Document::from_json(&input, &schema).unwrap();

        // Verify all values are present
        let collected: Vec<_> = doc.get_all(uris).collect();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0].as_text(), Some("one"));
        assert_eq!(collected[1].as_text(), Some("two"));

        // Verify single value
        let parsed_title = doc.get_first(title).unwrap();
        assert_eq!(parsed_title.as_text(), Some("Test Document"));

        // Verify roundtrip: to_json should produce equivalent JSON
        let exported = doc.to_json(&schema);
        let uri_items = exported.get("uris").unwrap().as_array().unwrap();
        assert_eq!(uri_items.len(), 2);
        assert_eq!(uri_items[0].as_str(), Some("one"));
        assert_eq!(uri_items[1].as_str(), Some("two"));
    }

    /// Fields marked as `multi` are always exported as arrays, even when they
    /// hold only one value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut schema_builder = Schema::builder();
        let uris = schema_builder.add_text_field("uris", true, true);
        schema_builder.set_multi(uris, true); // Mark as multi
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Verify the multi attribute is set
        let uris_entry = schema.get_field_entry(uris).unwrap();
        assert!(uris_entry.multi);
        let title_entry = schema.get_field_entry(title).unwrap();
        assert!(!title_entry.multi);

        // Create document with single value for multi field
        let mut doc = Document::new();
        doc.add_text(uris, "only_one");
        doc.add_text(title, "Test Document");

        // Verify to_json returns array for multi field even with single value
        let exported = doc.to_json(&schema);

        let uris_value = exported.get("uris").unwrap();
        assert!(
            uris_value.is_array(),
            "Multi field should be array even with single value"
        );
        let uri_items = uris_value.as_array().unwrap();
        assert_eq!(uri_items.len(), 1);
        assert_eq!(uri_items[0].as_str(), Some("only_one"));

        // Verify non-multi field with single value is NOT an array
        let title_value = exported.get("title").unwrap();
        assert!(
            title_value.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_value.as_str(), Some("Test Document"));
    }

    /// Sparse vectors are stored, exported as objects and parsed back.
    #[test]
    fn test_sparse_vector_field() {
        let mut schema_builder = Schema::builder();
        let embedding = schema_builder.add_sparse_vector_field("embedding", true, true);
        let title = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        assert_eq!(schema.get_field("embedding"), Some(embedding));
        let embedding_entry = schema.get_field_entry(embedding).unwrap();
        assert_eq!(embedding_entry.field_type, FieldType::SparseVector);

        // Create document with sparse vector
        let mut doc = Document::new();
        doc.add_sparse_vector(embedding, vec![0, 5, 10], vec![1.0, 2.5, 0.5]);
        doc.add_text(title, "Test Document");

        // Verify accessor
        let stored = doc.get_first(embedding).unwrap();
        let (indices, values) = stored.as_sparse_vector().unwrap();
        assert_eq!(indices, &[0, 5, 10]);
        assert_eq!(values, &[1.0, 2.5, 0.5]);

        // Verify JSON roundtrip
        let exported = doc.to_json(&schema);
        let embedding_value = exported.get("embedding").unwrap();
        assert!(embedding_value.is_object());
        let exported_indices = embedding_value.get("indices").unwrap().as_array().unwrap();
        assert_eq!(exported_indices.len(), 3);

        // Parse back from JSON
        let reparsed = Document::from_json(&exported, &schema).unwrap();
        let reparsed_value = reparsed.get_first(embedding).unwrap();
        let (indices2, values2) = reparsed_value.as_sparse_vector().unwrap();
        assert_eq!(indices2, &[0, 5, 10]);
        assert!((values2[0] - 1.0).abs() < 1e-6);
        assert!((values2[1] - 2.5).abs() < 1e-6);
        assert!((values2[2] - 0.5).abs() < 1e-6);
    }
}