hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
/// Field identifier
///
/// Wraps the zero-based position of a field inside a [`Schema`]. The
/// [`SchemaBuilder`] hands these out in insertion order (the first field added
/// is `Field(0)`), and [`Schema`] uses the wrapped value to index its field list.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
9
/// Types of fields supported
///
/// Every variant carries an explicit `#[serde(rename = ...)]`, so the
/// serialized name of a variant is exactly the lowercase string on its attribute.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FieldType {
    /// Text field - tokenized and indexed
    #[serde(rename = "text")]
    Text,
    /// Unsigned 64-bit integer
    #[serde(rename = "u64")]
    U64,
    /// Signed 64-bit integer
    #[serde(rename = "i64")]
    I64,
    /// 64-bit floating point
    #[serde(rename = "f64")]
    F64,
    /// Raw bytes (not tokenized); the builder always registers these as not indexed
    #[serde(rename = "bytes")]
    Bytes,
    /// Sparse vector field - indexed as inverted posting lists with quantized weights
    #[serde(rename = "sparse_vector")]
    SparseVector,
    /// Dense vector field - indexed for ANN search. The algorithm is selected per
    /// field through `DenseVectorConfig::index_type` (RaBitQ unless configured otherwise)
    #[serde(rename = "dense_vector")]
    DenseVector,
    /// JSON field - arbitrary JSON data, stored but not indexed
    #[serde(rename = "json")]
    Json,
}
38
/// Field options
///
/// One entry per schema field. Entries are created by [`SchemaBuilder`] and
/// looked up through [`Schema::get_field_entry`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldEntry {
    /// Field name; this is the key used by [`Schema::get_field`] and by the
    /// JSON conversions on [`Document`]
    pub name: String,
    /// Value type the field holds
    pub field_type: FieldType,
    /// The `indexed` flag supplied when the field was added to the builder
    /// (always `false` for bytes and JSON fields)
    pub indexed: bool,
    /// The `stored` flag supplied when the field was added to the builder
    pub stored: bool,
    /// Name of the tokenizer to use for this field (for text fields)
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    #[serde(default)]
    pub multi: bool,
    /// Configuration for sparse vector fields (index size, weight quantization)
    ///
    /// Omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
    /// Configuration for dense vector fields (dimension, quantization)
    ///
    /// Omitted from the serialized form when `None`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dense_vector_config: Option<DenseVectorConfig>,
}
58
/// Vector index algorithm type
///
/// `RaBitQ` is the `Default` variant, so a [`DenseVectorConfig`] deserialized
/// without an `index_type` key falls back to it.
///
/// NOTE(review): with `rename_all = "snake_case"` serde derives the wire names
/// from the variant identifiers by inserting `_` before every interior
/// uppercase letter, which should yield `"ra_bit_q"`, `"ivf_ra_bit_q"` and
/// `"sca_n_n"` rather than `"rabitq"` / `"scann"` - confirm these are the
/// spellings existing schema files use before relying on either form.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VectorIndexType {
    /// RaBitQ - binary quantization, good for small datasets (<100K)
    #[default]
    RaBitQ,
    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
    IvfRaBitQ,
    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
    ScaNN,
}
71
/// Configuration for dense vector fields using RaBitQ, IVF-RaBitQ, or ScaNN
///
/// Only `dim` is mandatory when deserializing; every other key has a serde
/// default (`index_type` = RaBitQ, `store_raw` = true, `nprobe` = 32, the
/// optional paths and `mrl_dim` = `None`).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DenseVectorConfig {
    /// Dimensionality of vectors
    pub dim: usize,
    /// Vector index algorithm to use
    #[serde(default)]
    pub index_type: VectorIndexType,
    /// Whether to store raw vectors for re-ranking (increases storage but improves accuracy)
    #[serde(default = "default_store_raw")]
    pub store_raw: bool,
    /// Path to pre-trained coarse centroids file for IVF indexes
    /// Required for IVF-RaBitQ and ScaNN
    ///
    /// The requirement is a convention only: the type itself allows `None`
    /// for any `index_type`.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub coarse_centroids_path: Option<String>,
    /// Path to pre-trained PQ codebook file for ScaNN
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub pq_codebook_path: Option<String>,
    /// Number of clusters to probe during search (default: 32)
    #[serde(default = "default_nprobe")]
    pub nprobe: usize,
    /// Matryoshka/MRL dimension for index - use only first mrl_dim coordinates for indexing
    /// Full vectors are stored but index uses truncated vectors for faster search
    /// Must be <= dim. If None, uses full dim.
    ///
    /// Neither deserialization nor `with_mrl_dim` checks the `<= dim` rule, so
    /// consumers should not assume it holds for the raw field value.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mrl_dim: Option<usize>,
}
99
/// Serde default for `DenseVectorConfig::store_raw`: raw vectors are kept
/// unless the configuration explicitly turns them off.
fn default_store_raw() -> bool {
    const STORE_RAW_BY_DEFAULT: bool = true;
    STORE_RAW_BY_DEFAULT
}
103
/// Serde default for `DenseVectorConfig::nprobe`: the number of clusters
/// probed per search when the configuration does not specify one.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
107
108impl DenseVectorConfig {
109    pub fn new(dim: usize) -> Self {
110        Self {
111            dim,
112            index_type: VectorIndexType::RaBitQ,
113            store_raw: true,
114            coarse_centroids_path: None,
115            pq_codebook_path: None,
116            nprobe: 32,
117            mrl_dim: None,
118        }
119    }
120
121    pub fn with_ivf(dim: usize, centroids_path: String, nprobe: usize) -> Self {
122        Self {
123            dim,
124            index_type: VectorIndexType::IvfRaBitQ,
125            store_raw: true,
126            coarse_centroids_path: Some(centroids_path),
127            pq_codebook_path: None,
128            nprobe,
129            mrl_dim: None,
130        }
131    }
132
133    /// Create ScaNN configuration with pre-trained centroids and codebook
134    pub fn with_scann(
135        dim: usize,
136        centroids_path: String,
137        codebook_path: String,
138        nprobe: usize,
139    ) -> Self {
140        Self {
141            dim,
142            index_type: VectorIndexType::ScaNN,
143            store_raw: true,
144            coarse_centroids_path: Some(centroids_path),
145            pq_codebook_path: Some(codebook_path),
146            nprobe,
147            mrl_dim: None,
148        }
149    }
150
151    pub fn without_raw(dim: usize) -> Self {
152        Self {
153            dim,
154            index_type: VectorIndexType::RaBitQ,
155            store_raw: false,
156            coarse_centroids_path: None,
157            pq_codebook_path: None,
158            nprobe: 32,
159            mrl_dim: None,
160        }
161    }
162
163    /// Set matryoshka/MRL dimension for index truncation
164    pub fn with_mrl_dim(mut self, mrl_dim: usize) -> Self {
165        self.mrl_dim = Some(mrl_dim);
166        self
167    }
168
169    /// Get the effective dimension for indexing (mrl_dim if set, otherwise dim)
170    pub fn index_dim(&self) -> usize {
171        self.mrl_dim.unwrap_or(self.dim)
172    }
173
174    /// Check if this config uses IVF (has coarse centroids)
175    pub fn uses_ivf(&self) -> bool {
176        self.coarse_centroids_path.is_some()
177    }
178
179    /// Check if this config uses ScaNN
180    pub fn uses_scann(&self) -> bool {
181        self.index_type == VectorIndexType::ScaNN
182    }
183}
184
185use super::query_field_router::QueryRouterRule;
186
/// Schema defining document structure
///
/// Normally produced by [`SchemaBuilder::build`]. Both the field list and the
/// name lookup table are part of the serialized form.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Schema {
    /// Field entries; a `Field(n)` identifier refers to `fields[n]`
    fields: Vec<FieldEntry>,
    /// Lookup from field name to identifier, filled from `fields` by the builder
    name_to_field: HashMap<String, Field>,
    /// Default fields for query parsing (when no field is specified)
    #[serde(default)]
    default_fields: Vec<Field>,
    /// Query router rules for routing queries to specific fields based on regex patterns
    #[serde(default)]
    query_routers: Vec<QueryRouterRule>,
}
199
200impl Schema {
201    pub fn builder() -> SchemaBuilder {
202        SchemaBuilder::default()
203    }
204
205    pub fn get_field(&self, name: &str) -> Option<Field> {
206        self.name_to_field.get(name).copied()
207    }
208
209    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
210        self.fields.get(field.0 as usize)
211    }
212
213    pub fn get_field_name(&self, field: Field) -> Option<&str> {
214        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
215    }
216
217    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
218        self.fields
219            .iter()
220            .enumerate()
221            .map(|(i, e)| (Field(i as u32), e))
222    }
223
224    pub fn num_fields(&self) -> usize {
225        self.fields.len()
226    }
227
228    /// Get the default fields for query parsing
229    pub fn default_fields(&self) -> &[Field] {
230        &self.default_fields
231    }
232
233    /// Set default fields (used by builder)
234    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
235        self.default_fields = fields;
236    }
237
238    /// Get the query router rules
239    pub fn query_routers(&self) -> &[QueryRouterRule] {
240        &self.query_routers
241    }
242
243    /// Set query router rules
244    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
245        self.query_routers = rules;
246    }
247}
248
/// Builder for Schema
///
/// Fields receive their [`Field`] identifier in the order they are added.
#[derive(Debug, Default)]
pub struct SchemaBuilder {
    /// Entries added so far; the index of an entry is its field identifier
    fields: Vec<FieldEntry>,
    /// Default query fields by name; resolved to identifiers in `build`
    default_fields: Vec<String>,
    /// Query router rules handed to the built schema unchanged
    query_routers: Vec<QueryRouterRule>,
}
256
257impl SchemaBuilder {
258    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
259        self.add_field_with_tokenizer(
260            name,
261            FieldType::Text,
262            indexed,
263            stored,
264            Some("default".to_string()),
265        )
266    }
267
268    pub fn add_text_field_with_tokenizer(
269        &mut self,
270        name: &str,
271        indexed: bool,
272        stored: bool,
273        tokenizer: &str,
274    ) -> Field {
275        self.add_field_with_tokenizer(
276            name,
277            FieldType::Text,
278            indexed,
279            stored,
280            Some(tokenizer.to_string()),
281        )
282    }
283
284    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
285        self.add_field(name, FieldType::U64, indexed, stored)
286    }
287
288    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
289        self.add_field(name, FieldType::I64, indexed, stored)
290    }
291
292    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
293        self.add_field(name, FieldType::F64, indexed, stored)
294    }
295
296    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
297        self.add_field(name, FieldType::Bytes, false, stored)
298    }
299
300    /// Add a JSON field for storing arbitrary JSON data
301    ///
302    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
303    /// (objects, arrays, strings, numbers, booleans, null).
304    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
305        self.add_field(name, FieldType::Json, false, stored)
306    }
307
308    /// Add a sparse vector field with default configuration
309    ///
310    /// Sparse vectors are indexed as inverted posting lists where each dimension
311    /// becomes a "term" and documents have quantized weights for each dimension.
312    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
313        self.add_sparse_vector_field_with_config(
314            name,
315            indexed,
316            stored,
317            crate::structures::SparseVectorConfig::default(),
318        )
319    }
320
321    /// Add a sparse vector field with custom configuration
322    ///
323    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
324    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
325    pub fn add_sparse_vector_field_with_config(
326        &mut self,
327        name: &str,
328        indexed: bool,
329        stored: bool,
330        config: crate::structures::SparseVectorConfig,
331    ) -> Field {
332        let field = Field(self.fields.len() as u32);
333        self.fields.push(FieldEntry {
334            name: name.to_string(),
335            field_type: FieldType::SparseVector,
336            indexed,
337            stored,
338            tokenizer: None,
339            multi: false,
340            sparse_vector_config: Some(config),
341            dense_vector_config: None,
342        });
343        field
344    }
345
346    /// Set sparse vector configuration for an existing field
347    pub fn set_sparse_vector_config(
348        &mut self,
349        field: Field,
350        config: crate::structures::SparseVectorConfig,
351    ) {
352        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
353            entry.sparse_vector_config = Some(config);
354        }
355    }
356
357    /// Add a dense vector field with default configuration
358    ///
359    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
360    /// The dimension must be specified as it determines the quantization structure.
361    pub fn add_dense_vector_field(
362        &mut self,
363        name: &str,
364        dim: usize,
365        indexed: bool,
366        stored: bool,
367    ) -> Field {
368        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
369    }
370
371    /// Add a dense vector field with custom configuration
372    pub fn add_dense_vector_field_with_config(
373        &mut self,
374        name: &str,
375        indexed: bool,
376        stored: bool,
377        config: DenseVectorConfig,
378    ) -> Field {
379        let field = Field(self.fields.len() as u32);
380        self.fields.push(FieldEntry {
381            name: name.to_string(),
382            field_type: FieldType::DenseVector,
383            indexed,
384            stored,
385            tokenizer: None,
386            multi: false,
387            sparse_vector_config: None,
388            dense_vector_config: Some(config),
389        });
390        field
391    }
392
393    fn add_field(
394        &mut self,
395        name: &str,
396        field_type: FieldType,
397        indexed: bool,
398        stored: bool,
399    ) -> Field {
400        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
401    }
402
403    fn add_field_with_tokenizer(
404        &mut self,
405        name: &str,
406        field_type: FieldType,
407        indexed: bool,
408        stored: bool,
409        tokenizer: Option<String>,
410    ) -> Field {
411        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
412    }
413
414    fn add_field_full(
415        &mut self,
416        name: &str,
417        field_type: FieldType,
418        indexed: bool,
419        stored: bool,
420        tokenizer: Option<String>,
421        multi: bool,
422    ) -> Field {
423        let field = Field(self.fields.len() as u32);
424        self.fields.push(FieldEntry {
425            name: name.to_string(),
426            field_type,
427            indexed,
428            stored,
429            tokenizer,
430            multi,
431            sparse_vector_config: None,
432            dense_vector_config: None,
433        });
434        field
435    }
436
437    /// Set the multi attribute on the last added field
438    pub fn set_multi(&mut self, field: Field, multi: bool) {
439        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
440            entry.multi = multi;
441        }
442    }
443
444    /// Set default fields by name
445    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
446        self.default_fields = field_names;
447    }
448
449    /// Set query router rules
450    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
451        self.query_routers = rules;
452    }
453
454    pub fn build(self) -> Schema {
455        let mut name_to_field = HashMap::new();
456        for (i, entry) in self.fields.iter().enumerate() {
457            name_to_field.insert(entry.name.clone(), Field(i as u32));
458        }
459
460        // Resolve default field names to Field IDs
461        let default_fields: Vec<Field> = self
462            .default_fields
463            .iter()
464            .filter_map(|name| name_to_field.get(name).copied())
465            .collect();
466
467        Schema {
468            fields: self.fields,
469            name_to_field,
470            default_fields,
471            query_routers: self.query_routers,
472        }
473    }
474}
475
/// Value that can be stored in a field
///
/// The variants mirror [`FieldType`] one-to-one and use the same serde names.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FieldValue {
    /// Text value
    #[serde(rename = "text")]
    Text(String),
    /// Unsigned 64-bit integer
    #[serde(rename = "u64")]
    U64(u64),
    /// Signed 64-bit integer
    #[serde(rename = "i64")]
    I64(i64),
    /// 64-bit floating point
    #[serde(rename = "f64")]
    F64(f64),
    /// Raw bytes
    #[serde(rename = "bytes")]
    Bytes(Vec<u8>),
    /// Sparse vector: list of (dimension_id, weight) pairs
    #[serde(rename = "sparse_vector")]
    SparseVector(Vec<(u32, f32)>),
    /// Dense vector: float32 values
    #[serde(rename = "dense_vector")]
    DenseVector(Vec<f32>),
    /// Arbitrary JSON value
    #[serde(rename = "json")]
    Json(serde_json::Value),
}
499
500impl FieldValue {
501    pub fn as_text(&self) -> Option<&str> {
502        match self {
503            FieldValue::Text(s) => Some(s),
504            _ => None,
505        }
506    }
507
508    pub fn as_u64(&self) -> Option<u64> {
509        match self {
510            FieldValue::U64(v) => Some(*v),
511            _ => None,
512        }
513    }
514
515    pub fn as_i64(&self) -> Option<i64> {
516        match self {
517            FieldValue::I64(v) => Some(*v),
518            _ => None,
519        }
520    }
521
522    pub fn as_f64(&self) -> Option<f64> {
523        match self {
524            FieldValue::F64(v) => Some(*v),
525            _ => None,
526        }
527    }
528
529    pub fn as_bytes(&self) -> Option<&[u8]> {
530        match self {
531            FieldValue::Bytes(b) => Some(b),
532            _ => None,
533        }
534    }
535
536    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
537        match self {
538            FieldValue::SparseVector(entries) => Some(entries),
539            _ => None,
540        }
541    }
542
543    pub fn as_dense_vector(&self) -> Option<&[f32]> {
544        match self {
545            FieldValue::DenseVector(v) => Some(v),
546            _ => None,
547        }
548    }
549
550    pub fn as_json(&self) -> Option<&serde_json::Value> {
551        match self {
552            FieldValue::Json(v) => Some(v),
553            _ => None,
554        }
555    }
556}
557
/// A document to be indexed
///
/// A flat list of `(field, value)` pairs kept in insertion order. The same
/// field may appear several times; that is how multi-valued fields are
/// represented.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Document {
    /// `(field, value)` pairs in the order they were added
    field_values: Vec<(Field, FieldValue)>,
}
563
564impl Document {
565    pub fn new() -> Self {
566        Self::default()
567    }
568
569    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
570        self.field_values
571            .push((field, FieldValue::Text(value.into())));
572    }
573
574    pub fn add_u64(&mut self, field: Field, value: u64) {
575        self.field_values.push((field, FieldValue::U64(value)));
576    }
577
578    pub fn add_i64(&mut self, field: Field, value: i64) {
579        self.field_values.push((field, FieldValue::I64(value)));
580    }
581
582    pub fn add_f64(&mut self, field: Field, value: f64) {
583        self.field_values.push((field, FieldValue::F64(value)));
584    }
585
586    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
587        self.field_values.push((field, FieldValue::Bytes(value)));
588    }
589
590    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
591        self.field_values
592            .push((field, FieldValue::SparseVector(entries)));
593    }
594
595    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
596        self.field_values
597            .push((field, FieldValue::DenseVector(values)));
598    }
599
600    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
601        self.field_values.push((field, FieldValue::Json(value)));
602    }
603
604    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
605        self.field_values
606            .iter()
607            .find(|(f, _)| *f == field)
608            .map(|(_, v)| v)
609    }
610
611    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
612        self.field_values
613            .iter()
614            .filter(move |(f, _)| *f == field)
615            .map(|(_, v)| v)
616    }
617
618    pub fn field_values(&self) -> &[(Field, FieldValue)] {
619        &self.field_values
620    }
621
622    /// Convert document to a JSON object using field names from schema
623    ///
624    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
625    /// Other fields with multiple values are also returned as arrays.
626    /// Fields with a single value (and not marked multi) are returned as scalar values.
627    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
628        use std::collections::HashMap;
629
630        // Group values by field, keeping track of field entry for multi check
631        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
632            HashMap::new();
633
634        for (field, value) in &self.field_values {
635            if let Some(entry) = schema.get_field_entry(*field) {
636                let json_value = match value {
637                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
638                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
639                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
640                    FieldValue::F64(n) => serde_json::json!(n),
641                    FieldValue::Bytes(b) => {
642                        use base64::Engine;
643                        serde_json::Value::String(
644                            base64::engine::general_purpose::STANDARD.encode(b),
645                        )
646                    }
647                    FieldValue::SparseVector(entries) => {
648                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
649                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
650                        serde_json::json!({
651                            "indices": indices,
652                            "values": values
653                        })
654                    }
655                    FieldValue::DenseVector(values) => {
656                        serde_json::json!(values)
657                    }
658                    FieldValue::Json(v) => v.clone(),
659                };
660                field_values_map
661                    .entry(*field)
662                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
663                    .2
664                    .push(json_value);
665            }
666        }
667
668        // Convert to JSON object, using arrays for multi fields or when multiple values exist
669        let mut map = serde_json::Map::new();
670        for (_field, (name, is_multi, values)) in field_values_map {
671            let json_value = if is_multi || values.len() > 1 {
672                serde_json::Value::Array(values)
673            } else {
674                values.into_iter().next().unwrap()
675            };
676            map.insert(name, json_value);
677        }
678
679        serde_json::Value::Object(map)
680    }
681
682    /// Create a Document from a JSON object using field names from schema
683    ///
684    /// Supports:
685    /// - String values -> Text fields
686    /// - Number values -> U64/I64/F64 fields (based on schema type)
687    /// - Array values -> Multiple values for the same field (multifields)
688    ///
689    /// Unknown fields (not in schema) are silently ignored.
690    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
691        let obj = json.as_object()?;
692        let mut doc = Document::new();
693
694        for (key, value) in obj {
695            if let Some(field) = schema.get_field(key) {
696                let field_entry = schema.get_field_entry(field)?;
697                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
698            }
699        }
700
701        Some(doc)
702    }
703
704    /// Helper to add a JSON value to a document, handling type conversion
705    fn add_json_value(
706        doc: &mut Document,
707        field: Field,
708        field_type: &FieldType,
709        value: &serde_json::Value,
710    ) {
711        match value {
712            serde_json::Value::String(s) => {
713                if matches!(field_type, FieldType::Text) {
714                    doc.add_text(field, s.clone());
715                }
716            }
717            serde_json::Value::Number(n) => {
718                match field_type {
719                    FieldType::I64 => {
720                        if let Some(i) = n.as_i64() {
721                            doc.add_i64(field, i);
722                        }
723                    }
724                    FieldType::U64 => {
725                        if let Some(u) = n.as_u64() {
726                            doc.add_u64(field, u);
727                        } else if let Some(i) = n.as_i64() {
728                            // Allow positive i64 as u64
729                            if i >= 0 {
730                                doc.add_u64(field, i as u64);
731                            }
732                        }
733                    }
734                    FieldType::F64 => {
735                        if let Some(f) = n.as_f64() {
736                            doc.add_f64(field, f);
737                        }
738                    }
739                    _ => {}
740                }
741            }
742            // Handle arrays (multifields) - add each element separately
743            serde_json::Value::Array(arr) => {
744                for item in arr {
745                    Self::add_json_value(doc, field, field_type, item);
746                }
747            }
748            // Handle sparse vector objects
749            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
750                if let (Some(indices_val), Some(values_val)) =
751                    (obj.get("indices"), obj.get("values"))
752                {
753                    let indices: Vec<u32> = indices_val
754                        .as_array()
755                        .map(|arr| {
756                            arr.iter()
757                                .filter_map(|v| v.as_u64().map(|n| n as u32))
758                                .collect()
759                        })
760                        .unwrap_or_default();
761                    let values: Vec<f32> = values_val
762                        .as_array()
763                        .map(|arr| {
764                            arr.iter()
765                                .filter_map(|v| v.as_f64().map(|n| n as f32))
766                                .collect()
767                        })
768                        .unwrap_or_default();
769                    if indices.len() == values.len() {
770                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
771                        doc.add_sparse_vector(field, entries);
772                    }
773                }
774            }
775            // Handle JSON fields - accept any value directly
776            _ if matches!(field_type, FieldType::Json) => {
777                doc.add_json(field, value.clone());
778            }
779            serde_json::Value::Object(_) => {}
780            _ => {}
781        }
782    }
783}
784
785#[cfg(test)]
786mod tests {
787    use super::*;
788
789    #[test]
790    fn test_schema_builder() {
791        let mut builder = Schema::builder();
792        let title = builder.add_text_field("title", true, true);
793        let body = builder.add_text_field("body", true, false);
794        let count = builder.add_u64_field("count", true, true);
795        let schema = builder.build();
796
797        assert_eq!(schema.get_field("title"), Some(title));
798        assert_eq!(schema.get_field("body"), Some(body));
799        assert_eq!(schema.get_field("count"), Some(count));
800        assert_eq!(schema.get_field("nonexistent"), None);
801    }
802
803    #[test]
804    fn test_document() {
805        let mut builder = Schema::builder();
806        let title = builder.add_text_field("title", true, true);
807        let count = builder.add_u64_field("count", true, true);
808        let _schema = builder.build();
809
810        let mut doc = Document::new();
811        doc.add_text(title, "Hello World");
812        doc.add_u64(count, 42);
813
814        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
815        assert_eq!(doc.get_first(count).unwrap().as_u64(), Some(42));
816    }
817
818    #[test]
819    fn test_document_serialization() {
820        let mut builder = Schema::builder();
821        let title = builder.add_text_field("title", true, true);
822        let count = builder.add_u64_field("count", true, true);
823        let _schema = builder.build();
824
825        let mut doc = Document::new();
826        doc.add_text(title, "Hello World");
827        doc.add_u64(count, 42);
828
829        // Serialize
830        let json = serde_json::to_string(&doc).unwrap();
831        println!("Serialized doc: {}", json);
832
833        // Deserialize
834        let doc2: Document = serde_json::from_str(&json).unwrap();
835        assert_eq!(
836            doc2.field_values().len(),
837            2,
838            "Should have 2 field values after deserialization"
839        );
840        assert_eq!(
841            doc2.get_first(title).unwrap().as_text(),
842            Some("Hello World")
843        );
844        assert_eq!(doc2.get_first(count).unwrap().as_u64(), Some(42));
845    }
846
847    #[test]
848    fn test_multivalue_field() {
849        let mut builder = Schema::builder();
850        let uris = builder.add_text_field("uris", true, true);
851        let title = builder.add_text_field("title", true, true);
852        let schema = builder.build();
853
854        // Create document with multiple values for the same field
855        let mut doc = Document::new();
856        doc.add_text(uris, "one");
857        doc.add_text(uris, "two");
858        doc.add_text(title, "Test Document");
859
860        // Verify get_first returns the first value
861        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));
862
863        // Verify get_all returns all values
864        let all_uris: Vec<_> = doc.get_all(uris).collect();
865        assert_eq!(all_uris.len(), 2);
866        assert_eq!(all_uris[0].as_text(), Some("one"));
867        assert_eq!(all_uris[1].as_text(), Some("two"));
868
869        // Verify to_json returns array for multi-value field
870        let json = doc.to_json(&schema);
871        let uris_json = json.get("uris").unwrap();
872        assert!(uris_json.is_array(), "Multi-value field should be an array");
873        let uris_arr = uris_json.as_array().unwrap();
874        assert_eq!(uris_arr.len(), 2);
875        assert_eq!(uris_arr[0].as_str(), Some("one"));
876        assert_eq!(uris_arr[1].as_str(), Some("two"));
877
878        // Verify single-value field is NOT an array
879        let title_json = json.get("title").unwrap();
880        assert!(
881            title_json.is_string(),
882            "Single-value field should be a string"
883        );
884        assert_eq!(title_json.as_str(), Some("Test Document"));
885    }
886
887    #[test]
888    fn test_multivalue_from_json() {
889        let mut builder = Schema::builder();
890        let uris = builder.add_text_field("uris", true, true);
891        let title = builder.add_text_field("title", true, true);
892        let schema = builder.build();
893
894        // Create JSON with array value
895        let json = serde_json::json!({
896            "uris": ["one", "two"],
897            "title": "Test Document"
898        });
899
900        // Parse from JSON
901        let doc = Document::from_json(&json, &schema).unwrap();
902
903        // Verify all values are present
904        let all_uris: Vec<_> = doc.get_all(uris).collect();
905        assert_eq!(all_uris.len(), 2);
906        assert_eq!(all_uris[0].as_text(), Some("one"));
907        assert_eq!(all_uris[1].as_text(), Some("two"));
908
909        // Verify single value
910        assert_eq!(
911            doc.get_first(title).unwrap().as_text(),
912            Some("Test Document")
913        );
914
915        // Verify roundtrip: to_json should produce equivalent JSON
916        let json_out = doc.to_json(&schema);
917        let uris_out = json_out.get("uris").unwrap().as_array().unwrap();
918        assert_eq!(uris_out.len(), 2);
919        assert_eq!(uris_out[0].as_str(), Some("one"));
920        assert_eq!(uris_out[1].as_str(), Some("two"));
921    }
922
923    #[test]
924    fn test_multi_attribute_forces_array() {
925        // Test that fields marked as 'multi' are always serialized as arrays,
926        // even when they have only one value
927        let mut builder = Schema::builder();
928        let uris = builder.add_text_field("uris", true, true);
929        builder.set_multi(uris, true); // Mark as multi
930        let title = builder.add_text_field("title", true, true);
931        let schema = builder.build();
932
933        // Verify the multi attribute is set
934        assert!(schema.get_field_entry(uris).unwrap().multi);
935        assert!(!schema.get_field_entry(title).unwrap().multi);
936
937        // Create document with single value for multi field
938        let mut doc = Document::new();
939        doc.add_text(uris, "only_one");
940        doc.add_text(title, "Test Document");
941
942        // Verify to_json returns array for multi field even with single value
943        let json = doc.to_json(&schema);
944
945        let uris_json = json.get("uris").unwrap();
946        assert!(
947            uris_json.is_array(),
948            "Multi field should be array even with single value"
949        );
950        let uris_arr = uris_json.as_array().unwrap();
951        assert_eq!(uris_arr.len(), 1);
952        assert_eq!(uris_arr[0].as_str(), Some("only_one"));
953
954        // Verify non-multi field with single value is NOT an array
955        let title_json = json.get("title").unwrap();
956        assert!(
957            title_json.is_string(),
958            "Non-multi single-value field should be a string"
959        );
960        assert_eq!(title_json.as_str(), Some("Test Document"));
961    }
962
963    #[test]
964    fn test_sparse_vector_field() {
965        let mut builder = Schema::builder();
966        let embedding = builder.add_sparse_vector_field("embedding", true, true);
967        let title = builder.add_text_field("title", true, true);
968        let schema = builder.build();
969
970        assert_eq!(schema.get_field("embedding"), Some(embedding));
971        assert_eq!(
972            schema.get_field_entry(embedding).unwrap().field_type,
973            FieldType::SparseVector
974        );
975
976        // Create document with sparse vector
977        let mut doc = Document::new();
978        doc.add_sparse_vector(embedding, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
979        doc.add_text(title, "Test Document");
980
981        // Verify accessor
982        let entries = doc
983            .get_first(embedding)
984            .unwrap()
985            .as_sparse_vector()
986            .unwrap();
987        assert_eq!(entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);
988
989        // Verify JSON roundtrip
990        let json = doc.to_json(&schema);
991        let embedding_json = json.get("embedding").unwrap();
992        assert!(embedding_json.is_object());
993        assert_eq!(
994            embedding_json
995                .get("indices")
996                .unwrap()
997                .as_array()
998                .unwrap()
999                .len(),
1000            3
1001        );
1002
1003        // Parse back from JSON
1004        let doc2 = Document::from_json(&json, &schema).unwrap();
1005        let entries2 = doc2
1006            .get_first(embedding)
1007            .unwrap()
1008            .as_sparse_vector()
1009            .unwrap();
1010        assert_eq!(entries2[0].0, 0);
1011        assert!((entries2[0].1 - 1.0).abs() < 1e-6);
1012        assert_eq!(entries2[1].0, 5);
1013        assert!((entries2[1].1 - 2.5).abs() < 1e-6);
1014        assert_eq!(entries2[2].0, 10);
1015        assert!((entries2[2].1 - 0.5).abs() < 1e-6);
1016    }
1017
1018    #[test]
1019    fn test_json_field() {
1020        let mut builder = Schema::builder();
1021        let metadata = builder.add_json_field("metadata", true);
1022        let title = builder.add_text_field("title", true, true);
1023        let schema = builder.build();
1024
1025        assert_eq!(schema.get_field("metadata"), Some(metadata));
1026        assert_eq!(
1027            schema.get_field_entry(metadata).unwrap().field_type,
1028            FieldType::Json
1029        );
1030        // JSON fields are never indexed
1031        assert!(!schema.get_field_entry(metadata).unwrap().indexed);
1032        assert!(schema.get_field_entry(metadata).unwrap().stored);
1033
1034        // Create document with JSON value (object)
1035        let json_value = serde_json::json!({
1036            "author": "John Doe",
1037            "tags": ["rust", "search"],
1038            "nested": {"key": "value"}
1039        });
1040        let mut doc = Document::new();
1041        doc.add_json(metadata, json_value.clone());
1042        doc.add_text(title, "Test Document");
1043
1044        // Verify accessor
1045        let stored_json = doc.get_first(metadata).unwrap().as_json().unwrap();
1046        assert_eq!(stored_json, &json_value);
1047        assert_eq!(
1048            stored_json.get("author").unwrap().as_str(),
1049            Some("John Doe")
1050        );
1051
1052        // Verify JSON roundtrip via to_json/from_json
1053        let doc_json = doc.to_json(&schema);
1054        let metadata_out = doc_json.get("metadata").unwrap();
1055        assert_eq!(metadata_out, &json_value);
1056
1057        // Parse back from JSON
1058        let doc2 = Document::from_json(&doc_json, &schema).unwrap();
1059        let stored_json2 = doc2.get_first(metadata).unwrap().as_json().unwrap();
1060        assert_eq!(stored_json2, &json_value);
1061    }
1062
1063    #[test]
1064    fn test_json_field_various_types() {
1065        let mut builder = Schema::builder();
1066        let data = builder.add_json_field("data", true);
1067        let _schema = builder.build();
1068
1069        // Test with array
1070        let arr_value = serde_json::json!([1, 2, 3, "four", null]);
1071        let mut doc = Document::new();
1072        doc.add_json(data, arr_value.clone());
1073        assert_eq!(doc.get_first(data).unwrap().as_json().unwrap(), &arr_value);
1074
1075        // Test with string
1076        let str_value = serde_json::json!("just a string");
1077        let mut doc2 = Document::new();
1078        doc2.add_json(data, str_value.clone());
1079        assert_eq!(doc2.get_first(data).unwrap().as_json().unwrap(), &str_value);
1080
1081        // Test with number
1082        let num_value = serde_json::json!(42.5);
1083        let mut doc3 = Document::new();
1084        doc3.add_json(data, num_value.clone());
1085        assert_eq!(doc3.get_first(data).unwrap().as_json().unwrap(), &num_value);
1086
1087        // Test with null
1088        let null_value = serde_json::Value::Null;
1089        let mut doc4 = Document::new();
1090        doc4.add_json(data, null_value.clone());
1091        assert_eq!(
1092            doc4.get_first(data).unwrap().as_json().unwrap(),
1093            &null_value
1094        );
1095
1096        // Test with boolean
1097        let bool_value = serde_json::json!(true);
1098        let mut doc5 = Document::new();
1099        doc5.add_json(data, bool_value.clone());
1100        assert_eq!(
1101            doc5.get_first(data).unwrap().as_json().unwrap(),
1102            &bool_value
1103        );
1104    }
1105}