Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60}
61
62/// Position tracking mode for text fields
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum PositionMode {
66    /// Track only element ordinal for multi-valued fields (which array element)
67    /// Useful for returning which element matched without full phrase query support
68    Ordinal,
69    /// Track only token position within text (for phrase queries)
70    /// Does not track element ordinal - all positions are relative to concatenated text
71    TokenPosition,
72    /// Track both element ordinal and token position (full support)
73    /// Position format: (element_ordinal << 20) | token_position
74    Full,
75}
76
77impl PositionMode {
78    /// Whether this mode tracks element ordinals
79    pub fn tracks_ordinal(&self) -> bool {
80        matches!(self, PositionMode::Ordinal | PositionMode::Full)
81    }
82
83    /// Whether this mode tracks token positions
84    pub fn tracks_token_position(&self) -> bool {
85        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
86    }
87}
88
89/// Vector index algorithm type
90#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
91#[serde(rename_all = "snake_case")]
92pub enum VectorIndexType {
93    /// RaBitQ - binary quantization, good for small datasets (<100K)
94    #[default]
95    RaBitQ,
96    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
97    IvfRaBitQ,
98    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
99    ScaNN,
100}
101
102/// Configuration for dense vector fields using RaBitQ, IVF-RaBitQ, or ScaNN
103#[derive(Debug, Clone, Serialize, Deserialize)]
104pub struct DenseVectorConfig {
105    /// Dimensionality of vectors
106    pub dim: usize,
107    /// Vector index algorithm to use
108    #[serde(default)]
109    pub index_type: VectorIndexType,
110    /// Whether to store raw vectors for re-ranking (increases storage but improves accuracy)
111    #[serde(default = "default_store_raw")]
112    pub store_raw: bool,
113    /// Path to pre-trained coarse centroids file for IVF indexes
114    /// Required for IVF-RaBitQ and ScaNN
115    #[serde(default, skip_serializing_if = "Option::is_none")]
116    pub coarse_centroids_path: Option<String>,
117    /// Path to pre-trained PQ codebook file for ScaNN
118    #[serde(default, skip_serializing_if = "Option::is_none")]
119    pub pq_codebook_path: Option<String>,
120    /// Number of clusters to probe during search (default: 32)
121    #[serde(default = "default_nprobe")]
122    pub nprobe: usize,
123    /// Matryoshka/MRL dimension for index - use only first mrl_dim coordinates for indexing
124    /// Full vectors are stored but index uses truncated vectors for faster search
125    /// Must be <= dim. If None, uses full dim.
126    #[serde(default, skip_serializing_if = "Option::is_none")]
127    pub mrl_dim: Option<usize>,
128}
129
/// Serde default for `DenseVectorConfig::store_raw`: raw vectors are stored.
fn default_store_raw() -> bool {
    const STORE_RAW_BY_DEFAULT: bool = true;
    STORE_RAW_BY_DEFAULT
}
133
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters.
fn default_nprobe() -> usize {
    const NPROBE_BY_DEFAULT: usize = 32;
    NPROBE_BY_DEFAULT
}
137
138impl DenseVectorConfig {
139    pub fn new(dim: usize) -> Self {
140        Self {
141            dim,
142            index_type: VectorIndexType::RaBitQ,
143            store_raw: true,
144            coarse_centroids_path: None,
145            pq_codebook_path: None,
146            nprobe: 32,
147            mrl_dim: None,
148        }
149    }
150
151    pub fn with_ivf(dim: usize, centroids_path: String, nprobe: usize) -> Self {
152        Self {
153            dim,
154            index_type: VectorIndexType::IvfRaBitQ,
155            store_raw: true,
156            coarse_centroids_path: Some(centroids_path),
157            pq_codebook_path: None,
158            nprobe,
159            mrl_dim: None,
160        }
161    }
162
163    /// Create ScaNN configuration with pre-trained centroids and codebook
164    pub fn with_scann(
165        dim: usize,
166        centroids_path: String,
167        codebook_path: String,
168        nprobe: usize,
169    ) -> Self {
170        Self {
171            dim,
172            index_type: VectorIndexType::ScaNN,
173            store_raw: true,
174            coarse_centroids_path: Some(centroids_path),
175            pq_codebook_path: Some(codebook_path),
176            nprobe,
177            mrl_dim: None,
178        }
179    }
180
181    pub fn without_raw(dim: usize) -> Self {
182        Self {
183            dim,
184            index_type: VectorIndexType::RaBitQ,
185            store_raw: false,
186            coarse_centroids_path: None,
187            pq_codebook_path: None,
188            nprobe: 32,
189            mrl_dim: None,
190        }
191    }
192
193    /// Set matryoshka/MRL dimension for index truncation
194    pub fn with_mrl_dim(mut self, mrl_dim: usize) -> Self {
195        self.mrl_dim = Some(mrl_dim);
196        self
197    }
198
199    /// Get the effective dimension for indexing (mrl_dim if set, otherwise dim)
200    pub fn index_dim(&self) -> usize {
201        self.mrl_dim.unwrap_or(self.dim)
202    }
203
204    /// Check if this config uses IVF (has coarse centroids)
205    pub fn uses_ivf(&self) -> bool {
206        self.coarse_centroids_path.is_some()
207    }
208
209    /// Check if this config uses ScaNN
210    pub fn uses_scann(&self) -> bool {
211        self.index_type == VectorIndexType::ScaNN
212    }
213}
214
215use super::query_field_router::QueryRouterRule;
216
217/// Schema defining document structure
218#[derive(Debug, Clone, Default, Serialize, Deserialize)]
219pub struct Schema {
220    fields: Vec<FieldEntry>,
221    name_to_field: HashMap<String, Field>,
222    /// Default fields for query parsing (when no field is specified)
223    #[serde(default)]
224    default_fields: Vec<Field>,
225    /// Query router rules for routing queries to specific fields based on regex patterns
226    #[serde(default)]
227    query_routers: Vec<QueryRouterRule>,
228}
229
230impl Schema {
231    pub fn builder() -> SchemaBuilder {
232        SchemaBuilder::default()
233    }
234
235    pub fn get_field(&self, name: &str) -> Option<Field> {
236        self.name_to_field.get(name).copied()
237    }
238
239    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
240        self.fields.get(field.0 as usize)
241    }
242
243    pub fn get_field_name(&self, field: Field) -> Option<&str> {
244        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
245    }
246
247    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
248        self.fields
249            .iter()
250            .enumerate()
251            .map(|(i, e)| (Field(i as u32), e))
252    }
253
254    pub fn num_fields(&self) -> usize {
255        self.fields.len()
256    }
257
258    /// Get the default fields for query parsing
259    pub fn default_fields(&self) -> &[Field] {
260        &self.default_fields
261    }
262
263    /// Set default fields (used by builder)
264    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
265        self.default_fields = fields;
266    }
267
268    /// Get the query router rules
269    pub fn query_routers(&self) -> &[QueryRouterRule] {
270        &self.query_routers
271    }
272
273    /// Set query router rules
274    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
275        self.query_routers = rules;
276    }
277}
278
279/// Builder for Schema
280#[derive(Debug, Default)]
281pub struct SchemaBuilder {
282    fields: Vec<FieldEntry>,
283    default_fields: Vec<String>,
284    query_routers: Vec<QueryRouterRule>,
285}
286
287impl SchemaBuilder {
288    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
289        self.add_field_with_tokenizer(
290            name,
291            FieldType::Text,
292            indexed,
293            stored,
294            Some("default".to_string()),
295        )
296    }
297
298    pub fn add_text_field_with_tokenizer(
299        &mut self,
300        name: &str,
301        indexed: bool,
302        stored: bool,
303        tokenizer: &str,
304    ) -> Field {
305        self.add_field_with_tokenizer(
306            name,
307            FieldType::Text,
308            indexed,
309            stored,
310            Some(tokenizer.to_string()),
311        )
312    }
313
314    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
315        self.add_field(name, FieldType::U64, indexed, stored)
316    }
317
318    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
319        self.add_field(name, FieldType::I64, indexed, stored)
320    }
321
322    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
323        self.add_field(name, FieldType::F64, indexed, stored)
324    }
325
326    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
327        self.add_field(name, FieldType::Bytes, false, stored)
328    }
329
330    /// Add a JSON field for storing arbitrary JSON data
331    ///
332    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
333    /// (objects, arrays, strings, numbers, booleans, null).
334    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
335        self.add_field(name, FieldType::Json, false, stored)
336    }
337
338    /// Add a sparse vector field with default configuration
339    ///
340    /// Sparse vectors are indexed as inverted posting lists where each dimension
341    /// becomes a "term" and documents have quantized weights for each dimension.
342    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
343        self.add_sparse_vector_field_with_config(
344            name,
345            indexed,
346            stored,
347            crate::structures::SparseVectorConfig::default(),
348        )
349    }
350
351    /// Add a sparse vector field with custom configuration
352    ///
353    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
354    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
355    pub fn add_sparse_vector_field_with_config(
356        &mut self,
357        name: &str,
358        indexed: bool,
359        stored: bool,
360        config: crate::structures::SparseVectorConfig,
361    ) -> Field {
362        let field = Field(self.fields.len() as u32);
363        self.fields.push(FieldEntry {
364            name: name.to_string(),
365            field_type: FieldType::SparseVector,
366            indexed,
367            stored,
368            tokenizer: None,
369            multi: false,
370            positions: None,
371            sparse_vector_config: Some(config),
372            dense_vector_config: None,
373        });
374        field
375    }
376
377    /// Set sparse vector configuration for an existing field
378    pub fn set_sparse_vector_config(
379        &mut self,
380        field: Field,
381        config: crate::structures::SparseVectorConfig,
382    ) {
383        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
384            entry.sparse_vector_config = Some(config);
385        }
386    }
387
388    /// Add a dense vector field with default configuration
389    ///
390    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
391    /// The dimension must be specified as it determines the quantization structure.
392    pub fn add_dense_vector_field(
393        &mut self,
394        name: &str,
395        dim: usize,
396        indexed: bool,
397        stored: bool,
398    ) -> Field {
399        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
400    }
401
402    /// Add a dense vector field with custom configuration
403    pub fn add_dense_vector_field_with_config(
404        &mut self,
405        name: &str,
406        indexed: bool,
407        stored: bool,
408        config: DenseVectorConfig,
409    ) -> Field {
410        let field = Field(self.fields.len() as u32);
411        self.fields.push(FieldEntry {
412            name: name.to_string(),
413            field_type: FieldType::DenseVector,
414            indexed,
415            stored,
416            tokenizer: None,
417            multi: false,
418            positions: None,
419            sparse_vector_config: None,
420            dense_vector_config: Some(config),
421        });
422        field
423    }
424
425    fn add_field(
426        &mut self,
427        name: &str,
428        field_type: FieldType,
429        indexed: bool,
430        stored: bool,
431    ) -> Field {
432        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
433    }
434
435    fn add_field_with_tokenizer(
436        &mut self,
437        name: &str,
438        field_type: FieldType,
439        indexed: bool,
440        stored: bool,
441        tokenizer: Option<String>,
442    ) -> Field {
443        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
444    }
445
446    fn add_field_full(
447        &mut self,
448        name: &str,
449        field_type: FieldType,
450        indexed: bool,
451        stored: bool,
452        tokenizer: Option<String>,
453        multi: bool,
454    ) -> Field {
455        let field = Field(self.fields.len() as u32);
456        self.fields.push(FieldEntry {
457            name: name.to_string(),
458            field_type,
459            indexed,
460            stored,
461            tokenizer,
462            multi,
463            positions: None,
464            sparse_vector_config: None,
465            dense_vector_config: None,
466        });
467        field
468    }
469
470    /// Set the multi attribute on the last added field
471    pub fn set_multi(&mut self, field: Field, multi: bool) {
472        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
473            entry.multi = multi;
474        }
475    }
476
477    /// Set position tracking mode for phrase queries and multi-field element tracking
478    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
479        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
480            entry.positions = Some(mode);
481        }
482    }
483
484    /// Set default fields by name
485    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
486        self.default_fields = field_names;
487    }
488
489    /// Set query router rules
490    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
491        self.query_routers = rules;
492    }
493
494    pub fn build(self) -> Schema {
495        let mut name_to_field = HashMap::new();
496        for (i, entry) in self.fields.iter().enumerate() {
497            name_to_field.insert(entry.name.clone(), Field(i as u32));
498        }
499
500        // Resolve default field names to Field IDs
501        let default_fields: Vec<Field> = self
502            .default_fields
503            .iter()
504            .filter_map(|name| name_to_field.get(name).copied())
505            .collect();
506
507        Schema {
508            fields: self.fields,
509            name_to_field,
510            default_fields,
511            query_routers: self.query_routers,
512        }
513    }
514}
515
516/// Value that can be stored in a field
517#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
518pub enum FieldValue {
519    #[serde(rename = "text")]
520    Text(String),
521    #[serde(rename = "u64")]
522    U64(u64),
523    #[serde(rename = "i64")]
524    I64(i64),
525    #[serde(rename = "f64")]
526    F64(f64),
527    #[serde(rename = "bytes")]
528    Bytes(Vec<u8>),
529    /// Sparse vector: list of (dimension_id, weight) pairs
530    #[serde(rename = "sparse_vector")]
531    SparseVector(Vec<(u32, f32)>),
532    /// Dense vector: float32 values
533    #[serde(rename = "dense_vector")]
534    DenseVector(Vec<f32>),
535    /// Arbitrary JSON value
536    #[serde(rename = "json")]
537    Json(serde_json::Value),
538}
539
540impl FieldValue {
541    pub fn as_text(&self) -> Option<&str> {
542        match self {
543            FieldValue::Text(s) => Some(s),
544            _ => None,
545        }
546    }
547
548    pub fn as_u64(&self) -> Option<u64> {
549        match self {
550            FieldValue::U64(v) => Some(*v),
551            _ => None,
552        }
553    }
554
555    pub fn as_i64(&self) -> Option<i64> {
556        match self {
557            FieldValue::I64(v) => Some(*v),
558            _ => None,
559        }
560    }
561
562    pub fn as_f64(&self) -> Option<f64> {
563        match self {
564            FieldValue::F64(v) => Some(*v),
565            _ => None,
566        }
567    }
568
569    pub fn as_bytes(&self) -> Option<&[u8]> {
570        match self {
571            FieldValue::Bytes(b) => Some(b),
572            _ => None,
573        }
574    }
575
576    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
577        match self {
578            FieldValue::SparseVector(entries) => Some(entries),
579            _ => None,
580        }
581    }
582
583    pub fn as_dense_vector(&self) -> Option<&[f32]> {
584        match self {
585            FieldValue::DenseVector(v) => Some(v),
586            _ => None,
587        }
588    }
589
590    pub fn as_json(&self) -> Option<&serde_json::Value> {
591        match self {
592            FieldValue::Json(v) => Some(v),
593            _ => None,
594        }
595    }
596}
597
598/// A document to be indexed
599#[derive(Debug, Clone, Default, Serialize, Deserialize)]
600pub struct Document {
601    field_values: Vec<(Field, FieldValue)>,
602}
603
604impl Document {
605    pub fn new() -> Self {
606        Self::default()
607    }
608
609    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
610        self.field_values
611            .push((field, FieldValue::Text(value.into())));
612    }
613
614    pub fn add_u64(&mut self, field: Field, value: u64) {
615        self.field_values.push((field, FieldValue::U64(value)));
616    }
617
618    pub fn add_i64(&mut self, field: Field, value: i64) {
619        self.field_values.push((field, FieldValue::I64(value)));
620    }
621
622    pub fn add_f64(&mut self, field: Field, value: f64) {
623        self.field_values.push((field, FieldValue::F64(value)));
624    }
625
626    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
627        self.field_values.push((field, FieldValue::Bytes(value)));
628    }
629
630    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
631        self.field_values
632            .push((field, FieldValue::SparseVector(entries)));
633    }
634
635    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
636        self.field_values
637            .push((field, FieldValue::DenseVector(values)));
638    }
639
640    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
641        self.field_values.push((field, FieldValue::Json(value)));
642    }
643
644    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
645        self.field_values
646            .iter()
647            .find(|(f, _)| *f == field)
648            .map(|(_, v)| v)
649    }
650
651    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
652        self.field_values
653            .iter()
654            .filter(move |(f, _)| *f == field)
655            .map(|(_, v)| v)
656    }
657
658    pub fn field_values(&self) -> &[(Field, FieldValue)] {
659        &self.field_values
660    }
661
662    /// Convert document to a JSON object using field names from schema
663    ///
664    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
665    /// Other fields with multiple values are also returned as arrays.
666    /// Fields with a single value (and not marked multi) are returned as scalar values.
667    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
668        use std::collections::HashMap;
669
670        // Group values by field, keeping track of field entry for multi check
671        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
672            HashMap::new();
673
674        for (field, value) in &self.field_values {
675            if let Some(entry) = schema.get_field_entry(*field) {
676                let json_value = match value {
677                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
678                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
679                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
680                    FieldValue::F64(n) => serde_json::json!(n),
681                    FieldValue::Bytes(b) => {
682                        use base64::Engine;
683                        serde_json::Value::String(
684                            base64::engine::general_purpose::STANDARD.encode(b),
685                        )
686                    }
687                    FieldValue::SparseVector(entries) => {
688                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
689                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
690                        serde_json::json!({
691                            "indices": indices,
692                            "values": values
693                        })
694                    }
695                    FieldValue::DenseVector(values) => {
696                        serde_json::json!(values)
697                    }
698                    FieldValue::Json(v) => v.clone(),
699                };
700                field_values_map
701                    .entry(*field)
702                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
703                    .2
704                    .push(json_value);
705            }
706        }
707
708        // Convert to JSON object, using arrays for multi fields or when multiple values exist
709        let mut map = serde_json::Map::new();
710        for (_field, (name, is_multi, values)) in field_values_map {
711            let json_value = if is_multi || values.len() > 1 {
712                serde_json::Value::Array(values)
713            } else {
714                values.into_iter().next().unwrap()
715            };
716            map.insert(name, json_value);
717        }
718
719        serde_json::Value::Object(map)
720    }
721
722    /// Create a Document from a JSON object using field names from schema
723    ///
724    /// Supports:
725    /// - String values -> Text fields
726    /// - Number values -> U64/I64/F64 fields (based on schema type)
727    /// - Array values -> Multiple values for the same field (multifields)
728    ///
729    /// Unknown fields (not in schema) are silently ignored.
730    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
731        let obj = json.as_object()?;
732        let mut doc = Document::new();
733
734        for (key, value) in obj {
735            if let Some(field) = schema.get_field(key) {
736                let field_entry = schema.get_field_entry(field)?;
737                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
738            }
739        }
740
741        Some(doc)
742    }
743
744    /// Helper to add a JSON value to a document, handling type conversion
745    fn add_json_value(
746        doc: &mut Document,
747        field: Field,
748        field_type: &FieldType,
749        value: &serde_json::Value,
750    ) {
751        match value {
752            serde_json::Value::String(s) => {
753                if matches!(field_type, FieldType::Text) {
754                    doc.add_text(field, s.clone());
755                }
756            }
757            serde_json::Value::Number(n) => {
758                match field_type {
759                    FieldType::I64 => {
760                        if let Some(i) = n.as_i64() {
761                            doc.add_i64(field, i);
762                        }
763                    }
764                    FieldType::U64 => {
765                        if let Some(u) = n.as_u64() {
766                            doc.add_u64(field, u);
767                        } else if let Some(i) = n.as_i64() {
768                            // Allow positive i64 as u64
769                            if i >= 0 {
770                                doc.add_u64(field, i as u64);
771                            }
772                        }
773                    }
774                    FieldType::F64 => {
775                        if let Some(f) = n.as_f64() {
776                            doc.add_f64(field, f);
777                        }
778                    }
779                    _ => {}
780                }
781            }
782            // Handle arrays (multifields) - add each element separately
783            serde_json::Value::Array(arr) => {
784                for item in arr {
785                    Self::add_json_value(doc, field, field_type, item);
786                }
787            }
788            // Handle sparse vector objects
789            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
790                if let (Some(indices_val), Some(values_val)) =
791                    (obj.get("indices"), obj.get("values"))
792                {
793                    let indices: Vec<u32> = indices_val
794                        .as_array()
795                        .map(|arr| {
796                            arr.iter()
797                                .filter_map(|v| v.as_u64().map(|n| n as u32))
798                                .collect()
799                        })
800                        .unwrap_or_default();
801                    let values: Vec<f32> = values_val
802                        .as_array()
803                        .map(|arr| {
804                            arr.iter()
805                                .filter_map(|v| v.as_f64().map(|n| n as f32))
806                                .collect()
807                        })
808                        .unwrap_or_default();
809                    if indices.len() == values.len() {
810                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
811                        doc.add_sparse_vector(field, entries);
812                    }
813                }
814            }
815            // Handle JSON fields - accept any value directly
816            _ if matches!(field_type, FieldType::Json) => {
817                doc.add_json(field, value.clone());
818            }
819            serde_json::Value::Object(_) => {}
820            _ => {}
821        }
822    }
823}
824
825#[cfg(test)]
826mod tests {
827    use super::*;
828
829    #[test]
830    fn test_schema_builder() {
831        let mut builder = Schema::builder();
832        let title = builder.add_text_field("title", true, true);
833        let body = builder.add_text_field("body", true, false);
834        let count = builder.add_u64_field("count", true, true);
835        let schema = builder.build();
836
837        assert_eq!(schema.get_field("title"), Some(title));
838        assert_eq!(schema.get_field("body"), Some(body));
839        assert_eq!(schema.get_field("count"), Some(count));
840        assert_eq!(schema.get_field("nonexistent"), None);
841    }
842
843    #[test]
844    fn test_document() {
845        let mut builder = Schema::builder();
846        let title = builder.add_text_field("title", true, true);
847        let count = builder.add_u64_field("count", true, true);
848        let _schema = builder.build();
849
850        let mut doc = Document::new();
851        doc.add_text(title, "Hello World");
852        doc.add_u64(count, 42);
853
854        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
855        assert_eq!(doc.get_first(count).unwrap().as_u64(), Some(42));
856    }
857
858    #[test]
859    fn test_document_serialization() {
860        let mut builder = Schema::builder();
861        let title = builder.add_text_field("title", true, true);
862        let count = builder.add_u64_field("count", true, true);
863        let _schema = builder.build();
864
865        let mut doc = Document::new();
866        doc.add_text(title, "Hello World");
867        doc.add_u64(count, 42);
868
869        // Serialize
870        let json = serde_json::to_string(&doc).unwrap();
871        println!("Serialized doc: {}", json);
872
873        // Deserialize
874        let doc2: Document = serde_json::from_str(&json).unwrap();
875        assert_eq!(
876            doc2.field_values().len(),
877            2,
878            "Should have 2 field values after deserialization"
879        );
880        assert_eq!(
881            doc2.get_first(title).unwrap().as_text(),
882            Some("Hello World")
883        );
884        assert_eq!(doc2.get_first(count).unwrap().as_u64(), Some(42));
885    }
886
/// Adds two values to the same text field and checks `get_first`, `get_all`
/// and the shape of the `to_json` output (array for the repeated field,
/// plain string for the single-valued one).
#[test]
fn test_multivalue_field() {
    let mut schema_builder = Schema::builder();
    let uris_field = schema_builder.add_text_field("uris", true, true);
    let title_field = schema_builder.add_text_field("title", true, true);
    let schema = schema_builder.build();

    // Same field added twice, plus one single-valued field.
    let mut doc = Document::new();
    doc.add_text(uris_field, "one");
    doc.add_text(uris_field, "two");
    doc.add_text(title_field, "Test Document");

    // get_first yields the value that was added first.
    let first_uri = doc.get_first(uris_field).unwrap();
    assert_eq!(first_uri.as_text(), Some("one"));

    // get_all yields every value, in insertion order.
    let stored_uris: Vec<_> = doc.get_all(uris_field).collect();
    assert_eq!(stored_uris.len(), 2);
    for (value, expected) in stored_uris.iter().zip(["one", "two"].iter()) {
        assert_eq!(value.as_text(), Some(*expected));
    }

    let rendered = doc.to_json(&schema);

    // The repeated field is rendered as a JSON array.
    let rendered_uris = rendered.get("uris").unwrap();
    assert!(rendered_uris.is_array(), "Multi-value field should be an array");
    let uri_items = rendered_uris.as_array().unwrap();
    assert_eq!(uri_items.len(), 2);
    for (item, expected) in uri_items.iter().zip(["one", "two"].iter()) {
        assert_eq!(item.as_str(), Some(*expected));
    }

    // The single-valued field is rendered as a bare string, not an array.
    let rendered_title = rendered.get("title").unwrap();
    assert!(
        rendered_title.is_string(),
        "Single-value field should be a string"
    );
    assert_eq!(rendered_title.as_str(), Some("Test Document"));
}
926
/// Parses a document from JSON in which one field holds an array, then checks
/// the parsed values and that `to_json` reproduces the array.
#[test]
fn test_multivalue_from_json() {
    let mut schema_builder = Schema::builder();
    let uris_field = schema_builder.add_text_field("uris", true, true);
    let title_field = schema_builder.add_text_field("title", true, true);
    let schema = schema_builder.build();

    // Input with an array for "uris" and a scalar for "title".
    let input = serde_json::json!({
        "uris": ["one", "two"],
        "title": "Test Document"
    });

    let parsed = Document::from_json(&input, &schema).unwrap();

    // Each array element becomes its own value on the field.
    let parsed_uris: Vec<_> = parsed.get_all(uris_field).collect();
    assert_eq!(parsed_uris.len(), 2);
    for (value, expected) in parsed_uris.iter().zip(["one", "two"].iter()) {
        assert_eq!(value.as_text(), Some(*expected));
    }

    // The scalar field carries exactly its one value.
    let parsed_title = parsed.get_first(title_field).unwrap().as_text();
    assert_eq!(parsed_title, Some("Test Document"));

    // Round trip: to_json must yield the same two-element array again.
    let regenerated = parsed.to_json(&schema);
    let regenerated_uris = regenerated.get("uris").unwrap().as_array().unwrap();
    assert_eq!(regenerated_uris.len(), 2);
    for (item, expected) in regenerated_uris.iter().zip(["one", "two"].iter()) {
        assert_eq!(item.as_str(), Some(*expected));
    }
}
962
/// A field flagged as `multi` must be rendered by `to_json` as an array even
/// when the document holds a single value for it; an unflagged field with a
/// single value stays a plain string.
#[test]
fn test_multi_attribute_forces_array() {
    let mut schema_builder = Schema::builder();
    let uris_field = schema_builder.add_text_field("uris", true, true);
    schema_builder.set_multi(uris_field, true); // flag "uris" as multi
    let title_field = schema_builder.add_text_field("title", true, true);
    let schema = schema_builder.build();

    // The flag is recorded on "uris" only.
    let uris_is_multi = schema.get_field_entry(uris_field).unwrap().multi;
    let title_is_multi = schema.get_field_entry(title_field).unwrap().multi;
    assert!(uris_is_multi);
    assert!(!title_is_multi);

    // One value per field.
    let mut doc = Document::new();
    doc.add_text(uris_field, "only_one");
    doc.add_text(title_field, "Test Document");

    let rendered = doc.to_json(&schema);

    // The multi field is a one-element array.
    let rendered_uris = rendered.get("uris").unwrap();
    assert!(
        rendered_uris.is_array(),
        "Multi field should be array even with single value"
    );
    let uri_items = rendered_uris.as_array().unwrap();
    assert_eq!(uri_items.len(), 1);
    assert_eq!(uri_items[0].as_str(), Some("only_one"));

    // The non-multi field is a bare string.
    let rendered_title = rendered.get("title").unwrap();
    assert!(
        rendered_title.is_string(),
        "Non-multi single-value field should be a string"
    );
    assert_eq!(rendered_title.as_str(), Some("Test Document"));
}
1002
/// Exercises a sparse vector field end to end: schema registration, the
/// `as_sparse_vector` accessor, and a `to_json` / `from_json` round trip.
#[test]
fn test_sparse_vector_field() {
    let mut builder = Schema::builder();
    let embedding = builder.add_sparse_vector_field("embedding", true, true);
    let title = builder.add_text_field("title", true, true);
    let schema = builder.build();

    // The field is registered under its name with the sparse-vector type.
    assert_eq!(schema.get_field("embedding"), Some(embedding));
    assert_eq!(
        schema.get_field_entry(embedding).unwrap().field_type,
        FieldType::SparseVector
    );

    // Create document with sparse vector
    let mut doc = Document::new();
    doc.add_sparse_vector(embedding, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
    doc.add_text(title, "Test Document");

    // Verify accessor: the entries come back exactly as they were added.
    let entries = doc
        .get_first(embedding)
        .unwrap()
        .as_sparse_vector()
        .unwrap();
    assert_eq!(entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);

    // Verify JSON rendering: an object whose "indices" array has one slot
    // per entry.
    let json = doc.to_json(&schema);
    let embedding_json = json.get("embedding").unwrap();
    assert!(embedding_json.is_object());
    assert_eq!(
        embedding_json
            .get("indices")
            .unwrap()
            .as_array()
            .unwrap()
            .len(),
        3
    );

    // Parse back from JSON
    let doc2 = Document::from_json(&json, &schema).unwrap();
    let entries2 = doc2
        .get_first(embedding)
        .unwrap()
        .as_sparse_vector()
        .unwrap();

    // Pin the entry count first, so that surplus entries produced by the
    // round trip cannot slip past the per-element checks below.
    assert_eq!(entries2.len(), 3);

    // Weights are compared with a tolerance because they passed through JSON.
    assert_eq!(entries2[0].0, 0);
    assert!((entries2[0].1 - 1.0).abs() < 1e-6);
    assert_eq!(entries2[1].0, 5);
    assert!((entries2[1].1 - 2.5).abs() < 1e-6);
    assert_eq!(entries2[2].0, 10);
    assert!((entries2[2].1 - 0.5).abs() < 1e-6);
}
1057
/// Exercises a JSON field: schema flags, the `as_json` accessor, and a
/// `to_json` / `from_json` round trip of a nested object.
#[test]
fn test_json_field() {
    let mut schema_builder = Schema::builder();
    let metadata_field = schema_builder.add_json_field("metadata", true);
    let title_field = schema_builder.add_text_field("title", true, true);
    let schema = schema_builder.build();

    assert_eq!(schema.get_field("metadata"), Some(metadata_field));

    let metadata_entry = schema.get_field_entry(metadata_field).unwrap();
    assert_eq!(metadata_entry.field_type, FieldType::Json);
    // JSON fields are never indexed, but this one is stored.
    assert!(!metadata_entry.indexed);
    assert!(metadata_entry.stored);

    // Object payload with a string, an array and a nested object.
    let payload = serde_json::json!({
        "author": "John Doe",
        "tags": ["rust", "search"],
        "nested": {"key": "value"}
    });
    let mut doc = Document::new();
    doc.add_json(metadata_field, payload.clone());
    doc.add_text(title_field, "Test Document");

    // The accessor hands back the payload unchanged.
    let stored = doc.get_first(metadata_field).unwrap().as_json().unwrap();
    assert_eq!(stored, &payload);
    let author = stored.get("author").unwrap().as_str();
    assert_eq!(author, Some("John Doe"));

    // to_json embeds the payload verbatim under the field name.
    let rendered = doc.to_json(&schema);
    let rendered_metadata = rendered.get("metadata").unwrap();
    assert_eq!(rendered_metadata, &payload);

    // from_json on that output restores the same payload.
    let reparsed = Document::from_json(&rendered, &schema).unwrap();
    let restored = reparsed
        .get_first(metadata_field)
        .unwrap()
        .as_json()
        .unwrap();
    assert_eq!(restored, &payload);
}
1102
/// Stores each kind of non-object JSON value (array, string, number, null,
/// boolean) in a JSON field and checks that `as_json` returns it unchanged.
#[test]
fn test_json_field_various_types() {
    let mut schema_builder = Schema::builder();
    let data_field = schema_builder.add_json_field("data", true);
    let _schema = schema_builder.build();

    // One sample per JSON value kind, checked in this order.
    let samples = [
        serde_json::json!([1, 2, 3, "four", null]), // array
        serde_json::json!("just a string"),         // string
        serde_json::json!(42.5),                    // number
        serde_json::Value::Null,                    // null
        serde_json::json!(true),                    // boolean
    ];

    for sample in &samples {
        // A fresh document per sample, so each check sees only its own value.
        let mut doc = Document::new();
        doc.add_json(data_field, sample.clone());

        let stored = doc.get_first(data_field).unwrap().as_json().unwrap();
        assert_eq!(stored, sample);
    }
}
1145}