Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
/// Field identifier.
///
/// Wraps the zero-based position of a field inside a schema: `SchemaBuilder`
/// returns `Field(n)` for the n-th field it registers, and `Schema` uses the
/// inner value to index its list of `FieldEntry` items.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct Field(pub u32);
9
/// Types of fields supported.
///
/// Each variant is (de)serialized under the lowercase name given in its
/// `#[serde(rename = ...)]` attribute.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum FieldType {
    /// Text field - tokenized and indexed
    #[serde(rename = "text")]
    Text,
    /// Unsigned 64-bit integer
    #[serde(rename = "u64")]
    U64,
    /// Signed 64-bit integer
    #[serde(rename = "i64")]
    I64,
    /// 64-bit floating point
    #[serde(rename = "f64")]
    F64,
    /// Raw bytes (not tokenized); `SchemaBuilder::add_bytes_field` never marks these as indexed
    #[serde(rename = "bytes")]
    Bytes,
    /// Sparse vector field - indexed as inverted posting lists with quantized weights
    #[serde(rename = "sparse_vector")]
    SparseVector,
    /// Dense vector field - indexed for ANN search; the algorithm is selected by
    /// `DenseVectorConfig::index_type` (RaBitQ binary quantization by default)
    #[serde(rename = "dense_vector")]
    DenseVector,
    /// JSON field - arbitrary JSON data, stored but not indexed
    #[serde(rename = "json")]
    Json,
}
38
/// Field options: everything the schema records about a single field.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FieldEntry {
    /// Field name; `Schema::get_field` resolves this name to the field's id
    pub name: String,
    /// Value type held by the field
    pub field_type: FieldType,
    /// Whether the field is indexed (`SchemaBuilder` always passes `false` for bytes and JSON fields)
    pub indexed: bool,
    /// Whether the field is stored; `Document::filter_stored` keeps only fields with this flag set
    pub stored: bool,
    /// Name of the tokenizer to use for this field (for text fields)
    pub tokenizer: Option<String>,
    /// Whether this field can have multiple values (serialized as array in JSON)
    #[serde(default)]
    pub multi: bool,
    /// Position tracking mode for phrase queries and multi-field element tracking
    /// (omitted from the serialized form when `None`)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub positions: Option<PositionMode>,
    /// Configuration for sparse vector fields (index size, weight quantization)
    /// (omitted from the serialized form when `None`)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
    /// Configuration for dense vector fields (dimension, quantization)
    /// (omitted from the serialized form when `None`)
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub dense_vector_config: Option<DenseVectorConfig>,
}
61
/// Position tracking mode for text fields.
///
/// Serialized in snake_case: `ordinal`, `token_position`, `full`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum PositionMode {
    /// Track only element ordinal for multi-valued fields (which array element)
    /// Useful for returning which element matched without full phrase query support
    Ordinal,
    /// Track only token position within text (for phrase queries)
    /// Does not track element ordinal - all positions are relative to concatenated text
    TokenPosition,
    /// Track both element ordinal and token position (full support)
    /// Position format: (element_ordinal << 20) | token_position
    Full,
}
76
77impl PositionMode {
78    /// Whether this mode tracks element ordinals
79    pub fn tracks_ordinal(&self) -> bool {
80        matches!(self, PositionMode::Ordinal | PositionMode::Full)
81    }
82
83    /// Whether this mode tracks token positions
84    pub fn tracks_token_position(&self) -> bool {
85        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
86    }
87}
88
/// Vector index algorithm type.
///
/// `RaBitQ` is the `Default`.
///
/// NOTE: serde's `snake_case` rule inserts `_` before every uppercase letter,
/// so the serialized names are `flat`, `ra_bit_q`, `ivf_ra_bit_q` and `sca_n_n`.
/// Renaming them would break already-serialized schemas.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum VectorIndexType {
    /// Flat - brute-force search over raw vectors (accumulating state)
    Flat,
    /// RaBitQ - binary quantization, good for small datasets (<100K)
    #[default]
    RaBitQ,
    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
    IvfRaBitQ,
    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
    ScaNN,
}
103
/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
///
/// Indexes operate in two states:
/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
///   is below `build_threshold` or before `build_index` is called.
/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
///   Centroids and codebooks are trained from data and stored within the segment.
///
/// Only `dim` is required when deserializing; every other field has a serde default.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DenseVectorConfig {
    /// Dimensionality of vectors
    pub dim: usize,
    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
    /// When in accumulating state, search uses brute-force regardless of this setting.
    /// Defaults to RaBitQ when omitted.
    #[serde(default)]
    pub index_type: VectorIndexType,
    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
    /// If None, automatically determined based on dataset size
    /// (see `optimal_num_clusters`, which also applies a lower bound of 16).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub num_clusters: Option<usize>,
    /// Number of clusters to probe during search (default: 32)
    #[serde(default = "default_nprobe")]
    pub nprobe: usize,
    /// Matryoshka/MRL dimension for index - use only first mrl_dim coordinates for indexing
    /// Full vectors are stored but index uses truncated vectors for faster search
    /// Must be <= dim. If None, uses full dim.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub mrl_dim: Option<usize>,
    /// Minimum number of vectors required before building ANN index.
    /// Below this threshold, brute-force (Flat) search is used.
    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN
    /// (`usize::MAX`, i.e. never, for Flat).
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub build_threshold: Option<usize>,
}
137
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters per search.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
141
142impl DenseVectorConfig {
143    pub fn new(dim: usize) -> Self {
144        Self {
145            dim,
146            index_type: VectorIndexType::RaBitQ,
147            num_clusters: None,
148            nprobe: 32,
149            mrl_dim: None,
150            build_threshold: None,
151        }
152    }
153
154    /// Create IVF-RaBitQ configuration
155    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
156        Self {
157            dim,
158            index_type: VectorIndexType::IvfRaBitQ,
159            num_clusters,
160            nprobe,
161            mrl_dim: None,
162            build_threshold: None,
163        }
164    }
165
166    /// Create ScaNN configuration
167    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
168        Self {
169            dim,
170            index_type: VectorIndexType::ScaNN,
171            num_clusters,
172            nprobe,
173            mrl_dim: None,
174            build_threshold: None,
175        }
176    }
177
178    /// Create Flat (brute-force) configuration - no ANN index
179    pub fn flat(dim: usize) -> Self {
180        Self {
181            dim,
182            index_type: VectorIndexType::Flat,
183            num_clusters: None,
184            nprobe: 0,
185            mrl_dim: None,
186            build_threshold: None,
187        }
188    }
189
190    /// Set matryoshka/MRL dimension for index truncation
191    pub fn with_mrl_dim(mut self, mrl_dim: usize) -> Self {
192        self.mrl_dim = Some(mrl_dim);
193        self
194    }
195
196    /// Set build threshold for auto-building ANN index
197    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
198        self.build_threshold = Some(threshold);
199        self
200    }
201
202    /// Set number of IVF clusters
203    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
204        self.num_clusters = Some(num_clusters);
205        self
206    }
207
208    /// Get the effective dimension for indexing (mrl_dim if set, otherwise dim)
209    pub fn index_dim(&self) -> usize {
210        self.mrl_dim.unwrap_or(self.dim)
211    }
212
213    /// Check if this config uses IVF
214    pub fn uses_ivf(&self) -> bool {
215        matches!(
216            self.index_type,
217            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
218        )
219    }
220
221    /// Check if this config uses ScaNN
222    pub fn uses_scann(&self) -> bool {
223        self.index_type == VectorIndexType::ScaNN
224    }
225
226    /// Check if this config is flat (brute-force)
227    pub fn is_flat(&self) -> bool {
228        self.index_type == VectorIndexType::Flat
229    }
230
231    /// Get the default build threshold for this index type
232    pub fn default_build_threshold(&self) -> usize {
233        self.build_threshold.unwrap_or(match self.index_type {
234            VectorIndexType::Flat => usize::MAX, // Never auto-build
235            VectorIndexType::RaBitQ => 1000,
236            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
237        })
238    }
239
240    /// Calculate optimal number of clusters for given vector count
241    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
242        self.num_clusters.unwrap_or_else(|| {
243            // sqrt(n) heuristic, capped at 4096
244            let optimal = (num_vectors as f64).sqrt() as usize;
245            optimal.clamp(16, 4096)
246        })
247    }
248}
249
250use super::query_field_router::QueryRouterRule;
251
/// Schema defining document structure
///
/// Built through `SchemaBuilder`; a `Field` id is the index of its entry in `fields`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Schema {
    /// Field entries, indexed by `Field` id
    fields: Vec<FieldEntry>,
    /// Lookup from field name to field id
    name_to_field: HashMap<String, Field>,
    /// Default fields for query parsing (when no field is specified)
    #[serde(default)]
    default_fields: Vec<Field>,
    /// Query router rules for routing queries to specific fields based on regex patterns
    #[serde(default)]
    query_routers: Vec<QueryRouterRule>,
}
264
265impl Schema {
266    pub fn builder() -> SchemaBuilder {
267        SchemaBuilder::default()
268    }
269
270    pub fn get_field(&self, name: &str) -> Option<Field> {
271        self.name_to_field.get(name).copied()
272    }
273
274    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
275        self.fields.get(field.0 as usize)
276    }
277
278    pub fn get_field_name(&self, field: Field) -> Option<&str> {
279        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
280    }
281
282    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
283        self.fields
284            .iter()
285            .enumerate()
286            .map(|(i, e)| (Field(i as u32), e))
287    }
288
289    pub fn num_fields(&self) -> usize {
290        self.fields.len()
291    }
292
293    /// Get the default fields for query parsing
294    pub fn default_fields(&self) -> &[Field] {
295        &self.default_fields
296    }
297
298    /// Set default fields (used by builder)
299    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
300        self.default_fields = fields;
301    }
302
303    /// Get the query router rules
304    pub fn query_routers(&self) -> &[QueryRouterRule] {
305        &self.query_routers
306    }
307
308    /// Set query router rules
309    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
310        self.query_routers = rules;
311    }
312}
313
/// Builder for Schema
///
/// Fields receive ids in the order they are added; `build` turns the collected
/// state into a `Schema`.
#[derive(Debug, Default)]
pub struct SchemaBuilder {
    /// Field entries in insertion order (the position is the field id)
    fields: Vec<FieldEntry>,
    /// Names of the default query fields; resolved to ids in `build`
    default_fields: Vec<String>,
    /// Query router rules handed to the schema unchanged
    query_routers: Vec<QueryRouterRule>,
}
321
322impl SchemaBuilder {
323    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
324        self.add_field_with_tokenizer(
325            name,
326            FieldType::Text,
327            indexed,
328            stored,
329            Some("default".to_string()),
330        )
331    }
332
333    pub fn add_text_field_with_tokenizer(
334        &mut self,
335        name: &str,
336        indexed: bool,
337        stored: bool,
338        tokenizer: &str,
339    ) -> Field {
340        self.add_field_with_tokenizer(
341            name,
342            FieldType::Text,
343            indexed,
344            stored,
345            Some(tokenizer.to_string()),
346        )
347    }
348
349    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
350        self.add_field(name, FieldType::U64, indexed, stored)
351    }
352
353    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
354        self.add_field(name, FieldType::I64, indexed, stored)
355    }
356
357    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
358        self.add_field(name, FieldType::F64, indexed, stored)
359    }
360
361    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
362        self.add_field(name, FieldType::Bytes, false, stored)
363    }
364
365    /// Add a JSON field for storing arbitrary JSON data
366    ///
367    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
368    /// (objects, arrays, strings, numbers, booleans, null).
369    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
370        self.add_field(name, FieldType::Json, false, stored)
371    }
372
373    /// Add a sparse vector field with default configuration
374    ///
375    /// Sparse vectors are indexed as inverted posting lists where each dimension
376    /// becomes a "term" and documents have quantized weights for each dimension.
377    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
378        self.add_sparse_vector_field_with_config(
379            name,
380            indexed,
381            stored,
382            crate::structures::SparseVectorConfig::default(),
383        )
384    }
385
386    /// Add a sparse vector field with custom configuration
387    ///
388    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
389    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
390    pub fn add_sparse_vector_field_with_config(
391        &mut self,
392        name: &str,
393        indexed: bool,
394        stored: bool,
395        config: crate::structures::SparseVectorConfig,
396    ) -> Field {
397        let field = Field(self.fields.len() as u32);
398        self.fields.push(FieldEntry {
399            name: name.to_string(),
400            field_type: FieldType::SparseVector,
401            indexed,
402            stored,
403            tokenizer: None,
404            multi: false,
405            positions: None,
406            sparse_vector_config: Some(config),
407            dense_vector_config: None,
408        });
409        field
410    }
411
412    /// Set sparse vector configuration for an existing field
413    pub fn set_sparse_vector_config(
414        &mut self,
415        field: Field,
416        config: crate::structures::SparseVectorConfig,
417    ) {
418        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
419            entry.sparse_vector_config = Some(config);
420        }
421    }
422
423    /// Add a dense vector field with default configuration
424    ///
425    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
426    /// The dimension must be specified as it determines the quantization structure.
427    pub fn add_dense_vector_field(
428        &mut self,
429        name: &str,
430        dim: usize,
431        indexed: bool,
432        stored: bool,
433    ) -> Field {
434        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
435    }
436
437    /// Add a dense vector field with custom configuration
438    pub fn add_dense_vector_field_with_config(
439        &mut self,
440        name: &str,
441        indexed: bool,
442        stored: bool,
443        config: DenseVectorConfig,
444    ) -> Field {
445        let field = Field(self.fields.len() as u32);
446        self.fields.push(FieldEntry {
447            name: name.to_string(),
448            field_type: FieldType::DenseVector,
449            indexed,
450            stored,
451            tokenizer: None,
452            multi: false,
453            positions: None,
454            sparse_vector_config: None,
455            dense_vector_config: Some(config),
456        });
457        field
458    }
459
460    fn add_field(
461        &mut self,
462        name: &str,
463        field_type: FieldType,
464        indexed: bool,
465        stored: bool,
466    ) -> Field {
467        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
468    }
469
470    fn add_field_with_tokenizer(
471        &mut self,
472        name: &str,
473        field_type: FieldType,
474        indexed: bool,
475        stored: bool,
476        tokenizer: Option<String>,
477    ) -> Field {
478        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
479    }
480
481    fn add_field_full(
482        &mut self,
483        name: &str,
484        field_type: FieldType,
485        indexed: bool,
486        stored: bool,
487        tokenizer: Option<String>,
488        multi: bool,
489    ) -> Field {
490        let field = Field(self.fields.len() as u32);
491        self.fields.push(FieldEntry {
492            name: name.to_string(),
493            field_type,
494            indexed,
495            stored,
496            tokenizer,
497            multi,
498            positions: None,
499            sparse_vector_config: None,
500            dense_vector_config: None,
501        });
502        field
503    }
504
505    /// Set the multi attribute on the last added field
506    pub fn set_multi(&mut self, field: Field, multi: bool) {
507        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
508            entry.multi = multi;
509        }
510    }
511
512    /// Set position tracking mode for phrase queries and multi-field element tracking
513    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
514        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
515            entry.positions = Some(mode);
516        }
517    }
518
519    /// Set default fields by name
520    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
521        self.default_fields = field_names;
522    }
523
524    /// Set query router rules
525    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
526        self.query_routers = rules;
527    }
528
529    pub fn build(self) -> Schema {
530        let mut name_to_field = HashMap::new();
531        for (i, entry) in self.fields.iter().enumerate() {
532            name_to_field.insert(entry.name.clone(), Field(i as u32));
533        }
534
535        // Resolve default field names to Field IDs
536        let default_fields: Vec<Field> = self
537            .default_fields
538            .iter()
539            .filter_map(|name| name_to_field.get(name).copied())
540            .collect();
541
542        Schema {
543            fields: self.fields,
544            name_to_field,
545            default_fields,
546            query_routers: self.query_routers,
547        }
548    }
549}
550
/// Value that can be stored in a field
///
/// One variant per `FieldType`; each variant is (de)serialized under the
/// lowercase name given in its `#[serde(rename = ...)]` attribute.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum FieldValue {
    /// UTF-8 text
    #[serde(rename = "text")]
    Text(String),
    /// Unsigned 64-bit integer
    #[serde(rename = "u64")]
    U64(u64),
    /// Signed 64-bit integer
    #[serde(rename = "i64")]
    I64(i64),
    /// 64-bit floating point number
    #[serde(rename = "f64")]
    F64(f64),
    /// Raw bytes
    #[serde(rename = "bytes")]
    Bytes(Vec<u8>),
    /// Sparse vector: list of (dimension_id, weight) pairs
    #[serde(rename = "sparse_vector")]
    SparseVector(Vec<(u32, f32)>),
    /// Dense vector: float32 values
    #[serde(rename = "dense_vector")]
    DenseVector(Vec<f32>),
    /// Arbitrary JSON value
    #[serde(rename = "json")]
    Json(serde_json::Value),
}
574
575impl FieldValue {
576    pub fn as_text(&self) -> Option<&str> {
577        match self {
578            FieldValue::Text(s) => Some(s),
579            _ => None,
580        }
581    }
582
583    pub fn as_u64(&self) -> Option<u64> {
584        match self {
585            FieldValue::U64(v) => Some(*v),
586            _ => None,
587        }
588    }
589
590    pub fn as_i64(&self) -> Option<i64> {
591        match self {
592            FieldValue::I64(v) => Some(*v),
593            _ => None,
594        }
595    }
596
597    pub fn as_f64(&self) -> Option<f64> {
598        match self {
599            FieldValue::F64(v) => Some(*v),
600            _ => None,
601        }
602    }
603
604    pub fn as_bytes(&self) -> Option<&[u8]> {
605        match self {
606            FieldValue::Bytes(b) => Some(b),
607            _ => None,
608        }
609    }
610
611    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
612        match self {
613            FieldValue::SparseVector(entries) => Some(entries),
614            _ => None,
615        }
616    }
617
618    pub fn as_dense_vector(&self) -> Option<&[f32]> {
619        match self {
620            FieldValue::DenseVector(v) => Some(v),
621            _ => None,
622        }
623    }
624
625    pub fn as_json(&self) -> Option<&serde_json::Value> {
626        match self {
627            FieldValue::Json(v) => Some(v),
628            _ => None,
629        }
630    }
631}
632
/// A document to be indexed
///
/// Holds `(field, value)` pairs in insertion order. The same field may appear
/// more than once (see `get_all`), which is how multi-valued fields are represented.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct Document {
    /// All values of the document, in the order they were added
    field_values: Vec<(Field, FieldValue)>,
}
638
639impl Document {
640    pub fn new() -> Self {
641        Self::default()
642    }
643
644    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
645        self.field_values
646            .push((field, FieldValue::Text(value.into())));
647    }
648
649    pub fn add_u64(&mut self, field: Field, value: u64) {
650        self.field_values.push((field, FieldValue::U64(value)));
651    }
652
653    pub fn add_i64(&mut self, field: Field, value: i64) {
654        self.field_values.push((field, FieldValue::I64(value)));
655    }
656
657    pub fn add_f64(&mut self, field: Field, value: f64) {
658        self.field_values.push((field, FieldValue::F64(value)));
659    }
660
661    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
662        self.field_values.push((field, FieldValue::Bytes(value)));
663    }
664
665    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
666        self.field_values
667            .push((field, FieldValue::SparseVector(entries)));
668    }
669
670    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
671        self.field_values
672            .push((field, FieldValue::DenseVector(values)));
673    }
674
675    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
676        self.field_values.push((field, FieldValue::Json(value)));
677    }
678
679    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
680        self.field_values
681            .iter()
682            .find(|(f, _)| *f == field)
683            .map(|(_, v)| v)
684    }
685
686    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
687        self.field_values
688            .iter()
689            .filter(move |(f, _)| *f == field)
690            .map(|(_, v)| v)
691    }
692
693    pub fn field_values(&self) -> &[(Field, FieldValue)] {
694        &self.field_values
695    }
696
697    /// Return a new Document containing only fields marked as `stored` in the schema
698    pub fn filter_stored(&self, schema: &Schema) -> Document {
699        Document {
700            field_values: self
701                .field_values
702                .iter()
703                .filter(|(field, _)| {
704                    schema
705                        .get_field_entry(*field)
706                        .is_some_and(|entry| entry.stored)
707                })
708                .cloned()
709                .collect(),
710        }
711    }
712
713    /// Convert document to a JSON object using field names from schema
714    ///
715    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
716    /// Other fields with multiple values are also returned as arrays.
717    /// Fields with a single value (and not marked multi) are returned as scalar values.
718    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
719        use std::collections::HashMap;
720
721        // Group values by field, keeping track of field entry for multi check
722        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
723            HashMap::new();
724
725        for (field, value) in &self.field_values {
726            if let Some(entry) = schema.get_field_entry(*field) {
727                let json_value = match value {
728                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
729                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
730                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
731                    FieldValue::F64(n) => serde_json::json!(n),
732                    FieldValue::Bytes(b) => {
733                        use base64::Engine;
734                        serde_json::Value::String(
735                            base64::engine::general_purpose::STANDARD.encode(b),
736                        )
737                    }
738                    FieldValue::SparseVector(entries) => {
739                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
740                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
741                        serde_json::json!({
742                            "indices": indices,
743                            "values": values
744                        })
745                    }
746                    FieldValue::DenseVector(values) => {
747                        serde_json::json!(values)
748                    }
749                    FieldValue::Json(v) => v.clone(),
750                };
751                field_values_map
752                    .entry(*field)
753                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
754                    .2
755                    .push(json_value);
756            }
757        }
758
759        // Convert to JSON object, using arrays for multi fields or when multiple values exist
760        let mut map = serde_json::Map::new();
761        for (_field, (name, is_multi, values)) in field_values_map {
762            let json_value = if is_multi || values.len() > 1 {
763                serde_json::Value::Array(values)
764            } else {
765                values.into_iter().next().unwrap()
766            };
767            map.insert(name, json_value);
768        }
769
770        serde_json::Value::Object(map)
771    }
772
773    /// Create a Document from a JSON object using field names from schema
774    ///
775    /// Supports:
776    /// - String values -> Text fields
777    /// - Number values -> U64/I64/F64 fields (based on schema type)
778    /// - Array values -> Multiple values for the same field (multifields)
779    ///
780    /// Unknown fields (not in schema) are silently ignored.
781    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
782        let obj = json.as_object()?;
783        let mut doc = Document::new();
784
785        for (key, value) in obj {
786            if let Some(field) = schema.get_field(key) {
787                let field_entry = schema.get_field_entry(field)?;
788                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
789            }
790        }
791
792        Some(doc)
793    }
794
795    /// Helper to add a JSON value to a document, handling type conversion
796    fn add_json_value(
797        doc: &mut Document,
798        field: Field,
799        field_type: &FieldType,
800        value: &serde_json::Value,
801    ) {
802        match value {
803            serde_json::Value::String(s) => {
804                if matches!(field_type, FieldType::Text) {
805                    doc.add_text(field, s.clone());
806                }
807            }
808            serde_json::Value::Number(n) => {
809                match field_type {
810                    FieldType::I64 => {
811                        if let Some(i) = n.as_i64() {
812                            doc.add_i64(field, i);
813                        }
814                    }
815                    FieldType::U64 => {
816                        if let Some(u) = n.as_u64() {
817                            doc.add_u64(field, u);
818                        } else if let Some(i) = n.as_i64() {
819                            // Allow positive i64 as u64
820                            if i >= 0 {
821                                doc.add_u64(field, i as u64);
822                            }
823                        }
824                    }
825                    FieldType::F64 => {
826                        if let Some(f) = n.as_f64() {
827                            doc.add_f64(field, f);
828                        }
829                    }
830                    _ => {}
831                }
832            }
833            // Handle arrays (multifields) - add each element separately
834            serde_json::Value::Array(arr) => {
835                for item in arr {
836                    Self::add_json_value(doc, field, field_type, item);
837                }
838            }
839            // Handle sparse vector objects
840            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
841                if let (Some(indices_val), Some(values_val)) =
842                    (obj.get("indices"), obj.get("values"))
843                {
844                    let indices: Vec<u32> = indices_val
845                        .as_array()
846                        .map(|arr| {
847                            arr.iter()
848                                .filter_map(|v| v.as_u64().map(|n| n as u32))
849                                .collect()
850                        })
851                        .unwrap_or_default();
852                    let values: Vec<f32> = values_val
853                        .as_array()
854                        .map(|arr| {
855                            arr.iter()
856                                .filter_map(|v| v.as_f64().map(|n| n as f32))
857                                .collect()
858                        })
859                        .unwrap_or_default();
860                    if indices.len() == values.len() {
861                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
862                        doc.add_sparse_vector(field, entries);
863                    }
864                }
865            }
866            // Handle JSON fields - accept any value directly
867            _ if matches!(field_type, FieldType::Json) => {
868                doc.add_json(field, value.clone());
869            }
870            serde_json::Value::Object(_) => {}
871            _ => {}
872        }
873    }
874}
875
876#[cfg(test)]
877mod tests {
878    use super::*;
879
/// Every field registered on the builder must be resolvable by name on the
/// built schema, and an unregistered name must resolve to nothing.
#[test]
fn test_schema_builder() {
    let mut sb = Schema::builder();
    let title_field = sb.add_text_field("title", true, true);
    let body_field = sb.add_text_field("body", true, false);
    let count_field = sb.add_u64_field("count", true, true);
    let built = sb.build();

    // Each registered name maps back to the handle returned at registration.
    let registered = [
        ("title", title_field),
        ("body", body_field),
        ("count", count_field),
    ];
    for (name, handle) in registered {
        assert_eq!(built.get_field(name), Some(handle));
    }

    // A name that was never added has no handle.
    assert_eq!(built.get_field("nonexistent"), None);
}
893
/// Values added to a document are readable through `get_first` and the
/// matching typed accessor.
#[test]
fn test_document() {
    let mut sb = Schema::builder();
    let title = sb.add_text_field("title", true, true);
    let count = sb.add_u64_field("count", true, true);
    // The schema itself is not consulted below; only the field handles are.
    let _schema = sb.build();

    let mut record = Document::new();
    record.add_text(title, "Hello World");
    record.add_u64(count, 42);

    let title_value = record.get_first(title).unwrap();
    assert_eq!(title_value.as_text(), Some("Hello World"));

    let count_value = record.get_first(count).unwrap();
    assert_eq!(count_value.as_u64(), Some(42));
}
908
/// A document survives a serde JSON encode/decode cycle with both of its
/// field values intact.
#[test]
fn test_document_serialization() {
    let mut sb = Schema::builder();
    let title = sb.add_text_field("title", true, true);
    let count = sb.add_u64_field("count", true, true);
    let _schema = sb.build();

    let mut original = Document::new();
    original.add_text(title, "Hello World");
    original.add_u64(count, 42);

    // Encode to a JSON string.
    let encoded = serde_json::to_string(&original).unwrap();
    println!("Serialized doc: {}", encoded);

    // Decode and compare against what was put in.
    let decoded: Document = serde_json::from_str(&encoded).unwrap();
    let value_count = decoded.field_values().len();
    assert_eq!(
        value_count, 2,
        "Should have 2 field values after deserialization"
    );

    let decoded_title = decoded.get_first(title).unwrap();
    assert_eq!(decoded_title.as_text(), Some("Hello World"));

    let decoded_count = decoded.get_first(count).unwrap();
    assert_eq!(decoded_count.as_u64(), Some(42));
}
937
/// Several values added under one field are all retained in insertion order,
/// and `to_json` renders that field as an array while a field holding a
/// single value is rendered as a plain string.
#[test]
fn test_multivalue_field() {
    let mut sb = Schema::builder();
    let uris = sb.add_text_field("uris", true, true);
    let title = sb.add_text_field("title", true, true);
    let schema = sb.build();

    // Two values under `uris`, one under `title`.
    let mut doc = Document::new();
    for uri in ["one", "two"] {
        doc.add_text(uris, uri);
    }
    doc.add_text(title, "Test Document");

    // `get_first` yields the value that was added first.
    let first_uri = doc.get_first(uris).unwrap();
    assert_eq!(first_uri.as_text(), Some("one"));

    // `get_all` yields every value, in the order they were added.
    let collected: Vec<_> = doc.get_all(uris).collect();
    let texts: Vec<_> = collected.iter().map(|v| v.as_text()).collect();
    assert_eq!(texts, [Some("one"), Some("two")]);

    // The multi-valued field is rendered as a JSON array.
    let rendered = doc.to_json(&schema);
    let uris_json = rendered.get("uris").unwrap();
    assert!(uris_json.is_array(), "Multi-value field should be an array");
    let rendered_uris: Vec<_> = uris_json
        .as_array()
        .unwrap()
        .iter()
        .map(|v| v.as_str())
        .collect();
    assert_eq!(rendered_uris, [Some("one"), Some("two")]);

    // The single-valued field is rendered as a bare string, not an array.
    let title_json = rendered.get("title").unwrap();
    assert!(
        title_json.is_string(),
        "Single-value field should be a string"
    );
    assert_eq!(title_json.as_str(), Some("Test Document"));
}
977
/// A JSON array under a field name is parsed into one document value per
/// element, and serializing the document again reproduces the array.
#[test]
fn test_multivalue_from_json() {
    let mut sb = Schema::builder();
    let uris = sb.add_text_field("uris", true, true);
    let title = sb.add_text_field("title", true, true);
    let schema = sb.build();

    // Input: an array for `uris`, a scalar for `title`.
    let input = serde_json::json!({
        "uris": ["one", "two"],
        "title": "Test Document"
    });

    let doc = Document::from_json(&input, &schema).unwrap();

    // Both array elements became values of `uris`, in order.
    let collected: Vec<_> = doc.get_all(uris).collect();
    let texts: Vec<_> = collected.iter().map(|v| v.as_text()).collect();
    assert_eq!(texts, [Some("one"), Some("two")]);

    // The scalar became the single value of `title`.
    let title_value = doc.get_first(title).unwrap();
    assert_eq!(title_value.as_text(), Some("Test Document"));

    // Roundtrip: `to_json` yields the same two-element array again.
    let output = doc.to_json(&schema);
    let uris_out: Vec<_> = output
        .get("uris")
        .unwrap()
        .as_array()
        .unwrap()
        .iter()
        .map(|v| v.as_str())
        .collect();
    assert_eq!(uris_out, [Some("one"), Some("two")]);
}
1013
/// A field flagged `multi` on the schema is serialized as an array even when
/// the document holds exactly one value for it; an unflagged field with one
/// value stays a plain string.
#[test]
fn test_multi_attribute_forces_array() {
    let mut sb = Schema::builder();
    let uris = sb.add_text_field("uris", true, true);
    sb.set_multi(uris, true); // flag `uris` as multi-valued
    let title = sb.add_text_field("title", true, true);
    let schema = sb.build();

    // The flag is recorded on `uris` only.
    let uris_entry = schema.get_field_entry(uris).unwrap();
    assert!(uris_entry.multi);
    let title_entry = schema.get_field_entry(title).unwrap();
    assert!(!title_entry.multi);

    // One value each for the multi field and the regular field.
    let mut doc = Document::new();
    doc.add_text(uris, "only_one");
    doc.add_text(title, "Test Document");

    let rendered = doc.to_json(&schema);

    // The multi field is an array of length one.
    let uris_json = rendered.get("uris").unwrap();
    assert!(
        uris_json.is_array(),
        "Multi field should be array even with single value"
    );
    let rendered_uris: Vec<_> = uris_json
        .as_array()
        .unwrap()
        .iter()
        .map(|v| v.as_str())
        .collect();
    assert_eq!(rendered_uris, [Some("only_one")]);

    // The regular field is a bare string.
    let title_json = rendered.get("title").unwrap();
    assert!(
        title_json.is_string(),
        "Non-multi single-value field should be a string"
    );
    assert_eq!(title_json.as_str(), Some("Test Document"));
}
1053
/// Sparse-vector fields: schema registration, the typed accessor, and a full
/// `to_json` / `from_json` roundtrip.
///
/// The roundtrip checks pin the exact number of entries on both the serialized
/// and the re-parsed side, so a dropped, duplicated, or extra entry cannot
/// slip through unnoticed.
#[test]
fn test_sparse_vector_field() {
    let mut builder = Schema::builder();
    let embedding = builder.add_sparse_vector_field("embedding", true, true);
    let title = builder.add_text_field("title", true, true);
    let schema = builder.build();

    assert_eq!(schema.get_field("embedding"), Some(embedding));
    assert_eq!(
        schema.get_field_entry(embedding).unwrap().field_type,
        FieldType::SparseVector
    );

    // Single source of truth for the (index, weight) pairs used throughout.
    let expected: [(u32, f32); 3] = [(0, 1.0), (5, 2.5), (10, 0.5)];

    // Create document with sparse vector
    let mut doc = Document::new();
    doc.add_sparse_vector(embedding, expected.to_vec());
    doc.add_text(title, "Test Document");

    // Verify accessor
    let entries = doc
        .get_first(embedding)
        .unwrap()
        .as_sparse_vector()
        .unwrap();
    assert_eq!(entries, &expected);

    // Verify JSON shape: an object whose parallel arrays each hold one
    // element per entry.
    let json = doc.to_json(&schema);
    let embedding_json = json.get("embedding").unwrap();
    assert!(embedding_json.is_object());
    for key in ["indices", "values"] {
        let len = embedding_json
            .get(key)
            .unwrap()
            .as_array()
            .unwrap()
            .len();
        assert_eq!(
            len,
            expected.len(),
            "`{}` should hold one element per entry",
            key
        );
    }

    // Parse back from JSON
    let doc2 = Document::from_json(&json, &schema).unwrap();
    let entries2 = doc2
        .get_first(embedding)
        .unwrap()
        .as_sparse_vector()
        .unwrap();

    // The entry count must match exactly; zipping alone would silently
    // ignore surplus entries.
    assert_eq!(entries2.len(), expected.len());
    for (got, want) in entries2.iter().zip(expected.iter()) {
        assert_eq!(got.0, want.0);
        // Weights pass through a float conversion, so compare with tolerance.
        assert!((got.1 - want.1).abs() < 1e-6);
    }
}
1108
/// JSON fields: registered as stored-but-not-indexed, hold an arbitrary JSON
/// value verbatim, and roundtrip unchanged through `to_json` / `from_json`.
#[test]
fn test_json_field() {
    let mut sb = Schema::builder();
    let metadata = sb.add_json_field("metadata", true);
    let title = sb.add_text_field("title", true, true);
    let schema = sb.build();

    assert_eq!(schema.get_field("metadata"), Some(metadata));

    // The entry is typed as JSON, is stored, and is never indexed.
    let entry = schema.get_field_entry(metadata).unwrap();
    assert_eq!(entry.field_type, FieldType::Json);
    assert!(!entry.indexed);
    assert!(entry.stored);

    // An object payload with a string, an array, and a nested object.
    let payload = serde_json::json!({
        "author": "John Doe",
        "tags": ["rust", "search"],
        "nested": {"key": "value"}
    });
    let mut doc = Document::new();
    doc.add_json(metadata, payload.clone());
    doc.add_text(title, "Test Document");

    // The accessor hands back the payload unchanged.
    let held = doc.get_first(metadata).unwrap().as_json().unwrap();
    assert_eq!(held, &payload);
    assert_eq!(
        held.get("author").and_then(|author| author.as_str()),
        Some("John Doe")
    );

    // Serializing the document emits the payload as-is under the field name.
    let rendered = doc.to_json(&schema);
    let rendered_metadata = rendered.get("metadata").unwrap();
    assert_eq!(rendered_metadata, &payload);

    // Parsing the serialized form yields the same payload again.
    let reparsed = Document::from_json(&rendered, &schema).unwrap();
    let reparsed_payload = reparsed.get_first(metadata).unwrap().as_json().unwrap();
    assert_eq!(reparsed_payload, &payload);
}
1153
/// A JSON field accepts every kind of JSON value, not just objects: each
/// sample is stored in a fresh document and must be read back unchanged.
#[test]
fn test_json_field_various_types() {
    let mut sb = Schema::builder();
    let data = sb.add_json_field("data", true);
    let _schema = sb.build();

    // One sample per kind: array, string, number, null, boolean.
    let samples = [
        serde_json::json!([1, 2, 3, "four", null]),
        serde_json::json!("just a string"),
        serde_json::json!(42.5),
        serde_json::Value::Null,
        serde_json::json!(true),
    ];

    for sample in &samples {
        let mut doc = Document::new();
        doc.add_json(data, sample.clone());
        let held = doc.get_first(data).unwrap().as_json().unwrap();
        assert_eq!(held, sample);
    }
}
1196}