Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60}
61
62/// Position tracking mode for text fields
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum PositionMode {
66    /// Track only element ordinal for multi-valued fields (which array element)
67    /// Useful for returning which element matched without full phrase query support
68    Ordinal,
69    /// Track only token position within text (for phrase queries)
70    /// Does not track element ordinal - all positions are relative to concatenated text
71    TokenPosition,
72    /// Track both element ordinal and token position (full support)
73    /// Position format: (element_ordinal << 20) | token_position
74    Full,
75}
76
77impl PositionMode {
78    /// Whether this mode tracks element ordinals
79    pub fn tracks_ordinal(&self) -> bool {
80        matches!(self, PositionMode::Ordinal | PositionMode::Full)
81    }
82
83    /// Whether this mode tracks token positions
84    pub fn tracks_token_position(&self) -> bool {
85        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
86    }
87}
88
89/// Vector index algorithm type
90#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
91#[serde(rename_all = "snake_case")]
92pub enum VectorIndexType {
93    /// Flat - brute-force search over raw vectors (accumulating state)
94    Flat,
95    /// RaBitQ - binary quantization, good for small datasets (<100K)
96    #[default]
97    RaBitQ,
98    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
99    IvfRaBitQ,
100    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
101    ScaNN,
102}
103
104/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
105///
106/// Indexes operate in two states:
107/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
108///   is below `build_threshold` or before `build_index` is called.
109/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
110///   Centroids and codebooks are trained from data and stored within the segment.
111#[derive(Debug, Clone, Serialize, Deserialize)]
112pub struct DenseVectorConfig {
113    /// Dimensionality of vectors
114    pub dim: usize,
115    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
116    /// When in accumulating state, search uses brute-force regardless of this setting.
117    #[serde(default)]
118    pub index_type: VectorIndexType,
119    /// Whether to store raw vectors for re-ranking (increases storage but improves accuracy)
120    #[serde(default = "default_store_raw")]
121    pub store_raw: bool,
122    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
123    /// If None, automatically determined based on dataset size.
124    #[serde(default, skip_serializing_if = "Option::is_none")]
125    pub num_clusters: Option<usize>,
126    /// Number of clusters to probe during search (default: 32)
127    #[serde(default = "default_nprobe")]
128    pub nprobe: usize,
129    /// Matryoshka/MRL dimension for index - use only first mrl_dim coordinates for indexing
130    /// Full vectors are stored but index uses truncated vectors for faster search
131    /// Must be <= dim. If None, uses full dim.
132    #[serde(default, skip_serializing_if = "Option::is_none")]
133    pub mrl_dim: Option<usize>,
134    /// Minimum number of vectors required before building ANN index.
135    /// Below this threshold, brute-force (Flat) search is used.
136    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
137    #[serde(default, skip_serializing_if = "Option::is_none")]
138    pub build_threshold: Option<usize>,
139}
140
/// Serde default for `DenseVectorConfig::store_raw`: raw vectors are kept unless disabled.
fn default_store_raw() -> bool {
    const STORE_RAW_BY_DEFAULT: bool = true;
    STORE_RAW_BY_DEFAULT
}
144
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters per search.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
148
149impl DenseVectorConfig {
150    pub fn new(dim: usize) -> Self {
151        Self {
152            dim,
153            index_type: VectorIndexType::RaBitQ,
154            store_raw: true,
155            num_clusters: None,
156            nprobe: 32,
157            mrl_dim: None,
158            build_threshold: None,
159        }
160    }
161
162    /// Create IVF-RaBitQ configuration
163    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
164        Self {
165            dim,
166            index_type: VectorIndexType::IvfRaBitQ,
167            store_raw: true,
168            num_clusters,
169            nprobe,
170            mrl_dim: None,
171            build_threshold: None,
172        }
173    }
174
175    /// Create ScaNN configuration
176    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
177        Self {
178            dim,
179            index_type: VectorIndexType::ScaNN,
180            store_raw: true,
181            num_clusters,
182            nprobe,
183            mrl_dim: None,
184            build_threshold: None,
185        }
186    }
187
188    /// Create Flat (brute-force) configuration - no ANN index
189    pub fn flat(dim: usize) -> Self {
190        Self {
191            dim,
192            index_type: VectorIndexType::Flat,
193            store_raw: true,
194            num_clusters: None,
195            nprobe: 0,
196            mrl_dim: None,
197            build_threshold: None,
198        }
199    }
200
201    pub fn without_raw(dim: usize) -> Self {
202        Self {
203            dim,
204            index_type: VectorIndexType::RaBitQ,
205            store_raw: false,
206            num_clusters: None,
207            nprobe: 32,
208            mrl_dim: None,
209            build_threshold: None,
210        }
211    }
212
213    /// Set matryoshka/MRL dimension for index truncation
214    pub fn with_mrl_dim(mut self, mrl_dim: usize) -> Self {
215        self.mrl_dim = Some(mrl_dim);
216        self
217    }
218
219    /// Set build threshold for auto-building ANN index
220    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
221        self.build_threshold = Some(threshold);
222        self
223    }
224
225    /// Set number of IVF clusters
226    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
227        self.num_clusters = Some(num_clusters);
228        self
229    }
230
231    /// Get the effective dimension for indexing (mrl_dim if set, otherwise dim)
232    pub fn index_dim(&self) -> usize {
233        self.mrl_dim.unwrap_or(self.dim)
234    }
235
236    /// Check if this config uses IVF
237    pub fn uses_ivf(&self) -> bool {
238        matches!(
239            self.index_type,
240            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
241        )
242    }
243
244    /// Check if this config uses ScaNN
245    pub fn uses_scann(&self) -> bool {
246        self.index_type == VectorIndexType::ScaNN
247    }
248
249    /// Check if this config is flat (brute-force)
250    pub fn is_flat(&self) -> bool {
251        self.index_type == VectorIndexType::Flat
252    }
253
254    /// Get the default build threshold for this index type
255    pub fn default_build_threshold(&self) -> usize {
256        self.build_threshold.unwrap_or(match self.index_type {
257            VectorIndexType::Flat => usize::MAX, // Never auto-build
258            VectorIndexType::RaBitQ => 1000,
259            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
260        })
261    }
262
263    /// Calculate optimal number of clusters for given vector count
264    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
265        self.num_clusters.unwrap_or_else(|| {
266            // sqrt(n) heuristic, capped at 4096
267            let optimal = (num_vectors as f64).sqrt() as usize;
268            optimal.clamp(16, 4096)
269        })
270    }
271}
272
273use super::query_field_router::QueryRouterRule;
274
275/// Schema defining document structure
276#[derive(Debug, Clone, Default, Serialize, Deserialize)]
277pub struct Schema {
278    fields: Vec<FieldEntry>,
279    name_to_field: HashMap<String, Field>,
280    /// Default fields for query parsing (when no field is specified)
281    #[serde(default)]
282    default_fields: Vec<Field>,
283    /// Query router rules for routing queries to specific fields based on regex patterns
284    #[serde(default)]
285    query_routers: Vec<QueryRouterRule>,
286}
287
288impl Schema {
289    pub fn builder() -> SchemaBuilder {
290        SchemaBuilder::default()
291    }
292
293    pub fn get_field(&self, name: &str) -> Option<Field> {
294        self.name_to_field.get(name).copied()
295    }
296
297    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
298        self.fields.get(field.0 as usize)
299    }
300
301    pub fn get_field_name(&self, field: Field) -> Option<&str> {
302        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
303    }
304
305    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
306        self.fields
307            .iter()
308            .enumerate()
309            .map(|(i, e)| (Field(i as u32), e))
310    }
311
312    pub fn num_fields(&self) -> usize {
313        self.fields.len()
314    }
315
316    /// Get the default fields for query parsing
317    pub fn default_fields(&self) -> &[Field] {
318        &self.default_fields
319    }
320
321    /// Set default fields (used by builder)
322    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
323        self.default_fields = fields;
324    }
325
326    /// Get the query router rules
327    pub fn query_routers(&self) -> &[QueryRouterRule] {
328        &self.query_routers
329    }
330
331    /// Set query router rules
332    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
333        self.query_routers = rules;
334    }
335}
336
337/// Builder for Schema
338#[derive(Debug, Default)]
339pub struct SchemaBuilder {
340    fields: Vec<FieldEntry>,
341    default_fields: Vec<String>,
342    query_routers: Vec<QueryRouterRule>,
343}
344
345impl SchemaBuilder {
346    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
347        self.add_field_with_tokenizer(
348            name,
349            FieldType::Text,
350            indexed,
351            stored,
352            Some("default".to_string()),
353        )
354    }
355
356    pub fn add_text_field_with_tokenizer(
357        &mut self,
358        name: &str,
359        indexed: bool,
360        stored: bool,
361        tokenizer: &str,
362    ) -> Field {
363        self.add_field_with_tokenizer(
364            name,
365            FieldType::Text,
366            indexed,
367            stored,
368            Some(tokenizer.to_string()),
369        )
370    }
371
372    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
373        self.add_field(name, FieldType::U64, indexed, stored)
374    }
375
376    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
377        self.add_field(name, FieldType::I64, indexed, stored)
378    }
379
380    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
381        self.add_field(name, FieldType::F64, indexed, stored)
382    }
383
384    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
385        self.add_field(name, FieldType::Bytes, false, stored)
386    }
387
388    /// Add a JSON field for storing arbitrary JSON data
389    ///
390    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
391    /// (objects, arrays, strings, numbers, booleans, null).
392    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
393        self.add_field(name, FieldType::Json, false, stored)
394    }
395
396    /// Add a sparse vector field with default configuration
397    ///
398    /// Sparse vectors are indexed as inverted posting lists where each dimension
399    /// becomes a "term" and documents have quantized weights for each dimension.
400    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
401        self.add_sparse_vector_field_with_config(
402            name,
403            indexed,
404            stored,
405            crate::structures::SparseVectorConfig::default(),
406        )
407    }
408
409    /// Add a sparse vector field with custom configuration
410    ///
411    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
412    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
413    pub fn add_sparse_vector_field_with_config(
414        &mut self,
415        name: &str,
416        indexed: bool,
417        stored: bool,
418        config: crate::structures::SparseVectorConfig,
419    ) -> Field {
420        let field = Field(self.fields.len() as u32);
421        self.fields.push(FieldEntry {
422            name: name.to_string(),
423            field_type: FieldType::SparseVector,
424            indexed,
425            stored,
426            tokenizer: None,
427            multi: false,
428            positions: None,
429            sparse_vector_config: Some(config),
430            dense_vector_config: None,
431        });
432        field
433    }
434
435    /// Set sparse vector configuration for an existing field
436    pub fn set_sparse_vector_config(
437        &mut self,
438        field: Field,
439        config: crate::structures::SparseVectorConfig,
440    ) {
441        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
442            entry.sparse_vector_config = Some(config);
443        }
444    }
445
446    /// Add a dense vector field with default configuration
447    ///
448    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
449    /// The dimension must be specified as it determines the quantization structure.
450    pub fn add_dense_vector_field(
451        &mut self,
452        name: &str,
453        dim: usize,
454        indexed: bool,
455        stored: bool,
456    ) -> Field {
457        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
458    }
459
460    /// Add a dense vector field with custom configuration
461    pub fn add_dense_vector_field_with_config(
462        &mut self,
463        name: &str,
464        indexed: bool,
465        stored: bool,
466        config: DenseVectorConfig,
467    ) -> Field {
468        let field = Field(self.fields.len() as u32);
469        self.fields.push(FieldEntry {
470            name: name.to_string(),
471            field_type: FieldType::DenseVector,
472            indexed,
473            stored,
474            tokenizer: None,
475            multi: false,
476            positions: None,
477            sparse_vector_config: None,
478            dense_vector_config: Some(config),
479        });
480        field
481    }
482
483    fn add_field(
484        &mut self,
485        name: &str,
486        field_type: FieldType,
487        indexed: bool,
488        stored: bool,
489    ) -> Field {
490        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
491    }
492
493    fn add_field_with_tokenizer(
494        &mut self,
495        name: &str,
496        field_type: FieldType,
497        indexed: bool,
498        stored: bool,
499        tokenizer: Option<String>,
500    ) -> Field {
501        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
502    }
503
504    fn add_field_full(
505        &mut self,
506        name: &str,
507        field_type: FieldType,
508        indexed: bool,
509        stored: bool,
510        tokenizer: Option<String>,
511        multi: bool,
512    ) -> Field {
513        let field = Field(self.fields.len() as u32);
514        self.fields.push(FieldEntry {
515            name: name.to_string(),
516            field_type,
517            indexed,
518            stored,
519            tokenizer,
520            multi,
521            positions: None,
522            sparse_vector_config: None,
523            dense_vector_config: None,
524        });
525        field
526    }
527
528    /// Set the multi attribute on the last added field
529    pub fn set_multi(&mut self, field: Field, multi: bool) {
530        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
531            entry.multi = multi;
532        }
533    }
534
535    /// Set position tracking mode for phrase queries and multi-field element tracking
536    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
537        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
538            entry.positions = Some(mode);
539        }
540    }
541
542    /// Set default fields by name
543    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
544        self.default_fields = field_names;
545    }
546
547    /// Set query router rules
548    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
549        self.query_routers = rules;
550    }
551
552    pub fn build(self) -> Schema {
553        let mut name_to_field = HashMap::new();
554        for (i, entry) in self.fields.iter().enumerate() {
555            name_to_field.insert(entry.name.clone(), Field(i as u32));
556        }
557
558        // Resolve default field names to Field IDs
559        let default_fields: Vec<Field> = self
560            .default_fields
561            .iter()
562            .filter_map(|name| name_to_field.get(name).copied())
563            .collect();
564
565        Schema {
566            fields: self.fields,
567            name_to_field,
568            default_fields,
569            query_routers: self.query_routers,
570        }
571    }
572}
573
574/// Value that can be stored in a field
575#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
576pub enum FieldValue {
577    #[serde(rename = "text")]
578    Text(String),
579    #[serde(rename = "u64")]
580    U64(u64),
581    #[serde(rename = "i64")]
582    I64(i64),
583    #[serde(rename = "f64")]
584    F64(f64),
585    #[serde(rename = "bytes")]
586    Bytes(Vec<u8>),
587    /// Sparse vector: list of (dimension_id, weight) pairs
588    #[serde(rename = "sparse_vector")]
589    SparseVector(Vec<(u32, f32)>),
590    /// Dense vector: float32 values
591    #[serde(rename = "dense_vector")]
592    DenseVector(Vec<f32>),
593    /// Arbitrary JSON value
594    #[serde(rename = "json")]
595    Json(serde_json::Value),
596}
597
598impl FieldValue {
599    pub fn as_text(&self) -> Option<&str> {
600        match self {
601            FieldValue::Text(s) => Some(s),
602            _ => None,
603        }
604    }
605
606    pub fn as_u64(&self) -> Option<u64> {
607        match self {
608            FieldValue::U64(v) => Some(*v),
609            _ => None,
610        }
611    }
612
613    pub fn as_i64(&self) -> Option<i64> {
614        match self {
615            FieldValue::I64(v) => Some(*v),
616            _ => None,
617        }
618    }
619
620    pub fn as_f64(&self) -> Option<f64> {
621        match self {
622            FieldValue::F64(v) => Some(*v),
623            _ => None,
624        }
625    }
626
627    pub fn as_bytes(&self) -> Option<&[u8]> {
628        match self {
629            FieldValue::Bytes(b) => Some(b),
630            _ => None,
631        }
632    }
633
634    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
635        match self {
636            FieldValue::SparseVector(entries) => Some(entries),
637            _ => None,
638        }
639    }
640
641    pub fn as_dense_vector(&self) -> Option<&[f32]> {
642        match self {
643            FieldValue::DenseVector(v) => Some(v),
644            _ => None,
645        }
646    }
647
648    pub fn as_json(&self) -> Option<&serde_json::Value> {
649        match self {
650            FieldValue::Json(v) => Some(v),
651            _ => None,
652        }
653    }
654}
655
656/// A document to be indexed
657#[derive(Debug, Clone, Default, Serialize, Deserialize)]
658pub struct Document {
659    field_values: Vec<(Field, FieldValue)>,
660}
661
662impl Document {
663    pub fn new() -> Self {
664        Self::default()
665    }
666
667    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
668        self.field_values
669            .push((field, FieldValue::Text(value.into())));
670    }
671
672    pub fn add_u64(&mut self, field: Field, value: u64) {
673        self.field_values.push((field, FieldValue::U64(value)));
674    }
675
676    pub fn add_i64(&mut self, field: Field, value: i64) {
677        self.field_values.push((field, FieldValue::I64(value)));
678    }
679
680    pub fn add_f64(&mut self, field: Field, value: f64) {
681        self.field_values.push((field, FieldValue::F64(value)));
682    }
683
684    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
685        self.field_values.push((field, FieldValue::Bytes(value)));
686    }
687
688    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
689        self.field_values
690            .push((field, FieldValue::SparseVector(entries)));
691    }
692
693    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
694        self.field_values
695            .push((field, FieldValue::DenseVector(values)));
696    }
697
698    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
699        self.field_values.push((field, FieldValue::Json(value)));
700    }
701
702    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
703        self.field_values
704            .iter()
705            .find(|(f, _)| *f == field)
706            .map(|(_, v)| v)
707    }
708
709    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
710        self.field_values
711            .iter()
712            .filter(move |(f, _)| *f == field)
713            .map(|(_, v)| v)
714    }
715
716    pub fn field_values(&self) -> &[(Field, FieldValue)] {
717        &self.field_values
718    }
719
720    /// Convert document to a JSON object using field names from schema
721    ///
722    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
723    /// Other fields with multiple values are also returned as arrays.
724    /// Fields with a single value (and not marked multi) are returned as scalar values.
725    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
726        use std::collections::HashMap;
727
728        // Group values by field, keeping track of field entry for multi check
729        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
730            HashMap::new();
731
732        for (field, value) in &self.field_values {
733            if let Some(entry) = schema.get_field_entry(*field) {
734                let json_value = match value {
735                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
736                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
737                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
738                    FieldValue::F64(n) => serde_json::json!(n),
739                    FieldValue::Bytes(b) => {
740                        use base64::Engine;
741                        serde_json::Value::String(
742                            base64::engine::general_purpose::STANDARD.encode(b),
743                        )
744                    }
745                    FieldValue::SparseVector(entries) => {
746                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
747                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
748                        serde_json::json!({
749                            "indices": indices,
750                            "values": values
751                        })
752                    }
753                    FieldValue::DenseVector(values) => {
754                        serde_json::json!(values)
755                    }
756                    FieldValue::Json(v) => v.clone(),
757                };
758                field_values_map
759                    .entry(*field)
760                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
761                    .2
762                    .push(json_value);
763            }
764        }
765
766        // Convert to JSON object, using arrays for multi fields or when multiple values exist
767        let mut map = serde_json::Map::new();
768        for (_field, (name, is_multi, values)) in field_values_map {
769            let json_value = if is_multi || values.len() > 1 {
770                serde_json::Value::Array(values)
771            } else {
772                values.into_iter().next().unwrap()
773            };
774            map.insert(name, json_value);
775        }
776
777        serde_json::Value::Object(map)
778    }
779
780    /// Create a Document from a JSON object using field names from schema
781    ///
782    /// Supports:
783    /// - String values -> Text fields
784    /// - Number values -> U64/I64/F64 fields (based on schema type)
785    /// - Array values -> Multiple values for the same field (multifields)
786    ///
787    /// Unknown fields (not in schema) are silently ignored.
788    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
789        let obj = json.as_object()?;
790        let mut doc = Document::new();
791
792        for (key, value) in obj {
793            if let Some(field) = schema.get_field(key) {
794                let field_entry = schema.get_field_entry(field)?;
795                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
796            }
797        }
798
799        Some(doc)
800    }
801
802    /// Helper to add a JSON value to a document, handling type conversion
803    fn add_json_value(
804        doc: &mut Document,
805        field: Field,
806        field_type: &FieldType,
807        value: &serde_json::Value,
808    ) {
809        match value {
810            serde_json::Value::String(s) => {
811                if matches!(field_type, FieldType::Text) {
812                    doc.add_text(field, s.clone());
813                }
814            }
815            serde_json::Value::Number(n) => {
816                match field_type {
817                    FieldType::I64 => {
818                        if let Some(i) = n.as_i64() {
819                            doc.add_i64(field, i);
820                        }
821                    }
822                    FieldType::U64 => {
823                        if let Some(u) = n.as_u64() {
824                            doc.add_u64(field, u);
825                        } else if let Some(i) = n.as_i64() {
826                            // Allow positive i64 as u64
827                            if i >= 0 {
828                                doc.add_u64(field, i as u64);
829                            }
830                        }
831                    }
832                    FieldType::F64 => {
833                        if let Some(f) = n.as_f64() {
834                            doc.add_f64(field, f);
835                        }
836                    }
837                    _ => {}
838                }
839            }
840            // Handle arrays (multifields) - add each element separately
841            serde_json::Value::Array(arr) => {
842                for item in arr {
843                    Self::add_json_value(doc, field, field_type, item);
844                }
845            }
846            // Handle sparse vector objects
847            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
848                if let (Some(indices_val), Some(values_val)) =
849                    (obj.get("indices"), obj.get("values"))
850                {
851                    let indices: Vec<u32> = indices_val
852                        .as_array()
853                        .map(|arr| {
854                            arr.iter()
855                                .filter_map(|v| v.as_u64().map(|n| n as u32))
856                                .collect()
857                        })
858                        .unwrap_or_default();
859                    let values: Vec<f32> = values_val
860                        .as_array()
861                        .map(|arr| {
862                            arr.iter()
863                                .filter_map(|v| v.as_f64().map(|n| n as f32))
864                                .collect()
865                        })
866                        .unwrap_or_default();
867                    if indices.len() == values.len() {
868                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
869                        doc.add_sparse_vector(field, entries);
870                    }
871                }
872            }
873            // Handle JSON fields - accept any value directly
874            _ if matches!(field_type, FieldType::Json) => {
875                doc.add_json(field, value.clone());
876            }
877            serde_json::Value::Object(_) => {}
878            _ => {}
879        }
880    }
881}
882
#[cfg(test)]
mod tests {
    use super::*;

    // Fields registered on the builder resolve by name; an unregistered name yields `None`.
    #[test]
    fn test_schema_builder() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let body_field = sb.add_text_field("body", true, false);
        let count_field = sb.add_u64_field("count", true, true);
        let built = sb.build();

        let lookups = [
            ("title", Some(title_field)),
            ("body", Some(body_field)),
            ("count", Some(count_field)),
            ("nonexistent", None),
        ];
        for (name, expected) in lookups {
            assert_eq!(built.get_field(name), expected);
        }
    }

    // Values added to a document are readable back through `get_first`.
    #[test]
    fn test_document() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let count_field = sb.add_u64_field("count", true, true);
        let _built = sb.build();

        let mut document = Document::new();
        document.add_text(title_field, "Hello World");
        document.add_u64(count_field, 42);

        let first_title = document.get_first(title_field).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));

        let first_count = document.get_first(count_field).unwrap();
        assert_eq!(first_count.as_u64(), Some(42));
    }

    // A document survives a serde_json string round trip with both values intact.
    #[test]
    fn test_document_serialization() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let count_field = sb.add_u64_field("count", true, true);
        let _built = sb.build();

        let mut original = Document::new();
        original.add_text(title_field, "Hello World");
        original.add_u64(count_field, 42);

        // Encode to a JSON string
        let encoded = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", encoded);

        // Decode it again and compare against what was put in
        let restored: Document = serde_json::from_str(&encoded).unwrap();
        assert_eq!(
            restored.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );

        let restored_title = restored.get_first(title_field).unwrap();
        assert_eq!(restored_title.as_text(), Some("Hello World"));

        let restored_count = restored.get_first(count_field).unwrap();
        assert_eq!(restored_count.as_u64(), Some(42));
    }

    // Adding the same field twice keeps both values, and `to_json` renders them as an array.
    #[test]
    fn test_multivalue_field() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let built = sb.build();

        // Two values for `uris`, one for `title`
        let mut document = Document::new();
        document.add_text(uris_field, "one");
        document.add_text(uris_field, "two");
        document.add_text(title_field, "Test Document");

        // `get_first` yields the value that was added first
        let first_uri = document.get_first(uris_field).unwrap();
        assert_eq!(first_uri.as_text(), Some("one"));

        // `get_all` yields every value, in insertion order
        let uri_values: Vec<_> = document.get_all(uris_field).collect();
        let uri_texts: Vec<_> = uri_values.iter().map(|value| value.as_text()).collect();
        assert_eq!(uri_texts, [Some("one"), Some("two")]);

        // The multi-valued field is rendered as a JSON array
        let rendered = document.to_json(&built);
        let uris_node = rendered.get("uris").unwrap();
        assert!(uris_node.is_array(), "Multi-value field should be an array");
        let rendered_uris: Vec<_> = uris_node
            .as_array()
            .unwrap()
            .iter()
            .map(|item| item.as_str())
            .collect();
        assert_eq!(rendered_uris, [Some("one"), Some("two")]);

        // The single-valued field is rendered as a plain string
        let title_node = rendered.get("title").unwrap();
        assert!(
            title_node.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_node.as_str(), Some("Test Document"));
    }

    // A JSON array input becomes multiple values for the field and round-trips through `to_json`.
    #[test]
    fn test_multivalue_from_json() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let built = sb.build();

        // Input where `uris` is an array and `title` is a scalar
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let document = Document::from_json(&input, &built).unwrap();

        // Both array elements are present, in order
        let uri_values: Vec<_> = document.get_all(uris_field).collect();
        let uri_texts: Vec<_> = uri_values.iter().map(|value| value.as_text()).collect();
        assert_eq!(uri_texts, [Some("one"), Some("two")]);

        // The scalar field holds its single value
        let parsed_title = document.get_first(title_field).unwrap();
        assert_eq!(parsed_title.as_text(), Some("Test Document"));

        // Rendering back to JSON reproduces the array
        let rendered = document.to_json(&built);
        let rendered_uris: Vec<_> = rendered
            .get("uris")
            .unwrap()
            .as_array()
            .unwrap()
            .iter()
            .map(|item| item.as_str())
            .collect();
        assert_eq!(rendered_uris, [Some("one"), Some("two")]);
    }

    // A field flagged `multi` is rendered as an array even when it holds a single value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        sb.set_multi(uris_field, true); // Only `uris` is flagged as multi
        let title_field = sb.add_text_field("title", true, true);
        let built = sb.build();

        // The flag is recorded on `uris` and absent on `title`
        let uris_is_multi = built.get_field_entry(uris_field).unwrap().multi;
        assert!(uris_is_multi);
        let title_is_multi = built.get_field_entry(title_field).unwrap().multi;
        assert!(!title_is_multi);

        // One value per field
        let mut document = Document::new();
        document.add_text(uris_field, "only_one");
        document.add_text(title_field, "Test Document");

        let rendered = document.to_json(&built);

        // The multi field is an array of length one
        let uris_node = rendered.get("uris").unwrap();
        assert!(
            uris_node.is_array(),
            "Multi field should be array even with single value"
        );
        let rendered_uris: Vec<_> = uris_node
            .as_array()
            .unwrap()
            .iter()
            .map(|item| item.as_str())
            .collect();
        assert_eq!(rendered_uris, [Some("only_one")]);

        // The non-multi field stays a plain string
        let title_node = rendered.get("title").unwrap();
        assert!(
            title_node.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_node.as_str(), Some("Test Document"));
    }

    // Sparse vectors are stored as (index, weight) pairs and round-trip through JSON.
    #[test]
    fn test_sparse_vector_field() {
        let mut sb = Schema::builder();
        let embedding_field = sb.add_sparse_vector_field("embedding", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let built = sb.build();

        assert_eq!(built.get_field("embedding"), Some(embedding_field));
        let embedding_entry = built.get_field_entry(embedding_field).unwrap();
        assert_eq!(embedding_entry.field_type, FieldType::SparseVector);

        // Document carrying a three-entry sparse vector
        let mut document = Document::new();
        document.add_sparse_vector(embedding_field, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
        document.add_text(title_field, "Test Document");

        // The accessor returns exactly the entries that were added
        let stored_entries = document
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(stored_entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);

        // Rendered form is an object whose `indices` array has three items
        let rendered = document.to_json(&built);
        let embedding_node = rendered.get("embedding").unwrap();
        assert!(embedding_node.is_object());
        let indices_len = embedding_node
            .get("indices")
            .unwrap()
            .as_array()
            .unwrap()
            .len();
        assert_eq!(indices_len, 3);

        // Parsing the rendered form yields the same entries, weights within tolerance
        let reparsed = Document::from_json(&rendered, &built).unwrap();
        let reparsed_entries = reparsed
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        let expected_entries = [(0u32, 1.0f32), (5, 2.5), (10, 0.5)];
        for (position, (index, weight)) in expected_entries.iter().enumerate() {
            assert_eq!(reparsed_entries[position].0, *index);
            assert!((reparsed_entries[position].1 - *weight).abs() < 1e-6);
        }
    }

    // JSON fields are stored but not indexed, and hold an arbitrary JSON object unchanged.
    #[test]
    fn test_json_field() {
        let mut sb = Schema::builder();
        let metadata_field = sb.add_json_field("metadata", true);
        let title_field = sb.add_text_field("title", true, true);
        let built = sb.build();

        assert_eq!(built.get_field("metadata"), Some(metadata_field));
        let metadata_entry = built.get_field_entry(metadata_field).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        // The entry reports not-indexed and stored
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);

        // Object payload with a string, an array and a nested object
        let payload = serde_json::json!({
            "author": "John Doe",
            "tags": ["rust", "search"],
            "nested": {"key": "value"}
        });
        let mut document = Document::new();
        document.add_json(metadata_field, payload.clone());
        document.add_text(title_field, "Test Document");

        // The accessor hands back an equal value
        let held = document
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(held, &payload);
        let author = held.get("author").unwrap();
        assert_eq!(author.as_str(), Some("John Doe"));

        // `to_json` emits the payload unchanged under the field name
        let rendered = document.to_json(&built);
        let rendered_metadata = rendered.get("metadata").unwrap();
        assert_eq!(rendered_metadata, &payload);

        // `from_json` on the rendered form restores the same payload
        let reparsed = Document::from_json(&rendered, &built).unwrap();
        let reparsed_payload = reparsed
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(reparsed_payload, &payload);
    }

    // A JSON field accepts every JSON value kind: array, string, number, null and boolean.
    #[test]
    fn test_json_field_various_types() {
        let mut sb = Schema::builder();
        let data_field = sb.add_json_field("data", true);
        let _built = sb.build();

        let samples = [
            serde_json::json!([1, 2, 3, "four", null]),
            serde_json::json!("just a string"),
            serde_json::json!(42.5),
            serde_json::Value::Null,
            serde_json::json!(true),
        ];

        // Each sample goes into its own fresh document and must come back equal
        for sample in &samples {
            let mut document = Document::new();
            document.add_json(data_field, sample.clone());
            let held = document.get_first(data_field).unwrap().as_json().unwrap();
            assert_eq!(held, sample);
        }
    }
}