// hermes_core/dsl/schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60}
61
62/// Position tracking mode for text fields
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum PositionMode {
66    /// Track only element ordinal for multi-valued fields (which array element)
67    /// Useful for returning which element matched without full phrase query support
68    Ordinal,
69    /// Track only token position within text (for phrase queries)
70    /// Does not track element ordinal - all positions are relative to concatenated text
71    TokenPosition,
72    /// Track both element ordinal and token position (full support)
73    /// Position format: (element_ordinal << 20) | token_position
74    Full,
75}
76
77impl PositionMode {
78    /// Whether this mode tracks element ordinals
79    pub fn tracks_ordinal(&self) -> bool {
80        matches!(self, PositionMode::Ordinal | PositionMode::Full)
81    }
82
83    /// Whether this mode tracks token positions
84    pub fn tracks_token_position(&self) -> bool {
85        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
86    }
87}
88
89/// Vector index algorithm type
90#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
91#[serde(rename_all = "snake_case")]
92pub enum VectorIndexType {
93    /// Flat - brute-force search over raw vectors (accumulating state)
94    Flat,
95    /// RaBitQ - binary quantization, good for small datasets (<100K)
96    #[default]
97    RaBitQ,
98    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
99    IvfRaBitQ,
100    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
101    ScaNN,
102}
103
104/// Storage quantization for dense vector elements
105///
106/// Controls the precision of each vector coordinate in `.vectors` files.
107/// Lower precision reduces storage and memory bandwidth; scoring uses
108/// native-precision SIMD (no dequantization on the hot path).
109#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
110#[serde(rename_all = "snake_case")]
111pub enum DenseVectorQuantization {
112    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
113    #[default]
114    F32,
115    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
116    F16,
117    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
118    UInt8,
119}
120
121impl DenseVectorQuantization {
122    /// Bytes per element for this quantization type
123    pub fn element_size(self) -> usize {
124        match self {
125            Self::F32 => 4,
126            Self::F16 => 2,
127            Self::UInt8 => 1,
128        }
129    }
130
131    /// Wire format tag (stored in .vectors header)
132    pub fn tag(self) -> u8 {
133        match self {
134            Self::F32 => 0,
135            Self::F16 => 1,
136            Self::UInt8 => 2,
137        }
138    }
139
140    /// Decode wire format tag
141    pub fn from_tag(tag: u8) -> Option<Self> {
142        match tag {
143            0 => Some(Self::F32),
144            1 => Some(Self::F16),
145            2 => Some(Self::UInt8),
146            _ => None,
147        }
148    }
149}
150
151/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
152///
153/// Indexes operate in two states:
154/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
155///   is below `build_threshold` or before `build_index` is called.
156/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
157///   Centroids and codebooks are trained from data and stored within the segment.
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct DenseVectorConfig {
160    /// Dimensionality of vectors
161    pub dim: usize,
162    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
163    /// When in accumulating state, search uses brute-force regardless of this setting.
164    #[serde(default)]
165    pub index_type: VectorIndexType,
166    /// Storage quantization for vector elements (f32, f16, uint8)
167    #[serde(default)]
168    pub quantization: DenseVectorQuantization,
169    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
170    /// If None, automatically determined based on dataset size.
171    #[serde(default, skip_serializing_if = "Option::is_none")]
172    pub num_clusters: Option<usize>,
173    /// Number of clusters to probe during search (default: 32)
174    #[serde(default = "default_nprobe")]
175    pub nprobe: usize,
176    /// Minimum number of vectors required before building ANN index.
177    /// Below this threshold, brute-force (Flat) search is used.
178    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
179    #[serde(default, skip_serializing_if = "Option::is_none")]
180    pub build_threshold: Option<usize>,
181}
182
/// Serde default for `DenseVectorConfig::nprobe`.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
186
187impl DenseVectorConfig {
188    pub fn new(dim: usize) -> Self {
189        Self {
190            dim,
191            index_type: VectorIndexType::RaBitQ,
192            quantization: DenseVectorQuantization::F32,
193            num_clusters: None,
194            nprobe: 32,
195            build_threshold: None,
196        }
197    }
198
199    /// Create IVF-RaBitQ configuration
200    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
201        Self {
202            dim,
203            index_type: VectorIndexType::IvfRaBitQ,
204            quantization: DenseVectorQuantization::F32,
205            num_clusters,
206            nprobe,
207            build_threshold: None,
208        }
209    }
210
211    /// Create ScaNN configuration
212    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
213        Self {
214            dim,
215            index_type: VectorIndexType::ScaNN,
216            quantization: DenseVectorQuantization::F32,
217            num_clusters,
218            nprobe,
219            build_threshold: None,
220        }
221    }
222
223    /// Create Flat (brute-force) configuration - no ANN index
224    pub fn flat(dim: usize) -> Self {
225        Self {
226            dim,
227            index_type: VectorIndexType::Flat,
228            quantization: DenseVectorQuantization::F32,
229            num_clusters: None,
230            nprobe: 0,
231            build_threshold: None,
232        }
233    }
234
235    /// Set storage quantization
236    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
237        self.quantization = quantization;
238        self
239    }
240
241    /// Set build threshold for auto-building ANN index
242    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
243        self.build_threshold = Some(threshold);
244        self
245    }
246
247    /// Set number of IVF clusters
248    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
249        self.num_clusters = Some(num_clusters);
250        self
251    }
252
253    /// Check if this config uses IVF
254    pub fn uses_ivf(&self) -> bool {
255        matches!(
256            self.index_type,
257            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
258        )
259    }
260
261    /// Check if this config uses ScaNN
262    pub fn uses_scann(&self) -> bool {
263        self.index_type == VectorIndexType::ScaNN
264    }
265
266    /// Check if this config is flat (brute-force)
267    pub fn is_flat(&self) -> bool {
268        self.index_type == VectorIndexType::Flat
269    }
270
271    /// Get the default build threshold for this index type
272    pub fn default_build_threshold(&self) -> usize {
273        self.build_threshold.unwrap_or(match self.index_type {
274            VectorIndexType::Flat => usize::MAX, // Never auto-build
275            VectorIndexType::RaBitQ => 1000,
276            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
277        })
278    }
279
280    /// Calculate optimal number of clusters for given vector count
281    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
282        self.num_clusters.unwrap_or_else(|| {
283            // sqrt(n) heuristic, capped at 4096
284            let optimal = (num_vectors as f64).sqrt() as usize;
285            optimal.clamp(16, 4096)
286        })
287    }
288}
289
290use super::query_field_router::QueryRouterRule;
291
292/// Schema defining document structure
293#[derive(Debug, Clone, Default, Serialize, Deserialize)]
294pub struct Schema {
295    fields: Vec<FieldEntry>,
296    name_to_field: HashMap<String, Field>,
297    /// Default fields for query parsing (when no field is specified)
298    #[serde(default)]
299    default_fields: Vec<Field>,
300    /// Query router rules for routing queries to specific fields based on regex patterns
301    #[serde(default)]
302    query_routers: Vec<QueryRouterRule>,
303}
304
305impl Schema {
306    pub fn builder() -> SchemaBuilder {
307        SchemaBuilder::default()
308    }
309
310    pub fn get_field(&self, name: &str) -> Option<Field> {
311        self.name_to_field.get(name).copied()
312    }
313
314    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
315        self.fields.get(field.0 as usize)
316    }
317
318    pub fn get_field_name(&self, field: Field) -> Option<&str> {
319        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
320    }
321
322    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
323        self.fields
324            .iter()
325            .enumerate()
326            .map(|(i, e)| (Field(i as u32), e))
327    }
328
329    pub fn num_fields(&self) -> usize {
330        self.fields.len()
331    }
332
333    /// Get the default fields for query parsing
334    pub fn default_fields(&self) -> &[Field] {
335        &self.default_fields
336    }
337
338    /// Set default fields (used by builder)
339    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
340        self.default_fields = fields;
341    }
342
343    /// Get the query router rules
344    pub fn query_routers(&self) -> &[QueryRouterRule] {
345        &self.query_routers
346    }
347
348    /// Set query router rules
349    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
350        self.query_routers = rules;
351    }
352}
353
354/// Builder for Schema
355#[derive(Debug, Default)]
356pub struct SchemaBuilder {
357    fields: Vec<FieldEntry>,
358    default_fields: Vec<String>,
359    query_routers: Vec<QueryRouterRule>,
360}
361
362impl SchemaBuilder {
363    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
364        self.add_field_with_tokenizer(
365            name,
366            FieldType::Text,
367            indexed,
368            stored,
369            Some("default".to_string()),
370        )
371    }
372
373    pub fn add_text_field_with_tokenizer(
374        &mut self,
375        name: &str,
376        indexed: bool,
377        stored: bool,
378        tokenizer: &str,
379    ) -> Field {
380        self.add_field_with_tokenizer(
381            name,
382            FieldType::Text,
383            indexed,
384            stored,
385            Some(tokenizer.to_string()),
386        )
387    }
388
389    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
390        self.add_field(name, FieldType::U64, indexed, stored)
391    }
392
393    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
394        self.add_field(name, FieldType::I64, indexed, stored)
395    }
396
397    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
398        self.add_field(name, FieldType::F64, indexed, stored)
399    }
400
401    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
402        self.add_field(name, FieldType::Bytes, false, stored)
403    }
404
405    /// Add a JSON field for storing arbitrary JSON data
406    ///
407    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
408    /// (objects, arrays, strings, numbers, booleans, null).
409    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
410        self.add_field(name, FieldType::Json, false, stored)
411    }
412
413    /// Add a sparse vector field with default configuration
414    ///
415    /// Sparse vectors are indexed as inverted posting lists where each dimension
416    /// becomes a "term" and documents have quantized weights for each dimension.
417    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
418        self.add_sparse_vector_field_with_config(
419            name,
420            indexed,
421            stored,
422            crate::structures::SparseVectorConfig::default(),
423        )
424    }
425
426    /// Add a sparse vector field with custom configuration
427    ///
428    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
429    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
430    pub fn add_sparse_vector_field_with_config(
431        &mut self,
432        name: &str,
433        indexed: bool,
434        stored: bool,
435        config: crate::structures::SparseVectorConfig,
436    ) -> Field {
437        let field = Field(self.fields.len() as u32);
438        self.fields.push(FieldEntry {
439            name: name.to_string(),
440            field_type: FieldType::SparseVector,
441            indexed,
442            stored,
443            tokenizer: None,
444            multi: false,
445            positions: None,
446            sparse_vector_config: Some(config),
447            dense_vector_config: None,
448        });
449        field
450    }
451
452    /// Set sparse vector configuration for an existing field
453    pub fn set_sparse_vector_config(
454        &mut self,
455        field: Field,
456        config: crate::structures::SparseVectorConfig,
457    ) {
458        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
459            entry.sparse_vector_config = Some(config);
460        }
461    }
462
463    /// Add a dense vector field with default configuration
464    ///
465    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
466    /// The dimension must be specified as it determines the quantization structure.
467    pub fn add_dense_vector_field(
468        &mut self,
469        name: &str,
470        dim: usize,
471        indexed: bool,
472        stored: bool,
473    ) -> Field {
474        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
475    }
476
477    /// Add a dense vector field with custom configuration
478    pub fn add_dense_vector_field_with_config(
479        &mut self,
480        name: &str,
481        indexed: bool,
482        stored: bool,
483        config: DenseVectorConfig,
484    ) -> Field {
485        let field = Field(self.fields.len() as u32);
486        self.fields.push(FieldEntry {
487            name: name.to_string(),
488            field_type: FieldType::DenseVector,
489            indexed,
490            stored,
491            tokenizer: None,
492            multi: false,
493            positions: None,
494            sparse_vector_config: None,
495            dense_vector_config: Some(config),
496        });
497        field
498    }
499
500    fn add_field(
501        &mut self,
502        name: &str,
503        field_type: FieldType,
504        indexed: bool,
505        stored: bool,
506    ) -> Field {
507        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
508    }
509
510    fn add_field_with_tokenizer(
511        &mut self,
512        name: &str,
513        field_type: FieldType,
514        indexed: bool,
515        stored: bool,
516        tokenizer: Option<String>,
517    ) -> Field {
518        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
519    }
520
521    fn add_field_full(
522        &mut self,
523        name: &str,
524        field_type: FieldType,
525        indexed: bool,
526        stored: bool,
527        tokenizer: Option<String>,
528        multi: bool,
529    ) -> Field {
530        let field = Field(self.fields.len() as u32);
531        self.fields.push(FieldEntry {
532            name: name.to_string(),
533            field_type,
534            indexed,
535            stored,
536            tokenizer,
537            multi,
538            positions: None,
539            sparse_vector_config: None,
540            dense_vector_config: None,
541        });
542        field
543    }
544
545    /// Set the multi attribute on the last added field
546    pub fn set_multi(&mut self, field: Field, multi: bool) {
547        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
548            entry.multi = multi;
549        }
550    }
551
552    /// Set position tracking mode for phrase queries and multi-field element tracking
553    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
554        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
555            entry.positions = Some(mode);
556        }
557    }
558
559    /// Set default fields by name
560    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
561        self.default_fields = field_names;
562    }
563
564    /// Set query router rules
565    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
566        self.query_routers = rules;
567    }
568
569    pub fn build(self) -> Schema {
570        let mut name_to_field = HashMap::new();
571        for (i, entry) in self.fields.iter().enumerate() {
572            name_to_field.insert(entry.name.clone(), Field(i as u32));
573        }
574
575        // Resolve default field names to Field IDs
576        let default_fields: Vec<Field> = self
577            .default_fields
578            .iter()
579            .filter_map(|name| name_to_field.get(name).copied())
580            .collect();
581
582        Schema {
583            fields: self.fields,
584            name_to_field,
585            default_fields,
586            query_routers: self.query_routers,
587        }
588    }
589}
590
591/// Value that can be stored in a field
592#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
593pub enum FieldValue {
594    #[serde(rename = "text")]
595    Text(String),
596    #[serde(rename = "u64")]
597    U64(u64),
598    #[serde(rename = "i64")]
599    I64(i64),
600    #[serde(rename = "f64")]
601    F64(f64),
602    #[serde(rename = "bytes")]
603    Bytes(Vec<u8>),
604    /// Sparse vector: list of (dimension_id, weight) pairs
605    #[serde(rename = "sparse_vector")]
606    SparseVector(Vec<(u32, f32)>),
607    /// Dense vector: float32 values
608    #[serde(rename = "dense_vector")]
609    DenseVector(Vec<f32>),
610    /// Arbitrary JSON value
611    #[serde(rename = "json")]
612    Json(serde_json::Value),
613}
614
615impl FieldValue {
616    pub fn as_text(&self) -> Option<&str> {
617        match self {
618            FieldValue::Text(s) => Some(s),
619            _ => None,
620        }
621    }
622
623    pub fn as_u64(&self) -> Option<u64> {
624        match self {
625            FieldValue::U64(v) => Some(*v),
626            _ => None,
627        }
628    }
629
630    pub fn as_i64(&self) -> Option<i64> {
631        match self {
632            FieldValue::I64(v) => Some(*v),
633            _ => None,
634        }
635    }
636
637    pub fn as_f64(&self) -> Option<f64> {
638        match self {
639            FieldValue::F64(v) => Some(*v),
640            _ => None,
641        }
642    }
643
644    pub fn as_bytes(&self) -> Option<&[u8]> {
645        match self {
646            FieldValue::Bytes(b) => Some(b),
647            _ => None,
648        }
649    }
650
651    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
652        match self {
653            FieldValue::SparseVector(entries) => Some(entries),
654            _ => None,
655        }
656    }
657
658    pub fn as_dense_vector(&self) -> Option<&[f32]> {
659        match self {
660            FieldValue::DenseVector(v) => Some(v),
661            _ => None,
662        }
663    }
664
665    pub fn as_json(&self) -> Option<&serde_json::Value> {
666        match self {
667            FieldValue::Json(v) => Some(v),
668            _ => None,
669        }
670    }
671}
672
673/// A document to be indexed
674#[derive(Debug, Clone, Default, Serialize, Deserialize)]
675pub struct Document {
676    field_values: Vec<(Field, FieldValue)>,
677}
678
679impl Document {
680    pub fn new() -> Self {
681        Self::default()
682    }
683
684    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
685        self.field_values
686            .push((field, FieldValue::Text(value.into())));
687    }
688
689    pub fn add_u64(&mut self, field: Field, value: u64) {
690        self.field_values.push((field, FieldValue::U64(value)));
691    }
692
693    pub fn add_i64(&mut self, field: Field, value: i64) {
694        self.field_values.push((field, FieldValue::I64(value)));
695    }
696
697    pub fn add_f64(&mut self, field: Field, value: f64) {
698        self.field_values.push((field, FieldValue::F64(value)));
699    }
700
701    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
702        self.field_values.push((field, FieldValue::Bytes(value)));
703    }
704
705    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
706        self.field_values
707            .push((field, FieldValue::SparseVector(entries)));
708    }
709
710    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
711        self.field_values
712            .push((field, FieldValue::DenseVector(values)));
713    }
714
715    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
716        self.field_values.push((field, FieldValue::Json(value)));
717    }
718
719    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
720        self.field_values
721            .iter()
722            .find(|(f, _)| *f == field)
723            .map(|(_, v)| v)
724    }
725
726    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
727        self.field_values
728            .iter()
729            .filter(move |(f, _)| *f == field)
730            .map(|(_, v)| v)
731    }
732
733    pub fn field_values(&self) -> &[(Field, FieldValue)] {
734        &self.field_values
735    }
736
737    /// Return a new Document containing only fields marked as `stored` in the schema
738    pub fn filter_stored(&self, schema: &Schema) -> Document {
739        Document {
740            field_values: self
741                .field_values
742                .iter()
743                .filter(|(field, _)| {
744                    schema
745                        .get_field_entry(*field)
746                        .is_some_and(|entry| entry.stored)
747                })
748                .cloned()
749                .collect(),
750        }
751    }
752
753    /// Convert document to a JSON object using field names from schema
754    ///
755    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
756    /// Other fields with multiple values are also returned as arrays.
757    /// Fields with a single value (and not marked multi) are returned as scalar values.
758    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
759        use std::collections::HashMap;
760
761        // Group values by field, keeping track of field entry for multi check
762        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
763            HashMap::new();
764
765        for (field, value) in &self.field_values {
766            if let Some(entry) = schema.get_field_entry(*field) {
767                let json_value = match value {
768                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
769                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
770                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
771                    FieldValue::F64(n) => serde_json::json!(n),
772                    FieldValue::Bytes(b) => {
773                        use base64::Engine;
774                        serde_json::Value::String(
775                            base64::engine::general_purpose::STANDARD.encode(b),
776                        )
777                    }
778                    FieldValue::SparseVector(entries) => {
779                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
780                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
781                        serde_json::json!({
782                            "indices": indices,
783                            "values": values
784                        })
785                    }
786                    FieldValue::DenseVector(values) => {
787                        serde_json::json!(values)
788                    }
789                    FieldValue::Json(v) => v.clone(),
790                };
791                field_values_map
792                    .entry(*field)
793                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
794                    .2
795                    .push(json_value);
796            }
797        }
798
799        // Convert to JSON object, using arrays for multi fields or when multiple values exist
800        let mut map = serde_json::Map::new();
801        for (_field, (name, is_multi, values)) in field_values_map {
802            let json_value = if is_multi || values.len() > 1 {
803                serde_json::Value::Array(values)
804            } else {
805                values.into_iter().next().unwrap()
806            };
807            map.insert(name, json_value);
808        }
809
810        serde_json::Value::Object(map)
811    }
812
813    /// Create a Document from a JSON object using field names from schema
814    ///
815    /// Supports:
816    /// - String values -> Text fields
817    /// - Number values -> U64/I64/F64 fields (based on schema type)
818    /// - Array values -> Multiple values for the same field (multifields)
819    ///
820    /// Unknown fields (not in schema) are silently ignored.
821    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
822        let obj = json.as_object()?;
823        let mut doc = Document::new();
824
825        for (key, value) in obj {
826            if let Some(field) = schema.get_field(key) {
827                let field_entry = schema.get_field_entry(field)?;
828                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
829            }
830        }
831
832        Some(doc)
833    }
834
835    /// Helper to add a JSON value to a document, handling type conversion
836    fn add_json_value(
837        doc: &mut Document,
838        field: Field,
839        field_type: &FieldType,
840        value: &serde_json::Value,
841    ) {
842        match value {
843            serde_json::Value::String(s) => {
844                if matches!(field_type, FieldType::Text) {
845                    doc.add_text(field, s.clone());
846                }
847            }
848            serde_json::Value::Number(n) => {
849                match field_type {
850                    FieldType::I64 => {
851                        if let Some(i) = n.as_i64() {
852                            doc.add_i64(field, i);
853                        }
854                    }
855                    FieldType::U64 => {
856                        if let Some(u) = n.as_u64() {
857                            doc.add_u64(field, u);
858                        } else if let Some(i) = n.as_i64() {
859                            // Allow positive i64 as u64
860                            if i >= 0 {
861                                doc.add_u64(field, i as u64);
862                            }
863                        }
864                    }
865                    FieldType::F64 => {
866                        if let Some(f) = n.as_f64() {
867                            doc.add_f64(field, f);
868                        }
869                    }
870                    _ => {}
871                }
872            }
873            // Handle arrays (multifields) - add each element separately
874            serde_json::Value::Array(arr) => {
875                for item in arr {
876                    Self::add_json_value(doc, field, field_type, item);
877                }
878            }
879            // Handle sparse vector objects
880            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
881                if let (Some(indices_val), Some(values_val)) =
882                    (obj.get("indices"), obj.get("values"))
883                {
884                    let indices: Vec<u32> = indices_val
885                        .as_array()
886                        .map(|arr| {
887                            arr.iter()
888                                .filter_map(|v| v.as_u64().map(|n| n as u32))
889                                .collect()
890                        })
891                        .unwrap_or_default();
892                    let values: Vec<f32> = values_val
893                        .as_array()
894                        .map(|arr| {
895                            arr.iter()
896                                .filter_map(|v| v.as_f64().map(|n| n as f32))
897                                .collect()
898                        })
899                        .unwrap_or_default();
900                    if indices.len() == values.len() {
901                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
902                        doc.add_sparse_vector(field, entries);
903                    }
904                }
905            }
906            // Handle JSON fields - accept any value directly
907            _ if matches!(field_type, FieldType::Json) => {
908                doc.add_json(field, value.clone());
909            }
910            serde_json::Value::Object(_) => {}
911            _ => {}
912        }
913    }
914}
915
916#[cfg(test)]
917mod tests {
918    use super::*;
919
920    #[test]
921    fn test_schema_builder() {
922        let mut builder = Schema::builder();
923        let title = builder.add_text_field("title", true, true);
924        let body = builder.add_text_field("body", true, false);
925        let count = builder.add_u64_field("count", true, true);
926        let schema = builder.build();
927
928        assert_eq!(schema.get_field("title"), Some(title));
929        assert_eq!(schema.get_field("body"), Some(body));
930        assert_eq!(schema.get_field("count"), Some(count));
931        assert_eq!(schema.get_field("nonexistent"), None);
932    }
933
934    #[test]
935    fn test_document() {
936        let mut builder = Schema::builder();
937        let title = builder.add_text_field("title", true, true);
938        let count = builder.add_u64_field("count", true, true);
939        let _schema = builder.build();
940
941        let mut doc = Document::new();
942        doc.add_text(title, "Hello World");
943        doc.add_u64(count, 42);
944
945        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
946        assert_eq!(doc.get_first(count).unwrap().as_u64(), Some(42));
947    }
948
949    #[test]
950    fn test_document_serialization() {
951        let mut builder = Schema::builder();
952        let title = builder.add_text_field("title", true, true);
953        let count = builder.add_u64_field("count", true, true);
954        let _schema = builder.build();
955
956        let mut doc = Document::new();
957        doc.add_text(title, "Hello World");
958        doc.add_u64(count, 42);
959
960        // Serialize
961        let json = serde_json::to_string(&doc).unwrap();
962        println!("Serialized doc: {}", json);
963
964        // Deserialize
965        let doc2: Document = serde_json::from_str(&json).unwrap();
966        assert_eq!(
967            doc2.field_values().len(),
968            2,
969            "Should have 2 field values after deserialization"
970        );
971        assert_eq!(
972            doc2.get_first(title).unwrap().as_text(),
973            Some("Hello World")
974        );
975        assert_eq!(doc2.get_first(count).unwrap().as_u64(), Some(42));
976    }
977
978    #[test]
979    fn test_multivalue_field() {
980        let mut builder = Schema::builder();
981        let uris = builder.add_text_field("uris", true, true);
982        let title = builder.add_text_field("title", true, true);
983        let schema = builder.build();
984
985        // Create document with multiple values for the same field
986        let mut doc = Document::new();
987        doc.add_text(uris, "one");
988        doc.add_text(uris, "two");
989        doc.add_text(title, "Test Document");
990
991        // Verify get_first returns the first value
992        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));
993
994        // Verify get_all returns all values
995        let all_uris: Vec<_> = doc.get_all(uris).collect();
996        assert_eq!(all_uris.len(), 2);
997        assert_eq!(all_uris[0].as_text(), Some("one"));
998        assert_eq!(all_uris[1].as_text(), Some("two"));
999
1000        // Verify to_json returns array for multi-value field
1001        let json = doc.to_json(&schema);
1002        let uris_json = json.get("uris").unwrap();
1003        assert!(uris_json.is_array(), "Multi-value field should be an array");
1004        let uris_arr = uris_json.as_array().unwrap();
1005        assert_eq!(uris_arr.len(), 2);
1006        assert_eq!(uris_arr[0].as_str(), Some("one"));
1007        assert_eq!(uris_arr[1].as_str(), Some("two"));
1008
1009        // Verify single-value field is NOT an array
1010        let title_json = json.get("title").unwrap();
1011        assert!(
1012            title_json.is_string(),
1013            "Single-value field should be a string"
1014        );
1015        assert_eq!(title_json.as_str(), Some("Test Document"));
1016    }
1017
1018    #[test]
1019    fn test_multivalue_from_json() {
1020        let mut builder = Schema::builder();
1021        let uris = builder.add_text_field("uris", true, true);
1022        let title = builder.add_text_field("title", true, true);
1023        let schema = builder.build();
1024
1025        // Create JSON with array value
1026        let json = serde_json::json!({
1027            "uris": ["one", "two"],
1028            "title": "Test Document"
1029        });
1030
1031        // Parse from JSON
1032        let doc = Document::from_json(&json, &schema).unwrap();
1033
1034        // Verify all values are present
1035        let all_uris: Vec<_> = doc.get_all(uris).collect();
1036        assert_eq!(all_uris.len(), 2);
1037        assert_eq!(all_uris[0].as_text(), Some("one"));
1038        assert_eq!(all_uris[1].as_text(), Some("two"));
1039
1040        // Verify single value
1041        assert_eq!(
1042            doc.get_first(title).unwrap().as_text(),
1043            Some("Test Document")
1044        );
1045
1046        // Verify roundtrip: to_json should produce equivalent JSON
1047        let json_out = doc.to_json(&schema);
1048        let uris_out = json_out.get("uris").unwrap().as_array().unwrap();
1049        assert_eq!(uris_out.len(), 2);
1050        assert_eq!(uris_out[0].as_str(), Some("one"));
1051        assert_eq!(uris_out[1].as_str(), Some("two"));
1052    }
1053
1054    #[test]
1055    fn test_multi_attribute_forces_array() {
1056        // Test that fields marked as 'multi' are always serialized as arrays,
1057        // even when they have only one value
1058        let mut builder = Schema::builder();
1059        let uris = builder.add_text_field("uris", true, true);
1060        builder.set_multi(uris, true); // Mark as multi
1061        let title = builder.add_text_field("title", true, true);
1062        let schema = builder.build();
1063
1064        // Verify the multi attribute is set
1065        assert!(schema.get_field_entry(uris).unwrap().multi);
1066        assert!(!schema.get_field_entry(title).unwrap().multi);
1067
1068        // Create document with single value for multi field
1069        let mut doc = Document::new();
1070        doc.add_text(uris, "only_one");
1071        doc.add_text(title, "Test Document");
1072
1073        // Verify to_json returns array for multi field even with single value
1074        let json = doc.to_json(&schema);
1075
1076        let uris_json = json.get("uris").unwrap();
1077        assert!(
1078            uris_json.is_array(),
1079            "Multi field should be array even with single value"
1080        );
1081        let uris_arr = uris_json.as_array().unwrap();
1082        assert_eq!(uris_arr.len(), 1);
1083        assert_eq!(uris_arr[0].as_str(), Some("only_one"));
1084
1085        // Verify non-multi field with single value is NOT an array
1086        let title_json = json.get("title").unwrap();
1087        assert!(
1088            title_json.is_string(),
1089            "Non-multi single-value field should be a string"
1090        );
1091        assert_eq!(title_json.as_str(), Some("Test Document"));
1092    }
1093
1094    #[test]
1095    fn test_sparse_vector_field() {
1096        let mut builder = Schema::builder();
1097        let embedding = builder.add_sparse_vector_field("embedding", true, true);
1098        let title = builder.add_text_field("title", true, true);
1099        let schema = builder.build();
1100
1101        assert_eq!(schema.get_field("embedding"), Some(embedding));
1102        assert_eq!(
1103            schema.get_field_entry(embedding).unwrap().field_type,
1104            FieldType::SparseVector
1105        );
1106
1107        // Create document with sparse vector
1108        let mut doc = Document::new();
1109        doc.add_sparse_vector(embedding, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
1110        doc.add_text(title, "Test Document");
1111
1112        // Verify accessor
1113        let entries = doc
1114            .get_first(embedding)
1115            .unwrap()
1116            .as_sparse_vector()
1117            .unwrap();
1118        assert_eq!(entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);
1119
1120        // Verify JSON roundtrip
1121        let json = doc.to_json(&schema);
1122        let embedding_json = json.get("embedding").unwrap();
1123        assert!(embedding_json.is_object());
1124        assert_eq!(
1125            embedding_json
1126                .get("indices")
1127                .unwrap()
1128                .as_array()
1129                .unwrap()
1130                .len(),
1131            3
1132        );
1133
1134        // Parse back from JSON
1135        let doc2 = Document::from_json(&json, &schema).unwrap();
1136        let entries2 = doc2
1137            .get_first(embedding)
1138            .unwrap()
1139            .as_sparse_vector()
1140            .unwrap();
1141        assert_eq!(entries2[0].0, 0);
1142        assert!((entries2[0].1 - 1.0).abs() < 1e-6);
1143        assert_eq!(entries2[1].0, 5);
1144        assert!((entries2[1].1 - 2.5).abs() < 1e-6);
1145        assert_eq!(entries2[2].0, 10);
1146        assert!((entries2[2].1 - 0.5).abs() < 1e-6);
1147    }
1148
1149    #[test]
1150    fn test_json_field() {
1151        let mut builder = Schema::builder();
1152        let metadata = builder.add_json_field("metadata", true);
1153        let title = builder.add_text_field("title", true, true);
1154        let schema = builder.build();
1155
1156        assert_eq!(schema.get_field("metadata"), Some(metadata));
1157        assert_eq!(
1158            schema.get_field_entry(metadata).unwrap().field_type,
1159            FieldType::Json
1160        );
1161        // JSON fields are never indexed
1162        assert!(!schema.get_field_entry(metadata).unwrap().indexed);
1163        assert!(schema.get_field_entry(metadata).unwrap().stored);
1164
1165        // Create document with JSON value (object)
1166        let json_value = serde_json::json!({
1167            "author": "John Doe",
1168            "tags": ["rust", "search"],
1169            "nested": {"key": "value"}
1170        });
1171        let mut doc = Document::new();
1172        doc.add_json(metadata, json_value.clone());
1173        doc.add_text(title, "Test Document");
1174
1175        // Verify accessor
1176        let stored_json = doc.get_first(metadata).unwrap().as_json().unwrap();
1177        assert_eq!(stored_json, &json_value);
1178        assert_eq!(
1179            stored_json.get("author").unwrap().as_str(),
1180            Some("John Doe")
1181        );
1182
1183        // Verify JSON roundtrip via to_json/from_json
1184        let doc_json = doc.to_json(&schema);
1185        let metadata_out = doc_json.get("metadata").unwrap();
1186        assert_eq!(metadata_out, &json_value);
1187
1188        // Parse back from JSON
1189        let doc2 = Document::from_json(&doc_json, &schema).unwrap();
1190        let stored_json2 = doc2.get_first(metadata).unwrap().as_json().unwrap();
1191        assert_eq!(stored_json2, &json_value);
1192    }
1193
1194    #[test]
1195    fn test_json_field_various_types() {
1196        let mut builder = Schema::builder();
1197        let data = builder.add_json_field("data", true);
1198        let _schema = builder.build();
1199
1200        // Test with array
1201        let arr_value = serde_json::json!([1, 2, 3, "four", null]);
1202        let mut doc = Document::new();
1203        doc.add_json(data, arr_value.clone());
1204        assert_eq!(doc.get_first(data).unwrap().as_json().unwrap(), &arr_value);
1205
1206        // Test with string
1207        let str_value = serde_json::json!("just a string");
1208        let mut doc2 = Document::new();
1209        doc2.add_json(data, str_value.clone());
1210        assert_eq!(doc2.get_first(data).unwrap().as_json().unwrap(), &str_value);
1211
1212        // Test with number
1213        let num_value = serde_json::json!(42.5);
1214        let mut doc3 = Document::new();
1215        doc3.add_json(data, num_value.clone());
1216        assert_eq!(doc3.get_first(data).unwrap().as_json().unwrap(), &num_value);
1217
1218        // Test with null
1219        let null_value = serde_json::Value::Null;
1220        let mut doc4 = Document::new();
1221        doc4.add_json(data, null_value.clone());
1222        assert_eq!(
1223            doc4.get_first(data).unwrap().as_json().unwrap(),
1224            &null_value
1225        );
1226
1227        // Test with boolean
1228        let bool_value = serde_json::json!(true);
1229        let mut doc5 = Document::new();
1230        doc5.add_json(data, bool_value.clone());
1231        assert_eq!(
1232            doc5.get_first(data).unwrap().as_json().unwrap(),
1233            &bool_value
1234        );
1235    }
1236}