Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60    /// Whether this field has columnar fast-field storage for O(1) doc→value access.
61    /// Valid for u64, i64, f64, and text fields.
62    #[serde(default)]
63    pub fast: bool,
64}
65
66/// Position tracking mode for text fields
67#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
68#[serde(rename_all = "snake_case")]
69pub enum PositionMode {
70    /// Track only element ordinal for multi-valued fields (which array element)
71    /// Useful for returning which element matched without full phrase query support
72    Ordinal,
73    /// Track only token position within text (for phrase queries)
74    /// Does not track element ordinal - all positions are relative to concatenated text
75    TokenPosition,
76    /// Track both element ordinal and token position (full support)
77    /// Position format: (element_ordinal << 20) | token_position
78    Full,
79}
80
81impl PositionMode {
82    /// Whether this mode tracks element ordinals
83    pub fn tracks_ordinal(&self) -> bool {
84        matches!(self, PositionMode::Ordinal | PositionMode::Full)
85    }
86
87    /// Whether this mode tracks token positions
88    pub fn tracks_token_position(&self) -> bool {
89        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
90    }
91}
92
93/// Vector index algorithm type
94#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
95#[serde(rename_all = "snake_case")]
96pub enum VectorIndexType {
97    /// Flat - brute-force search over raw vectors (accumulating state)
98    Flat,
99    /// RaBitQ - binary quantization, good for small datasets (<100K)
100    #[default]
101    RaBitQ,
102    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
103    IvfRaBitQ,
104    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
105    ScaNN,
106}
107
108/// Storage quantization for dense vector elements
109///
110/// Controls the precision of each vector coordinate in `.vectors` files.
111/// Lower precision reduces storage and memory bandwidth; scoring uses
112/// native-precision SIMD (no dequantization on the hot path).
113#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
114#[serde(rename_all = "snake_case")]
115pub enum DenseVectorQuantization {
116    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
117    #[default]
118    F32,
119    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
120    F16,
121    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
122    UInt8,
123}
124
125impl DenseVectorQuantization {
126    /// Bytes per element for this quantization type
127    pub fn element_size(self) -> usize {
128        match self {
129            Self::F32 => 4,
130            Self::F16 => 2,
131            Self::UInt8 => 1,
132        }
133    }
134
135    /// Wire format tag (stored in .vectors header)
136    pub fn tag(self) -> u8 {
137        match self {
138            Self::F32 => 0,
139            Self::F16 => 1,
140            Self::UInt8 => 2,
141        }
142    }
143
144    /// Decode wire format tag
145    pub fn from_tag(tag: u8) -> Option<Self> {
146        match tag {
147            0 => Some(Self::F32),
148            1 => Some(Self::F16),
149            2 => Some(Self::UInt8),
150            _ => None,
151        }
152    }
153}
154
155/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
156///
157/// Indexes operate in two states:
158/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
159///   is below `build_threshold` or before `build_index` is called.
160/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
161///   Centroids and codebooks are trained from data and stored within the segment.
162#[derive(Debug, Clone, Serialize, Deserialize)]
163pub struct DenseVectorConfig {
164    /// Dimensionality of vectors
165    pub dim: usize,
166    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
167    /// When in accumulating state, search uses brute-force regardless of this setting.
168    #[serde(default)]
169    pub index_type: VectorIndexType,
170    /// Storage quantization for vector elements (f32, f16, uint8)
171    #[serde(default)]
172    pub quantization: DenseVectorQuantization,
173    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
174    /// If None, automatically determined based on dataset size.
175    #[serde(default, skip_serializing_if = "Option::is_none")]
176    pub num_clusters: Option<usize>,
177    /// Number of clusters to probe during search (default: 32)
178    #[serde(default = "default_nprobe")]
179    pub nprobe: usize,
180    /// Minimum number of vectors required before building ANN index.
181    /// Below this threshold, brute-force (Flat) search is used.
182    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
183    #[serde(default, skip_serializing_if = "Option::is_none")]
184    pub build_threshold: Option<usize>,
185    /// Whether stored vectors are pre-normalized to unit L2 norm.
186    /// When true, scoring skips per-vector norm computation (cosine = dot / ||q||),
187    /// reducing compute by ~40%. Common for embedding models (e.g. OpenAI, Cohere).
188    /// Default: true (most embedding models produce L2-normalized vectors).
189    #[serde(default = "default_unit_norm")]
190    pub unit_norm: bool,
191}
192
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters per search.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
196
/// Serde default for `DenseVectorConfig::unit_norm`: vectors are assumed to be
/// pre-normalized unless the serialized config says otherwise.
fn default_unit_norm() -> bool {
    const DEFAULT_UNIT_NORM: bool = true;
    DEFAULT_UNIT_NORM
}
200
201impl DenseVectorConfig {
202    pub fn new(dim: usize) -> Self {
203        Self {
204            dim,
205            index_type: VectorIndexType::RaBitQ,
206            quantization: DenseVectorQuantization::F32,
207            num_clusters: None,
208            nprobe: 32,
209            build_threshold: None,
210            unit_norm: true,
211        }
212    }
213
214    /// Create IVF-RaBitQ configuration
215    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
216        Self {
217            dim,
218            index_type: VectorIndexType::IvfRaBitQ,
219            quantization: DenseVectorQuantization::F32,
220            num_clusters,
221            nprobe,
222            build_threshold: None,
223            unit_norm: true,
224        }
225    }
226
227    /// Create ScaNN configuration
228    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
229        Self {
230            dim,
231            index_type: VectorIndexType::ScaNN,
232            quantization: DenseVectorQuantization::F32,
233            num_clusters,
234            nprobe,
235            build_threshold: None,
236            unit_norm: true,
237        }
238    }
239
240    /// Create Flat (brute-force) configuration - no ANN index
241    pub fn flat(dim: usize) -> Self {
242        Self {
243            dim,
244            index_type: VectorIndexType::Flat,
245            quantization: DenseVectorQuantization::F32,
246            num_clusters: None,
247            nprobe: 0,
248            build_threshold: None,
249            unit_norm: true,
250        }
251    }
252
253    /// Set storage quantization
254    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
255        self.quantization = quantization;
256        self
257    }
258
259    /// Set build threshold for auto-building ANN index
260    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
261        self.build_threshold = Some(threshold);
262        self
263    }
264
265    /// Mark vectors as pre-normalized to unit L2 norm
266    pub fn with_unit_norm(mut self) -> Self {
267        self.unit_norm = true;
268        self
269    }
270
271    /// Set number of IVF clusters
272    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
273        self.num_clusters = Some(num_clusters);
274        self
275    }
276
277    /// Check if this config uses IVF
278    pub fn uses_ivf(&self) -> bool {
279        matches!(
280            self.index_type,
281            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
282        )
283    }
284
285    /// Check if this config uses ScaNN
286    pub fn uses_scann(&self) -> bool {
287        self.index_type == VectorIndexType::ScaNN
288    }
289
290    /// Check if this config is flat (brute-force)
291    pub fn is_flat(&self) -> bool {
292        self.index_type == VectorIndexType::Flat
293    }
294
295    /// Get the default build threshold for this index type
296    pub fn default_build_threshold(&self) -> usize {
297        self.build_threshold.unwrap_or(match self.index_type {
298            VectorIndexType::Flat => usize::MAX, // Never auto-build
299            VectorIndexType::RaBitQ => 1000,
300            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
301        })
302    }
303
304    /// Calculate optimal number of clusters for given vector count
305    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
306        self.num_clusters.unwrap_or_else(|| {
307            // sqrt(n) heuristic, capped at 4096
308            let optimal = (num_vectors as f64).sqrt() as usize;
309            optimal.clamp(16, 4096)
310        })
311    }
312}
313
314use super::query_field_router::QueryRouterRule;
315
316/// Schema defining document structure
317#[derive(Debug, Clone, Default, Serialize, Deserialize)]
318pub struct Schema {
319    fields: Vec<FieldEntry>,
320    name_to_field: HashMap<String, Field>,
321    /// Default fields for query parsing (when no field is specified)
322    #[serde(default)]
323    default_fields: Vec<Field>,
324    /// Query router rules for routing queries to specific fields based on regex patterns
325    #[serde(default)]
326    query_routers: Vec<QueryRouterRule>,
327}
328
329impl Schema {
330    pub fn builder() -> SchemaBuilder {
331        SchemaBuilder::default()
332    }
333
334    pub fn get_field(&self, name: &str) -> Option<Field> {
335        self.name_to_field.get(name).copied()
336    }
337
338    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
339        self.fields.get(field.0 as usize)
340    }
341
342    pub fn get_field_name(&self, field: Field) -> Option<&str> {
343        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
344    }
345
346    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
347        self.fields
348            .iter()
349            .enumerate()
350            .map(|(i, e)| (Field(i as u32), e))
351    }
352
353    pub fn num_fields(&self) -> usize {
354        self.fields.len()
355    }
356
357    /// Get the default fields for query parsing
358    pub fn default_fields(&self) -> &[Field] {
359        &self.default_fields
360    }
361
362    /// Set default fields (used by builder)
363    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
364        self.default_fields = fields;
365    }
366
367    /// Get the query router rules
368    pub fn query_routers(&self) -> &[QueryRouterRule] {
369        &self.query_routers
370    }
371
372    /// Set query router rules
373    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
374        self.query_routers = rules;
375    }
376}
377
378/// Builder for Schema
379#[derive(Debug, Default)]
380pub struct SchemaBuilder {
381    fields: Vec<FieldEntry>,
382    default_fields: Vec<String>,
383    query_routers: Vec<QueryRouterRule>,
384}
385
386impl SchemaBuilder {
387    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
388        self.add_field_with_tokenizer(
389            name,
390            FieldType::Text,
391            indexed,
392            stored,
393            Some("default".to_string()),
394        )
395    }
396
397    pub fn add_text_field_with_tokenizer(
398        &mut self,
399        name: &str,
400        indexed: bool,
401        stored: bool,
402        tokenizer: &str,
403    ) -> Field {
404        self.add_field_with_tokenizer(
405            name,
406            FieldType::Text,
407            indexed,
408            stored,
409            Some(tokenizer.to_string()),
410        )
411    }
412
413    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
414        self.add_field(name, FieldType::U64, indexed, stored)
415    }
416
417    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
418        self.add_field(name, FieldType::I64, indexed, stored)
419    }
420
421    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
422        self.add_field(name, FieldType::F64, indexed, stored)
423    }
424
425    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
426        self.add_field(name, FieldType::Bytes, false, stored)
427    }
428
429    /// Add a JSON field for storing arbitrary JSON data
430    ///
431    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
432    /// (objects, arrays, strings, numbers, booleans, null).
433    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
434        self.add_field(name, FieldType::Json, false, stored)
435    }
436
437    /// Add a sparse vector field with default configuration
438    ///
439    /// Sparse vectors are indexed as inverted posting lists where each dimension
440    /// becomes a "term" and documents have quantized weights for each dimension.
441    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
442        self.add_sparse_vector_field_with_config(
443            name,
444            indexed,
445            stored,
446            crate::structures::SparseVectorConfig::default(),
447        )
448    }
449
450    /// Add a sparse vector field with custom configuration
451    ///
452    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
453    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
454    pub fn add_sparse_vector_field_with_config(
455        &mut self,
456        name: &str,
457        indexed: bool,
458        stored: bool,
459        config: crate::structures::SparseVectorConfig,
460    ) -> Field {
461        let field = Field(self.fields.len() as u32);
462        self.fields.push(FieldEntry {
463            name: name.to_string(),
464            field_type: FieldType::SparseVector,
465            indexed,
466            stored,
467            tokenizer: None,
468            multi: false,
469            positions: None,
470            sparse_vector_config: Some(config),
471            dense_vector_config: None,
472            fast: false,
473        });
474        field
475    }
476
477    /// Set sparse vector configuration for an existing field
478    pub fn set_sparse_vector_config(
479        &mut self,
480        field: Field,
481        config: crate::structures::SparseVectorConfig,
482    ) {
483        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
484            entry.sparse_vector_config = Some(config);
485        }
486    }
487
488    /// Add a dense vector field with default configuration
489    ///
490    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
491    /// The dimension must be specified as it determines the quantization structure.
492    pub fn add_dense_vector_field(
493        &mut self,
494        name: &str,
495        dim: usize,
496        indexed: bool,
497        stored: bool,
498    ) -> Field {
499        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
500    }
501
502    /// Add a dense vector field with custom configuration
503    pub fn add_dense_vector_field_with_config(
504        &mut self,
505        name: &str,
506        indexed: bool,
507        stored: bool,
508        config: DenseVectorConfig,
509    ) -> Field {
510        let field = Field(self.fields.len() as u32);
511        self.fields.push(FieldEntry {
512            name: name.to_string(),
513            field_type: FieldType::DenseVector,
514            indexed,
515            stored,
516            tokenizer: None,
517            multi: false,
518            positions: None,
519            sparse_vector_config: None,
520            dense_vector_config: Some(config),
521            fast: false,
522        });
523        field
524    }
525
526    fn add_field(
527        &mut self,
528        name: &str,
529        field_type: FieldType,
530        indexed: bool,
531        stored: bool,
532    ) -> Field {
533        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
534    }
535
536    fn add_field_with_tokenizer(
537        &mut self,
538        name: &str,
539        field_type: FieldType,
540        indexed: bool,
541        stored: bool,
542        tokenizer: Option<String>,
543    ) -> Field {
544        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
545    }
546
547    fn add_field_full(
548        &mut self,
549        name: &str,
550        field_type: FieldType,
551        indexed: bool,
552        stored: bool,
553        tokenizer: Option<String>,
554        multi: bool,
555    ) -> Field {
556        let field = Field(self.fields.len() as u32);
557        self.fields.push(FieldEntry {
558            name: name.to_string(),
559            field_type,
560            indexed,
561            stored,
562            tokenizer,
563            multi,
564            positions: None,
565            sparse_vector_config: None,
566            dense_vector_config: None,
567            fast: false,
568        });
569        field
570    }
571
572    /// Set the multi attribute on the last added field
573    pub fn set_multi(&mut self, field: Field, multi: bool) {
574        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
575            entry.multi = multi;
576        }
577    }
578
579    /// Set fast-field columnar storage for O(1) doc→value access.
580    /// Valid for u64, i64, f64, and text fields.
581    pub fn set_fast(&mut self, field: Field, fast: bool) {
582        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
583            entry.fast = fast;
584        }
585    }
586
587    /// Set position tracking mode for phrase queries and multi-field element tracking
588    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
589        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
590            entry.positions = Some(mode);
591        }
592    }
593
594    /// Set default fields by name
595    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
596        self.default_fields = field_names;
597    }
598
599    /// Set query router rules
600    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
601        self.query_routers = rules;
602    }
603
604    pub fn build(self) -> Schema {
605        let mut name_to_field = HashMap::new();
606        for (i, entry) in self.fields.iter().enumerate() {
607            name_to_field.insert(entry.name.clone(), Field(i as u32));
608        }
609
610        // Resolve default field names to Field IDs
611        let default_fields: Vec<Field> = self
612            .default_fields
613            .iter()
614            .filter_map(|name| name_to_field.get(name).copied())
615            .collect();
616
617        Schema {
618            fields: self.fields,
619            name_to_field,
620            default_fields,
621            query_routers: self.query_routers,
622        }
623    }
624}
625
626/// Value that can be stored in a field
627#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
628pub enum FieldValue {
629    #[serde(rename = "text")]
630    Text(String),
631    #[serde(rename = "u64")]
632    U64(u64),
633    #[serde(rename = "i64")]
634    I64(i64),
635    #[serde(rename = "f64")]
636    F64(f64),
637    #[serde(rename = "bytes")]
638    Bytes(Vec<u8>),
639    /// Sparse vector: list of (dimension_id, weight) pairs
640    #[serde(rename = "sparse_vector")]
641    SparseVector(Vec<(u32, f32)>),
642    /// Dense vector: float32 values
643    #[serde(rename = "dense_vector")]
644    DenseVector(Vec<f32>),
645    /// Arbitrary JSON value
646    #[serde(rename = "json")]
647    Json(serde_json::Value),
648}
649
650impl FieldValue {
651    pub fn as_text(&self) -> Option<&str> {
652        match self {
653            FieldValue::Text(s) => Some(s),
654            _ => None,
655        }
656    }
657
658    pub fn as_u64(&self) -> Option<u64> {
659        match self {
660            FieldValue::U64(v) => Some(*v),
661            _ => None,
662        }
663    }
664
665    pub fn as_i64(&self) -> Option<i64> {
666        match self {
667            FieldValue::I64(v) => Some(*v),
668            _ => None,
669        }
670    }
671
672    pub fn as_f64(&self) -> Option<f64> {
673        match self {
674            FieldValue::F64(v) => Some(*v),
675            _ => None,
676        }
677    }
678
679    pub fn as_bytes(&self) -> Option<&[u8]> {
680        match self {
681            FieldValue::Bytes(b) => Some(b),
682            _ => None,
683        }
684    }
685
686    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
687        match self {
688            FieldValue::SparseVector(entries) => Some(entries),
689            _ => None,
690        }
691    }
692
693    pub fn as_dense_vector(&self) -> Option<&[f32]> {
694        match self {
695            FieldValue::DenseVector(v) => Some(v),
696            _ => None,
697        }
698    }
699
700    pub fn as_json(&self) -> Option<&serde_json::Value> {
701        match self {
702            FieldValue::Json(v) => Some(v),
703            _ => None,
704        }
705    }
706}
707
708/// A document to be indexed
709#[derive(Debug, Clone, Default, Serialize, Deserialize)]
710pub struct Document {
711    field_values: Vec<(Field, FieldValue)>,
712}
713
714impl Document {
715    pub fn new() -> Self {
716        Self::default()
717    }
718
719    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
720        self.field_values
721            .push((field, FieldValue::Text(value.into())));
722    }
723
724    pub fn add_u64(&mut self, field: Field, value: u64) {
725        self.field_values.push((field, FieldValue::U64(value)));
726    }
727
728    pub fn add_i64(&mut self, field: Field, value: i64) {
729        self.field_values.push((field, FieldValue::I64(value)));
730    }
731
732    pub fn add_f64(&mut self, field: Field, value: f64) {
733        self.field_values.push((field, FieldValue::F64(value)));
734    }
735
736    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
737        self.field_values.push((field, FieldValue::Bytes(value)));
738    }
739
740    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
741        self.field_values
742            .push((field, FieldValue::SparseVector(entries)));
743    }
744
745    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
746        self.field_values
747            .push((field, FieldValue::DenseVector(values)));
748    }
749
750    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
751        self.field_values.push((field, FieldValue::Json(value)));
752    }
753
754    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
755        self.field_values
756            .iter()
757            .find(|(f, _)| *f == field)
758            .map(|(_, v)| v)
759    }
760
761    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
762        self.field_values
763            .iter()
764            .filter(move |(f, _)| *f == field)
765            .map(|(_, v)| v)
766    }
767
768    pub fn field_values(&self) -> &[(Field, FieldValue)] {
769        &self.field_values
770    }
771
772    /// Return a new Document containing only fields marked as `stored` in the schema
773    pub fn filter_stored(&self, schema: &Schema) -> Document {
774        Document {
775            field_values: self
776                .field_values
777                .iter()
778                .filter(|(field, _)| {
779                    schema
780                        .get_field_entry(*field)
781                        .is_some_and(|entry| entry.stored)
782                })
783                .cloned()
784                .collect(),
785        }
786    }
787
788    /// Convert document to a JSON object using field names from schema
789    ///
790    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
791    /// Other fields with multiple values are also returned as arrays.
792    /// Fields with a single value (and not marked multi) are returned as scalar values.
793    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
794        use std::collections::HashMap;
795
796        // Group values by field, keeping track of field entry for multi check
797        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
798            HashMap::new();
799
800        for (field, value) in &self.field_values {
801            if let Some(entry) = schema.get_field_entry(*field) {
802                let json_value = match value {
803                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
804                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
805                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
806                    FieldValue::F64(n) => serde_json::json!(n),
807                    FieldValue::Bytes(b) => {
808                        use base64::Engine;
809                        serde_json::Value::String(
810                            base64::engine::general_purpose::STANDARD.encode(b),
811                        )
812                    }
813                    FieldValue::SparseVector(entries) => {
814                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
815                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
816                        serde_json::json!({
817                            "indices": indices,
818                            "values": values
819                        })
820                    }
821                    FieldValue::DenseVector(values) => {
822                        serde_json::json!(values)
823                    }
824                    FieldValue::Json(v) => v.clone(),
825                };
826                field_values_map
827                    .entry(*field)
828                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
829                    .2
830                    .push(json_value);
831            }
832        }
833
834        // Convert to JSON object, using arrays for multi fields or when multiple values exist
835        let mut map = serde_json::Map::new();
836        for (_field, (name, is_multi, values)) in field_values_map {
837            let json_value = if is_multi || values.len() > 1 {
838                serde_json::Value::Array(values)
839            } else {
840                values.into_iter().next().unwrap()
841            };
842            map.insert(name, json_value);
843        }
844
845        serde_json::Value::Object(map)
846    }
847
848    /// Create a Document from a JSON object using field names from schema
849    ///
850    /// Supports:
851    /// - String values -> Text fields
852    /// - Number values -> U64/I64/F64 fields (based on schema type)
853    /// - Array values -> Multiple values for the same field (multifields)
854    ///
855    /// Unknown fields (not in schema) are silently ignored.
856    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
857        let obj = json.as_object()?;
858        let mut doc = Document::new();
859
860        for (key, value) in obj {
861            if let Some(field) = schema.get_field(key) {
862                let field_entry = schema.get_field_entry(field)?;
863                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
864            }
865        }
866
867        Some(doc)
868    }
869
870    /// Helper to add a JSON value to a document, handling type conversion
871    fn add_json_value(
872        doc: &mut Document,
873        field: Field,
874        field_type: &FieldType,
875        value: &serde_json::Value,
876    ) {
877        match value {
878            serde_json::Value::String(s) => {
879                if matches!(field_type, FieldType::Text) {
880                    doc.add_text(field, s.clone());
881                }
882            }
883            serde_json::Value::Number(n) => {
884                match field_type {
885                    FieldType::I64 => {
886                        if let Some(i) = n.as_i64() {
887                            doc.add_i64(field, i);
888                        }
889                    }
890                    FieldType::U64 => {
891                        if let Some(u) = n.as_u64() {
892                            doc.add_u64(field, u);
893                        } else if let Some(i) = n.as_i64() {
894                            // Allow positive i64 as u64
895                            if i >= 0 {
896                                doc.add_u64(field, i as u64);
897                            }
898                        }
899                    }
900                    FieldType::F64 => {
901                        if let Some(f) = n.as_f64() {
902                            doc.add_f64(field, f);
903                        }
904                    }
905                    _ => {}
906                }
907            }
908            // Handle arrays (multifields) - add each element separately
909            serde_json::Value::Array(arr) => {
910                for item in arr {
911                    Self::add_json_value(doc, field, field_type, item);
912                }
913            }
914            // Handle sparse vector objects
915            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
916                if let (Some(indices_val), Some(values_val)) =
917                    (obj.get("indices"), obj.get("values"))
918                {
919                    let indices: Vec<u32> = indices_val
920                        .as_array()
921                        .map(|arr| {
922                            arr.iter()
923                                .filter_map(|v| v.as_u64().map(|n| n as u32))
924                                .collect()
925                        })
926                        .unwrap_or_default();
927                    let values: Vec<f32> = values_val
928                        .as_array()
929                        .map(|arr| {
930                            arr.iter()
931                                .filter_map(|v| v.as_f64().map(|n| n as f32))
932                                .collect()
933                        })
934                        .unwrap_or_default();
935                    if indices.len() == values.len() {
936                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
937                        doc.add_sparse_vector(field, entries);
938                    }
939                }
940            }
941            // Handle JSON fields - accept any value directly
942            _ if matches!(field_type, FieldType::Json) => {
943                doc.add_json(field, value.clone());
944            }
945            serde_json::Value::Object(_) => {}
946            _ => {}
947        }
948    }
949}
950
#[cfg(test)]
mod tests {
    use super::*;

    /// Fields registered on the builder can be looked up by name afterwards.
    #[test]
    fn test_schema_builder() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let body_field = sb.add_text_field("body", true, false);
        let count_field = sb.add_u64_field("count", true, true);
        let schema = sb.build();

        let lookups = [
            ("title", Some(title_field)),
            ("body", Some(body_field)),
            ("count", Some(count_field)),
            ("nonexistent", None),
        ];
        for (name, expected) in lookups.iter() {
            assert_eq!(schema.get_field(*name), *expected);
        }
    }

    /// Values added to a document are readable through `get_first`.
    #[test]
    fn test_document() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let count_field = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut document = Document::new();
        document.add_text(title_field, "Hello World");
        document.add_u64(count_field, 42);

        let first_title = document.get_first(title_field).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));
        let first_count = document.get_first(count_field).unwrap();
        assert_eq!(first_count.as_u64(), Some(42));
    }

    /// A document survives a serde JSON encode/decode cycle.
    #[test]
    fn test_document_serialization() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let count_field = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut document = Document::new();
        document.add_text(title_field, "Hello World");
        document.add_u64(count_field, 42);

        // Encode
        let encoded = serde_json::to_string(&document).unwrap();
        println!("Serialized doc: {}", encoded);

        // Decode
        let decoded: Document = serde_json::from_str(&encoded).unwrap();
        assert_eq!(
            decoded.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );
        let decoded_title = decoded.get_first(title_field).unwrap();
        assert_eq!(decoded_title.as_text(), Some("Hello World"));
        let decoded_count = decoded.get_first(count_field).unwrap();
        assert_eq!(decoded_count.as_u64(), Some(42));
    }

    /// Several values under one field are kept in order and exported as an array.
    #[test]
    fn test_multivalue_field() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // Two values for `uris`, one for `title`
        let mut document = Document::new();
        document.add_text(uris_field, "one");
        document.add_text(uris_field, "two");
        document.add_text(title_field, "Test Document");

        // `get_first` yields the earliest value
        let first_uri = document.get_first(uris_field).unwrap();
        assert_eq!(first_uri.as_text(), Some("one"));

        // `get_all` yields every value, in insertion order
        let collected: Vec<_> = document.get_all(uris_field).collect();
        assert_eq!(collected.len(), 2);
        for (stored, expected) in collected.iter().zip(["one", "two"].iter()) {
            assert_eq!(stored.as_text(), Some(*expected));
        }

        // `to_json` emits an array for the multi-valued field
        let exported = document.to_json(&schema);
        let uris_node = exported.get("uris").unwrap();
        assert!(uris_node.is_array(), "Multi-value field should be an array");
        let uris_items = uris_node.as_array().unwrap();
        assert_eq!(uris_items.len(), 2);
        for (item, expected) in uris_items.iter().zip(["one", "two"].iter()) {
            assert_eq!(item.as_str(), Some(*expected));
        }

        // The single-valued field stays a plain string
        let title_node = exported.get("title").unwrap();
        assert!(
            title_node.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_node.as_str(), Some("Test Document"));
    }

    /// A JSON array is parsed into multiple values of the same field.
    #[test]
    fn test_multivalue_from_json() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // Input with an array value
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let document = Document::from_json(&input, &schema).unwrap();

        // Every array element is present
        let collected: Vec<_> = document.get_all(uris_field).collect();
        assert_eq!(collected.len(), 2);
        for (stored, expected) in collected.iter().zip(["one", "two"].iter()) {
            assert_eq!(stored.as_text(), Some(*expected));
        }

        // The scalar value is present
        let parsed_title = document.get_first(title_field).unwrap();
        assert_eq!(parsed_title.as_text(), Some("Test Document"));

        // Roundtrip: `to_json` reproduces the array
        let exported = document.to_json(&schema);
        let uris_items = exported.get("uris").unwrap().as_array().unwrap();
        assert_eq!(uris_items.len(), 2);
        for (item, expected) in uris_items.iter().zip(["one", "two"].iter()) {
            assert_eq!(item.as_str(), Some(*expected));
        }
    }

    /// Fields flagged `multi` are exported as arrays even with a single value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut sb = Schema::builder();
        let uris_field = sb.add_text_field("uris", true, true);
        sb.set_multi(uris_field, true); // flag as multi
        let title_field = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // The flag is recorded only on the field it was set for
        assert!(schema.get_field_entry(uris_field).unwrap().multi);
        assert!(!schema.get_field_entry(title_field).unwrap().multi);

        // One value for the multi field
        let mut document = Document::new();
        document.add_text(uris_field, "only_one");
        document.add_text(title_field, "Test Document");

        let exported = document.to_json(&schema);

        // The multi field is still an array
        let uris_node = exported.get("uris").unwrap();
        assert!(
            uris_node.is_array(),
            "Multi field should be array even with single value"
        );
        let uris_items = uris_node.as_array().unwrap();
        assert_eq!(uris_items.len(), 1);
        assert_eq!(uris_items[0].as_str(), Some("only_one"));

        // The non-multi field with one value is a plain string
        let title_node = exported.get("title").unwrap();
        assert!(
            title_node.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_node.as_str(), Some("Test Document"));
    }

    /// Sparse vectors can be stored, exported to JSON and parsed back.
    #[test]
    fn test_sparse_vector_field() {
        let mut sb = Schema::builder();
        let embedding_field = sb.add_sparse_vector_field("embedding", true, true);
        let title_field = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("embedding"), Some(embedding_field));
        assert_eq!(
            schema.get_field_entry(embedding_field).unwrap().field_type,
            FieldType::SparseVector
        );

        // Document carrying a sparse vector
        let mut document = Document::new();
        document.add_sparse_vector(embedding_field, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
        document.add_text(title_field, "Test Document");

        // Accessor returns what was stored
        let stored = document
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(stored, &[(0, 1.0), (5, 2.5), (10, 0.5)]);

        // Export to JSON
        let exported = document.to_json(&schema);
        let embedding_node = exported.get("embedding").unwrap();
        assert!(embedding_node.is_object());
        let exported_indices = embedding_node.get("indices").unwrap().as_array().unwrap();
        assert_eq!(exported_indices.len(), 3);

        // Parse the exported JSON back
        let reparsed_doc = Document::from_json(&exported, &schema).unwrap();
        let reparsed = reparsed_doc
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        let expected = [(0u32, 1.0f32), (5, 2.5), (10, 0.5)];
        for (pos, (index, weight)) in expected.iter().enumerate() {
            assert_eq!(reparsed[pos].0, *index);
            assert!((reparsed[pos].1 - *weight).abs() < 1e-6);
        }
    }

    /// JSON fields store an arbitrary object and roundtrip through to_json/from_json.
    #[test]
    fn test_json_field() {
        let mut sb = Schema::builder();
        let metadata_field = sb.add_json_field("metadata", true);
        let title_field = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("metadata"), Some(metadata_field));
        let metadata_entry = schema.get_field_entry(metadata_field).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        // JSON fields are never indexed
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);

        // Document holding an object value
        let payload = serde_json::json!({
            "author": "John Doe",
            "tags": ["rust", "search"],
            "nested": {"key": "value"}
        });
        let mut document = Document::new();
        document.add_json(metadata_field, payload.clone());
        document.add_text(title_field, "Test Document");

        // Accessor returns the stored value
        let stored = document
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(stored, &payload);
        assert_eq!(stored.get("author").unwrap().as_str(), Some("John Doe"));

        // Export keeps the value intact
        let exported = document.to_json(&schema);
        let exported_metadata = exported.get("metadata").unwrap();
        assert_eq!(exported_metadata, &payload);

        // Parsing the export yields the same value again
        let reparsed_doc = Document::from_json(&exported, &schema).unwrap();
        let reparsed = reparsed_doc
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(reparsed, &payload);
    }

    /// `add_json` stores every kind of JSON value unchanged.
    #[test]
    fn test_json_field_various_types() {
        let mut sb = Schema::builder();
        let data_field = sb.add_json_field("data", true);
        let _schema = sb.build();

        // array, string, number, null, boolean
        let samples = vec![
            serde_json::json!([1, 2, 3, "four", null]),
            serde_json::json!("just a string"),
            serde_json::json!(42.5),
            serde_json::Value::Null,
            serde_json::json!(true),
        ];

        for sample in samples {
            let mut document = Document::new();
            document.add_json(data_field, sample.clone());
            assert_eq!(
                document.get_first(data_field).unwrap().as_json().unwrap(),
                &sample
            );
        }
    }
}