// hermes_core/dsl/schema.rs

//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60}
61
62/// Position tracking mode for text fields
63#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
64#[serde(rename_all = "snake_case")]
65pub enum PositionMode {
66    /// Track only element ordinal for multi-valued fields (which array element)
67    /// Useful for returning which element matched without full phrase query support
68    Ordinal,
69    /// Track only token position within text (for phrase queries)
70    /// Does not track element ordinal - all positions are relative to concatenated text
71    TokenPosition,
72    /// Track both element ordinal and token position (full support)
73    /// Position format: (element_ordinal << 20) | token_position
74    Full,
75}
76
77impl PositionMode {
78    /// Whether this mode tracks element ordinals
79    pub fn tracks_ordinal(&self) -> bool {
80        matches!(self, PositionMode::Ordinal | PositionMode::Full)
81    }
82
83    /// Whether this mode tracks token positions
84    pub fn tracks_token_position(&self) -> bool {
85        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
86    }
87}
88
89/// Vector index algorithm type
90#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
91#[serde(rename_all = "snake_case")]
92pub enum VectorIndexType {
93    /// Flat - brute-force search over raw vectors (accumulating state)
94    Flat,
95    /// RaBitQ - binary quantization, good for small datasets (<100K)
96    #[default]
97    RaBitQ,
98    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
99    IvfRaBitQ,
100    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
101    ScaNN,
102}
103
104/// Storage quantization for dense vector elements
105///
106/// Controls the precision of each vector coordinate in `.vectors` files.
107/// Lower precision reduces storage and memory bandwidth; scoring uses
108/// native-precision SIMD (no dequantization on the hot path).
109#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
110#[serde(rename_all = "snake_case")]
111pub enum DenseVectorQuantization {
112    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
113    #[default]
114    F32,
115    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
116    F16,
117    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
118    UInt8,
119}
120
121impl DenseVectorQuantization {
122    /// Bytes per element for this quantization type
123    pub fn element_size(self) -> usize {
124        match self {
125            Self::F32 => 4,
126            Self::F16 => 2,
127            Self::UInt8 => 1,
128        }
129    }
130
131    /// Wire format tag (stored in .vectors header)
132    pub fn tag(self) -> u8 {
133        match self {
134            Self::F32 => 0,
135            Self::F16 => 1,
136            Self::UInt8 => 2,
137        }
138    }
139
140    /// Decode wire format tag
141    pub fn from_tag(tag: u8) -> Option<Self> {
142        match tag {
143            0 => Some(Self::F32),
144            1 => Some(Self::F16),
145            2 => Some(Self::UInt8),
146            _ => None,
147        }
148    }
149}
150
151/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
152///
153/// Indexes operate in two states:
154/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
155///   is below `build_threshold` or before `build_index` is called.
156/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
157///   Centroids and codebooks are trained from data and stored within the segment.
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct DenseVectorConfig {
160    /// Dimensionality of vectors
161    pub dim: usize,
162    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
163    /// When in accumulating state, search uses brute-force regardless of this setting.
164    #[serde(default)]
165    pub index_type: VectorIndexType,
166    /// Storage quantization for vector elements (f32, f16, uint8)
167    #[serde(default)]
168    pub quantization: DenseVectorQuantization,
169    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
170    /// If None, automatically determined based on dataset size.
171    #[serde(default, skip_serializing_if = "Option::is_none")]
172    pub num_clusters: Option<usize>,
173    /// Number of clusters to probe during search (default: 32)
174    #[serde(default = "default_nprobe")]
175    pub nprobe: usize,
176    /// Minimum number of vectors required before building ANN index.
177    /// Below this threshold, brute-force (Flat) search is used.
178    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
179    #[serde(default, skip_serializing_if = "Option::is_none")]
180    pub build_threshold: Option<usize>,
181    /// Whether stored vectors are pre-normalized to unit L2 norm.
182    /// When true, scoring skips per-vector norm computation (cosine = dot / ||q||),
183    /// reducing compute by ~40%. Common for embedding models (e.g. OpenAI, Cohere).
184    /// Default: true (most embedding models produce L2-normalized vectors).
185    #[serde(default = "default_unit_norm")]
186    pub unit_norm: bool,
187}
188
/// Serde fallback for `DenseVectorConfig::nprobe` when the key is absent.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
192
/// Serde fallback for `DenseVectorConfig::unit_norm` when the key is absent.
fn default_unit_norm() -> bool {
    const DEFAULT_UNIT_NORM: bool = true;
    DEFAULT_UNIT_NORM
}
196
197impl DenseVectorConfig {
198    pub fn new(dim: usize) -> Self {
199        Self {
200            dim,
201            index_type: VectorIndexType::RaBitQ,
202            quantization: DenseVectorQuantization::F32,
203            num_clusters: None,
204            nprobe: 32,
205            build_threshold: None,
206            unit_norm: true,
207        }
208    }
209
210    /// Create IVF-RaBitQ configuration
211    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
212        Self {
213            dim,
214            index_type: VectorIndexType::IvfRaBitQ,
215            quantization: DenseVectorQuantization::F32,
216            num_clusters,
217            nprobe,
218            build_threshold: None,
219            unit_norm: true,
220        }
221    }
222
223    /// Create ScaNN configuration
224    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
225        Self {
226            dim,
227            index_type: VectorIndexType::ScaNN,
228            quantization: DenseVectorQuantization::F32,
229            num_clusters,
230            nprobe,
231            build_threshold: None,
232            unit_norm: true,
233        }
234    }
235
236    /// Create Flat (brute-force) configuration - no ANN index
237    pub fn flat(dim: usize) -> Self {
238        Self {
239            dim,
240            index_type: VectorIndexType::Flat,
241            quantization: DenseVectorQuantization::F32,
242            num_clusters: None,
243            nprobe: 0,
244            build_threshold: None,
245            unit_norm: true,
246        }
247    }
248
249    /// Set storage quantization
250    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
251        self.quantization = quantization;
252        self
253    }
254
255    /// Set build threshold for auto-building ANN index
256    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
257        self.build_threshold = Some(threshold);
258        self
259    }
260
261    /// Mark vectors as pre-normalized to unit L2 norm
262    pub fn with_unit_norm(mut self) -> Self {
263        self.unit_norm = true;
264        self
265    }
266
267    /// Set number of IVF clusters
268    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
269        self.num_clusters = Some(num_clusters);
270        self
271    }
272
273    /// Check if this config uses IVF
274    pub fn uses_ivf(&self) -> bool {
275        matches!(
276            self.index_type,
277            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
278        )
279    }
280
281    /// Check if this config uses ScaNN
282    pub fn uses_scann(&self) -> bool {
283        self.index_type == VectorIndexType::ScaNN
284    }
285
286    /// Check if this config is flat (brute-force)
287    pub fn is_flat(&self) -> bool {
288        self.index_type == VectorIndexType::Flat
289    }
290
291    /// Get the default build threshold for this index type
292    pub fn default_build_threshold(&self) -> usize {
293        self.build_threshold.unwrap_or(match self.index_type {
294            VectorIndexType::Flat => usize::MAX, // Never auto-build
295            VectorIndexType::RaBitQ => 1000,
296            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
297        })
298    }
299
300    /// Calculate optimal number of clusters for given vector count
301    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
302        self.num_clusters.unwrap_or_else(|| {
303            // sqrt(n) heuristic, capped at 4096
304            let optimal = (num_vectors as f64).sqrt() as usize;
305            optimal.clamp(16, 4096)
306        })
307    }
308}
309
310use super::query_field_router::QueryRouterRule;
311
312/// Schema defining document structure
313#[derive(Debug, Clone, Default, Serialize, Deserialize)]
314pub struct Schema {
315    fields: Vec<FieldEntry>,
316    name_to_field: HashMap<String, Field>,
317    /// Default fields for query parsing (when no field is specified)
318    #[serde(default)]
319    default_fields: Vec<Field>,
320    /// Query router rules for routing queries to specific fields based on regex patterns
321    #[serde(default)]
322    query_routers: Vec<QueryRouterRule>,
323}
324
325impl Schema {
326    pub fn builder() -> SchemaBuilder {
327        SchemaBuilder::default()
328    }
329
330    pub fn get_field(&self, name: &str) -> Option<Field> {
331        self.name_to_field.get(name).copied()
332    }
333
334    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
335        self.fields.get(field.0 as usize)
336    }
337
338    pub fn get_field_name(&self, field: Field) -> Option<&str> {
339        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
340    }
341
342    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
343        self.fields
344            .iter()
345            .enumerate()
346            .map(|(i, e)| (Field(i as u32), e))
347    }
348
349    pub fn num_fields(&self) -> usize {
350        self.fields.len()
351    }
352
353    /// Get the default fields for query parsing
354    pub fn default_fields(&self) -> &[Field] {
355        &self.default_fields
356    }
357
358    /// Set default fields (used by builder)
359    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
360        self.default_fields = fields;
361    }
362
363    /// Get the query router rules
364    pub fn query_routers(&self) -> &[QueryRouterRule] {
365        &self.query_routers
366    }
367
368    /// Set query router rules
369    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
370        self.query_routers = rules;
371    }
372}
373
374/// Builder for Schema
375#[derive(Debug, Default)]
376pub struct SchemaBuilder {
377    fields: Vec<FieldEntry>,
378    default_fields: Vec<String>,
379    query_routers: Vec<QueryRouterRule>,
380}
381
382impl SchemaBuilder {
383    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
384        self.add_field_with_tokenizer(
385            name,
386            FieldType::Text,
387            indexed,
388            stored,
389            Some("default".to_string()),
390        )
391    }
392
393    pub fn add_text_field_with_tokenizer(
394        &mut self,
395        name: &str,
396        indexed: bool,
397        stored: bool,
398        tokenizer: &str,
399    ) -> Field {
400        self.add_field_with_tokenizer(
401            name,
402            FieldType::Text,
403            indexed,
404            stored,
405            Some(tokenizer.to_string()),
406        )
407    }
408
409    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
410        self.add_field(name, FieldType::U64, indexed, stored)
411    }
412
413    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
414        self.add_field(name, FieldType::I64, indexed, stored)
415    }
416
417    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
418        self.add_field(name, FieldType::F64, indexed, stored)
419    }
420
421    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
422        self.add_field(name, FieldType::Bytes, false, stored)
423    }
424
425    /// Add a JSON field for storing arbitrary JSON data
426    ///
427    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
428    /// (objects, arrays, strings, numbers, booleans, null).
429    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
430        self.add_field(name, FieldType::Json, false, stored)
431    }
432
433    /// Add a sparse vector field with default configuration
434    ///
435    /// Sparse vectors are indexed as inverted posting lists where each dimension
436    /// becomes a "term" and documents have quantized weights for each dimension.
437    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
438        self.add_sparse_vector_field_with_config(
439            name,
440            indexed,
441            stored,
442            crate::structures::SparseVectorConfig::default(),
443        )
444    }
445
446    /// Add a sparse vector field with custom configuration
447    ///
448    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
449    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
450    pub fn add_sparse_vector_field_with_config(
451        &mut self,
452        name: &str,
453        indexed: bool,
454        stored: bool,
455        config: crate::structures::SparseVectorConfig,
456    ) -> Field {
457        let field = Field(self.fields.len() as u32);
458        self.fields.push(FieldEntry {
459            name: name.to_string(),
460            field_type: FieldType::SparseVector,
461            indexed,
462            stored,
463            tokenizer: None,
464            multi: false,
465            positions: None,
466            sparse_vector_config: Some(config),
467            dense_vector_config: None,
468        });
469        field
470    }
471
472    /// Set sparse vector configuration for an existing field
473    pub fn set_sparse_vector_config(
474        &mut self,
475        field: Field,
476        config: crate::structures::SparseVectorConfig,
477    ) {
478        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
479            entry.sparse_vector_config = Some(config);
480        }
481    }
482
483    /// Add a dense vector field with default configuration
484    ///
485    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
486    /// The dimension must be specified as it determines the quantization structure.
487    pub fn add_dense_vector_field(
488        &mut self,
489        name: &str,
490        dim: usize,
491        indexed: bool,
492        stored: bool,
493    ) -> Field {
494        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
495    }
496
497    /// Add a dense vector field with custom configuration
498    pub fn add_dense_vector_field_with_config(
499        &mut self,
500        name: &str,
501        indexed: bool,
502        stored: bool,
503        config: DenseVectorConfig,
504    ) -> Field {
505        let field = Field(self.fields.len() as u32);
506        self.fields.push(FieldEntry {
507            name: name.to_string(),
508            field_type: FieldType::DenseVector,
509            indexed,
510            stored,
511            tokenizer: None,
512            multi: false,
513            positions: None,
514            sparse_vector_config: None,
515            dense_vector_config: Some(config),
516        });
517        field
518    }
519
520    fn add_field(
521        &mut self,
522        name: &str,
523        field_type: FieldType,
524        indexed: bool,
525        stored: bool,
526    ) -> Field {
527        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
528    }
529
530    fn add_field_with_tokenizer(
531        &mut self,
532        name: &str,
533        field_type: FieldType,
534        indexed: bool,
535        stored: bool,
536        tokenizer: Option<String>,
537    ) -> Field {
538        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
539    }
540
541    fn add_field_full(
542        &mut self,
543        name: &str,
544        field_type: FieldType,
545        indexed: bool,
546        stored: bool,
547        tokenizer: Option<String>,
548        multi: bool,
549    ) -> Field {
550        let field = Field(self.fields.len() as u32);
551        self.fields.push(FieldEntry {
552            name: name.to_string(),
553            field_type,
554            indexed,
555            stored,
556            tokenizer,
557            multi,
558            positions: None,
559            sparse_vector_config: None,
560            dense_vector_config: None,
561        });
562        field
563    }
564
565    /// Set the multi attribute on the last added field
566    pub fn set_multi(&mut self, field: Field, multi: bool) {
567        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
568            entry.multi = multi;
569        }
570    }
571
572    /// Set position tracking mode for phrase queries and multi-field element tracking
573    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
574        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
575            entry.positions = Some(mode);
576        }
577    }
578
579    /// Set default fields by name
580    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
581        self.default_fields = field_names;
582    }
583
584    /// Set query router rules
585    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
586        self.query_routers = rules;
587    }
588
589    pub fn build(self) -> Schema {
590        let mut name_to_field = HashMap::new();
591        for (i, entry) in self.fields.iter().enumerate() {
592            name_to_field.insert(entry.name.clone(), Field(i as u32));
593        }
594
595        // Resolve default field names to Field IDs
596        let default_fields: Vec<Field> = self
597            .default_fields
598            .iter()
599            .filter_map(|name| name_to_field.get(name).copied())
600            .collect();
601
602        Schema {
603            fields: self.fields,
604            name_to_field,
605            default_fields,
606            query_routers: self.query_routers,
607        }
608    }
609}
610
611/// Value that can be stored in a field
612#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
613pub enum FieldValue {
614    #[serde(rename = "text")]
615    Text(String),
616    #[serde(rename = "u64")]
617    U64(u64),
618    #[serde(rename = "i64")]
619    I64(i64),
620    #[serde(rename = "f64")]
621    F64(f64),
622    #[serde(rename = "bytes")]
623    Bytes(Vec<u8>),
624    /// Sparse vector: list of (dimension_id, weight) pairs
625    #[serde(rename = "sparse_vector")]
626    SparseVector(Vec<(u32, f32)>),
627    /// Dense vector: float32 values
628    #[serde(rename = "dense_vector")]
629    DenseVector(Vec<f32>),
630    /// Arbitrary JSON value
631    #[serde(rename = "json")]
632    Json(serde_json::Value),
633}
634
635impl FieldValue {
636    pub fn as_text(&self) -> Option<&str> {
637        match self {
638            FieldValue::Text(s) => Some(s),
639            _ => None,
640        }
641    }
642
643    pub fn as_u64(&self) -> Option<u64> {
644        match self {
645            FieldValue::U64(v) => Some(*v),
646            _ => None,
647        }
648    }
649
650    pub fn as_i64(&self) -> Option<i64> {
651        match self {
652            FieldValue::I64(v) => Some(*v),
653            _ => None,
654        }
655    }
656
657    pub fn as_f64(&self) -> Option<f64> {
658        match self {
659            FieldValue::F64(v) => Some(*v),
660            _ => None,
661        }
662    }
663
664    pub fn as_bytes(&self) -> Option<&[u8]> {
665        match self {
666            FieldValue::Bytes(b) => Some(b),
667            _ => None,
668        }
669    }
670
671    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
672        match self {
673            FieldValue::SparseVector(entries) => Some(entries),
674            _ => None,
675        }
676    }
677
678    pub fn as_dense_vector(&self) -> Option<&[f32]> {
679        match self {
680            FieldValue::DenseVector(v) => Some(v),
681            _ => None,
682        }
683    }
684
685    pub fn as_json(&self) -> Option<&serde_json::Value> {
686        match self {
687            FieldValue::Json(v) => Some(v),
688            _ => None,
689        }
690    }
691}
692
693/// A document to be indexed
694#[derive(Debug, Clone, Default, Serialize, Deserialize)]
695pub struct Document {
696    field_values: Vec<(Field, FieldValue)>,
697}
698
699impl Document {
700    pub fn new() -> Self {
701        Self::default()
702    }
703
704    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
705        self.field_values
706            .push((field, FieldValue::Text(value.into())));
707    }
708
709    pub fn add_u64(&mut self, field: Field, value: u64) {
710        self.field_values.push((field, FieldValue::U64(value)));
711    }
712
713    pub fn add_i64(&mut self, field: Field, value: i64) {
714        self.field_values.push((field, FieldValue::I64(value)));
715    }
716
717    pub fn add_f64(&mut self, field: Field, value: f64) {
718        self.field_values.push((field, FieldValue::F64(value)));
719    }
720
721    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
722        self.field_values.push((field, FieldValue::Bytes(value)));
723    }
724
725    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
726        self.field_values
727            .push((field, FieldValue::SparseVector(entries)));
728    }
729
730    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
731        self.field_values
732            .push((field, FieldValue::DenseVector(values)));
733    }
734
735    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
736        self.field_values.push((field, FieldValue::Json(value)));
737    }
738
739    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
740        self.field_values
741            .iter()
742            .find(|(f, _)| *f == field)
743            .map(|(_, v)| v)
744    }
745
746    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
747        self.field_values
748            .iter()
749            .filter(move |(f, _)| *f == field)
750            .map(|(_, v)| v)
751    }
752
753    pub fn field_values(&self) -> &[(Field, FieldValue)] {
754        &self.field_values
755    }
756
757    /// Return a new Document containing only fields marked as `stored` in the schema
758    pub fn filter_stored(&self, schema: &Schema) -> Document {
759        Document {
760            field_values: self
761                .field_values
762                .iter()
763                .filter(|(field, _)| {
764                    schema
765                        .get_field_entry(*field)
766                        .is_some_and(|entry| entry.stored)
767                })
768                .cloned()
769                .collect(),
770        }
771    }
772
773    /// Convert document to a JSON object using field names from schema
774    ///
775    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
776    /// Other fields with multiple values are also returned as arrays.
777    /// Fields with a single value (and not marked multi) are returned as scalar values.
778    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
779        use std::collections::HashMap;
780
781        // Group values by field, keeping track of field entry for multi check
782        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
783            HashMap::new();
784
785        for (field, value) in &self.field_values {
786            if let Some(entry) = schema.get_field_entry(*field) {
787                let json_value = match value {
788                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
789                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
790                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
791                    FieldValue::F64(n) => serde_json::json!(n),
792                    FieldValue::Bytes(b) => {
793                        use base64::Engine;
794                        serde_json::Value::String(
795                            base64::engine::general_purpose::STANDARD.encode(b),
796                        )
797                    }
798                    FieldValue::SparseVector(entries) => {
799                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
800                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
801                        serde_json::json!({
802                            "indices": indices,
803                            "values": values
804                        })
805                    }
806                    FieldValue::DenseVector(values) => {
807                        serde_json::json!(values)
808                    }
809                    FieldValue::Json(v) => v.clone(),
810                };
811                field_values_map
812                    .entry(*field)
813                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
814                    .2
815                    .push(json_value);
816            }
817        }
818
819        // Convert to JSON object, using arrays for multi fields or when multiple values exist
820        let mut map = serde_json::Map::new();
821        for (_field, (name, is_multi, values)) in field_values_map {
822            let json_value = if is_multi || values.len() > 1 {
823                serde_json::Value::Array(values)
824            } else {
825                values.into_iter().next().unwrap()
826            };
827            map.insert(name, json_value);
828        }
829
830        serde_json::Value::Object(map)
831    }
832
833    /// Create a Document from a JSON object using field names from schema
834    ///
835    /// Supports:
836    /// - String values -> Text fields
837    /// - Number values -> U64/I64/F64 fields (based on schema type)
838    /// - Array values -> Multiple values for the same field (multifields)
839    ///
840    /// Unknown fields (not in schema) are silently ignored.
841    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
842        let obj = json.as_object()?;
843        let mut doc = Document::new();
844
845        for (key, value) in obj {
846            if let Some(field) = schema.get_field(key) {
847                let field_entry = schema.get_field_entry(field)?;
848                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
849            }
850        }
851
852        Some(doc)
853    }
854
855    /// Helper to add a JSON value to a document, handling type conversion
856    fn add_json_value(
857        doc: &mut Document,
858        field: Field,
859        field_type: &FieldType,
860        value: &serde_json::Value,
861    ) {
862        match value {
863            serde_json::Value::String(s) => {
864                if matches!(field_type, FieldType::Text) {
865                    doc.add_text(field, s.clone());
866                }
867            }
868            serde_json::Value::Number(n) => {
869                match field_type {
870                    FieldType::I64 => {
871                        if let Some(i) = n.as_i64() {
872                            doc.add_i64(field, i);
873                        }
874                    }
875                    FieldType::U64 => {
876                        if let Some(u) = n.as_u64() {
877                            doc.add_u64(field, u);
878                        } else if let Some(i) = n.as_i64() {
879                            // Allow positive i64 as u64
880                            if i >= 0 {
881                                doc.add_u64(field, i as u64);
882                            }
883                        }
884                    }
885                    FieldType::F64 => {
886                        if let Some(f) = n.as_f64() {
887                            doc.add_f64(field, f);
888                        }
889                    }
890                    _ => {}
891                }
892            }
893            // Handle arrays (multifields) - add each element separately
894            serde_json::Value::Array(arr) => {
895                for item in arr {
896                    Self::add_json_value(doc, field, field_type, item);
897                }
898            }
899            // Handle sparse vector objects
900            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
901                if let (Some(indices_val), Some(values_val)) =
902                    (obj.get("indices"), obj.get("values"))
903                {
904                    let indices: Vec<u32> = indices_val
905                        .as_array()
906                        .map(|arr| {
907                            arr.iter()
908                                .filter_map(|v| v.as_u64().map(|n| n as u32))
909                                .collect()
910                        })
911                        .unwrap_or_default();
912                    let values: Vec<f32> = values_val
913                        .as_array()
914                        .map(|arr| {
915                            arr.iter()
916                                .filter_map(|v| v.as_f64().map(|n| n as f32))
917                                .collect()
918                        })
919                        .unwrap_or_default();
920                    if indices.len() == values.len() {
921                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
922                        doc.add_sparse_vector(field, entries);
923                    }
924                }
925            }
926            // Handle JSON fields - accept any value directly
927            _ if matches!(field_type, FieldType::Json) => {
928                doc.add_json(field, value.clone());
929            }
930            serde_json::Value::Object(_) => {}
931            _ => {}
932        }
933    }
934}
935
936#[cfg(test)]
937mod tests {
938    use super::*;
939
940    #[test]
941    fn test_schema_builder() {
942        let mut builder = Schema::builder();
943        let title = builder.add_text_field("title", true, true);
944        let body = builder.add_text_field("body", true, false);
945        let count = builder.add_u64_field("count", true, true);
946        let schema = builder.build();
947
948        assert_eq!(schema.get_field("title"), Some(title));
949        assert_eq!(schema.get_field("body"), Some(body));
950        assert_eq!(schema.get_field("count"), Some(count));
951        assert_eq!(schema.get_field("nonexistent"), None);
952    }
953
954    #[test]
955    fn test_document() {
956        let mut builder = Schema::builder();
957        let title = builder.add_text_field("title", true, true);
958        let count = builder.add_u64_field("count", true, true);
959        let _schema = builder.build();
960
961        let mut doc = Document::new();
962        doc.add_text(title, "Hello World");
963        doc.add_u64(count, 42);
964
965        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
966        assert_eq!(doc.get_first(count).unwrap().as_u64(), Some(42));
967    }
968
969    #[test]
970    fn test_document_serialization() {
971        let mut builder = Schema::builder();
972        let title = builder.add_text_field("title", true, true);
973        let count = builder.add_u64_field("count", true, true);
974        let _schema = builder.build();
975
976        let mut doc = Document::new();
977        doc.add_text(title, "Hello World");
978        doc.add_u64(count, 42);
979
980        // Serialize
981        let json = serde_json::to_string(&doc).unwrap();
982        println!("Serialized doc: {}", json);
983
984        // Deserialize
985        let doc2: Document = serde_json::from_str(&json).unwrap();
986        assert_eq!(
987            doc2.field_values().len(),
988            2,
989            "Should have 2 field values after deserialization"
990        );
991        assert_eq!(
992            doc2.get_first(title).unwrap().as_text(),
993            Some("Hello World")
994        );
995        assert_eq!(doc2.get_first(count).unwrap().as_u64(), Some(42));
996    }
997
998    #[test]
999    fn test_multivalue_field() {
1000        let mut builder = Schema::builder();
1001        let uris = builder.add_text_field("uris", true, true);
1002        let title = builder.add_text_field("title", true, true);
1003        let schema = builder.build();
1004
1005        // Create document with multiple values for the same field
1006        let mut doc = Document::new();
1007        doc.add_text(uris, "one");
1008        doc.add_text(uris, "two");
1009        doc.add_text(title, "Test Document");
1010
1011        // Verify get_first returns the first value
1012        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));
1013
1014        // Verify get_all returns all values
1015        let all_uris: Vec<_> = doc.get_all(uris).collect();
1016        assert_eq!(all_uris.len(), 2);
1017        assert_eq!(all_uris[0].as_text(), Some("one"));
1018        assert_eq!(all_uris[1].as_text(), Some("two"));
1019
1020        // Verify to_json returns array for multi-value field
1021        let json = doc.to_json(&schema);
1022        let uris_json = json.get("uris").unwrap();
1023        assert!(uris_json.is_array(), "Multi-value field should be an array");
1024        let uris_arr = uris_json.as_array().unwrap();
1025        assert_eq!(uris_arr.len(), 2);
1026        assert_eq!(uris_arr[0].as_str(), Some("one"));
1027        assert_eq!(uris_arr[1].as_str(), Some("two"));
1028
1029        // Verify single-value field is NOT an array
1030        let title_json = json.get("title").unwrap();
1031        assert!(
1032            title_json.is_string(),
1033            "Single-value field should be a string"
1034        );
1035        assert_eq!(title_json.as_str(), Some("Test Document"));
1036    }
1037
1038    #[test]
1039    fn test_multivalue_from_json() {
1040        let mut builder = Schema::builder();
1041        let uris = builder.add_text_field("uris", true, true);
1042        let title = builder.add_text_field("title", true, true);
1043        let schema = builder.build();
1044
1045        // Create JSON with array value
1046        let json = serde_json::json!({
1047            "uris": ["one", "two"],
1048            "title": "Test Document"
1049        });
1050
1051        // Parse from JSON
1052        let doc = Document::from_json(&json, &schema).unwrap();
1053
1054        // Verify all values are present
1055        let all_uris: Vec<_> = doc.get_all(uris).collect();
1056        assert_eq!(all_uris.len(), 2);
1057        assert_eq!(all_uris[0].as_text(), Some("one"));
1058        assert_eq!(all_uris[1].as_text(), Some("two"));
1059
1060        // Verify single value
1061        assert_eq!(
1062            doc.get_first(title).unwrap().as_text(),
1063            Some("Test Document")
1064        );
1065
1066        // Verify roundtrip: to_json should produce equivalent JSON
1067        let json_out = doc.to_json(&schema);
1068        let uris_out = json_out.get("uris").unwrap().as_array().unwrap();
1069        assert_eq!(uris_out.len(), 2);
1070        assert_eq!(uris_out[0].as_str(), Some("one"));
1071        assert_eq!(uris_out[1].as_str(), Some("two"));
1072    }
1073
1074    #[test]
1075    fn test_multi_attribute_forces_array() {
1076        // Test that fields marked as 'multi' are always serialized as arrays,
1077        // even when they have only one value
1078        let mut builder = Schema::builder();
1079        let uris = builder.add_text_field("uris", true, true);
1080        builder.set_multi(uris, true); // Mark as multi
1081        let title = builder.add_text_field("title", true, true);
1082        let schema = builder.build();
1083
1084        // Verify the multi attribute is set
1085        assert!(schema.get_field_entry(uris).unwrap().multi);
1086        assert!(!schema.get_field_entry(title).unwrap().multi);
1087
1088        // Create document with single value for multi field
1089        let mut doc = Document::new();
1090        doc.add_text(uris, "only_one");
1091        doc.add_text(title, "Test Document");
1092
1093        // Verify to_json returns array for multi field even with single value
1094        let json = doc.to_json(&schema);
1095
1096        let uris_json = json.get("uris").unwrap();
1097        assert!(
1098            uris_json.is_array(),
1099            "Multi field should be array even with single value"
1100        );
1101        let uris_arr = uris_json.as_array().unwrap();
1102        assert_eq!(uris_arr.len(), 1);
1103        assert_eq!(uris_arr[0].as_str(), Some("only_one"));
1104
1105        // Verify non-multi field with single value is NOT an array
1106        let title_json = json.get("title").unwrap();
1107        assert!(
1108            title_json.is_string(),
1109            "Non-multi single-value field should be a string"
1110        );
1111        assert_eq!(title_json.as_str(), Some("Test Document"));
1112    }
1113
1114    #[test]
1115    fn test_sparse_vector_field() {
1116        let mut builder = Schema::builder();
1117        let embedding = builder.add_sparse_vector_field("embedding", true, true);
1118        let title = builder.add_text_field("title", true, true);
1119        let schema = builder.build();
1120
1121        assert_eq!(schema.get_field("embedding"), Some(embedding));
1122        assert_eq!(
1123            schema.get_field_entry(embedding).unwrap().field_type,
1124            FieldType::SparseVector
1125        );
1126
1127        // Create document with sparse vector
1128        let mut doc = Document::new();
1129        doc.add_sparse_vector(embedding, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
1130        doc.add_text(title, "Test Document");
1131
1132        // Verify accessor
1133        let entries = doc
1134            .get_first(embedding)
1135            .unwrap()
1136            .as_sparse_vector()
1137            .unwrap();
1138        assert_eq!(entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);
1139
1140        // Verify JSON roundtrip
1141        let json = doc.to_json(&schema);
1142        let embedding_json = json.get("embedding").unwrap();
1143        assert!(embedding_json.is_object());
1144        assert_eq!(
1145            embedding_json
1146                .get("indices")
1147                .unwrap()
1148                .as_array()
1149                .unwrap()
1150                .len(),
1151            3
1152        );
1153
1154        // Parse back from JSON
1155        let doc2 = Document::from_json(&json, &schema).unwrap();
1156        let entries2 = doc2
1157            .get_first(embedding)
1158            .unwrap()
1159            .as_sparse_vector()
1160            .unwrap();
1161        assert_eq!(entries2[0].0, 0);
1162        assert!((entries2[0].1 - 1.0).abs() < 1e-6);
1163        assert_eq!(entries2[1].0, 5);
1164        assert!((entries2[1].1 - 2.5).abs() < 1e-6);
1165        assert_eq!(entries2[2].0, 10);
1166        assert!((entries2[2].1 - 0.5).abs() < 1e-6);
1167    }
1168
1169    #[test]
1170    fn test_json_field() {
1171        let mut builder = Schema::builder();
1172        let metadata = builder.add_json_field("metadata", true);
1173        let title = builder.add_text_field("title", true, true);
1174        let schema = builder.build();
1175
1176        assert_eq!(schema.get_field("metadata"), Some(metadata));
1177        assert_eq!(
1178            schema.get_field_entry(metadata).unwrap().field_type,
1179            FieldType::Json
1180        );
1181        // JSON fields are never indexed
1182        assert!(!schema.get_field_entry(metadata).unwrap().indexed);
1183        assert!(schema.get_field_entry(metadata).unwrap().stored);
1184
1185        // Create document with JSON value (object)
1186        let json_value = serde_json::json!({
1187            "author": "John Doe",
1188            "tags": ["rust", "search"],
1189            "nested": {"key": "value"}
1190        });
1191        let mut doc = Document::new();
1192        doc.add_json(metadata, json_value.clone());
1193        doc.add_text(title, "Test Document");
1194
1195        // Verify accessor
1196        let stored_json = doc.get_first(metadata).unwrap().as_json().unwrap();
1197        assert_eq!(stored_json, &json_value);
1198        assert_eq!(
1199            stored_json.get("author").unwrap().as_str(),
1200            Some("John Doe")
1201        );
1202
1203        // Verify JSON roundtrip via to_json/from_json
1204        let doc_json = doc.to_json(&schema);
1205        let metadata_out = doc_json.get("metadata").unwrap();
1206        assert_eq!(metadata_out, &json_value);
1207
1208        // Parse back from JSON
1209        let doc2 = Document::from_json(&doc_json, &schema).unwrap();
1210        let stored_json2 = doc2.get_first(metadata).unwrap().as_json().unwrap();
1211        assert_eq!(stored_json2, &json_value);
1212    }
1213
1214    #[test]
1215    fn test_json_field_various_types() {
1216        let mut builder = Schema::builder();
1217        let data = builder.add_json_field("data", true);
1218        let _schema = builder.build();
1219
1220        // Test with array
1221        let arr_value = serde_json::json!([1, 2, 3, "four", null]);
1222        let mut doc = Document::new();
1223        doc.add_json(data, arr_value.clone());
1224        assert_eq!(doc.get_first(data).unwrap().as_json().unwrap(), &arr_value);
1225
1226        // Test with string
1227        let str_value = serde_json::json!("just a string");
1228        let mut doc2 = Document::new();
1229        doc2.add_json(data, str_value.clone());
1230        assert_eq!(doc2.get_first(data).unwrap().as_json().unwrap(), &str_value);
1231
1232        // Test with number
1233        let num_value = serde_json::json!(42.5);
1234        let mut doc3 = Document::new();
1235        doc3.add_json(data, num_value.clone());
1236        assert_eq!(doc3.get_first(data).unwrap().as_json().unwrap(), &num_value);
1237
1238        // Test with null
1239        let null_value = serde_json::Value::Null;
1240        let mut doc4 = Document::new();
1241        doc4.add_json(data, null_value.clone());
1242        assert_eq!(
1243            doc4.get_first(data).unwrap().as_json().unwrap(),
1244            &null_value
1245        );
1246
1247        // Test with boolean
1248        let bool_value = serde_json::json!(true);
1249        let mut doc5 = Document::new();
1250        doc5.add_json(data, bool_value.clone());
1251        assert_eq!(
1252            doc5.get_first(data).unwrap().as_json().unwrap(),
1253            &bool_value
1254        );
1255    }
1256}