Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60    /// Whether this field has columnar fast-field storage for O(1) doc→value access.
61    /// Valid for u64, i64, f64, and text fields.
62    #[serde(default)]
63    pub fast: bool,
64    /// Whether this field is a primary key (unique constraint, at most one per schema)
65    #[serde(default)]
66    pub primary_key: bool,
67}
68
69/// Position tracking mode for text fields
70#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
71#[serde(rename_all = "snake_case")]
72pub enum PositionMode {
73    /// Track only element ordinal for multi-valued fields (which array element)
74    /// Useful for returning which element matched without full phrase query support
75    Ordinal,
76    /// Track only token position within text (for phrase queries)
77    /// Does not track element ordinal - all positions are relative to concatenated text
78    TokenPosition,
79    /// Track both element ordinal and token position (full support)
80    /// Position format: (element_ordinal << 20) | token_position
81    Full,
82}
83
84impl PositionMode {
85    /// Whether this mode tracks element ordinals
86    pub fn tracks_ordinal(&self) -> bool {
87        matches!(self, PositionMode::Ordinal | PositionMode::Full)
88    }
89
90    /// Whether this mode tracks token positions
91    pub fn tracks_token_position(&self) -> bool {
92        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
93    }
94}
95
96/// Vector index algorithm type
97#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
98#[serde(rename_all = "snake_case")]
99pub enum VectorIndexType {
100    /// Flat - brute-force search over raw vectors (accumulating state)
101    Flat,
102    /// RaBitQ - binary quantization, good for small datasets (<100K)
103    #[default]
104    RaBitQ,
105    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
106    IvfRaBitQ,
107    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
108    ScaNN,
109}
110
111/// Storage quantization for dense vector elements
112///
113/// Controls the precision of each vector coordinate in `.vectors` files.
114/// Lower precision reduces storage and memory bandwidth; scoring uses
115/// native-precision SIMD (no dequantization on the hot path).
116#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
117#[serde(rename_all = "snake_case")]
118pub enum DenseVectorQuantization {
119    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
120    #[default]
121    F32,
122    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
123    F16,
124    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
125    UInt8,
126}
127
128impl DenseVectorQuantization {
129    /// Bytes per element for this quantization type
130    pub fn element_size(self) -> usize {
131        match self {
132            Self::F32 => 4,
133            Self::F16 => 2,
134            Self::UInt8 => 1,
135        }
136    }
137
138    /// Wire format tag (stored in .vectors header)
139    pub fn tag(self) -> u8 {
140        match self {
141            Self::F32 => 0,
142            Self::F16 => 1,
143            Self::UInt8 => 2,
144        }
145    }
146
147    /// Decode wire format tag
148    pub fn from_tag(tag: u8) -> Option<Self> {
149        match tag {
150            0 => Some(Self::F32),
151            1 => Some(Self::F16),
152            2 => Some(Self::UInt8),
153            _ => None,
154        }
155    }
156}
157
158/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
159///
160/// Indexes operate in two states:
161/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
162///   is below `build_threshold` or before `build_index` is called.
163/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
164///   Centroids and codebooks are trained from data and stored within the segment.
165#[derive(Debug, Clone, Serialize, Deserialize)]
166pub struct DenseVectorConfig {
167    /// Dimensionality of vectors
168    pub dim: usize,
169    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
170    /// When in accumulating state, search uses brute-force regardless of this setting.
171    #[serde(default)]
172    pub index_type: VectorIndexType,
173    /// Storage quantization for vector elements (f32, f16, uint8)
174    #[serde(default)]
175    pub quantization: DenseVectorQuantization,
176    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
177    /// If None, automatically determined based on dataset size.
178    #[serde(default, skip_serializing_if = "Option::is_none")]
179    pub num_clusters: Option<usize>,
180    /// Number of clusters to probe during search (default: 32)
181    #[serde(default = "default_nprobe")]
182    pub nprobe: usize,
183    /// Minimum number of vectors required before building ANN index.
184    /// Below this threshold, brute-force (Flat) search is used.
185    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
186    #[serde(default, skip_serializing_if = "Option::is_none")]
187    pub build_threshold: Option<usize>,
188    /// Whether stored vectors are pre-normalized to unit L2 norm.
189    /// When true, scoring skips per-vector norm computation (cosine = dot / ||q||),
190    /// reducing compute by ~40%. Common for embedding models (e.g. OpenAI, Cohere).
191    /// Default: true (most embedding models produce L2-normalized vectors).
192    #[serde(default = "default_unit_norm")]
193    pub unit_norm: bool,
194}
195
/// Fallback `nprobe` applied by serde when a serialized config omits the field.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
199
/// Fallback `unit_norm` applied by serde when a serialized config omits the field.
fn default_unit_norm() -> bool {
    const DEFAULT_UNIT_NORM: bool = true;
    DEFAULT_UNIT_NORM
}
203
204impl DenseVectorConfig {
205    pub fn new(dim: usize) -> Self {
206        Self {
207            dim,
208            index_type: VectorIndexType::RaBitQ,
209            quantization: DenseVectorQuantization::F32,
210            num_clusters: None,
211            nprobe: 32,
212            build_threshold: None,
213            unit_norm: true,
214        }
215    }
216
217    /// Create IVF-RaBitQ configuration
218    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
219        Self {
220            dim,
221            index_type: VectorIndexType::IvfRaBitQ,
222            quantization: DenseVectorQuantization::F32,
223            num_clusters,
224            nprobe,
225            build_threshold: None,
226            unit_norm: true,
227        }
228    }
229
230    /// Create ScaNN configuration
231    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
232        Self {
233            dim,
234            index_type: VectorIndexType::ScaNN,
235            quantization: DenseVectorQuantization::F32,
236            num_clusters,
237            nprobe,
238            build_threshold: None,
239            unit_norm: true,
240        }
241    }
242
243    /// Create Flat (brute-force) configuration - no ANN index
244    pub fn flat(dim: usize) -> Self {
245        Self {
246            dim,
247            index_type: VectorIndexType::Flat,
248            quantization: DenseVectorQuantization::F32,
249            num_clusters: None,
250            nprobe: 0,
251            build_threshold: None,
252            unit_norm: true,
253        }
254    }
255
256    /// Set storage quantization
257    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
258        self.quantization = quantization;
259        self
260    }
261
262    /// Set build threshold for auto-building ANN index
263    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
264        self.build_threshold = Some(threshold);
265        self
266    }
267
268    /// Mark vectors as pre-normalized to unit L2 norm
269    pub fn with_unit_norm(mut self) -> Self {
270        self.unit_norm = true;
271        self
272    }
273
274    /// Set number of IVF clusters
275    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
276        self.num_clusters = Some(num_clusters);
277        self
278    }
279
280    /// Check if this config uses IVF
281    pub fn uses_ivf(&self) -> bool {
282        matches!(
283            self.index_type,
284            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
285        )
286    }
287
288    /// Check if this config uses ScaNN
289    pub fn uses_scann(&self) -> bool {
290        self.index_type == VectorIndexType::ScaNN
291    }
292
293    /// Check if this config is flat (brute-force)
294    pub fn is_flat(&self) -> bool {
295        self.index_type == VectorIndexType::Flat
296    }
297
298    /// Get the default build threshold for this index type
299    pub fn default_build_threshold(&self) -> usize {
300        self.build_threshold.unwrap_or(match self.index_type {
301            VectorIndexType::Flat => usize::MAX, // Never auto-build
302            VectorIndexType::RaBitQ => 1000,
303            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
304        })
305    }
306
307    /// Calculate optimal number of clusters for given vector count
308    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
309        self.num_clusters.unwrap_or_else(|| {
310            // sqrt(n) heuristic, capped at 4096
311            let optimal = (num_vectors as f64).sqrt() as usize;
312            optimal.clamp(16, 4096)
313        })
314    }
315}
316
317use super::query_field_router::QueryRouterRule;
318
319/// Schema defining document structure
320#[derive(Debug, Clone, Default, Serialize, Deserialize)]
321pub struct Schema {
322    fields: Vec<FieldEntry>,
323    name_to_field: HashMap<String, Field>,
324    /// Default fields for query parsing (when no field is specified)
325    #[serde(default)]
326    default_fields: Vec<Field>,
327    /// Query router rules for routing queries to specific fields based on regex patterns
328    #[serde(default)]
329    query_routers: Vec<QueryRouterRule>,
330}
331
332impl Schema {
333    pub fn builder() -> SchemaBuilder {
334        SchemaBuilder::default()
335    }
336
337    pub fn get_field(&self, name: &str) -> Option<Field> {
338        self.name_to_field.get(name).copied()
339    }
340
341    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
342        self.fields.get(field.0 as usize)
343    }
344
345    pub fn get_field_name(&self, field: Field) -> Option<&str> {
346        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
347    }
348
349    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
350        self.fields
351            .iter()
352            .enumerate()
353            .map(|(i, e)| (Field(i as u32), e))
354    }
355
356    pub fn num_fields(&self) -> usize {
357        self.fields.len()
358    }
359
360    /// Get the default fields for query parsing
361    pub fn default_fields(&self) -> &[Field] {
362        &self.default_fields
363    }
364
365    /// Set default fields (used by builder)
366    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
367        self.default_fields = fields;
368    }
369
370    /// Get the query router rules
371    pub fn query_routers(&self) -> &[QueryRouterRule] {
372        &self.query_routers
373    }
374
375    /// Set query router rules
376    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
377        self.query_routers = rules;
378    }
379
380    /// Get the primary key field, if one is defined
381    pub fn primary_field(&self) -> Option<Field> {
382        self.fields
383            .iter()
384            .enumerate()
385            .find(|(_, e)| e.primary_key)
386            .map(|(i, _)| Field(i as u32))
387    }
388}
389
390/// Builder for Schema
391#[derive(Debug, Default)]
392pub struct SchemaBuilder {
393    fields: Vec<FieldEntry>,
394    default_fields: Vec<String>,
395    query_routers: Vec<QueryRouterRule>,
396}
397
398impl SchemaBuilder {
399    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
400        self.add_field_with_tokenizer(
401            name,
402            FieldType::Text,
403            indexed,
404            stored,
405            Some("simple".to_string()),
406        )
407    }
408
409    pub fn add_text_field_with_tokenizer(
410        &mut self,
411        name: &str,
412        indexed: bool,
413        stored: bool,
414        tokenizer: &str,
415    ) -> Field {
416        self.add_field_with_tokenizer(
417            name,
418            FieldType::Text,
419            indexed,
420            stored,
421            Some(tokenizer.to_string()),
422        )
423    }
424
425    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
426        self.add_field(name, FieldType::U64, indexed, stored)
427    }
428
429    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
430        self.add_field(name, FieldType::I64, indexed, stored)
431    }
432
433    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
434        self.add_field(name, FieldType::F64, indexed, stored)
435    }
436
437    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
438        self.add_field(name, FieldType::Bytes, false, stored)
439    }
440
441    /// Add a JSON field for storing arbitrary JSON data
442    ///
443    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
444    /// (objects, arrays, strings, numbers, booleans, null).
445    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
446        self.add_field(name, FieldType::Json, false, stored)
447    }
448
449    /// Add a sparse vector field with default configuration
450    ///
451    /// Sparse vectors are indexed as inverted posting lists where each dimension
452    /// becomes a "term" and documents have quantized weights for each dimension.
453    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
454        self.add_sparse_vector_field_with_config(
455            name,
456            indexed,
457            stored,
458            crate::structures::SparseVectorConfig::default(),
459        )
460    }
461
462    /// Add a sparse vector field with custom configuration
463    ///
464    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
465    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
466    pub fn add_sparse_vector_field_with_config(
467        &mut self,
468        name: &str,
469        indexed: bool,
470        stored: bool,
471        config: crate::structures::SparseVectorConfig,
472    ) -> Field {
473        let field = Field(self.fields.len() as u32);
474        self.fields.push(FieldEntry {
475            name: name.to_string(),
476            field_type: FieldType::SparseVector,
477            indexed,
478            stored,
479            tokenizer: None,
480            multi: false,
481            positions: None,
482            sparse_vector_config: Some(config),
483            dense_vector_config: None,
484            fast: false,
485            primary_key: false,
486        });
487        field
488    }
489
490    /// Set sparse vector configuration for an existing field
491    pub fn set_sparse_vector_config(
492        &mut self,
493        field: Field,
494        config: crate::structures::SparseVectorConfig,
495    ) {
496        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
497            entry.sparse_vector_config = Some(config);
498        }
499    }
500
501    /// Add a dense vector field with default configuration
502    ///
503    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
504    /// The dimension must be specified as it determines the quantization structure.
505    pub fn add_dense_vector_field(
506        &mut self,
507        name: &str,
508        dim: usize,
509        indexed: bool,
510        stored: bool,
511    ) -> Field {
512        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
513    }
514
515    /// Add a dense vector field with custom configuration
516    pub fn add_dense_vector_field_with_config(
517        &mut self,
518        name: &str,
519        indexed: bool,
520        stored: bool,
521        config: DenseVectorConfig,
522    ) -> Field {
523        let field = Field(self.fields.len() as u32);
524        self.fields.push(FieldEntry {
525            name: name.to_string(),
526            field_type: FieldType::DenseVector,
527            indexed,
528            stored,
529            tokenizer: None,
530            multi: false,
531            positions: None,
532            sparse_vector_config: None,
533            dense_vector_config: Some(config),
534            fast: false,
535            primary_key: false,
536        });
537        field
538    }
539
540    fn add_field(
541        &mut self,
542        name: &str,
543        field_type: FieldType,
544        indexed: bool,
545        stored: bool,
546    ) -> Field {
547        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
548    }
549
550    fn add_field_with_tokenizer(
551        &mut self,
552        name: &str,
553        field_type: FieldType,
554        indexed: bool,
555        stored: bool,
556        tokenizer: Option<String>,
557    ) -> Field {
558        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
559    }
560
561    fn add_field_full(
562        &mut self,
563        name: &str,
564        field_type: FieldType,
565        indexed: bool,
566        stored: bool,
567        tokenizer: Option<String>,
568        multi: bool,
569    ) -> Field {
570        let field = Field(self.fields.len() as u32);
571        self.fields.push(FieldEntry {
572            name: name.to_string(),
573            field_type,
574            indexed,
575            stored,
576            tokenizer,
577            multi,
578            positions: None,
579            sparse_vector_config: None,
580            dense_vector_config: None,
581            fast: false,
582            primary_key: false,
583        });
584        field
585    }
586
587    /// Set the multi attribute on the last added field
588    pub fn set_multi(&mut self, field: Field, multi: bool) {
589        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
590            entry.multi = multi;
591        }
592    }
593
594    /// Set fast-field columnar storage for O(1) doc→value access.
595    /// Valid for u64, i64, f64, and text fields.
596    pub fn set_fast(&mut self, field: Field, fast: bool) {
597        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
598            entry.fast = fast;
599        }
600    }
601
602    /// Mark a field as the primary key (unique constraint)
603    pub fn set_primary_key(&mut self, field: Field) {
604        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
605            entry.primary_key = true;
606        }
607    }
608
609    /// Set position tracking mode for phrase queries and multi-field element tracking
610    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
611        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
612            entry.positions = Some(mode);
613        }
614    }
615
616    /// Set default fields by name
617    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
618        self.default_fields = field_names;
619    }
620
621    /// Set query router rules
622    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
623        self.query_routers = rules;
624    }
625
626    pub fn build(self) -> Schema {
627        let mut name_to_field = HashMap::new();
628        for (i, entry) in self.fields.iter().enumerate() {
629            name_to_field.insert(entry.name.clone(), Field(i as u32));
630        }
631
632        // Resolve default field names to Field IDs
633        let default_fields: Vec<Field> = self
634            .default_fields
635            .iter()
636            .filter_map(|name| name_to_field.get(name).copied())
637            .collect();
638
639        Schema {
640            fields: self.fields,
641            name_to_field,
642            default_fields,
643            query_routers: self.query_routers,
644        }
645    }
646}
647
648/// Value that can be stored in a field
649#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
650pub enum FieldValue {
651    #[serde(rename = "text")]
652    Text(String),
653    #[serde(rename = "u64")]
654    U64(u64),
655    #[serde(rename = "i64")]
656    I64(i64),
657    #[serde(rename = "f64")]
658    F64(f64),
659    #[serde(rename = "bytes")]
660    Bytes(Vec<u8>),
661    /// Sparse vector: list of (dimension_id, weight) pairs
662    #[serde(rename = "sparse_vector")]
663    SparseVector(Vec<(u32, f32)>),
664    /// Dense vector: float32 values
665    #[serde(rename = "dense_vector")]
666    DenseVector(Vec<f32>),
667    /// Arbitrary JSON value
668    #[serde(rename = "json")]
669    Json(serde_json::Value),
670}
671
672impl FieldValue {
673    pub fn as_text(&self) -> Option<&str> {
674        match self {
675            FieldValue::Text(s) => Some(s),
676            _ => None,
677        }
678    }
679
680    pub fn as_u64(&self) -> Option<u64> {
681        match self {
682            FieldValue::U64(v) => Some(*v),
683            _ => None,
684        }
685    }
686
687    pub fn as_i64(&self) -> Option<i64> {
688        match self {
689            FieldValue::I64(v) => Some(*v),
690            _ => None,
691        }
692    }
693
694    pub fn as_f64(&self) -> Option<f64> {
695        match self {
696            FieldValue::F64(v) => Some(*v),
697            _ => None,
698        }
699    }
700
701    pub fn as_bytes(&self) -> Option<&[u8]> {
702        match self {
703            FieldValue::Bytes(b) => Some(b),
704            _ => None,
705        }
706    }
707
708    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
709        match self {
710            FieldValue::SparseVector(entries) => Some(entries),
711            _ => None,
712        }
713    }
714
715    pub fn as_dense_vector(&self) -> Option<&[f32]> {
716        match self {
717            FieldValue::DenseVector(v) => Some(v),
718            _ => None,
719        }
720    }
721
722    pub fn as_json(&self) -> Option<&serde_json::Value> {
723        match self {
724            FieldValue::Json(v) => Some(v),
725            _ => None,
726        }
727    }
728}
729
730/// A document to be indexed
731#[derive(Debug, Clone, Default, Serialize, Deserialize)]
732pub struct Document {
733    field_values: Vec<(Field, FieldValue)>,
734}
735
736impl Document {
737    pub fn new() -> Self {
738        Self::default()
739    }
740
741    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
742        self.field_values
743            .push((field, FieldValue::Text(value.into())));
744    }
745
746    pub fn add_u64(&mut self, field: Field, value: u64) {
747        self.field_values.push((field, FieldValue::U64(value)));
748    }
749
750    pub fn add_i64(&mut self, field: Field, value: i64) {
751        self.field_values.push((field, FieldValue::I64(value)));
752    }
753
754    pub fn add_f64(&mut self, field: Field, value: f64) {
755        self.field_values.push((field, FieldValue::F64(value)));
756    }
757
758    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
759        self.field_values.push((field, FieldValue::Bytes(value)));
760    }
761
762    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
763        self.field_values
764            .push((field, FieldValue::SparseVector(entries)));
765    }
766
767    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
768        self.field_values
769            .push((field, FieldValue::DenseVector(values)));
770    }
771
772    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
773        self.field_values.push((field, FieldValue::Json(value)));
774    }
775
776    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
777        self.field_values
778            .iter()
779            .find(|(f, _)| *f == field)
780            .map(|(_, v)| v)
781    }
782
783    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
784        self.field_values
785            .iter()
786            .filter(move |(f, _)| *f == field)
787            .map(|(_, v)| v)
788    }
789
790    pub fn field_values(&self) -> &[(Field, FieldValue)] {
791        &self.field_values
792    }
793
794    /// Return a new Document containing only fields marked as `stored` in the schema
795    pub fn filter_stored(&self, schema: &Schema) -> Document {
796        Document {
797            field_values: self
798                .field_values
799                .iter()
800                .filter(|(field, _)| {
801                    schema
802                        .get_field_entry(*field)
803                        .is_some_and(|entry| entry.stored)
804                })
805                .cloned()
806                .collect(),
807        }
808    }
809
810    /// Convert document to a JSON object using field names from schema
811    ///
812    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
813    /// Other fields with multiple values are also returned as arrays.
814    /// Fields with a single value (and not marked multi) are returned as scalar values.
815    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
816        use std::collections::HashMap;
817
818        // Group values by field, keeping track of field entry for multi check
819        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
820            HashMap::new();
821
822        for (field, value) in &self.field_values {
823            if let Some(entry) = schema.get_field_entry(*field) {
824                let json_value = match value {
825                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
826                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
827                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
828                    FieldValue::F64(n) => serde_json::json!(n),
829                    FieldValue::Bytes(b) => {
830                        use base64::Engine;
831                        serde_json::Value::String(
832                            base64::engine::general_purpose::STANDARD.encode(b),
833                        )
834                    }
835                    FieldValue::SparseVector(entries) => {
836                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
837                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
838                        serde_json::json!({
839                            "indices": indices,
840                            "values": values
841                        })
842                    }
843                    FieldValue::DenseVector(values) => {
844                        serde_json::json!(values)
845                    }
846                    FieldValue::Json(v) => v.clone(),
847                };
848                field_values_map
849                    .entry(*field)
850                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
851                    .2
852                    .push(json_value);
853            }
854        }
855
856        // Convert to JSON object, using arrays for multi fields or when multiple values exist
857        let mut map = serde_json::Map::new();
858        for (_field, (name, is_multi, values)) in field_values_map {
859            let json_value = if is_multi || values.len() > 1 {
860                serde_json::Value::Array(values)
861            } else {
862                values.into_iter().next().unwrap()
863            };
864            map.insert(name, json_value);
865        }
866
867        serde_json::Value::Object(map)
868    }
869
870    /// Create a Document from a JSON object using field names from schema
871    ///
872    /// Supports:
873    /// - String values -> Text fields
874    /// - Number values -> U64/I64/F64 fields (based on schema type)
875    /// - Array values -> Multiple values for the same field (multifields)
876    ///
877    /// Unknown fields (not in schema) are silently ignored.
878    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
879        let obj = json.as_object()?;
880        let mut doc = Document::new();
881
882        for (key, value) in obj {
883            if let Some(field) = schema.get_field(key) {
884                let field_entry = schema.get_field_entry(field)?;
885                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
886            }
887        }
888
889        Some(doc)
890    }
891
892    /// Helper to add a JSON value to a document, handling type conversion
893    fn add_json_value(
894        doc: &mut Document,
895        field: Field,
896        field_type: &FieldType,
897        value: &serde_json::Value,
898    ) {
899        match value {
900            serde_json::Value::String(s) => {
901                if matches!(field_type, FieldType::Text) {
902                    doc.add_text(field, s.clone());
903                }
904            }
905            serde_json::Value::Number(n) => {
906                match field_type {
907                    FieldType::I64 => {
908                        if let Some(i) = n.as_i64() {
909                            doc.add_i64(field, i);
910                        }
911                    }
912                    FieldType::U64 => {
913                        if let Some(u) = n.as_u64() {
914                            doc.add_u64(field, u);
915                        } else if let Some(i) = n.as_i64() {
916                            // Allow positive i64 as u64
917                            if i >= 0 {
918                                doc.add_u64(field, i as u64);
919                            }
920                        }
921                    }
922                    FieldType::F64 => {
923                        if let Some(f) = n.as_f64() {
924                            doc.add_f64(field, f);
925                        }
926                    }
927                    _ => {}
928                }
929            }
930            // Handle arrays (multifields) - add each element separately
931            serde_json::Value::Array(arr) => {
932                for item in arr {
933                    Self::add_json_value(doc, field, field_type, item);
934                }
935            }
936            // Handle sparse vector objects
937            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
938                if let (Some(indices_val), Some(values_val)) =
939                    (obj.get("indices"), obj.get("values"))
940                {
941                    let indices: Vec<u32> = indices_val
942                        .as_array()
943                        .map(|arr| {
944                            arr.iter()
945                                .filter_map(|v| v.as_u64().map(|n| n as u32))
946                                .collect()
947                        })
948                        .unwrap_or_default();
949                    let values: Vec<f32> = values_val
950                        .as_array()
951                        .map(|arr| {
952                            arr.iter()
953                                .filter_map(|v| v.as_f64().map(|n| n as f32))
954                                .collect()
955                        })
956                        .unwrap_or_default();
957                    if indices.len() == values.len() {
958                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
959                        doc.add_sparse_vector(field, entries);
960                    }
961                }
962            }
963            // Handle JSON fields - accept any value directly
964            _ if matches!(field_type, FieldType::Json) => {
965                doc.add_json(field, value.clone());
966            }
967            serde_json::Value::Object(_) => {}
968            _ => {}
969        }
970    }
971}
972
#[cfg(test)]
mod tests {
    use super::*;

    /// Collect the string view of every element of a JSON array value.
    ///
    /// Panics when `value` is not an array, which fails the calling test.
    fn json_strings(value: &serde_json::Value) -> Vec<Option<&str>> {
        value
            .as_array()
            .unwrap()
            .iter()
            .map(|item| item.as_str())
            .collect()
    }

    /// Looking a field up by name yields the handle issued by the builder.
    #[test]
    fn test_schema_builder() {
        let mut sb = Schema::builder();
        let title_field = sb.add_text_field("title", true, true);
        let body_field = sb.add_text_field("body", true, false);
        let count_field = sb.add_u64_field("count", true, true);
        let schema = sb.build();

        let known = [
            ("title", title_field),
            ("body", body_field),
            ("count", count_field),
        ];
        for (name, handle) in known.iter() {
            assert_eq!(schema.get_field(name), Some(*handle));
        }
        assert_eq!(schema.get_field("nonexistent"), None);
    }

    /// Values added to a document can be read back through typed accessors.
    #[test]
    fn test_document() {
        let mut sb = Schema::builder();
        let title = sb.add_text_field("title", true, true);
        let count = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut doc = Document::new();
        doc.add_text(title, "Hello World");
        doc.add_u64(count, 42);

        let first_title = doc.get_first(title).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));

        let first_count = doc.get_first(count).unwrap();
        assert_eq!(first_count.as_u64(), Some(42));
    }

    /// A document survives a serde JSON encode/decode cycle.
    #[test]
    fn test_document_serialization() {
        let mut sb = Schema::builder();
        let title = sb.add_text_field("title", true, true);
        let count = sb.add_u64_field("count", true, true);
        let _schema = sb.build();

        let mut original = Document::new();
        original.add_text(title, "Hello World");
        original.add_u64(count, 42);

        // Encode
        let encoded = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", encoded);

        // Decode
        let decoded: Document = serde_json::from_str(&encoded).unwrap();
        assert_eq!(
            decoded.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );

        let decoded_title = decoded.get_first(title).unwrap();
        assert_eq!(decoded_title.as_text(), Some("Hello World"));

        let decoded_count = decoded.get_first(count).unwrap();
        assert_eq!(decoded_count.as_u64(), Some(42));
    }

    /// Repeated values on one field are kept in order and exported as an array.
    #[test]
    fn test_multivalue_field() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // Two values on `uris`, one on `title`
        let mut doc = Document::new();
        for uri in ["one", "two"].iter() {
            doc.add_text(uris, *uri);
        }
        doc.add_text(title, "Test Document");

        // get_first yields the value that was added first
        let first_uri = doc.get_first(uris).unwrap();
        assert_eq!(first_uri.as_text(), Some("one"));

        // get_all yields every value, in insertion order
        let stored: Vec<_> = doc.get_all(uris).collect();
        let stored_texts: Vec<_> = stored.iter().map(|v| v.as_text()).collect();
        assert_eq!(stored_texts, [Some("one"), Some("two")]);

        // to_json emits an array for the multi-value field
        let exported = doc.to_json(&schema);
        let uris_json = exported.get("uris").unwrap();
        assert!(uris_json.is_array(), "Multi-value field should be an array");
        assert_eq!(json_strings(uris_json), [Some("one"), Some("two")]);

        // ...and a bare string for the single-value field
        let title_json = exported.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }

    /// A JSON array is parsed into several values of the same field.
    #[test]
    fn test_multivalue_from_json() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // Input with an array value
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let doc = Document::from_json(&input, &schema).unwrap();

        // Both array elements are present, in order
        let stored: Vec<_> = doc.get_all(uris).collect();
        let stored_texts: Vec<_> = stored.iter().map(|v| v.as_text()).collect();
        assert_eq!(stored_texts, [Some("one"), Some("two")]);

        // The scalar value is present as well
        let stored_title = doc.get_first(title).unwrap();
        assert_eq!(stored_title.as_text(), Some("Test Document"));

        // Roundtrip: to_json produces an equivalent array
        let exported = doc.to_json(&schema);
        let uris_json = exported.get("uris").unwrap();
        assert_eq!(json_strings(uris_json), [Some("one"), Some("two")]);
    }

    /// Fields flagged as `multi` are exported as arrays even when they hold
    /// a single value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut sb = Schema::builder();
        let uris = sb.add_text_field("uris", true, true);
        sb.set_multi(uris, true); // Mark as multi
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        // The flag is recorded on `uris` only
        assert!(schema.get_field_entry(uris).unwrap().multi);
        assert!(!schema.get_field_entry(title).unwrap().multi);

        // One value for the multi field
        let mut doc = Document::new();
        doc.add_text(uris, "only_one");
        doc.add_text(title, "Test Document");

        let exported = doc.to_json(&schema);

        // Still an array, with exactly that one element
        let uris_json = exported.get("uris").unwrap();
        assert!(
            uris_json.is_array(),
            "Multi field should be array even with single value"
        );
        assert_eq!(json_strings(uris_json), [Some("only_one")]);

        // The non-multi field with one value stays a bare string
        let title_json = exported.get("title").unwrap();
        assert!(
            title_json.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(title_json.as_str(), Some("Test Document"));
    }

    /// Sparse vectors are stored, exported as an object and parsed back.
    #[test]
    fn test_sparse_vector_field() {
        let mut sb = Schema::builder();
        let embedding = sb.add_sparse_vector_field("embedding", true, true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("embedding"), Some(embedding));
        let embedding_entry = schema.get_field_entry(embedding).unwrap();
        assert_eq!(embedding_entry.field_type, FieldType::SparseVector);

        // Document carrying a sparse vector
        let expected = [(0, 1.0), (5, 2.5), (10, 0.5)];
        let mut doc = Document::new();
        doc.add_sparse_vector(embedding, expected.to_vec());
        doc.add_text(title, "Test Document");

        // Accessor returns the entries unchanged
        let stored = doc
            .get_first(embedding)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(stored, &expected);

        // Export: an object whose `indices` array has one slot per entry
        let exported = doc.to_json(&schema);
        let embedding_json = exported.get("embedding").unwrap();
        assert!(embedding_json.is_object());
        let indices_json = embedding_json.get("indices").unwrap();
        assert_eq!(indices_json.as_array().unwrap().len(), 3);

        // Parse back and compare, allowing for float rounding on the weights
        let reparsed_doc = Document::from_json(&exported, &schema).unwrap();
        let reparsed = reparsed_doc
            .get_first(embedding)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        for (slot, (index, weight)) in expected.iter().enumerate() {
            assert_eq!(reparsed[slot].0, *index);
            assert!((reparsed[slot].1 - *weight).abs() < 1e-6);
        }
    }

    /// JSON fields are stored-only and roundtrip through to_json/from_json.
    #[test]
    fn test_json_field() {
        let mut sb = Schema::builder();
        let metadata = sb.add_json_field("metadata", true);
        let title = sb.add_text_field("title", true, true);
        let schema = sb.build();

        assert_eq!(schema.get_field("metadata"), Some(metadata));
        let metadata_entry = schema.get_field_entry(metadata).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        // JSON fields are never indexed
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);

        // Document carrying a JSON object
        let payload = serde_json::json!({
            "author": "John Doe",
            "tags": ["rust", "search"],
            "nested": {"key": "value"}
        });
        let mut doc = Document::new();
        doc.add_json(metadata, payload.clone());
        doc.add_text(title, "Test Document");

        // Accessor returns the payload unchanged
        let stored = doc.get_first(metadata).unwrap().as_json().unwrap();
        assert_eq!(stored, &payload);
        let author = stored.get("author").unwrap();
        assert_eq!(author.as_str(), Some("John Doe"));

        // Export keeps the payload intact
        let exported = doc.to_json(&schema);
        let exported_metadata = exported.get("metadata").unwrap();
        assert_eq!(exported_metadata, &payload);

        // Parse back from the exported JSON
        let reparsed_doc = Document::from_json(&exported, &schema).unwrap();
        let reparsed = reparsed_doc
            .get_first(metadata)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(reparsed, &payload);
    }

    /// `add_json` accepts every kind of JSON value, not just objects.
    #[test]
    fn test_json_field_various_types() {
        let mut sb = Schema::builder();
        let data = sb.add_json_field("data", true);
        let _schema = sb.build();

        // One sample per kind: array, string, number, null, boolean
        let samples = [
            serde_json::json!([1, 2, 3, "four", null]),
            serde_json::json!("just a string"),
            serde_json::json!(42.5),
            serde_json::Value::Null,
            serde_json::json!(true),
        ];

        for sample in samples.iter() {
            let mut doc = Document::new();
            doc.add_json(data, sample.clone());
            let stored = doc.get_first(data).unwrap().as_json().unwrap();
            assert_eq!(stored, sample);
        }
    }
}