Skip to main content

hermes_core/dsl/
schema.rs

//! Schema definitions for documents and fields.
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60    /// Whether this field has columnar fast-field storage for O(1) doc→value access.
61    /// Valid for u64, i64, f64, and text fields.
62    #[serde(default)]
63    pub fast: bool,
64    /// Whether this field is a primary key (unique constraint, at most one per schema)
65    #[serde(default)]
66    pub primary_key: bool,
67    /// Whether this sparse_vector field has auto-computed SimHash for BMP block reordering
68    #[serde(default)]
69    pub simhash: bool,
70}
71
72/// Position tracking mode for text fields
73#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
74#[serde(rename_all = "snake_case")]
75pub enum PositionMode {
76    /// Track only element ordinal for multi-valued fields (which array element)
77    /// Useful for returning which element matched without full phrase query support
78    Ordinal,
79    /// Track only token position within text (for phrase queries)
80    /// Does not track element ordinal - all positions are relative to concatenated text
81    TokenPosition,
82    /// Track both element ordinal and token position (full support)
83    /// Position format: (element_ordinal << 20) | token_position
84    Full,
85}
86
87impl PositionMode {
88    /// Whether this mode tracks element ordinals
89    pub fn tracks_ordinal(&self) -> bool {
90        matches!(self, PositionMode::Ordinal | PositionMode::Full)
91    }
92
93    /// Whether this mode tracks token positions
94    pub fn tracks_token_position(&self) -> bool {
95        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
96    }
97}
98
99/// Vector index algorithm type
100#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
101#[serde(rename_all = "snake_case")]
102pub enum VectorIndexType {
103    /// Flat - brute-force search over raw vectors (accumulating state)
104    Flat,
105    /// RaBitQ - binary quantization, good for small datasets (<100K)
106    #[default]
107    RaBitQ,
108    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
109    IvfRaBitQ,
110    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
111    ScaNN,
112}
113
114/// Storage quantization for dense vector elements
115///
116/// Controls the precision of each vector coordinate in `.vectors` files.
117/// Lower precision reduces storage and memory bandwidth; scoring uses
118/// native-precision SIMD (no dequantization on the hot path).
119#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
120#[serde(rename_all = "snake_case")]
121pub enum DenseVectorQuantization {
122    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
123    #[default]
124    F32,
125    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
126    F16,
127    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
128    UInt8,
129}
130
131impl DenseVectorQuantization {
132    /// Bytes per element for this quantization type
133    pub fn element_size(self) -> usize {
134        match self {
135            Self::F32 => 4,
136            Self::F16 => 2,
137            Self::UInt8 => 1,
138        }
139    }
140
141    /// Wire format tag (stored in .vectors header)
142    pub fn tag(self) -> u8 {
143        match self {
144            Self::F32 => 0,
145            Self::F16 => 1,
146            Self::UInt8 => 2,
147        }
148    }
149
150    /// Decode wire format tag
151    pub fn from_tag(tag: u8) -> Option<Self> {
152        match tag {
153            0 => Some(Self::F32),
154            1 => Some(Self::F16),
155            2 => Some(Self::UInt8),
156            _ => None,
157        }
158    }
159}
160
161/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
162///
163/// Indexes operate in two states:
164/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
165///   is below `build_threshold` or before `build_index` is called.
166/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
167///   Centroids and codebooks are trained from data and stored within the segment.
168#[derive(Debug, Clone, Serialize, Deserialize)]
169pub struct DenseVectorConfig {
170    /// Dimensionality of vectors
171    pub dim: usize,
172    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
173    /// When in accumulating state, search uses brute-force regardless of this setting.
174    #[serde(default)]
175    pub index_type: VectorIndexType,
176    /// Storage quantization for vector elements (f32, f16, uint8)
177    #[serde(default)]
178    pub quantization: DenseVectorQuantization,
179    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
180    /// If None, automatically determined based on dataset size.
181    #[serde(default, skip_serializing_if = "Option::is_none")]
182    pub num_clusters: Option<usize>,
183    /// Number of clusters to probe during search (default: 32)
184    #[serde(default = "default_nprobe")]
185    pub nprobe: usize,
186    /// Minimum number of vectors required before building ANN index.
187    /// Below this threshold, brute-force (Flat) search is used.
188    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
189    #[serde(default, skip_serializing_if = "Option::is_none")]
190    pub build_threshold: Option<usize>,
191    /// Whether stored vectors are pre-normalized to unit L2 norm.
192    /// When true, scoring skips per-vector norm computation (cosine = dot / ||q||),
193    /// reducing compute by ~40%. Common for embedding models (e.g. OpenAI, Cohere).
194    /// Default: true (most embedding models produce L2-normalized vectors).
195    #[serde(default = "default_unit_norm")]
196    pub unit_norm: bool,
197}
198
/// Serde default for `DenseVectorConfig::nprobe`.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
202
/// Serde default for `DenseVectorConfig::unit_norm`.
fn default_unit_norm() -> bool {
    const DEFAULT_UNIT_NORM: bool = true;
    DEFAULT_UNIT_NORM
}
206
207impl DenseVectorConfig {
208    pub fn new(dim: usize) -> Self {
209        Self {
210            dim,
211            index_type: VectorIndexType::RaBitQ,
212            quantization: DenseVectorQuantization::F32,
213            num_clusters: None,
214            nprobe: 32,
215            build_threshold: None,
216            unit_norm: true,
217        }
218    }
219
220    /// Create IVF-RaBitQ configuration
221    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
222        Self {
223            dim,
224            index_type: VectorIndexType::IvfRaBitQ,
225            quantization: DenseVectorQuantization::F32,
226            num_clusters,
227            nprobe,
228            build_threshold: None,
229            unit_norm: true,
230        }
231    }
232
233    /// Create ScaNN configuration
234    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
235        Self {
236            dim,
237            index_type: VectorIndexType::ScaNN,
238            quantization: DenseVectorQuantization::F32,
239            num_clusters,
240            nprobe,
241            build_threshold: None,
242            unit_norm: true,
243        }
244    }
245
246    /// Create Flat (brute-force) configuration - no ANN index
247    pub fn flat(dim: usize) -> Self {
248        Self {
249            dim,
250            index_type: VectorIndexType::Flat,
251            quantization: DenseVectorQuantization::F32,
252            num_clusters: None,
253            nprobe: 0,
254            build_threshold: None,
255            unit_norm: true,
256        }
257    }
258
259    /// Set storage quantization
260    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
261        self.quantization = quantization;
262        self
263    }
264
265    /// Set build threshold for auto-building ANN index
266    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
267        self.build_threshold = Some(threshold);
268        self
269    }
270
271    /// Mark vectors as pre-normalized to unit L2 norm
272    pub fn with_unit_norm(mut self) -> Self {
273        self.unit_norm = true;
274        self
275    }
276
277    /// Set number of IVF clusters
278    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
279        self.num_clusters = Some(num_clusters);
280        self
281    }
282
283    /// Check if this config uses IVF
284    pub fn uses_ivf(&self) -> bool {
285        matches!(
286            self.index_type,
287            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
288        )
289    }
290
291    /// Check if this config uses ScaNN
292    pub fn uses_scann(&self) -> bool {
293        self.index_type == VectorIndexType::ScaNN
294    }
295
296    /// Check if this config is flat (brute-force)
297    pub fn is_flat(&self) -> bool {
298        self.index_type == VectorIndexType::Flat
299    }
300
301    /// Get the default build threshold for this index type
302    pub fn default_build_threshold(&self) -> usize {
303        self.build_threshold.unwrap_or(match self.index_type {
304            VectorIndexType::Flat => usize::MAX, // Never auto-build
305            VectorIndexType::RaBitQ => 1000,
306            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
307        })
308    }
309
310    /// Calculate optimal number of clusters for given vector count
311    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
312        self.num_clusters.unwrap_or_else(|| {
313            // sqrt(n) heuristic, capped at 4096
314            let optimal = (num_vectors as f64).sqrt() as usize;
315            optimal.clamp(16, 4096)
316        })
317    }
318}
319
320use super::query_field_router::QueryRouterRule;
321
322/// Schema defining document structure
323#[derive(Debug, Clone, Default, Serialize, Deserialize)]
324pub struct Schema {
325    fields: Vec<FieldEntry>,
326    name_to_field: HashMap<String, Field>,
327    /// Default fields for query parsing (when no field is specified)
328    #[serde(default)]
329    default_fields: Vec<Field>,
330    /// Query router rules for routing queries to specific fields based on regex patterns
331    #[serde(default)]
332    query_routers: Vec<QueryRouterRule>,
333}
334
335impl Schema {
336    pub fn builder() -> SchemaBuilder {
337        SchemaBuilder::default()
338    }
339
340    pub fn get_field(&self, name: &str) -> Option<Field> {
341        self.name_to_field.get(name).copied()
342    }
343
344    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
345        self.fields.get(field.0 as usize)
346    }
347
348    pub fn get_field_name(&self, field: Field) -> Option<&str> {
349        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
350    }
351
352    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
353        self.fields
354            .iter()
355            .enumerate()
356            .map(|(i, e)| (Field(i as u32), e))
357    }
358
359    pub fn num_fields(&self) -> usize {
360        self.fields.len()
361    }
362
363    /// Get the default fields for query parsing
364    pub fn default_fields(&self) -> &[Field] {
365        &self.default_fields
366    }
367
368    /// Set default fields (used by builder)
369    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
370        self.default_fields = fields;
371    }
372
373    /// Get the query router rules
374    pub fn query_routers(&self) -> &[QueryRouterRule] {
375        &self.query_routers
376    }
377
378    /// Set query router rules
379    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
380        self.query_routers = rules;
381    }
382
383    /// Get the primary key field, if one is defined
384    pub fn primary_field(&self) -> Option<Field> {
385        self.fields
386            .iter()
387            .enumerate()
388            .find(|(_, e)| e.primary_key)
389            .map(|(i, _)| Field(i as u32))
390    }
391}
392
393/// Builder for Schema
394#[derive(Debug, Default)]
395pub struct SchemaBuilder {
396    fields: Vec<FieldEntry>,
397    default_fields: Vec<String>,
398    query_routers: Vec<QueryRouterRule>,
399}
400
401impl SchemaBuilder {
402    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
403        self.add_field_with_tokenizer(
404            name,
405            FieldType::Text,
406            indexed,
407            stored,
408            Some("simple".to_string()),
409        )
410    }
411
412    pub fn add_text_field_with_tokenizer(
413        &mut self,
414        name: &str,
415        indexed: bool,
416        stored: bool,
417        tokenizer: &str,
418    ) -> Field {
419        self.add_field_with_tokenizer(
420            name,
421            FieldType::Text,
422            indexed,
423            stored,
424            Some(tokenizer.to_string()),
425        )
426    }
427
428    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
429        self.add_field(name, FieldType::U64, indexed, stored)
430    }
431
432    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
433        self.add_field(name, FieldType::I64, indexed, stored)
434    }
435
436    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
437        self.add_field(name, FieldType::F64, indexed, stored)
438    }
439
440    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
441        self.add_field(name, FieldType::Bytes, false, stored)
442    }
443
444    /// Add a JSON field for storing arbitrary JSON data
445    ///
446    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
447    /// (objects, arrays, strings, numbers, booleans, null).
448    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
449        self.add_field(name, FieldType::Json, false, stored)
450    }
451
452    /// Add a sparse vector field with default configuration
453    ///
454    /// Sparse vectors are indexed as inverted posting lists where each dimension
455    /// becomes a "term" and documents have quantized weights for each dimension.
456    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
457        self.add_sparse_vector_field_with_config(
458            name,
459            indexed,
460            stored,
461            crate::structures::SparseVectorConfig::default(),
462        )
463    }
464
465    /// Add a sparse vector field with custom configuration
466    ///
467    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
468    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
469    pub fn add_sparse_vector_field_with_config(
470        &mut self,
471        name: &str,
472        indexed: bool,
473        stored: bool,
474        config: crate::structures::SparseVectorConfig,
475    ) -> Field {
476        let field = Field(self.fields.len() as u32);
477        self.fields.push(FieldEntry {
478            name: name.to_string(),
479            field_type: FieldType::SparseVector,
480            indexed,
481            stored,
482            tokenizer: None,
483            multi: false,
484            positions: None,
485            sparse_vector_config: Some(config),
486            dense_vector_config: None,
487            fast: false,
488            primary_key: false,
489            simhash: false,
490        });
491        field
492    }
493
494    /// Set sparse vector configuration for an existing field
495    pub fn set_sparse_vector_config(
496        &mut self,
497        field: Field,
498        config: crate::structures::SparseVectorConfig,
499    ) {
500        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
501            entry.sparse_vector_config = Some(config);
502        }
503    }
504
505    /// Add a dense vector field with default configuration
506    ///
507    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
508    /// The dimension must be specified as it determines the quantization structure.
509    pub fn add_dense_vector_field(
510        &mut self,
511        name: &str,
512        dim: usize,
513        indexed: bool,
514        stored: bool,
515    ) -> Field {
516        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
517    }
518
519    /// Add a dense vector field with custom configuration
520    pub fn add_dense_vector_field_with_config(
521        &mut self,
522        name: &str,
523        indexed: bool,
524        stored: bool,
525        config: DenseVectorConfig,
526    ) -> Field {
527        let field = Field(self.fields.len() as u32);
528        self.fields.push(FieldEntry {
529            name: name.to_string(),
530            field_type: FieldType::DenseVector,
531            indexed,
532            stored,
533            tokenizer: None,
534            multi: false,
535            positions: None,
536            sparse_vector_config: None,
537            dense_vector_config: Some(config),
538            fast: false,
539            primary_key: false,
540            simhash: false,
541        });
542        field
543    }
544
545    fn add_field(
546        &mut self,
547        name: &str,
548        field_type: FieldType,
549        indexed: bool,
550        stored: bool,
551    ) -> Field {
552        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
553    }
554
555    fn add_field_with_tokenizer(
556        &mut self,
557        name: &str,
558        field_type: FieldType,
559        indexed: bool,
560        stored: bool,
561        tokenizer: Option<String>,
562    ) -> Field {
563        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
564    }
565
566    fn add_field_full(
567        &mut self,
568        name: &str,
569        field_type: FieldType,
570        indexed: bool,
571        stored: bool,
572        tokenizer: Option<String>,
573        multi: bool,
574    ) -> Field {
575        let field = Field(self.fields.len() as u32);
576        self.fields.push(FieldEntry {
577            name: name.to_string(),
578            field_type,
579            indexed,
580            stored,
581            tokenizer,
582            multi,
583            positions: None,
584            sparse_vector_config: None,
585            dense_vector_config: None,
586            fast: false,
587            primary_key: false,
588            simhash: false,
589        });
590        field
591    }
592
593    /// Set the multi attribute on the last added field
594    pub fn set_multi(&mut self, field: Field, multi: bool) {
595        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
596            entry.multi = multi;
597        }
598    }
599
600    /// Set fast-field columnar storage for O(1) doc→value access.
601    /// Valid for u64, i64, f64, and text fields.
602    pub fn set_fast(&mut self, field: Field, fast: bool) {
603        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
604            entry.fast = fast;
605        }
606    }
607
608    /// Mark a field as the primary key (unique constraint)
609    pub fn set_primary_key(&mut self, field: Field) {
610        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
611            entry.primary_key = true;
612        }
613    }
614
615    /// Enable auto-computed SimHash on a sparse_vector field for BMP block reordering
616    pub fn set_simhash(&mut self, field: Field, simhash: bool) {
617        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
618            entry.simhash = simhash;
619        }
620    }
621
622    /// Set position tracking mode for phrase queries and multi-field element tracking
623    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
624        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
625            entry.positions = Some(mode);
626        }
627    }
628
629    /// Set default fields by name
630    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
631        self.default_fields = field_names;
632    }
633
634    /// Set query router rules
635    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
636        self.query_routers = rules;
637    }
638
639    pub fn build(self) -> Schema {
640        let mut name_to_field = HashMap::new();
641        for (i, entry) in self.fields.iter().enumerate() {
642            name_to_field.insert(entry.name.clone(), Field(i as u32));
643        }
644
645        // Resolve default field names to Field IDs
646        let default_fields: Vec<Field> = self
647            .default_fields
648            .iter()
649            .filter_map(|name| name_to_field.get(name).copied())
650            .collect();
651
652        Schema {
653            fields: self.fields,
654            name_to_field,
655            default_fields,
656            query_routers: self.query_routers,
657        }
658    }
659}
660
661/// Value that can be stored in a field
662#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
663pub enum FieldValue {
664    #[serde(rename = "text")]
665    Text(String),
666    #[serde(rename = "u64")]
667    U64(u64),
668    #[serde(rename = "i64")]
669    I64(i64),
670    #[serde(rename = "f64")]
671    F64(f64),
672    #[serde(rename = "bytes")]
673    Bytes(Vec<u8>),
674    /// Sparse vector: list of (dimension_id, weight) pairs
675    #[serde(rename = "sparse_vector")]
676    SparseVector(Vec<(u32, f32)>),
677    /// Dense vector: float32 values
678    #[serde(rename = "dense_vector")]
679    DenseVector(Vec<f32>),
680    /// Arbitrary JSON value
681    #[serde(rename = "json")]
682    Json(serde_json::Value),
683}
684
685impl FieldValue {
686    pub fn as_text(&self) -> Option<&str> {
687        match self {
688            FieldValue::Text(s) => Some(s),
689            _ => None,
690        }
691    }
692
693    pub fn as_u64(&self) -> Option<u64> {
694        match self {
695            FieldValue::U64(v) => Some(*v),
696            _ => None,
697        }
698    }
699
700    pub fn as_i64(&self) -> Option<i64> {
701        match self {
702            FieldValue::I64(v) => Some(*v),
703            _ => None,
704        }
705    }
706
707    pub fn as_f64(&self) -> Option<f64> {
708        match self {
709            FieldValue::F64(v) => Some(*v),
710            _ => None,
711        }
712    }
713
714    pub fn as_bytes(&self) -> Option<&[u8]> {
715        match self {
716            FieldValue::Bytes(b) => Some(b),
717            _ => None,
718        }
719    }
720
721    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
722        match self {
723            FieldValue::SparseVector(entries) => Some(entries),
724            _ => None,
725        }
726    }
727
728    pub fn as_dense_vector(&self) -> Option<&[f32]> {
729        match self {
730            FieldValue::DenseVector(v) => Some(v),
731            _ => None,
732        }
733    }
734
735    pub fn as_json(&self) -> Option<&serde_json::Value> {
736        match self {
737            FieldValue::Json(v) => Some(v),
738            _ => None,
739        }
740    }
741}
742
743/// A document to be indexed
744#[derive(Debug, Clone, Default, Serialize, Deserialize)]
745pub struct Document {
746    field_values: Vec<(Field, FieldValue)>,
747}
748
749impl Document {
750    pub fn new() -> Self {
751        Self::default()
752    }
753
754    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
755        self.field_values
756            .push((field, FieldValue::Text(value.into())));
757    }
758
759    pub fn add_u64(&mut self, field: Field, value: u64) {
760        self.field_values.push((field, FieldValue::U64(value)));
761    }
762
763    pub fn add_i64(&mut self, field: Field, value: i64) {
764        self.field_values.push((field, FieldValue::I64(value)));
765    }
766
767    pub fn add_f64(&mut self, field: Field, value: f64) {
768        self.field_values.push((field, FieldValue::F64(value)));
769    }
770
771    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
772        self.field_values.push((field, FieldValue::Bytes(value)));
773    }
774
775    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
776        self.field_values
777            .push((field, FieldValue::SparseVector(entries)));
778    }
779
780    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
781        self.field_values
782            .push((field, FieldValue::DenseVector(values)));
783    }
784
785    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
786        self.field_values.push((field, FieldValue::Json(value)));
787    }
788
789    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
790        self.field_values
791            .iter()
792            .find(|(f, _)| *f == field)
793            .map(|(_, v)| v)
794    }
795
796    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
797        self.field_values
798            .iter()
799            .filter(move |(f, _)| *f == field)
800            .map(|(_, v)| v)
801    }
802
803    pub fn field_values(&self) -> &[(Field, FieldValue)] {
804        &self.field_values
805    }
806
807    /// Return a new Document containing only fields marked as `stored` in the schema
808    pub fn filter_stored(&self, schema: &Schema) -> Document {
809        Document {
810            field_values: self
811                .field_values
812                .iter()
813                .filter(|(field, _)| {
814                    schema
815                        .get_field_entry(*field)
816                        .is_some_and(|entry| entry.stored)
817                })
818                .cloned()
819                .collect(),
820        }
821    }
822
823    /// Convert document to a JSON object using field names from schema
824    ///
825    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
826    /// Other fields with multiple values are also returned as arrays.
827    /// Fields with a single value (and not marked multi) are returned as scalar values.
828    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
829        use std::collections::HashMap;
830
831        // Group values by field, keeping track of field entry for multi check
832        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
833            HashMap::new();
834
835        for (field, value) in &self.field_values {
836            if let Some(entry) = schema.get_field_entry(*field) {
837                let json_value = match value {
838                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
839                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
840                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
841                    FieldValue::F64(n) => serde_json::json!(n),
842                    FieldValue::Bytes(b) => {
843                        use base64::Engine;
844                        serde_json::Value::String(
845                            base64::engine::general_purpose::STANDARD.encode(b),
846                        )
847                    }
848                    FieldValue::SparseVector(entries) => {
849                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
850                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
851                        serde_json::json!({
852                            "indices": indices,
853                            "values": values
854                        })
855                    }
856                    FieldValue::DenseVector(values) => {
857                        serde_json::json!(values)
858                    }
859                    FieldValue::Json(v) => v.clone(),
860                };
861                field_values_map
862                    .entry(*field)
863                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
864                    .2
865                    .push(json_value);
866            }
867        }
868
869        // Convert to JSON object, using arrays for multi fields or when multiple values exist
870        let mut map = serde_json::Map::new();
871        for (_field, (name, is_multi, values)) in field_values_map {
872            let json_value = if is_multi || values.len() > 1 {
873                serde_json::Value::Array(values)
874            } else {
875                values.into_iter().next().unwrap()
876            };
877            map.insert(name, json_value);
878        }
879
880        serde_json::Value::Object(map)
881    }
882
883    /// Create a Document from a JSON object using field names from schema
884    ///
885    /// Supports:
886    /// - String values -> Text fields
887    /// - Number values -> U64/I64/F64 fields (based on schema type)
888    /// - Array values -> Multiple values for the same field (multifields)
889    ///
890    /// Unknown fields (not in schema) are silently ignored.
891    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
892        let obj = json.as_object()?;
893        let mut doc = Document::new();
894
895        for (key, value) in obj {
896            if let Some(field) = schema.get_field(key) {
897                let field_entry = schema.get_field_entry(field)?;
898                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
899            }
900        }
901
902        Some(doc)
903    }
904
905    /// Helper to add a JSON value to a document, handling type conversion
906    fn add_json_value(
907        doc: &mut Document,
908        field: Field,
909        field_type: &FieldType,
910        value: &serde_json::Value,
911    ) {
912        match value {
913            serde_json::Value::String(s) => {
914                if matches!(field_type, FieldType::Text) {
915                    doc.add_text(field, s.clone());
916                }
917            }
918            serde_json::Value::Number(n) => {
919                match field_type {
920                    FieldType::I64 => {
921                        if let Some(i) = n.as_i64() {
922                            doc.add_i64(field, i);
923                        }
924                    }
925                    FieldType::U64 => {
926                        if let Some(u) = n.as_u64() {
927                            doc.add_u64(field, u);
928                        } else if let Some(i) = n.as_i64() {
929                            // Allow positive i64 as u64
930                            if i >= 0 {
931                                doc.add_u64(field, i as u64);
932                            }
933                        }
934                    }
935                    FieldType::F64 => {
936                        if let Some(f) = n.as_f64() {
937                            doc.add_f64(field, f);
938                        }
939                    }
940                    _ => {}
941                }
942            }
943            // Handle arrays (multifields) - add each element separately
944            serde_json::Value::Array(arr) => {
945                for item in arr {
946                    Self::add_json_value(doc, field, field_type, item);
947                }
948            }
949            // Handle sparse vector objects
950            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
951                if let (Some(indices_val), Some(values_val)) =
952                    (obj.get("indices"), obj.get("values"))
953                {
954                    let indices: Vec<u32> = indices_val
955                        .as_array()
956                        .map(|arr| {
957                            arr.iter()
958                                .filter_map(|v| v.as_u64().map(|n| n as u32))
959                                .collect()
960                        })
961                        .unwrap_or_default();
962                    let values: Vec<f32> = values_val
963                        .as_array()
964                        .map(|arr| {
965                            arr.iter()
966                                .filter_map(|v| v.as_f64().map(|n| n as f32))
967                                .collect()
968                        })
969                        .unwrap_or_default();
970                    if indices.len() == values.len() {
971                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
972                        doc.add_sparse_vector(field, entries);
973                    }
974                }
975            }
976            // Handle JSON fields - accept any value directly
977            _ if matches!(field_type, FieldType::Json) => {
978                doc.add_json(field, value.clone());
979            }
980            serde_json::Value::Object(_) => {}
981            _ => {}
982        }
983    }
984}
985
#[cfg(test)]
mod tests {
    use super::*;

    /// Fields registered on the builder resolve by name; unknown names yield `None`.
    #[test]
    fn test_schema_builder() {
        let mut schema_builder = Schema::builder();
        let title_field = schema_builder.add_text_field("title", true, true);
        let body_field = schema_builder.add_text_field("body", true, false);
        let count_field = schema_builder.add_u64_field("count", true, true);
        let schema = schema_builder.build();

        let expectations = [
            ("title", Some(title_field)),
            ("body", Some(body_field)),
            ("count", Some(count_field)),
            ("nonexistent", None),
        ];
        for &(name, expected) in expectations.iter() {
            assert_eq!(schema.get_field(name), expected);
        }
    }

    /// Values added to a document are readable through `get_first`.
    #[test]
    fn test_document() {
        let mut schema_builder = Schema::builder();
        let title_field = schema_builder.add_text_field("title", true, true);
        let count_field = schema_builder.add_u64_field("count", true, true);
        let _schema = schema_builder.build();

        let mut document = Document::new();
        document.add_text(title_field, "Hello World");
        document.add_u64(count_field, 42);

        let first_title = document.get_first(title_field).unwrap();
        assert_eq!(first_title.as_text(), Some("Hello World"));

        let first_count = document.get_first(count_field).unwrap();
        assert_eq!(first_count.as_u64(), Some(42));
    }

    /// A document survives a serde JSON string round trip with all values intact.
    #[test]
    fn test_document_serialization() {
        let mut schema_builder = Schema::builder();
        let title_field = schema_builder.add_text_field("title", true, true);
        let count_field = schema_builder.add_u64_field("count", true, true);
        let _schema = schema_builder.build();

        let mut original = Document::new();
        original.add_text(title_field, "Hello World");
        original.add_u64(count_field, 42);

        // Serialize
        let serialized = serde_json::to_string(&original).unwrap();
        println!("Serialized doc: {}", serialized);

        // Deserialize
        let restored: Document = serde_json::from_str(&serialized).unwrap();
        assert_eq!(
            restored.field_values().len(),
            2,
            "Should have 2 field values after deserialization"
        );

        let restored_title = restored.get_first(title_field).unwrap();
        assert_eq!(restored_title.as_text(), Some("Hello World"));

        let restored_count = restored.get_first(count_field).unwrap();
        assert_eq!(restored_count.as_u64(), Some(42));
    }

    /// Adding the same field twice keeps both values, and `to_json` renders them as an array.
    #[test]
    fn test_multivalue_field() {
        let mut schema_builder = Schema::builder();
        let uris_field = schema_builder.add_text_field("uris", true, true);
        let title_field = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Two values for `uris`, one for `title`
        let mut document = Document::new();
        document.add_text(uris_field, "one");
        document.add_text(uris_field, "two");
        document.add_text(title_field, "Test Document");

        // `get_first` yields the first value that was added
        let first_uri = document.get_first(uris_field).unwrap();
        assert_eq!(first_uri.as_text(), Some("one"));

        // `get_all` yields every value, in insertion order
        let collected: Vec<_> = document.get_all(uris_field).collect();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0].as_text(), Some("one"));
        assert_eq!(collected[1].as_text(), Some("two"));

        let rendered = document.to_json(&schema);

        // The field holding several values must be rendered as an array
        let rendered_uris = rendered.get("uris").unwrap();
        assert!(
            rendered_uris.is_array(),
            "Multi-value field should be an array"
        );
        let uri_items = rendered_uris.as_array().unwrap();
        assert_eq!(uri_items.len(), 2);
        assert_eq!(uri_items[0].as_str(), Some("one"));
        assert_eq!(uri_items[1].as_str(), Some("two"));

        // The field holding one value must stay a plain string
        let rendered_title = rendered.get("title").unwrap();
        assert!(
            rendered_title.is_string(),
            "Single-value field should be a string"
        );
        assert_eq!(rendered_title.as_str(), Some("Test Document"));
    }

    /// `from_json` expands a JSON array into multiple values of the same field.
    #[test]
    fn test_multivalue_from_json() {
        let mut schema_builder = Schema::builder();
        let uris_field = schema_builder.add_text_field("uris", true, true);
        let title_field = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Input with an array value for `uris`
        let input = serde_json::json!({
            "uris": ["one", "two"],
            "title": "Test Document"
        });

        let document = Document::from_json(&input, &schema).unwrap();

        // Both array elements must have been added
        let collected: Vec<_> = document.get_all(uris_field).collect();
        assert_eq!(collected.len(), 2);
        assert_eq!(collected[0].as_text(), Some("one"));
        assert_eq!(collected[1].as_text(), Some("two"));

        // The scalar value must be present too
        let first_title = document.get_first(title_field).unwrap();
        assert_eq!(first_title.as_text(), Some("Test Document"));

        // Roundtrip: `to_json` should produce an equivalent array
        let rendered = document.to_json(&schema);
        let uri_items = rendered.get("uris").unwrap().as_array().unwrap();
        assert_eq!(uri_items.len(), 2);
        assert_eq!(uri_items[0].as_str(), Some("one"));
        assert_eq!(uri_items[1].as_str(), Some("two"));
    }

    /// Fields flagged as `multi` are rendered as arrays even when they hold a single value.
    #[test]
    fn test_multi_attribute_forces_array() {
        let mut schema_builder = Schema::builder();
        let uris_field = schema_builder.add_text_field("uris", true, true);
        schema_builder.set_multi(uris_field, true); // Mark as multi
        let title_field = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        // Only `uris` carries the multi attribute
        let uris_entry = schema.get_field_entry(uris_field).unwrap();
        assert!(uris_entry.multi);
        let title_entry = schema.get_field_entry(title_field).unwrap();
        assert!(!title_entry.multi);

        // One value per field
        let mut document = Document::new();
        document.add_text(uris_field, "only_one");
        document.add_text(title_field, "Test Document");

        let rendered = document.to_json(&schema);

        // The multi field is an array despite holding one value
        let rendered_uris = rendered.get("uris").unwrap();
        assert!(
            rendered_uris.is_array(),
            "Multi field should be array even with single value"
        );
        let uri_items = rendered_uris.as_array().unwrap();
        assert_eq!(uri_items.len(), 1);
        assert_eq!(uri_items[0].as_str(), Some("only_one"));

        // The non-multi field with one value stays a plain string
        let rendered_title = rendered.get("title").unwrap();
        assert!(
            rendered_title.is_string(),
            "Non-multi single-value field should be a string"
        );
        assert_eq!(rendered_title.as_str(), Some("Test Document"));
    }

    /// Sparse vectors can be stored, read back, and round-tripped through JSON.
    #[test]
    fn test_sparse_vector_field() {
        let mut schema_builder = Schema::builder();
        let embedding_field = schema_builder.add_sparse_vector_field("embedding", true, true);
        let title_field = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        assert_eq!(schema.get_field("embedding"), Some(embedding_field));
        let embedding_entry = schema.get_field_entry(embedding_field).unwrap();
        assert_eq!(embedding_entry.field_type, FieldType::SparseVector);

        // Document carrying a sparse vector
        let mut document = Document::new();
        document.add_sparse_vector(embedding_field, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
        document.add_text(title_field, "Test Document");

        // Accessor returns the entries that were stored
        let stored_entries = document
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(stored_entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);

        // Rendered form is an object with a three-element `indices` array
        let rendered = document.to_json(&schema);
        let rendered_embedding = rendered.get("embedding").unwrap();
        assert!(rendered_embedding.is_object());
        let rendered_indices = rendered_embedding
            .get("indices")
            .unwrap()
            .as_array()
            .unwrap();
        assert_eq!(rendered_indices.len(), 3);

        // Parse back from JSON and compare entry by entry
        let reparsed = Document::from_json(&rendered, &schema).unwrap();
        let reparsed_entries = reparsed
            .get_first(embedding_field)
            .unwrap()
            .as_sparse_vector()
            .unwrap();
        assert_eq!(reparsed_entries[0].0, 0);
        assert!((reparsed_entries[0].1 - 1.0).abs() < 1e-6);
        assert_eq!(reparsed_entries[1].0, 5);
        assert!((reparsed_entries[1].1 - 2.5).abs() < 1e-6);
        assert_eq!(reparsed_entries[2].0, 10);
        assert!((reparsed_entries[2].1 - 0.5).abs() < 1e-6);
    }

    /// JSON fields store an object verbatim and round-trip it through `to_json`/`from_json`.
    #[test]
    fn test_json_field() {
        let mut schema_builder = Schema::builder();
        let metadata_field = schema_builder.add_json_field("metadata", true);
        let title_field = schema_builder.add_text_field("title", true, true);
        let schema = schema_builder.build();

        assert_eq!(schema.get_field("metadata"), Some(metadata_field));
        let metadata_entry = schema.get_field_entry(metadata_field).unwrap();
        assert_eq!(metadata_entry.field_type, FieldType::Json);
        // JSON fields are never indexed
        assert!(!metadata_entry.indexed);
        assert!(metadata_entry.stored);

        // Document carrying a JSON object
        let payload = serde_json::json!({
            "author": "John Doe",
            "tags": ["rust", "search"],
            "nested": {"key": "value"}
        });
        let mut document = Document::new();
        document.add_json(metadata_field, payload.clone());
        document.add_text(title_field, "Test Document");

        // Accessor returns the stored object
        let stored = document
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(stored, &payload);
        assert_eq!(stored.get("author").unwrap().as_str(), Some("John Doe"));

        // Rendered form contains the object unchanged
        let rendered = document.to_json(&schema);
        let rendered_metadata = rendered.get("metadata").unwrap();
        assert_eq!(rendered_metadata, &payload);

        // Parse back from JSON
        let reparsed = Document::from_json(&rendered, &schema).unwrap();
        let reparsed_stored = reparsed
            .get_first(metadata_field)
            .unwrap()
            .as_json()
            .unwrap();
        assert_eq!(reparsed_stored, &payload);
    }

    /// `add_json` keeps any kind of JSON value: array, string, number, null and boolean.
    #[test]
    fn test_json_field_various_types() {
        let mut schema_builder = Schema::builder();
        let data_field = schema_builder.add_json_field("data", true);
        let _schema = schema_builder.build();

        let samples = vec![
            // array
            serde_json::json!([1, 2, 3, "four", null]),
            // string
            serde_json::json!("just a string"),
            // number
            serde_json::json!(42.5),
            // null
            serde_json::Value::Null,
            // boolean
            serde_json::json!(true),
        ];

        // Each sample goes into a fresh document and must come back unchanged.
        for sample in samples {
            let mut document = Document::new();
            document.add_json(data_field, sample.clone());
            assert_eq!(
                document.get_first(data_field).unwrap().as_json().unwrap(),
                &sample
            );
        }
    }
}
1306}