Skip to main content

hermes_core/dsl/
schema.rs

1//! Schema definitions for documents and fields
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Field identifier
7#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
8pub struct Field(pub u32);
9
10/// Types of fields supported
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
12pub enum FieldType {
13    /// Text field - tokenized and indexed
14    #[serde(rename = "text")]
15    Text,
16    /// Unsigned 64-bit integer
17    #[serde(rename = "u64")]
18    U64,
19    /// Signed 64-bit integer
20    #[serde(rename = "i64")]
21    I64,
22    /// 64-bit floating point
23    #[serde(rename = "f64")]
24    F64,
25    /// Raw bytes (not tokenized)
26    #[serde(rename = "bytes")]
27    Bytes,
28    /// Sparse vector field - indexed as inverted posting lists with quantized weights
29    #[serde(rename = "sparse_vector")]
30    SparseVector,
31    /// Dense vector field - indexed using RaBitQ binary quantization for ANN search
32    #[serde(rename = "dense_vector")]
33    DenseVector,
34    /// JSON field - arbitrary JSON data, stored but not indexed
35    #[serde(rename = "json")]
36    Json,
37}
38
39/// Field options
40#[derive(Debug, Clone, Serialize, Deserialize)]
41pub struct FieldEntry {
42    pub name: String,
43    pub field_type: FieldType,
44    pub indexed: bool,
45    pub stored: bool,
46    /// Name of the tokenizer to use for this field (for text fields)
47    pub tokenizer: Option<String>,
48    /// Whether this field can have multiple values (serialized as array in JSON)
49    #[serde(default)]
50    pub multi: bool,
51    /// Position tracking mode for phrase queries and multi-field element tracking
52    #[serde(default, skip_serializing_if = "Option::is_none")]
53    pub positions: Option<PositionMode>,
54    /// Configuration for sparse vector fields (index size, weight quantization)
55    #[serde(default, skip_serializing_if = "Option::is_none")]
56    pub sparse_vector_config: Option<crate::structures::SparseVectorConfig>,
57    /// Configuration for dense vector fields (dimension, quantization)
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dense_vector_config: Option<DenseVectorConfig>,
60    /// Whether this field has columnar fast-field storage for O(1) doc→value access.
61    /// Valid for u64, i64, f64, and text fields.
62    #[serde(default)]
63    pub fast: bool,
64    /// Whether this field is a primary key (unique constraint, at most one per schema)
65    #[serde(default)]
66    pub primary_key: bool,
67    /// Whether build-time document reordering (Recursive Graph Bisection) is enabled.
68    /// Valid for sparse_vector fields with BMP format. Clusters similar documents
69    /// into the same blocks for better pruning effectiveness.
70    #[serde(default)]
71    pub reorder: bool,
72}
73
74/// Position tracking mode for text fields
75#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
76#[serde(rename_all = "snake_case")]
77pub enum PositionMode {
78    /// Track only element ordinal for multi-valued fields (which array element)
79    /// Useful for returning which element matched without full phrase query support
80    Ordinal,
81    /// Track only token position within text (for phrase queries)
82    /// Does not track element ordinal - all positions are relative to concatenated text
83    TokenPosition,
84    /// Track both element ordinal and token position (full support)
85    /// Position format: (element_ordinal << 20) | token_position
86    Full,
87}
88
89impl PositionMode {
90    /// Whether this mode tracks element ordinals
91    pub fn tracks_ordinal(&self) -> bool {
92        matches!(self, PositionMode::Ordinal | PositionMode::Full)
93    }
94
95    /// Whether this mode tracks token positions
96    pub fn tracks_token_position(&self) -> bool {
97        matches!(self, PositionMode::TokenPosition | PositionMode::Full)
98    }
99}
100
101/// Vector index algorithm type
102#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
103#[serde(rename_all = "snake_case")]
104pub enum VectorIndexType {
105    /// Flat - brute-force search over raw vectors (accumulating state)
106    Flat,
107    /// RaBitQ - binary quantization, good for small datasets (<100K)
108    #[default]
109    RaBitQ,
110    /// IVF-RaBitQ - inverted file with RaBitQ, good for medium datasets
111    IvfRaBitQ,
112    /// ScaNN - product quantization with OPQ and anisotropic loss, best for large datasets
113    ScaNN,
114}
115
116/// Storage quantization for dense vector elements
117///
118/// Controls the precision of each vector coordinate in `.vectors` files.
119/// Lower precision reduces storage and memory bandwidth; scoring uses
120/// native-precision SIMD (no dequantization on the hot path).
121#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, Serialize, Deserialize)]
122#[serde(rename_all = "snake_case")]
123pub enum DenseVectorQuantization {
124    /// 32-bit IEEE 754 float (4 bytes/dim) — full precision, baseline
125    #[default]
126    F32,
127    /// 16-bit IEEE 754 half-float (2 bytes/dim) — <0.1% recall loss for normalized embeddings
128    F16,
129    /// 8-bit unsigned scalar quantization (1 byte/dim) — maps [-1,1] → [0,255]
130    UInt8,
131}
132
133impl DenseVectorQuantization {
134    /// Bytes per element for this quantization type
135    pub fn element_size(self) -> usize {
136        match self {
137            Self::F32 => 4,
138            Self::F16 => 2,
139            Self::UInt8 => 1,
140        }
141    }
142
143    /// Wire format tag (stored in .vectors header)
144    pub fn tag(self) -> u8 {
145        match self {
146            Self::F32 => 0,
147            Self::F16 => 1,
148            Self::UInt8 => 2,
149        }
150    }
151
152    /// Decode wire format tag
153    pub fn from_tag(tag: u8) -> Option<Self> {
154        match tag {
155            0 => Some(Self::F32),
156            1 => Some(Self::F16),
157            2 => Some(Self::UInt8),
158            _ => None,
159        }
160    }
161}
162
163/// Configuration for dense vector fields using Flat, RaBitQ, IVF-RaBitQ, or ScaNN
164///
165/// Indexes operate in two states:
166/// - **Flat (accumulating)**: Brute-force search over raw vectors. Used when vector count
167///   is below `build_threshold` or before `build_index` is called.
168/// - **Built (ANN)**: Fast approximate nearest neighbor search using trained structures.
169///   Centroids and codebooks are trained from data and stored within the segment.
170#[derive(Debug, Clone, Serialize, Deserialize)]
171pub struct DenseVectorConfig {
172    /// Dimensionality of vectors
173    pub dim: usize,
174    /// Target vector index algorithm (Flat, RaBitQ, IVF-RaBitQ, or ScaNN)
175    /// When in accumulating state, search uses brute-force regardless of this setting.
176    #[serde(default)]
177    pub index_type: VectorIndexType,
178    /// Storage quantization for vector elements (f32, f16, uint8)
179    #[serde(default)]
180    pub quantization: DenseVectorQuantization,
181    /// Number of IVF clusters for IVF-RaBitQ and ScaNN (default: sqrt(n) capped at 4096)
182    /// If None, automatically determined based on dataset size.
183    #[serde(default, skip_serializing_if = "Option::is_none")]
184    pub num_clusters: Option<usize>,
185    /// Number of clusters to probe during search (default: 32)
186    #[serde(default = "default_nprobe")]
187    pub nprobe: usize,
188    /// Minimum number of vectors required before building ANN index.
189    /// Below this threshold, brute-force (Flat) search is used.
190    /// Default: 1000 for RaBitQ, 10000 for IVF-RaBitQ/ScaNN.
191    #[serde(default, skip_serializing_if = "Option::is_none")]
192    pub build_threshold: Option<usize>,
193    /// Whether stored vectors are pre-normalized to unit L2 norm.
194    /// When true, scoring skips per-vector norm computation (cosine = dot / ||q||),
195    /// reducing compute by ~40%. Common for embedding models (e.g. OpenAI, Cohere).
196    /// Default: true (most embedding models produce L2-normalized vectors).
197    #[serde(default = "default_unit_norm")]
198    pub unit_norm: bool,
199}
200
/// Serde default for `DenseVectorConfig::nprobe`: probe 32 clusters per search.
fn default_nprobe() -> usize {
    const DEFAULT_NPROBE: usize = 32;
    DEFAULT_NPROBE
}
204
/// Serde default for `DenseVectorConfig::unit_norm`: vectors are assumed
/// to be pre-normalized.
fn default_unit_norm() -> bool {
    const DEFAULT_UNIT_NORM: bool = true;
    DEFAULT_UNIT_NORM
}
208
209impl DenseVectorConfig {
210    pub fn new(dim: usize) -> Self {
211        Self {
212            dim,
213            index_type: VectorIndexType::RaBitQ,
214            quantization: DenseVectorQuantization::F32,
215            num_clusters: None,
216            nprobe: 32,
217            build_threshold: None,
218            unit_norm: true,
219        }
220    }
221
222    /// Create IVF-RaBitQ configuration
223    pub fn with_ivf(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
224        Self {
225            dim,
226            index_type: VectorIndexType::IvfRaBitQ,
227            quantization: DenseVectorQuantization::F32,
228            num_clusters,
229            nprobe,
230            build_threshold: None,
231            unit_norm: true,
232        }
233    }
234
235    /// Create ScaNN configuration
236    pub fn with_scann(dim: usize, num_clusters: Option<usize>, nprobe: usize) -> Self {
237        Self {
238            dim,
239            index_type: VectorIndexType::ScaNN,
240            quantization: DenseVectorQuantization::F32,
241            num_clusters,
242            nprobe,
243            build_threshold: None,
244            unit_norm: true,
245        }
246    }
247
248    /// Create Flat (brute-force) configuration - no ANN index
249    pub fn flat(dim: usize) -> Self {
250        Self {
251            dim,
252            index_type: VectorIndexType::Flat,
253            quantization: DenseVectorQuantization::F32,
254            num_clusters: None,
255            nprobe: 0,
256            build_threshold: None,
257            unit_norm: true,
258        }
259    }
260
261    /// Set storage quantization
262    pub fn with_quantization(mut self, quantization: DenseVectorQuantization) -> Self {
263        self.quantization = quantization;
264        self
265    }
266
267    /// Set build threshold for auto-building ANN index
268    pub fn with_build_threshold(mut self, threshold: usize) -> Self {
269        self.build_threshold = Some(threshold);
270        self
271    }
272
273    /// Mark vectors as pre-normalized to unit L2 norm
274    pub fn with_unit_norm(mut self) -> Self {
275        self.unit_norm = true;
276        self
277    }
278
279    /// Set number of IVF clusters
280    pub fn with_num_clusters(mut self, num_clusters: usize) -> Self {
281        self.num_clusters = Some(num_clusters);
282        self
283    }
284
285    /// Check if this config uses IVF
286    pub fn uses_ivf(&self) -> bool {
287        matches!(
288            self.index_type,
289            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN
290        )
291    }
292
293    /// Check if this config uses ScaNN
294    pub fn uses_scann(&self) -> bool {
295        self.index_type == VectorIndexType::ScaNN
296    }
297
298    /// Check if this config is flat (brute-force)
299    pub fn is_flat(&self) -> bool {
300        self.index_type == VectorIndexType::Flat
301    }
302
303    /// Get the default build threshold for this index type
304    pub fn default_build_threshold(&self) -> usize {
305        self.build_threshold.unwrap_or(match self.index_type {
306            VectorIndexType::Flat => usize::MAX, // Never auto-build
307            VectorIndexType::RaBitQ => 1000,
308            VectorIndexType::IvfRaBitQ | VectorIndexType::ScaNN => 10000,
309        })
310    }
311
312    /// Calculate optimal number of clusters for given vector count
313    pub fn optimal_num_clusters(&self, num_vectors: usize) -> usize {
314        self.num_clusters.unwrap_or_else(|| {
315            // sqrt(n) heuristic, capped at 4096
316            let optimal = (num_vectors as f64).sqrt() as usize;
317            optimal.clamp(16, 4096)
318        })
319    }
320}
321
322use super::query_field_router::QueryRouterRule;
323
324/// Schema defining document structure
325#[derive(Debug, Clone, Default, Serialize, Deserialize)]
326pub struct Schema {
327    fields: Vec<FieldEntry>,
328    name_to_field: HashMap<String, Field>,
329    /// Default fields for query parsing (when no field is specified)
330    #[serde(default)]
331    default_fields: Vec<Field>,
332    /// Query router rules for routing queries to specific fields based on regex patterns
333    #[serde(default)]
334    query_routers: Vec<QueryRouterRule>,
335}
336
337impl Schema {
338    pub fn builder() -> SchemaBuilder {
339        SchemaBuilder::default()
340    }
341
342    pub fn get_field(&self, name: &str) -> Option<Field> {
343        self.name_to_field.get(name).copied()
344    }
345
346    pub fn get_field_entry(&self, field: Field) -> Option<&FieldEntry> {
347        self.fields.get(field.0 as usize)
348    }
349
350    pub fn get_field_name(&self, field: Field) -> Option<&str> {
351        self.fields.get(field.0 as usize).map(|e| e.name.as_str())
352    }
353
354    pub fn fields(&self) -> impl Iterator<Item = (Field, &FieldEntry)> {
355        self.fields
356            .iter()
357            .enumerate()
358            .map(|(i, e)| (Field(i as u32), e))
359    }
360
361    pub fn num_fields(&self) -> usize {
362        self.fields.len()
363    }
364
365    /// Whether any field has the `reorder` attribute set.
366    /// Used by the background optimizer to determine which indexes need BP reordering.
367    pub fn has_reorder_fields(&self) -> bool {
368        self.fields.iter().any(|e| e.reorder)
369    }
370
371    /// Get the default fields for query parsing
372    pub fn default_fields(&self) -> &[Field] {
373        &self.default_fields
374    }
375
376    /// Set default fields (used by builder)
377    pub fn set_default_fields(&mut self, fields: Vec<Field>) {
378        self.default_fields = fields;
379    }
380
381    /// Get the query router rules
382    pub fn query_routers(&self) -> &[QueryRouterRule] {
383        &self.query_routers
384    }
385
386    /// Set query router rules
387    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
388        self.query_routers = rules;
389    }
390
391    /// Get the primary key field, if one is defined
392    pub fn primary_field(&self) -> Option<Field> {
393        self.fields
394            .iter()
395            .enumerate()
396            .find(|(_, e)| e.primary_key)
397            .map(|(i, _)| Field(i as u32))
398    }
399}
400
401/// Builder for Schema
402#[derive(Debug, Default)]
403pub struct SchemaBuilder {
404    fields: Vec<FieldEntry>,
405    default_fields: Vec<String>,
406    query_routers: Vec<QueryRouterRule>,
407}
408
409impl SchemaBuilder {
410    pub fn add_text_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
411        self.add_field_with_tokenizer(
412            name,
413            FieldType::Text,
414            indexed,
415            stored,
416            Some("simple".to_string()),
417        )
418    }
419
420    pub fn add_text_field_with_tokenizer(
421        &mut self,
422        name: &str,
423        indexed: bool,
424        stored: bool,
425        tokenizer: &str,
426    ) -> Field {
427        self.add_field_with_tokenizer(
428            name,
429            FieldType::Text,
430            indexed,
431            stored,
432            Some(tokenizer.to_string()),
433        )
434    }
435
436    pub fn add_u64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
437        self.add_field(name, FieldType::U64, indexed, stored)
438    }
439
440    pub fn add_i64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
441        self.add_field(name, FieldType::I64, indexed, stored)
442    }
443
444    pub fn add_f64_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
445        self.add_field(name, FieldType::F64, indexed, stored)
446    }
447
448    pub fn add_bytes_field(&mut self, name: &str, stored: bool) -> Field {
449        self.add_field(name, FieldType::Bytes, false, stored)
450    }
451
452    /// Add a JSON field for storing arbitrary JSON data
453    ///
454    /// JSON fields are never indexed, only stored. They can hold any valid JSON value
455    /// (objects, arrays, strings, numbers, booleans, null).
456    pub fn add_json_field(&mut self, name: &str, stored: bool) -> Field {
457        self.add_field(name, FieldType::Json, false, stored)
458    }
459
460    /// Add a sparse vector field with default configuration
461    ///
462    /// Sparse vectors are indexed as inverted posting lists where each dimension
463    /// becomes a "term" and documents have quantized weights for each dimension.
464    pub fn add_sparse_vector_field(&mut self, name: &str, indexed: bool, stored: bool) -> Field {
465        self.add_sparse_vector_field_with_config(
466            name,
467            indexed,
468            stored,
469            crate::structures::SparseVectorConfig::default(),
470        )
471    }
472
473    /// Add a sparse vector field with custom configuration
474    ///
475    /// Use `SparseVectorConfig::splade()` for SPLADE models (u16 indices, uint8 weights).
476    /// Use `SparseVectorConfig::compact()` for maximum compression (u16 indices, uint4 weights).
477    pub fn add_sparse_vector_field_with_config(
478        &mut self,
479        name: &str,
480        indexed: bool,
481        stored: bool,
482        config: crate::structures::SparseVectorConfig,
483    ) -> Field {
484        let field = Field(self.fields.len() as u32);
485        self.fields.push(FieldEntry {
486            name: name.to_string(),
487            field_type: FieldType::SparseVector,
488            indexed,
489            stored,
490            tokenizer: None,
491            multi: false,
492            positions: None,
493            sparse_vector_config: Some(config),
494            dense_vector_config: None,
495            fast: false,
496            primary_key: false,
497            reorder: false,
498        });
499        field
500    }
501
502    /// Set sparse vector configuration for an existing field
503    pub fn set_sparse_vector_config(
504        &mut self,
505        field: Field,
506        config: crate::structures::SparseVectorConfig,
507    ) {
508        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
509            entry.sparse_vector_config = Some(config);
510        }
511    }
512
513    /// Add a dense vector field with default configuration
514    ///
515    /// Dense vectors are indexed using RaBitQ binary quantization for fast ANN search.
516    /// The dimension must be specified as it determines the quantization structure.
517    pub fn add_dense_vector_field(
518        &mut self,
519        name: &str,
520        dim: usize,
521        indexed: bool,
522        stored: bool,
523    ) -> Field {
524        self.add_dense_vector_field_with_config(name, indexed, stored, DenseVectorConfig::new(dim))
525    }
526
527    /// Add a dense vector field with custom configuration
528    pub fn add_dense_vector_field_with_config(
529        &mut self,
530        name: &str,
531        indexed: bool,
532        stored: bool,
533        config: DenseVectorConfig,
534    ) -> Field {
535        let field = Field(self.fields.len() as u32);
536        self.fields.push(FieldEntry {
537            name: name.to_string(),
538            field_type: FieldType::DenseVector,
539            indexed,
540            stored,
541            tokenizer: None,
542            multi: false,
543            positions: None,
544            sparse_vector_config: None,
545            dense_vector_config: Some(config),
546            fast: false,
547            primary_key: false,
548            reorder: false,
549        });
550        field
551    }
552
553    fn add_field(
554        &mut self,
555        name: &str,
556        field_type: FieldType,
557        indexed: bool,
558        stored: bool,
559    ) -> Field {
560        self.add_field_with_tokenizer(name, field_type, indexed, stored, None)
561    }
562
563    fn add_field_with_tokenizer(
564        &mut self,
565        name: &str,
566        field_type: FieldType,
567        indexed: bool,
568        stored: bool,
569        tokenizer: Option<String>,
570    ) -> Field {
571        self.add_field_full(name, field_type, indexed, stored, tokenizer, false)
572    }
573
574    fn add_field_full(
575        &mut self,
576        name: &str,
577        field_type: FieldType,
578        indexed: bool,
579        stored: bool,
580        tokenizer: Option<String>,
581        multi: bool,
582    ) -> Field {
583        let field = Field(self.fields.len() as u32);
584        self.fields.push(FieldEntry {
585            name: name.to_string(),
586            field_type,
587            indexed,
588            stored,
589            tokenizer,
590            multi,
591            positions: None,
592            sparse_vector_config: None,
593            dense_vector_config: None,
594            fast: false,
595            primary_key: false,
596            reorder: false,
597        });
598        field
599    }
600
601    /// Set the multi attribute on the last added field
602    pub fn set_multi(&mut self, field: Field, multi: bool) {
603        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
604            entry.multi = multi;
605        }
606    }
607
608    /// Set fast-field columnar storage for O(1) doc→value access.
609    /// Valid for u64, i64, f64, and text fields.
610    pub fn set_fast(&mut self, field: Field, fast: bool) {
611        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
612            entry.fast = fast;
613        }
614    }
615
616    /// Mark a field as the primary key (unique constraint)
617    pub fn set_primary_key(&mut self, field: Field) {
618        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
619            entry.primary_key = true;
620        }
621    }
622
623    /// Enable build-time document reordering (Recursive Graph Bisection) for BMP fields
624    pub fn set_reorder(&mut self, field: Field, reorder: bool) {
625        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
626            entry.reorder = reorder;
627        }
628    }
629
630    /// Set position tracking mode for phrase queries and multi-field element tracking
631    pub fn set_positions(&mut self, field: Field, mode: PositionMode) {
632        if let Some(entry) = self.fields.get_mut(field.0 as usize) {
633            entry.positions = Some(mode);
634        }
635    }
636
637    /// Set default fields by name
638    pub fn set_default_fields(&mut self, field_names: Vec<String>) {
639        self.default_fields = field_names;
640    }
641
642    /// Set query router rules
643    pub fn set_query_routers(&mut self, rules: Vec<QueryRouterRule>) {
644        self.query_routers = rules;
645    }
646
647    pub fn build(self) -> Schema {
648        let mut name_to_field = HashMap::new();
649        for (i, entry) in self.fields.iter().enumerate() {
650            name_to_field.insert(entry.name.clone(), Field(i as u32));
651        }
652
653        // Resolve default field names to Field IDs
654        let default_fields: Vec<Field> = self
655            .default_fields
656            .iter()
657            .filter_map(|name| name_to_field.get(name).copied())
658            .collect();
659
660        Schema {
661            fields: self.fields,
662            name_to_field,
663            default_fields,
664            query_routers: self.query_routers,
665        }
666    }
667}
668
669/// Value that can be stored in a field
670#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
671pub enum FieldValue {
672    #[serde(rename = "text")]
673    Text(String),
674    #[serde(rename = "u64")]
675    U64(u64),
676    #[serde(rename = "i64")]
677    I64(i64),
678    #[serde(rename = "f64")]
679    F64(f64),
680    #[serde(rename = "bytes")]
681    Bytes(Vec<u8>),
682    /// Sparse vector: list of (dimension_id, weight) pairs
683    #[serde(rename = "sparse_vector")]
684    SparseVector(Vec<(u32, f32)>),
685    /// Dense vector: float32 values
686    #[serde(rename = "dense_vector")]
687    DenseVector(Vec<f32>),
688    /// Arbitrary JSON value
689    #[serde(rename = "json")]
690    Json(serde_json::Value),
691}
692
693impl FieldValue {
694    pub fn as_text(&self) -> Option<&str> {
695        match self {
696            FieldValue::Text(s) => Some(s),
697            _ => None,
698        }
699    }
700
701    pub fn as_u64(&self) -> Option<u64> {
702        match self {
703            FieldValue::U64(v) => Some(*v),
704            _ => None,
705        }
706    }
707
708    pub fn as_i64(&self) -> Option<i64> {
709        match self {
710            FieldValue::I64(v) => Some(*v),
711            _ => None,
712        }
713    }
714
715    pub fn as_f64(&self) -> Option<f64> {
716        match self {
717            FieldValue::F64(v) => Some(*v),
718            _ => None,
719        }
720    }
721
722    pub fn as_bytes(&self) -> Option<&[u8]> {
723        match self {
724            FieldValue::Bytes(b) => Some(b),
725            _ => None,
726        }
727    }
728
729    pub fn as_sparse_vector(&self) -> Option<&[(u32, f32)]> {
730        match self {
731            FieldValue::SparseVector(entries) => Some(entries),
732            _ => None,
733        }
734    }
735
736    pub fn as_dense_vector(&self) -> Option<&[f32]> {
737        match self {
738            FieldValue::DenseVector(v) => Some(v),
739            _ => None,
740        }
741    }
742
743    pub fn as_json(&self) -> Option<&serde_json::Value> {
744        match self {
745            FieldValue::Json(v) => Some(v),
746            _ => None,
747        }
748    }
749}
750
751/// A document to be indexed
752#[derive(Debug, Clone, Default, Serialize, Deserialize)]
753pub struct Document {
754    field_values: Vec<(Field, FieldValue)>,
755}
756
757impl Document {
758    pub fn new() -> Self {
759        Self::default()
760    }
761
762    pub fn add_text(&mut self, field: Field, value: impl Into<String>) {
763        self.field_values
764            .push((field, FieldValue::Text(value.into())));
765    }
766
767    pub fn add_u64(&mut self, field: Field, value: u64) {
768        self.field_values.push((field, FieldValue::U64(value)));
769    }
770
771    pub fn add_i64(&mut self, field: Field, value: i64) {
772        self.field_values.push((field, FieldValue::I64(value)));
773    }
774
775    pub fn add_f64(&mut self, field: Field, value: f64) {
776        self.field_values.push((field, FieldValue::F64(value)));
777    }
778
779    pub fn add_bytes(&mut self, field: Field, value: Vec<u8>) {
780        self.field_values.push((field, FieldValue::Bytes(value)));
781    }
782
783    pub fn add_sparse_vector(&mut self, field: Field, entries: Vec<(u32, f32)>) {
784        self.field_values
785            .push((field, FieldValue::SparseVector(entries)));
786    }
787
788    pub fn add_dense_vector(&mut self, field: Field, values: Vec<f32>) {
789        self.field_values
790            .push((field, FieldValue::DenseVector(values)));
791    }
792
793    pub fn add_json(&mut self, field: Field, value: serde_json::Value) {
794        self.field_values.push((field, FieldValue::Json(value)));
795    }
796
797    pub fn get_first(&self, field: Field) -> Option<&FieldValue> {
798        self.field_values
799            .iter()
800            .find(|(f, _)| *f == field)
801            .map(|(_, v)| v)
802    }
803
804    pub fn get_all(&self, field: Field) -> impl Iterator<Item = &FieldValue> {
805        self.field_values
806            .iter()
807            .filter(move |(f, _)| *f == field)
808            .map(|(_, v)| v)
809    }
810
811    pub fn field_values(&self) -> &[(Field, FieldValue)] {
812        &self.field_values
813    }
814
815    /// Return a new Document containing only fields marked as `stored` in the schema
816    pub fn filter_stored(&self, schema: &Schema) -> Document {
817        Document {
818            field_values: self
819                .field_values
820                .iter()
821                .filter(|(field, _)| {
822                    schema
823                        .get_field_entry(*field)
824                        .is_some_and(|entry| entry.stored)
825                })
826                .cloned()
827                .collect(),
828        }
829    }
830
831    /// Convert document to a JSON object using field names from schema
832    ///
833    /// Fields marked as `multi` in the schema are always returned as JSON arrays.
834    /// Other fields with multiple values are also returned as arrays.
835    /// Fields with a single value (and not marked multi) are returned as scalar values.
836    pub fn to_json(&self, schema: &Schema) -> serde_json::Value {
837        use std::collections::HashMap;
838
839        // Group values by field, keeping track of field entry for multi check
840        let mut field_values_map: HashMap<Field, (String, bool, Vec<serde_json::Value>)> =
841            HashMap::new();
842
843        for (field, value) in &self.field_values {
844            if let Some(entry) = schema.get_field_entry(*field) {
845                let json_value = match value {
846                    FieldValue::Text(s) => serde_json::Value::String(s.clone()),
847                    FieldValue::U64(n) => serde_json::Value::Number((*n).into()),
848                    FieldValue::I64(n) => serde_json::Value::Number((*n).into()),
849                    FieldValue::F64(n) => serde_json::json!(n),
850                    FieldValue::Bytes(b) => {
851                        use base64::Engine;
852                        serde_json::Value::String(
853                            base64::engine::general_purpose::STANDARD.encode(b),
854                        )
855                    }
856                    FieldValue::SparseVector(entries) => {
857                        let indices: Vec<u32> = entries.iter().map(|(i, _)| *i).collect();
858                        let values: Vec<f32> = entries.iter().map(|(_, v)| *v).collect();
859                        serde_json::json!({
860                            "indices": indices,
861                            "values": values
862                        })
863                    }
864                    FieldValue::DenseVector(values) => {
865                        serde_json::json!(values)
866                    }
867                    FieldValue::Json(v) => v.clone(),
868                };
869                field_values_map
870                    .entry(*field)
871                    .or_insert_with(|| (entry.name.clone(), entry.multi, Vec::new()))
872                    .2
873                    .push(json_value);
874            }
875        }
876
877        // Convert to JSON object, using arrays for multi fields or when multiple values exist
878        let mut map = serde_json::Map::new();
879        for (_field, (name, is_multi, values)) in field_values_map {
880            let json_value = if is_multi || values.len() > 1 {
881                serde_json::Value::Array(values)
882            } else {
883                values.into_iter().next().unwrap()
884            };
885            map.insert(name, json_value);
886        }
887
888        serde_json::Value::Object(map)
889    }
890
891    /// Create a Document from a JSON object using field names from schema
892    ///
893    /// Supports:
894    /// - String values -> Text fields
895    /// - Number values -> U64/I64/F64 fields (based on schema type)
896    /// - Array values -> Multiple values for the same field (multifields)
897    ///
898    /// Unknown fields (not in schema) are silently ignored.
899    pub fn from_json(json: &serde_json::Value, schema: &Schema) -> Option<Self> {
900        let obj = json.as_object()?;
901        let mut doc = Document::new();
902
903        for (key, value) in obj {
904            if let Some(field) = schema.get_field(key) {
905                let field_entry = schema.get_field_entry(field)?;
906                Self::add_json_value(&mut doc, field, &field_entry.field_type, value);
907            }
908        }
909
910        Some(doc)
911    }
912
913    /// Helper to add a JSON value to a document, handling type conversion
914    fn add_json_value(
915        doc: &mut Document,
916        field: Field,
917        field_type: &FieldType,
918        value: &serde_json::Value,
919    ) {
920        match value {
921            serde_json::Value::String(s) => {
922                if matches!(field_type, FieldType::Text) {
923                    doc.add_text(field, s.clone());
924                }
925            }
926            serde_json::Value::Number(n) => {
927                match field_type {
928                    FieldType::I64 => {
929                        if let Some(i) = n.as_i64() {
930                            doc.add_i64(field, i);
931                        }
932                    }
933                    FieldType::U64 => {
934                        if let Some(u) = n.as_u64() {
935                            doc.add_u64(field, u);
936                        } else if let Some(i) = n.as_i64() {
937                            // Allow positive i64 as u64
938                            if i >= 0 {
939                                doc.add_u64(field, i as u64);
940                            }
941                        }
942                    }
943                    FieldType::F64 => {
944                        if let Some(f) = n.as_f64() {
945                            doc.add_f64(field, f);
946                        }
947                    }
948                    _ => {}
949                }
950            }
951            // Handle arrays (multifields) - add each element separately
952            serde_json::Value::Array(arr) => {
953                for item in arr {
954                    Self::add_json_value(doc, field, field_type, item);
955                }
956            }
957            // Handle sparse vector objects
958            serde_json::Value::Object(obj) if matches!(field_type, FieldType::SparseVector) => {
959                if let (Some(indices_val), Some(values_val)) =
960                    (obj.get("indices"), obj.get("values"))
961                {
962                    let indices: Vec<u32> = indices_val
963                        .as_array()
964                        .map(|arr| {
965                            arr.iter()
966                                .filter_map(|v| v.as_u64().map(|n| n as u32))
967                                .collect()
968                        })
969                        .unwrap_or_default();
970                    let values: Vec<f32> = values_val
971                        .as_array()
972                        .map(|arr| {
973                            arr.iter()
974                                .filter_map(|v| v.as_f64().map(|n| n as f32))
975                                .collect()
976                        })
977                        .unwrap_or_default();
978                    if indices.len() == values.len() {
979                        let entries: Vec<(u32, f32)> = indices.into_iter().zip(values).collect();
980                        doc.add_sparse_vector(field, entries);
981                    }
982                }
983            }
984            // Handle JSON fields - accept any value directly
985            _ if matches!(field_type, FieldType::Json) => {
986                doc.add_json(field, value.clone());
987            }
988            serde_json::Value::Object(_) => {}
989            _ => {}
990        }
991    }
992}
993
994#[cfg(test)]
995mod tests {
996    use super::*;
997
998    #[test]
999    fn test_schema_builder() {
1000        let mut builder = Schema::builder();
1001        let title = builder.add_text_field("title", true, true);
1002        let body = builder.add_text_field("body", true, false);
1003        let count = builder.add_u64_field("count", true, true);
1004        let schema = builder.build();
1005
1006        assert_eq!(schema.get_field("title"), Some(title));
1007        assert_eq!(schema.get_field("body"), Some(body));
1008        assert_eq!(schema.get_field("count"), Some(count));
1009        assert_eq!(schema.get_field("nonexistent"), None);
1010    }
1011
1012    #[test]
1013    fn test_document() {
1014        let mut builder = Schema::builder();
1015        let title = builder.add_text_field("title", true, true);
1016        let count = builder.add_u64_field("count", true, true);
1017        let _schema = builder.build();
1018
1019        let mut doc = Document::new();
1020        doc.add_text(title, "Hello World");
1021        doc.add_u64(count, 42);
1022
1023        assert_eq!(doc.get_first(title).unwrap().as_text(), Some("Hello World"));
1024        assert_eq!(doc.get_first(count).unwrap().as_u64(), Some(42));
1025    }
1026
1027    #[test]
1028    fn test_document_serialization() {
1029        let mut builder = Schema::builder();
1030        let title = builder.add_text_field("title", true, true);
1031        let count = builder.add_u64_field("count", true, true);
1032        let _schema = builder.build();
1033
1034        let mut doc = Document::new();
1035        doc.add_text(title, "Hello World");
1036        doc.add_u64(count, 42);
1037
1038        // Serialize
1039        let json = serde_json::to_string(&doc).unwrap();
1040        println!("Serialized doc: {}", json);
1041
1042        // Deserialize
1043        let doc2: Document = serde_json::from_str(&json).unwrap();
1044        assert_eq!(
1045            doc2.field_values().len(),
1046            2,
1047            "Should have 2 field values after deserialization"
1048        );
1049        assert_eq!(
1050            doc2.get_first(title).unwrap().as_text(),
1051            Some("Hello World")
1052        );
1053        assert_eq!(doc2.get_first(count).unwrap().as_u64(), Some(42));
1054    }
1055
1056    #[test]
1057    fn test_multivalue_field() {
1058        let mut builder = Schema::builder();
1059        let uris = builder.add_text_field("uris", true, true);
1060        let title = builder.add_text_field("title", true, true);
1061        let schema = builder.build();
1062
1063        // Create document with multiple values for the same field
1064        let mut doc = Document::new();
1065        doc.add_text(uris, "one");
1066        doc.add_text(uris, "two");
1067        doc.add_text(title, "Test Document");
1068
1069        // Verify get_first returns the first value
1070        assert_eq!(doc.get_first(uris).unwrap().as_text(), Some("one"));
1071
1072        // Verify get_all returns all values
1073        let all_uris: Vec<_> = doc.get_all(uris).collect();
1074        assert_eq!(all_uris.len(), 2);
1075        assert_eq!(all_uris[0].as_text(), Some("one"));
1076        assert_eq!(all_uris[1].as_text(), Some("two"));
1077
1078        // Verify to_json returns array for multi-value field
1079        let json = doc.to_json(&schema);
1080        let uris_json = json.get("uris").unwrap();
1081        assert!(uris_json.is_array(), "Multi-value field should be an array");
1082        let uris_arr = uris_json.as_array().unwrap();
1083        assert_eq!(uris_arr.len(), 2);
1084        assert_eq!(uris_arr[0].as_str(), Some("one"));
1085        assert_eq!(uris_arr[1].as_str(), Some("two"));
1086
1087        // Verify single-value field is NOT an array
1088        let title_json = json.get("title").unwrap();
1089        assert!(
1090            title_json.is_string(),
1091            "Single-value field should be a string"
1092        );
1093        assert_eq!(title_json.as_str(), Some("Test Document"));
1094    }
1095
1096    #[test]
1097    fn test_multivalue_from_json() {
1098        let mut builder = Schema::builder();
1099        let uris = builder.add_text_field("uris", true, true);
1100        let title = builder.add_text_field("title", true, true);
1101        let schema = builder.build();
1102
1103        // Create JSON with array value
1104        let json = serde_json::json!({
1105            "uris": ["one", "two"],
1106            "title": "Test Document"
1107        });
1108
1109        // Parse from JSON
1110        let doc = Document::from_json(&json, &schema).unwrap();
1111
1112        // Verify all values are present
1113        let all_uris: Vec<_> = doc.get_all(uris).collect();
1114        assert_eq!(all_uris.len(), 2);
1115        assert_eq!(all_uris[0].as_text(), Some("one"));
1116        assert_eq!(all_uris[1].as_text(), Some("two"));
1117
1118        // Verify single value
1119        assert_eq!(
1120            doc.get_first(title).unwrap().as_text(),
1121            Some("Test Document")
1122        );
1123
1124        // Verify roundtrip: to_json should produce equivalent JSON
1125        let json_out = doc.to_json(&schema);
1126        let uris_out = json_out.get("uris").unwrap().as_array().unwrap();
1127        assert_eq!(uris_out.len(), 2);
1128        assert_eq!(uris_out[0].as_str(), Some("one"));
1129        assert_eq!(uris_out[1].as_str(), Some("two"));
1130    }
1131
1132    #[test]
1133    fn test_multi_attribute_forces_array() {
1134        // Test that fields marked as 'multi' are always serialized as arrays,
1135        // even when they have only one value
1136        let mut builder = Schema::builder();
1137        let uris = builder.add_text_field("uris", true, true);
1138        builder.set_multi(uris, true); // Mark as multi
1139        let title = builder.add_text_field("title", true, true);
1140        let schema = builder.build();
1141
1142        // Verify the multi attribute is set
1143        assert!(schema.get_field_entry(uris).unwrap().multi);
1144        assert!(!schema.get_field_entry(title).unwrap().multi);
1145
1146        // Create document with single value for multi field
1147        let mut doc = Document::new();
1148        doc.add_text(uris, "only_one");
1149        doc.add_text(title, "Test Document");
1150
1151        // Verify to_json returns array for multi field even with single value
1152        let json = doc.to_json(&schema);
1153
1154        let uris_json = json.get("uris").unwrap();
1155        assert!(
1156            uris_json.is_array(),
1157            "Multi field should be array even with single value"
1158        );
1159        let uris_arr = uris_json.as_array().unwrap();
1160        assert_eq!(uris_arr.len(), 1);
1161        assert_eq!(uris_arr[0].as_str(), Some("only_one"));
1162
1163        // Verify non-multi field with single value is NOT an array
1164        let title_json = json.get("title").unwrap();
1165        assert!(
1166            title_json.is_string(),
1167            "Non-multi single-value field should be a string"
1168        );
1169        assert_eq!(title_json.as_str(), Some("Test Document"));
1170    }
1171
1172    #[test]
1173    fn test_sparse_vector_field() {
1174        let mut builder = Schema::builder();
1175        let embedding = builder.add_sparse_vector_field("embedding", true, true);
1176        let title = builder.add_text_field("title", true, true);
1177        let schema = builder.build();
1178
1179        assert_eq!(schema.get_field("embedding"), Some(embedding));
1180        assert_eq!(
1181            schema.get_field_entry(embedding).unwrap().field_type,
1182            FieldType::SparseVector
1183        );
1184
1185        // Create document with sparse vector
1186        let mut doc = Document::new();
1187        doc.add_sparse_vector(embedding, vec![(0, 1.0), (5, 2.5), (10, 0.5)]);
1188        doc.add_text(title, "Test Document");
1189
1190        // Verify accessor
1191        let entries = doc
1192            .get_first(embedding)
1193            .unwrap()
1194            .as_sparse_vector()
1195            .unwrap();
1196        assert_eq!(entries, &[(0, 1.0), (5, 2.5), (10, 0.5)]);
1197
1198        // Verify JSON roundtrip
1199        let json = doc.to_json(&schema);
1200        let embedding_json = json.get("embedding").unwrap();
1201        assert!(embedding_json.is_object());
1202        assert_eq!(
1203            embedding_json
1204                .get("indices")
1205                .unwrap()
1206                .as_array()
1207                .unwrap()
1208                .len(),
1209            3
1210        );
1211
1212        // Parse back from JSON
1213        let doc2 = Document::from_json(&json, &schema).unwrap();
1214        let entries2 = doc2
1215            .get_first(embedding)
1216            .unwrap()
1217            .as_sparse_vector()
1218            .unwrap();
1219        assert_eq!(entries2[0].0, 0);
1220        assert!((entries2[0].1 - 1.0).abs() < 1e-6);
1221        assert_eq!(entries2[1].0, 5);
1222        assert!((entries2[1].1 - 2.5).abs() < 1e-6);
1223        assert_eq!(entries2[2].0, 10);
1224        assert!((entries2[2].1 - 0.5).abs() < 1e-6);
1225    }
1226
1227    #[test]
1228    fn test_json_field() {
1229        let mut builder = Schema::builder();
1230        let metadata = builder.add_json_field("metadata", true);
1231        let title = builder.add_text_field("title", true, true);
1232        let schema = builder.build();
1233
1234        assert_eq!(schema.get_field("metadata"), Some(metadata));
1235        assert_eq!(
1236            schema.get_field_entry(metadata).unwrap().field_type,
1237            FieldType::Json
1238        );
1239        // JSON fields are never indexed
1240        assert!(!schema.get_field_entry(metadata).unwrap().indexed);
1241        assert!(schema.get_field_entry(metadata).unwrap().stored);
1242
1243        // Create document with JSON value (object)
1244        let json_value = serde_json::json!({
1245            "author": "John Doe",
1246            "tags": ["rust", "search"],
1247            "nested": {"key": "value"}
1248        });
1249        let mut doc = Document::new();
1250        doc.add_json(metadata, json_value.clone());
1251        doc.add_text(title, "Test Document");
1252
1253        // Verify accessor
1254        let stored_json = doc.get_first(metadata).unwrap().as_json().unwrap();
1255        assert_eq!(stored_json, &json_value);
1256        assert_eq!(
1257            stored_json.get("author").unwrap().as_str(),
1258            Some("John Doe")
1259        );
1260
1261        // Verify JSON roundtrip via to_json/from_json
1262        let doc_json = doc.to_json(&schema);
1263        let metadata_out = doc_json.get("metadata").unwrap();
1264        assert_eq!(metadata_out, &json_value);
1265
1266        // Parse back from JSON
1267        let doc2 = Document::from_json(&doc_json, &schema).unwrap();
1268        let stored_json2 = doc2.get_first(metadata).unwrap().as_json().unwrap();
1269        assert_eq!(stored_json2, &json_value);
1270    }
1271
1272    #[test]
1273    fn test_json_field_various_types() {
1274        let mut builder = Schema::builder();
1275        let data = builder.add_json_field("data", true);
1276        let _schema = builder.build();
1277
1278        // Test with array
1279        let arr_value = serde_json::json!([1, 2, 3, "four", null]);
1280        let mut doc = Document::new();
1281        doc.add_json(data, arr_value.clone());
1282        assert_eq!(doc.get_first(data).unwrap().as_json().unwrap(), &arr_value);
1283
1284        // Test with string
1285        let str_value = serde_json::json!("just a string");
1286        let mut doc2 = Document::new();
1287        doc2.add_json(data, str_value.clone());
1288        assert_eq!(doc2.get_first(data).unwrap().as_json().unwrap(), &str_value);
1289
1290        // Test with number
1291        let num_value = serde_json::json!(42.5);
1292        let mut doc3 = Document::new();
1293        doc3.add_json(data, num_value.clone());
1294        assert_eq!(doc3.get_first(data).unwrap().as_json().unwrap(), &num_value);
1295
1296        // Test with null
1297        let null_value = serde_json::Value::Null;
1298        let mut doc4 = Document::new();
1299        doc4.add_json(data, null_value.clone());
1300        assert_eq!(
1301            doc4.get_first(data).unwrap().as_json().unwrap(),
1302            &null_value
1303        );
1304
1305        // Test with boolean
1306        let bool_value = serde_json::json!(true);
1307        let mut doc5 = Document::new();
1308        doc5.add_json(data, bool_value.clone());
1309        assert_eq!(
1310            doc5.get_first(data).unwrap().as_json().unwrap(),
1311            &bool_value
1312        );
1313    }
1314}