Skip to main content

dynomite/vector/
schema.rs

1//! Schema types for the vector subsystem.
2//!
3//! These types model the subset of the RediSearch FT.CREATE
4//! grammar that `dynomite::vector` will eventually parse and
5//! act on. Phase B (this commit) lands the data shapes; Phase
6//! C will add the FT.* command parser that builds them and
7//! routes through [`super::registry::VectorRegistry`].
8//!
//! The schema types are intentionally distinct from the engine
//! types in [`dynvec`]. The engine types are storage-level (a
//! [`dynvec::Codec`] is a storage codec, a [`dynvec::Distance`]
//! is a scoring function); the schema types are the
//! protocol-level shape clients send over the wire. The
//! conversion methods on [`VectorType`] and [`DistanceMetric`]
//! turn one into the other; [`IndexAlgorithm`] has no such
//! conversion yet.
16
17use serde::{Deserialize, Serialize};
18
19use dynvec::distance::Distance as EngineDistance;
20use dynvec::encoding::Codec as EngineCodec;
21
22/// RediSearch-flavoured vector field type.
23///
24/// Today the engine supports `Float32` (mapped to
25/// [`EngineCodec::Fp16`] on disk; the API is float32-in /
26/// float32-out) and `Float16` (the same Fp16 codec exposed
27/// directly). `Int8` is reserved for the
28/// [`EngineCodec::Int8Quantized`] path. Future codecs (PQ, BQ)
29/// will land as new variants.
30#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32#[non_exhaustive]
33pub enum VectorType {
34    /// IEEE 754 single-precision floats (RediSearch `FLOAT32`).
35    Float32,
36    /// IEEE 754 half-precision floats (RediSearch `FLOAT16`).
37    Float16,
38    /// Per-vector int8 quantisation. Not in stock RediSearch
39    /// but exposed under our extension namespace.
40    Int8,
41}
42
43impl VectorType {
44    /// Map to the storage-layer codec.
45    #[must_use]
46    pub fn engine_codec(self) -> EngineCodec {
47        match self {
48            Self::Float32 | Self::Float16 => EngineCodec::Fp16,
49            Self::Int8 => EngineCodec::Int8Quantized,
50        }
51    }
52}
53
54/// RediSearch distance metric.
55///
56/// Mirrors the wire-level keywords (`L2`, `IP`, `COSINE`).
57#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
58#[serde(rename_all = "snake_case")]
59#[non_exhaustive]
60pub enum DistanceMetric {
61    /// Squared L2 distance.
62    L2,
63    /// Inner product (negated dot product, smaller-is-closer).
64    InnerProduct,
65    /// Cosine distance.
66    Cosine,
67}
68
69impl DistanceMetric {
70    /// Map to the engine's distance enum.
71    #[must_use]
72    pub fn engine_distance(self) -> EngineDistance {
73        match self {
74            Self::L2 => EngineDistance::Euclidean,
75            Self::InnerProduct => EngineDistance::DotProduct,
76            Self::Cosine => EngineDistance::Cosine,
77        }
78    }
79}
80
81/// Index algorithm.
82///
83/// RediSearch ships HNSW and FLAT (brute-force). This crate
84/// only implements HNSW today; the FLAT variant exists so the
85/// FT.CREATE parser can record the operator's request and the
86/// command handler can return a `not-implemented` error
87/// rather than silently downgrade.
88#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
89#[serde(rename_all = "snake_case")]
90#[non_exhaustive]
91pub enum IndexAlgorithm {
92    /// Hierarchical navigable small world graph.
93    Hnsw,
94    /// Brute-force scan (every vector compared on every
95    /// query).
96    Flat,
97}
98
99/// RediSearch metadata field type.
100///
101/// FT.CREATE accepts schema fields that decorate the indexed
102/// document with searchable metadata. Phase B (this commit)
103/// records the field set; Phase C will wire the per-field
104/// search semantics into the FT.SEARCH evaluator.
105#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
106#[serde(rename_all = "snake_case")]
107#[non_exhaustive]
108pub enum MetadataFieldType {
109    /// Tokenised free text (RediSearch `TEXT`).
110    Text,
111    /// Numeric range (RediSearch `NUMERIC`).
112    Numeric,
113    /// Discrete tag (RediSearch `TAG`).
114    Tag,
115    /// Geographic coordinate (RediSearch `GEO`).
116    Geo,
117}
118
119/// One non-vector schema field.
120#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
121pub struct MetadataField {
122    /// Field name on the underlying hash document.
123    pub name: String,
124    /// Field type.
125    pub field_type: MetadataFieldType,
126    /// Per-tag separator for `TAG` fields. `None` selects the
127    /// RediSearch default (`,`). Ignored for non-`TAG` field
128    /// types. The separator is a single ASCII byte; the
129    /// FT.CREATE parser rejects multi-byte strings.
130    #[serde(default)]
131    pub tag_separator: Option<u8>,
132}
133
134impl MetadataField {
135    /// Effective TAG separator. Returns the configured value
136    /// for `TAG` fields, or `b','` (the RediSearch default)
137    /// when none was supplied. Defined for non-`TAG` fields
138    /// too so callers can use it uniformly; the value is
139    /// meaningless outside the `TAG` path.
140    #[must_use]
141    pub fn effective_tag_separator(&self) -> u8 {
142        self.tag_separator.unwrap_or(b',')
143    }
144}
145
146/// Compiled FT.CREATE schema.
147///
148/// One [`VectorSchema`] describes one index: the vector field,
149/// its dimension and codec, the distance metric, the chosen
150/// algorithm, the prefix set (FT.CREATE `PREFIX <n> ...`), and
151/// the set of non-vector metadata fields.
152#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
153pub struct VectorSchema {
154    /// Name of the document field that carries the vector.
155    pub vector_field: String,
156    /// Codec.
157    pub vector_type: VectorType,
158    /// Frozen vector dimension.
159    pub dim: u16,
160    /// Distance metric.
161    pub distance: DistanceMetric,
162    /// Index algorithm.
163    pub algorithm: IndexAlgorithm,
164    /// Document key prefixes (`FT.CREATE ... PREFIX <n>
165    /// <prefix>...`). HSET commands whose key starts with any
166    /// of these prefixes get intercepted and indexed.
167    pub prefixes: Vec<Vec<u8>>,
168    /// Non-vector schema fields.
169    pub metadata_fields: Vec<MetadataField>,
170}
171
172impl VectorSchema {
173    /// Build the corresponding [`dynvec::TableSchema`] for a
174    /// table named `table_name`.
175    ///
176    /// Picks default HNSW tuning; future revisions will pull
177    /// the parameters out of the FT.CREATE clause.
178    #[must_use]
179    pub fn to_engine_schema(&self, table_name: &str) -> dynvec::storage::TableSchema {
180        dynvec::storage::TableSchema {
181            name: table_name.to_string(),
182            dim: self.dim,
183            codec: self.vector_type.engine_codec(),
184            distance: self.distance.engine_distance(),
185            hnsw: dynvec::index::HnswParams::default(),
186        }
187    }
188}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// Each vector type resolves to its expected storage codec.
    #[test]
    fn vector_type_maps_to_engine_codec() {
        let cases = [
            (VectorType::Float32, EngineCodec::Fp16),
            (VectorType::Float16, EngineCodec::Fp16),
            (VectorType::Int8, EngineCodec::Int8Quantized),
        ];
        for (vector_type, expected) in cases {
            assert_eq!(vector_type.engine_codec(), expected);
        }
    }

    /// Each distance metric resolves to its expected engine
    /// distance.
    #[test]
    fn distance_metric_maps_to_engine_distance() {
        let cases = [
            (DistanceMetric::L2, EngineDistance::Euclidean),
            (DistanceMetric::InnerProduct, EngineDistance::DotProduct),
            (DistanceMetric::Cosine, EngineDistance::Cosine),
        ];
        for (metric, expected) in cases {
            assert_eq!(metric.engine_distance(), expected);
        }
    }

    /// A full schema carries its name, dimension, codec, and
    /// distance over to the engine schema.
    #[test]
    fn schema_compiles_to_engine_schema() {
        let title_field = MetadataField {
            name: String::from("title"),
            field_type: MetadataFieldType::Text,
            tag_separator: None,
        };
        let schema = VectorSchema {
            vector_field: String::from("vec"),
            vector_type: VectorType::Float32,
            dim: 16,
            distance: DistanceMetric::Cosine,
            algorithm: IndexAlgorithm::Hnsw,
            prefixes: Vec::new(),
            metadata_fields: vec![title_field],
        };

        let engine = schema.to_engine_schema("docs");

        assert_eq!(engine.name, "docs");
        assert_eq!(engine.dim, 16);
        assert_eq!(engine.codec, EngineCodec::Fp16);
        assert_eq!(engine.distance, EngineDistance::Cosine);
    }
}