dynomite/vector/schema.rs
1//! Schema types for the vector subsystem.
2//!
3//! These types model the subset of the RediSearch FT.CREATE
4//! grammar that `dynomite::vector` will eventually parse and
5//! act on. Phase B (this commit) lands the data shapes; Phase
6//! C will add the FT.* command parser that builds them and
7//! routes through [`super::registry::VectorRegistry`].
8//!
9//! The schema types are intentionally distinct from the engine
10//! types in [`dynvec`]. The engine types are storage-level (a
11//! [`dynvec::Codec`] is a storage codec, a [`dynvec::Distance`]
12//! is a scoring function); the schema types are the
13//! protocol-level shape clients send over the wire. The
//! conversion functions on [`VectorType`] and [`DistanceMetric`]
//! turn one into the other; [`IndexAlgorithm`] has no engine-side
//! conversion in this module.
16
17use serde::{Deserialize, Serialize};
18
19use dynvec::distance::Distance as EngineDistance;
20use dynvec::encoding::Codec as EngineCodec;
21
22/// RediSearch-flavoured vector field type.
23///
24/// Today the engine supports `Float32` (mapped to
25/// [`EngineCodec::Fp16`] on disk; the API is float32-in /
26/// float32-out) and `Float16` (the same Fp16 codec exposed
27/// directly). `Int8` is reserved for the
28/// [`EngineCodec::Int8Quantized`] path. Future codecs (PQ, BQ)
29/// will land as new variants.
30#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
31#[serde(rename_all = "snake_case")]
32#[non_exhaustive]
33pub enum VectorType {
34 /// IEEE 754 single-precision floats (RediSearch `FLOAT32`).
35 Float32,
36 /// IEEE 754 half-precision floats (RediSearch `FLOAT16`).
37 Float16,
38 /// Per-vector int8 quantisation. Not in stock RediSearch
39 /// but exposed under our extension namespace.
40 Int8,
41}
42
43impl VectorType {
44 /// Map to the storage-layer codec.
45 #[must_use]
46 pub fn engine_codec(self) -> EngineCodec {
47 match self {
48 Self::Float32 | Self::Float16 => EngineCodec::Fp16,
49 Self::Int8 => EngineCodec::Int8Quantized,
50 }
51 }
52}
53
54/// RediSearch distance metric.
55///
56/// Mirrors the wire-level keywords (`L2`, `IP`, `COSINE`).
57#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
58#[serde(rename_all = "snake_case")]
59#[non_exhaustive]
60pub enum DistanceMetric {
61 /// Squared L2 distance.
62 L2,
63 /// Inner product (negated dot product, smaller-is-closer).
64 InnerProduct,
65 /// Cosine distance.
66 Cosine,
67}
68
69impl DistanceMetric {
70 /// Map to the engine's distance enum.
71 #[must_use]
72 pub fn engine_distance(self) -> EngineDistance {
73 match self {
74 Self::L2 => EngineDistance::Euclidean,
75 Self::InnerProduct => EngineDistance::DotProduct,
76 Self::Cosine => EngineDistance::Cosine,
77 }
78 }
79}
80
81/// Index algorithm.
82///
83/// RediSearch ships HNSW and FLAT (brute-force). This crate
84/// only implements HNSW today; the FLAT variant exists so the
85/// FT.CREATE parser can record the operator's request and the
86/// command handler can return a `not-implemented` error
87/// rather than silently downgrade.
88#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
89#[serde(rename_all = "snake_case")]
90#[non_exhaustive]
91pub enum IndexAlgorithm {
92 /// Hierarchical navigable small world graph.
93 Hnsw,
94 /// Brute-force scan (every vector compared on every
95 /// query).
96 Flat,
97}
98
99/// RediSearch metadata field type.
100///
101/// FT.CREATE accepts schema fields that decorate the indexed
102/// document with searchable metadata. Phase B (this commit)
103/// records the field set; Phase C will wire the per-field
104/// search semantics into the FT.SEARCH evaluator.
105#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, Deserialize)]
106#[serde(rename_all = "snake_case")]
107#[non_exhaustive]
108pub enum MetadataFieldType {
109 /// Tokenised free text (RediSearch `TEXT`).
110 Text,
111 /// Numeric range (RediSearch `NUMERIC`).
112 Numeric,
113 /// Discrete tag (RediSearch `TAG`).
114 Tag,
115 /// Geographic coordinate (RediSearch `GEO`).
116 Geo,
117}
118
119/// One non-vector schema field.
120#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
121pub struct MetadataField {
122 /// Field name on the underlying hash document.
123 pub name: String,
124 /// Field type.
125 pub field_type: MetadataFieldType,
126 /// Per-tag separator for `TAG` fields. `None` selects the
127 /// RediSearch default (`,`). Ignored for non-`TAG` field
128 /// types. The separator is a single ASCII byte; the
129 /// FT.CREATE parser rejects multi-byte strings.
130 #[serde(default)]
131 pub tag_separator: Option<u8>,
132}
133
134impl MetadataField {
135 /// Effective TAG separator. Returns the configured value
136 /// for `TAG` fields, or `b','` (the RediSearch default)
137 /// when none was supplied. Defined for non-`TAG` fields
138 /// too so callers can use it uniformly; the value is
139 /// meaningless outside the `TAG` path.
140 #[must_use]
141 pub fn effective_tag_separator(&self) -> u8 {
142 self.tag_separator.unwrap_or(b',')
143 }
144}
145
146/// Compiled FT.CREATE schema.
147///
148/// One [`VectorSchema`] describes one index: the vector field,
149/// its dimension and codec, the distance metric, the chosen
150/// algorithm, the prefix set (FT.CREATE `PREFIX <n> ...`), and
151/// the set of non-vector metadata fields.
152#[derive(Clone, Debug, Eq, PartialEq, Serialize, Deserialize)]
153pub struct VectorSchema {
154 /// Name of the document field that carries the vector.
155 pub vector_field: String,
156 /// Codec.
157 pub vector_type: VectorType,
158 /// Frozen vector dimension.
159 pub dim: u16,
160 /// Distance metric.
161 pub distance: DistanceMetric,
162 /// Index algorithm.
163 pub algorithm: IndexAlgorithm,
164 /// Document key prefixes (`FT.CREATE ... PREFIX <n>
165 /// <prefix>...`). HSET commands whose key starts with any
166 /// of these prefixes get intercepted and indexed.
167 pub prefixes: Vec<Vec<u8>>,
168 /// Non-vector schema fields.
169 pub metadata_fields: Vec<MetadataField>,
170}
171
172impl VectorSchema {
173 /// Build the corresponding [`dynvec::TableSchema`] for a
174 /// table named `table_name`.
175 ///
176 /// Picks default HNSW tuning; future revisions will pull
177 /// the parameters out of the FT.CREATE clause.
178 #[must_use]
179 pub fn to_engine_schema(&self, table_name: &str) -> dynvec::storage::TableSchema {
180 dynvec::storage::TableSchema {
181 name: table_name.to_string(),
182 dim: self.dim,
183 codec: self.vector_type.engine_codec(),
184 distance: self.distance.engine_distance(),
185 hnsw: dynvec::index::HnswParams::default(),
186 }
187 }
188}
189
#[cfg(test)]
mod tests {
    use super::*;

    /// Small cosine/HNSW schema with one `TEXT` metadata field,
    /// shared by the schema-level tests.
    fn sample_schema() -> VectorSchema {
        VectorSchema {
            vector_field: "vec".to_string(),
            vector_type: VectorType::Float32,
            dim: 16,
            distance: DistanceMetric::Cosine,
            algorithm: IndexAlgorithm::Hnsw,
            prefixes: Vec::new(),
            metadata_fields: vec![MetadataField {
                name: "title".to_string(),
                field_type: MetadataFieldType::Text,
                tag_separator: None,
            }],
        }
    }

    #[test]
    fn vector_type_maps_to_engine_codec() {
        assert_eq!(VectorType::Float32.engine_codec(), EngineCodec::Fp16);
        assert_eq!(VectorType::Float16.engine_codec(), EngineCodec::Fp16);
        assert_eq!(VectorType::Int8.engine_codec(), EngineCodec::Int8Quantized);
    }

    #[test]
    fn distance_metric_maps_to_engine_distance() {
        assert_eq!(
            DistanceMetric::L2.engine_distance(),
            EngineDistance::Euclidean
        );
        assert_eq!(
            DistanceMetric::InnerProduct.engine_distance(),
            EngineDistance::DotProduct
        );
        assert_eq!(
            DistanceMetric::Cosine.engine_distance(),
            EngineDistance::Cosine
        );
    }

    #[test]
    fn schema_compiles_to_engine_schema() {
        let engine = sample_schema().to_engine_schema("docs");
        assert_eq!(engine.name, "docs");
        assert_eq!(engine.dim, 16);
        assert_eq!(engine.codec, EngineCodec::Fp16);
        assert_eq!(engine.distance, EngineDistance::Cosine);
    }

    /// Covers both paths of `effective_tag_separator`: the
    /// RediSearch default when nothing is configured, and the
    /// configured byte when one is.
    #[test]
    fn effective_tag_separator_defaults_to_comma() {
        let mut field = MetadataField {
            name: "tags".to_string(),
            field_type: MetadataFieldType::Tag,
            tag_separator: None,
        };
        assert_eq!(field.effective_tag_separator(), b',');

        field.tag_separator = Some(b'|');
        assert_eq!(field.effective_tag_separator(), b'|');
    }
}
238}