Skip to main content

nodedb_types/
collection_config.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Vector-primary collection configuration types.
4//!
5//! `PrimaryEngine` is a parallel attribute to `CollectionType` that tells the
6//! planner which engine is the primary access path for a collection.
7//! Vectors remain an index — not a collection type — but a `primary = 'vector'`
8//! attribute means the vector index is the hot path and the document store is
9//! a metadata sidecar.
10
11use crate::collection::CollectionType;
12use crate::columnar::{ColumnarProfile, DocumentMode};
13use crate::vector_ann::VectorQuantization;
14use crate::vector_distance::DistanceMetric;
15use crate::vector_dtype::VectorStorageDtype;
16
17/// Which engine serves as the primary access path for a collection.
18///
19/// This is independent of `CollectionType` — it is an optimizer hint that
20/// instructs the planner and executor to use the named engine as the hot path.
21/// The default is inferred from `CollectionType` so existing collections need
22/// no migration.
23#[repr(u8)]
24#[derive(
25    Debug,
26    Clone,
27    Copy,
28    Default,
29    PartialEq,
30    Eq,
31    Hash,
32    serde::Serialize,
33    serde::Deserialize,
34    zerompk::ToMessagePack,
35    zerompk::FromMessagePack,
36)]
37#[non_exhaustive]
38pub enum PrimaryEngine {
39    /// Schemaless document (MessagePack). The historic default.
40    #[default]
41    Document = 0,
42    /// Strict document (Binary Tuples).
43    Strict = 1,
44    /// Key-Value hash store.
45    KeyValue = 2,
46    /// Columnar / plain-analytics.
47    Columnar = 3,
48    /// Columnar with spatial profile.
49    Spatial = 4,
50    /// Vector-primary: HNSW is the hot path; document store is a metadata sidecar.
51    Vector = 10,
52}
53
54impl PrimaryEngine {
55    /// Infer the primary engine from a `CollectionType`.
56    ///
57    /// Used when reading catalog entries that predate the `primary` field —
58    /// guarantees that existing collections behave as before.
59    pub fn infer_from_collection_type(ct: &CollectionType) -> Self {
60        match ct {
61            CollectionType::Document(DocumentMode::Schemaless) => Self::Document,
62            CollectionType::Document(DocumentMode::Strict(_)) => Self::Strict,
63            CollectionType::Columnar(ColumnarProfile::Plain) => Self::Columnar,
64            CollectionType::Columnar(ColumnarProfile::Timeseries { .. }) => Self::Columnar,
65            CollectionType::Columnar(ColumnarProfile::Spatial { .. }) => Self::Spatial,
66            CollectionType::KeyValue(_) => Self::KeyValue,
67        }
68    }
69}
70
71/// Configuration for a vector-primary collection.
72///
73/// Stored in `StoredCollection::vector_primary` when `primary == PrimaryEngine::Vector`.
74/// All options correspond to HNSW construction parameters and codec selection for the
75/// primary vector index.
76#[derive(
77    Debug,
78    Clone,
79    PartialEq,
80    serde::Serialize,
81    serde::Deserialize,
82    zerompk::ToMessagePack,
83    zerompk::FromMessagePack,
84)]
85pub struct VectorPrimaryConfig {
86    /// The name of the column that holds vector data (must be of type VECTOR(n)).
87    pub vector_field: String,
88    /// Vector dimensionality.
89    pub dim: u32,
90    /// Quantization codec for the primary HNSW index.
91    pub quantization: VectorQuantization,
92    /// HNSW `M` parameter (number of connections per node).
93    pub m: u8,
94    /// HNSW `ef_construction` parameter (beam width during index construction).
95    pub ef_construction: u16,
96    /// Distance metric used for similarity search.
97    pub metric: DistanceMetric,
98    /// Native storage dtype for vector values. Controls whether incoming
99    /// f32 components are stored as-is (F32), downsized to half precision
100    /// (F16), or brain-float (BF16) to halve memory at the cost of reduced
101    /// mantissa precision. Quantization codecs (RaBitQ, BBQ, SQ8 …) apply
102    /// on top and are orthogonal to this setting.
103    pub storage_dtype: VectorStorageDtype,
104    /// Payload field names that receive in-memory bitmap indexes for fast
105    /// pre-filtering, paired with the storage kind (Equality / Range /
106    /// Boolean). The DDL handler infers the kind from the column type:
107    /// numeric / timestamp / decimal → Range; bool → Boolean; everything
108    /// else → Equality.
109    pub payload_indexes: Vec<(String, PayloadIndexKind)>,
110}
111
112impl Default for VectorPrimaryConfig {
113    fn default() -> Self {
114        Self {
115            vector_field: String::new(),
116            dim: 0,
117            quantization: VectorQuantization::default(),
118            m: 16,
119            ef_construction: 200,
120            metric: DistanceMetric::Cosine,
121            storage_dtype: VectorStorageDtype::F32,
122            payload_indexes: Vec::new(),
123        }
124    }
125}
126
127/// Storage kind for a payload bitmap index. Equality fields use a
128/// `HashMap<key, bitmap>` (O(1) lookup); Range fields use a `BTreeMap`
129/// for sorted range scans; Boolean is a low-cardinality equality variant.
130#[derive(
131    Debug,
132    Clone,
133    Copy,
134    Default,
135    PartialEq,
136    Eq,
137    serde::Serialize,
138    serde::Deserialize,
139    zerompk::ToMessagePack,
140    zerompk::FromMessagePack,
141)]
142#[non_exhaustive]
143pub enum PayloadIndexKind {
144    #[default]
145    Equality,
146    Range,
147    Boolean,
148}
149
150/// A single payload-bitmap predicate atom emitted by the SQL planner and
151/// consumed by the vector search handler. The handler ANDs all atoms in
152/// `VectorOp::Search::payload_filters`; each atom may itself be a
153/// disjunction (`In`).
154#[derive(
155    Debug,
156    Clone,
157    PartialEq,
158    serde::Serialize,
159    serde::Deserialize,
160    zerompk::ToMessagePack,
161    zerompk::FromMessagePack,
162)]
163#[non_exhaustive]
164pub enum PayloadAtom {
165    /// `field = value` — single equality bitmap lookup.
166    Eq(String, crate::Value),
167    /// `field IN (v1, v2, ...)` — union of per-value bitmaps.
168    In(String, Vec<crate::Value>),
169    /// `field >= low AND field <= high` — sorted range scan over a
170    /// `PayloadIndexKind::Range` index. Either bound being `None` means
171    /// open on that side.
172    Range {
173        field: String,
174        low: Option<crate::Value>,
175        low_inclusive: bool,
176        high: Option<crate::Value>,
177        high_inclusive: bool,
178    },
179}
180
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `PrimaryEngine` variant, shared by the round-trip tests.
    const ALL_ENGINES: [PrimaryEngine; 6] = [
        PrimaryEngine::Document,
        PrimaryEngine::Strict,
        PrimaryEngine::KeyValue,
        PrimaryEngine::Columnar,
        PrimaryEngine::Spatial,
        PrimaryEngine::Vector,
    ];

    /// Checks that `ct` is mapped to `expected` by the inference routine.
    #[track_caller]
    fn assert_inferred(ct: &CollectionType, expected: PrimaryEngine) {
        assert_eq!(PrimaryEngine::infer_from_collection_type(ct), expected);
    }

    #[test]
    fn primary_engine_default_is_document() {
        assert_eq!(PrimaryEngine::default(), PrimaryEngine::Document);
    }

    #[test]
    fn infer_from_collection_type_document_schemaless() {
        assert_inferred(&CollectionType::document(), PrimaryEngine::Document);
    }

    #[test]
    fn infer_from_collection_type_document_strict() {
        use crate::columnar::{ColumnDef, ColumnType, StrictSchema};

        let columns = vec![ColumnDef::required("id", ColumnType::Int64).with_primary_key()];
        let schema = StrictSchema::new(columns).unwrap();
        assert_inferred(&CollectionType::strict(schema), PrimaryEngine::Strict);
    }

    #[test]
    fn infer_from_collection_type_columnar_plain() {
        assert_inferred(&CollectionType::columnar(), PrimaryEngine::Columnar);
    }

    #[test]
    fn infer_from_collection_type_columnar_timeseries() {
        assert_inferred(
            &CollectionType::timeseries("ts", "1h"),
            PrimaryEngine::Columnar,
        );
    }

    #[test]
    fn infer_from_collection_type_columnar_spatial() {
        assert_inferred(&CollectionType::spatial("geom"), PrimaryEngine::Spatial);
    }

    #[test]
    fn infer_from_collection_type_kv() {
        use crate::columnar::{ColumnDef, ColumnType, StrictSchema};

        let columns = vec![ColumnDef::required("k", ColumnType::String).with_primary_key()];
        let schema = StrictSchema::new(columns).unwrap();
        assert_inferred(&CollectionType::kv(schema), PrimaryEngine::KeyValue);
    }

    #[test]
    fn primary_engine_serde_roundtrip() {
        for engine in ALL_ENGINES {
            let encoded = sonic_rs::to_string(&engine).unwrap();
            let decoded: PrimaryEngine = sonic_rs::from_str(&encoded).unwrap();
            assert_eq!(decoded, engine);
        }
    }

    #[test]
    fn primary_engine_msgpack_roundtrip() {
        for engine in ALL_ENGINES {
            let encoded = zerompk::to_msgpack_vec(&engine).unwrap();
            let decoded: PrimaryEngine = zerompk::from_msgpack(&encoded).unwrap();
            assert_eq!(decoded, engine);
        }
    }

    #[test]
    fn vector_primary_config_serde_roundtrip() {
        let payload_indexes = vec![
            ("category".to_string(), PayloadIndexKind::Equality),
            ("timestamp".to_string(), PayloadIndexKind::Range),
        ];
        let original = VectorPrimaryConfig {
            vector_field: "embedding".to_string(),
            dim: 1024,
            quantization: VectorQuantization::RaBitQ,
            m: 32,
            ef_construction: 200,
            metric: DistanceMetric::Cosine,
            storage_dtype: VectorStorageDtype::F32,
            payload_indexes,
        };
        let encoded = sonic_rs::to_string(&original).unwrap();
        let decoded: VectorPrimaryConfig = sonic_rs::from_str(&encoded).unwrap();
        assert_eq!(decoded, original);
    }

    #[test]
    fn vector_primary_config_msgpack_roundtrip() {
        let original = VectorPrimaryConfig {
            vector_field: "vec".to_string(),
            dim: 512,
            quantization: VectorQuantization::Bbq,
            m: 16,
            ef_construction: 100,
            metric: DistanceMetric::L2,
            storage_dtype: VectorStorageDtype::F32,
            payload_indexes: vec![],
        };
        let encoded = zerompk::to_msgpack_vec(&original).unwrap();
        let decoded: VectorPrimaryConfig = zerompk::from_msgpack(&encoded).unwrap();
        assert_eq!(decoded, original);
    }
}