// nodedb_types/collection_config.rs

// SPDX-License-Identifier: Apache-2.0

//! Vector-primary collection configuration types.
//!
//! `PrimaryEngine` is a parallel attribute to `CollectionType` that tells the
//! planner which engine is the primary access path for a collection.
//! Vectors remain an index — not a collection type — but a `primary = 'vector'`
//! attribute means the vector index is the hot path and the document store is
//! a metadata sidecar.
10
11use crate::collection::CollectionType;
12use crate::columnar::{ColumnarProfile, DocumentMode};
13use crate::vector_ann::VectorQuantization;
14use crate::vector_distance::DistanceMetric;
15
16/// Which engine serves as the primary access path for a collection.
17///
18/// This is independent of `CollectionType` — it is an optimizer hint that
19/// instructs the planner and executor to use the named engine as the hot path.
20/// The default is inferred from `CollectionType` so existing collections need
21/// no migration.
22#[repr(u8)]
23#[derive(
24    Debug,
25    Clone,
26    Copy,
27    Default,
28    PartialEq,
29    Eq,
30    Hash,
31    serde::Serialize,
32    serde::Deserialize,
33    zerompk::ToMessagePack,
34    zerompk::FromMessagePack,
35)]
36#[non_exhaustive]
37pub enum PrimaryEngine {
38    /// Schemaless document (MessagePack). The historic default.
39    #[default]
40    Document = 0,
41    /// Strict document (Binary Tuples).
42    Strict = 1,
43    /// Key-Value hash store.
44    KeyValue = 2,
45    /// Columnar / plain-analytics.
46    Columnar = 3,
47    /// Columnar with spatial profile.
48    Spatial = 4,
49    /// Vector-primary: HNSW is the hot path; document store is a metadata sidecar.
50    Vector = 10,
51}
52
53impl PrimaryEngine {
54    /// Infer the primary engine from a `CollectionType`.
55    ///
56    /// Used when reading catalog entries that predate the `primary` field —
57    /// guarantees that existing collections behave as before.
58    pub fn infer_from_collection_type(ct: &CollectionType) -> Self {
59        match ct {
60            CollectionType::Document(DocumentMode::Schemaless) => Self::Document,
61            CollectionType::Document(DocumentMode::Strict(_)) => Self::Strict,
62            CollectionType::Columnar(ColumnarProfile::Plain) => Self::Columnar,
63            CollectionType::Columnar(ColumnarProfile::Timeseries { .. }) => Self::Columnar,
64            CollectionType::Columnar(ColumnarProfile::Spatial { .. }) => Self::Spatial,
65            CollectionType::KeyValue(_) => Self::KeyValue,
66        }
67    }
68}
69
70/// Configuration for a vector-primary collection.
71///
72/// Stored in `StoredCollection::vector_primary` when `primary == PrimaryEngine::Vector`.
73/// All options correspond to HNSW construction parameters and codec selection for the
74/// primary vector index.
75#[derive(
76    Debug,
77    Clone,
78    PartialEq,
79    serde::Serialize,
80    serde::Deserialize,
81    zerompk::ToMessagePack,
82    zerompk::FromMessagePack,
83)]
84pub struct VectorPrimaryConfig {
85    /// The name of the column that holds vector data (must be of type VECTOR(n)).
86    pub vector_field: String,
87    /// Vector dimensionality.
88    pub dim: u32,
89    /// Quantization codec for the primary HNSW index.
90    pub quantization: VectorQuantization,
91    /// HNSW `M` parameter (number of connections per node).
92    pub m: u8,
93    /// HNSW `ef_construction` parameter (beam width during index construction).
94    pub ef_construction: u16,
95    /// Distance metric used for similarity search.
96    pub metric: DistanceMetric,
97    /// Payload field names that receive in-memory bitmap indexes for fast
98    /// pre-filtering, paired with the storage kind (Equality / Range /
99    /// Boolean). The DDL handler infers the kind from the column type:
100    /// numeric / timestamp / decimal → Range; bool → Boolean; everything
101    /// else → Equality.
102    pub payload_indexes: Vec<(String, PayloadIndexKind)>,
103}
104
105impl Default for VectorPrimaryConfig {
106    fn default() -> Self {
107        Self {
108            vector_field: String::new(),
109            dim: 0,
110            quantization: VectorQuantization::default(),
111            m: 16,
112            ef_construction: 200,
113            metric: DistanceMetric::Cosine,
114            payload_indexes: Vec::new(),
115        }
116    }
117}
118
119/// Storage kind for a payload bitmap index. Equality fields use a
120/// `HashMap<key, bitmap>` (O(1) lookup); Range fields use a `BTreeMap`
121/// for sorted range scans; Boolean is a low-cardinality equality variant.
122#[derive(
123    Debug,
124    Clone,
125    Copy,
126    Default,
127    PartialEq,
128    Eq,
129    serde::Serialize,
130    serde::Deserialize,
131    zerompk::ToMessagePack,
132    zerompk::FromMessagePack,
133)]
134#[non_exhaustive]
135pub enum PayloadIndexKind {
136    #[default]
137    Equality,
138    Range,
139    Boolean,
140}
141
142/// A single payload-bitmap predicate atom emitted by the SQL planner and
143/// consumed by the vector search handler. The handler ANDs all atoms in
144/// `VectorOp::Search::payload_filters`; each atom may itself be a
145/// disjunction (`In`).
146#[derive(
147    Debug,
148    Clone,
149    PartialEq,
150    serde::Serialize,
151    serde::Deserialize,
152    zerompk::ToMessagePack,
153    zerompk::FromMessagePack,
154)]
155#[non_exhaustive]
156pub enum PayloadAtom {
157    /// `field = value` — single equality bitmap lookup.
158    Eq(String, crate::Value),
159    /// `field IN (v1, v2, ...)` — union of per-value bitmaps.
160    In(String, Vec<crate::Value>),
161    /// `field >= low AND field <= high` — sorted range scan over a
162    /// `PayloadIndexKind::Range` index. Either bound being `None` means
163    /// open on that side.
164    Range {
165        field: String,
166        low: Option<crate::Value>,
167        low_inclusive: bool,
168        high: Option<crate::Value>,
169        high_inclusive: bool,
170    },
171}
172
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `PrimaryEngine` variant, shared by the round-trip tests so a new
    /// variant only has to be added in one place.
    const ALL_ENGINES: [PrimaryEngine; 6] = [
        PrimaryEngine::Document,
        PrimaryEngine::Strict,
        PrimaryEngine::KeyValue,
        PrimaryEngine::Columnar,
        PrimaryEngine::Spatial,
        PrimaryEngine::Vector,
    ];

    /// Every `PayloadIndexKind` variant, shared by the round-trip tests.
    const ALL_INDEX_KINDS: [PayloadIndexKind; 3] = [
        PayloadIndexKind::Equality,
        PayloadIndexKind::Range,
        PayloadIndexKind::Boolean,
    ];

    #[test]
    fn primary_engine_default_is_document() {
        assert_eq!(PrimaryEngine::default(), PrimaryEngine::Document);
    }

    #[test]
    fn infer_from_collection_type_document_schemaless() {
        let ct = CollectionType::document();
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::Document
        );
    }

    #[test]
    fn infer_from_collection_type_document_strict() {
        use crate::columnar::{ColumnDef, ColumnType, StrictSchema};
        let schema = StrictSchema::new(vec![
            ColumnDef::required("id", ColumnType::Int64).with_primary_key(),
        ])
        .unwrap();
        let ct = CollectionType::strict(schema);
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::Strict
        );
    }

    #[test]
    fn infer_from_collection_type_columnar_plain() {
        let ct = CollectionType::columnar();
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::Columnar
        );
    }

    #[test]
    fn infer_from_collection_type_columnar_timeseries() {
        let ct = CollectionType::timeseries("ts", "1h");
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::Columnar
        );
    }

    #[test]
    fn infer_from_collection_type_columnar_spatial() {
        let ct = CollectionType::spatial("geom");
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::Spatial
        );
    }

    #[test]
    fn infer_from_collection_type_kv() {
        use crate::columnar::{ColumnDef, ColumnType, StrictSchema};
        let schema = StrictSchema::new(vec![
            ColumnDef::required("k", ColumnType::String).with_primary_key(),
        ])
        .unwrap();
        let ct = CollectionType::kv(schema);
        assert_eq!(
            PrimaryEngine::infer_from_collection_type(&ct),
            PrimaryEngine::KeyValue
        );
    }

    #[test]
    fn primary_engine_serde_roundtrip() {
        for variant in ALL_ENGINES {
            let json = sonic_rs::to_string(&variant).unwrap();
            let back: PrimaryEngine = sonic_rs::from_str(&json).unwrap();
            assert_eq!(back, variant);
        }
    }

    #[test]
    fn primary_engine_msgpack_roundtrip() {
        for variant in ALL_ENGINES {
            let bytes = zerompk::to_msgpack_vec(&variant).unwrap();
            let back: PrimaryEngine = zerompk::from_msgpack(&bytes).unwrap();
            assert_eq!(back, variant);
        }
    }

    #[test]
    fn payload_index_kind_default_is_equality() {
        assert_eq!(PayloadIndexKind::default(), PayloadIndexKind::Equality);
    }

    #[test]
    fn payload_index_kind_serde_roundtrip() {
        for kind in ALL_INDEX_KINDS {
            let json = sonic_rs::to_string(&kind).unwrap();
            let back: PayloadIndexKind = sonic_rs::from_str(&json).unwrap();
            assert_eq!(back, kind);
        }
    }

    #[test]
    fn payload_index_kind_msgpack_roundtrip() {
        for kind in ALL_INDEX_KINDS {
            let bytes = zerompk::to_msgpack_vec(&kind).unwrap();
            let back: PayloadIndexKind = zerompk::from_msgpack(&bytes).unwrap();
            assert_eq!(back, kind);
        }
    }

    #[test]
    fn vector_primary_config_default_values() {
        let cfg = VectorPrimaryConfig::default();
        assert!(cfg.vector_field.is_empty());
        assert_eq!(cfg.dim, 0);
        assert_eq!(cfg.quantization, VectorQuantization::default());
        assert_eq!(cfg.m, 16);
        assert_eq!(cfg.ef_construction, 200);
        assert_eq!(cfg.metric, DistanceMetric::Cosine);
        assert!(cfg.payload_indexes.is_empty());
    }

    #[test]
    fn vector_primary_config_serde_roundtrip() {
        let cfg = VectorPrimaryConfig {
            vector_field: "embedding".to_string(),
            dim: 1024,
            quantization: VectorQuantization::RaBitQ,
            m: 32,
            ef_construction: 200,
            metric: DistanceMetric::Cosine,
            payload_indexes: vec![
                ("category".to_string(), PayloadIndexKind::Equality),
                ("timestamp".to_string(), PayloadIndexKind::Range),
            ],
        };
        let json = sonic_rs::to_string(&cfg).unwrap();
        let back: VectorPrimaryConfig = sonic_rs::from_str(&json).unwrap();
        assert_eq!(back, cfg);
    }

    #[test]
    fn vector_primary_config_msgpack_roundtrip() {
        let cfg = VectorPrimaryConfig {
            vector_field: "vec".to_string(),
            dim: 512,
            quantization: VectorQuantization::Bbq,
            m: 16,
            ef_construction: 100,
            metric: DistanceMetric::L2,
            payload_indexes: vec![],
        };
        let bytes = zerompk::to_msgpack_vec(&cfg).unwrap();
        let back: VectorPrimaryConfig = zerompk::from_msgpack(&bytes).unwrap();
        assert_eq!(back, cfg);
    }
}