Skip to main content

velesdb_core/collection/
collection_config.rs

1//! Collection configuration and schema versioning.
2
3use crate::collection::auto_reindex::AutoReindexConfig;
4use crate::collection::streaming::AsyncIndexBuilderConfig;
5use crate::distance::DistanceMetric;
6use crate::index::hnsw::HnswParams;
7use crate::quantization::StorageMode;
8use serde::{Deserialize, Serialize};
9use std::collections::BTreeSet;
10
11use crate::collection::graph::GraphSchema;
12
/// The `config.json` on-disk schema version that this build treats as current.
///
/// Bump the value whenever the persisted layout changes in a way that an
/// older VelesDB release could not read safely. The `Collection::open()` path
/// refuses, with a clear error, any file whose `schema_version` is greater
/// than `CURRENT_SCHEMA_VERSION`.
pub const CURRENT_SCHEMA_VERSION: u32 = 2;
19
/// Serde fallback for the `schema_version` field.
///
/// A `config.json` written before schema versioning was introduced has no
/// `schema_version` key; such a file deserializes through this function and
/// is therefore treated as version 1.
fn default_schema_version() -> u32 {
    // Files that predate versioning are, by definition, the initial schema.
    const INITIAL_SCHEMA_VERSION: u32 = 1;
    INITIAL_SCHEMA_VERSION
}
27
/// Serde fallback for the `pq_rescore_oversampling` field: a factor of 4.
///
/// The factor is wrapped in `Some` because the field is an `Option<u32>` in
/// which `None` means rescoring is disabled, and a serde default function
/// must return the field's own type.
// The `Option` wrapper is dictated by the field type, so the lint does not apply.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    const DEFAULT_OVERSAMPLING_FACTOR: u32 = 4;
    Some(DEFAULT_OVERSAMPLING_FACTOR)
}
34
35/// Metadata for a collection.
36///
37/// `#[non_exhaustive]`: new fields are added over time (schema-versioned and
38/// serde-defaulted), so external crates must obtain a `CollectionConfig` via
39/// the `VectorCollection::create*` constructors / `Collection::config()` rather
40/// than a struct literal — this keeps future field additions non-breaking.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42#[non_exhaustive]
43pub struct CollectionConfig {
44    /// Name of the collection.
45    pub name: String,
46
47    /// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
48    pub dimension: usize,
49
50    /// Distance metric.
51    pub metric: DistanceMetric,
52
53    /// Number of points in the collection.
54    pub point_count: usize,
55
56    /// On-disk schema version for forward-compatibility detection.
57    ///
58    /// When a newer VelesDB version writes a `config.json` with a higher
59    /// schema version, older versions will refuse to open the collection
60    /// rather than silently corrupting data.
61    ///
62    /// Backward compatible: old `config.json` files without this field
63    /// deserialize to `1` (the initial version).
64    #[serde(default = "default_schema_version")]
65    pub schema_version: u32,
66
67    /// Storage mode for vectors (Full, SQ8, Binary).
68    #[serde(default)]
69    pub storage_mode: StorageMode,
70
71    /// Whether this is a metadata-only collection.
72    #[serde(default)]
73    pub metadata_only: bool,
74
75    /// Graph schema — `Some` iff this is a graph collection.
76    /// Persisted to config.json; `None` for vector and metadata collections.
77    #[serde(default, skip_serializing_if = "Option::is_none")]
78    pub graph_schema: Option<GraphSchema>,
79
80    /// Embedding dimension for graph node vectors (None = no embeddings).
81    /// Only meaningful when `graph_schema` is `Some`.
82    #[serde(default, skip_serializing_if = "Option::is_none")]
83    pub embedding_dimension: Option<usize>,
84
85    /// PQ rescore oversampling factor. `Some(4)` by default.
86    ///
87    /// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
88    /// and rescores them with full-precision ADC.
89    ///
90    /// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
91    /// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
92    ///   of 0 produces a candidates count of 0, which falls back to raw HNSW results.
93    /// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
94    #[serde(default = "default_pq_rescore_oversampling")]
95    pub pq_rescore_oversampling: Option<u32>,
96
97    /// Custom HNSW index parameters (M, `ef_construction`, etc.).
98    ///
99    /// When `Some`, these parameters are used to rebuild the HNSW index on
100    /// collection reopen if no persisted index exists yet (`native_meta.bin`
101    /// absent). When `None`, the default `HnswParams::auto(dimension)` is used.
102    ///
103    /// Backward compatible: old `config.json` files without this field
104    /// deserialize to `None`.
105    #[serde(default, skip_serializing_if = "Option::is_none")]
106    pub hnsw_params: Option<HnswParams>,
107
108    /// Deferred indexing configuration (US-366).
109    ///
110    /// When `Some` and `enabled`, inserts are buffered in memory and
111    /// batch-merged into the HNSW index when the buffer reaches
112    /// `merge_threshold`. This decouples write latency from index cost.
113    ///
114    /// Backward compatible: old `config.json` files without this field
115    /// deserialize to `None` (disabled).
116    #[cfg(feature = "persistence")]
117    #[serde(default, skip_serializing_if = "Option::is_none")]
118    pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
119
120    /// Async index builder configuration (Issue #488 — Bulk Insert V2).
121    ///
122    /// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
123    /// insertion during bulk import. Buffered vectors are flushed to the
124    /// HNSW index via `HnswIndex::insert_batch_parallel`.
125    ///
126    /// Backward compatible: old `config.json` files without this field
127    /// deserialize to `None` (disabled).
128    #[serde(default, skip_serializing_if = "Option::is_none")]
129    pub async_index_builder: Option<AsyncIndexBuilderConfig>,
130
131    /// Auto-reindex configuration (schema v2 — W2).
132    ///
133    /// When `Some`, the [`AutoReindexManager`](crate::collection::auto_reindex::AutoReindexManager)
134    /// is restored automatically on [`Collection::open`](crate::collection::VectorCollection)
135    /// so the policy survives a process restart instead of requiring a manual
136    /// re-attach.
137    ///
138    /// Backward compatible: v1 `config.json` files without this field
139    /// deserialize to `None` (no manager attached).
140    #[serde(default, skip_serializing_if = "Option::is_none")]
141    pub auto_reindex_config: Option<AutoReindexConfig>,
142
143    /// Streaming ingestion configuration (schema v2 — STREAM-7).
144    ///
145    /// Describes the persisted shape (channel/batch sizing, flush timing) of
146    /// the streaming pipeline. The live `StreamIngester` is still created on
147    /// demand via `Collection::enable_streaming`; persisting the config lets a
148    /// future open-time hook re-enable streaming without a fresh API call.
149    ///
150    /// Backward compatible: v1 `config.json` files without this field
151    /// deserialize to `None` (streaming not configured).
152    #[cfg(feature = "persistence")]
153    #[serde(default, skip_serializing_if = "Option::is_none")]
154    pub streaming_config: Option<crate::collection::streaming::StreamingConfig>,
155
156    /// Names of payload fields carrying a secondary metadata index
157    /// (`CREATE INDEX (<field>)`) — the persisted **authority** for which
158    /// indexes exist (EPIC-081 phase 3d).
159    ///
160    /// `create_index` adds a field here and `drop_secondary_index` removes it,
161    /// each persisted via [`save_config`](crate::collection::Collection). On
162    /// [`Collection::open`](crate::collection::VectorCollection) every listed
163    /// field is rebuilt from the recovered payloads (backfill), so an index
164    /// survives a process restart instead of silently vanishing — without
165    /// which the ordered-index `ORDER BY` fast path, the bitmap pre-filter,
166    /// `EXPLAIN` `IndexLookup`, and the index advisor would all change behaviour
167    /// after a restart (results stay correct via the exhaustive fallback).
168    ///
169    /// A `BTreeSet` so the on-disk ordering is deterministic. Backward
170    /// compatible: configs written before this field deserialize to an empty
171    /// set (no indexes restored), and an empty set is not serialized.
172    ///
173    /// Downgrade caveat: a pre-3d binary opening this config ignores the field
174    /// (no `deny_unknown_fields`), but the next `save_config` it performs
175    /// re-serializes without it, dropping the authority — a subsequent newer
176    /// binary then will not restore those indexes until `CREATE INDEX` is
177    /// re-issued. Bounded and fully recoverable (results stay correct via the
178    /// exhaustive fallback); no schema-version bump guards it, by design.
179    #[serde(default, skip_serializing_if = "BTreeSet::is_empty")]
180    pub indexed_fields: BTreeSet<String>,
181}
182
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Builds a 128-dimensional, product-quantized config in which only the
    /// rescore oversampling factor varies between tests.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            name: "test".to_string(),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            storage_mode: StorageMode::ProductQuantization,
            metadata_only: false,
            graph_schema: None,
            embedding_dimension: None,
            pq_rescore_oversampling: oversampling,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
            auto_reindex_config: None,
            #[cfg(feature = "persistence")]
            streaming_config: None,
            indexed_fields: BTreeSet::new(),
        }
    }

    /// Test-local copy of the documented candidate-count formula,
    /// `max(k * factor, k + 32)`, shared by the formula tests below.
    fn candidates_for(k: usize, factor: usize) -> usize {
        let floor = k + 32;
        let oversampled = k.saturating_mul(factor);
        oversampled.max(floor)
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let factor = make_config(default_pq_rescore_oversampling()).pq_rescore_oversampling;
        assert_eq!(factor, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // k = 10 with the default factor of 4: max(40, 42) = 42.
        assert_eq!(candidates_for(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // k = 10 with a factor of 6: max(60, 42) = 60.
        assert_eq!(candidates_for(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let factor = make_config(None)
            .pq_rescore_oversampling
            .unwrap_or_default();
        assert_eq!(factor, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let factor = make_config(default_pq_rescore_oversampling()).pq_rescore_oversampling;
        assert!(factor.is_some(), "Rescore must be active by default for PQ");
        assert!(
            matches!(factor, Some(n) if n > 0),
            "Default oversampling must be > 0"
        );
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // A config written before `pq_rescore_oversampling` existed: the key
        // is absent, so the serde default must supply Some(4).
        let json = r#"{
            "name": "old_collection",
            "dimension": 128,
            "metric": "Euclidean",
            "point_count": 100,
            "storage_mode": "productquantization"
        }"#;
        let parsed = serde_json::from_str::<CollectionConfig>(json).unwrap();
        assert_eq!(
            parsed.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // For a small k the `k + 32` floor dominates: max(20, 37) = 37.
        assert_eq!(candidates_for(5, 4), 37);
    }
}