Skip to main content

velesdb_core/collection/
collection_config.rs

//! Collection configuration and schema versioning.
2
3use crate::collection::streaming::AsyncIndexBuilderConfig;
4use crate::distance::DistanceMetric;
5use crate::index::hnsw::HnswParams;
6use crate::quantization::StorageMode;
7use serde::{Deserialize, Serialize};
8
9use crate::collection::graph::GraphSchema;
10
/// Current on-disk schema version for `config.json`.
///
/// Bump this constant whenever the persisted format changes in a way that
/// older VelesDB versions cannot safely read. The `Collection::open()` path
/// rejects any `schema_version > CURRENT_SCHEMA_VERSION` with a clear error.
pub const CURRENT_SCHEMA_VERSION: u32 = 1;
17
/// Returns the schema version assumed when `config.json` has no `schema_version` key.
///
/// Old `config.json` files written before schema versioning was introduced
/// lack the field; serde calls this function to fill it in, which makes such
/// files equivalent to version 1.
///
/// Note that this returns the literal `1`, not `CURRENT_SCHEMA_VERSION`, so
/// the fallback stays at 1 even if the current version is bumped later.
fn default_schema_version() -> u32 {
    1
}
25
/// Returns `Some(4)`, the default PQ rescore oversampling factor.
///
/// The value is wrapped in `Option` because the field it provides a serde
/// default for is an `Option<u32>` (where `None` means rescoring is
/// disabled). The wrapper is always `Some`, hence the
/// `clippy::unnecessary_wraps` allowance below.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    Some(4)
}
32
33/// Metadata for a collection.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct CollectionConfig {
36    /// Name of the collection.
37    pub name: String,
38
39    /// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
40    pub dimension: usize,
41
42    /// Distance metric.
43    pub metric: DistanceMetric,
44
45    /// Number of points in the collection.
46    pub point_count: usize,
47
48    /// On-disk schema version for forward-compatibility detection.
49    ///
50    /// When a newer VelesDB version writes a `config.json` with a higher
51    /// schema version, older versions will refuse to open the collection
52    /// rather than silently corrupting data.
53    ///
54    /// Backward compatible: old `config.json` files without this field
55    /// deserialize to `1` (the initial version).
56    #[serde(default = "default_schema_version")]
57    pub schema_version: u32,
58
59    /// Storage mode for vectors (Full, SQ8, Binary).
60    #[serde(default)]
61    pub storage_mode: StorageMode,
62
63    /// Whether this is a metadata-only collection.
64    #[serde(default)]
65    pub metadata_only: bool,
66
67    /// Graph schema — `Some` iff this is a graph collection.
68    /// Persisted to config.json; `None` for vector and metadata collections.
69    #[serde(default, skip_serializing_if = "Option::is_none")]
70    pub graph_schema: Option<GraphSchema>,
71
72    /// Embedding dimension for graph node vectors (None = no embeddings).
73    /// Only meaningful when `graph_schema` is `Some`.
74    #[serde(default, skip_serializing_if = "Option::is_none")]
75    pub embedding_dimension: Option<usize>,
76
77    /// PQ rescore oversampling factor. `Some(4)` by default.
78    ///
79    /// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
80    /// and rescores them with full-precision ADC.
81    ///
82    /// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
83    /// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
84    ///   of 0 produces a candidates count of 0, which falls back to raw HNSW results.
85    /// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
86    #[serde(default = "default_pq_rescore_oversampling")]
87    pub pq_rescore_oversampling: Option<u32>,
88
89    /// Custom HNSW index parameters (M, `ef_construction`, etc.).
90    ///
91    /// When `Some`, these parameters are used to rebuild the HNSW index on
92    /// collection reopen if `hnsw.bin` does not yet exist (empty collection).
93    /// When `None`, the default `HnswParams::auto(dimension)` is used.
94    ///
95    /// Backward compatible: old `config.json` files without this field
96    /// deserialize to `None`.
97    #[serde(default, skip_serializing_if = "Option::is_none")]
98    pub hnsw_params: Option<HnswParams>,
99
100    /// Deferred indexing configuration (US-366).
101    ///
102    /// When `Some` and `enabled`, inserts are buffered in memory and
103    /// batch-merged into the HNSW index when the buffer reaches
104    /// `merge_threshold`. This decouples write latency from index cost.
105    ///
106    /// Backward compatible: old `config.json` files without this field
107    /// deserialize to `None` (disabled).
108    #[cfg(feature = "persistence")]
109    #[serde(default, skip_serializing_if = "Option::is_none")]
110    pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
111
112    /// Async index builder configuration (Issue #488 — Bulk Insert V2).
113    ///
114    /// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
115    /// insertion during bulk import. Buffered vectors are flushed to the
116    /// HNSW index via `HnswIndex::insert_batch_parallel`.
117    ///
118    /// Backward compatible: old `config.json` files without this field
119    /// deserialize to `None` (disabled).
120    #[serde(default, skip_serializing_if = "Option::is_none")]
121    pub async_index_builder: Option<AsyncIndexBuilderConfig>,
122}
123
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Minimum number of extra candidates fetched beyond `k`, per the
    /// documented formula `max(k * factor, k + 32)`.
    const CANDIDATE_FLOOR: usize = 32;

    /// Builds a product-quantized, 128-dimensional test config whose only
    /// varying field is the rescore oversampling factor.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            name: "test".to_string(),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            storage_mode: StorageMode::ProductQuantization,
            metadata_only: false,
            graph_schema: None,
            embedding_dimension: None,
            pq_rescore_oversampling: oversampling,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
        }
    }

    /// Local copy of the candidate-count formula documented on
    /// `CollectionConfig::pq_rescore_oversampling`: `max(k * factor, k + 32)`.
    ///
    /// This mirrors the documented formula only; it does not call into the
    /// search pipeline.
    fn candidates_k(k: usize, factor: usize) -> usize {
        k.saturating_mul(factor).max(k + CANDIDATE_FLOOR)
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let config = make_config(default_pq_rescore_oversampling());
        assert_eq!(config.pq_rescore_oversampling, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // Default factor = 4, k = 10: max(10 * 4, 10 + 32) = max(40, 42) = 42
        assert_eq!(candidates_k(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // factor = 6, k = 10: max(10 * 6, 10 + 32) = max(60, 42) = 60
        assert_eq!(candidates_k(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let config = make_config(None);
        let oversampling = config.pq_rescore_oversampling.unwrap_or(0);
        assert_eq!(oversampling, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let config = make_config(default_pq_rescore_oversampling());
        // `expect` both checks presence and yields the factor, avoiding a
        // separate `is_some()` + `unwrap()` pair.
        let factor = config
            .pq_rescore_oversampling
            .expect("Rescore must be active by default for PQ");
        assert!(factor > 0, "Default oversampling must be > 0");
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // Simulate deserializing a config without pq_rescore_oversampling field.
        // The serde default should kick in and set Some(4).
        let json = r#"{
            "name": "old_collection",
            "dimension": 128,
            "metric": "Euclidean",
            "point_count": 100,
            "storage_mode": "productquantization"
        }"#;
        let config: CollectionConfig = serde_json::from_str(json)
            .expect("legacy config without pq_rescore_oversampling must deserialize");
        assert_eq!(
            config.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // Even with small k, the floor k + 32 must dominate:
        // factor = 4, k = 5: max(20, 37) = 37
        assert_eq!(candidates_k(5, 4), 37);
    }
}