velesdb_core/collection/collection_config.rs
1//! Collection configuration and schema versioning.
2
3use crate::collection::streaming::AsyncIndexBuilderConfig;
4use crate::distance::DistanceMetric;
5use crate::index::hnsw::HnswParams;
6use crate::quantization::StorageMode;
7use serde::{Deserialize, Serialize};
8
9use crate::collection::graph::GraphSchema;
10
/// Schema version this build writes into `config.json`.
///
/// Bump the value whenever the persisted layout changes in a way that an
/// older VelesDB release could not read safely. `Collection::open()` is
/// documented to refuse, with a clear error, any file whose
/// `schema_version` is greater than this constant.
pub const CURRENT_SCHEMA_VERSION: u32 = 1;
17
/// Serde fallback for the `schema_version` field.
///
/// `config.json` files written before schema versioning existed carry no
/// version field; they are read as version 1, the initial schema.
fn default_schema_version() -> u32 {
    // Kept as its own value rather than `CURRENT_SCHEMA_VERSION`: legacy files
    // are documented as version 1, which must stay true after future bumps.
    const INITIAL_SCHEMA_VERSION: u32 = 1;
    INITIAL_SCHEMA_VERSION
}
25
/// Serde fallback for the `pq_rescore_oversampling` field: `Some(4)`.
///
/// The value is wrapped in `Option` because the field itself is an
/// `Option<u32>`, where `None` means rescoring is disabled.
// The wrap is required by the serde `default = "..."` signature, hence the allow.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    const DEFAULT_OVERSAMPLING_FACTOR: u32 = 4;
    Some(DEFAULT_OVERSAMPLING_FACTOR)
}
32
33/// Metadata for a collection.
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct CollectionConfig {
36 /// Name of the collection.
37 pub name: String,
38
39 /// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
40 pub dimension: usize,
41
42 /// Distance metric.
43 pub metric: DistanceMetric,
44
45 /// Number of points in the collection.
46 pub point_count: usize,
47
48 /// On-disk schema version for forward-compatibility detection.
49 ///
50 /// When a newer VelesDB version writes a `config.json` with a higher
51 /// schema version, older versions will refuse to open the collection
52 /// rather than silently corrupting data.
53 ///
54 /// Backward compatible: old `config.json` files without this field
55 /// deserialize to `1` (the initial version).
56 #[serde(default = "default_schema_version")]
57 pub schema_version: u32,
58
59 /// Storage mode for vectors (Full, SQ8, Binary).
60 #[serde(default)]
61 pub storage_mode: StorageMode,
62
63 /// Whether this is a metadata-only collection.
64 #[serde(default)]
65 pub metadata_only: bool,
66
67 /// Graph schema — `Some` iff this is a graph collection.
68 /// Persisted to config.json; `None` for vector and metadata collections.
69 #[serde(default, skip_serializing_if = "Option::is_none")]
70 pub graph_schema: Option<GraphSchema>,
71
72 /// Embedding dimension for graph node vectors (None = no embeddings).
73 /// Only meaningful when `graph_schema` is `Some`.
74 #[serde(default, skip_serializing_if = "Option::is_none")]
75 pub embedding_dimension: Option<usize>,
76
77 /// PQ rescore oversampling factor. `Some(4)` by default.
78 ///
79 /// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
80 /// and rescores them with full-precision ADC.
81 ///
82 /// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
83 /// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
84 /// of 0 produces a candidates count of 0, which falls back to raw HNSW results.
85 /// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
86 #[serde(default = "default_pq_rescore_oversampling")]
87 pub pq_rescore_oversampling: Option<u32>,
88
89 /// Custom HNSW index parameters (M, `ef_construction`, etc.).
90 ///
91 /// When `Some`, these parameters are used to rebuild the HNSW index on
92 /// collection reopen if `hnsw.bin` does not yet exist (empty collection).
93 /// When `None`, the default `HnswParams::auto(dimension)` is used.
94 ///
95 /// Backward compatible: old `config.json` files without this field
96 /// deserialize to `None`.
97 #[serde(default, skip_serializing_if = "Option::is_none")]
98 pub hnsw_params: Option<HnswParams>,
99
100 /// Deferred indexing configuration (US-366).
101 ///
102 /// When `Some` and `enabled`, inserts are buffered in memory and
103 /// batch-merged into the HNSW index when the buffer reaches
104 /// `merge_threshold`. This decouples write latency from index cost.
105 ///
106 /// Backward compatible: old `config.json` files without this field
107 /// deserialize to `None` (disabled).
108 #[cfg(feature = "persistence")]
109 #[serde(default, skip_serializing_if = "Option::is_none")]
110 pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
111
112 /// Async index builder configuration (Issue #488 — Bulk Insert V2).
113 ///
114 /// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
115 /// insertion during bulk import. Buffered vectors are flushed to the
116 /// HNSW index via `HnswIndex::insert_batch_parallel`.
117 ///
118 /// Backward compatible: old `config.json` files without this field
119 /// deserialize to `None` (disabled).
120 #[serde(default, skip_serializing_if = "Option::is_none")]
121 pub async_index_builder: Option<AsyncIndexBuilderConfig>,
122}
123
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Builds a 128-dimensional, product-quantized config in which only the
    /// rescore oversampling factor varies.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            name: "test".to_string(),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            storage_mode: StorageMode::ProductQuantization,
            metadata_only: false,
            graph_schema: None,
            embedding_dimension: None,
            pq_rescore_oversampling: oversampling,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
        }
    }

    /// Local copy of the documented candidate-count formula,
    /// `max(k * factor, k + 32)`. The formula tests below exercise this
    /// copy, not the search pipeline itself.
    fn candidates_k(k: usize, factor: usize) -> usize {
        k.saturating_mul(factor).max(k + 32)
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let config = make_config(default_pq_rescore_oversampling());
        assert_eq!(config.pq_rescore_oversampling, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // Default factor 4 with k = 10: max(10 * 4, 10 + 32) = max(40, 42) = 42.
        assert_eq!(candidates_k(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // Factor 6 with k = 10: max(10 * 6, 10 + 32) = max(60, 42) = 60.
        assert_eq!(candidates_k(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let config = make_config(None);
        // `u32::default()` is 0, the value used to mean "disabled".
        let oversampling = config.pq_rescore_oversampling.unwrap_or_default();
        assert_eq!(oversampling, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let config = make_config(default_pq_rescore_oversampling());
        let factor = config
            .pq_rescore_oversampling
            .expect("Rescore must be active by default for PQ");
        assert!(factor > 0, "Default oversampling must be > 0");
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // A config written before `pq_rescore_oversampling` existed: the
        // serde default must fill the gap with Some(4).
        let json = r#"{
            "name": "old_collection",
            "dimension": 128,
            "metric": "Euclidean",
            "point_count": 100,
            "storage_mode": "productquantization"
        }"#;
        let config: CollectionConfig = serde_json::from_str(json).unwrap();
        assert_eq!(
            config.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // For a small k the `k + 32` floor wins: max(5 * 4, 5 + 32) = max(20, 37) = 37.
        assert_eq!(candidates_k(5, 4), 37);
    }
}