// velesdb_core/collection/collection_config.rs
//! Collection configuration and schema versioning.
2
3use crate::collection::auto_reindex::AutoReindexConfig;
4use crate::collection::streaming::AsyncIndexBuilderConfig;
5use crate::distance::DistanceMetric;
6use crate::index::hnsw::HnswParams;
7use crate::quantization::StorageMode;
8use serde::{Deserialize, Serialize};
9use std::collections::BTreeSet;
10
11use crate::collection::graph::GraphSchema;
12
/// Current on-disk schema version for `config.json`.
///
/// Increment this constant whenever the persisted format changes in a way
/// that older VelesDB versions cannot safely read. The `Collection::open()`
/// path rejects any `schema_version > CURRENT_SCHEMA_VERSION` with a clear
/// error.
pub const CURRENT_SCHEMA_VERSION: u32 = 2;
19
/// Serde fallback for the `schema_version` field of the collection config.
///
/// `config.json` files written before schema versioning was introduced carry
/// no `schema_version` key; they deserialize to the value returned here,
/// `1`, which denotes the initial schema version.
fn default_schema_version() -> u32 {
    1
}
27
/// Serde fallback for the PQ rescore oversampling factor: `Some(4)`.
///
/// The return type is `Option<u32>` rather than a bare `u32` because it must
/// match the type of the field it initializes, where `None` means "rescore
/// disabled".
// The `Option` wrapper is dictated by the field type used with
// `#[serde(default = "...")]`, hence the lint suppression below.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    Some(4)
}
34
35/// Metadata for a collection.
36///
37/// `#[non_exhaustive]`: new fields are added over time (schema-versioned and
38/// serde-defaulted), so external crates must obtain a `CollectionConfig` via
39/// the `VectorCollection::create*` constructors / `Collection::config()` rather
40/// than a struct literal — this keeps future field additions non-breaking.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42#[non_exhaustive]
43pub struct CollectionConfig {
44 /// Name of the collection.
45 pub name: String,
46
47 /// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
48 pub dimension: usize,
49
50 /// Distance metric.
51 pub metric: DistanceMetric,
52
53 /// Number of points in the collection.
54 pub point_count: usize,
55
56 /// On-disk schema version for forward-compatibility detection.
57 ///
58 /// When a newer VelesDB version writes a `config.json` with a higher
59 /// schema version, older versions will refuse to open the collection
60 /// rather than silently corrupting data.
61 ///
62 /// Backward compatible: old `config.json` files without this field
63 /// deserialize to `1` (the initial version).
64 #[serde(default = "default_schema_version")]
65 pub schema_version: u32,
66
67 /// Storage mode for vectors (Full, SQ8, Binary).
68 #[serde(default)]
69 pub storage_mode: StorageMode,
70
71 /// Whether this is a metadata-only collection.
72 #[serde(default)]
73 pub metadata_only: bool,
74
75 /// Graph schema — `Some` iff this is a graph collection.
76 /// Persisted to config.json; `None` for vector and metadata collections.
77 #[serde(default, skip_serializing_if = "Option::is_none")]
78 pub graph_schema: Option<GraphSchema>,
79
80 /// Embedding dimension for graph node vectors (None = no embeddings).
81 /// Only meaningful when `graph_schema` is `Some`.
82 #[serde(default, skip_serializing_if = "Option::is_none")]
83 pub embedding_dimension: Option<usize>,
84
85 /// PQ rescore oversampling factor. `Some(4)` by default.
86 ///
87 /// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
88 /// and rescores them with full-precision ADC.
89 ///
90 /// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
91 /// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
92 /// of 0 produces a candidates count of 0, which falls back to raw HNSW results.
93 /// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
94 #[serde(default = "default_pq_rescore_oversampling")]
95 pub pq_rescore_oversampling: Option<u32>,
96
97 /// Custom HNSW index parameters (M, `ef_construction`, etc.).
98 ///
99 /// When `Some`, these parameters are used to rebuild the HNSW index on
100 /// collection reopen if no persisted index exists yet (`native_meta.bin`
101 /// absent). When `None`, the default `HnswParams::auto(dimension)` is used.
102 ///
103 /// Backward compatible: old `config.json` files without this field
104 /// deserialize to `None`.
105 #[serde(default, skip_serializing_if = "Option::is_none")]
106 pub hnsw_params: Option<HnswParams>,
107
108 /// Deferred indexing configuration (US-366).
109 ///
110 /// When `Some` and `enabled`, inserts are buffered in memory and
111 /// batch-merged into the HNSW index when the buffer reaches
112 /// `merge_threshold`. This decouples write latency from index cost.
113 ///
114 /// Backward compatible: old `config.json` files without this field
115 /// deserialize to `None` (disabled).
116 #[cfg(feature = "persistence")]
117 #[serde(default, skip_serializing_if = "Option::is_none")]
118 pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
119
120 /// Async index builder configuration (Issue #488 — Bulk Insert V2).
121 ///
122 /// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
123 /// insertion during bulk import. Buffered vectors are flushed to the
124 /// HNSW index via `HnswIndex::insert_batch_parallel`.
125 ///
126 /// Backward compatible: old `config.json` files without this field
127 /// deserialize to `None` (disabled).
128 #[serde(default, skip_serializing_if = "Option::is_none")]
129 pub async_index_builder: Option<AsyncIndexBuilderConfig>,
130
131 /// Auto-reindex configuration (schema v2 — W2).
132 ///
133 /// When `Some`, the [`AutoReindexManager`](crate::collection::auto_reindex::AutoReindexManager)
134 /// is restored automatically on [`Collection::open`](crate::collection::VectorCollection)
135 /// so the policy survives a process restart instead of requiring a manual
136 /// re-attach.
137 ///
138 /// Backward compatible: v1 `config.json` files without this field
139 /// deserialize to `None` (no manager attached).
140 #[serde(default, skip_serializing_if = "Option::is_none")]
141 pub auto_reindex_config: Option<AutoReindexConfig>,
142
143 /// Streaming ingestion configuration (schema v2 — STREAM-7).
144 ///
145 /// Describes the persisted shape (channel/batch sizing, flush timing) of
146 /// the streaming pipeline. The live `StreamIngester` is still created on
147 /// demand via `Collection::enable_streaming`; persisting the config lets a
148 /// future open-time hook re-enable streaming without a fresh API call.
149 ///
150 /// Backward compatible: v1 `config.json` files without this field
151 /// deserialize to `None` (streaming not configured).
152 #[cfg(feature = "persistence")]
153 #[serde(default, skip_serializing_if = "Option::is_none")]
154 pub streaming_config: Option<crate::collection::streaming::StreamingConfig>,
155
156 /// Names of payload fields carrying a secondary metadata index
157 /// (`CREATE INDEX (<field>)`) — the persisted **authority** for which
158 /// indexes exist (EPIC-081 phase 3d).
159 ///
160 /// `create_index` adds a field here and `drop_secondary_index` removes it,
161 /// each persisted via [`save_config`](crate::collection::Collection). On
162 /// [`Collection::open`](crate::collection::VectorCollection) every listed
163 /// field is rebuilt from the recovered payloads (backfill), so an index
164 /// survives a process restart instead of silently vanishing — without
165 /// which the ordered-index `ORDER BY` fast path, the bitmap pre-filter,
166 /// `EXPLAIN` `IndexLookup`, and the index advisor would all change behaviour
167 /// after a restart (results stay correct via the exhaustive fallback).
168 ///
169 /// A `BTreeSet` so the on-disk ordering is deterministic. Backward
170 /// compatible: configs written before this field deserialize to an empty
171 /// set (no indexes restored), and an empty set is not serialized.
172 ///
173 /// Downgrade caveat: a pre-3d binary opening this config ignores the field
174 /// (no `deny_unknown_fields`), but the next `save_config` it performs
175 /// re-serializes without it, dropping the authority — a subsequent newer
176 /// binary then will not restore those indexes until `CREATE INDEX` is
177 /// re-issued. Bounded and fully recoverable (results stay correct via the
178 /// exhaustive fallback); no schema-version bump guards it, by design.
179 #[serde(default, skip_serializing_if = "BTreeSet::is_empty")]
180 pub indexed_fields: BTreeSet<String>,
181}
182
/// Unit tests for the PQ rescore oversampling setting of `CollectionConfig`:
/// its default, its serde backward compatibility, and the documented
/// candidate-count arithmetic.
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Builds a PQ-backed config whose only varying input is the rescore
    /// oversampling factor; every other field takes a neutral value.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            name: "test".to_string(),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            storage_mode: StorageMode::ProductQuantization,
            metadata_only: false,
            graph_schema: None,
            embedding_dimension: None,
            pq_rescore_oversampling: oversampling,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
            auto_reindex_config: None,
            #[cfg(feature = "persistence")]
            streaming_config: None,
            indexed_fields: BTreeSet::new(),
        }
    }

    /// Local mirror of the candidate-count formula documented on
    /// `pq_rescore_oversampling`: `max(k * factor, k + 32)`.
    ///
    /// These tests pin the documented arithmetic only; they do not call the
    /// search pipeline itself.
    fn candidates_k(k: usize, factor: usize) -> usize {
        k.saturating_mul(factor).max(k + 32)
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let config = make_config(default_pq_rescore_oversampling());
        assert_eq!(config.pq_rescore_oversampling, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // Default factor = 4, k = 10:
        // max(10 * 4, 10 + 32) = max(40, 42) = 42
        assert_eq!(candidates_k(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // factor = 6, k = 10:
        // max(10 * 6, 10 + 32) = max(60, 42) = 60
        assert_eq!(candidates_k(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let config = make_config(None);
        let oversampling = config.pq_rescore_oversampling.unwrap_or(0);
        assert_eq!(oversampling, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let config = make_config(default_pq_rescore_oversampling());
        assert!(
            config.pq_rescore_oversampling.is_some(),
            "Rescore must be active by default for PQ"
        );
        // Pattern guard instead of `unwrap()`: a `None` fails the assertion
        // with its message rather than panicking inside the condition.
        assert!(
            matches!(config.pq_rescore_oversampling, Some(factor) if factor > 0),
            "Default oversampling must be > 0"
        );
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // Simulate deserializing a config without the pq_rescore_oversampling
        // field. The serde default should kick in and set Some(4).
        let json = r#"{
            "name": "old_collection",
            "dimension": 128,
            "metric": "Euclidean",
            "point_count": 100,
            "storage_mode": "productquantization"
        }"#;
        let config: CollectionConfig = serde_json::from_str(json)
            .expect("config without pq_rescore_oversampling must still deserialize");
        assert_eq!(
            config.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // Even with small k, the floor k + 32 must dominate:
        // max(5 * 4, 5 + 32) = max(20, 37) = 37
        assert_eq!(candidates_k(5, 4), 37);
    }
}