1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
//! Collection configuration and schema versioning.
use crate::collection::streaming::AsyncIndexBuilderConfig;
use crate::distance::DistanceMetric;
use crate::index::hnsw::HnswParams;
use crate::quantization::StorageMode;
use serde::{Deserialize, Serialize};
use crate::collection::graph::GraphSchema;
/// Current on-disk schema version for `config.json`.
///
/// Increment this constant when the persisted format changes in a way that
/// older VelesDB versions cannot safely read. The `Collection::open()` path
/// rejects any `schema_version > CURRENT_SCHEMA_VERSION` with a clear error.
///
/// Files that carry no `schema_version` key are not affected by this value:
/// they deserialize through `default_schema_version`, which returns the
/// literal `1` rather than this constant.
pub const CURRENT_SCHEMA_VERSION: u32 = 1;
/// Serde fallback for `CollectionConfig::schema_version`.
///
/// A `config.json` written before schema versioning existed has no
/// `schema_version` key; such a file is treated as the initial format,
/// i.e. version 1.
fn default_schema_version() -> u32 {
    // Deliberately a fixed literal: legacy files are always "version 1",
    // regardless of what the current schema version is.
    const INITIAL_SCHEMA_VERSION: u32 = 1;
    INITIAL_SCHEMA_VERSION
}
/// Serde fallback for `CollectionConfig::pq_rescore_oversampling`: `Some(4)`.
///
/// The value is wrapped in `Option` because the field itself is an
/// `Option<u32>` (`None` = rescoring disabled) and serde requires the default
/// function to return the field's exact type.
// The wrap is mandated by the field type, so the clippy lint is a false positive here.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    const DEFAULT_OVERSAMPLING_FACTOR: u32 = 4;
    Some(DEFAULT_OVERSAMPLING_FACTOR)
}
/// Metadata for a collection.
///
/// This struct derives `Serialize`/`Deserialize` and is the shape persisted to
/// `config.json`. `name`, `dimension`, `metric` and `point_count` carry no
/// serde default and must therefore be present in the file; every other field
/// has a fallback so that older files keep loading.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CollectionConfig {
/// Name of the collection.
pub name: String,
/// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
pub dimension: usize,
/// Distance metric.
pub metric: DistanceMetric,
/// Number of points in the collection.
pub point_count: usize,
/// On-disk schema version for forward-compatibility detection.
///
/// When a newer VelesDB version writes a `config.json` with a higher
/// schema version, older versions will refuse to open the collection
/// rather than silently corrupting data.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `1` (the initial version) via `default_schema_version`.
#[serde(default = "default_schema_version")]
pub schema_version: u32,
/// Storage mode for vectors (e.g. Full, SQ8, Binary, ProductQuantization).
///
/// Falls back to `StorageMode::default()` when the field is absent.
#[serde(default)]
pub storage_mode: StorageMode,
/// Whether this is a metadata-only collection.
///
/// Defaults to `false` when the field is absent.
#[serde(default)]
pub metadata_only: bool,
/// Graph schema — `Some` iff this is a graph collection.
/// Persisted to config.json; `None` for vector and metadata collections.
/// Omitted from the serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub graph_schema: Option<GraphSchema>,
/// Embedding dimension for graph node vectors (None = no embeddings).
/// Only meaningful when `graph_schema` is `Some`.
/// Omitted from the serialized output when `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub embedding_dimension: Option<usize>,
/// PQ rescore oversampling factor. `Some(4)` by default.
///
/// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
/// and rescores them with full-precision ADC.
///
/// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
/// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
/// of 0 produces a candidates count of 0, which falls back to raw HNSW results.
/// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
///
/// The `Some(4)` default applies only when the key is missing from the file.
/// Unlike the other optional fields there is no `skip_serializing_if`, so the
/// value is always written out, including `None`.
// NOTE(review): with the formula quoted above, a factor of 0 would give
// `k + 32` candidates rather than 0 — confirm how the search pipeline
// actually special-cases `Some(0)` and align this description with it.
#[serde(default = "default_pq_rescore_oversampling")]
pub pq_rescore_oversampling: Option<u32>,
/// Custom HNSW index parameters (M, `ef_construction`, etc.).
///
/// When `Some`, these parameters are used to rebuild the HNSW index on
/// collection reopen if `hnsw.bin` does not yet exist (empty collection).
/// When `None`, the default `HnswParams::auto(dimension)` is used.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hnsw_params: Option<HnswParams>,
/// Deferred indexing configuration (US-366).
///
/// When `Some` and `enabled`, inserts are buffered in memory and
/// batch-merged into the HNSW index when the buffer reaches
/// `merge_threshold`. This decouples write latency from index cost.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None` (disabled).
///
/// This field only exists when the crate is built with the `persistence`
/// feature; struct literals must gate it with the same `cfg`.
#[cfg(feature = "persistence")]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
/// Async index builder configuration (Issue #488 — Bulk Insert V2).
///
/// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
/// insertion during bulk import. Buffered vectors are flushed to the
/// HNSW index via `HnswIndex::insert_batch_parallel`.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None` (disabled).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub async_index_builder: Option<AsyncIndexBuilderConfig>,
}
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Minimum number of extra candidates fetched on top of `k`.
    const CANDIDATE_FLOOR: usize = 32;

    /// Candidate-count formula as documented on
    /// `CollectionConfig::pq_rescore_oversampling`: `max(k * factor, k + 32)`.
    ///
    /// This is a local copy of the documented formula; it does not call into
    /// the search pipeline.
    fn candidates_for(k: usize, factor: usize) -> usize {
        std::cmp::max(k.saturating_mul(factor), k + CANDIDATE_FLOOR)
    }

    /// Builds an empty 128-dimensional Euclidean PQ collection config whose
    /// only varying knob is the rescore oversampling factor.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            // Identity and geometry.
            name: String::from("test"),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            storage_mode: StorageMode::ProductQuantization,
            // Bookkeeping.
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            metadata_only: false,
            // The knob under test.
            pq_rescore_oversampling: oversampling,
            // Everything optional stays off.
            graph_schema: None,
            embedding_dimension: None,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
        }
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let cfg = make_config(default_pq_rescore_oversampling());
        assert_eq!(cfg.pq_rescore_oversampling, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // k = 10 with the default factor 4: max(10 * 4, 10 + 32) = max(40, 42) = 42.
        assert_eq!(candidates_for(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // k = 10 with factor 6: max(10 * 6, 10 + 32) = max(60, 42) = 60.
        assert_eq!(candidates_for(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let cfg = make_config(None);
        // A missing factor collapses to the u32 default, i.e. 0.
        let effective = cfg.pq_rescore_oversampling.unwrap_or_default();
        assert_eq!(effective, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let factor = make_config(default_pq_rescore_oversampling()).pq_rescore_oversampling;
        assert!(
            factor.is_some(),
            "Rescore must be active by default for PQ"
        );
        // Only reached when `factor` is `Some`, so the guard is the sole condition.
        assert!(
            matches!(factor, Some(n) if n > 0),
            "Default oversampling must be > 0"
        );
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // A legacy config that predates `pq_rescore_oversampling`: the key is
        // absent, so the serde default must supply Some(4).
        let legacy_json = r#"{
"name": "old_collection",
"dimension": 128,
"metric": "Euclidean",
"point_count": 100,
"storage_mode": "productquantization"
}"#;
        let parsed: CollectionConfig = serde_json::from_str(legacy_json).unwrap();
        assert_eq!(
            parsed.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // For a small k the `k + 32` floor must dominate: max(5 * 4, 5 + 32) = max(20, 37) = 37.
        assert_eq!(candidates_for(5, 4), 37);
    }
}