1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
//! Collection configuration and schema versioning.
use crate::collection::auto_reindex::AutoReindexConfig;
use crate::collection::streaming::AsyncIndexBuilderConfig;
use crate::distance::DistanceMetric;
use crate::index::hnsw::HnswParams;
use crate::quantization::StorageMode;
use serde::{Deserialize, Serialize};
use std::collections::BTreeSet;
use crate::collection::graph::GraphSchema;
/// Current on-disk schema version for `config.json`.
///
/// Increment this constant when the persisted format changes in a way that
/// older VelesDB versions cannot safely read. The `Collection::open()` path
/// rejects any `schema_version > CURRENT_SCHEMA_VERSION` with a clear error.
///
/// Version history, as far as this file documents it:
/// - `1`: the implicit version of configs written before the
///   `schema_version` field existed (see `default_schema_version`).
/// - `2`: the version in which `CollectionConfig::auto_reindex_config` and
///   `CollectionConfig::streaming_config` are documented as being introduced.
pub const CURRENT_SCHEMA_VERSION: u32 = 2;
/// Serde fallback for the `schema_version` field of `CollectionConfig`.
///
/// A `config.json` that predates schema versioning carries no
/// `schema_version` key; such a file is treated as the initial on-disk
/// format, i.e. version 1.
fn default_schema_version() -> u32 {
    // Named so the meaning of the literal is visible at the return site.
    const INITIAL_SCHEMA_VERSION: u32 = 1;
    INITIAL_SCHEMA_VERSION
}
/// Serde fallback for the `pq_rescore_oversampling` field: a factor of 4.
///
/// The value is wrapped in `Some` because the field itself is an
/// `Option<u32>` in which `None` means rescoring is disabled; that is also
/// why the `unnecessary_wraps` lint is silenced here.
#[allow(clippy::unnecessary_wraps)]
fn default_pq_rescore_oversampling() -> Option<u32> {
    const DEFAULT_OVERSAMPLING_FACTOR: u32 = 4;
    Some(DEFAULT_OVERSAMPLING_FACTOR)
}
/// Metadata for a collection.
///
/// This is the value persisted to, and loaded from, a collection's
/// `config.json` through the serde `Serialize` / `Deserialize` derives.
///
/// Serde conventions visible on the fields below:
/// - `name`, `dimension`, `metric` and `point_count` carry no serde default,
///   so they must be present in the serialized form.
/// - Every other field has a `#[serde(default ...)]`, so a config written
///   before that field existed still deserializes.
/// - Fields marked `skip_serializing_if` are omitted from the output when
///   they are `None` (or empty, for `indexed_fields`).
/// - The struct does not use `deny_unknown_fields`, so unknown keys are
///   ignored on read.
///
/// `#[non_exhaustive]`: new fields are added over time (schema-versioned and
/// serde-defaulted), so external crates must obtain a `CollectionConfig` via
/// the `VectorCollection::create*` constructors / `Collection::config()` rather
/// than a struct literal — this keeps future field additions non-breaking.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[non_exhaustive]
pub struct CollectionConfig {
/// Name of the collection.
pub name: String,
/// Vector dimension (0 for metadata-only or graph-without-embeddings collections).
pub dimension: usize,
/// Distance metric.
pub metric: DistanceMetric,
/// Number of points in the collection.
pub point_count: usize,
/// On-disk schema version for forward-compatibility detection.
///
/// When a newer VelesDB version writes a `config.json` with a higher
/// schema version, older versions will refuse to open the collection
/// rather than silently corrupting data.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `1` (the initial version), supplied by
/// `default_schema_version`.
#[serde(default = "default_schema_version")]
pub schema_version: u32,
/// Storage mode for vectors (e.g. Full, SQ8, Binary, product quantization).
///
/// Falls back to `StorageMode::default()` when the key is absent.
#[serde(default)]
pub storage_mode: StorageMode,
/// Whether this is a metadata-only collection.
///
/// Falls back to `false` when the key is absent.
#[serde(default)]
pub metadata_only: bool,
/// Graph schema — `Some` iff this is a graph collection.
/// Persisted to config.json; `None` for vector and metadata collections.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub graph_schema: Option<GraphSchema>,
/// Embedding dimension for graph node vectors (None = no embeddings).
/// Only meaningful when `graph_schema` is `Some`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub embedding_dimension: Option<usize>,
/// PQ rescore oversampling factor. `Some(4)` by default.
///
/// The search pipeline fetches `max(k * factor, k + 32)` candidates from HNSW
/// and rescores them with full-precision ADC.
///
/// - `None`: disables rescore entirely (expert-only; risks silent recall collapse).
/// - `Some(0)`: treated as disabled (equivalent to `None`) — the oversampling factor
///   of 0 produces a candidates count of 0, which falls back to raw HNSW results.
/// - `Some(n)` where `n > 0`: enables rescore with `n`-fold oversampling.
///
/// NOTE(review): taken literally, the formula above gives `k + 32` (not 0)
/// for a factor of 0, so the "disabled" behaviour of `Some(0)` presumably
/// comes from an explicit check in the search pipeline — confirm against the
/// implementation and reword whichever statement is inaccurate.
///
/// Unlike the other optional fields this one has no `skip_serializing_if`,
/// so it is always written out, including an explicit `None`; a missing key
/// deserializes to `Some(4)` via `default_pq_rescore_oversampling`.
#[serde(default = "default_pq_rescore_oversampling")]
pub pq_rescore_oversampling: Option<u32>,
/// Custom HNSW index parameters (M, `ef_construction`, etc.).
///
/// When `Some`, these parameters are used to rebuild the HNSW index on
/// collection reopen if no persisted index exists yet (`native_meta.bin`
/// absent). When `None`, the default `HnswParams::auto(dimension)` is used.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None`.
#[serde(default, skip_serializing_if = "Option::is_none")]
pub hnsw_params: Option<HnswParams>,
/// Deferred indexing configuration (US-366).
///
/// When `Some` and `enabled`, inserts are buffered in memory and
/// batch-merged into the HNSW index when the buffer reaches
/// `merge_threshold`. This decouples write latency from index cost.
///
/// Only present when the crate is built with the `persistence` feature.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None` (disabled).
#[cfg(feature = "persistence")]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub deferred_indexing: Option<crate::collection::streaming::DeferredIndexerConfig>,
/// Async index builder configuration (Issue #488 — Bulk Insert V2).
///
/// When `Some`, enables the `AsyncIndexBuilder` for deferred HNSW
/// insertion during bulk import. Buffered vectors are flushed to the
/// HNSW index via `HnswIndex::insert_batch_parallel`.
///
/// Backward compatible: old `config.json` files without this field
/// deserialize to `None` (disabled).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub async_index_builder: Option<AsyncIndexBuilderConfig>,
/// Auto-reindex configuration (schema v2 — W2).
///
/// When `Some`, the [`AutoReindexManager`](crate::collection::auto_reindex::AutoReindexManager)
/// is restored automatically on [`Collection::open`](crate::collection::VectorCollection)
/// so the policy survives a process restart instead of requiring a manual
/// re-attach.
///
/// Backward compatible: v1 `config.json` files without this field
/// deserialize to `None` (no manager attached).
#[serde(default, skip_serializing_if = "Option::is_none")]
pub auto_reindex_config: Option<AutoReindexConfig>,
/// Streaming ingestion configuration (schema v2 — STREAM-7).
///
/// Describes the persisted shape (channel/batch sizing, flush timing) of
/// the streaming pipeline. The live `StreamIngester` is still created on
/// demand via `Collection::enable_streaming`; persisting the config lets a
/// future open-time hook re-enable streaming without a fresh API call.
///
/// Only present when the crate is built with the `persistence` feature.
///
/// Backward compatible: v1 `config.json` files without this field
/// deserialize to `None` (streaming not configured).
#[cfg(feature = "persistence")]
#[serde(default, skip_serializing_if = "Option::is_none")]
pub streaming_config: Option<crate::collection::streaming::StreamingConfig>,
/// Names of payload fields carrying a secondary metadata index
/// (`CREATE INDEX (<field>)`) — the persisted **authority** for which
/// indexes exist (EPIC-081 phase 3d).
///
/// `create_index` adds a field here and `drop_secondary_index` removes it,
/// each persisted via [`save_config`](crate::collection::Collection). On
/// [`Collection::open`](crate::collection::VectorCollection) every listed
/// field is rebuilt from the recovered payloads (backfill), so an index
/// survives a process restart instead of silently vanishing — without
/// which the ordered-index `ORDER BY` fast path, the bitmap pre-filter,
/// `EXPLAIN` `IndexLookup`, and the index advisor would all change behaviour
/// after a restart (results stay correct via the exhaustive fallback).
///
/// A `BTreeSet` so the on-disk ordering is deterministic. Backward
/// compatible: configs written before this field deserialize to an empty
/// set (no indexes restored), and an empty set is not serialized.
///
/// Downgrade caveat: a pre-3d binary opening this config ignores the field
/// (no `deny_unknown_fields`), but the next `save_config` it performs
/// re-serializes without it, dropping the authority — a subsequent newer
/// binary then will not restore those indexes until `CREATE INDEX` is
/// re-issued. Bounded and fully recoverable (results stay correct via the
/// exhaustive fallback); no schema-version bump guards it, by design.
#[serde(default, skip_serializing_if = "BTreeSet::is_empty")]
pub indexed_fields: BTreeSet<String>,
}
#[cfg(test)]
mod rescore_config_tests {
    use super::*;
    use crate::distance::DistanceMetric;
    use crate::quantization::StorageMode;

    /// Local restatement of the candidate-count formula documented on
    /// `pq_rescore_oversampling`: `max(k * factor, k + 32)`.
    ///
    /// These tests pin the arithmetic only; they do not call the search
    /// pipeline itself.
    fn expected_candidates(k: usize, factor: usize) -> usize {
        let scaled = k.saturating_mul(factor);
        let floor = k + 32;
        std::cmp::max(scaled, floor)
    }

    /// Builds an empty, 128-dimensional, PQ-backed config whose only varying
    /// knob is the rescore oversampling factor.
    fn make_config(oversampling: Option<u32>) -> CollectionConfig {
        CollectionConfig {
            name: String::from("test"),
            dimension: 128,
            metric: DistanceMetric::Euclidean,
            point_count: 0,
            schema_version: CURRENT_SCHEMA_VERSION,
            storage_mode: StorageMode::ProductQuantization,
            metadata_only: false,
            graph_schema: None,
            embedding_dimension: None,
            pq_rescore_oversampling: oversampling,
            hnsw_params: None,
            #[cfg(feature = "persistence")]
            deferred_indexing: None,
            async_index_builder: None,
            auto_reindex_config: None,
            #[cfg(feature = "persistence")]
            streaming_config: None,
            indexed_fields: BTreeSet::new(),
        }
    }

    #[test]
    fn rescore_default_oversampling_is_4() {
        let factor = make_config(default_pq_rescore_oversampling()).pq_rescore_oversampling;
        assert_eq!(factor, Some(4));
    }

    #[test]
    fn rescore_candidates_k_formula_default() {
        // factor = 4 (the default), k = 10:
        // max(10 * 4, 10 + 32) = max(40, 42) = 42
        assert_eq!(expected_candidates(10, 4), 42);
    }

    #[test]
    fn rescore_candidates_k_formula_custom_factor_6() {
        // factor = 6, k = 10:
        // max(10 * 6, 10 + 32) = max(60, 42) = 60
        assert_eq!(expected_candidates(10, 6), 60);
    }

    #[test]
    fn rescore_none_disables_oversampling() {
        let disabled = make_config(None);
        // `u32::default()` is 0, i.e. the "disabled" factor.
        let oversampling = disabled.pq_rescore_oversampling.unwrap_or_default();
        assert_eq!(oversampling, 0, "None should map to 0 (disabled)");
    }

    #[test]
    fn rescore_active_by_default_for_pq() {
        let factor = make_config(default_pq_rescore_oversampling()).pq_rescore_oversampling;
        assert!(
            factor.is_some(),
            "Rescore must be active by default for PQ"
        );
        assert!(
            matches!(factor, Some(n) if n > 0),
            "Default oversampling must be > 0"
        );
    }

    #[test]
    fn rescore_serde_default_backward_compat() {
        // A config written before `pq_rescore_oversampling` existed: the key
        // is absent, so the serde default must supply Some(4).
        let json = r#"{
"name": "old_collection",
"dimension": 128,
"metric": "Euclidean",
"point_count": 100,
"storage_mode": "productquantization"
}"#;
        let config = serde_json::from_str::<CollectionConfig>(json).unwrap();
        assert_eq!(
            config.pq_rescore_oversampling,
            Some(4),
            "Missing field must deserialize to Some(4) for backward compat"
        );
    }

    #[test]
    fn rescore_minimum_floor_preserved() {
        // With a small k the `k + 32` floor must dominate:
        // factor = 4, k = 5 -> max(20, 37) = 37
        assert_eq!(expected_candidates(5, 4), 37);
    }
}