1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
#![cfg(feature = "hnsw")]
//! Regression tests for ADSampling.
//!
//! These tests encode specific bugs and edge cases found during development.
//! Each test documents what went wrong and why.
#![allow(clippy::unwrap_used)]
#[cfg(feature = "hnsw")]
mod tests {
use std::collections::HashSet;
use vicinity::adsampling::{ADSamplingParams, ADSamplingState};
use vicinity::hnsw::{HNSWIndex, HNSWParams};
use vicinity::DistanceMetric;
/// Generate deterministic pseudo-random vectors with entries in [0, 1].
///
/// Uses the Knuth MMIX linear congruential generator seeded with `seed`,
/// so the same (n, dim, seed) triple always produces identical data.
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<f32> {
    let mut state = seed;
    let mut out = Vec::with_capacity(n * dim);
    for _ in 0..n * dim {
        state = state
            .wrapping_mul(6364136223846793005)
            .wrapping_add(1442695040888963407);
        // Take the top 31 bits of the state and scale by 2^31 - 1.
        out.push(((state >> 33) as f32) / (u32::MAX >> 1) as f32);
    }
    out
}
/// Regression: ADSampling must use reordered vectors from HNSW.
///
/// Bug: ADSamplingState::new() was called with the original (pre-build)
/// vector array. After HNSWIndex::build(), vectors are reordered for cache
/// locality. search_with_distance() passes node IDs in reordered space,
/// so ADSampling looked up the wrong vectors, producing near-zero recall.
///
/// Fix: Use ADSamplingState::from_hnsw() which reads index.vectors_raw().
#[test]
fn adsampling_must_use_reordered_vectors() {
    let (dim, n, k, ef) = (64, 300, 10, 100);
    // Unit-normalize every row so the cosine metric is well behaved.
    let mut normalized = random_vectors(n, dim, 42);
    for row in normalized.chunks_exact_mut(dim) {
        let norm = row.iter().map(|x| x * x).sum::<f32>().sqrt();
        if norm > 0.0 {
            row.iter_mut().for_each(|x| *x /= norm);
        }
    }
    let params = HNSWParams {
        m: 16,
        m_max: 16,
        ef_construction: 100,
        seed: Some(42),
        ..Default::default()
    };
    let mut index = HNSWIndex::with_params(dim, params).unwrap();
    let ids: Vec<u32> = (0..n as u32).collect();
    index.add_batch(&ids, &normalized).unwrap();
    index.build().unwrap();
    // One state built from the index's reordered storage (the fix) and one
    // from the pre-build array (the regression being guarded against).
    let state_correct = ADSamplingState::from_hnsw(&index, ADSamplingParams::default());
    let state_wrong = ADSamplingState::new(&normalized, dim, ADSamplingParams::default());
    let query = &normalized[..dim];
    let hnsw_results = index.search(query, k, ef).unwrap();
    let correct_results = state_correct.search_hnsw(&index, query, k, ef).unwrap();
    let wrong_results = state_wrong.search_hnsw(&index, query, k, ef).unwrap();
    // Compare the closest hit's distance: early termination may lose some
    // recall deeper in the top-k, so the 10th result is noisier than the 1st.
    let hnsw_1st = hnsw_results[0].1;
    let correct_1st = correct_results[0].1;
    assert!(
        (hnsw_1st - correct_1st).abs() < 0.05,
        "from_hnsw() closest-dist should match HNSW: {hnsw_1st:.4} vs {correct_1st:.4}"
    );
    // The pre-reorder state looks up wrong vectors, so its closest-distance
    // gap from HNSW should exceed the fixed state's gap.
    let wrong_1st = wrong_results[0].1;
    let correct_gap = (hnsw_1st - correct_1st).abs();
    let wrong_gap = (hnsw_1st - wrong_1st).abs();
    if n > 50 {
        assert!(
            wrong_gap > correct_gap || correct_gap < 0.01,
            "Wrong version should diverge more: correct_gap={correct_gap:.4}, wrong_gap={wrong_gap:.4}"
        );
    }
}
/// Regression: ADSampling dist_comp must return L2 (sqrt), not L2².
///
/// Bug: dist_comp returned sum of squared diffs (L2²). HNSW graphs are
/// built with L2 (sqrt). The beam search found the same distances but
/// different nodes due to floating-point tie-breaking at squared scale.
///
/// Fix: dist_comp returns sqrt(partial_sum) and squares the threshold
/// internally for the early termination test.
#[test]
fn dist_comp_returns_l2_not_l2_squared() {
    let dim = 32;
    // Two vectors that differ by exactly 1.0 in every coordinate.
    let a: Vec<f32> = (0..dim).map(|i| i as f32).collect();
    let b: Vec<f32> = a.iter().map(|x| x + 1.0).collect();
    let mut vectors = a.clone();
    vectors.extend_from_slice(&b);
    let state = ADSamplingState::new(&vectors, dim, ADSamplingParams::default());
    let rq = state.rotate_query(&a);
    let dist = state.dist_comp(&rq, 1, f32::INFINITY).unwrap();
    // True L2 distance is sqrt(32 * 1.0²) ≈ 5.657; the buggy L2² was 32.0.
    let expected_l2 = (dim as f32).sqrt();
    assert!(
        (dist - expected_l2).abs() < 0.5,
        "dist_comp should return L2 (≈{expected_l2:.2}), got {dist:.2}"
    );
    assert!(
        dist < 10.0,
        "dist_comp returned {dist:.2}, which looks like L2² not L2"
    );
}
/// Regression: threshold near zero should not reject all candidates.
///
/// Bug: when the top-k heap contained a near-zero distance (self-match),
/// threshold * ratio ≈ 0, causing any non-zero partial_sum to trigger
/// rejection.
///
/// Fix: guard with `threshold_sq > 1e-10`.
#[test]
fn near_zero_threshold_does_not_reject_everything() {
    let dim = 64;
    // Three constant vectors: the query, a near-duplicate, and a distant one.
    let a = vec![1.0f32; dim];
    let mut vectors = a.clone();
    vectors.extend(std::iter::repeat(1.001f32).take(dim)); // very close
    vectors.extend(std::iter::repeat(2.0f32).take(dim)); // farther
    let state = ADSamplingState::new(&vectors, dim, ADSamplingParams::default());
    let rq = state.rotate_query(&a);
    // An infinite threshold must accept both candidates.
    let near_inf = state.dist_comp(&rq, 1, f32::INFINITY);
    let far_inf = state.dist_comp(&rq, 2, f32::INFINITY);
    assert!(
        near_inf.is_some(),
        "near neighbor accepted at INF threshold"
    );
    assert!(far_inf.is_some(), "far neighbor accepted at INF threshold");
    // A threshold 50% above the near neighbor's distance must still accept it.
    let near_dist = near_inf.unwrap();
    let near_tight = state.dist_comp(&rq, 1, near_dist * 1.5);
    assert!(
        near_tight.is_some(),
        "near neighbor should be accepted at threshold={:.4} (1.5x its distance {:.4})",
        near_dist * 1.5,
        near_dist
    );
    // threshold == 0.0 models a self-match already in the heap; the guard
    // must disable early termination instead of rejecting everything.
    let near_zero = state.dist_comp(&rq, 1, 0.0);
    assert!(
        near_zero.is_some(),
        "threshold=0 should disable early termination (guard: threshold_sq < 1e-10)"
    );
}
/// Regression: ADSampling on L2 (Euclidean) datasets should achieve
/// comparable recall to standard HNSW search.
///
/// This test uses unnormalized vectors with DistanceMetric::L2 to verify
/// that the from_hnsw + sqrt fix works end-to-end.
#[test]
fn adsampling_l2_recall_matches_hnsw() {
    let dim = 128;
    let n = 500;
    let k = 10;
    let ef = 100;
    // Unnormalized vectors in [0, 255]: same LCG as random_vectors, scaled.
    let mut s = 77u64;
    let vectors: Vec<f32> = (0..n * dim)
        .map(|_| {
            s = s
                .wrapping_mul(6364136223846793005)
                .wrapping_add(1442695040888963407);
            ((s >> 33) as f32) / (u32::MAX >> 1) as f32 * 255.0
        })
        .collect();
    let params = HNSWParams {
        m: 16,
        ef_construction: 100,
        metric: DistanceMetric::L2,
        seed: Some(42),
        ..Default::default()
    };
    let mut index = HNSWIndex::with_params(dim, params).unwrap();
    let ids: Vec<u32> = (0..n as u32).collect();
    index.add_batch(&ids, &vectors).unwrap();
    index.build().unwrap();
    let state = ADSamplingState::from_hnsw(&index, ADSamplingParams::default());
    // 20 held-out queries (seed differs from the indexed data).
    let test_vectors = random_vectors(20, dim, 999);
    let mut total_ads_recall = 0.0;
    for qi in 0..20 {
        let query = &test_vectors[qi * dim..(qi + 1) * dim];
        // Compare ADSampling against HNSW rather than brute force: search()
        // returns doc_ids, while brute force over vectors_raw() would yield
        // internal (reordered) IDs, and the mapping is not exposed here.
        // Both searches traverse the same graph, so the set overlap isolates
        // exactly the effect of ADSampling's early termination.
        // (A previously-present brute-force ground-truth computation was dead
        // code for this reason and has been removed.)
        let hnsw_results = index.search(query, k, ef).unwrap();
        let ads_results = state.search_hnsw(&index, query, k, ef).unwrap();
        let hnsw_ids: HashSet<u32> = hnsw_results.iter().map(|r| r.0).collect();
        let ads_ids: HashSet<u32> = ads_results.iter().map(|r| r.0).collect();
        let overlap = hnsw_ids.intersection(&ads_ids).count();
        total_ads_recall += overlap as f64 / k as f64;
    }
    let avg_parity = total_ads_recall / 20.0;
    assert!(
        avg_parity > 0.7,
        "ADSampling should find >=70% of HNSW's results on L2, got {:.1}%",
        avg_parity * 100.0
    );
}
/// Property: rotation preserves L2 distance for arbitrary vector pairs.
#[test]
fn rotation_preserves_l2_multiple_dimensions() {
    for &dim in &[16, 64, 256, 512] {
        let params = ADSamplingParams {
            seed: 42,
            ..Default::default()
        };
        let a = random_vectors(1, dim, 100);
        let b = random_vectors(1, dim, 200);
        let mut vectors = a.clone();
        vectors.extend_from_slice(&b);
        let state = ADSamplingState::new(&vectors, dim, params);
        let rq = state.rotate_query(&a);
        // L2 distance in the original (unrotated) space...
        let orig_l2 = a
            .iter()
            .zip(&b)
            .map(|(x, y)| (x - y) * (x - y))
            .sum::<f32>()
            .sqrt();
        // ...must survive the orthogonal rotation up to fp round-off.
        let rotated_l2 = state.dist_comp(&rq, 1, f32::INFINITY).unwrap();
        let rel_error = (orig_l2 - rotated_l2).abs() / orig_l2.max(1e-10);
        assert!(
            rel_error < 0.01,
            "L2 not preserved at dim={dim}: orig={orig_l2:.4}, rotated={rotated_l2:.4}, err={rel_error:.6}"
        );
    }
}
}