vicinity 0.3.1

Approximate Nearest Neighbor Search: HNSW, DiskANN, IVF-PQ, ScaNN, quantization
#![allow(dead_code)]
//! Adaptive computation patterns for approximate nearest neighbor search.
//!
//! # The Core Insight
//!
//! Most ANN algorithms perform redundant computation. Consider HNSW search:
//!
//! ```text
//! Query arrives → Greedy traversal → Visit 100+ nodes → Return top-k
//!                                    Many distance computations lead nowhere
//! ```
//!
//! Adaptive computation asks: **can we skip work that won't change the result?**
//!
//! # Techniques
//!
//! This module implements several complementary approaches:
//!
//! | Technique | Key Idea | Speedup |
//! |-----------|----------|---------|
//! | **Early termination** | Stop once result is "good enough" | 1.5-2x |
//! | **Angle estimation** | Predict distances via projections | 1.2-1.6x (FINGER) |
//! | **Dimension sampling** | Use subset of dimensions first | 1.3-2x (ADSampling) |
//! | **Hubness avoidance** | Deprioritize high-degree nodes | Quality improvement |
//!
//! # Research Context
//!
//! - **FINGER** (Chen et al., WWW 2023): Estimate angles between residual vectors
//!   to skip distance computations in graph traversal.
//!
//! - **ADSampling** (Gao & Long, SIGMOD 2023): Sample dimensions adaptively,
//!   more from informative dimensions, fewer from redundant ones.
//!
//! - **Hubness** (Radovanović et al., JMLR 2010): In high-dimensional spaces, some points
//!   become "hubs" appearing as NN to many queries. These deserve special treatment.
//!
//! The techniques share a meta-principle: **compute just enough to be confident**.

/// Configuration for adaptive search.
#[derive(Debug, Clone)]
pub struct AdaptiveConfig {
    /// Minimum candidates to evaluate before considering early termination.
    pub min_candidates: usize,

    /// Confidence threshold for early termination (0.0-1.0).
    /// Higher = more conservative = better recall but slower.
    pub confidence_threshold: f32,

    /// Number of dimensions for initial projection (0 = disabled).
    pub projection_dims: usize,

    /// Whether to use dimension sampling.
    pub dimension_sampling: bool,

    /// Fraction of dimensions to sample in first pass (0.1-1.0).
    pub sampling_ratio: f32,

    /// Whether to track and deprioritize hubs.
    pub hubness_aware: bool,

    /// Threshold for hub detection (fraction of queries where node appears).
    pub hub_threshold: f32,
}

impl Default for AdaptiveConfig {
    fn default() -> Self {
        Self {
            min_candidates: 10,
            confidence_threshold: 0.9,
            projection_dims: 0,
            dimension_sampling: false,
            sampling_ratio: 0.25,
            hubness_aware: false,
            hub_threshold: 0.1,
        }
    }
}

impl AdaptiveConfig {
    /// Conservative config - maximize recall at cost of speed.
    pub fn conservative() -> Self {
        Self {
            min_candidates: 50,
            confidence_threshold: 0.99,
            projection_dims: 0,
            dimension_sampling: false,
            sampling_ratio: 1.0,
            hubness_aware: false,
            hub_threshold: 0.05,
        }
    }

    /// Aggressive config - maximize speed at cost of recall.
    pub fn aggressive() -> Self {
        Self {
            min_candidates: 5,
            confidence_threshold: 0.7,
            projection_dims: 32,
            dimension_sampling: true,
            sampling_ratio: 0.1,
            hubness_aware: true,
            hub_threshold: 0.15,
        }
    }
}
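
// Usage sketch (illustrative, not part of the API): start from a preset and
// tune a single knob via Rust's functional-update syntax. The 0.95 threshold
// below is an arbitrary example value, slightly more conservative than the
// default 0.9.
fn example_custom_config() -> AdaptiveConfig {
    AdaptiveConfig {
        confidence_threshold: 0.95,
        ..AdaptiveConfig::default()
    }
}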

/// Early termination oracle.
///
/// Tracks distance distributions to decide when enough candidates have been seen.
/// Based on the observation that once we've seen the nearest neighbors, additional
/// candidates are increasingly unlikely to be better.
#[derive(Debug)]
pub struct EarlyTerminationOracle {
    /// Best k distances seen so far.
    top_k_distances: Vec<f32>,
    /// Target k for search.
    k: usize,
    /// Total candidates evaluated.
    num_evaluated: usize,
    /// Running mean of distances (for distribution estimation).
    distance_mean: f32,
    /// Running variance of distances.
    distance_var: f32,
    /// Config.
    config: AdaptiveConfig,
}

impl EarlyTerminationOracle {
    /// Create new oracle for k-NN search.
    pub fn new(k: usize, config: AdaptiveConfig) -> Self {
        Self {
            top_k_distances: Vec::with_capacity(k),
            k,
            num_evaluated: 0,
            distance_mean: 0.0,
            distance_var: 0.0,
            config,
        }
    }

    /// Record a new candidate distance.
    pub fn observe(&mut self, distance: f32) {
        self.num_evaluated += 1;

        // Update running statistics (Welford's algorithm)
        let delta = distance - self.distance_mean;
        self.distance_mean += delta / self.num_evaluated as f32;
        let delta2 = distance - self.distance_mean;
        self.distance_var += delta * delta2;

        // Update top-k
        if self.top_k_distances.len() < self.k {
            self.top_k_distances.push(distance);
            self.top_k_distances.sort_unstable_by(|a, b| a.total_cmp(b));
        } else if distance < self.top_k_distances[self.k - 1] {
            self.top_k_distances[self.k - 1] = distance;
            self.top_k_distances.sort_unstable_by(|a, b| a.total_cmp(b));
        }
    }

    /// Should we stop searching?
    ///
    /// Returns true if we're confident we've found the true top-k.
    pub fn should_terminate(&self) -> bool {
        // Need minimum candidates
        if self.num_evaluated < self.config.min_candidates {
            return false;
        }

        // Need full top-k
        if self.top_k_distances.len() < self.k {
            return false;
        }

        // Estimate probability that a random new candidate would be in top-k
        // Under Gaussian assumption: P(X < threshold) = Φ((threshold - mean) / std)
        let variance = self.distance_var / (self.num_evaluated as f32 - 1.0).max(1.0);
        let std_dev = variance.sqrt().max(1e-9);

        let threshold = self.top_k_distances[self.k - 1];
        let z_score = (threshold - self.distance_mean) / std_dev;

        // Approximate Gaussian CDF (good enough for our purposes)
        // P(X < threshold) ≈ 1 / (1 + exp(-1.7 * z_score))
        let prob_better = 1.0 / (1.0 + (-1.7 * z_score).exp());

        // If probability of finding a better candidate is low enough, stop
        prob_better < (1.0 - self.config.confidence_threshold)
    }

    /// Get current best k distances.
    pub fn top_k(&self) -> &[f32] {
        &self.top_k_distances
    }

    /// Number of candidates evaluated.
    pub fn num_evaluated(&self) -> usize {
        self.num_evaluated
    }
}
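
// Usage sketch (illustrative only): plug the oracle into a plain linear scan.
// `l2_squared` is a hypothetical stand-in for whatever distance kernel the
// caller already has; a real search would track indices alongside distances.
fn example_scan_with_oracle(query: &[f32], corpus: &[Vec<f32>], k: usize) -> Vec<f32> {
    fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
        a.iter().zip(b).map(|(x, y)| (x - y) * (x - y)).sum()
    }

    let mut oracle = EarlyTerminationOracle::new(k, AdaptiveConfig::default());
    for v in corpus {
        oracle.observe(l2_squared(query, v));
        // Stop as soon as the oracle is confident the top-k has settled.
        if oracle.should_terminate() {
            break;
        }
    }
    oracle.top_k().to_vec()
}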

/// Dimension importance estimator for adaptive sampling.
///
/// Some dimensions are more informative than others. By estimating importance,
/// we can prioritize computation on the most discriminative dimensions.
#[derive(Debug, Clone)]
pub struct DimensionImportance {
    /// Variance per dimension (higher = more discriminative).
    variances: Vec<f32>,
    /// Mean per dimension (for centering).
    means: Vec<f32>,
    /// Sampling order (sorted by importance).
    importance_order: Vec<usize>,
    /// Number of vectors used to estimate.
    num_samples: usize,
}

impl DimensionImportance {
    /// Estimate importance from a sample of vectors.
    ///
    /// # Arguments
    ///
    /// * `vectors` - Flat array of vectors (row-major)
    /// * `num_vectors` - Number of vectors in sample
    /// * `dimension` - Vector dimension
    pub fn estimate(vectors: &[f32], num_vectors: usize, dimension: usize) -> Self {
        assert_eq!(vectors.len(), num_vectors * dimension);

        let mut means = vec![0.0f32; dimension];
        let mut variances = vec![0.0f32; dimension];

        // Compute means
        for i in 0..num_vectors {
            let vec = &vectors[i * dimension..(i + 1) * dimension];
            for (d, &v) in vec.iter().enumerate() {
                means[d] += v;
            }
        }
        for m in &mut means {
            *m /= num_vectors as f32;
        }

        // Compute variances
        for i in 0..num_vectors {
            let vec = &vectors[i * dimension..(i + 1) * dimension];
            for (d, &v) in vec.iter().enumerate() {
                let diff = v - means[d];
                variances[d] += diff * diff;
            }
        }
        for v in &mut variances {
            *v /= (num_vectors as f32 - 1.0).max(1.0);
        }

        // Sort dimensions by variance (descending)
        let mut importance_order: Vec<usize> = (0..dimension).collect();
        importance_order.sort_unstable_by(|&a, &b| variances[b].total_cmp(&variances[a]));

        Self {
            variances,
            means,
            importance_order,
            num_samples: num_vectors,
        }
    }

    /// Get dimension order sorted by importance.
    pub fn order(&self) -> &[usize] {
        &self.importance_order
    }

    /// Get top-k most important dimensions.
    pub fn top_dimensions(&self, k: usize) -> &[usize] {
        let k = k.min(self.importance_order.len());
        &self.importance_order[..k]
    }

    /// Get importance weight for a dimension.
    pub fn weight(&self, dim: usize) -> f32 {
        self.variances.get(dim).copied().unwrap_or(0.0)
    }

    /// Total variance (sum of all dimension variances).
    pub fn total_variance(&self) -> f32 {
        self.variances.iter().sum()
    }

    /// Cumulative variance explained by top-k dimensions.
    pub fn cumulative_variance(&self, k: usize) -> f32 {
        self.importance_order
            .iter()
            .take(k)
            .map(|&d| self.variances[d])
            .sum()
    }
}
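
// Usage sketch (illustrative only): estimate importance once from a corpus
// sample, then pick the shortest prefix of dimensions that covers a target
// share of total variance. The 0.9 target is an arbitrary example value.
fn example_pick_sample_size(vectors: &[f32], num_vectors: usize, dimension: usize) -> usize {
    let importance = DimensionImportance::estimate(vectors, num_vectors, dimension);
    let target = 0.9 * importance.total_variance();
    // Smallest k whose top-k dimensions explain at least 90% of the variance.
    (1..=dimension)
        .find(|&k| importance.cumulative_variance(k) >= target)
        .unwrap_or(dimension)
}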

/// Hubness tracker for identifying and handling hub nodes.
///
/// In high-dimensional spaces, some points become "hubs" - they appear as
/// nearest neighbors to many queries despite not being semantically close.
/// This is a symptom of the curse of dimensionality.
///
/// Tracking hub frequency lets us deprioritize them during search.
#[derive(Debug, Clone)]
pub struct HubnessTracker {
    /// Count of times each node appeared in top-k results.
    occurrence_counts: Vec<usize>,
    /// Total queries processed.
    num_queries: usize,
    /// K used for tracking.
    k: usize,
    /// Threshold for hub classification.
    hub_threshold: f32,
}

impl HubnessTracker {
    /// Create tracker for n nodes.
    pub fn new(num_nodes: usize, k: usize, hub_threshold: f32) -> Self {
        Self {
            occurrence_counts: vec![0; num_nodes],
            num_queries: 0,
            k,
            hub_threshold,
        }
    }

    /// Record a query result.
    pub fn record_result(&mut self, top_k_indices: &[usize]) {
        self.num_queries += 1;
        for &idx in top_k_indices.iter().take(self.k) {
            if idx < self.occurrence_counts.len() {
                self.occurrence_counts[idx] += 1;
            }
        }
    }

    /// Is this node a hub?
    pub fn is_hub(&self, node_idx: usize) -> bool {
        if self.num_queries == 0 {
            return false;
        }
        let occurrence_rate = self.occurrence_counts.get(node_idx).copied().unwrap_or(0) as f32
            / self.num_queries as f32;
        occurrence_rate > self.hub_threshold
    }

    /// Get hub score (higher = more hub-like).
    pub fn hub_score(&self, node_idx: usize) -> f32 {
        if self.num_queries == 0 {
            return 0.0;
        }
        self.occurrence_counts.get(node_idx).copied().unwrap_or(0) as f32 / self.num_queries as f32
    }

    /// Get all nodes classified as hubs.
    pub fn hubs(&self) -> Vec<usize> {
        (0..self.occurrence_counts.len())
            .filter(|&i| self.is_hub(i))
            .collect()
    }

    /// Get hub statistics.
    pub fn stats(&self) -> HubnessStats {
        let num_hubs = self.hubs().len();
        let max_occurrences = self.occurrence_counts.iter().max().copied().unwrap_or(0);
        let mean_occurrences = if self.occurrence_counts.is_empty() {
            0.0
        } else {
            self.occurrence_counts.iter().sum::<usize>() as f32
                / self.occurrence_counts.len() as f32
        };

        HubnessStats {
            num_hubs,
            total_nodes: self.occurrence_counts.len(),
            max_occurrences,
            mean_occurrences,
            queries_processed: self.num_queries,
        }
    }
}
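
// Usage sketch (illustrative only): penalize hubs at re-rank time by inflating
// their distances in proportion to hub score. The 0.5 penalty weight is an
// arbitrary example; a useful value is dataset-specific.
fn example_hub_penalized_rerank(tracker: &HubnessTracker, candidates: &mut [(usize, f32)]) {
    for (idx, dist) in candidates.iter_mut() {
        // Hub score is 0.0 for never-seen nodes, so non-hubs are unaffected.
        *dist *= 1.0 + 0.5 * tracker.hub_score(*idx);
    }
    candidates.sort_unstable_by(|a, b| a.1.total_cmp(&b.1));
}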

/// Statistics about hubness in the index.
#[derive(Debug, Clone)]
pub struct HubnessStats {
    /// Number of nodes classified as hubs.
    pub num_hubs: usize,
    /// Total nodes in index.
    pub total_nodes: usize,
    /// Maximum occurrence count for any node.
    pub max_occurrences: usize,
    /// Mean occurrence count.
    pub mean_occurrences: f32,
    /// Total queries processed.
    pub queries_processed: usize,
}

/// Sampled distance computation using dimension importance.
///
/// Computes distance using only a subset of dimensions, weighted by importance.
/// This provides an estimate that can be used for filtering before full computation.
///
/// Returns `(sampled_distance, estimated_full_distance)`.
pub fn sampled_l2_squared(
    a: &[f32],
    b: &[f32],
    importance: &DimensionImportance,
    sample_fraction: f32,
) -> (f32, f32) {
    debug_assert_eq!(a.len(), b.len());
    let dim = a.len();

    let num_samples = ((dim as f32 * sample_fraction) as usize).max(1).min(dim);
    let sampled_dims = importance.top_dimensions(num_samples);

    // Compute sampled distance
    let mut sampled_dist = 0.0f32;
    for &d in sampled_dims {
        let diff = a[d] - b[d];
        sampled_dist += diff * diff;
    }

    // Estimate full distance by scaling
    // Weight by fraction of variance captured
    let sampled_variance = importance.cumulative_variance(num_samples);
    let total_variance = importance.total_variance();

    let scale = if sampled_variance > 1e-9 {
        total_variance / sampled_variance
    } else {
        dim as f32 / num_samples as f32
    };

    let estimated_full = sampled_dist * scale;

    (sampled_dist, estimated_full)
}
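
// Usage sketch (illustrative only): the first tuple element is the raw partial
// sum over the sampled dimensions; the second rescales it by the inverse of the
// variance fraction captured, approximating the full distance.
fn example_estimated_distance(a: &[f32], b: &[f32], importance: &DimensionImportance) -> f32 {
    let (partial, estimated) = sampled_l2_squared(a, b, importance, 0.25);
    debug_assert!(estimated >= partial); // the scale factor is always >= 1
    estimated
}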

/// Two-phase distance computation with early rejection.
///
/// Phase 1: Compute distance on sampled dimensions
/// Phase 2: If estimate passes threshold, compute full distance
///
/// Returns None if rejected in phase 1.
pub fn two_phase_l2_squared(
    a: &[f32],
    b: &[f32],
    importance: &DimensionImportance,
    threshold: f32,
    sample_fraction: f32,
) -> Option<f32> {
    let (_, estimated) = sampled_l2_squared(a, b, importance, sample_fraction);

    // Reject if estimated distance exceeds threshold with margin
    if estimated > threshold * 1.5 {
        return None;
    }

    // Full computation
    let full_dist: f32 = a
        .iter()
        .zip(b.iter())
        .map(|(x, y)| {
            let d = x - y;
            d * d
        })
        .sum();

    Some(full_dist)
}
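
// Usage sketch (illustrative only): a filtered linear scan that maintains the
// running k-th-best distance as the rejection threshold. The 0.25 sampling
// ratio is an arbitrary example value.
fn example_two_phase_scan(
    query: &[f32],
    corpus: &[Vec<f32>],
    importance: &DimensionImportance,
    k: usize,
) -> Vec<(usize, f32)> {
    let mut best: Vec<(usize, f32)> = Vec::new();
    let mut threshold = f32::INFINITY;
    for (i, v) in corpus.iter().enumerate() {
        // Phase 1 runs inside `two_phase_l2_squared`; rejected candidates cost
        // only the sampled dimensions.
        if let Some(d) = two_phase_l2_squared(query, v, importance, threshold, 0.25) {
            best.push((i, d));
            best.sort_unstable_by(|x, y| x.1.total_cmp(&y.1));
            best.truncate(k);
            if best.len() == k {
                threshold = best[k - 1].1;
            }
        }
    }
    best
}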

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_early_termination_basic() {
        let config = AdaptiveConfig::default();
        let mut oracle = EarlyTerminationOracle::new(3, config);

        // Not enough candidates yet
        for i in 0..5 {
            oracle.observe(i as f32);
            assert!(!oracle.should_terminate());
        }

        // After many similar distances, should become confident
        for _ in 0..100 {
            oracle.observe(10.0 + rand_f32() * 0.1);
        }

        // With tight distribution and low k-th distance, might terminate
        // (depends on random values, so we just check it doesn't panic)
        let _ = oracle.should_terminate();
    }

    #[test]
    fn test_dimension_importance() {
        // Create vectors where first dimension has high variance
        let vectors: Vec<f32> = vec![
            0.0, 0.5, 0.5, // vec 0
            1.0, 0.5, 0.5, // vec 1
            2.0, 0.5, 0.5, // vec 2
            3.0, 0.5, 0.5, // vec 3
        ];

        let importance = DimensionImportance::estimate(&vectors, 4, 3);

        // First dimension should have highest importance
        assert_eq!(importance.order()[0], 0);

        // Cumulative variance should increase
        let cum1 = importance.cumulative_variance(1);
        let cum2 = importance.cumulative_variance(2);
        let cum3 = importance.cumulative_variance(3);
        assert!(cum1 <= cum2);
        assert!(cum2 <= cum3);
    }

    #[test]
    fn test_hubness_tracker() {
        let mut tracker = HubnessTracker::new(10, 3, 0.3);

        // Node 0 appears in every query
        for _ in 0..10 {
            tracker.record_result(&[0, 1, 2]);
        }

        // Node 5 appears less frequently
        for _ in 0..3 {
            tracker.record_result(&[5, 6, 7]);
        }

        assert!(tracker.is_hub(0)); // 10/13 ≈ 77% occurrence, above 0.3 threshold
        assert!(!tracker.is_hub(5)); // 3/13 ≈ 23% occurrence, below threshold

        let stats = tracker.stats();
        assert!(stats.num_hubs > 0);
    }

    #[test]
    fn test_sampled_distance() {
        let vectors: Vec<f32> = (0..100).map(|i| (i as f32 / 100.0) - 0.5).collect();
        let importance = DimensionImportance::estimate(&vectors, 10, 10);

        let a = vec![0.0f32; 10];
        let b: Vec<f32> = (0..10).map(|i| i as f32 * 0.1).collect();

        let (_sampled, estimated) = sampled_l2_squared(&a, &b, &importance, 0.5);

        // Estimated should be roughly proportional to full
        let full: f32 = a
            .iter()
            .zip(b.iter())
            .map(|(x, y)| {
                let d = x - y;
                d * d
            })
            .sum();

        // Just check it's in the right ballpark (within 2x)
        assert!(estimated > 0.0);
        assert!(estimated < full * 3.0);
    }

    fn rand_f32() -> f32 {
        use std::cell::Cell;
        thread_local! {
            static SEED: Cell<u32> = const { Cell::new(12345) };
        }
        SEED.with(|s| {
            let next = s.get().wrapping_mul(1103515245).wrapping_add(12345);
            s.set(next);
            (next as f32) / (u32::MAX as f32)
        })
    }
}