#![allow(clippy::unwrap_used, clippy::expect_used)]
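//! Property-based tests (proptest) for vicinity: distance metrics, recall@k,
//! HNSW memory and search behavior, LID and TwoNN intrinsic-dimension
//! estimators, and persistence round-trips.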
use proptest::prelude::*;
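// Metric properties of the L2 and cosine distance helpers below:
// non-negativity, symmetry, zero self-distance, and the triangle inequality
// (on the square root of the squared L2 distance).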
mod distance_props {
use super::*;
fn l2_distance_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| {
let d = x - y;
d * d
})
.sum()
}
fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm_a == 0.0 || norm_b == 0.0 {
return 1.0;
}
1.0 - dot / (norm_a * norm_b)
}
prop_compose! {
fn arb_vector(dim: usize)(vec in prop::collection::vec(-10.0f32..10.0, dim)) -> Vec<f32> {
vec
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn l2_distance_non_negative(
a in arb_vector(64),
b in arb_vector(64),
) {
let dist = l2_distance_squared(&a, &b);
prop_assert!(dist >= 0.0, "L2 distance must be non-negative, got {}", dist);
}
#[test]
fn l2_distance_symmetric(
a in arb_vector(32),
b in arb_vector(32),
) {
let d_ab = l2_distance_squared(&a, &b);
let d_ba = l2_distance_squared(&b, &a);
prop_assert!(
(d_ab - d_ba).abs() < 1e-6,
"L2 distance not symmetric: {} vs {}",
d_ab, d_ba
);
}
#[test]
fn l2_distance_self_is_zero(
a in arb_vector(32),
) {
let dist = l2_distance_squared(&a, &a);
prop_assert!(
dist.abs() < 1e-10,
"Distance to self should be 0, got {}",
dist
);
}
#[test]
fn l2_triangle_inequality(
a in arb_vector(16),
b in arb_vector(16),
c in arb_vector(16),
) {
let d_ac = l2_distance_squared(&a, &c).sqrt();
let d_ab = l2_distance_squared(&a, &b).sqrt();
let d_bc = l2_distance_squared(&b, &c).sqrt();
prop_assert!(
d_ac <= d_ab + d_bc + 1e-5,
"Triangle inequality violated: {} > {} + {}",
d_ac, d_ab, d_bc
);
}
#[test]
fn cosine_distance_in_range(
a in arb_vector(32),
b in arb_vector(32),
) {
let dist = cosine_distance(&a, &b);
prop_assert!(
(-0.001..=2.001).contains(&dist),
"Cosine distance out of range: {}",
dist
);
}
#[test]
fn cosine_distance_symmetric(
a in arb_vector(32),
b in arb_vector(32),
) {
let d_ab = cosine_distance(&a, &b);
let d_ba = cosine_distance(&b, &a);
prop_assert!(
(d_ab - d_ba).abs() < 1e-5,
"Cosine distance not symmetric: {} vs {}",
d_ab, d_ba
);
}
}
}
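// Recall@k properties: bounded in [0, 1], 1.0 for identical lists, 0.0 for
// disjoint lists, and non-decreasing as correct items replace incorrect ones.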
mod recall_props {
use super::*;
use std::collections::HashSet;
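// Recall over the top-k of both lists. The denominator is k itself rather
// than min(k, ground_truth.len()), so recall stays below 1.0 whenever the
// ground truth has fewer than k entries.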
fn recall_at_k(ground_truth: &[u32], retrieved: &[u32], k: usize) -> f32 {
if k == 0 || ground_truth.is_empty() {
return 0.0;
}
let gt_set: HashSet<u32> = ground_truth.iter().take(k).copied().collect();
let ret_set: HashSet<u32> = retrieved.iter().take(k).copied().collect();
gt_set.intersection(&ret_set).count() as f32 / k as f32
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(200))]
#[test]
fn recall_in_unit_interval(
gt in prop::collection::vec(0u32..1000, 1..50),
ret in prop::collection::vec(0u32..1000, 1..50),
k in 1usize..20,
) {
let recall = recall_at_k(&gt, &ret, k);
prop_assert!(
(0.0..=1.0).contains(&recall),
"Recall must be in [0,1], got {}",
recall
);
}
#[test]
fn perfect_recall_when_identical(
gt_set in prop::collection::hash_set(0u32..1000, 1..20),
) {
let gt: Vec<u32> = gt_set.into_iter().collect();
let k = gt.len();
let recall = recall_at_k(&gt, &gt, k);
prop_assert!(
(recall - 1.0).abs() < 1e-6,
"Recall should be 1.0 for identical sets, got {}",
recall
);
}
#[test]
fn zero_recall_disjoint_sets(
offset in 0u32..1000,
size in 1usize..20,
) {
let gt: Vec<u32> = (0..size as u32).collect();
let ret: Vec<u32> = (offset + 1000..(offset + 1000 + size as u32)).collect();
let recall = recall_at_k(&gt, &ret, size);
prop_assert!(
recall.abs() < 1e-6,
"Recall should be 0 for disjoint sets, got {}",
recall
);
}
#[test]
fn recall_monotonic_with_overlap(
base in prop::collection::vec(0u32..50, 10..20),
) {
let k = base.len();
let mut retrieved: Vec<u32> = (100..100 + k as u32).collect();
let mut prev_recall = recall_at_k(&base, &retrieved, k);
for (i, &gt_item) in base.iter().enumerate().take(k / 2) {
retrieved[i] = gt_item;
let new_recall = recall_at_k(&base, &retrieved, k);
prop_assert!(
new_recall >= prev_recall - 1e-6,
"Recall decreased from {} to {} when adding correct items",
prev_recall, new_recall
);
prev_recall = new_recall;
}
}
}
}
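// Monotonicity checks on a simple analytical memory model for HNSW:
// usage must grow with the vector count, the dimension, and M.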
mod memory_props {
use super::*;
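// Rough model: 4 bytes per f32 component for raw vectors, about 2.5 * M
// graph edges per node (averaged across layers) at 4 bytes each, plus a few
// bytes of per-vector metadata. The constants are illustrative, not measured.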
fn theoretical_hnsw_memory(n_vectors: usize, dimension: usize, m: usize) -> usize {
let raw_bytes = n_vectors * dimension * 4;
let avg_edges = (2.5 * m as f64) as usize;
let graph_bytes = n_vectors * avg_edges * 4;
let metadata_bytes = n_vectors * 5;
raw_bytes + graph_bytes + metadata_bytes
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn memory_increases_with_vectors(
n1 in 100usize..1000,
n2 in 1000usize..10000,
dim in 32usize..256,
m in 8usize..32,
) {
let mem1 = theoretical_hnsw_memory(n1, dim, m);
let mem2 = theoretical_hnsw_memory(n2, dim, m);
prop_assert!(
mem2 > mem1,
"Memory should increase with vectors: {} vs {}",
mem1, mem2
);
}
#[test]
fn memory_increases_with_dimension(
n in 1000usize..5000,
dim1 in 32usize..128,
dim2 in 256usize..512,
m in 8usize..24,
) {
let mem1 = theoretical_hnsw_memory(n, dim1, m);
let mem2 = theoretical_hnsw_memory(n, dim2, m);
prop_assert!(
mem2 > mem1,
"Memory should increase with dimension: {} vs {}",
mem1, mem2
);
}
#[test]
fn memory_increases_with_m(
n in 1000usize..5000,
dim in 64usize..256,
m1 in 4usize..16,
m2 in 24usize..48,
) {
let mem1 = theoretical_hnsw_memory(n, dim, m1);
let mem2 = theoretical_hnsw_memory(n, dim, m2);
prop_assert!(
mem2 > mem1,
"Memory should increase with M: {} (M={}) vs {} (M={})",
mem1, m1, mem2, m2
);
}
#[test]
fn memory_positive(
n in 1usize..10000,
dim in 1usize..512,
m in 1usize..64,
) {
let mem = theoretical_hnsw_memory(n, dim, m);
prop_assert!(mem > 0, "Memory must be positive");
}
}
}
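// Properties of exact (brute-force) ground truth: result count, ascending
// sort order, unique and in-range IDs, and distances that match recomputation.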
mod ground_truth_props {
use super::*;
fn l2_distance_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| {
let d = x - y;
d * d
})
.sum()
}
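// Exact k-NN by full scan: compute every distance, sort ascending with
// total_cmp (a NaN-safe total order), and keep the first k.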
fn compute_ground_truth(query: &[f32], database: &[Vec<f32>], k: usize) -> Vec<(u32, f32)> {
let mut distances: Vec<(u32, f32)> = database
.iter()
.enumerate()
.map(|(i, vec)| (i as u32, l2_distance_squared(query, vec)))
.collect();
distances.sort_by(|a, b| a.1.total_cmp(&b.1));
distances.truncate(k);
distances
}
prop_compose! {
fn arb_database(n: usize, dim: usize)(
db in prop::collection::vec(
prop::collection::vec(-5.0f32..5.0, dim),
n
)
) -> Vec<Vec<f32>> {
db
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(50))]
#[test]
fn ground_truth_returns_k_results(
db in arb_database(50, 16),
k in 1usize..20,
) {
let query: Vec<f32> = (0..16).map(|_| 0.0f32).collect();
let gt = compute_ground_truth(&query, &db, k);
let expected_k = k.min(db.len());
prop_assert_eq!(
gt.len(),
expected_k,
"Ground truth should return {} results, got {}",
expected_k,
gt.len()
);
}
#[test]
fn ground_truth_sorted_by_distance(
db in arb_database(30, 16),
) {
let query: Vec<f32> = (0..16).map(|i| i as f32 * 0.1).collect();
let gt = compute_ground_truth(&query, &db, db.len());
for i in 1..gt.len() {
prop_assert!(
gt[i].1 >= gt[i - 1].1,
"Ground truth not sorted: {} > {} at position {}",
gt[i - 1].1, gt[i].1, i
);
}
}
#[test]
fn ground_truth_unique_ids(
db in arb_database(30, 16),
) {
let query: Vec<f32> = (0..16).map(|i| i as f32 * 0.1).collect();
let gt = compute_ground_truth(&query, &db, db.len());
let ids: Vec<u32> = gt.iter().map(|(id, _)| *id).collect();
let mut sorted_ids = ids.clone();
sorted_ids.sort_unstable();
sorted_ids.dedup();
prop_assert_eq!(
ids.len(),
sorted_ids.len(),
"Ground truth contains duplicate IDs"
);
}
#[test]
fn ground_truth_ids_valid(
db in arb_database(30, 16),
) {
let query: Vec<f32> = (0..16).map(|i| i as f32 * 0.1).collect();
let gt = compute_ground_truth(&query, &db, db.len());
for (id, _) in &gt {
prop_assert!(
(*id as usize) < db.len(),
"Ground truth ID {} out of range (db size {})",
id, db.len()
);
}
}
#[test]
fn ground_truth_distances_match(
db in arb_database(20, 8),
) {
let query: Vec<f32> = (0..8).map(|i| i as f32 * 0.2).collect();
let gt = compute_ground_truth(&query, &db, db.len());
for (id, dist) in &gt {
let actual_dist = l2_distance_squared(&query, &db[*id as usize]);
prop_assert!(
(dist - actual_dist).abs() < 1e-5,
"Distance mismatch for ID {}: {} vs {}",
id, dist, actual_dist
);
}
}
}
}
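// Properties of the LID (local intrinsic dimensionality) estimator. The MLE
// is assumed to be the usual Hill-type estimator over the k nearest-neighbor
// distances, so estimates should be positive and roughly scale invariant.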
mod lid_props {
use super::*;
use vicinity::lid::{estimate_lid_mle, LidConfig, LidEstimate, LidStats};
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn lid_positive_for_valid_distances(
base in 0.01f32..0.1,
increments in prop::collection::vec(0.01f32..0.5, 19),
) {
let mut distances = vec![base];
let mut cumsum = base;
for inc in increments {
cumsum += inc;
distances.push(cumsum);
}
let config = LidConfig::default();
let estimate = estimate_lid_mle(&distances, &config);
// +inf compares greater than 0.0, so a degenerate (infinite) estimate still
// passes, while -inf and NaN are rejected.
prop_assert!(estimate.lid > 0.0,
"LID should be positive, got {}", estimate.lid);
}
#[test]
fn lid_scale_invariant(
base in 0.01f32..0.1,
increments in prop::collection::vec(0.01f32..0.5, 19),
scale in 0.1f32..10.0,
) {
let mut distances = vec![base];
let mut cumsum = base;
for inc in &increments {
cumsum += inc;
distances.push(cumsum);
}
let scaled: Vec<f32> = distances.iter().map(|d| d * scale).collect();
let config = LidConfig::default();
let est1 = estimate_lid_mle(&distances, &config);
let est2 = estimate_lid_mle(&scaled, &config);
if est1.lid.is_finite() && est2.lid.is_finite() {
let relative_diff = (est1.lid - est2.lid).abs() / est1.lid.max(1.0);
prop_assert!(relative_diff < 0.3,
"LID not scale invariant: {} vs {} (scale={})",
est1.lid, est2.lid, scale);
}
}
#[test]
fn lid_respects_k(
distances in prop::collection::vec(0.1f32..10.0, 30..50),
k in 5usize..25,
) {
let mut sorted = distances.clone();
sorted.sort_by(|a, b| a.total_cmp(b));
let config = LidConfig { k, epsilon: 1e-10 };
let estimate = estimate_lid_mle(&sorted, &config);
prop_assert_eq!(estimate.k, k.min(sorted.len()),
"k should be min of config.k and distances.len()");
}
#[test]
fn lid_stats_valid(
lids in prop::collection::vec(1.0f32..50.0, 10..30),
) {
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let stats = LidStats::from_estimates(&estimates);
prop_assert_eq!(stats.count, estimates.len());
prop_assert!(stats.min <= stats.mean);
prop_assert!(stats.mean <= stats.max);
prop_assert!(stats.std_dev >= 0.0);
}
#[test]
fn high_lid_threshold_above_median(
lids in prop::collection::vec(1.0f32..50.0, 5..20),
) {
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let stats = LidStats::from_estimates(&estimates);
if stats.std_dev > 0.0 {
prop_assert!(stats.high_lid_threshold() > stats.median,
"threshold {} should be > median {}",
stats.high_lid_threshold(), stats.median);
}
}
#[test]
fn lid_categorization_consistent(
lids in prop::collection::vec(1.0f32..100.0, 20..50),
) {
use vicinity::lid::LidCategory;
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let stats = LidStats::from_estimates(&estimates);
for &lid in &lids {
let category = stats.categorize(lid);
match category {
LidCategory::Low => {
prop_assert!(lid < stats.median - stats.std_dev + 1e-5,
"Low category but lid {} >= median - std = {}",
lid, stats.median - stats.std_dev);
}
LidCategory::High => {
prop_assert!(lid > stats.median + stats.std_dev - 1e-5,
"High category but lid {} <= median + std = {}",
lid, stats.median + stats.std_dev);
}
LidCategory::Normal => {
}
}
}
}
#[test]
fn lid_uniform_distances_finite(
n in 10usize..50,
step in 0.01f32..0.5,
) {
let distances: Vec<f32> = (0..n).map(|i| (i + 1) as f32 * step).collect();
let config = LidConfig::default();
let estimate = estimate_lid_mle(&distances, &config);
prop_assert!(estimate.lid.is_finite(),
"LID should be finite for uniform distances, got {}", estimate.lid);
prop_assert!(estimate.lid > 0.0,
"LID should be positive, got {}", estimate.lid);
}
}
}
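// Properties of the TwoNN intrinsic-dimension estimator, which operates on
// ratios of second- to first-nearest-neighbor distances: positivity, CI
// ordering, agreement between the plain and CI variants, and graceful
// handling of degenerate or equidistant inputs.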
mod twonn_props {
use super::*;
use vicinity::lid::{estimate_twonn, estimate_twonn_with_ci};
proptest! {
#![proptest_config(ProptestConfig::with_cases(200))]
#[test]
fn twonn_positive_dimension(
n in 20usize..100,
seed in any::<u64>(),
) {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
let h = hasher.finish();
let ratios: Vec<f32> = (0..n)
.map(|i| {
let x = ((h.wrapping_mul(i as u64 + 1)) % 10000) as f32 / 10000.0;
1.0 + x * 3.0
})
.collect();
let dim = estimate_twonn(&ratios, 0.1);
if dim.is_finite() {
prop_assert!(dim > 0.0, "TwoNN dimension should be positive, got {}", dim);
}
}
#[test]
fn twonn_ci_ordering(
n in 50usize..200,
seed in any::<u64>(),
) {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
let h = hasher.finish();
let ratios: Vec<f32> = (0..n)
.map(|i| {
let x = ((h.wrapping_mul(i as u64 + 1)) % 10000) as f32 / 10000.0;
1.0 + x * 2.0
})
.collect();
let result = estimate_twonn_with_ci(&ratios, 0.1);
if result.dimension.is_finite() && result.ci_lower.is_finite() && result.ci_upper.is_finite() {
prop_assert!(
result.ci_lower <= result.dimension,
"CI lower {} should be <= dimension {}",
result.ci_lower, result.dimension
);
prop_assert!(
result.dimension <= result.ci_upper,
"dimension {} should be <= CI upper {}",
result.dimension, result.ci_upper
);
}
}
#[test]
fn twonn_consistency(
n in 50usize..200,
seed in any::<u64>(),
) {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
let h = hasher.finish();
let ratios: Vec<f32> = (0..n)
.map(|i| {
let x = ((h.wrapping_mul(i as u64 + 1)) % 10000) as f32 / 10000.0;
1.0 + x * 2.0
})
.collect();
let dim1 = estimate_twonn(&ratios, 0.1);
let result2 = estimate_twonn_with_ci(&ratios, 0.1);
if dim1.is_finite() && result2.dimension.is_finite() {
let rel_diff = (dim1 - result2.dimension).abs() / dim1.max(result2.dimension);
prop_assert!(
rel_diff < 0.5,
"TwoNN methods disagree too much: {} vs {} (rel diff {})",
dim1, result2.dimension, rel_diff
);
}
}
#[test]
fn twonn_discard_sensitivity(
n in 100usize..300,
seed in any::<u64>(),
) {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
seed.hash(&mut hasher);
let h = hasher.finish();
let ratios: Vec<f32> = (0..n)
.map(|i| {
let x = ((h.wrapping_mul(i as u64 + 1)) % 10000) as f32 / 10000.0;
if i < n - 5 {
1.0 + x * 2.0
} else {
5.0 + x * 10.0
}
})
.collect();
let dim_no_discard = estimate_twonn(&ratios, 0.0);
let dim_with_discard = estimate_twonn(&ratios, 0.1);
if dim_no_discard.is_finite() && dim_with_discard.is_finite() {
prop_assert!(
dim_no_discard > 0.0 || dim_with_discard > 0.0,
"At least one estimate should be positive"
);
}
}
#[test]
fn twonn_empty_returns_nan(
n in 0usize..2,
) {
let ratios: Vec<f32> = (0..n).map(|i| 1.0 + i as f32 * 0.5).collect();
let dim = estimate_twonn(&ratios, 0.1);
let result = estimate_twonn_with_ci(&ratios, 0.1);
if n < 2 {
prop_assert!(dim.is_nan(), "Should return NaN for n={}", n);
prop_assert!(result.dimension.is_nan(), "CI version should return NaN for n={}", n);
}
}
#[test]
fn twonn_handles_equidistant(
n in 20usize..50,
frac_equidistant in 0.0f32..0.5,
) {
let n_equidistant = (n as f32 * frac_equidistant) as usize;
let ratios: Vec<f32> = (0..n)
.map(|i| {
if i < n_equidistant {
1.0
} else {
1.1 + (i as f32 * 0.1) % 2.0
}
})
.collect();
let dim = estimate_twonn(&ratios, 0.1);
if n - n_equidistant >= 3 {
prop_assert!(dim.is_finite() || dim.is_nan(),
"Should handle equidistant neighbors gracefully");
}
}
}
}
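// LID aggregation: the harmonic mean never exceeds the arithmetic mean
// (AM-HM inequality), the median stays within the data range, and every
// aggregate is finite for finite positive inputs.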
mod lid_aggregation_props {
use super::*;
use vicinity::lid::{aggregate_lid, LidAggregation, LidEstimate};
proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn harmonic_le_arithmetic(
lids in prop::collection::vec(0.5f32..50.0, 5..30),
) {
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let mean = aggregate_lid(&estimates, LidAggregation::Mean);
let harmonic = aggregate_lid(&estimates, LidAggregation::HarmonicMean);
prop_assert!(
harmonic <= mean + 1e-6,
"Harmonic mean {} should be <= arithmetic mean {}",
harmonic, mean
);
}
#[test]
fn median_bounded(
lids in prop::collection::vec(1.0f32..100.0, 3..50),
) {
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let median = aggregate_lid(&estimates, LidAggregation::Median);
let min_lid = lids.iter().cloned().reduce(f32::min).unwrap();
let max_lid = lids.iter().cloned().reduce(f32::max).unwrap();
prop_assert!(
median >= min_lid - 1e-6 && median <= max_lid + 1e-6,
"Median {} should be in [{}, {}]",
median, min_lid, max_lid
);
}
#[test]
fn aggregation_finite(
lids in prop::collection::vec(1.0f32..50.0, 5..20),
) {
let estimates: Vec<LidEstimate> = lids.iter()
.map(|&lid| LidEstimate { lid, k: 20, max_dist: 1.0 })
.collect();
let mean = aggregate_lid(&estimates, LidAggregation::Mean);
let median = aggregate_lid(&estimates, LidAggregation::Median);
let harmonic = aggregate_lid(&estimates, LidAggregation::HarmonicMean);
prop_assert!(mean.is_finite(), "Mean should be finite");
prop_assert!(median.is_finite(), "Median should be finite");
prop_assert!(harmonic.is_finite(), "Harmonic mean should be finite");
}
}
}
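// Metric-space axioms checked directly on raw vector arithmetic, plus
// bilinearity of the dot product and direction preservation under scaling.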
mod metric_space_props {
use super::*;
fn arb_vec(dim: usize) -> impl Strategy<Value = Vec<f32>> {
prop::collection::vec(-10.0f32..10.0, dim)
}
fn normalize(v: &[f32]) -> Vec<f32> {
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if norm < 1e-10 {
v.to_vec()
} else {
v.iter().map(|x| x / norm).collect()
}
}
fn l2_squared(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| (x - y).powi(2)).sum()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(200))]
#[test]
fn cosine_distance_bounded(
a in arb_vec(32).prop_filter("non-zero", |v| v.iter().any(|x| x.abs() > 1e-6)),
b in arb_vec(32).prop_filter("non-zero", |v| v.iter().any(|x| x.abs() > 1e-6)),
) {
let a_norm = normalize(&a);
let b_norm = normalize(&b);
let dot: f32 = a_norm.iter().zip(b_norm.iter()).map(|(x, y)| x * y).sum();
let cosine_dist = 1.0 - dot;
prop_assert!(
(-0.01..=2.01).contains(&cosine_dist),
"Cosine distance out of bounds: {}",
cosine_dist
);
}
#[test]
fn cosine_distance_self_zero(
v in arb_vec(32).prop_filter("non-zero", |v| v.iter().any(|x| x.abs() > 1e-6)),
) {
let v_norm = normalize(&v);
let dot: f32 = v_norm.iter().map(|x| x * x).sum();
let cosine_dist = 1.0 - dot;
prop_assert!(
cosine_dist.abs() < 1e-5,
"Cosine distance to self should be 0, got {}",
cosine_dist
);
}
#[test]
fn l2_identity_of_indiscernibles(v in arb_vec(32)) {
let dist = l2_squared(&v, &v);
prop_assert!(dist.abs() < 1e-6, "L2(v, v) should be 0, got {}", dist);
}
#[test]
fn l2_symmetric_test(
a in arb_vec(32),
b in arb_vec(32),
) {
let ab = l2_squared(&a, &b);
let ba = l2_squared(&b, &a);
prop_assert!(
(ab - ba).abs() < 1e-6,
"L2 not symmetric: {} != {}",
ab, ba
);
}
#[test]
fn l2_triangle_inequality_test(
a in arb_vec(16),
b in arb_vec(16),
c in arb_vec(16),
) {
let ab = l2_squared(&a, &b).sqrt();
let bc = l2_squared(&b, &c).sqrt();
let ac = l2_squared(&a, &c).sqrt();
prop_assert!(
ac <= ab + bc + 1e-4,
"Triangle inequality violated: {} > {} + {} = {}",
ac, ab, bc, ab + bc
);
}
#[test]
fn dot_bilinear(
a in arb_vec(32),
b in arb_vec(32),
c in arb_vec(32),
) {
let ab: Vec<f32> = a.iter().zip(b.iter()).map(|(x, y)| x + y).collect();
let dot_ab_c: f32 = ab.iter().zip(c.iter()).map(|(x, y)| x * y).sum();
let dot_a_c: f32 = a.iter().zip(c.iter()).map(|(x, y)| x * y).sum();
let dot_b_c: f32 = b.iter().zip(c.iter()).map(|(x, y)| x * y).sum();
let expected = dot_a_c + dot_b_c;
let tolerance = expected.abs() * 1e-4 + 1e-4;
prop_assert!(
(dot_ab_c - expected).abs() < tolerance,
"Bilinearity violated: {} != {} + {} = {}",
dot_ab_c, dot_a_c, dot_b_c, expected
);
}
#[test]
fn scaling_preserves_direction(
v in arb_vec(32).prop_filter("non-zero", |v| v.iter().any(|x| x.abs() > 1e-6)),
scale in 0.1f32..10.0,
) {
let v_norm = normalize(&v);
let scaled: Vec<f32> = v.iter().map(|x| x * scale).collect();
let scaled_norm = normalize(&scaled);
for (a, b) in v_norm.iter().zip(scaled_norm.iter()) {
prop_assert!(
(a - b).abs() < 1e-5,
"Scaling changed direction"
);
}
}
}
}
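// Black-box HNSW search properties: result count, ascending sort order,
// unique IDs, and determinism across repeated identical queries.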
#[cfg(feature = "hnsw")]
mod hnsw_props {
use super::*;
use vicinity::hnsw::HNSWIndex;
fn normalize(v: &[f32]) -> Vec<f32> {
let n: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if n < 1e-10 {
v.to_vec()
} else {
v.iter().map(|x| x / n).collect()
}
}
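// Deterministic pseudo-random unit vectors derived by hashing (seed, i, j),
// so any failing case reproduces exactly from the proptest-generated seed.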
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use std::hash::{Hash, Hasher};
(0..n)
.map(|i| {
let raw: Vec<f32> = (0..dim)
.map(|j| {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
seed.hash(&mut hasher);
i.hash(&mut hasher);
j.hash(&mut hasher);
let h = hasher.finish();
(h as f64 / u64::MAX as f64 * 2.0 - 1.0) as f32
})
.collect();
normalize(&raw)
})
.collect()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(50))]
#[test]
fn hnsw_returns_k_results(
n in 20usize..100,
k in 1usize..20,
seed in any::<u64>(),
) {
let dim = 16;
let k = k.min(n);
let vectors = random_vectors(n, dim, seed);
let mut hnsw = HNSWIndex::new(dim, 16, 16).expect("Failed to create");
for (i, v) in vectors.iter().enumerate() {
hnsw.add(i as u32, v.clone()).expect("Failed to add");
}
hnsw.build().expect("Failed to build");
let results = hnsw.search(&vectors[0], k, 100).expect("Search failed");
prop_assert_eq!(
results.len(),
k,
"Should return {} results, got {}",
k,
results.len()
);
}
#[test]
fn hnsw_results_sorted(
n in 30usize..80,
seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_vectors(n, dim, seed);
let mut hnsw = HNSWIndex::new(dim, 16, 16).expect("Failed to create");
for (i, v) in vectors.iter().enumerate() {
hnsw.add(i as u32, v.clone()).expect("Failed to add");
}
hnsw.build().expect("Failed to build");
let results = hnsw.search(&vectors[n / 2], 10, 100).expect("Search failed");
for i in 1..results.len() {
prop_assert!(
results[i].1 >= results[i - 1].1 - 1e-6,
"Results not sorted: {} > {} at position {}",
results[i - 1].1,
results[i].1,
i
);
}
}
#[test]
fn hnsw_unique_results(
n in 30usize..80,
seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_vectors(n, dim, seed);
let mut hnsw = HNSWIndex::new(dim, 16, 16).expect("Failed to create");
for (i, v) in vectors.iter().enumerate() {
hnsw.add(i as u32, v.clone()).expect("Failed to add");
}
hnsw.build().expect("Failed to build");
let results = hnsw.search(&vectors[0], 15, 100).expect("Search failed");
let ids: std::collections::HashSet<u32> = results.iter().map(|(id, _)| *id).collect();
prop_assert_eq!(
ids.len(),
results.len(),
"Result IDs are not unique"
);
}
#[test]
fn hnsw_deterministic(
n in 30usize..60,
seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_vectors(n, dim, seed);
let mut hnsw = HNSWIndex::new(dim, 16, 16).expect("Failed to create");
for (i, v) in vectors.iter().enumerate() {
hnsw.add(i as u32, v.clone()).expect("Failed to add");
}
hnsw.build().expect("Failed to build");
let query = &vectors[0];
let results1 = hnsw.search(query, 10, 100).expect("Search failed");
let results2 = hnsw.search(query, 10, 100).expect("Search failed");
let results3 = hnsw.search(query, 10, 100).expect("Search failed");
let ids1: std::collections::HashSet<u32> = results1.iter().map(|(id, _)| *id).collect();
let ids2: std::collections::HashSet<u32> = results2.iter().map(|(id, _)| *id).collect();
let ids3: std::collections::HashSet<u32> = results3.iter().map(|(id, _)| *id).collect();
prop_assert_eq!(&ids1, &ids2, "Results not deterministic (run 1 vs 2)");
prop_assert_eq!(&ids2, &ids3, "Results not deterministic (run 2 vs 3)");
}
}
}
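// Persistence round-trips: a written-then-loaded index must return the same
// doc IDs and distances as the original, and search must only surface doc
// IDs that were actually added.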
#[cfg(all(feature = "persistence", feature = "hnsw"))]
mod persistence_props {
use super::*;
use std::collections::HashSet;
use vicinity::hnsw::HNSWIndex;
use vicinity::persistence::directory::MemoryDirectory;
use vicinity::persistence::hnsw::{HNSWSegmentReader, HNSWSegmentWriter};
fn normalize(v: &[f32]) -> Vec<f32> {
let n: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
if n < 1e-10 {
v.to_vec()
} else {
v.iter().map(|x| x / n).collect()
}
}
fn random_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use std::hash::{Hash, Hasher};
(0..n)
.map(|i| {
let raw: Vec<f32> = (0..dim)
.map(|j| {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
seed.hash(&mut hasher);
i.hash(&mut hasher);
j.hash(&mut hasher);
let h = hasher.finish();
(h as f64 / u64::MAX as f64 * 2.0 - 1.0) as f32
})
.collect();
normalize(&raw)
})
.collect()
}
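// n unique, non-contiguous doc IDs (>= 1000). IDs that differ from insertion
// order catch bugs where internal node indices leak out in place of
// user-facing doc IDs.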
fn nonsequential_doc_ids(n: usize, seed: u64) -> Vec<u32> {
use std::hash::{Hash, Hasher};
let mut ids: HashSet<u32> = HashSet::new();
let mut i = 0u64;
while ids.len() < n {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
seed.hash(&mut hasher);
i.hash(&mut hasher);
let h = hasher.finish();
let id = 1000 + (h % 99_000) as u32;
ids.insert(id);
i += 1;
}
ids.into_iter().collect()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test]
fn prop_persistence_preserves_doc_ids(
n in 10usize..50,
seed in any::<u64>(),
) {
let dim = 8;
let vectors = random_vectors(n, dim, seed);
let doc_ids = nonsequential_doc_ids(n, seed);
let mut original = HNSWIndex::new(dim, 8, 8).expect("create");
for (i, v) in vectors.iter().enumerate() {
original.add(doc_ids[i], v.clone()).expect("add");
}
original.build().expect("build");
let mem = MemoryDirectory::new();
let mut writer = HNSWSegmentWriter::new(Box::new(mem.clone()), 1);
writer.write_hnsw_index(&original).expect("write");
let reader = HNSWSegmentReader::load(Box::new(mem.clone()), 1).expect("load");
let loaded = reader.load_index().expect("load_index");
let query = &vectors[0];
let k = 5.min(n);
let ef = 50;
let original_results = original.search(query, k, ef).expect("search original");
let loaded_results = loaded.search(query, k, ef).expect("search loaded");
let original_ids: HashSet<u32> = original_results.iter().map(|(id, _)| *id).collect();
let loaded_ids: HashSet<u32> = loaded_results.iter().map(|(id, _)| *id).collect();
let doc_id_set: HashSet<u32> = doc_ids.iter().copied().collect();
for id in &loaded_ids {
prop_assert!(
doc_id_set.contains(id),
"Search returned ID {} which was never added (internal index leak?)",
id
);
}
prop_assert_eq!(
original_ids, loaded_ids,
"Loaded index returns different doc_ids than original"
);
}
#[test]
fn prop_persistence_vector_roundtrip(
n in 10usize..40,
dim in 8usize..32,
seed in any::<u64>(),
) {
let vectors = random_vectors(n, dim, seed);
let mut original = HNSWIndex::new(dim, 16, 16).expect("create");
for (i, v) in vectors.iter().enumerate() {
original.add(i as u32, v.clone()).expect("add");
}
original.build().expect("build");
let mem = MemoryDirectory::new();
let mut writer = HNSWSegmentWriter::new(Box::new(mem.clone()), 1);
writer.write_hnsw_index(&original).expect("write");
let reader = HNSWSegmentReader::load(Box::new(mem.clone()), 1).expect("load");
let loaded = reader.load_index().expect("load_index");
let k = 5.min(n);
let ef = n * 2;
for (i, v) in vectors.iter().enumerate().take(5) {
let original_results = original.search(v, k, ef).expect("search original");
let loaded_results = loaded.search(v, k, ef).expect("search loaded");
let original_ids: Vec<u32> = original_results.iter().map(|(id, _)| *id).collect();
let loaded_ids: Vec<u32> = loaded_results.iter().map(|(id, _)| *id).collect();
prop_assert_eq!(
original_ids, loaded_ids,
"Query {} returns different results after persistence",
i
);
for ((_, orig_dist), (_, loaded_dist)) in original_results.iter().zip(loaded_results.iter()) {
let diff = (orig_dist - loaded_dist).abs();
prop_assert!(
diff < 1e-5,
"Distance mismatch after persistence: {} vs {}",
orig_dist, loaded_dist
);
}
}
}
#[test]
fn prop_search_returns_only_known_doc_ids(
n in 20usize..60,
seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_vectors(n, dim, seed);
let doc_ids = nonsequential_doc_ids(n, seed);
let doc_id_set: HashSet<u32> = doc_ids.iter().copied().collect();
let mut index = HNSWIndex::new(dim, 16, 16).expect("create");
for (i, v) in vectors.iter().enumerate() {
index.add(doc_ids[i], v.clone()).expect("add");
}
index.build().expect("build");
for query_idx in [0, n/4, n/2, 3*n/4, n-1].iter().filter(|&&i| i < n) {
let query = &vectors[*query_idx];
let results = index.search(query, 10.min(n), 100).expect("search");
for (id, _dist) in &results {
prop_assert!(
doc_id_set.contains(id),
"Search returned unknown ID {}: internal index leaked through API",
id
);
}
}
}
#[test]
fn prop_duplicate_doc_id_rejected(
n in 5usize..20,
seed in any::<u64>(),
) {
let dim = 8;
let vectors = random_vectors(n + 1, dim, seed);
let doc_ids = nonsequential_doc_ids(n, seed);
let mut index = HNSWIndex::new(dim, 8, 8).expect("create");
for (i, v) in vectors.iter().take(n).enumerate() {
index.add(doc_ids[i], v.clone()).expect("add");
}
let duplicate_id = doc_ids[0];
let different_vector = vectors[n].clone();
let result = index.add(duplicate_id, different_vector);
prop_assert!(
result.is_err(),
"Adding duplicate doc_id {} should fail but succeeded",
duplicate_id
);
}
#[test]
fn prop_persistence_metadata_roundtrip(
n in 10usize..40,
dim in 4usize..64,
seed in any::<u64>(),
) {
let vectors = random_vectors(n, dim, seed);
let mut original = HNSWIndex::new(dim, 8, 8).expect("create");
for (i, v) in vectors.iter().enumerate() {
original.add(i as u32, v.clone()).expect("add");
}
original.build().expect("build");
let mem = MemoryDirectory::new();
let mut writer = HNSWSegmentWriter::new(Box::new(mem.clone()), 1);
writer.write_hnsw_index(&original).expect("write");
let reader = HNSWSegmentReader::load(Box::new(mem.clone()), 1).expect("load");
let loaded = reader.load_index().expect("load_index");
let query = normalize(&vectors[0]);
let results = loaded.search(&query, 5.min(n), 50);
prop_assert!(results.is_ok(), "Search failed on loaded index");
prop_assert_eq!(
results.unwrap().len(),
5.min(n),
"Loaded index returns wrong number of results"
);
}
}
}
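// Each indexed vector, queried against the index, should return itself as
// the nearest neighbor at least 95% of the time.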
mod hnsw_self_retrieval_props {
use proptest::prelude::*;
use vicinity::hnsw::HNSWIndex;
fn random_normalized_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(seed);
(0..n)
.map(|_| {
let v: Vec<f32> = (0..dim).map(|_| rng.random::<f32>() - 0.5).collect();
vicinity::distance::normalize(&v)
})
.collect()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test]
fn self_retrieval_high_rate(
n in 50usize..200,
dim in prop::sample::select(vec![8, 16, 32, 64]),
m in prop::sample::select(vec![8, 16, 32]),
seed in any::<u64>(),
) {
let vectors = random_normalized_vectors(n, dim, seed);
let mut index = HNSWIndex::new(dim, m, 2 * m).unwrap();
for (i, v) in vectors.iter().enumerate() {
index.add_slice(i as u32, v).unwrap();
}
index.build().unwrap();
let mut self_found = 0;
for (i, v) in vectors.iter().enumerate() {
let results = index.search(v, 1, 200).unwrap();
if !results.is_empty() && results[0].0 == i as u32 {
self_found += 1;
}
}
let rate = self_found as f64 / n as f64;
prop_assert!(rate >= 0.95,
"Self-retrieval {:.1}% < 95% (n={}, dim={}, M={}, seed={})",
rate * 100.0, n, dim, m, seed);
}
}
}
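// Raising ef_search should never make average recall against brute-force
// ground truth meaningfully worse (a 0.05 tolerance absorbs search noise).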
mod hnsw_ef_monotonicity_props {
use proptest::prelude::*;
use std::collections::HashSet;
use vicinity::hnsw::HNSWIndex;
fn random_normalized_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(seed);
(0..n)
.map(|_| {
let v: Vec<f32> = (0..dim).map(|_| rng.random::<f32>() - 0.5).collect();
vicinity::distance::normalize(&v)
})
.collect()
}
fn brute_force_knn(query: &[f32], vectors: &[Vec<f32>], k: usize) -> Vec<u32> {
let mut dists: Vec<(u32, f32)> = vectors
.iter()
.enumerate()
.map(|(i, v)| (i as u32, vicinity::distance::cosine_distance(query, v)))
.collect();
dists.sort_by(|a, b| a.1.total_cmp(&b.1));
dists.into_iter().take(k).map(|(id, _)| id).collect()
}
fn recall_at_k(results: &[(u32, f32)], ground_truth: &[u32]) -> f32 {
let gt: HashSet<u32> = ground_truth.iter().copied().collect();
let found: HashSet<u32> = results.iter().map(|(id, _)| *id).collect();
gt.intersection(&found).count() as f32 / ground_truth.len().max(1) as f32
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(15))]
#[test]
fn ef_search_monotonic_recall(
n in 100usize..300,
seed in any::<u64>(),
) {
let dim = 32;
let vectors = random_normalized_vectors(n, dim, seed);
let queries = random_normalized_vectors(10, dim, seed.wrapping_add(1));
let mut index = HNSWIndex::new(dim, 16, 32).unwrap();
for (i, v) in vectors.iter().enumerate() {
index.add_slice(i as u32, v).unwrap();
}
index.build().unwrap();
let k = 10.min(n);
let ef_values = [10, 20, 50, 100, 200];
let mut prev_recall = 0.0f32;
for &ef in &ef_values {
let mut total_recall = 0.0;
for q in &queries {
let results = index.search(q, k, ef).unwrap();
let gt = brute_force_knn(q, &vectors, k);
total_recall += recall_at_k(&results, &gt);
}
let avg_recall = total_recall / queries.len() as f32;
prop_assert!(avg_recall >= prev_recall - 0.05,
"Recall decreased from {:.3} to {:.3} when ef increased to {} (seed={})",
prev_recall, avg_recall, ef, seed);
prev_recall = avg_recall;
}
}
}
}
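// Insertion order perturbs the HNSW graph, but recall against brute-force
// ground truth should not swing wildly between two build orders.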
mod hnsw_build_order_props {
use proptest::prelude::*;
use std::collections::HashSet;
use vicinity::hnsw::HNSWIndex;
fn random_normalized_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(seed);
(0..n)
.map(|_| {
let v: Vec<f32> = (0..dim).map(|_| rng.random::<f32>() - 0.5).collect();
vicinity::distance::normalize(&v)
})
.collect()
}
fn brute_force_knn(query: &[f32], vectors: &[Vec<f32>], k: usize) -> Vec<u32> {
let mut dists: Vec<(u32, f32)> = vectors
.iter()
.enumerate()
.map(|(i, v)| (i as u32, vicinity::distance::cosine_distance(query, v)))
.collect();
dists.sort_by(|a, b| a.1.total_cmp(&b.1));
dists.into_iter().take(k).map(|(id, _)| id).collect()
}
fn recall_at_k(results: &[(u32, f32)], ground_truth: &[u32]) -> f32 {
let gt: HashSet<u32> = ground_truth.iter().copied().collect();
let found: HashSet<u32> = results.iter().map(|(id, _)| *id).collect();
gt.intersection(&found).count() as f32 / ground_truth.len().max(1) as f32
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn build_order_independence(
n in 50usize..150,
shuffle_seed in any::<u64>(),
data_seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_normalized_vectors(n, dim, data_seed);
let queries = random_normalized_vectors(5, dim, data_seed.wrapping_add(999));
let mut index1 = HNSWIndex::new(dim, 16, 32).unwrap();
for (i, v) in vectors.iter().enumerate() {
index1.add_slice(i as u32, v).unwrap();
}
index1.build().unwrap();
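// Fisher-Yates shuffle driven by an inline LCG (Knuth's MMIX multiplier),
// so the permutation is reproducible from shuffle_seed alone.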
let mut order: Vec<usize> = (0..n).collect();
let mut rng_state = shuffle_seed;
for i in (1..n).rev() {
rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
let j = (rng_state >> 33) as usize % (i + 1);
order.swap(i, j);
}
let mut index2 = HNSWIndex::new(dim, 16, 32).unwrap();
for &i in &order {
index2.add_slice(i as u32, &vectors[i]).unwrap();
}
index2.build().unwrap();
let k = 10.min(n);
for q in &queries {
let gt = brute_force_knn(q, &vectors, k);
let r1 = index1.search(q, k, 200).unwrap();
let r2 = index2.search(q, k, 200).unwrap();
let recall1 = recall_at_k(&r1, &gt);
let recall2 = recall_at_k(&r2, &gt);
let diff = (recall1 - recall2).abs();
prop_assert!(diff <= 0.3,
"Build order changed recall by {:.2} ({:.2} vs {:.2}, seed={})",
diff, recall1, recall2, shuffle_seed);
}
}
}
}
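// Structural invariant: no node's degree may exceed m_max on layer 0 or m on
// the upper layers, as reported by the index's max_node_degree() accessor.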
mod hnsw_neighbor_bound_props {
use proptest::prelude::*;
use vicinity::hnsw::HNSWIndex;
fn random_normalized_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(seed);
(0..n)
.map(|_| {
let v: Vec<f32> = (0..dim).map(|_| rng.random::<f32>() - 0.5).collect();
vicinity::distance::normalize(&v)
})
.collect()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(20))]
#[test]
fn neighbor_degree_within_bound(
n in 50usize..200,
m in prop::sample::select(vec![4, 8, 16]),
seed in any::<u64>(),
) {
let m_max = 2 * m;
let vectors = random_normalized_vectors(n, m_max.max(8), seed);
let mut index = HNSWIndex::new(vectors[0].len(), m, m_max).unwrap();
for (i, v) in vectors.iter().enumerate() {
index.add_slice(i as u32, v).unwrap();
}
index.build().unwrap();
let (layer, node, degree) = index.max_node_degree();
let bound = if layer == 0 { m_max } else { m };
prop_assert!(degree <= bound,
"Node {} on layer {} has degree {} > bound {} (m={}, m_max={})",
node, layer, degree, bound, m, m_max);
}
}
}
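// With ef fixed at n, the top-k results for a smaller k must be a subset of
// the results for a larger k.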
mod hnsw_subset_props {
use proptest::prelude::*;
use std::collections::HashSet;
use vicinity::hnsw::HNSWIndex;
fn random_normalized_vectors(n: usize, dim: usize, seed: u64) -> Vec<Vec<f32>> {
use rand::prelude::*;
let mut rng = StdRng::seed_from_u64(seed);
(0..n)
.map(|_| {
let v: Vec<f32> = (0..dim).map(|_| rng.random::<f32>() - 0.5).collect();
vicinity::distance::normalize(&v)
})
.collect()
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(30))]
#[test]
fn top_k_subset_property(
n in 50usize..200,
seed in any::<u64>(),
) {
let dim = 16;
let vectors = random_normalized_vectors(n, dim, seed);
let mut index = HNSWIndex::new(dim, 16, 32).unwrap();
for (i, v) in vectors.iter().enumerate() {
index.add_slice(i as u32, v).unwrap();
}
index.build().unwrap();
let query = &vectors[0];
let ef = n;
let k_small = 5.min(n);
let k_large = 15.min(n);
let results_small = index.search(query, k_small, ef).unwrap();
let results_large = index.search(query, k_large, ef).unwrap();
let ids_small: HashSet<u32> = results_small.iter().map(|(id, _)| *id).collect();
let ids_large: HashSet<u32> = results_large.iter().map(|(id, _)| *id).collect();
let missing: Vec<u32> = ids_small.difference(&ids_large).copied().collect();
prop_assert!(missing.is_empty(),
"top-{} result(s) {:?} not in top-{} (seed={})",
k_small, missing, k_large, seed);
}
}
}