use nodedb_vector::quantize::pq::PqCodec;
/// Builds a 200-vector, 4-dimensional dataset that is hostile to naive
/// k-means seeding: 190 near-duplicate vectors packed tightly around the
/// origin, followed by 10 widely spaced outliers far from it.
fn clustered_with_duplicates() -> Vec<Vec<f32>> {
    // Near-duplicates: tiny perturbations of the origin, index-dependent.
    let duplicates = (0..190).map(|i| {
        let eps = (i as f32) * 1e-5;
        vec![eps, -eps, eps * 0.5, -eps * 0.5]
    });
    // Outliers: one vector every 10 units starting at 100.
    let outliers = (0..10).map(|j| {
        let x = 100.0 + (j as f32) * 10.0;
        vec![x, -x, x * 0.5, -x * 0.5]
    });
    duplicates.chain(outliers).collect()
}
/// Encodes `vectors` with `codec` and reports the smallest number of
/// distinct centroid indices used across any single subspace. A collapsed
/// codebook shows up as a very small minimum here.
///
/// Returns `usize::MAX` if the codec has zero subspaces (vacuous minimum),
/// matching the sentinel-seeded scan this replaces.
fn unique_centroid_count(codec: &PqCodec, vectors: &[Vec<f32>]) -> usize {
    let refs: Vec<&[f32]> = vectors.iter().map(Vec::as_slice).collect();
    let codes = codec.encode_batch(&refs);
    let m = codec.m;
    // Codes are laid out row-major: entry for (row, sub) lives at row * m + sub.
    (0..m)
        .map(|sub| {
            (0..vectors.len())
                .map(|row| codes[row * m + sub])
                .collect::<std::collections::HashSet<_>>()
                .len()
        })
        .min()
        .unwrap_or(usize::MAX)
}
#[test]
fn pq_kmeans_produces_diverse_centroids_on_duplicate_heavy_data() {
    // Train a PQ codebook (dim=4, m=2 subspaces, k=16 centroids, 20 iters)
    // on data that is 95% near-duplicates; a collapsed k-means would assign
    // almost everything to one centroid per subspace.
    let vecs = clustered_with_duplicates();
    let refs: Vec<&[f32]> = vecs.iter().map(Vec::as_slice).collect();
    let codec = PqCodec::train(&refs, 4, 2, 16, 20);

    let unique = unique_centroid_count(&codec, &vecs);
    assert!(
        unique >= 4,
        "k-means collapsed to {unique} unique centroids per subspace on \
         duplicate-heavy input; a correct k-means++ should pick at least \
         4 distinct cluster representatives for k=16"
    );
}
#[test]
fn pq_distance_table_separates_duplicates_from_outliers() {
    let vecs = clustered_with_duplicates();
    let refs: Vec<&[f32]> = vecs.iter().map(Vec::as_slice).collect();
    let codec = PqCodec::train(&refs, 4, 2, 16, 20);

    // Query at the origin: effectively identical to the duplicate cluster,
    // very far from every outlier.
    let query = [0.0f32, 0.0, 0.0, 0.0];
    let table = codec.build_distance_table(&query);

    // vecs[0] is the origin itself; vecs[195] sits in the outlier region
    // (indices 190..=199).
    let dup_code = codec.encode(&vecs[0]);
    let outlier_code = codec.encode(&vecs[195]);
    let dup_dist = codec.asymmetric_distance(&table, &dup_code);
    let outlier_dist = codec.asymmetric_distance(&table, &outlier_code);

    assert!(
        outlier_dist > dup_dist * 10.0,
        "PQ failed to distinguish duplicate (d={dup_dist}) from outlier \
         (d={outlier_dist}) — codebook collapsed and the two codes encode \
         to near-identical table entries"
    );
}
#[cfg(feature = "ivf")]
#[test]
fn ivf_pq_training_does_not_collapse_on_duplicate_heavy_data() {
    use nodedb_vector::DistanceMetric;
    use nodedb_vector::{IvfPqIndex, IvfPqParams};

    let vecs = clustered_with_duplicates();
    let refs: Vec<&[f32]> = vecs.iter().map(Vec::as_slice).collect();

    // Small IVF-PQ configuration: 8 coarse cells, 2 PQ subspaces with 16
    // centroids each, probing half the cells at query time.
    let params = IvfPqParams {
        n_cells: 8,
        pq_m: 2,
        pq_k: 16,
        nprobe: 4,
        metric: DistanceMetric::L2,
    };
    let mut index = IvfPqIndex::new(4, params);

    index.train(&refs);
    for v in &vecs {
        index.add(v);
    }

    // A query at the origin must land in the duplicate cluster (ids 0..190),
    // never among the 10 outliers (ids 190..200).
    let hits = index.search(&[0.0, 0.0, 0.0, 0.0], 5);
    assert!(!hits.is_empty(), "IVF-PQ returned no results");
    for hit in &hits {
        assert!(
            hit.id < 190,
            "IVF-PQ k-means collapse: query at origin returned outlier id={} \
             instead of a near-duplicate cluster member",
            hit.id
        );
    }
}