#![allow(clippy::unwrap_used)]
use iqdb_distance::compute;
use iqdb_quantize::{BinaryQuantizer, ProductQuantizer, Quantizer, ScalarQuantizer};
use iqdb_types::DistanceMetric;
const DIM: usize = 128;
const N_CLUSTERS: usize = 8;
const PER_CLUSTER: usize = 20;
const K: usize = 10;
const SEED: u64 = 0xA1B2_C3D4_E5F6_0789;
const SQ8_MIN_OVERLAP: f32 = 0.9;
const BQ_MIN_CLUSTER_PURITY: f32 = 0.7;
const PQ_MIN_OVERLAP: f32 = 0.6;
const PQ_M: usize = 8;
const PQ_K: usize = 32;
const PQ_SEED: u64 = 0x0F0E_0D0C_0B0A_0908;
struct Lcg(u64);
impl Lcg {
fn new(seed: u64) -> Self {
Self(seed)
}
fn next_u64(&mut self) -> u64 {
self.0 = self
.0
.wrapping_mul(6_364_136_223_846_793_005)
.wrapping_add(1_442_695_040_888_963_407);
self.0
}
fn next_unit(&mut self) -> f32 {
let bits = (self.next_u64() >> 40) as u32;
(bits as f32) / ((1u32 << 24) as f32)
}
fn next_normal(&mut self) -> f32 {
let u1 = self.next_unit().max(1e-9);
let u2 = self.next_unit();
let r = (-2.0_f32 * u1.ln()).sqrt();
let theta = 2.0_f32 * std::f32::consts::PI * u2;
r * theta.cos()
}
}
fn build_corpus() -> (Vec<Vec<f32>>, Vec<f32>) {
let mut rng = Lcg::new(SEED);
let mut centres: Vec<Vec<f32>> = Vec::with_capacity(N_CLUSTERS);
for _ in 0..N_CLUSTERS {
let mut c = Vec::with_capacity(DIM);
for _ in 0..DIM {
let sign = if rng.next_u64() >> 63 == 0 { -1.0 } else { 1.0 };
c.push(sign);
}
centres.push(c);
}
let mut corpus = Vec::with_capacity(N_CLUSTERS * PER_CLUSTER);
for c in ¢res {
for _ in 0..PER_CLUSTER {
let v: Vec<f32> = c.iter().map(|x| x + rng.next_normal()).collect();
corpus.push(v);
}
}
let query: Vec<f32> = centres[0]
.iter()
.map(|x| x + 0.3 * rng.next_normal())
.collect();
(corpus, query)
}
fn top_k_f32(query: &[f32], corpus: &[Vec<f32>]) -> Vec<usize> {
let mut distances: Vec<(usize, f32)> = corpus
.iter()
.enumerate()
.map(|(i, v)| (i, compute(DistanceMetric::Cosine, query, v).unwrap()))
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
distances.into_iter().take(K).map(|(i, _)| i).collect()
}
fn top_k_f32_euclidean(query: &[f32], corpus: &[Vec<f32>]) -> Vec<usize> {
let mut distances: Vec<(usize, f32)> = corpus
.iter()
.enumerate()
.map(|(i, v)| (i, compute(DistanceMetric::Euclidean, query, v).unwrap()))
.collect();
distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
distances.into_iter().take(K).map(|(i, _)| i).collect()
}
fn overlap(a: &[usize], b: &[usize]) -> f32 {
let mut hits = 0;
for x in a {
if b.contains(x) {
hits += 1;
}
}
hits as f32 / a.len() as f32
}
#[test]
fn sq8_top_k_overlap_with_f32_baseline_meets_threshold() {
let (corpus, query) = build_corpus();
let refs: Vec<&[f32]> = corpus.iter().map(Vec::as_slice).collect();
let mut sq = ScalarQuantizer::new();
sq.train(&refs).unwrap();
let codes: Vec<_> = corpus.iter().map(|v| sq.quantize(v).unwrap()).collect();
let mut quantized: Vec<(usize, f32)> = codes
.iter()
.enumerate()
.map(|(i, c)| (i, sq.distance(&query, c, DistanceMetric::Cosine).unwrap()))
.collect();
quantized.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let sq8_top: Vec<usize> = quantized.into_iter().take(K).map(|(i, _)| i).collect();
let baseline = top_k_f32(&query, &corpus);
let r = overlap(&sq8_top, &baseline);
assert!(
r >= SQ8_MIN_OVERLAP,
"SQ8 top-{K} overlap {r:.3} below threshold {SQ8_MIN_OVERLAP:.3} \
(f32 baseline top-{K} clusters {baseline_clusters:?}, \
sq8 top-{K} clusters {sq8_clusters:?})",
baseline_clusters = baseline.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
sq8_clusters = sq8_top.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
);
}
fn cluster_of(i: usize) -> usize {
i / PER_CLUSTER
}
#[test]
fn bq_top_k_preserves_query_cluster() {
let (corpus, query) = build_corpus();
let refs: Vec<&[f32]> = corpus.iter().map(Vec::as_slice).collect();
let mut bq = BinaryQuantizer::new();
bq.train(&refs).unwrap();
let codes: Vec<_> = corpus.iter().map(|v| bq.quantize(v).unwrap()).collect();
let mut quantized: Vec<(usize, f32)> = codes
.iter()
.enumerate()
.map(|(i, c)| (i, bq.distance(&query, c, DistanceMetric::Hamming).unwrap()))
.collect();
quantized.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let bq_top: Vec<usize> = quantized.into_iter().take(K).map(|(i, _)| i).collect();
let query_cluster = 0;
let hits = bq_top
.iter()
.filter(|&&i| cluster_of(i) == query_cluster)
.count();
let purity = hits as f32 / K as f32;
let baseline = top_k_f32(&query, &corpus);
assert!(
purity >= BQ_MIN_CLUSTER_PURITY,
"BQ top-{K} cluster purity {purity:.3} below threshold {BQ_MIN_CLUSTER_PURITY:.3} \
(bq top clusters {bq_clusters:?}, f32 baseline clusters {baseline_clusters:?})",
bq_clusters = bq_top.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
baseline_clusters = baseline.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
);
}
#[test]
fn pq_top_k_overlap_with_f32_baseline_meets_threshold() {
let (corpus, query) = build_corpus();
let refs: Vec<&[f32]> = corpus.iter().map(Vec::as_slice).collect();
let mut pq = ProductQuantizer::with_config(PQ_M, PQ_K, PQ_SEED);
pq.train(&refs).unwrap();
let codes: Vec<_> = corpus.iter().map(|v| pq.quantize(v).unwrap()).collect();
let mut quantized: Vec<(usize, f32)> = codes
.iter()
.enumerate()
.map(|(i, c)| {
(
i,
pq.distance(&query, c, DistanceMetric::Euclidean).unwrap(),
)
})
.collect();
quantized.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let pq_top: Vec<usize> = quantized.into_iter().take(K).map(|(i, _)| i).collect();
let baseline = top_k_f32_euclidean(&query, &corpus);
let r = overlap(&pq_top, &baseline);
assert!(
r >= PQ_MIN_OVERLAP,
"PQ top-{K} overlap {r:.3} below threshold {PQ_MIN_OVERLAP:.3} \
(f32 baseline top-{K} clusters {baseline_clusters:?}, \
pq top-{K} clusters {pq_clusters:?})",
baseline_clusters = baseline.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
pq_clusters = pq_top.iter().map(|&i| cluster_of(i)).collect::<Vec<_>>(),
);
}