use criterion::{BenchmarkId, Criterion, Throughput, black_box, criterion_group, criterion_main};
use lattice_embed::simd::{
self, BinaryVector, Int4Vector, NormalizationHint, PreparedQuery, PreparedQueryWithMeta,
QuantizationTier, QuantizedData, QuantizedVector, SimdConfig, approximate_cosine_distance,
approximate_cosine_distance_prepared, approximate_cosine_distance_prepared_with_meta,
approximate_dot_product_prepared,
};
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
/// Vector lengths (number of `f32` components) that the per-dimension benchmarks iterate over.
const DIMENSIONS: [usize; 4] = [384, 768, 1024, 1536];
/// Builds a reproducible pseudo-random vector with `dim` components.
///
/// Component `i` is obtained by hashing the pair `(seed, i)` with a freshly
/// constructed `DefaultHasher` and rescaling the 64-bit digest from
/// `[0, u64::MAX]` to `[-1.0, 1.0]`.
fn generate_vector(dim: usize, seed: u64) -> Vec<f32> {
    let component = |index: usize| -> f32 {
        let mut state = DefaultHasher::new();
        (seed, index).hash(&mut state);
        let unit = state.finish() as f32 / u64::MAX as f32;
        unit * 2.0 - 1.0
    };

    let mut values = Vec::with_capacity(dim);
    values.extend((0..dim).map(component));
    values
}
/// Produces the same pseudo-random vector as `generate_vector(dim, seed)` and
/// runs it through `simd::normalize` in place before handing it back.
fn generate_normalized_vector(dim: usize, seed: u64) -> Vec<f32> {
    let mut normalized = generate_vector(dim, seed);
    simd::normalize(&mut normalized);
    normalized
}
/// Scalar reference implementation of cosine similarity.
///
/// Returns `0.0` when the slices have different lengths, when they are empty,
/// or when either operand has a zero Euclidean norm; otherwise returns
/// `dot(a, b) / (|a| * |b|)`.
fn cosine_similarity_scalar(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    let l2_norm = |values: &[f32]| values.iter().map(|v| v * v).sum::<f32>().sqrt();

    let dot = a.iter().zip(b).map(|(lhs, rhs)| lhs * rhs).sum::<f32>();
    let (norm_a, norm_b) = (l2_norm(a), l2_norm(b));

    // A zero-length operand has no direction; report "no similarity".
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    dot / (norm_a * norm_b)
}
/// Scalar reference dot product: the sum of element-wise products over the
/// zipped slices (iteration stops at the shorter of the two).
fn dot_product_scalar(a: &[f32], b: &[f32]) -> f32 {
    let products = a.iter().zip(b).map(|(&lhs, &rhs)| lhs * rhs);
    products.sum()
}
/// Scalar reference L2 normalization, performed in place.
///
/// Each component is multiplied by the reciprocal of the Euclidean norm. The
/// slice is left untouched when the norm is not strictly positive.
fn normalize_scalar(vector: &mut [f32]) {
    let squared_sum: f32 = vector.iter().map(|v| v * v).sum();
    let norm = squared_sum.sqrt();
    if norm > 0.0 {
        let scale = 1.0 / norm;
        for component in vector.iter_mut() {
            *component *= scale;
        }
    }
}
/// Scalar reference Euclidean distance: the square root of the summed squared
/// component differences over the zipped slices.
fn euclidean_distance_scalar(a: &[f32], b: &[f32]) -> f32 {
    let squared_distance: f32 = a
        .iter()
        .zip(b)
        .map(|(lhs, rhs)| (lhs - rhs).powi(2))
        .sum();
    squared_distance.sqrt()
}
/// Benchmarks cosine similarity: scalar reference vs. `simd::cosine_similarity`.
///
/// Prints the flags reported by `SimdConfig::detect()` once, then measures both
/// implementations on un-normalized vectors for every entry of `DIMENSIONS`.
fn bench_cosine_simd_vs_scalar(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_cosine_similarity");

    let caps = SimdConfig::detect();
    println!(
        "\n=== SIMD Capabilities: AVX2={}, FMA={}, AVX512-VNNI={}, NEON={} ===\n",
        caps.avx2_enabled, caps.fma_enabled, caps.avx512vnni_enabled, caps.neon_enabled
    );

    for &dim in &DIMENSIONS {
        let lhs = generate_vector(dim, 42);
        let rhs = generate_vector(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));

        let scalar_id = BenchmarkId::new("scalar", dim);
        group.bench_with_input(scalar_id, &dim, |bench, _| {
            bench.iter(|| black_box(cosine_similarity_scalar(black_box(&lhs), black_box(&rhs))));
        });

        let simd_id = BenchmarkId::new("simd", dim);
        group.bench_with_input(simd_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::cosine_similarity(black_box(&lhs), black_box(&rhs))));
        });
    }

    group.finish();
}
/// Benchmarks the dot product: scalar reference vs. `simd::dot_product`, on
/// normalized input vectors for every entry of `DIMENSIONS`.
fn bench_dot_product_simd_vs_scalar(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_dot_product");

    for &dim in &DIMENSIONS {
        let lhs = generate_normalized_vector(dim, 42);
        let rhs = generate_normalized_vector(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));

        let scalar_id = BenchmarkId::new("scalar", dim);
        group.bench_with_input(scalar_id, &dim, |bench, _| {
            bench.iter(|| black_box(dot_product_scalar(black_box(&lhs), black_box(&rhs))));
        });

        let simd_id = BenchmarkId::new("simd", dim);
        group.bench_with_input(simd_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::dot_product(black_box(&lhs), black_box(&rhs))));
        });
    }

    group.finish();
}
/// Benchmarks in-place normalization: scalar reference vs. `simd::normalize`.
///
/// Normalization mutates its input, so every measured call receives a fresh
/// copy of the template vector. The copy is produced in the `iter_batched`
/// setup closure and is therefore excluded from the timed region.
fn bench_normalize_simd_vs_scalar(c: &mut Criterion) {
    use criterion::BatchSize;

    let mut group = c.benchmark_group("simd_normalize");

    for dim in DIMENSIONS {
        // Both benchmarks only borrow the template to clone from it, so a
        // single shared instance is sufficient.
        let template = generate_vector(dim, 42);
        group.throughput(Throughput::Elements(dim as u64));

        group.bench_with_input(BenchmarkId::new("scalar", dim), &dim, |bench, _| {
            bench.iter_batched(
                || template.clone(),
                |mut v| {
                    normalize_scalar(black_box(&mut v));
                    v
                },
                BatchSize::SmallInput,
            );
        });

        group.bench_with_input(BenchmarkId::new("simd", dim), &dim, |bench, _| {
            bench.iter_batched(
                || template.clone(),
                |mut v| {
                    simd::normalize(black_box(&mut v));
                    v
                },
                BatchSize::SmallInput,
            );
        });
    }

    group.finish();
}
/// Benchmarks Euclidean distance: scalar reference vs. `simd::euclidean_distance`,
/// on un-normalized vectors for every entry of `DIMENSIONS`.
fn bench_euclidean_simd_vs_scalar(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_euclidean_distance");

    for &dim in &DIMENSIONS {
        let lhs = generate_vector(dim, 42);
        let rhs = generate_vector(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));

        let scalar_id = BenchmarkId::new("scalar", dim);
        group.bench_with_input(scalar_id, &dim, |bench, _| {
            bench.iter(|| black_box(euclidean_distance_scalar(black_box(&lhs), black_box(&rhs))));
        });

        let simd_id = BenchmarkId::new("simd", dim);
        group.bench_with_input(simd_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::euclidean_distance(black_box(&lhs), black_box(&rhs))));
        });
    }

    group.finish();
}
/// Benchmarks cosine similarity over batches of 384-component vector pairs:
/// a loop over the scalar reference vs. a single `simd::batch_cosine_similarity`
/// call, for batches of 10, 100 and 1000 pairs.
///
/// Inputs are routed through `black_box`, as in the single-pair benchmarks,
/// so the optimizer cannot specialize the measured code on the fixture data.
fn bench_batch_cosine(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_cosine");

    for batch_size in [10, 100, 1000] {
        let pairs: Vec<(Vec<f32>, Vec<f32>)> = (0..batch_size)
            .map(|i| {
                (
                    generate_vector(384, i as u64),
                    generate_vector(384, i as u64 + 10000),
                )
            })
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));

        group.bench_with_input(
            BenchmarkId::new("scalar_loop", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = pairs
                        .iter()
                        .map(|(a, b)| cosine_similarity_scalar(black_box(a), black_box(b)))
                        .collect();
                    black_box(results)
                });
            },
        );

        // The batch API takes borrowed slice pairs; build them once, outside
        // the timed closure.
        let pair_refs: Vec<(&[f32], &[f32])> = pairs
            .iter()
            .map(|(a, b)| (a.as_slice(), b.as_slice()))
            .collect();
        group.bench_with_input(
            BenchmarkId::new("simd_batch", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| black_box(simd::batch_cosine_similarity(black_box(&pair_refs))));
            },
        );
    }

    group.finish();
}
fn bench_throughput_384(c: &mut Criterion) {
let mut group = c.benchmark_group("simd_throughput_384");
group.throughput(Throughput::Elements(1));
let a = generate_normalized_vector(384, 42);
let b = generate_normalized_vector(384, 123);
group.bench_function("dot_product", |bench| {
bench.iter(|| black_box(simd::dot_product(black_box(&a), black_box(&b))));
});
group.bench_function("cosine_similarity", |bench| {
bench.iter(|| black_box(simd::cosine_similarity(black_box(&a), black_box(&b))));
});
let template = generate_vector(384, 42);
group.bench_function("normalize", |bench| {
bench.iter(|| {
let mut v = template.clone();
simd::normalize(black_box(&mut v));
black_box(v)
});
});
group.bench_function("euclidean_distance", |bench| {
bench.iter(|| black_box(simd::euclidean_distance(black_box(&a), black_box(&b))));
});
group.finish();
}
/// Benchmarks `QuantizedVector::from_f32` on a normalized input vector for
/// every entry of `DIMENSIONS`.
fn bench_quantization(c: &mut Criterion) {
    let mut group = c.benchmark_group("int8_quantization");

    for &dim in &DIMENSIONS {
        let source = generate_normalized_vector(dim, 42);
        group.throughput(Throughput::Elements(dim as u64));

        let id = BenchmarkId::new("quantize", dim);
        group.bench_with_input(id, &dim, |bench, _| {
            bench.iter(|| black_box(QuantizedVector::from_f32(black_box(&source))));
        });
    }

    group.finish();
}
/// Compares `simd::cosine_similarity` on `f32` vectors against
/// `QuantizedVector::cosine_similarity` on their quantized counterparts, for
/// every entry of `DIMENSIONS`. Quantization happens once, outside the timing.
fn bench_int8_vs_float32(c: &mut Criterion) {
    let mut group = c.benchmark_group("int8_vs_float32_cosine");

    for &dim in &DIMENSIONS {
        let lhs = generate_normalized_vector(dim, 42);
        let rhs = generate_normalized_vector(dim, 123);
        let lhs_q = QuantizedVector::from_f32(&lhs);
        let rhs_q = QuantizedVector::from_f32(&rhs);
        group.throughput(Throughput::Elements(dim as u64));

        let float_id = BenchmarkId::new("float32_simd", dim);
        group.bench_with_input(float_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::cosine_similarity(black_box(&lhs), black_box(&rhs))));
        });

        let int8_id = BenchmarkId::new("int8", dim);
        group.bench_with_input(int8_id, &dim, |bench, _| {
            bench.iter(|| black_box(lhs_q.cosine_similarity(black_box(&rhs_q))));
        });
    }

    group.finish();
}
/// Benchmarks batched cosine similarity on 384-component normalized pairs:
/// `simd::batch_cosine_similarity` on `f32` data vs. a loop of
/// `QuantizedVector::cosine_similarity` calls, for batches of 10, 100 and 1000.
///
/// Inputs are routed through `black_box`, as in the single-pair benchmarks,
/// so the optimizer cannot specialize the measured code on the fixture data.
fn bench_int8_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("int8_batch_cosine");

    for batch_size in [10, 100, 1000] {
        let pairs_f32: Vec<(Vec<f32>, Vec<f32>)> = (0..batch_size)
            .map(|i| {
                (
                    generate_normalized_vector(384, i as u64),
                    generate_normalized_vector(384, i as u64 + 10000),
                )
            })
            .collect();
        // Quantize once up front so only the similarity computation is timed.
        let pairs_i8: Vec<(QuantizedVector, QuantizedVector)> = pairs_f32
            .iter()
            .map(|(a, b)| (QuantizedVector::from_f32(a), QuantizedVector::from_f32(b)))
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));

        let pair_refs: Vec<(&[f32], &[f32])> = pairs_f32
            .iter()
            .map(|(a, b)| (a.as_slice(), b.as_slice()))
            .collect();
        group.bench_with_input(
            BenchmarkId::new("float32_simd", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| black_box(simd::batch_cosine_similarity(black_box(&pair_refs))));
            },
        );

        group.bench_with_input(
            BenchmarkId::new("int8_loop", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = pairs_i8
                        .iter()
                        .map(|(a, b)| black_box(a).cosine_similarity(black_box(b)))
                        .collect();
                    black_box(results)
                });
            },
        );
    }

    group.finish();
}
/// Prints an `f32` vs. int8 storage-size comparison for every entry of
/// `DIMENSIONS`, then benchmarks a linear scan of 1000 stored 384-component
/// vectors against one query in both representations.
///
/// The query and each stored vector are routed through `black_box`, as in the
/// single-pair benchmarks, so the optimizer cannot specialize the scan on the
/// fixture data.
fn bench_memory_usage(c: &mut Criterion) {
    let mut group = c.benchmark_group("memory_size");

    for dim in DIMENSIONS {
        let v = generate_normalized_vector(dim, 42);
        let v_q = QuantizedVector::from_f32(&v);
        let f32_bytes = v.len() * std::mem::size_of::<f32>();
        // NOTE(review): the extra 16 bytes presumably account for per-vector
        // quantization metadata — confirm against `QuantizedVector`'s layout.
        let i8_bytes = v_q.data.len() * std::mem::size_of::<i8>() + 16;
        println!(
            "dim={}: float32={}B, int8={}B, ratio={:.1}x",
            dim,
            f32_bytes,
            i8_bytes,
            f32_bytes as f64 / i8_bytes as f64
        );
    }

    let vectors_f32: Vec<Vec<f32>> = (0..1000)
        .map(|i| generate_normalized_vector(384, i as u64))
        .collect();
    let vectors_i8: Vec<QuantizedVector> = vectors_f32
        .iter()
        .map(|v| QuantizedVector::from_f32(v))
        .collect();
    let query = generate_normalized_vector(384, 99999);
    let query_q = QuantizedVector::from_f32(&query);

    group.throughput(Throughput::Elements(1000));

    group.bench_function("search_1000_float32", |bench| {
        bench.iter(|| {
            let results: Vec<f32> = vectors_f32
                .iter()
                .map(|v| simd::cosine_similarity(black_box(&query), black_box(v)))
                .collect();
            black_box(results)
        });
    });

    group.bench_function("search_1000_int8", |bench| {
        bench.iter(|| {
            let results: Vec<f32> = vectors_i8
                .iter()
                .map(|v| black_box(&query_q).cosine_similarity(black_box(v)))
                .collect();
            black_box(results)
        });
    });

    group.finish();
}
/// Compares an `f32` cosine distance (`1.0 - simd::cosine_similarity`) against
/// `Int4Vector::cosine_distance` on the quantized counterparts, for every
/// entry of `DIMENSIONS`. Quantization happens once, outside the timing.
fn bench_int4_cosine_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("int4_cosine_distance");

    for &dim in &DIMENSIONS {
        let lhs = generate_normalized_vector(dim, 42);
        let rhs = generate_normalized_vector(dim, 123);
        let lhs_q = Int4Vector::from_f32(&lhs);
        let rhs_q = Int4Vector::from_f32(&rhs);
        group.throughput(Throughput::Elements(dim as u64));

        let float_id = BenchmarkId::new("float32_simd", dim);
        group.bench_with_input(float_id, &dim, |bench, _| {
            bench.iter(|| {
                black_box(1.0 - simd::cosine_similarity(black_box(&lhs), black_box(&rhs)))
            });
        });

        let int4_id = BenchmarkId::new("int4", dim);
        group.bench_with_input(int4_id, &dim, |bench, _| {
            bench.iter(|| black_box(lhs_q.cosine_distance(black_box(&rhs_q))));
        });
    }

    group.finish();
}
/// Compares an `f32` cosine distance (`1.0 - simd::cosine_similarity`) against
/// `BinaryVector::cosine_distance_approx` on the binarized counterparts, for
/// every entry of `DIMENSIONS`. Binarization happens once, outside the timing.
fn bench_binary_cosine_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("binary_cosine_distance");

    for &dim in &DIMENSIONS {
        let lhs = generate_normalized_vector(dim, 42);
        let rhs = generate_normalized_vector(dim, 123);
        let lhs_bits = BinaryVector::from_f32(&lhs);
        let rhs_bits = BinaryVector::from_f32(&rhs);
        group.throughput(Throughput::Elements(dim as u64));

        let float_id = BenchmarkId::new("float32_simd", dim);
        group.bench_with_input(float_id, &dim, |bench, _| {
            bench.iter(|| {
                black_box(1.0 - simd::cosine_similarity(black_box(&lhs), black_box(&rhs)))
            });
        });

        let binary_id = BenchmarkId::new("binary", dim);
        group.bench_with_input(binary_id, &dim, |bench, _| {
            bench.iter(|| black_box(lhs_bits.cosine_distance_approx(black_box(&rhs_bits))));
        });
    }

    group.finish();
}
/// Benchmarks the dot product over batches of 384-component vector pairs:
/// a loop over the scalar reference vs. a single `simd::batch_dot_product`
/// call, for batches of 10, 100 and 1000 pairs.
///
/// Inputs are routed through `black_box`, as in the single-pair benchmarks,
/// so the optimizer cannot specialize the measured code on the fixture data.
fn bench_batch_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_batch_dot_product");

    for batch_size in [10, 100, 1000] {
        let pairs: Vec<(Vec<f32>, Vec<f32>)> = (0..batch_size)
            .map(|i| {
                (
                    generate_vector(384, i as u64),
                    generate_vector(384, i as u64 + 10000),
                )
            })
            .collect();
        group.throughput(Throughput::Elements(batch_size as u64));

        group.bench_with_input(
            BenchmarkId::new("scalar_loop", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = pairs
                        .iter()
                        .map(|(a, b)| dot_product_scalar(black_box(a), black_box(b)))
                        .collect();
                    black_box(results)
                });
            },
        );

        // The batch API takes borrowed slice pairs; build them once, outside
        // the timed closure.
        let pair_refs: Vec<(&[f32], &[f32])> = pairs
            .iter()
            .map(|(a, b)| (a.as_slice(), b.as_slice()))
            .collect();
        group.bench_with_input(
            BenchmarkId::new("simd_batch", batch_size),
            &batch_size,
            |bench, _| {
                bench.iter(|| black_box(simd::batch_dot_product(black_box(&pair_refs))));
            },
        );
    }

    group.finish();
}
fn bench_prepared_query(c: &mut Criterion) {
let mut group = c.benchmark_group("tier_prepared_query");
const COUNT: usize = 1000;
const DIM: usize = 384;
let query_f32 = generate_vector(DIM, 42);
let stored_f32: Vec<Vec<f32>> = (0..COUNT)
.map(|i| generate_vector(DIM, i as u64 + 1))
.collect();
let stored_int8: Vec<QuantizedVector> = stored_f32
.iter()
.map(|v| QuantizedVector::from_f32(v))
.collect();
let stored_int8_data: Vec<QuantizedData> = stored_int8
.iter()
.map(|q| QuantizedData::Int8(q.clone()))
.collect();
let stored_int4: Vec<Int4Vector> = stored_f32.iter().map(|v| Int4Vector::from_f32(v)).collect();
let stored_int4_data: Vec<QuantizedData> = stored_int4
.iter()
.map(|q| QuantizedData::Int4(q.clone()))
.collect();
let stored_binary: Vec<BinaryVector> = stored_f32
.iter()
.map(|v| BinaryVector::from_f32(v))
.collect();
let stored_binary_data: Vec<QuantizedData> = stored_binary
.iter()
.map(|q| QuantizedData::Binary(q.clone()))
.collect();
group.throughput(Throughput::Elements(COUNT as u64));
group.bench_function("int8_query_per_call/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_int8_data
.iter()
.map(|s| approximate_cosine_distance(black_box(&query_f32), s))
.collect();
black_box(results)
});
});
let pre_q_int8 = PreparedQuery::from_f32(&query_f32, QuantizationTier::Int8);
group.bench_function("int8_query_once/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_int8_data
.iter()
.map(|s| approximate_cosine_distance_prepared(black_box(&pre_q_int8), s))
.collect();
black_box(results)
});
});
group.bench_function("int4_query_per_call/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_int4_data
.iter()
.map(|s| approximate_cosine_distance(black_box(&query_f32), s))
.collect();
black_box(results)
});
});
let pre_q_int4 = PreparedQuery::from_f32(&query_f32, QuantizationTier::Int4);
group.bench_function("int4_query_once/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_int4_data
.iter()
.map(|s| approximate_cosine_distance_prepared(black_box(&pre_q_int4), s))
.collect();
black_box(results)
});
});
group.bench_function("binary_query_per_call/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_binary_data
.iter()
.map(|s| approximate_cosine_distance(black_box(&query_f32), s))
.collect();
black_box(results)
});
});
let pre_q_binary = PreparedQuery::from_f32(&query_f32, QuantizationTier::Binary);
group.bench_function("binary_query_once/1000", |bench| {
bench.iter(|| {
let results: Vec<f32> = stored_binary_data
.iter()
.map(|s| approximate_cosine_distance_prepared(black_box(&pre_q_binary), s))
.collect();
black_box(results)
});
});
group.finish();
}
/// Compares `simd::dot_product_i8` (taking `QuantizedVector`s) against
/// `simd::dot_product_i8_raw` (taking their `data` fields directly).
///
/// The size list includes 127, 128 and 129 in addition to the larger sizes.
fn bench_dot_product_i8_raw(c: &mut Criterion) {
    const SIZES: [usize; 6] = [127, 128, 129, 384, 768, 1024];

    let mut group = c.benchmark_group("int8_raw_dot_product");

    for dim in SIZES {
        let lhs_q = QuantizedVector::from_f32(&generate_vector(dim, 42));
        let rhs_q = QuantizedVector::from_f32(&generate_vector(dim, 123));
        group.throughput(Throughput::Elements(dim as u64));

        let wrapped_id = BenchmarkId::new("dot_product_i8", dim);
        group.bench_with_input(wrapped_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::dot_product_i8(black_box(&lhs_q), black_box(&rhs_q))));
        });

        let raw_id = BenchmarkId::new("dot_product_i8_raw", dim);
        group.bench_with_input(raw_id, &dim, |bench, _| {
            bench.iter(|| {
                let lhs_data = black_box(&lhs_q.data);
                let rhs_data = black_box(&rhs_q.data);
                black_box(simd::dot_product_i8_raw(lhs_data, rhs_data))
            });
        });
    }

    group.finish();
}
/// Compares `simd::approximate_dot_product` with a raw `f32` query ("per_call")
/// against `approximate_dot_product_prepared` with an Int8 `PreparedQuery`
/// built once ("prepared"), each scoring a single Int8-stored vector.
fn bench_int8_prepared_dot_product(c: &mut Criterion) {
    const SIZES: [usize; 6] = [127, 128, 129, 384, 768, 1024];

    let mut group = c.benchmark_group("int8_prepared_dot_product");

    for dim in SIZES {
        let query = generate_vector(dim, 42);
        let stored = QuantizedData::Int8(QuantizedVector::from_f32(&generate_vector(dim, 123)));
        group.throughput(Throughput::Elements(dim as u64));

        let per_call_id = BenchmarkId::new("per_call", dim);
        group.bench_with_input(per_call_id, &dim, |bench, _| {
            bench.iter(|| {
                let q = black_box(&query);
                let s = black_box(&stored);
                black_box(simd::approximate_dot_product(q, s))
            });
        });

        // Preparation is done here, outside the timed closure.
        let prepared_query = PreparedQuery::from_f32(&query, QuantizationTier::Int8);
        let prepared_id = BenchmarkId::new("prepared", dim);
        group.bench_with_input(prepared_id, &dim, |bench, _| {
            bench.iter(|| {
                let q = black_box(&prepared_query);
                let s = black_box(&stored);
                black_box(approximate_dot_product_prepared(q, s))
            });
        });
    }

    group.finish();
}
/// On already-normalized inputs, compares `simd::cosine_similarity`
/// ("cosine_full") against a plain `simd::dot_product` ("dot_product").
fn bench_normalized_cosine_fast_path(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [384, 768, 1024];

    let mut group = c.benchmark_group("simd_normalized_cosine_fast_path");

    for &dim in &BENCH_DIMS {
        let lhs = generate_normalized_vector(dim, 42);
        let rhs = generate_normalized_vector(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));

        let full_id = BenchmarkId::new("cosine_full", dim);
        group.bench_with_input(full_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::cosine_similarity(black_box(&lhs), black_box(&rhs))));
        });

        let dot_id = BenchmarkId::new("dot_product", dim);
        group.bench_with_input(dot_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::dot_product(black_box(&lhs), black_box(&rhs))));
        });
    }

    group.finish();
}
/// Compares `simd::euclidean_distance` ("euclidean_full") against
/// `simd::squared_euclidean_distance` ("squared_euclidean") on the same
/// un-normalized inputs.
fn bench_squared_euclidean_fast_path(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [384, 768, 1024];

    let mut group = c.benchmark_group("simd_squared_euclidean_fast_path");

    for &dim in &BENCH_DIMS {
        let lhs = generate_vector(dim, 42);
        let rhs = generate_vector(dim, 123);
        group.throughput(Throughput::Elements(dim as u64));

        let full_id = BenchmarkId::new("euclidean_full", dim);
        group.bench_with_input(full_id, &dim, |bench, _| {
            bench.iter(|| black_box(simd::euclidean_distance(black_box(&lhs), black_box(&rhs))));
        });

        let squared_id = BenchmarkId::new("squared_euclidean", dim);
        group.bench_with_input(squared_id, &dim, |bench, _| {
            bench.iter(|| {
                let (x, y) = (black_box(&lhs), black_box(&rhs));
                black_box(simd::squared_euclidean_distance(x, y))
            });
        });
    }

    group.finish();
}
/// Scores one normalized query against `STORED_COUNT` normalized vectors
/// stored as `QuantizedData::Full`, in three ways:
/// * "prepared_full_cosine": `approximate_cosine_distance_prepared` with a
///   `PreparedQuery` for the Full tier;
/// * "dot_product_loop": a direct `simd::dot_product` per stored vector;
/// * "prepared_meta_unit": `approximate_cosine_distance_prepared_with_meta`
///   with `NormalizationHint::Unit` on both the query and the call.
fn bench_prepared_query_normalized_cosine(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [384, 768, 1024];
    const STORED_COUNT: usize = 1000;

    let mut group = c.benchmark_group("simd_prepared_query_normalized_cosine");

    for dim in BENCH_DIMS {
        let query_unit = generate_normalized_vector(dim, 999);
        let stored_full: Vec<QuantizedData> = (0..STORED_COUNT)
            .map(|i| QuantizedData::Full(generate_normalized_vector(dim, i as u64)))
            .collect();
        group.throughput(Throughput::Elements(STORED_COUNT as u64));

        let prepared_full = PreparedQuery::from_f32(&query_unit, QuantizationTier::Full);
        group.bench_with_input(
            BenchmarkId::new("prepared_full_cosine", dim),
            &dim,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = stored_full
                        .iter()
                        .map(|s| approximate_cosine_distance_prepared(black_box(&prepared_full), s))
                        .collect();
                    black_box(results)
                });
            },
        );

        // The closure only borrows the query, so no separate copy is needed.
        group.bench_with_input(
            BenchmarkId::new("dot_product_loop", dim),
            &dim,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = stored_full
                        .iter()
                        .map(|s| match s {
                            QuantizedData::Full(v) => {
                                simd::dot_product(black_box(&query_unit), black_box(v))
                            }
                            // `stored_full` is built exclusively from `Full`
                            // entries above; other variants do not occur here.
                            _ => 0.0,
                        })
                        .collect();
                    black_box(results)
                });
            },
        );

        let meta = PreparedQueryWithMeta::from_f32(
            &query_unit,
            QuantizationTier::Full,
            NormalizationHint::Unit,
        );
        group.bench_with_input(
            BenchmarkId::new("prepared_meta_unit", dim),
            &dim,
            |bench, _| {
                bench.iter(|| {
                    let results: Vec<f32> = stored_full
                        .iter()
                        .map(|s| {
                            approximate_cosine_distance_prepared_with_meta(
                                black_box(&meta),
                                s,
                                NormalizationHint::Unit,
                            )
                        })
                        .collect();
                    black_box(results)
                });
            },
        );
    }

    group.finish();
}
/// Scores one query against the first `count` of 256 candidate vectors, either
/// with one `simd::dot_product` call per candidate ("pair_loop") or with a
/// single `simd::batch_dot_product` call ("simd_batch").
fn bench_query_batch_dot_product(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [128, 384, 768];
    const BATCH_SIZES: [usize; 4] = [4, 16, 64, 256];

    let mut group = c.benchmark_group("simd_query_batch_dot_product");

    for dim in BENCH_DIMS {
        let query = generate_normalized_vector(dim, 42);
        let pool: Vec<Vec<f32>> = (0..256)
            .map(|i| generate_vector(dim, i as u64 + 1))
            .collect();

        for count in BATCH_SIZES {
            let subset = &pool[..count];
            let label = format!("{dim}d_{count}c");
            group.throughput(Throughput::Elements(count as u64));

            group.bench_with_input(BenchmarkId::new("pair_loop", &label), &count, |bench, _| {
                bench.iter(|| {
                    let scores: Vec<f32> = subset
                        .iter()
                        .map(|cand| simd::dot_product(black_box(&query), black_box(cand)))
                        .collect();
                    black_box(scores)
                });
            });

            // Pair the same query slice with every candidate for the batch API.
            let pair_refs: Vec<(&[f32], &[f32])> = subset
                .iter()
                .map(|cand| (query.as_slice(), cand.as_slice()))
                .collect();
            group.bench_with_input(BenchmarkId::new("simd_batch", &label), &count, |bench, _| {
                bench.iter(|| black_box(simd::batch_dot_product(black_box(&pair_refs))));
            });
        }
    }

    group.finish();
}
/// Scores one normalized query against the first `count` of 1000 normalized
/// candidates, in three ways: a `simd::dot_product` loop ("pair_loop_dot"), a
/// `simd::cosine_similarity` loop ("pair_loop_cosine"), and a single
/// `simd::batch_cosine_similarity` call ("simd_batch").
fn bench_batch_cosine_normalized_query(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [384, 768, 1024];
    const BATCH_SIZES: [usize; 5] = [4, 16, 64, 256, 1000];

    let mut group = c.benchmark_group("simd_batch_cosine_normalized_query");

    for dim in BENCH_DIMS {
        let query = generate_normalized_vector(dim, 42);
        let pool: Vec<Vec<f32>> = (0..1000)
            .map(|i| generate_normalized_vector(dim, i as u64 + 1))
            .collect();

        for count in BATCH_SIZES {
            let subset = &pool[..count];
            let label = format!("{dim}d_{count}c");
            group.throughput(Throughput::Elements(count as u64));

            group.bench_with_input(
                BenchmarkId::new("pair_loop_dot", &label),
                &count,
                |bench, _| {
                    bench.iter(|| {
                        let scores: Vec<f32> = subset
                            .iter()
                            .map(|cand| simd::dot_product(black_box(&query), black_box(cand)))
                            .collect();
                        black_box(scores)
                    });
                },
            );

            group.bench_with_input(
                BenchmarkId::new("pair_loop_cosine", &label),
                &count,
                |bench, _| {
                    bench.iter(|| {
                        let scores: Vec<f32> = subset
                            .iter()
                            .map(|cand| simd::cosine_similarity(black_box(&query), black_box(cand)))
                            .collect();
                        black_box(scores)
                    });
                },
            );

            // Pair the same query slice with every candidate for the batch API.
            let pair_refs: Vec<(&[f32], &[f32])> = subset
                .iter()
                .map(|cand| (query.as_slice(), cand.as_slice()))
                .collect();
            group.bench_with_input(BenchmarkId::new("simd_batch", &label), &count, |bench, _| {
                bench.iter(|| black_box(simd::batch_cosine_similarity(black_box(&pair_refs))));
            });
        }
    }

    group.finish();
}
/// Scores one un-normalized query against the first `count` of 1000
/// un-normalized candidates, either with one `simd::cosine_similarity` call
/// per candidate ("pair_loop") or with a single
/// `simd::batch_cosine_similarity` call ("simd_batch").
fn bench_batch_cosine_non_normalized_query(c: &mut Criterion) {
    const BENCH_DIMS: [usize; 3] = [384, 768, 1024];
    const BATCH_SIZES: [usize; 5] = [4, 16, 64, 256, 1000];

    let mut group = c.benchmark_group("simd_batch_cosine_non_normalized_query");

    for dim in BENCH_DIMS {
        let query = generate_vector(dim, 42);
        let pool: Vec<Vec<f32>> = (0..1000)
            .map(|i| generate_vector(dim, i as u64 + 1))
            .collect();

        for count in BATCH_SIZES {
            let subset = &pool[..count];
            let label = format!("{dim}d_{count}c");
            group.throughput(Throughput::Elements(count as u64));

            group.bench_with_input(BenchmarkId::new("pair_loop", &label), &count, |bench, _| {
                bench.iter(|| {
                    let scores: Vec<f32> = subset
                        .iter()
                        .map(|cand| simd::cosine_similarity(black_box(&query), black_box(cand)))
                        .collect();
                    black_box(scores)
                });
            });

            // Pair the same query slice with every candidate for the batch API.
            let pair_refs: Vec<(&[f32], &[f32])> = subset
                .iter()
                .map(|cand| (query.as_slice(), cand.as_slice()))
                .collect();
            group.bench_with_input(BenchmarkId::new("simd_batch", &label), &count, |bench, _| {
                bench.iter(|| black_box(simd::batch_cosine_similarity(black_box(&pair_refs))));
            });
        }
    }

    group.finish();
}
// Collects every benchmark function defined in this file into the
// `simd_benches` group; the functions are listed in the order they are defined.
criterion_group!(
simd_benches,
bench_cosine_simd_vs_scalar,
bench_dot_product_simd_vs_scalar,
bench_normalize_simd_vs_scalar,
bench_euclidean_simd_vs_scalar,
bench_batch_cosine,
bench_throughput_384,
bench_quantization,
bench_int8_vs_float32,
bench_int8_batch,
bench_memory_usage,
bench_int4_cosine_distance,
bench_binary_cosine_distance,
bench_batch_dot_product,
bench_prepared_query,
bench_dot_product_i8_raw,
bench_int8_prepared_dot_product,
bench_normalized_cosine_fast_path,
bench_squared_euclidean_fast_path,
bench_prepared_query_normalized_cosine,
bench_query_batch_dot_product,
bench_batch_cosine_normalized_query,
bench_batch_cosine_non_normalized_query,
);
// Generates the benchmark binary's `main`, which runs the `simd_benches` group.
criterion_main!(simd_benches);