use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use std::hint::black_box;
use edgevec::quantization::simd::portable::hamming_distance_slice as portable_hamming;
use rand::{Rng, SeedableRng};
use rand_chacha::ChaCha8Rng;
#[cfg(target_arch = "aarch64")]
use edgevec::simd::neon::{
dot_product as neon_dot_product, dot_product_portable,
euclidean_distance as neon_euclidean, euclidean_distance_portable,
hamming_distance_slice as neon_hamming,
};
/// Deterministically build a pair of random byte vectors of `size` elements.
///
/// A fixed-seed ChaCha8 generator is used so every benchmark run sees the
/// exact same input data, keeping runs comparable across machines.
fn generate_byte_vectors(size: usize, seed: u64) -> (Vec<u8>, Vec<u8>) {
    let mut rng = ChaCha8Rng::seed_from_u64(seed);
    // Fill `a` completely before `b` so the RNG draw order matches callers'
    // expectations (same seed => same pair of vectors).
    let mut a = Vec::with_capacity(size);
    for _ in 0..size {
        a.push(rng.gen());
    }
    let mut b = Vec::with_capacity(size);
    for _ in 0..size {
        b.push(rng.gen());
    }
    (a, b)
}
/// Deterministically build a pair of f32 vectors with components drawn
/// uniformly from [-1.0, 1.0), seeded for reproducible benchmark inputs.
fn generate_f32_vectors(size: usize, seed: u64) -> (Vec<f32>, Vec<f32>) {
    let mut rng = ChaCha8Rng::seed_from_u64(seed);
    // FnMut closure shared by both fills; `a` is drawn first, then `b`,
    // preserving the original RNG consumption order.
    let mut draw = |n: usize| -> Vec<f32> { (0..n).map(|_| rng.gen_range(-1.0..1.0)).collect() };
    let a = draw(size);
    let b = draw(size);
    (a, b)
}
#[cfg(not(target_arch = "aarch64"))]
/// Scalar reference dot product used as the baseline on non-aarch64 targets.
///
/// Accumulates left-to-right in f32, matching the summation order of the
/// NEON-side portable baseline as closely as a plain loop allows.
fn portable_dot_product(a: &[f32], b: &[f32]) -> f32 {
    let mut acc = 0.0f32;
    for (x, y) in a.iter().zip(b.iter()) {
        acc += x * y;
    }
    acc
}
#[cfg(not(target_arch = "aarch64"))]
/// Scalar reference Euclidean distance used as the baseline on non-aarch64
/// targets: sqrt of the left-to-right sum of squared component differences.
fn portable_euclidean(a: &[f32], b: &[f32]) -> f32 {
    let mut sum_sq = 0.0f32;
    for (x, y) in a.iter().zip(b.iter()) {
        let d = x - y;
        sum_sq += d * d;
    }
    sum_sq.sqrt()
}
/// Benchmark portable vs NEON Hamming distance over power-of-two sizes.
fn bench_neon_vs_portable(c: &mut Criterion) {
    let mut group = c.benchmark_group("hamming_neon_vs_portable");
    for size in [64usize, 256, 1024, 4096] {
        let (a, b) = generate_byte_vectors(size, 42 + size as u64);
        // Both operands are read once per call, hence 2x size in bytes.
        group.throughput(Throughput::Bytes((size * 2) as u64));
        group.bench_with_input(BenchmarkId::new("portable", size), &size, |bencher, _| {
            bencher.iter(|| portable_hamming(black_box(&a), black_box(&b)));
        });
        // NEON variant only exists on aarch64 builds.
        #[cfg(target_arch = "aarch64")]
        group.bench_with_input(BenchmarkId::new("neon", size), &size, |bencher, _| {
            bencher.iter(|| neon_hamming(black_box(&a), black_box(&b)));
        });
    }
    group.finish();
}
/// Benchmark Hamming distance at common binary-embedding byte sizes
/// (96/192/384), which are not powers of two.
fn bench_neon_embedding_sizes(c: &mut Criterion) {
    let mut group = c.benchmark_group("hamming_embedding_sizes");
    for size in [96usize, 192, 384] {
        let (a, b) = generate_byte_vectors(size, 100 + size as u64);
        // Both operands are read once per call, hence 2x size in bytes.
        group.throughput(Throughput::Bytes((size * 2) as u64));
        group.bench_with_input(BenchmarkId::new("portable", size), &size, |bencher, _| {
            bencher.iter(|| portable_hamming(black_box(&a), black_box(&b)));
        });
        // NEON variant only exists on aarch64 builds.
        #[cfg(target_arch = "aarch64")]
        group.bench_with_input(BenchmarkId::new("neon", size), &size, |bencher, _| {
            bencher.iter(|| neon_hamming(black_box(&a), black_box(&b)));
        });
    }
    group.finish();
}
/// Batched Hamming benchmark: one 96-byte query scored against 1000 stored
/// vectors, with throughput counted in vectors (elements) per second.
fn bench_neon_batch(c: &mut Criterion) {
    let mut group = c.benchmark_group("hamming_batch");
    const BATCH_SIZE: usize = 1000;
    const VECTOR_SIZE: usize = 96;
    // Each stored vector gets its own seed so the batch is varied yet
    // reproducible run to run.
    let mut vectors: Vec<Vec<u8>> = Vec::with_capacity(BATCH_SIZE);
    for i in 0..BATCH_SIZE {
        let (v, _) = generate_byte_vectors(VECTOR_SIZE, i as u64);
        vectors.push(v);
    }
    let (query, _) = generate_byte_vectors(VECTOR_SIZE, 9999);
    group.throughput(Throughput::Elements(BATCH_SIZE as u64));
    group.bench_function("portable_batch_1000", |bencher| {
        bencher.iter(|| {
            for v in &vectors {
                black_box(portable_hamming(black_box(&query), black_box(v)));
            }
        });
    });
    // NEON variant only exists on aarch64 builds.
    #[cfg(target_arch = "aarch64")]
    group.bench_function("neon_batch_1000", |bencher| {
        bencher.iter(|| {
            for v in &vectors {
                black_box(neon_hamming(black_box(&query), black_box(v)));
            }
        });
    });
    group.finish();
}
/// Benchmark Hamming distance at sizes that are not SIMD-lane multiples,
/// exercising the scalar tail/remainder path of the NEON kernel.
fn bench_neon_tail_handling(c: &mut Criterion) {
    let mut group = c.benchmark_group("hamming_tail_handling");
    for size in [15usize, 17, 31, 33, 100] {
        let (a, b) = generate_byte_vectors(size, 200 + size as u64);
        group.bench_with_input(BenchmarkId::new("portable", size), &size, |bencher, _| {
            bencher.iter(|| portable_hamming(black_box(&a), black_box(&b)));
        });
        // NEON variant only exists on aarch64 builds.
        #[cfg(target_arch = "aarch64")]
        group.bench_with_input(BenchmarkId::new("neon", size), &size, |bencher, _| {
            bencher.iter(|| neon_hamming(black_box(&a), black_box(&b)));
        });
    }
    group.finish();
}
/// Benchmark dot product at common embedding dimensionalities. On aarch64
/// the crate's portable scalar kernel is compared against NEON; elsewhere
/// only the local scalar reference runs.
fn bench_dot_product(c: &mut Criterion) {
    let mut group = c.benchmark_group("dot_product_neon_vs_portable");
    for dim in [128usize, 768, 1536] {
        let (a, b) = generate_f32_vectors(dim, 300 + dim as u64);
        group.throughput(Throughput::Elements(dim as u64));
        #[cfg(not(target_arch = "aarch64"))]
        group.bench_with_input(BenchmarkId::new("portable", dim), &dim, |bencher, _| {
            bencher.iter(|| portable_dot_product(black_box(&a), black_box(&b)));
        });
        #[cfg(target_arch = "aarch64")]
        {
            group.bench_with_input(BenchmarkId::new("portable", dim), &dim, |bencher, _| {
                bencher.iter(|| dot_product_portable(black_box(&a), black_box(&b)));
            });
            group.bench_with_input(BenchmarkId::new("neon", dim), &dim, |bencher, _| {
                bencher.iter(|| neon_dot_product(black_box(&a), black_box(&b)));
            });
        }
    }
    group.finish();
}
/// Benchmark Euclidean distance at common embedding dimensionalities. On
/// aarch64 the crate's portable scalar kernel is compared against NEON;
/// elsewhere only the local scalar reference runs.
fn bench_euclidean_distance(c: &mut Criterion) {
    let mut group = c.benchmark_group("euclidean_neon_vs_portable");
    for dim in [128usize, 768, 1536] {
        let (a, b) = generate_f32_vectors(dim, 400 + dim as u64);
        group.throughput(Throughput::Elements(dim as u64));
        #[cfg(not(target_arch = "aarch64"))]
        group.bench_with_input(BenchmarkId::new("portable", dim), &dim, |bencher, _| {
            bencher.iter(|| portable_euclidean(black_box(&a), black_box(&b)));
        });
        #[cfg(target_arch = "aarch64")]
        {
            group.bench_with_input(BenchmarkId::new("portable", dim), &dim, |bencher, _| {
                bencher.iter(|| euclidean_distance_portable(black_box(&a), black_box(&b)));
            });
            group.bench_with_input(BenchmarkId::new("neon", dim), &dim, |bencher, _| {
                bencher.iter(|| neon_euclidean(black_box(&a), black_box(&b)));
            });
        }
    }
    group.finish();
}
/// Batched similarity benchmark: one 768-dim query scored against 1000
/// stored vectors, covering both dot product and Euclidean distance.
/// Throughput is counted in whole vectors compared per second.
fn bench_similarity_batch(c: &mut Criterion) {
let mut group = c.benchmark_group("similarity_batch");
// 1000 stored vectors at a typical transformer embedding dimension.
let batch_size = 1000;
let dim = 768;
// Each stored vector gets its own seed so the batch is varied but
// reproducible across runs.
let vectors: Vec<Vec<f32>> = (0..batch_size)
.map(|i| {
let (v, _) = generate_f32_vectors(dim, i as u64);
v
})
.collect();
let (query, _) = generate_f32_vectors(dim, 9999);
group.throughput(Throughput::Elements(batch_size as u64));
// Non-aarch64: only the local scalar reference kernels are available.
#[cfg(not(target_arch = "aarch64"))]
group.bench_function("dot_portable_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(portable_dot_product(black_box(&query), black_box(v)));
}
});
});
// aarch64: compare the crate's portable scalar kernels against NEON,
// using the same benchmark IDs so reports line up across platforms.
#[cfg(target_arch = "aarch64")]
{
group.bench_function("dot_portable_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(dot_product_portable(black_box(&query), black_box(v)));
}
});
});
group.bench_function("dot_neon_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(neon_dot_product(black_box(&query), black_box(v)));
}
});
});
group.bench_function("euclidean_portable_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(euclidean_distance_portable(black_box(&query), black_box(v)));
}
});
});
group.bench_function("euclidean_neon_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(neon_euclidean(black_box(&query), black_box(v)));
}
});
});
}
// Non-aarch64 euclidean counterpart of the dot-product bench above.
#[cfg(not(target_arch = "aarch64"))]
group.bench_function("euclidean_portable_batch_1000", |bench| {
bench.iter(|| {
for v in &vectors {
black_box(portable_euclidean(black_box(&query), black_box(v)));
}
});
});
group.finish();
}
fn bench_similarity_tail_handling(c: &mut Criterion) {
let mut group = c.benchmark_group("similarity_tail_handling");
let sizes = [
1, 3, 5, 7, 100, 103, ];
for size in sizes {
let (a, b) = generate_f32_vectors(size, 500 + size as u64);
#[cfg(not(target_arch = "aarch64"))]
group.bench_with_input(BenchmarkId::new("dot_portable", size), &size, |bench, _| {
bench.iter(|| portable_dot_product(black_box(&a), black_box(&b)));
});
#[cfg(target_arch = "aarch64")]
{
group.bench_with_input(BenchmarkId::new("dot_portable", size), &size, |bench, _| {
bench.iter(|| dot_product_portable(black_box(&a), black_box(&b)));
});
group.bench_with_input(BenchmarkId::new("dot_neon", size), &size, |bench, _| {
bench.iter(|| neon_dot_product(black_box(&a), black_box(&b)));
});
}
}
group.finish();
}
// Register every benchmark function under one group and let criterion
// generate the binary's `main` entry point.
criterion_group!(
benches,
bench_neon_vs_portable,
bench_neon_embedding_sizes,
bench_neon_batch,
bench_neon_tail_handling,
bench_dot_product,
bench_euclidean_distance,
bench_similarity_batch,
bench_similarity_tail_handling,
);
criterion_main!(benches);