use std::time::{Duration, Instant};
/// Entry point: prints host capabilities, then runs every micro-benchmark in
/// sequence (XOR, single mul-add, multi-source accumulation, parallel repair
/// simulation).
fn main() {
    // Buffer sizes shared by the benchmarks below.
    const SLICE_SIZE: usize = 768_000;
    const LARGE_BUF: usize = 4 * 1024 * 1024;
    // Largest source count exercised; also used by the repair simulation.
    const MAX_SOURCES: usize = 1365;

    println!("=== rust-par2 micro-benchmarks ===\n");

    #[cfg(target_arch = "x86_64")]
    {
        println!("CPU features:");
        println!(" AVX2: {}", is_x86_feature_detected!("avx2"));
        println!(" SSSE3: {}", is_x86_feature_detected!("ssse3"));
        println!(" AVX-512F: {}", is_x86_feature_detected!("avx512f"));
        println!(" PCLMULQDQ: {}", is_x86_feature_detected!("pclmulqdq"));
        println!();
    }

    // Fall back to a single thread when the parallelism query fails.
    let cores = match std::thread::available_parallelism() {
        Ok(n) => n.get(),
        Err(_) => 1,
    };
    println!("Available parallelism: {cores} threads\n");

    // Throwaway call on tiny buffers before any timing starts.
    // NOTE(review): presumably this forces one-time kernel selection/initialisation
    // inside the library so it is not charged to the first benchmark — confirm.
    let mut warm_dst = [0u8; 4];
    let warm_src = [0u8; 4];
    rust_par2::gf_simd_public::mul_add_buffer(&mut warm_dst, &warm_src, 1);

    bench_xor_throughput(LARGE_BUF);

    for buf_size in [LARGE_BUF, SLICE_SIZE] {
        bench_single_mul_add(buf_size);
    }

    for num_sources in [16, 64, 128, 512, MAX_SOURCES] {
        bench_multi_source_accumulate(SLICE_SIZE, num_sources);
    }

    println!("\n--- Repair simulation (parallel, rayon) ---\n");
    for num_damaged in [40, 10] {
        bench_repair_simulation(SLICE_SIZE, MAX_SOURCES, num_damaged);
    }
}
/// Benchmarks `xor_buffers` on a pair of `buf_size`-byte buffers.
///
/// The kernel is driven through `bench_loop` for at least two seconds, after
/// which data throughput, a memory-traffic figure and the iteration count are
/// printed to stdout.
fn bench_xor_throughput(buf_size: usize) {
    let input = vec![0xAAu8; buf_size];
    let mut output = vec![0x55u8; buf_size];

    let (iters, elapsed) = bench_loop(Duration::from_secs(2), || {
        rust_par2::gf_simd_public::xor_buffers(&mut output, &input);
    });

    let secs = elapsed.as_secs_f64();
    let total_bytes = buf_size as f64 * iters as f64;
    let gbps = total_bytes / secs / 1e9;
    // Memory traffic is reported as three times the data volume
    // (presumably read src + read dst + write dst).
    let mem_gbps = 3.0 * total_bytes / secs / 1e9;

    println!("XOR throughput ({}):", fmt_size(buf_size));
    println!(" {gbps:.2} GB/s data, {mem_gbps:.2} GB/s memory traffic");
    println!(" {iters} iterations in {:.3}s", secs);
    println!();
}
/// Benchmarks a single `mul_add_buffer` call with a fixed coefficient on
/// `buf_size` bytes of random source data.
///
/// Prints data throughput, a memory-traffic figure, the multiplication rate
/// and the iteration count.
fn bench_single_mul_add(buf_size: usize) {
    const COEFF: u16 = 12345;

    let input = random_buf(buf_size);
    let mut accum = vec![0u8; buf_size];

    let (iters, elapsed) = bench_loop(Duration::from_secs(2), || {
        rust_par2::gf_simd_public::mul_add_buffer(&mut accum, &input, COEFF);
    });

    let secs = elapsed.as_secs_f64();
    let runs = iters as f64;

    let total_bytes = buf_size as f64 * runs;
    let gbps = total_bytes / secs / 1e9;
    // Memory traffic is reported as three times the data volume
    // (presumably read src + read dst + write dst).
    let mem_gbps = 3.0 * total_bytes / secs / 1e9;

    // One multiplication is counted per two bytes of input.
    let gf_muls = (buf_size / 2) as f64 * runs;
    let gf_muls_per_sec = gf_muls / secs;

    println!("Single mul_add ({}):", fmt_size(buf_size));
    println!(" {gbps:.2} GB/s data, {mem_gbps:.2} GB/s memory traffic");
    println!(" {:.2} billion GF-muls/sec", gf_muls_per_sec / 1e9);
    println!(" {iters} iterations in {:.3}s", secs);
    println!();
}
/// Benchmarks accumulating `num_sources` random source slices of
/// `slice_size` bytes into one destination buffer with `mul_add_buffer`.
///
/// Each timed iteration zeroes the destination and then applies one
/// `mul_add_buffer` per source, each with its own non-zero coefficient.
/// Prints source-data throughput, an estimated memory-traffic figure, the
/// multiplication rate and the time per full accumulation.
fn bench_multi_source_accumulate(slice_size: usize, num_sources: usize) {
    let sources: Vec<Vec<u8>> = (0..num_sources).map(|_| random_buf(slice_size)).collect();

    // Coefficients 1, 2, 3, ... wrapping within the non-zero u16 range.
    // A plain `num_sources as u16` would silently truncate for more than
    // 65535 sources, and the `zip` below would then process fewer sources
    // than the throughput maths accounts for. For up to 65535 sources this
    // yields exactly 1..=num_sources.
    let coeffs: Vec<u16> = (0..num_sources)
        .map(|i| (i % 65535) as u16 + 1)
        .collect();

    let mut dst = vec![0u8; slice_size];
    let (iters, elapsed) = bench_loop(Duration::from_secs(2), || {
        dst.fill(0);
        for (src, &coeff) in sources.iter().zip(coeffs.iter()) {
            rust_par2::gf_simd_public::mul_add_buffer(&mut dst, src, coeff);
        }
    });

    let secs = elapsed.as_secs_f64();
    let runs = iters as f64;

    let total_data = slice_size as f64 * num_sources as f64 * runs;
    let gbps = total_data / secs / 1e9;

    // One multiplication is counted per two bytes of source data.
    let gf_muls = (slice_size / 2) as f64 * num_sources as f64 * runs;
    let gf_muls_per_sec = gf_muls / secs;

    // Traffic estimate: every source counted once, plus two slice-sized
    // accesses for the destination, per iteration.
    let mem_cold = (num_sources as f64 * slice_size as f64 + 2.0 * slice_size as f64) * runs;
    let mem_gbps = mem_cold / secs / 1e9;

    let per_iter_ms = secs * 1000.0 / runs;

    println!(
        "Multi-source accumulate ({} srcs × {}):",
        num_sources,
        fmt_size(slice_size)
    );
    println!(" {gbps:.2} GB/s src data, ~{mem_gbps:.2} GB/s estimated mem traffic");
    println!(" {:.2} billion GF-muls/sec", gf_muls_per_sec / 1e9);
    println!(
        " {per_iter_ms:.3} ms per accumulation ({iters} iters in {:.3}s)",
        secs
    );
    println!();
}
fn bench_repair_simulation(slice_size: usize, num_sources: usize, num_damaged: usize) {
use rayon::prelude::*;
let sources: Vec<Vec<u8>> = (0..num_sources).map(|_| random_buf(slice_size)).collect();
let coeffs: Vec<Vec<u16>> = (0..num_damaged)
.map(|d| {
(0..num_sources)
.map(|s| ((d * 31 + s * 17 + 1) % 65535) as u16)
.collect()
})
.collect();
let src_refs: Vec<&[u8]> = sources.iter().map(|s| s.as_slice()).collect();
let t = Instant::now();
let _results: Vec<Vec<u8>> = (0..num_damaged)
.into_par_iter()
.map(|dmg_i| {
let mut dst = vec![0u8; slice_size];
for (src_idx, src_data) in src_refs.iter().enumerate() {
let coeff = coeffs[dmg_i][src_idx];
if coeff != 0 {
rust_par2::gf_simd_public::mul_add_buffer(&mut dst, src_data, coeff);
}
}
dst
})
.collect();
let elapsed = t.elapsed();
let total_src_data = slice_size as f64 * num_sources as f64 * num_damaged as f64;
let gbps = total_src_data / elapsed.as_secs_f64() / 1e9;
let gf_muls = (slice_size / 2) as f64 * num_sources as f64 * num_damaged as f64;
let total_data_gb = num_sources as f64 * slice_size as f64 / 1e9;
println!(
"Repair sim: {} srcs × {} damaged × {} = {:.1} GB total data",
num_sources,
num_damaged,
fmt_size(slice_size),
total_data_gb,
);
println!(" {:.3}s elapsed", elapsed.as_secs_f64());
println!(" {gbps:.2} GB/s src throughput (parallel)");
println!(
" {:.2} billion GF-muls/sec (parallel)",
gf_muls / elapsed.as_secs_f64() / 1e9
);
println!(
" {:.1} ms per damaged block",
elapsed.as_secs_f64() * 1000.0 / num_damaged as f64,
);
println!();
}
/// Repeatedly invokes `f` and reports how many timed calls were made and how
/// long they took.
///
/// `f` is first called three times untimed as a warm-up. It is then called in
/// a loop until at least `min_duration` has elapsed *and* at least four timed
/// calls have completed.
///
/// Returns `(timed_call_count, elapsed_time)`; the warm-up calls are excluded
/// from both.
fn bench_loop<F: FnMut()>(min_duration: Duration, mut f: F) -> (u64, Duration) {
    const WARMUP_RUNS: u32 = 3;
    const MIN_TIMED_RUNS: u64 = 4;

    (0..WARMUP_RUNS).for_each(|_| f());

    let mut completed = 0u64;
    let clock = Instant::now();
    loop {
        f();
        completed += 1;

        let spent = clock.elapsed();
        // Keep going until both the time budget and the run floor are met.
        if completed < MIN_TIMED_RUNS || spent < min_duration {
            continue;
        }
        return (completed, spent);
    }
}
/// Returns a freshly allocated buffer of `size` bytes filled from the `rand`
/// crate's `rng()` generator.
fn random_buf(size: usize) -> Vec<u8> {
    use rand::RngCore;

    let mut bytes = vec![0u8; size];
    let mut generator = rand::rng();
    generator.fill_bytes(bytes.as_mut_slice());
    bytes
}
/// Formats a byte count for display using binary (1024-based) units.
///
/// * 1 MiB and above: megabytes with one decimal place, e.g. `"4.0 MB"`.
/// * 1 KiB and above: whole kilobytes, rounded down, e.g. `"750 KB"`.
/// * Otherwise: the raw byte count, e.g. `"512 B"`.
fn fmt_size(bytes: usize) -> String {
    const KIB: usize = 1024;
    const MIB: usize = KIB * KIB;

    match bytes {
        b if b >= MIB => format!("{:.1} MB", b as f64 / (1024.0 * 1024.0)),
        b if b >= KIB => format!("{} KB", b / KIB),
        b => format!("{b} B"),
    }
}