use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use numrs2::parallel_optimize::{ParallelConfig, SchedulingStrategy};
use std::hint::black_box;
use std::sync::Arc;
use std::thread;
use scirs2_core::ndarray::ArrayView1;
use scirs2_core::simd::reductions::{simd_max_f64, simd_min_f64, simd_sum_f64};
/// Number of `f64` elements held by the fixed-size test arrays.
const SIMD_TEST_SIZE: usize = 1024;
/// Thread counts that every multi-threaded benchmark is parameterised over.
const THREAD_COUNTS: &[usize] = &[1, 2, 4, 8];
/// Number of inner-loop iterations each spawned thread performs per benchmark iteration.
const CONTENTION_ITERATIONS: usize = 10_000;
/// Atomic counter aligned to 64 bytes.
///
/// The 8-byte atomic plus 56 bytes of explicit padding add up to 64 bytes, so
/// consecutive elements of a `Vec<AlignedCounter>` never overlap within the
/// same 64-byte region (presumably chosen to match the cache-line size).
#[repr(align(64))]
struct AlignedCounter {
    value: std::sync::atomic::AtomicU64,
    _pad: [u8; 56],
}

impl Default for AlignedCounter {
    /// Returns a counter starting at zero with zeroed padding.
    fn default() -> Self {
        let value = std::sync::atomic::AtomicU64::new(0);
        let _pad = [0u8; 56];
        Self { value, _pad }
    }
}
/// Atomic counter with no alignment attribute and no padding.
///
/// It keeps `AtomicU64`'s own size and alignment, so neighbouring elements of
/// a `Vec<UnalignedCounter>` are stored back to back and can end up on the
/// same cache line.
#[derive(Default)]
struct UnalignedCounter {
    value: std::sync::atomic::AtomicU64,
}
/// Fixed-size `f64` buffer whose storage is forced onto a 64-byte boundary.
#[repr(align(64))]
struct AlignedArray {
    data: [f64; SIMD_TEST_SIZE],
}

impl AlignedArray {
    /// Creates an array with every element set to `0.0`.
    fn new() -> Self {
        let data = [0.0_f64; SIMD_TEST_SIZE];
        Self { data }
    }

    /// Overwrites every element so that index `i` holds `i * 0.5`.
    fn init_with_pattern(&mut self) {
        self.data
            .iter_mut()
            .enumerate()
            .for_each(|(index, slot)| *slot = index as f64 * 0.5);
    }
}
/// Fixed-size `f64` buffer with no alignment attribute.
///
/// Only the natural alignment of `f64` is guaranteed; the storage may still
/// happen to land on a larger boundary.
struct UnalignedArray {
    data: [f64; SIMD_TEST_SIZE],
}

impl UnalignedArray {
    /// Creates an array with every element set to `0.0`.
    fn new() -> Self {
        let data = [0.0_f64; SIMD_TEST_SIZE];
        Self { data }
    }

    /// Overwrites every element so that index `i` holds `i * 0.5`.
    fn init_with_pattern(&mut self) {
        self.data
            .iter_mut()
            .enumerate()
            .for_each(|(index, slot)| *slot = index as f64 * 0.5);
    }
}
/// Benchmarks per-thread atomic increments on 64-byte-aligned counters.
///
/// For every entry in `THREAD_COUNTS`, each measured iteration allocates one
/// `AlignedCounter` per thread, spawns the threads, lets each of them bump its
/// own counter `CONTENTION_ITERATIONS` times, joins them, and sums the
/// counters. Thread creation and joining are part of the measured time.
fn bench_false_sharing_aligned(c: &mut Criterion) {
    use std::sync::atomic::Ordering;

    let mut group = c.benchmark_group("false_sharing/aligned");
    for &num_threads in THREAD_COUNTS {
        let id = BenchmarkId::from_parameter(num_threads);
        group.bench_with_input(id, &num_threads, |bencher, &thread_count| {
            bencher.iter(|| {
                // One counter slot per worker thread.
                let slots: Vec<AlignedCounter> = std::iter::repeat_with(AlignedCounter::default)
                    .take(thread_count)
                    .collect();
                let slots = Arc::new(slots);

                // Spawn every worker before joining any of them.
                let mut workers = Vec::with_capacity(thread_count);
                for slot_index in 0..thread_count {
                    let shared = Arc::clone(&slots);
                    workers.push(thread::spawn(move || {
                        for _ in 0..CONTENTION_ITERATIONS {
                            shared[slot_index].value.fetch_add(1, Ordering::Relaxed);
                        }
                    }));
                }
                for worker in workers {
                    worker.join().unwrap();
                }

                let mut total = 0u64;
                for slot in slots.iter() {
                    total += slot.value.load(Ordering::Relaxed);
                }
                black_box(total);
            });
        });
    }
    group.finish();
}
/// Benchmarks per-thread atomic increments on tightly packed counters.
///
/// Same workload as the aligned variant, but the counters are
/// `UnalignedCounter`s stored back to back in a `Vec`, so counters owned by
/// different threads can share a cache line. Thread creation and joining are
/// part of the measured time.
fn bench_false_sharing_unaligned(c: &mut Criterion) {
    use std::sync::atomic::Ordering;

    let mut group = c.benchmark_group("false_sharing/unaligned");
    for &num_threads in THREAD_COUNTS {
        let id = BenchmarkId::from_parameter(num_threads);
        group.bench_with_input(id, &num_threads, |bencher, &thread_count| {
            bencher.iter(|| {
                // One counter slot per worker thread.
                let slots: Vec<UnalignedCounter> =
                    std::iter::repeat_with(UnalignedCounter::default)
                        .take(thread_count)
                        .collect();
                let slots = Arc::new(slots);

                // Spawn every worker before joining any of them.
                let mut workers = Vec::with_capacity(thread_count);
                for slot_index in 0..thread_count {
                    let shared = Arc::clone(&slots);
                    workers.push(thread::spawn(move || {
                        for _ in 0..CONTENTION_ITERATIONS {
                            shared[slot_index].value.fetch_add(1, Ordering::Relaxed);
                        }
                    }));
                }
                for worker in workers {
                    worker.join().unwrap();
                }

                let mut total = 0u64;
                for slot in slots.iter() {
                    total += slot.value.load(Ordering::Relaxed);
                }
                black_box(total);
            });
        });
    }
    group.finish();
}
/// Benchmarks the SIMD sum/min/max reductions over a 64-byte-aligned array.
///
/// The array is filled once with the `i * 0.5` pattern and reused by all three
/// benchmarks; throughput is reported in elements (`SIMD_TEST_SIZE` per
/// iteration).
fn bench_simd_aligned_loads(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_loads/aligned");
    group.throughput(Throughput::Elements(SIMD_TEST_SIZE as u64));
    let mut array = AlignedArray::new();
    array.init_with_pattern();
    group.bench_function("simd_sum", |bencher| {
        bencher.iter(|| {
            // Launder the input so the optimizer cannot treat the reduction as
            // loop-invariant and lift it out of the measurement loop.
            let view = ArrayView1::from(black_box(&array.data));
            let sum: f64 = simd_sum_f64(&view);
            black_box(sum);
        });
    });
    group.bench_function("simd_min", |bencher| {
        bencher.iter(|| {
            let view = ArrayView1::from(black_box(&array.data));
            let min: f64 = simd_min_f64(&view);
            black_box(min);
        });
    });
    group.bench_function("simd_max", |bencher| {
        bencher.iter(|| {
            let view = ArrayView1::from(black_box(&array.data));
            let max: f64 = simd_max_f64(&view);
            black_box(max);
        });
    });
    group.finish();
}
/// Benchmarks the SIMD sum/min/max reductions over an array without an
/// alignment attribute.
///
/// The array is filled once with the `i * 0.5` pattern and reused by all three
/// benchmarks; throughput is reported in elements (`SIMD_TEST_SIZE` per
/// iteration).
///
/// NOTE(review): `UnalignedArray` only drops the `repr(align(64))` attribute;
/// nothing here forces the buffer off a 64-byte boundary, so the stack slot may
/// still happen to be aligned. Confirm whether a guaranteed misalignment is
/// wanted for this comparison.
fn bench_simd_unaligned_loads(c: &mut Criterion) {
    let mut group = c.benchmark_group("simd_loads/unaligned");
    group.throughput(Throughput::Elements(SIMD_TEST_SIZE as u64));
    let mut array = UnalignedArray::new();
    array.init_with_pattern();
    group.bench_function("simd_sum", |bencher| {
        bencher.iter(|| {
            // Launder the input so the optimizer cannot treat the reduction as
            // loop-invariant and lift it out of the measurement loop.
            let view = ArrayView1::from(black_box(&array.data));
            let sum: f64 = simd_sum_f64(&view);
            black_box(sum);
        });
    });
    group.bench_function("simd_min", |bencher| {
        bencher.iter(|| {
            let view = ArrayView1::from(black_box(&array.data));
            let min: f64 = simd_min_f64(&view);
            black_box(min);
        });
    });
    group.bench_function("simd_max", |bencher| {
        bencher.iter(|| {
            let view = ArrayView1::from(black_box(&array.data));
            let max: f64 = simd_max_f64(&view);
            black_box(max);
        });
    });
    group.finish();
}
fn bench_parallel_config_aligned(c: &mut Criterion) {
let mut group = c.benchmark_group("parallel_config/aligned");
let config = Arc::new(ParallelConfig {
use_parallel: true,
min_parallel_size: 1000,
chunk_size: 250,
max_threads: Some(8),
scheduling_strategy: SchedulingStrategy::Adaptive,
});
for &num_threads in THREAD_COUNTS.iter() {
group.bench_with_input(
BenchmarkId::from_parameter(num_threads),
&num_threads,
|bencher, &num_threads| {
bencher.iter(|| {
let handles: Vec<_> = (0..num_threads)
.map(|_| {
let config = Arc::clone(&config);
thread::spawn(move || {
let mut sum = 0usize;
for _ in 0..CONTENTION_ITERATIONS {
sum += config.min_parallel_size;
sum += config.chunk_size;
if config.use_parallel {
sum += 1;
}
}
black_box(sum);
})
})
.collect();
for handle in handles {
handle.join().unwrap();
}
});
},
);
}
group.finish();
}
/// Benchmarks concurrent read-only access to a shared, locally defined config
/// struct that carries no alignment attribute.
///
/// The workload mirrors the aligned variant: for every entry in
/// `THREAD_COUNTS`, each measured iteration spawns that many threads, and each
/// thread reads three fields `CONTENTION_ITERATIONS` times into a local sum.
/// Thread creation and joining are part of the measured time.
fn bench_parallel_config_unaligned(c: &mut Criterion) {
    /// Stand-in config with default Rust layout (no `repr` attribute).
    /// `max_threads` and `_extra` are never read by the benchmark.
    #[derive(Clone)]
    struct UnalignedConfig {
        use_parallel: bool,
        min_parallel_size: usize,
        chunk_size: usize,
        max_threads: Option<usize>,
        _extra: u32,
    }
    let mut group = c.benchmark_group("parallel_config/unaligned");
    let config = Arc::new(UnalignedConfig {
        use_parallel: true,
        min_parallel_size: 1000,
        chunk_size: 250,
        max_threads: Some(8),
        _extra: 0,
    });
    for &num_threads in THREAD_COUNTS.iter() {
        group.bench_with_input(
            BenchmarkId::from_parameter(num_threads),
            &num_threads,
            |bencher, &num_threads| {
                bencher.iter(|| {
                    let handles: Vec<_> = (0..num_threads)
                        .map(|_| {
                            let config = Arc::clone(&config);
                            thread::spawn(move || {
                                let mut sum = 0usize;
                                for _ in 0..CONTENTION_ITERATIONS {
                                    // Launder the reference on every pass.
                                    // Without this the field reads are
                                    // loop-invariant and the whole loop can be
                                    // folded into a single multiplication, so
                                    // no memory reads would be measured.
                                    let cfg = black_box(&*config);
                                    sum += cfg.min_parallel_size;
                                    sum += cfg.chunk_size;
                                    if cfg.use_parallel {
                                        sum += 1;
                                    }
                                }
                                black_box(sum);
                            })
                        })
                        .collect();
                    for handle in handles {
                        handle.join().unwrap();
                    }
                });
            },
        );
    }
    group.finish();
}
/// Benchmarks many threads hammering one shared counter that is aligned to
/// 64 bytes.
///
/// For every entry in `THREAD_COUNTS`, each measured iteration creates a fresh
/// shared state, spawns the threads, and has each of them perform
/// `CONTENTION_ITERATIONS` load-then-store increments. The load/store pair is
/// not a single atomic read-modify-write, so concurrent increments can be
/// lost and the final value is not asserted, only passed to `black_box`.
/// Thread creation and joining are part of the measured time.
fn bench_cache_contention_aligned(c: &mut Criterion) {
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Shared counter padded and aligned to 64 bytes.
    #[repr(align(64))]
    struct AlignedSharedState {
        counter: AtomicU64,
        _pad: [u8; 56],
    }

    let mut group = c.benchmark_group("cache_contention/aligned");
    for &num_threads in THREAD_COUNTS {
        let id = BenchmarkId::from_parameter(num_threads);
        group.bench_with_input(id, &num_threads, |bencher, &thread_count| {
            bencher.iter(|| {
                let shared = Arc::new(AlignedSharedState {
                    counter: AtomicU64::new(0),
                    _pad: [0; 56],
                });

                // Spawn every worker before joining any of them.
                let mut workers = Vec::with_capacity(thread_count);
                for _ in 0..thread_count {
                    let local = Arc::clone(&shared);
                    workers.push(thread::spawn(move || {
                        for _ in 0..CONTENTION_ITERATIONS {
                            let seen = local.counter.load(Ordering::Acquire);
                            local.counter.store(seen + 1, Ordering::Release);
                        }
                    }));
                }
                for worker in workers {
                    worker.join().unwrap();
                }

                let observed = shared.counter.load(Ordering::Relaxed);
                black_box(observed);
            });
        });
    }
    group.finish();
}
/// Benchmarks many threads hammering one shared counter that has no alignment
/// attribute or padding.
///
/// For every entry in `THREAD_COUNTS`, each measured iteration creates a fresh
/// shared state, spawns the threads, and has each of them perform
/// `CONTENTION_ITERATIONS` load-then-store increments. The load/store pair is
/// not a single atomic read-modify-write, so concurrent increments can be
/// lost and the final value is not asserted, only passed to `black_box`.
/// Thread creation and joining are part of the measured time.
fn bench_cache_contention_unaligned(c: &mut Criterion) {
    use std::sync::atomic::{AtomicU64, Ordering};

    /// Shared counter with the default layout of a lone `AtomicU64`.
    struct UnalignedSharedState {
        counter: AtomicU64,
    }

    let mut group = c.benchmark_group("cache_contention/unaligned");
    for &num_threads in THREAD_COUNTS {
        let id = BenchmarkId::from_parameter(num_threads);
        group.bench_with_input(id, &num_threads, |bencher, &thread_count| {
            bencher.iter(|| {
                let shared = Arc::new(UnalignedSharedState {
                    counter: AtomicU64::new(0),
                });

                // Spawn every worker before joining any of them.
                let mut workers = Vec::with_capacity(thread_count);
                for _ in 0..thread_count {
                    let local = Arc::clone(&shared);
                    workers.push(thread::spawn(move || {
                        for _ in 0..CONTENTION_ITERATIONS {
                            let seen = local.counter.load(Ordering::Acquire);
                            local.counter.store(seen + 1, Ordering::Release);
                        }
                    }));
                }
                for worker in workers {
                    worker.join().unwrap();
                }

                let observed = shared.counter.load(Ordering::Relaxed);
                black_box(observed);
            });
        });
    }
    group.finish();
}
/// Benchmarks a sequential scalar sum over every element of the
/// 64-byte-aligned array.
///
/// Throughput is reported as `SIMD_TEST_SIZE` elements per iteration.
fn bench_stride_aligned_sequential(c: &mut Criterion) {
    let mut group = c.benchmark_group("stride_access/aligned_sequential");
    group.throughput(Throughput::Elements(SIMD_TEST_SIZE as u64));
    let mut array = AlignedArray::new();
    array.init_with_pattern();
    group.bench_function("sum", |bencher| {
        bencher.iter(|| {
            // Launder the input so the sum cannot be treated as loop-invariant
            // and computed once outside the measurement loop.
            let sum: f64 = black_box(&array.data).iter().sum();
            black_box(sum);
        });
    });
    group.finish();
}
/// Benchmarks a scalar sum over every eighth element of the 64-byte-aligned
/// array.
///
/// Throughput is reported as the number of elements actually visited
/// (`SIMD_TEST_SIZE / 8` per iteration).
fn bench_stride_aligned_strided(c: &mut Criterion) {
    // Single source of truth for the stride, so the reported throughput and the
    // access pattern cannot drift apart.
    const STRIDE: usize = 8;

    let mut group = c.benchmark_group("stride_access/aligned_strided");
    group.throughput(Throughput::Elements((SIMD_TEST_SIZE / STRIDE) as u64));
    let mut array = AlignedArray::new();
    array.init_with_pattern();
    group.bench_function("sum_stride_8", |bencher| {
        bencher.iter(|| {
            // Launder the input so the sum cannot be treated as loop-invariant
            // and computed once outside the measurement loop.
            let sum: f64 = black_box(&array.data).iter().step_by(STRIDE).sum();
            black_box(sum);
        });
    });
    group.finish();
}
/// Benchmarks a sequential scalar sum over every element of the array that
/// carries no alignment attribute.
///
/// Throughput is reported as `SIMD_TEST_SIZE` elements per iteration.
fn bench_stride_unaligned_sequential(c: &mut Criterion) {
    let mut group = c.benchmark_group("stride_access/unaligned_sequential");
    group.throughput(Throughput::Elements(SIMD_TEST_SIZE as u64));
    let mut array = UnalignedArray::new();
    array.init_with_pattern();
    group.bench_function("sum", |bencher| {
        bencher.iter(|| {
            // Launder the input so the sum cannot be treated as loop-invariant
            // and computed once outside the measurement loop.
            let sum: f64 = black_box(&array.data).iter().sum();
            black_box(sum);
        });
    });
    group.finish();
}
/// Benchmarks a scalar sum over every eighth element of the array that
/// carries no alignment attribute.
///
/// Throughput is reported as the number of elements actually visited
/// (`SIMD_TEST_SIZE / 8` per iteration).
fn bench_stride_unaligned_strided(c: &mut Criterion) {
    // Single source of truth for the stride, so the reported throughput and the
    // access pattern cannot drift apart.
    const STRIDE: usize = 8;

    let mut group = c.benchmark_group("stride_access/unaligned_strided");
    group.throughput(Throughput::Elements((SIMD_TEST_SIZE / STRIDE) as u64));
    let mut array = UnalignedArray::new();
    array.init_with_pattern();
    group.bench_function("sum_stride_8", |bencher| {
        bencher.iter(|| {
            // Launder the input so the sum cannot be treated as loop-invariant
            // and computed once outside the measurement loop.
            let sum: f64 = black_box(&array.data).iter().step_by(STRIDE).sum();
            black_box(sum);
        });
    });
    group.finish();
}
// Collects every benchmark function in this file into the `cache_benches`
// group. Each aligned benchmark is listed next to its unaligned counterpart.
criterion_group!(
    cache_benches,
    bench_false_sharing_aligned,
    bench_false_sharing_unaligned,
    bench_simd_aligned_loads,
    bench_simd_unaligned_loads,
    bench_parallel_config_aligned,
    bench_parallel_config_unaligned,
    bench_cache_contention_aligned,
    bench_cache_contention_unaligned,
    bench_stride_aligned_sequential,
    bench_stride_aligned_strided,
    bench_stride_unaligned_sequential,
    bench_stride_unaligned_strided,
);
// Expands to the benchmark binary's entry point, which runs `cache_benches`.
criterion_main!(cache_benches);