// rustsim 0.0.1 - Docs.rs

//! CPU scaling benchmark — closes the R7 exit criterion
//! "CPU scaling is demonstrated on a representative benchmark".
//!
//! Compares the single-threaded `cpu_batch_step` against the opt-in rayon
//! `par_batch_step` across three agent counts (10k / 100k / 1M) on two SoA
//! kernels: a memory-bandwidth-bound baseline (`x[i] += vx[i]`) and a
//! compute-bound kernel (repeated `sin` / `cos` / `tanh` per element).
//!
//! Run with:
//!
//! ```sh
//! cargo bench -p rustsim --features rayon --bench cpu_scaling_bench
//! ```
//!
//! Without the `rayon` feature only the sequential path is measured.

use criterion::{black_box, criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use rustsim::prelude::*;

/// Minimal agent used by the benchmark: an identifier plus two `f32` fields.
///
/// `x` is the value the kernels advance and `vx` is the per-step increment
/// they read; both are exported as SoA columns by the `SoaExtractable` impl.
#[derive(Clone, Debug)]
struct Particle {
    /// Identifier reported through `Agent::id`.
    id: AgentId,
    /// Value updated by the benchmark kernels (column `"x"`).
    x: f32,
    /// Increment read by the benchmark kernels (column `"vx"`).
    vx: f32,
}

impl Agent for Particle {
    /// Returns the identifier stored in the particle's `id` field.
    fn id(&self) -> AgentId {
        let Self { id, .. } = self;
        *id
    }
}

impl SoaExtractable for Particle {
    /// Number of `f32` columns exported per particle (`x` and `vx`).
    fn num_columns() -> usize {
        2
    }

    /// Column labels, in the same order `extract_row` fills the columns.
    fn column_names() -> Vec<&'static str> {
        ["x", "vx"].to_vec()
    }

    /// Appends this particle's `x` to column 0 and its `vx` to column 1.
    ///
    /// Panics if `columns` holds fewer than two columns.
    fn extract_row(&self, columns: &mut [Vec<f32>]) {
        let Self { x, vx, .. } = self;
        columns[0].push(*x);
        columns[1].push(*vx);
    }

    /// Copies the value at `row` of column 0 back into `x`.
    ///
    /// `vx` is not written back; none of the kernels in this file modify it.
    fn write_back_row(&mut self, columns: &[&[f32]], row: usize) {
        let x_column = columns[0];
        self.x = x_column[row];
    }
}

/// Builds a store holding `n` particles with ids `1..=n`.
///
/// Every particle starts at `x = 0.0` with `vx = 0.001`, so all benchmark
/// runs operate on the same initial data for a given `n`.
fn build_store(n: u64) -> HashMapStore<Particle> {
    let mut agents = HashMapStore::new();
    (1..=n).for_each(|id| {
        agents.insert(Particle {
            id,
            x: 0.0,
            vx: 0.001,
        });
    });
    agents
}

/// Sequential memory-bound kernel: `x[i] += vx[i]` for the first `n` rows.
///
/// `columns[0]` is treated as the `x` column and `columns[1]` as the `vx`
/// column; `vx` is only read. Rows at index `n` and beyond are left untouched.
///
/// # Panics
///
/// Panics if `columns` has fewer than two columns, or if either of the first
/// two columns holds fewer than `n` values.
fn advance(columns: &mut [Vec<f32>], n: usize) {
    let (x_col, rest) = columns.split_at_mut(1);
    // Slice both columns to `n` up front: the length check is paid once here
    // instead of twice per element, leaving a panic-free loop body that the
    // optimizer is free to vectorize.
    let x = &mut x_col[0][..n];
    let vx = &rest[0][..n];
    for (xi, &vi) in x.iter_mut().zip(vx) {
        *xi += vi;
    }
}

/// Compute-bound per-element kernel, used where the trivial `x += vx`
/// baseline would mostly measure memory traffic.
///
/// Starting from `acc = x + vx`, it applies `acc = tanh(sin(acc) * cos(acc) + vx)`
/// four times and returns the final `acc`. That is 12 transcendental calls
/// plus 4 fused multiply-adds per element, so the cost is intended to be
/// dominated by arithmetic rather than by loading and storing the element.
#[inline(always)]
fn compute_heavy_step(x: f32, vx: f32) -> f32 {
    (0..4).fold(x + vx, |acc, _| {
        // `mul_add` evaluates sin(acc) * cos(acc) + vx with a single rounding.
        let mixed = acc.sin().mul_add(acc.cos(), vx);
        mixed.tanh()
    })
}

/// Sequential compute-bound kernel: replaces `x[i]` with
/// `compute_heavy_step(x[i], vx[i])` for the first `n` rows.
///
/// `columns[0]` is treated as the `x` column and `columns[1]` as the `vx`
/// column; `vx` is only read.
///
/// # Panics
///
/// Panics if `columns` has fewer than two columns, or when a row index below
/// `n` is out of range for either column.
fn advance_heavy(columns: &mut [Vec<f32>], n: usize) {
    let (head, tail) = columns.split_at_mut(1);
    let positions = &mut head[0];
    let velocities = &tail[0];
    (0..n).for_each(|row| {
        positions[row] = compute_heavy_step(positions[row], velocities[row]);
    });
}

#[cfg(feature = "rayon")]
/// Per-chunk memory-bound kernel for the parallel path: `x[i] += vx[i]` for
/// every row of the chunk.
///
/// `slices[0]` is treated as the chunk's `x` values and `slices[1]` as its
/// `vx` values; `vx` is only read. `chunk_start` is not needed by this kernel
/// and is ignored (presumably the chunk's offset in the full column — confirm
/// against `par_batch_step`).
///
/// # Panics
///
/// Panics if `slices` has fewer than two entries, or if the `vx` chunk is
/// shorter than the `x` chunk.
fn advance_par(chunk_start: usize, slices: &mut [&mut [f32]]) {
    let _ = chunk_start;
    let (x_slice, rest) = slices.split_at_mut(1);
    let x = &mut *x_slice[0];
    // Trim `vx` to the length of `x` once, so the loop below carries no
    // per-element bounds check and can be vectorized.
    let vx = &rest[0][..x.len()];
    for (xi, &vi) in x.iter_mut().zip(vx) {
        *xi += vi;
    }
}

#[cfg(feature = "rayon")]
/// Per-chunk compute-bound kernel for the parallel path: replaces each
/// `x[i]` of the chunk with `compute_heavy_step(x[i], vx[i])`.
///
/// `slices[0]` is treated as the chunk's `x` values and `slices[1]` as its
/// `vx` values; `vx` is only read. `chunk_start` is not needed by this kernel
/// and is ignored.
///
/// # Panics
///
/// Panics if `slices` has fewer than two entries, or when a row of the `x`
/// chunk has no matching row in the `vx` chunk.
fn advance_par_heavy(chunk_start: usize, slices: &mut [&mut [f32]]) {
    let _ = chunk_start;
    let (head, tail) = slices.split_at_mut(1);
    let positions = &mut *head[0];
    let velocities = &*tail[0];
    (0..positions.len()).for_each(|row| {
        positions[row] = compute_heavy_step(positions[row], velocities[row]);
    });
}

/// Criterion benchmark body for the `cpu_scaling` group.
///
/// For each agent count a store is built once, throughput is reported in
/// elements, and the sequential `cpu_batch_step` is timed with both the
/// memory-bound and the compute-bound kernel. With the `rayon` feature
/// enabled, `par_batch_step` is timed with the matching per-chunk kernels.
fn bench_cpu_scaling(c: &mut Criterion) {
    // Population sizes under test: 10k / 100k / 1M agents.
    const AGENT_COUNTS: [u64; 3] = [10_000, 100_000, 1_000_000];

    let mut group = c.benchmark_group("cpu_scaling");
    group.sample_size(20);

    for &n in &AGENT_COUNTS {
        let store = build_store(n);
        group.throughput(Throughput::Elements(n));

        // Memory-bound baseline (`x += vx`). Parallel speed-up for this
        // kernel is capped by DRAM bandwidth rather than by core count;
        // it is measured so that ceiling shows up in the results.
        let seq_mem_id = BenchmarkId::new("cpu_batch_step/mem", n);
        group.bench_with_input(seq_mem_id, &n, |bencher, _| {
            bencher.iter(|| {
                let result = cpu_batch_step::<Particle, _, _>(&store, advance);
                black_box(result.agent_count);
            });
        });

        // Compute-bound kernel (several transcendentals per element). This
        // is the case the R7 exit criterion expects to scale with cores:
        // real per-agent workloads (social force, collision response,
        // steering) resemble it more than they resemble the trivial add.
        let seq_compute_id = BenchmarkId::new("cpu_batch_step/compute", n);
        group.bench_with_input(seq_compute_id, &n, |bencher, _| {
            bencher.iter(|| {
                let result = cpu_batch_step::<Particle, _, _>(&store, advance_heavy);
                black_box(result.agent_count);
            });
        });

        #[cfg(feature = "rayon")]
        {
            // One sixteenth of the population per chunk, but never fewer
            // than 1024 rows per chunk.
            let even_split = n as usize / 16;
            let chunk_size = even_split.max(1024);

            let par_mem_id = BenchmarkId::new("par_batch_step/mem", n);
            group.bench_with_input(par_mem_id, &n, |bencher, _| {
                bencher.iter(|| {
                    let result = par_batch_step::<Particle, _, _>(&store, chunk_size, advance_par);
                    black_box(result.agent_count);
                });
            });

            let par_compute_id = BenchmarkId::new("par_batch_step/compute", n);
            group.bench_with_input(par_compute_id, &n, |bencher, _| {
                bencher.iter(|| {
                    let result =
                        par_batch_step::<Particle, _, _>(&store, chunk_size, advance_par_heavy);
                    black_box(result.agent_count);
                });
            });
        }
    }

    group.finish();
}

// Register `bench_cpu_scaling` under the `benches` group and let Criterion
// generate the benchmark binary's entry point for that group.
criterion_group!(benches, bench_cpu_scaling);
criterion_main!(benches);