rustsim-crowd 0.0.1

//! CUDA vs CPU equivalence for `rustsim_crowd::cuda::social_force`.
//!
//! Gated on the `cuda` feature. On hosts without a working CUDA
//! driver, the test **skips** (logs to stderr and exits cleanly)
//! rather than failing, so it is safe to run unconditionally in CI.
//!
//! The GPU arm runs in `f32` internally; the CPU reference runs in
//! `f64`. We tolerate `1e-2 m` positional divergence and `1e-2 m/s`
//! velocity divergence after 10 ticks of a 16-pedestrian counter-flow
//! with two walls — well within the numerical tolerance expected of
//! an SFM simulation and consistent with how FLAMEGPU2 reports
//! equivalence against reference CPU implementations.

#![cfg(feature = "cuda")]
// CPU baseline intentionally drives the deprecated O(n²) `step` so
// the equivalence test exercises the simplest possible reference
// implementation. Production callers go through `step_scratch` /
// `step_with_grid` (see docs/rustsim-crowd.md P1-7).
#![allow(deprecated)]

use rustsim_crowd::common::{Pedestrian, WallSegment};
use rustsim_crowd::cuda::social_force as sfm_cuda;
use rustsim_crowd::social_force;

fn seed() -> Vec<Pedestrian> {
    // Deterministic 16-agent fixture shared by every test in this file.
    // Agents alternate between two lanes (y = 0.0 and y = 1.5), two per
    // column, with columns spaced 1.2 apart along x. Even indices head
    // towards x = +50 and odd indices towards x = -50, each keeping its own
    // lane y — i.e. two counter-flowing lanes.
    //
    // NOTE(review): the argument order passed to `Pedestrian::new` is
    // presumably (position, velocity, radius, desired speed, goal) — confirm
    // against the constructor.
    let mut crowd = Vec::with_capacity(16);
    for idx in 0..16 {
        let lane_y = (idx % 2) as f64 * 1.5;
        let start_x = (idx / 2) as f64 * 1.2;
        let goal_x = if idx % 2 != 0 { -50.0 } else { 50.0 };
        crowd.push(Pedestrian::new(
            [start_x, lane_y],
            [0.0, 0.0],
            0.25,
            1.34,
            [goal_x, lane_y],
        ));
    }
    crowd
}

#[test]
fn cuda_sfm_matches_cpu_within_tolerance() {
    // Stateless path: the pedestrian and wall slices are handed to
    // `CudaState::step` on every tick. Both arms start from identical
    // `seed()` crowds, advance 10 ticks of dt = 0.05 between the same two
    // walls, and must then agree to within 1e-2 per axis on position and
    // velocity.
    let mut peds_cpu = seed();
    let mut peds_gpu = seed();
    let walls = vec![
        WallSegment {
            a: [-20.0, -1.0],
            b: [20.0, -1.0],
        },
        WallSegment {
            a: [-20.0, 2.5],
            b: [20.0, 2.5],
        },
    ];
    let params = social_force::Params::default();
    let dt = 0.05;

    // Try to spin up a CUDA state; skip the test cleanly (log to stderr,
    // assert nothing) if initialisation fails.
    let state = match sfm_cuda::CudaState::new() {
        Ok(s) => s,
        Err(e) => {
            eprintln!("skipping cuda_sfm_matches_cpu_within_tolerance: no CUDA device ({e})");
            return;
        }
    };

    // 10 ticks on each path. A step failure after a successful init is a
    // real error, not a "no GPU" skip, hence the `expect`.
    for _ in 0..10 {
        social_force::step(&mut peds_cpu, &walls, &params, dt);
        state
            .step(&mut peds_gpu, &walls, &params, dt)
            .expect("CUDA step failed after successful init");
    }

    // `zip` stops at the shorter sequence, so without this check a GPU path
    // that dropped (or added) agents would slip through the per-agent
    // comparison below. The resident-path tests perform the same check.
    assert_eq!(peds_gpu.len(), peds_cpu.len());
    for (i, (a, b)) in peds_cpu.iter().zip(peds_gpu.iter()).enumerate() {
        // Per-axis absolute differences; a NaN on either side makes the
        // `<` comparisons false and therefore fails the assertion.
        let dx = (a.pos[0] - b.pos[0]).abs();
        let dy = (a.pos[1] - b.pos[1]).abs();
        let dvx = (a.vel[0] - b.vel[0]).abs();
        let dvy = (a.vel[1] - b.vel[1]).abs();
        assert!(
            dx < 1e-2 && dy < 1e-2,
            "agent {i} position diverged: cpu={:?} gpu={:?}",
            a.pos,
            b.pos
        );
        assert!(
            dvx < 1e-2 && dvy < 1e-2,
            "agent {i} velocity diverged: cpu={:?} gpu={:?}",
            a.vel,
            b.vel
        );
    }
}

#[test]
fn cuda_sfm_resident_matches_cpu_within_tolerance() {
    // Device-resident variant of the CPU/GPU equivalence check: the crowd
    // and walls are uploaded once, the tick loop only calls
    // `resident.step`, and the results are pulled back with a single
    // `download` before comparison.
    let mut peds_cpu = seed();
    let peds_initial = seed();
    let walls = vec![
        WallSegment {
            a: [-20.0, -1.0],
            b: [20.0, -1.0],
        },
        WallSegment {
            a: [-20.0, 2.5],
            b: [20.0, 2.5],
        },
    ];
    let params = social_force::Params::default();
    let dt = 0.05;

    // An upload failure is treated as "no usable GPU": log and bail out
    // without asserting anything.
    let mut resident = match sfm_cuda::CudaResident::upload(&peds_initial, &walls) {
        Err(e) => {
            eprintln!(
                "skipping cuda_sfm_resident_matches_cpu_within_tolerance: no CUDA device ({e})"
            );
            return;
        }
        Ok(handle) => handle,
    };

    // Advance both arms in lock-step for 10 ticks.
    for _tick in 0..10 {
        social_force::step(&mut peds_cpu, &walls, &params, dt);
        resident.step(&params, dt).expect("resident step failed");
    }

    // Single read-back of the device state for verification.
    let mut peds_gpu = Vec::<Pedestrian>::new();
    resident
        .download(&mut peds_gpu)
        .expect("resident download failed");

    assert_eq!(peds_gpu.len(), peds_cpu.len());
    for (i, (cpu, gpu)) in peds_cpu.iter().zip(&peds_gpu).enumerate() {
        // Per-axis absolute gaps between the two arms.
        let pos_gap = [
            (cpu.pos[0] - gpu.pos[0]).abs(),
            (cpu.pos[1] - gpu.pos[1]).abs(),
        ];
        let vel_gap = [
            (cpu.vel[0] - gpu.vel[0]).abs(),
            (cpu.vel[1] - gpu.vel[1]).abs(),
        ];
        assert!(
            pos_gap.iter().all(|&gap| gap < 1e-2),
            "resident agent {i} position diverged: cpu={:?} gpu={:?}",
            cpu.pos,
            gpu.pos
        );
        assert!(
            vel_gap.iter().all(|&gap| gap < 1e-2),
            "resident agent {i} velocity diverged: cpu={:?} gpu={:?}",
            cpu.vel,
            gpu.vel
        );
    }
}

#[test]
fn cuda_sfm_resident_grid_matches_cpu_within_tolerance() {
    // Resident-path scenario again, this time through the device-side
    // uniform-grid kernel. `enable_grid` sets up the bucket layout; each
    // `step_grid` then runs three on-device kernels (clear heads, build
    // linked lists, neighbourhood walk) with zero host↔device traffic.
    // This is the path meant to scale to millions of agents — the O(n^2)
    // non-grid resident `step` tops out around N ≈ 10 000 even with all
    // data resident.
    let mut peds_cpu = seed();
    let peds_initial = seed();
    let walls = vec![
        WallSegment {
            a: [-20.0, -1.0],
            b: [20.0, -1.0],
        },
        WallSegment {
            a: [-20.0, 2.5],
            b: [20.0, 2.5],
        },
    ];
    let params = social_force::Params::default();
    let dt = 0.05;

    // An upload failure is treated as "no usable GPU": log and bail out
    // without asserting anything.
    let mut resident = match sfm_cuda::CudaResident::upload(&peds_initial, &walls) {
        Err(e) => {
            eprintln!(
                "skipping cuda_sfm_resident_grid_matches_cpu_within_tolerance: no CUDA device ({e})"
            );
            return;
        }
        Ok(handle) => handle,
    };

    // Grid spans a 50 x 10 region starting at (-25, -5), plus two spare
    // cells per axis, which comfortably contains the seeded agents and both
    // walls. Cell size equals the neighbour cutoff so every relevant pair
    // sits inside the 3x3 cell neighbourhood.
    let cutoff = social_force::neighbor_cutoff(&params);
    let cells_x = (50.0_f64 / cutoff).ceil() as u32 + 2;
    let cells_y = (10.0_f64 / cutoff).ceil() as u32 + 2;
    let cfg = sfm_cuda::GridConfig {
        origin: [-25.0, -5.0],
        cell_size: cutoff,
        dims: (cells_x, cells_y),
        cutoff_sq: cutoff * cutoff,
    };
    resident
        .enable_grid(cfg)
        .expect("enable_grid failed on a small scene");
    assert!(resident.has_grid());

    // Advance both arms in lock-step for 10 ticks.
    for _tick in 0..10 {
        social_force::step(&mut peds_cpu, &walls, &params, dt);
        resident
            .step_grid(&params, dt)
            .expect("resident step_grid failed");
    }

    // Single read-back of the device state for verification.
    let mut peds_gpu = Vec::<Pedestrian>::new();
    resident
        .download(&mut peds_gpu)
        .expect("resident download failed");

    assert_eq!(peds_gpu.len(), peds_cpu.len());
    for (i, (cpu, gpu)) in peds_cpu.iter().zip(&peds_gpu).enumerate() {
        // Per-axis absolute gaps between the two arms.
        let pos_gap = [
            (cpu.pos[0] - gpu.pos[0]).abs(),
            (cpu.pos[1] - gpu.pos[1]).abs(),
        ];
        let vel_gap = [
            (cpu.vel[0] - gpu.vel[0]).abs(),
            (cpu.vel[1] - gpu.vel[1]).abs(),
        ];
        assert!(
            pos_gap.iter().all(|&gap| gap < 1e-2),
            "grid agent {i} position diverged: cpu={:?} gpu={:?}",
            cpu.pos,
            gpu.pos
        );
        assert!(
            vel_gap.iter().all(|&gap| gap < 1e-2),
            "grid agent {i} velocity diverged: cpu={:?} gpu={:?}",
            cpu.vel,
            gpu.vel
        );
    }
}