rustsim-crowd 0.0.1

//! CUDA implementation of [Social Force](crate::social_force).
//!
//! Single entry point: [`step`] — same signature as the CPU
//! [`crate::social_force::step`] plus a `Result<(), String>` return to
//! surface CUDA runtime failures. For callers that want transparent
//! CPU fallback, use [`step_with_fallback`].
//!
//! For multi-step simulations, keep a persistent [`CudaState`] around
//! so the CUDA context, compiled kernel, and device buffers are all
//! reused across ticks — allocating them per step would destroy the
//! throughput advantage.

use cudarc::driver::{
    CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::compile_ptx;
use std::sync::Arc;

use crate::common::{Pedestrian, WallSegment};
use crate::social_force::Params;

/// CUDA source for the Social Force step kernel.
///
/// Compiled to PTX at runtime via NVRTC. One thread per pedestrian,
/// O(n²) pair loop inside, double-buffered for intra-step correctness.
const SFM_CUDA_SRC: &str = r#"
extern "C" __global__ void sfm_step(
    const float* __restrict__ pos_x_in,
    const float* __restrict__ pos_y_in,
    const float* __restrict__ vel_x_in,
    const float* __restrict__ vel_y_in,
    const float* __restrict__ radius,
    const float* __restrict__ desired_speed,
    const float* __restrict__ dest_x,
    const float* __restrict__ dest_y,
    float* __restrict__ pos_x_out,
    float* __restrict__ pos_y_out,
    float* __restrict__ vel_x_out,
    float* __restrict__ vel_y_out,
    const float* __restrict__ wall_ax,
    const float* __restrict__ wall_ay,
    const float* __restrict__ wall_bx,
    const float* __restrict__ wall_by,
    unsigned int n,
    unsigned int n_walls,
    float tau,
    float a_ped,
    float b_ped,
    float a_wall,
    float b_wall,
    float mass,
    float max_speed,
    float max_accel,
    float arrival_radius,
    float dt)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float pxi = pos_x_in[i];
    float pyi = pos_y_in[i];
    float vxi = vel_x_in[i];
    float vyi = vel_y_in[i];
    float ri  = radius[i];
    float v0  = desired_speed[i];
    float dxi = dest_x[i];
    float dyi = dest_y[i];

    // Driving force with arrival taper.
    float to_dest_x = dxi - pxi;
    float to_dest_y = dyi - pyi;
    float d = sqrtf(to_dest_x * to_dest_x + to_dest_y * to_dest_y);
    float eff_v0 = v0;
    if (arrival_radius > 0.0f && d < arrival_radius) {
        eff_v0 = v0 * (d / arrival_radius);
    }
    float dir_x = 0.0f;
    float dir_y = 0.0f;
    if (d > 1.0e-12f) {
        dir_x = to_dest_x / d;
        dir_y = to_dest_y / d;
    }
    float fx = mass * (eff_v0 * dir_x - vxi) / tau;
    float fy = mass * (eff_v0 * dir_y - vyi) / tau;

    // Pedestrian repulsion — O(n²) pair loop.
    for (unsigned int j = 0; j < n; ++j) {
        if (j == i) continue;
        float dx = pxi - pos_x_in[j];
        float dy = pyi - pos_y_in[j];
        float dist = sqrtf(dx * dx + dy * dy);
        if (dist < 1.0e-12f) continue;
        float r_sum = ri + radius[j];
        float mag = a_ped * expf((r_sum - dist) / b_ped);
        float inv_dist = 1.0f / dist;
        fx += mag * dx * inv_dist;
        fy += mag * dy * inv_dist;
    }

    // Wall repulsion.
    for (unsigned int k = 0; k < n_walls; ++k) {
        float ax = wall_ax[k];
        float ay = wall_ay[k];
        float bx = wall_bx[k];
        float by = wall_by[k];
        float abx = bx - ax;
        float aby = by - ay;
        float denom = abx * abx + aby * aby;
        float t = 0.0f;
        if (denom > 1.0e-18f) {
            t = ((pxi - ax) * abx + (pyi - ay) * aby) / denom;
            if (t < 0.0f) t = 0.0f;
            if (t > 1.0f) t = 1.0f;
        }
        float cpx = ax + t * abx;
        float cpy = ay + t * aby;
        float dx = pxi - cpx;
        float dy = pyi - cpy;
        float dist = sqrtf(dx * dx + dy * dy);
        if (dist < 1.0e-12f) continue;
        float mag = a_wall * expf((ri - dist) / b_wall);
        float inv_dist = 1.0f / dist;
        fx += mag * dx * inv_dist;
        fy += mag * dy * inv_dist;
    }

    // a = F / m, clamp to max_accel for explicit-Euler stability.
    float ax_acc = fx / mass;
    float ay_acc = fy / mass;
    float a_mag = sqrtf(ax_acc * ax_acc + ay_acc * ay_acc);
    if (a_mag > max_accel && a_mag > 1.0e-12f) {
        float s = max_accel / a_mag;
        ax_acc *= s;
        ay_acc *= s;
    }

    // Integrate.
    float vx_new = vxi + ax_acc * dt;
    float vy_new = vyi + ay_acc * dt;
    float v_mag = sqrtf(vx_new * vx_new + vy_new * vy_new);
    if (v_mag > max_speed && v_mag > 1.0e-12f) {
        float s = max_speed / v_mag;
        vx_new *= s;
        vy_new *= s;
    }
    float px_new = pxi + vx_new * dt;
    float py_new = pyi + vy_new * dt;

    pos_x_out[i] = px_new;
    pos_y_out[i] = py_new;
    vel_x_out[i] = vx_new;
    vel_y_out[i] = vy_new;
}
"#;

// ---------------------------------------------------------------------------
// Uniform-grid acceleration kernels.
//
// Replace the O(n^2) `sfm_step` pair loop with a 3x3 cell neighbourhood
// walk via on-device linked-list bucketing. Construction is O(n)
// per tick, traversal is O(n * k) where k is the average neighbours
// per agent — bounded by the local density once the cell size is set
// to ~`neighbor_cutoff(params)`. This is what unlocks million-agent
// scaling; the O(n^2) `sfm_step` kernel above is kept as the
// reference path and for small populations where atomic-exchange
// list insertion is slower than the brute-force scan.
//
// Three kernels, all launched from `CudaResident::step_grid`:
//
//   `grid_clear_heads(cell_head[ncells], ncells)`
//     Sets every entry of `cell_head` to `UINT_MAX` (sentinel for
//     "empty cell"). One thread per cell.
//
//   `grid_build(pos_x_in, pos_y_in, cell_head, cell_next,
//               origin_x, origin_y, inv_cell_size, grid_w, grid_h, n)`
//     For each pedestrian i, computes its cell index and inserts it
//     at the head of that cell's linked list via
//     `cell_next[i] = atomicExch(&cell_head[cell], i)`.
//     This is the Bell / Yokota approach — no prefix sum needed,
//     constant memory overhead, fully data-parallel.
//
//   `sfm_step_grid(... same column args as sfm_step ...,
//                  cell_head, cell_next,
//                  origin_x, origin_y, inv_cell_size, grid_w, grid_h,
//                  cutoff_sq)`
//     Identical force law to `sfm_step`, but each thread walks the
//     linked lists of its own cell and the eight neighbours, pruning
//     pairs whose squared distance exceeds `cutoff_sq`.

const SFM_CUDA_GRID_SRC: &str = r#"
extern "C" __global__ void grid_clear_heads(
    unsigned int* __restrict__ cell_head,
    unsigned int ncells)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= ncells) return;
    cell_head[i] = 0xFFFFFFFFu;
}

__device__ __forceinline__ unsigned int cell_index_for(
    float x, float y,
    float origin_x, float origin_y, float inv_cell_size,
    unsigned int grid_w, unsigned int grid_h)
{
    int cx = (int)floorf((x - origin_x) * inv_cell_size);
    int cy = (int)floorf((y - origin_y) * inv_cell_size);
    if (cx < 0) cx = 0;
    if (cy < 0) cy = 0;
    if ((unsigned int)cx >= grid_w) cx = (int)grid_w - 1;
    if ((unsigned int)cy >= grid_h) cy = (int)grid_h - 1;
    return (unsigned int)cy * grid_w + (unsigned int)cx;
}

extern "C" __global__ void grid_build(
    const float* __restrict__ pos_x_in,
    const float* __restrict__ pos_y_in,
    unsigned int* __restrict__ cell_head,
    unsigned int* __restrict__ cell_next,
    float origin_x, float origin_y, float inv_cell_size,
    unsigned int grid_w, unsigned int grid_h,
    unsigned int n)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    unsigned int cell = cell_index_for(
        pos_x_in[i], pos_y_in[i],
        origin_x, origin_y, inv_cell_size,
        grid_w, grid_h);
    // Atomic-exchange head insertion: classic uniform-grid trick,
    // no scan, no sort, O(1) per agent.
    cell_next[i] = atomicExch(&cell_head[cell], i);
}

extern "C" __global__ void sfm_step_grid(
    const float* __restrict__ pos_x_in,
    const float* __restrict__ pos_y_in,
    const float* __restrict__ vel_x_in,
    const float* __restrict__ vel_y_in,
    const float* __restrict__ radius,
    const float* __restrict__ desired_speed,
    const float* __restrict__ dest_x,
    const float* __restrict__ dest_y,
    float* __restrict__ pos_x_out,
    float* __restrict__ pos_y_out,
    float* __restrict__ vel_x_out,
    float* __restrict__ vel_y_out,
    const float* __restrict__ wall_ax,
    const float* __restrict__ wall_ay,
    const float* __restrict__ wall_bx,
    const float* __restrict__ wall_by,
    const unsigned int* __restrict__ cell_head,
    const unsigned int* __restrict__ cell_next,
    unsigned int n,
    unsigned int n_walls,
    float origin_x, float origin_y, float inv_cell_size,
    unsigned int grid_w, unsigned int grid_h,
    float cutoff_sq,
    float tau,
    float a_ped,
    float b_ped,
    float a_wall,
    float b_wall,
    float mass,
    float max_speed,
    float max_accel,
    float arrival_radius,
    float dt)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float pxi = pos_x_in[i];
    float pyi = pos_y_in[i];
    float vxi = vel_x_in[i];
    float vyi = vel_y_in[i];
    float ri  = radius[i];
    float v0  = desired_speed[i];
    float dxi = dest_x[i];
    float dyi = dest_y[i];

    float to_dest_x = dxi - pxi;
    float to_dest_y = dyi - pyi;
    float d = sqrtf(to_dest_x * to_dest_x + to_dest_y * to_dest_y);
    float eff_v0 = v0;
    if (arrival_radius > 0.0f && d < arrival_radius) {
        eff_v0 = v0 * (d / arrival_radius);
    }
    float dir_x = 0.0f;
    float dir_y = 0.0f;
    if (d > 1.0e-12f) {
        dir_x = to_dest_x / d;
        dir_y = to_dest_y / d;
    }
    float fx = mass * (eff_v0 * dir_x - vxi) / tau;
    float fy = mass * (eff_v0 * dir_y - vyi) / tau;

    // Walk the 3x3 cell neighbourhood.
    int cx = (int)floorf((pxi - origin_x) * inv_cell_size);
    int cy = (int)floorf((pyi - origin_y) * inv_cell_size);
    int gw = (int)grid_w;
    int gh = (int)grid_h;
    int x0 = cx - 1; if (x0 < 0) x0 = 0;
    int x1 = cx + 1; if (x1 >= gw) x1 = gw - 1;
    int y0 = cy - 1; if (y0 < 0) y0 = 0;
    int y1 = cy + 1; if (y1 >= gh) y1 = gh - 1;

    for (int ny = y0; ny <= y1; ++ny) {
        for (int nx = x0; nx <= x1; ++nx) {
            unsigned int cell = (unsigned int)ny * grid_w + (unsigned int)nx;
            unsigned int j = cell_head[cell];
            while (j != 0xFFFFFFFFu) {
                if (j != i) {
                    float ddx = pxi - pos_x_in[j];
                    float ddy = pyi - pos_y_in[j];
                    float d2 = ddx * ddx + ddy * ddy;
                    if (d2 < cutoff_sq && d2 > 1.0e-24f) {
                        float dist = sqrtf(d2);
                        float r_sum = ri + radius[j];
                        float mag = a_ped * expf((r_sum - dist) / b_ped);
                        float inv_dist = 1.0f / dist;
                        fx += mag * ddx * inv_dist;
                        fy += mag * ddy * inv_dist;
                    }
                }
                j = cell_next[j];
            }
        }
    }

    for (unsigned int k = 0; k < n_walls; ++k) {
        float ax = wall_ax[k];
        float ay = wall_ay[k];
        float bx = wall_bx[k];
        float by = wall_by[k];
        float abx = bx - ax;
        float aby = by - ay;
        float denom = abx * abx + aby * aby;
        float t = 0.0f;
        if (denom > 1.0e-18f) {
            t = ((pxi - ax) * abx + (pyi - ay) * aby) / denom;
            if (t < 0.0f) t = 0.0f;
            if (t > 1.0f) t = 1.0f;
        }
        float cpx = ax + t * abx;
        float cpy = ay + t * aby;
        float ddx = pxi - cpx;
        float ddy = pyi - cpy;
        float dist = sqrtf(ddx * ddx + ddy * ddy);
        if (dist < 1.0e-12f) continue;
        float mag = a_wall * expf((ri - dist) / b_wall);
        float inv_dist = 1.0f / dist;
        fx += mag * ddx * inv_dist;
        fy += mag * ddy * inv_dist;
    }

    float ax_acc = fx / mass;
    float ay_acc = fy / mass;
    float a_mag = sqrtf(ax_acc * ax_acc + ay_acc * ay_acc);
    if (a_mag > max_accel && a_mag > 1.0e-12f) {
        float s = max_accel / a_mag;
        ax_acc *= s;
        ay_acc *= s;
    }

    float vx_new = vxi + ax_acc * dt;
    float vy_new = vyi + ay_acc * dt;
    float v_mag = sqrtf(vx_new * vx_new + vy_new * vy_new);
    if (v_mag > max_speed && v_mag > 1.0e-12f) {
        float s = max_speed / v_mag;
        vx_new *= s;
        vy_new *= s;
    }
    float px_new = pxi + vx_new * dt;
    float py_new = pyi + vy_new * dt;

    pos_x_out[i] = px_new;
    pos_y_out[i] = py_new;
    vel_x_out[i] = vx_new;
    vel_y_out[i] = vy_new;
}
"#;

/// Persistent CUDA state for the Social Force kernel.
///
/// Keeps the CUDA context, compiled kernel function, default stream,
/// and device-side buffer handles alive across successive [`step`]
/// calls. Construct once per simulation; dropping it releases every
/// CUDA resource.
pub struct CudaState {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,
}

impl CudaState {
    /// Initialise CUDA on device 0, compile the SFM kernel to PTX, and
    /// load it into a module. Returns `Err` if no CUDA device is
    /// available, the driver fails to initialise, or NVRTC compilation
    /// fails.
    pub fn new() -> Result<Self, String> {
        Self::with_block_size(256)
    }

    /// Like [`CudaState::new`] but with a configurable CUDA block size.
    pub fn with_block_size(block_size: u32) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let ctx = super::new_context(0)?;
        let stream = ctx.default_stream();
        let ptx = compile_ptx(SFM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
        let module = ctx
            .load_module(ptx)
            .map_err(|e| format!("module load failed: {e}"))?;
        let func = module
            .load_function("sfm_step")
            .map_err(|e| format!("kernel lookup failed: {e}"))?;
        Ok(Self {
            ctx,
            stream,
            func,
            block_size,
        })
    }

    /// Advance `peds` by one tick of `dt` seconds on the GPU.
    ///
    /// Stateless w.r.t. device memory: allocates fresh device buffers
    /// on every call. For a hot loop, prefer a caller that holds onto
    /// persistent buffers (roadmap P0-1 follow-up: a resident-store
    /// path analogous to [`rustsim::device_store::DeviceSoaStore`]).
    ///
    /// Returns the kernel execution time in microseconds.
    pub fn step(
        &self,
        peds: &mut [Pedestrian],
        walls: &[WallSegment],
        params: &Params,
        dt: f64,
    ) -> Result<u128, String> {
        let n = peds.len();
        if n == 0 {
            return Ok(0);
        }

        let stream = &self.stream;
        // Unpack AoS → 8 SoA f32 columns for the input state.
        let mut pos_x: Vec<f32> = Vec::with_capacity(n);
        let mut pos_y: Vec<f32> = Vec::with_capacity(n);
        let mut vel_x: Vec<f32> = Vec::with_capacity(n);
        let mut vel_y: Vec<f32> = Vec::with_capacity(n);
        let mut radius: Vec<f32> = Vec::with_capacity(n);
        let mut desired_speed: Vec<f32> = Vec::with_capacity(n);
        let mut dest_x: Vec<f32> = Vec::with_capacity(n);
        let mut dest_y: Vec<f32> = Vec::with_capacity(n);
        for p in peds.iter() {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        // Wall columns.
        let n_walls = walls.len();
        let mut wall_ax: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_ay: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_bx: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_by: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        if n_walls == 0 {
            // CUDA disallows zero-sized device allocations. Pad with a
            // single degenerate segment; the kernel skips it via
            // `n_walls == 0`.
            wall_ax.push(0.0);
            wall_ay.push(0.0);
            wall_bx.push(0.0);
            wall_by.push(0.0);
        } else {
            for w in walls {
                wall_ax.push(w.a[0] as f32);
                wall_ay.push(w.a[1] as f32);
                wall_bx.push(w.b[0] as f32);
                wall_by.push(w.b[1] as f32);
            }
        }

        // Upload to device.
        let d_pos_x: CudaSlice<f32> = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y: CudaSlice<f32> = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x: CudaSlice<f32> = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y: CudaSlice<f32> = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_radius: CudaSlice<f32> = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed: CudaSlice<f32> = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x: CudaSlice<f32> = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y: CudaSlice<f32> = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let mut d_pos_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_out failed: {e}"))?;
        let mut d_pos_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_out failed: {e}"))?;
        let mut d_vel_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_out failed: {e}"))?;
        let mut d_vel_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_out failed: {e}"))?;
        let d_wall_ax: CudaSlice<f32> = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay: CudaSlice<f32> = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx: CudaSlice<f32> = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by: CudaSlice<f32> = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        let n_u32 = n as u32;
        let n_walls_u32 = n_walls as u32;
        let tau = params.tau as f32;
        let a_ped = params.a_ped as f32;
        let b_ped = params.b_ped as f32;
        let a_wall = params.a_wall as f32;
        let b_wall = params.b_wall as f32;
        let mass = params.mass as f32;
        let max_speed = params.max_speed as f32;
        let max_accel = params.max_accel as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        let t0 = std::time::Instant::now();

        // SAFETY:
        // - every device slice was allocated on `stream` above and
        //   outlives the launch;
        // - `n_u32` is the logical row count; the kernel bounds-checks
        //   every access against `n` and `n_walls`;
        // - argument order matches the `sfm_step` signature exactly;
        // - the launch is followed by `stream.synchronize()` before
        //   any host-visible read of the output buffers.
        unsafe {
            let mut b = stream.launch_builder(&self.func);
            b.arg(&d_pos_x);
            b.arg(&d_pos_y);
            b.arg(&d_vel_x);
            b.arg(&d_vel_y);
            b.arg(&d_radius);
            b.arg(&d_desired_speed);
            b.arg(&d_dest_x);
            b.arg(&d_dest_y);
            b.arg(&mut d_pos_x_out);
            b.arg(&mut d_pos_y_out);
            b.arg(&mut d_vel_x_out);
            b.arg(&mut d_vel_y_out);
            b.arg(&d_wall_ax);
            b.arg(&d_wall_ay);
            b.arg(&d_wall_bx);
            b.arg(&d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&tau);
            b.arg(&a_ped);
            b.arg(&b_ped);
            b.arg(&a_wall);
            b.arg(&b_wall);
            b.arg(&mass);
            b.arg(&max_speed);
            b.arg(&max_accel);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }

        stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        // Download outputs.
        stream
            .memcpy_dtoh(&d_pos_x_out, &mut pos_x)
            .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_pos_y_out, &mut pos_y)
            .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_x_out, &mut vel_x)
            .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_y_out, &mut vel_y)
            .map_err(|e| format!("dtoh vel_y failed: {e}"))?;

        // Pack columns back into AoS `Pedestrian`. Radius /
        // desired_speed / destination are read-only this step.
        for (i, p) in peds.iter_mut().enumerate() {
            p.pos = [pos_x[i] as f64, pos_y[i] as f64];
            p.vel = [vel_x[i] as f64, vel_y[i] as f64];
        }

        // Silence the unused-warning on `ctx`: it anchors the CUDA
        // context's lifetime for every buffer we dropped above.
        let _ = &self.ctx;

        Ok(kernel_us)
    }
}

/// Convenience one-shot: initialise CUDA, run one SFM step, tear down.
///
/// Equivalent to `CudaState::new()?.step(peds, walls, params, dt)`.
/// For any production tick loop, construct a [`CudaState`] once and
/// call [`CudaState::step`] every tick instead — otherwise per-step
/// NVRTC compilation and context re-init dominate the runtime.
pub fn step(
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> Result<u128, String> {
    let state = CudaState::new()?;
    state.step(peds, walls, params, dt)
}

/// CPU-fallback convenience: try CUDA, fall back to [`crate::social_force::step`]
/// on any CUDA error.
///
/// Mirrors the workspace's `auto_batch_step` policy: CUDA is attempted
/// first, and any runtime failure (no device, driver error, kernel
/// launch failure) is logged via `tracing::warn!` and the caller's
/// step is completed on the CPU with bit-for-bit the same semantics
/// (up to f64 → f32 round-off in the GPU arm when it succeeds).
///
/// Returns `true` iff the CUDA path succeeded.
pub fn step_with_fallback(
    state: &mut Option<CudaState>,
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> bool {
    // Lazy-init on first call so callers don't pay NVRTC on CPU-only
    // hosts.
    if state.is_none() {
        match CudaState::new() {
            Ok(s) => *state = Some(s),
            Err(e) => {
                eprintln!("rustsim-crowd CUDA init failed ({e}), using CPU path");
                #[allow(deprecated)]
                crate::social_force::step(peds, walls, params, dt);
                return false;
            }
        }
    }
    match state.as_ref().unwrap().step(peds, walls, params, dt) {
        Ok(_) => true,
        Err(e) => {
            eprintln!("rustsim-crowd CUDA step failed ({e}), falling back to CPU");
            #[allow(deprecated)]
            crate::social_force::step(peds, walls, params, dt);
            false
        }
    }
}

// ---------------------------------------------------------------------------
// Resident-store path.
// ---------------------------------------------------------------------------

/// CUDA state with **persistent device-resident** buffers.
///
/// Mirrors the `DeviceSoaStore::step_cuda_resident` pattern from the
/// workspace: on `upload`, every per-agent and per-wall column is
/// copied to the device exactly once. Every subsequent [`CudaResident::step`]
/// launches the kernel with no host↔device traffic — the `*_in`/`*_out`
/// buffers are swapped in place on the device, so the simulation stays
/// fully resident across ticks and a bulk [`CudaResident::download`]
/// pulls the final state back to the host when analysis is needed.
///
/// This is the path that unlocks the roadmap's advertised 50–200×
/// throughput over `step_scratch`: the stateless [`CudaState::step`]
/// re-uploads 8 columns per tick, which saturates PCIe long before the
/// kernel has a chance to dominate the profile.
///
/// Invariants:
/// - `n`, radius, desired_speed, destination, and the wall segments
///   are fixed for the lifetime of the `CudaResident`. Mutating any
///   of them requires a fresh `upload`.
/// - The kernel double-buffers through `d_pos_x_a`/`d_pos_x_b` (and
///   the `y`/`vel_x`/`vel_y` counterparts); after every step the
///   `a`/`b` roles swap so the next launch reads from what was just
///   written.
pub struct CudaResident {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,

    n: usize,
    n_walls: usize,

    // Double-buffered dynamic state. Convention: `_a` is always the
    // *input* buffer at the top of a step; `_b` is the *output*. After
    // the kernel returns, the two are `mem::swap`ped so the next tick
    // once again reads from `_a` — that avoids ever needing
    // simultaneous mutable access to both halves of a pair.
    d_pos_x_a: CudaSlice<f32>,
    d_pos_y_a: CudaSlice<f32>,
    d_vel_x_a: CudaSlice<f32>,
    d_vel_y_a: CudaSlice<f32>,
    d_pos_x_b: CudaSlice<f32>,
    d_pos_y_b: CudaSlice<f32>,
    d_vel_x_b: CudaSlice<f32>,
    d_vel_y_b: CudaSlice<f32>,

    // Static per-agent columns.
    d_radius: CudaSlice<f32>,
    d_desired_speed: CudaSlice<f32>,
    d_dest_x: CudaSlice<f32>,
    d_dest_y: CudaSlice<f32>,

    // Walls.
    d_wall_ax: CudaSlice<f32>,
    d_wall_ay: CudaSlice<f32>,
    d_wall_bx: CudaSlice<f32>,
    d_wall_by: CudaSlice<f32>,

    // Optional uniform-grid acceleration. Allocated lazily by
    // `enable_grid`. When `Some`, `step_grid` is available and
    // launches the linked-list cell traversal kernel instead of
    // the O(n^2) `sfm_step`. The grid kernels are pre-loaded at
    // upload time so the same `enable_grid` call doesn't pay
    // NVRTC; only the device buffers are allocated lazily.
    grid: Option<ResidentGrid>,
    func_clear_heads: CudaFunction,
    func_build_grid: CudaFunction,
    func_step_grid: CudaFunction,
}

/// Uniform-grid configuration baked into a [`CudaResident`].
///
/// Built by [`CudaResident::enable_grid`]. The grid is fixed once
/// configured: re-entering [`CudaResident::enable_grid`] with new
/// bounds reallocates the cell head array.
#[derive(Debug, Clone, Copy)]
pub struct GridConfig {
    /// Lower-left corner of the grid in world units.
    pub origin: [f64; 2],
    /// Cell side length in world units. Must be ≥ pair-interaction
    /// cutoff so that the 3×3 cell neighbourhood walk captures every
    /// non-negligible pair.
    pub cell_size: f64,
    /// Number of cells along x and y. The grid covers
    /// `[origin, origin + dims * cell_size]`; agents whose positions
    /// fall outside that box are clamped into the boundary cells.
    pub dims: (u32, u32),
    /// Squared pair-interaction cutoff (m²). Pairs beyond this
    /// distance are pruned inside the grid kernel; choose
    /// `(neighbor_cutoff(params))²` to keep the GPU result within
    /// f32-rounding of the O(n²) `step`.
    pub cutoff_sq: f64,
}

struct ResidentGrid {
    cfg: GridConfig,
    d_cell_head: CudaSlice<u32>,
    d_cell_next: CudaSlice<u32>,
    ncells: usize,
}

impl CudaResident {
    /// Upload an initial pedestrian + wall configuration to the
    /// device. Every subsequent [`CudaResident::step`] advances this
    /// resident state with no host traffic.
    ///
    /// Returns `Err` if CUDA init, NVRTC compile, or any `memcpy_htod`
    /// fails.
    pub fn upload(peds: &[Pedestrian], walls: &[WallSegment]) -> Result<Self, String> {
        Self::upload_with_block_size(peds, walls, 256)
    }

    /// Like [`CudaResident::upload`] but with a configurable block
    /// size.
    pub fn upload_with_block_size(
        peds: &[Pedestrian],
        walls: &[WallSegment],
        block_size: u32,
    ) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let ctx = super::new_context(0)?;
        let stream = ctx.default_stream();
        let ptx = compile_ptx(SFM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
        let module = ctx
            .load_module(ptx)
            .map_err(|e| format!("module load failed: {e}"))?;
        let func = module
            .load_function("sfm_step")
            .map_err(|e| format!("kernel lookup failed: {e}"))?;

        // Pre-compile the grid-acceleration kernels so `enable_grid`
        // pays only the device-allocation cost, not NVRTC.
        let grid_ptx = compile_ptx(SFM_CUDA_GRID_SRC)
            .map_err(|e| format!("NVRTC compile (grid) failed: {e}"))?;
        let grid_module = ctx
            .load_module(grid_ptx)
            .map_err(|e| format!("module load (grid) failed: {e}"))?;
        let func_clear_heads = grid_module
            .load_function("grid_clear_heads")
            .map_err(|e| format!("grid_clear_heads lookup failed: {e}"))?;
        let func_build_grid = grid_module
            .load_function("grid_build")
            .map_err(|e| format!("grid_build lookup failed: {e}"))?;
        let func_step_grid = grid_module
            .load_function("sfm_step_grid")
            .map_err(|e| format!("sfm_step_grid lookup failed: {e}"))?;

        let n = peds.len();
        if n == 0 {
            return Err("CudaResident::upload requires at least one pedestrian".to_string());
        }

        // Unpack AoS → f32 columns.
        let mut pos_x = Vec::with_capacity(n);
        let mut pos_y = Vec::with_capacity(n);
        let mut vel_x = Vec::with_capacity(n);
        let mut vel_y = Vec::with_capacity(n);
        let mut radius = Vec::with_capacity(n);
        let mut desired_speed = Vec::with_capacity(n);
        let mut dest_x = Vec::with_capacity(n);
        let mut dest_y = Vec::with_capacity(n);
        for p in peds.iter() {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        // Wall columns with the same "pad to 1 if empty" rule as the
        // stateless path — CUDA rejects zero-sized allocations.
        let n_walls = walls.len();
        let (wall_ax, wall_ay, wall_bx, wall_by) = if n_walls == 0 {
            (vec![0.0f32], vec![0.0f32], vec![0.0f32], vec![0.0f32])
        } else {
            let mut ax = Vec::with_capacity(n_walls);
            let mut ay = Vec::with_capacity(n_walls);
            let mut bx = Vec::with_capacity(n_walls);
            let mut by = Vec::with_capacity(n_walls);
            for w in walls {
                ax.push(w.a[0] as f32);
                ay.push(w.a[1] as f32);
                bx.push(w.b[0] as f32);
                by.push(w.b[1] as f32);
            }
            (ax, ay, bx, by)
        };

        // Upload. The "_a" side holds the initial state; "_b" is
        // zeroed and will be written on the first step.
        let d_pos_x_a = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y_a = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x_a = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y_a = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_pos_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_b failed: {e}"))?;
        let d_pos_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_b failed: {e}"))?;
        let d_vel_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_b failed: {e}"))?;
        let d_vel_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_b failed: {e}"))?;
        let d_radius = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let d_wall_ax = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        stream
            .synchronize()
            .map_err(|e| format!("initial sync failed: {e}"))?;

        Ok(Self {
            ctx,
            stream,
            func,
            block_size,
            n,
            n_walls,
            d_pos_x_a,
            d_pos_y_a,
            d_vel_x_a,
            d_vel_y_a,
            d_pos_x_b,
            d_pos_y_b,
            d_vel_x_b,
            d_vel_y_b,
            d_radius,
            d_desired_speed,
            d_dest_x,
            d_dest_y,
            d_wall_ax,
            d_wall_ay,
            d_wall_bx,
            d_wall_by,
            grid: None,
            func_clear_heads,
            func_build_grid,
            func_step_grid,
        })
    }

    /// Number of resident pedestrians.
    #[inline]
    pub fn len(&self) -> usize {
        self.n
    }

    /// Whether any pedestrians are resident.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// Advance one tick of `dt` seconds fully on the GPU.
    ///
    /// Zero host↔device traffic. Returns the kernel execution time in
    /// microseconds.
    pub fn step(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
        let n_u32 = self.n as u32;
        let n_walls_u32 = self.n_walls as u32;
        let tau = params.tau as f32;
        let a_ped = params.a_ped as f32;
        let b_ped = params.b_ped as f32;
        let a_wall = params.a_wall as f32;
        let b_wall = params.b_wall as f32;
        let mass = params.mass as f32;
        let max_speed = params.max_speed as f32;
        let max_accel = params.max_accel as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = self.n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        // Split-borrow so the input half (`_a`) is shared and the
        // output half (`_b`) is unique for the duration of the launch.
        let Self {
            ref stream,
            ref func,
            ref d_radius,
            ref d_desired_speed,
            ref d_dest_x,
            ref d_dest_y,
            ref d_wall_ax,
            ref d_wall_ay,
            ref d_wall_bx,
            ref d_wall_by,
            ref d_pos_x_a,
            ref d_pos_y_a,
            ref d_vel_x_a,
            ref d_vel_y_a,
            ref mut d_pos_x_b,
            ref mut d_pos_y_b,
            ref mut d_vel_x_b,
            ref mut d_vel_y_b,
            ..
        } = *self;

        let t0 = std::time::Instant::now();

        // SAFETY:
        // - every device slice is owned by `self` and lives longer
        //   than the launch;
        // - `n_u32` is the kernel's logical row count; the kernel
        //   bounds-checks every access;
        // - argument order matches the `sfm_step` signature exactly;
        // - the `_a` (input) and `_b` (output) halves are distinct
        //   allocations — no aliasing.
        unsafe {
            let mut b = stream.launch_builder(func);
            b.arg(d_pos_x_a);
            b.arg(d_pos_y_a);
            b.arg(d_vel_x_a);
            b.arg(d_vel_y_a);
            b.arg(d_radius);
            b.arg(d_desired_speed);
            b.arg(d_dest_x);
            b.arg(d_dest_y);
            b.arg(d_pos_x_b);
            b.arg(d_pos_y_b);
            b.arg(d_vel_x_b);
            b.arg(d_vel_y_b);
            b.arg(d_wall_ax);
            b.arg(d_wall_ay);
            b.arg(d_wall_bx);
            b.arg(d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&tau);
            b.arg(&a_ped);
            b.arg(&b_ped);
            b.arg(&a_wall);
            b.arg(&b_wall);
            b.arg(&mass);
            b.arg(&max_speed);
            b.arg(&max_accel);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }

        stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        // Rotate input/output roles: after the swap, `_a` once again
        // holds the freshest state.
        std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
        std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
        std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
        std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);

        // Silence unused-anchor warnings on the lifetime owners.
        let _ = &self.ctx;

        Ok(kernel_us)
    }

    /// Allocate the device-side uniform-grid buffers for this
    /// resident store and stash the configuration. After this returns
    /// `Ok(())`, [`CudaResident::step_grid`] is the fast path.
    ///
    /// The grid kernels themselves were already compiled at
    /// [`CudaResident::upload`] time; this call only sizes the
    /// `cell_head[ncells]` and `cell_next[n]` device buffers and
    /// records the configuration. Calling `enable_grid` again with
    /// different bounds reallocates `cell_head` (and `cell_next` if
    /// `n` somehow grew, though `n` is fixed for a `CudaResident`).
    ///
    /// Constraints on `cfg`:
    /// - `cell_size > 0` and `cell_size >= sqrt(cutoff_sq)`. The 3×3
    ///   neighbour walk only captures pairs within
    ///   `sqrt(2) * cell_size` so a smaller cell would silently drop
    ///   real interactions.
    /// - `dims.0 > 0 && dims.1 > 0`.
    /// - `dims.0 * dims.1` must fit in `u32` (the device cell index).
    ///
    /// Returns `Err` for any violation, or any device allocation
    /// failure.
    pub fn enable_grid(&mut self, cfg: GridConfig) -> Result<(), String> {
        if cfg.cell_size <= 0.0 || !cfg.cell_size.is_finite() {
            return Err("GridConfig.cell_size must be positive and finite".to_string());
        }
        if !cfg.cutoff_sq.is_finite() || cfg.cutoff_sq < 0.0 {
            return Err("GridConfig.cutoff_sq must be non-negative and finite".to_string());
        }
        if cfg.cell_size * cfg.cell_size < cfg.cutoff_sq - 1.0e-9 {
            return Err(format!(
                "GridConfig.cell_size ({}) is smaller than sqrt(cutoff_sq) ({}); the 3x3 neighbour walk would miss real pairs",
                cfg.cell_size,
                cfg.cutoff_sq.sqrt()
            ));
        }
        if cfg.dims.0 == 0 || cfg.dims.1 == 0 {
            return Err("GridConfig.dims must have both extents > 0".to_string());
        }
        let ncells_u64 = (cfg.dims.0 as u64) * (cfg.dims.1 as u64);
        if ncells_u64 > u32::MAX as u64 {
            return Err(format!(
                "GridConfig.dims product {ncells_u64} overflows u32 cell index"
            ));
        }
        let ncells = ncells_u64 as usize;

        let d_cell_head: CudaSlice<u32> = self
            .stream
            .alloc_zeros(ncells)
            .map_err(|e| format!("alloc cell_head failed: {e}"))?;
        let d_cell_next: CudaSlice<u32> = self
            .stream
            .alloc_zeros(self.n)
            .map_err(|e| format!("alloc cell_next failed: {e}"))?;

        self.grid = Some(ResidentGrid {
            cfg,
            d_cell_head,
            d_cell_next,
            ncells,
        });
        Ok(())
    }

    /// Whether [`CudaResident::step_grid`] is currently available.
    #[inline]
    pub fn has_grid(&self) -> bool {
        self.grid.is_some()
    }

    /// Grid-accelerated tick. Identical force law to
    /// [`CudaResident::step`], but pair iteration walks the 3×3 cell
    /// neighbourhood via on-device linked-list bucketing instead of
    /// the full O(n²) scan. Three kernels per tick:
    ///
    /// 1. `grid_clear_heads` resets every cell head to the empty
    ///    sentinel.
    /// 2. `grid_build` populates the linked list via
    ///    `atomicExch`-on-head — O(n) device work, no scan, no
    ///    sort.
    /// 3. `sfm_step_grid` runs the SFM force law, walking only the
    ///    3×3 cell neighbourhood and pruning by `cutoff_sq`.
    ///
    /// Zero host↔device traffic. Returns the total kernel + sync
    /// wall-clock in microseconds.
    ///
    /// Returns `Err("grid not enabled")` if [`CudaResident::enable_grid`]
    /// hasn't been called.
    pub fn step_grid(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
        let n_u32 = self.n as u32;
        let n_walls_u32 = self.n_walls as u32;
        let tau = params.tau as f32;
        let a_ped = params.a_ped as f32;
        let b_ped = params.b_ped as f32;
        let a_wall = params.a_wall as f32;
        let b_wall = params.b_wall as f32;
        let mass = params.mass as f32;
        let max_speed = params.max_speed as f32;
        let max_accel = params.max_accel as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        // Lift grid config out so we can split-borrow.
        let (origin_x, origin_y, inv_cell_size, grid_w, grid_h, cutoff_sq, ncells) = {
            let g = self
                .grid
                .as_ref()
                .ok_or("step_grid: grid not enabled (call enable_grid first)")?;
            let inv = 1.0_f32 / (g.cfg.cell_size as f32);
            (
                g.cfg.origin[0] as f32,
                g.cfg.origin[1] as f32,
                inv,
                g.cfg.dims.0,
                g.cfg.dims.1,
                g.cfg.cutoff_sq as f32,
                g.ncells,
            )
        };

        let block = self.block_size;
        let grid_n = self.n.div_ceil(block as usize) as u32;
        let grid_cells = ncells.div_ceil(block as usize) as u32;
        let cfg_n = LaunchConfig {
            grid_dim: (grid_n.max(1), 1, 1),
            block_dim: (block, 1, 1),
            shared_mem_bytes: 0,
        };
        let cfg_cells = LaunchConfig {
            grid_dim: (grid_cells.max(1), 1, 1),
            block_dim: (block, 1, 1),
            shared_mem_bytes: 0,
        };

        let Self {
            ref stream,
            ref func_clear_heads,
            ref func_build_grid,
            ref func_step_grid,
            ref d_radius,
            ref d_desired_speed,
            ref d_dest_x,
            ref d_dest_y,
            ref d_wall_ax,
            ref d_wall_ay,
            ref d_wall_bx,
            ref d_wall_by,
            ref d_pos_x_a,
            ref d_pos_y_a,
            ref d_vel_x_a,
            ref d_vel_y_a,
            ref mut d_pos_x_b,
            ref mut d_pos_y_b,
            ref mut d_vel_x_b,
            ref mut d_vel_y_b,
            ref mut grid,
            ..
        } = *self;

        let g = grid.as_mut().expect("grid presence checked above");
        let ncells_u32 = g.ncells as u32;

        let t0 = std::time::Instant::now();

        // SAFETY: device buffers are owned by `self` and outlive the
        // launches; the launches are issued on the same stream and
        // are ordered, so the second kernel sees the first's writes
        // and the third sees both. Argument order matches each
        // kernel's signature exactly.
        unsafe {
            // 1) Reset every cell head to UINT_MAX.
            let mut b = stream.launch_builder(func_clear_heads);
            b.arg(&mut g.d_cell_head);
            b.arg(&ncells_u32);
            b.launch(cfg_cells)
                .map_err(|e| format!("grid_clear_heads launch failed: {e}"))?;

            // 2) Build per-cell linked lists from `_a` (current
            //    position) via atomicExch-on-head insertion.
            let mut b = stream.launch_builder(func_build_grid);
            b.arg(d_pos_x_a);
            b.arg(d_pos_y_a);
            b.arg(&mut g.d_cell_head);
            b.arg(&mut g.d_cell_next);
            b.arg(&origin_x);
            b.arg(&origin_y);
            b.arg(&inv_cell_size);
            b.arg(&grid_w);
            b.arg(&grid_h);
            b.arg(&n_u32);
            b.launch(cfg_n)
                .map_err(|e| format!("grid_build launch failed: {e}"))?;

            // 3) Run SFM force law via the 3x3 neighbourhood walk.
            let mut b = stream.launch_builder(func_step_grid);
            b.arg(d_pos_x_a);
            b.arg(d_pos_y_a);
            b.arg(d_vel_x_a);
            b.arg(d_vel_y_a);
            b.arg(d_radius);
            b.arg(d_desired_speed);
            b.arg(d_dest_x);
            b.arg(d_dest_y);
            b.arg(d_pos_x_b);
            b.arg(d_pos_y_b);
            b.arg(d_vel_x_b);
            b.arg(d_vel_y_b);
            b.arg(d_wall_ax);
            b.arg(d_wall_ay);
            b.arg(d_wall_bx);
            b.arg(d_wall_by);
            b.arg(&g.d_cell_head);
            b.arg(&g.d_cell_next);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&origin_x);
            b.arg(&origin_y);
            b.arg(&inv_cell_size);
            b.arg(&grid_w);
            b.arg(&grid_h);
            b.arg(&cutoff_sq);
            b.arg(&tau);
            b.arg(&a_ped);
            b.arg(&b_ped);
            b.arg(&a_wall);
            b.arg(&b_wall);
            b.arg(&mass);
            b.arg(&max_speed);
            b.arg(&max_accel);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg_n)
                .map_err(|e| format!("sfm_step_grid launch failed: {e}"))?;
        }

        stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
        std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
        std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
        std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);

        let _ = &self.ctx;

        Ok(kernel_us)
    }

    /// Bulk download of the current resident state back into `peds`.
    ///
    /// Resizes `peds` to `self.len()`, copies pos/vel/radius/
    /// desired_speed/destination from the device, and reconstructs
    /// the AoS `Pedestrian` struct per row. Radius, desired_speed,
    /// and destination are copied from the resident (invariant)
    /// columns.
    pub fn download(&self, peds: &mut Vec<Pedestrian>) -> Result<(), String> {
        let n = self.n;
        peds.resize(
            n,
            Pedestrian {
                pos: [0.0, 0.0],
                vel: [0.0, 0.0],
                radius: 0.0,
                desired_speed: 0.0,
                destination: [0.0, 0.0],
            },
        );

        let (cur_px, cur_py, cur_vx, cur_vy) = (
            &self.d_pos_x_a,
            &self.d_pos_y_a,
            &self.d_vel_x_a,
            &self.d_vel_y_a,
        );

        let mut pos_x = vec![0.0f32; n];
        let mut pos_y = vec![0.0f32; n];
        let mut vel_x = vec![0.0f32; n];
        let mut vel_y = vec![0.0f32; n];
        let mut radius = vec![0.0f32; n];
        let mut desired_speed = vec![0.0f32; n];
        let mut dest_x = vec![0.0f32; n];
        let mut dest_y = vec![0.0f32; n];
        self.stream
            .memcpy_dtoh(cur_px, &mut pos_x)
            .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
        self.stream
            .memcpy_dtoh(cur_py, &mut pos_y)
            .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
        self.stream
            .memcpy_dtoh(cur_vx, &mut vel_x)
            .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
        self.stream
            .memcpy_dtoh(cur_vy, &mut vel_y)
            .map_err(|e| format!("dtoh vel_y failed: {e}"))?;
        self.stream
            .memcpy_dtoh(&self.d_radius, &mut radius)
            .map_err(|e| format!("dtoh radius failed: {e}"))?;
        self.stream
            .memcpy_dtoh(&self.d_desired_speed, &mut desired_speed)
            .map_err(|e| format!("dtoh desired_speed failed: {e}"))?;
        self.stream
            .memcpy_dtoh(&self.d_dest_x, &mut dest_x)
            .map_err(|e| format!("dtoh dest_x failed: {e}"))?;
        self.stream
            .memcpy_dtoh(&self.d_dest_y, &mut dest_y)
            .map_err(|e| format!("dtoh dest_y failed: {e}"))?;

        for (i, p) in peds.iter_mut().enumerate().take(n) {
            p.pos = [pos_x[i] as f64, pos_y[i] as f64];
            p.vel = [vel_x[i] as f64, vel_y[i] as f64];
            p.radius = radius[i] as f64;
            p.desired_speed = desired_speed[i] as f64;
            p.destination = [dest_x[i] as f64, dest_y[i] as f64];
        }

        Ok(())
    }
}