rustsim-crowd 0.0.1

//! CUDA implementation of [Optimal Steps](crate::optimal_steps).
//!
//! Closes the OSM slot of the P0-1 "CUDA kernels" item in
//! `docs/rustsim-crowd.md` — and with this module shipped, every
//! 2-D crowd model has a CUDA path.
//!
//! Mirrors the CFS/AVM kinematic shape (no acceleration column;
//! the kernel writes `vel_out` and `pos_out` directly), with the
//! OSM-specific *discrete-step utility scoring* in place of the
//! force / cone-search inner loop:
//!
//! 1. Per pedestrian, evaluate the utility of staying put.
//! 2. Sample `num_candidates` deterministic polar offsets on a
//!    circle of radius `stride = stride_at_free_flow * v/v0`,
//!    capped by `intended_speed * dt` so a huge `dt` cannot teleport.
//! 3. Keep only candidates inside the forward cone (dot ≥
//!    `cos(cone_half_angle)` against the desired direction).
//! 4. Score each candidate as `target_weight * |dest - candidate| +
//!    Σ_j ped_strength * exp(-clearance/ped_range) + Σ_walls
//!    wall_strength * exp(-clearance/wall_range)`.
//! 5. Pick the minimum. The kernel back-derives an effective velocity
//!    `(new_pos - pos_in) / dt` so observers / Arrow exporters see a
//!    consistent vel-position pair, matching the CPU contract.
//!
//! Internal `f32` arithmetic; NVRTC at runtime, no CUDA toolkit at
//! build time. `CudaState::step` is the per-tick entry point;
//! [`step_with_fallback`] falls through to the CPU path on any
//! initialisation or launch error.

use cudarc::driver::{
    CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::compile_ptx;
use std::sync::Arc;

use crate::common::{Pedestrian, WallSegment};
use crate::optimal_steps::Params;

/// CUDA source for the OSM step kernel.
///
/// Mirrors `optimal_steps::best_candidate` + `utility` exactly modulo
/// `f32` precision. The candidate-sample loop is identical to the
/// CPU `candidate_offsets`: `theta_k = k * 2π / num_candidates`,
/// `offset = stride * (cos θ, sin θ)`. The "stand-still" candidate
/// (pedestrian's current position) is always evaluated first as a
/// floor on the chosen utility, exactly like the CPU.
const OSM_CUDA_SRC: &str = r#"
extern "C" __device__ float osm_utility(
    float candx, float candy,
    float ri, float dxi, float dyi,
    unsigned int self_idx,
    const float* pos_x_in,
    const float* pos_y_in,
    const float* radius,
    const float* wall_ax,
    const float* wall_ay,
    const float* wall_bx,
    const float* wall_by,
    unsigned int n,
    unsigned int n_walls,
    float target_weight,
    float ped_strength, float ped_range,
    float wall_strength, float wall_range)
{
    float to_dx = dxi - candx;
    float to_dy = dyi - candy;
    float to_target = sqrtf(to_dx * to_dx + to_dy * to_dy);
    float score = target_weight * to_target;

    for (unsigned int j = 0; j < n; ++j) {
        if (j == self_idx) continue;
        float ddx = candx - pos_x_in[j];
        float ddy = candy - pos_y_in[j];
        float d = sqrtf(ddx * ddx + ddy * ddy);
        float clearance = d - (ri + radius[j]);
        if (clearance < 0.0f) clearance = 0.0f;
        score += ped_strength * expf(-clearance / ped_range);
    }

    for (unsigned int k = 0; k < n_walls; ++k) {
        float wax = wall_ax[k];
        float way = wall_ay[k];
        float wbx = wall_bx[k];
        float wby = wall_by[k];
        float abx = wbx - wax;
        float aby = wby - way;
        float denom = abx * abx + aby * aby;
        float t = 0.0f;
        if (denom > 1.0e-18f) {
            t = ((candx - wax) * abx + (candy - way) * aby) / denom;
            if (t < 0.0f) t = 0.0f;
            if (t > 1.0f) t = 1.0f;
        }
        float cpx = wax + t * abx;
        float cpy = way + t * aby;
        float ddx = candx - cpx;
        float ddy = candy - cpy;
        float d = sqrtf(ddx * ddx + ddy * ddy);
        float clearance = d - ri;
        if (clearance < 0.0f) clearance = 0.0f;
        score += wall_strength * expf(-clearance / wall_range);
    }

    return score;
}

extern "C" __global__ void osm_step(
    const float* __restrict__ pos_x_in,
    const float* __restrict__ pos_y_in,
    const float* __restrict__ vel_x_in,
    const float* __restrict__ vel_y_in,
    const float* __restrict__ radius,
    const float* __restrict__ desired_speed,
    const float* __restrict__ dest_x,
    const float* __restrict__ dest_y,
    float* __restrict__ pos_x_out,
    float* __restrict__ pos_y_out,
    float* __restrict__ vel_x_out,
    float* __restrict__ vel_y_out,
    const float* __restrict__ wall_ax,
    const float* __restrict__ wall_ay,
    const float* __restrict__ wall_bx,
    const float* __restrict__ wall_by,
    unsigned int n,
    unsigned int n_walls,
    unsigned int num_candidates,
    float stride_at_free_flow,
    float cone_half_angle,
    float target_weight,
    float ped_strength, float ped_range,
    float wall_strength, float wall_range,
    float arrival_radius,
    float dt)
{
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;

    float pxi = pos_x_in[i];
    float pyi = pos_y_in[i];
    float vxi = vel_x_in[i];
    float vyi = vel_y_in[i];
    float ri  = radius[i];
    float v0  = desired_speed[i];
    float dxi = dest_x[i];
    float dyi = dest_y[i];

    // Desired direction toward destination (zero if at destination).
    float to_dx = dxi - pxi;
    float to_dy = dyi - pyi;
    float d_to_dest = sqrtf(to_dx * to_dx + to_dy * to_dy);
    float edx = 0.0f;
    float edy = 0.0f;
    if (d_to_dest > 1.0e-12f) {
        edx = to_dx / d_to_dest;
        edy = to_dy / d_to_dest;
    }

    // Arrival-tapered desired speed (CPU `effective_desired_speed`).
    float tapered = v0;
    if (arrival_radius > 0.0f && d_to_dest < arrival_radius) {
        tapered = v0 * (d_to_dest / arrival_radius);
    }

    // Stride length (CPU `stride_length`).
    float speed_now = sqrtf(vxi * vxi + vyi * vyi);
    float intended = tapered > speed_now ? tapered : speed_now;
    float stride_free = stride_at_free_flow > 1.0e-3f ? stride_at_free_flow : 1.0e-3f;
    float v0_safe = v0 > 1.0e-6f ? v0 : 1.0e-6f;
    float stride = stride_free * (intended / v0_safe);
    float cap = intended * dt;
    if (stride > cap) stride = cap;

    // Stand-still floor on the utility. Always evaluated, even at
    // destination — matches the CPU exactly.
    float best_x = pxi;
    float best_y = pyi;
    float best_score = osm_utility(
        pxi, pyi, ri, dxi, dyi, i,
        pos_x_in, pos_y_in, radius,
        wall_ax, wall_ay, wall_bx, wall_by,
        n, n_walls,
        target_weight, ped_strength, ped_range,
        wall_strength, wall_range);

    if (num_candidates > 0 && stride > 0.0f) {
        float cos_cutoff = cosf(cone_half_angle);
        bool dest_present = (edx != 0.0f || edy != 0.0f);
        float two_pi = 6.28318530717958647692f;
        for (unsigned int k = 0; k < num_candidates; ++k) {
            float theta = (float)k * two_pi / (float)num_candidates;
            float ox = stride * cosf(theta);
            float oy = stride * sinf(theta);

            // Forward-cone gate: normalize the offset and dot with e_dest.
            // The offset has magnitude `stride > 0`, so normalising is
            // exactly `(cos θ, sin θ)`.
            if (dest_present) {
                float dot = cosf(theta) * edx + sinf(theta) * edy;
                if (dot < cos_cutoff) continue;
            }

            float cx = pxi + ox;
            float cy = pyi + oy;
            float score = osm_utility(
                cx, cy, ri, dxi, dyi, i,
                pos_x_in, pos_y_in, radius,
                wall_ax, wall_ay, wall_bx, wall_by,
                n, n_walls,
                target_weight, ped_strength, ped_range,
                wall_strength, wall_range);
            if (score < best_score) {
                best_score = score;
                best_x = cx;
                best_y = cy;
            }
        }
    }

    // Back-derive effective velocity: (new_pos - pos_in) / dt.
    float vx_new = 0.0f;
    float vy_new = 0.0f;
    if (dt > 0.0f) {
        vx_new = (best_x - pxi) / dt;
        vy_new = (best_y - pyi) / dt;
    }

    pos_x_out[i] = best_x;
    pos_y_out[i] = best_y;
    vel_x_out[i] = vx_new;
    vel_y_out[i] = vy_new;
}
"#;

/// Persistent CUDA state for the OSM kernel.
pub struct CudaState {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,
}

impl CudaState {
    /// Initialise CUDA on device 0, compile the OSM kernel via NVRTC,
    /// and load it.
    pub fn new() -> Result<Self, String> {
        Self::with_block_size(256)
    }

    /// Like [`CudaState::new`] but with a configurable CUDA block size.
    pub fn with_block_size(block_size: u32) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let ctx = super::new_context(0)?;
        let stream = ctx.default_stream();
        let ptx = compile_ptx(OSM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
        let module = ctx
            .load_module(ptx)
            .map_err(|e| format!("module load failed: {e}"))?;
        let func = module
            .load_function("osm_step")
            .map_err(|e| format!("kernel lookup failed: {e}"))?;
        Ok(Self {
            ctx,
            stream,
            func,
            block_size,
        })
    }

    /// Advance `peds` by one tick of `dt` seconds on the GPU.
    ///
    /// Stateless w.r.t. device memory. Returns the kernel execution
    /// time in microseconds.
    pub fn step(
        &self,
        peds: &mut [Pedestrian],
        walls: &[WallSegment],
        params: &Params,
        dt: f64,
    ) -> Result<u128, String> {
        let n = peds.len();
        if n == 0 {
            return Ok(0);
        }

        let stream = &self.stream;
        let mut pos_x: Vec<f32> = Vec::with_capacity(n);
        let mut pos_y: Vec<f32> = Vec::with_capacity(n);
        let mut vel_x: Vec<f32> = Vec::with_capacity(n);
        let mut vel_y: Vec<f32> = Vec::with_capacity(n);
        let mut radius: Vec<f32> = Vec::with_capacity(n);
        let mut desired_speed: Vec<f32> = Vec::with_capacity(n);
        let mut dest_x: Vec<f32> = Vec::with_capacity(n);
        let mut dest_y: Vec<f32> = Vec::with_capacity(n);
        for p in peds.iter() {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        let n_walls = walls.len();
        let mut wall_ax: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_ay: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_bx: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        let mut wall_by: Vec<f32> = Vec::with_capacity(n_walls.max(1));
        if n_walls == 0 {
            wall_ax.push(0.0);
            wall_ay.push(0.0);
            wall_bx.push(0.0);
            wall_by.push(0.0);
        } else {
            for w in walls {
                wall_ax.push(w.a[0] as f32);
                wall_ay.push(w.a[1] as f32);
                wall_bx.push(w.b[0] as f32);
                wall_by.push(w.b[1] as f32);
            }
        }

        let d_pos_x: CudaSlice<f32> = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y: CudaSlice<f32> = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x: CudaSlice<f32> = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y: CudaSlice<f32> = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_radius: CudaSlice<f32> = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed: CudaSlice<f32> = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x: CudaSlice<f32> = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y: CudaSlice<f32> = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let mut d_pos_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_out failed: {e}"))?;
        let mut d_pos_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_out failed: {e}"))?;
        let mut d_vel_x_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_out failed: {e}"))?;
        let mut d_vel_y_out: CudaSlice<f32> = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_out failed: {e}"))?;
        let d_wall_ax: CudaSlice<f32> = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay: CudaSlice<f32> = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx: CudaSlice<f32> = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by: CudaSlice<f32> = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        let n_u32 = n as u32;
        let n_walls_u32 = n_walls as u32;
        let num_candidates = params.num_candidates as u32;
        let stride_at_free_flow = params.stride_at_free_flow as f32;
        let cone_half_angle = params.cone_half_angle as f32;
        let target_weight = params.target_weight as f32;
        let ped_strength = params.ped_strength as f32;
        let ped_range = params.ped_range as f32;
        let wall_strength = params.wall_strength as f32;
        let wall_range = params.wall_range as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        let t0 = std::time::Instant::now();

        // SAFETY: see `cuda::collision_free_speed` for the contract.
        unsafe {
            let mut b = stream.launch_builder(&self.func);
            b.arg(&d_pos_x);
            b.arg(&d_pos_y);
            b.arg(&d_vel_x);
            b.arg(&d_vel_y);
            b.arg(&d_radius);
            b.arg(&d_desired_speed);
            b.arg(&d_dest_x);
            b.arg(&d_dest_y);
            b.arg(&mut d_pos_x_out);
            b.arg(&mut d_pos_y_out);
            b.arg(&mut d_vel_x_out);
            b.arg(&mut d_vel_y_out);
            b.arg(&d_wall_ax);
            b.arg(&d_wall_ay);
            b.arg(&d_wall_bx);
            b.arg(&d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&num_candidates);
            b.arg(&stride_at_free_flow);
            b.arg(&cone_half_angle);
            b.arg(&target_weight);
            b.arg(&ped_strength);
            b.arg(&ped_range);
            b.arg(&wall_strength);
            b.arg(&wall_range);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }

        stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        stream
            .memcpy_dtoh(&d_pos_x_out, &mut pos_x)
            .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_pos_y_out, &mut pos_y)
            .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_x_out, &mut vel_x)
            .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
        stream
            .memcpy_dtoh(&d_vel_y_out, &mut vel_y)
            .map_err(|e| format!("dtoh vel_y failed: {e}"))?;

        for (i, p) in peds.iter_mut().enumerate() {
            p.pos = [pos_x[i] as f64, pos_y[i] as f64];
            p.vel = [vel_x[i] as f64, vel_y[i] as f64];
        }

        let _ = &self.ctx;
        Ok(kernel_us)
    }
}

/// Device-resident OSM state for production tick loops.
///
/// Uploads static columns once, keeps current/next position-velocity buffers
/// on the device across ticks, and downloads only when a host snapshot is
/// requested.
pub struct CudaResident {
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    func: CudaFunction,
    block_size: u32,
    n: usize,
    n_walls: usize,
    d_pos_x_a: CudaSlice<f32>,
    d_pos_y_a: CudaSlice<f32>,
    d_vel_x_a: CudaSlice<f32>,
    d_vel_y_a: CudaSlice<f32>,
    d_pos_x_b: CudaSlice<f32>,
    d_pos_y_b: CudaSlice<f32>,
    d_vel_x_b: CudaSlice<f32>,
    d_vel_y_b: CudaSlice<f32>,
    d_radius: CudaSlice<f32>,
    d_desired_speed: CudaSlice<f32>,
    d_dest_x: CudaSlice<f32>,
    d_dest_y: CudaSlice<f32>,
    d_wall_ax: CudaSlice<f32>,
    d_wall_ay: CudaSlice<f32>,
    d_wall_bx: CudaSlice<f32>,
    d_wall_by: CudaSlice<f32>,
}

impl CudaResident {
    /// Upload an initial pedestrian and wall configuration to the device.
    pub fn upload(peds: &[Pedestrian], walls: &[WallSegment]) -> Result<Self, String> {
        Self::upload_with_block_size(peds, walls, 256)
    }

    /// Like [`CudaResident::upload`] but with a configurable CUDA block size.
    pub fn upload_with_block_size(
        peds: &[Pedestrian],
        walls: &[WallSegment],
        block_size: u32,
    ) -> Result<Self, String> {
        if block_size == 0 {
            return Err("block_size must be positive".to_string());
        }
        let n = peds.len();
        if n == 0 {
            return Err("CudaResident::upload requires at least one pedestrian".to_string());
        }

        let state = CudaState::with_block_size(block_size)?;
        let stream = state.stream.clone();

        let mut pos_x = Vec::with_capacity(n);
        let mut pos_y = Vec::with_capacity(n);
        let mut vel_x = Vec::with_capacity(n);
        let mut vel_y = Vec::with_capacity(n);
        let mut radius = Vec::with_capacity(n);
        let mut desired_speed = Vec::with_capacity(n);
        let mut dest_x = Vec::with_capacity(n);
        let mut dest_y = Vec::with_capacity(n);
        for p in peds {
            pos_x.push(p.pos[0] as f32);
            pos_y.push(p.pos[1] as f32);
            vel_x.push(p.vel[0] as f32);
            vel_y.push(p.vel[1] as f32);
            radius.push(p.radius as f32);
            desired_speed.push(p.desired_speed as f32);
            dest_x.push(p.destination[0] as f32);
            dest_y.push(p.destination[1] as f32);
        }

        let n_walls = walls.len();
        let (wall_ax, wall_ay, wall_bx, wall_by) = wall_columns(walls);

        let d_pos_x_a = stream
            .clone_htod(&pos_x)
            .map_err(|e| format!("htod pos_x failed: {e}"))?;
        let d_pos_y_a = stream
            .clone_htod(&pos_y)
            .map_err(|e| format!("htod pos_y failed: {e}"))?;
        let d_vel_x_a = stream
            .clone_htod(&vel_x)
            .map_err(|e| format!("htod vel_x failed: {e}"))?;
        let d_vel_y_a = stream
            .clone_htod(&vel_y)
            .map_err(|e| format!("htod vel_y failed: {e}"))?;
        let d_pos_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_x_b failed: {e}"))?;
        let d_pos_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc pos_y_b failed: {e}"))?;
        let d_vel_x_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_x_b failed: {e}"))?;
        let d_vel_y_b = stream
            .alloc_zeros(n)
            .map_err(|e| format!("alloc vel_y_b failed: {e}"))?;
        let d_radius = stream
            .clone_htod(&radius)
            .map_err(|e| format!("htod radius failed: {e}"))?;
        let d_desired_speed = stream
            .clone_htod(&desired_speed)
            .map_err(|e| format!("htod desired_speed failed: {e}"))?;
        let d_dest_x = stream
            .clone_htod(&dest_x)
            .map_err(|e| format!("htod dest_x failed: {e}"))?;
        let d_dest_y = stream
            .clone_htod(&dest_y)
            .map_err(|e| format!("htod dest_y failed: {e}"))?;
        let d_wall_ax = stream
            .clone_htod(&wall_ax)
            .map_err(|e| format!("htod wall_ax failed: {e}"))?;
        let d_wall_ay = stream
            .clone_htod(&wall_ay)
            .map_err(|e| format!("htod wall_ay failed: {e}"))?;
        let d_wall_bx = stream
            .clone_htod(&wall_bx)
            .map_err(|e| format!("htod wall_bx failed: {e}"))?;
        let d_wall_by = stream
            .clone_htod(&wall_by)
            .map_err(|e| format!("htod wall_by failed: {e}"))?;

        stream
            .synchronize()
            .map_err(|e| format!("initial sync failed: {e}"))?;

        Ok(Self {
            ctx: state.ctx,
            stream,
            func: state.func,
            block_size,
            n,
            n_walls,
            d_pos_x_a,
            d_pos_y_a,
            d_vel_x_a,
            d_vel_y_a,
            d_pos_x_b,
            d_pos_y_b,
            d_vel_x_b,
            d_vel_y_b,
            d_radius,
            d_desired_speed,
            d_dest_x,
            d_dest_y,
            d_wall_ax,
            d_wall_ay,
            d_wall_bx,
            d_wall_by,
        })
    }

    /// Number of resident pedestrians.
    #[inline]
    pub fn len(&self) -> usize {
        self.n
    }

    /// Whether any pedestrians are resident.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.n == 0
    }

    /// Advance one OSM tick fully on the GPU.
    pub fn step(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
        let n_u32 = self.n as u32;
        let n_walls_u32 = self.n_walls as u32;
        let num_candidates = params.num_candidates as u32;
        let stride_at_free_flow = params.stride_at_free_flow as f32;
        let cone_half_angle = params.cone_half_angle as f32;
        let target_weight = params.target_weight as f32;
        let ped_strength = params.ped_strength as f32;
        let ped_range = params.ped_range as f32;
        let wall_strength = params.wall_strength as f32;
        let wall_range = params.wall_range as f32;
        let arrival_radius = params.arrival_radius as f32;
        let dt_f32 = dt as f32;

        let grid = self.n.div_ceil(self.block_size as usize) as u32;
        let cfg = LaunchConfig {
            grid_dim: (grid.max(1), 1, 1),
            block_dim: (self.block_size, 1, 1),
            shared_mem_bytes: 0,
        };

        let t0 = std::time::Instant::now();
        unsafe {
            let mut b = self.stream.launch_builder(&self.func);
            b.arg(&self.d_pos_x_a);
            b.arg(&self.d_pos_y_a);
            b.arg(&self.d_vel_x_a);
            b.arg(&self.d_vel_y_a);
            b.arg(&self.d_radius);
            b.arg(&self.d_desired_speed);
            b.arg(&self.d_dest_x);
            b.arg(&self.d_dest_y);
            b.arg(&mut self.d_pos_x_b);
            b.arg(&mut self.d_pos_y_b);
            b.arg(&mut self.d_vel_x_b);
            b.arg(&mut self.d_vel_y_b);
            b.arg(&self.d_wall_ax);
            b.arg(&self.d_wall_ay);
            b.arg(&self.d_wall_bx);
            b.arg(&self.d_wall_by);
            b.arg(&n_u32);
            b.arg(&n_walls_u32);
            b.arg(&num_candidates);
            b.arg(&stride_at_free_flow);
            b.arg(&cone_half_angle);
            b.arg(&target_weight);
            b.arg(&ped_strength);
            b.arg(&ped_range);
            b.arg(&wall_strength);
            b.arg(&wall_range);
            b.arg(&arrival_radius);
            b.arg(&dt_f32);
            b.launch(cfg)
                .map_err(|e| format!("kernel launch failed: {e}"))?;
        }
        self.stream
            .synchronize()
            .map_err(|e| format!("stream sync failed: {e}"))?;
        let kernel_us = t0.elapsed().as_micros();

        std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
        std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
        std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
        std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);

        let _ = &self.ctx;
        Ok(kernel_us)
    }

    /// Bulk download of the current resident state back into `peds`.
    pub fn download(&self, peds: &mut Vec<Pedestrian>) -> Result<(), String> {
        download_resident(
            &self.stream,
            self.n,
            &self.d_pos_x_a,
            &self.d_pos_y_a,
            &self.d_vel_x_a,
            &self.d_vel_y_a,
            &self.d_radius,
            &self.d_desired_speed,
            &self.d_dest_x,
            &self.d_dest_y,
            peds,
        )
    }
}

fn wall_columns(walls: &[WallSegment]) -> (Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
    if walls.is_empty() {
        return (vec![0.0], vec![0.0], vec![0.0], vec![0.0]);
    }
    let mut ax = Vec::with_capacity(walls.len());
    let mut ay = Vec::with_capacity(walls.len());
    let mut bx = Vec::with_capacity(walls.len());
    let mut by = Vec::with_capacity(walls.len());
    for w in walls {
        ax.push(w.a[0] as f32);
        ay.push(w.a[1] as f32);
        bx.push(w.b[0] as f32);
        by.push(w.b[1] as f32);
    }
    (ax, ay, bx, by)
}

#[allow(clippy::too_many_arguments)]
fn download_resident(
    stream: &Arc<CudaStream>,
    n: usize,
    d_pos_x: &CudaSlice<f32>,
    d_pos_y: &CudaSlice<f32>,
    d_vel_x: &CudaSlice<f32>,
    d_vel_y: &CudaSlice<f32>,
    d_radius: &CudaSlice<f32>,
    d_desired_speed: &CudaSlice<f32>,
    d_dest_x: &CudaSlice<f32>,
    d_dest_y: &CudaSlice<f32>,
    peds: &mut Vec<Pedestrian>,
) -> Result<(), String> {
    peds.resize(
        n,
        Pedestrian {
            pos: [0.0, 0.0],
            vel: [0.0, 0.0],
            radius: 0.0,
            desired_speed: 0.0,
            destination: [0.0, 0.0],
        },
    );

    let mut pos_x = vec![0.0f32; n];
    let mut pos_y = vec![0.0f32; n];
    let mut vel_x = vec![0.0f32; n];
    let mut vel_y = vec![0.0f32; n];
    let mut radius = vec![0.0f32; n];
    let mut desired_speed = vec![0.0f32; n];
    let mut dest_x = vec![0.0f32; n];
    let mut dest_y = vec![0.0f32; n];

    stream
        .memcpy_dtoh(d_pos_x, &mut pos_x)
        .map_err(|e| format!("dtoh pos_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_pos_y, &mut pos_y)
        .map_err(|e| format!("dtoh pos_y failed: {e}"))?;
    stream
        .memcpy_dtoh(d_vel_x, &mut vel_x)
        .map_err(|e| format!("dtoh vel_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_vel_y, &mut vel_y)
        .map_err(|e| format!("dtoh vel_y failed: {e}"))?;
    stream
        .memcpy_dtoh(d_radius, &mut radius)
        .map_err(|e| format!("dtoh radius failed: {e}"))?;
    stream
        .memcpy_dtoh(d_desired_speed, &mut desired_speed)
        .map_err(|e| format!("dtoh desired_speed failed: {e}"))?;
    stream
        .memcpy_dtoh(d_dest_x, &mut dest_x)
        .map_err(|e| format!("dtoh dest_x failed: {e}"))?;
    stream
        .memcpy_dtoh(d_dest_y, &mut dest_y)
        .map_err(|e| format!("dtoh dest_y failed: {e}"))?;

    for (i, p) in peds.iter_mut().enumerate().take(n) {
        p.pos = [pos_x[i] as f64, pos_y[i] as f64];
        p.vel = [vel_x[i] as f64, vel_y[i] as f64];
        p.radius = radius[i] as f64;
        p.desired_speed = desired_speed[i] as f64;
        p.destination = [dest_x[i] as f64, dest_y[i] as f64];
    }
    Ok(())
}

/// One-shot wrapper: initialise CUDA, run one OSM step, tear down.
pub fn step(
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> Result<u128, String> {
    let state = CudaState::new()?;
    state.step(peds, walls, params, dt)
}

/// CPU-fallback wrapper: try CUDA, fall back to
/// [`crate::optimal_steps::step`] on any error.
pub fn step_with_fallback(
    state: &mut Option<CudaState>,
    peds: &mut [Pedestrian],
    walls: &[WallSegment],
    params: &Params,
    dt: f64,
) -> bool {
    if state.is_none() {
        match CudaState::new() {
            Ok(s) => *state = Some(s),
            Err(e) => {
                eprintln!("rustsim-crowd CUDA init failed ({e}), using CPU OSM path");
                #[allow(deprecated)]
                crate::optimal_steps::step(peds, walls, params, dt);
                return false;
            }
        }
    }
    match state.as_ref().unwrap().step(peds, walls, params, dt) {
        Ok(_) => true,
        Err(e) => {
            eprintln!("rustsim-crowd CUDA OSM step failed ({e}), falling back to CPU");
            #[allow(deprecated)]
            crate::optimal_steps::step(peds, walls, params, dt);
            false
        }
    }
}