use cudarc::driver::{
CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::compile_ptx;
use std::sync::Arc;
use crate::common::{Pedestrian, WallSegment};
use crate::social_force::Params;
const SFM_CUDA_SRC: &str = r#"
extern "C" __global__ void sfm_step(
const float* __restrict__ pos_x_in,
const float* __restrict__ pos_y_in,
const float* __restrict__ vel_x_in,
const float* __restrict__ vel_y_in,
const float* __restrict__ radius,
const float* __restrict__ desired_speed,
const float* __restrict__ dest_x,
const float* __restrict__ dest_y,
float* __restrict__ pos_x_out,
float* __restrict__ pos_y_out,
float* __restrict__ vel_x_out,
float* __restrict__ vel_y_out,
const float* __restrict__ wall_ax,
const float* __restrict__ wall_ay,
const float* __restrict__ wall_bx,
const float* __restrict__ wall_by,
unsigned int n,
unsigned int n_walls,
float tau,
float a_ped,
float b_ped,
float a_wall,
float b_wall,
float mass,
float max_speed,
float max_accel,
float arrival_radius,
float dt)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
float pxi = pos_x_in[i];
float pyi = pos_y_in[i];
float vxi = vel_x_in[i];
float vyi = vel_y_in[i];
float ri = radius[i];
float v0 = desired_speed[i];
float dxi = dest_x[i];
float dyi = dest_y[i];
// Driving force with arrival taper.
float to_dest_x = dxi - pxi;
float to_dest_y = dyi - pyi;
float d = sqrtf(to_dest_x * to_dest_x + to_dest_y * to_dest_y);
float eff_v0 = v0;
if (arrival_radius > 0.0f && d < arrival_radius) {
eff_v0 = v0 * (d / arrival_radius);
}
float dir_x = 0.0f;
float dir_y = 0.0f;
if (d > 1.0e-12f) {
dir_x = to_dest_x / d;
dir_y = to_dest_y / d;
}
float fx = mass * (eff_v0 * dir_x - vxi) / tau;
float fy = mass * (eff_v0 * dir_y - vyi) / tau;
// Pedestrian repulsion — O(n²) pair loop.
for (unsigned int j = 0; j < n; ++j) {
if (j == i) continue;
float dx = pxi - pos_x_in[j];
float dy = pyi - pos_y_in[j];
float dist = sqrtf(dx * dx + dy * dy);
if (dist < 1.0e-12f) continue;
float r_sum = ri + radius[j];
float mag = a_ped * expf((r_sum - dist) / b_ped);
float inv_dist = 1.0f / dist;
fx += mag * dx * inv_dist;
fy += mag * dy * inv_dist;
}
// Wall repulsion.
for (unsigned int k = 0; k < n_walls; ++k) {
float ax = wall_ax[k];
float ay = wall_ay[k];
float bx = wall_bx[k];
float by = wall_by[k];
float abx = bx - ax;
float aby = by - ay;
float denom = abx * abx + aby * aby;
float t = 0.0f;
if (denom > 1.0e-18f) {
t = ((pxi - ax) * abx + (pyi - ay) * aby) / denom;
if (t < 0.0f) t = 0.0f;
if (t > 1.0f) t = 1.0f;
}
float cpx = ax + t * abx;
float cpy = ay + t * aby;
float dx = pxi - cpx;
float dy = pyi - cpy;
float dist = sqrtf(dx * dx + dy * dy);
if (dist < 1.0e-12f) continue;
float mag = a_wall * expf((ri - dist) / b_wall);
float inv_dist = 1.0f / dist;
fx += mag * dx * inv_dist;
fy += mag * dy * inv_dist;
}
// a = F / m, clamp to max_accel for explicit-Euler stability.
float ax_acc = fx / mass;
float ay_acc = fy / mass;
float a_mag = sqrtf(ax_acc * ax_acc + ay_acc * ay_acc);
if (a_mag > max_accel && a_mag > 1.0e-12f) {
float s = max_accel / a_mag;
ax_acc *= s;
ay_acc *= s;
}
// Integrate.
float vx_new = vxi + ax_acc * dt;
float vy_new = vyi + ay_acc * dt;
float v_mag = sqrtf(vx_new * vx_new + vy_new * vy_new);
if (v_mag > max_speed && v_mag > 1.0e-12f) {
float s = max_speed / v_mag;
vx_new *= s;
vy_new *= s;
}
float px_new = pxi + vx_new * dt;
float py_new = pyi + vy_new * dt;
pos_x_out[i] = px_new;
pos_y_out[i] = py_new;
vel_x_out[i] = vx_new;
vel_y_out[i] = vy_new;
}
"#;
const SFM_CUDA_GRID_SRC: &str = r#"
extern "C" __global__ void grid_clear_heads(
unsigned int* __restrict__ cell_head,
unsigned int ncells)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= ncells) return;
cell_head[i] = 0xFFFFFFFFu;
}
__device__ __forceinline__ unsigned int cell_index_for(
float x, float y,
float origin_x, float origin_y, float inv_cell_size,
unsigned int grid_w, unsigned int grid_h)
{
int cx = (int)floorf((x - origin_x) * inv_cell_size);
int cy = (int)floorf((y - origin_y) * inv_cell_size);
if (cx < 0) cx = 0;
if (cy < 0) cy = 0;
if ((unsigned int)cx >= grid_w) cx = (int)grid_w - 1;
if ((unsigned int)cy >= grid_h) cy = (int)grid_h - 1;
return (unsigned int)cy * grid_w + (unsigned int)cx;
}
extern "C" __global__ void grid_build(
const float* __restrict__ pos_x_in,
const float* __restrict__ pos_y_in,
unsigned int* __restrict__ cell_head,
unsigned int* __restrict__ cell_next,
float origin_x, float origin_y, float inv_cell_size,
unsigned int grid_w, unsigned int grid_h,
unsigned int n)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
unsigned int cell = cell_index_for(
pos_x_in[i], pos_y_in[i],
origin_x, origin_y, inv_cell_size,
grid_w, grid_h);
// Atomic-exchange head insertion: classic uniform-grid trick,
// no scan, no sort, O(1) per agent.
cell_next[i] = atomicExch(&cell_head[cell], i);
}
extern "C" __global__ void sfm_step_grid(
const float* __restrict__ pos_x_in,
const float* __restrict__ pos_y_in,
const float* __restrict__ vel_x_in,
const float* __restrict__ vel_y_in,
const float* __restrict__ radius,
const float* __restrict__ desired_speed,
const float* __restrict__ dest_x,
const float* __restrict__ dest_y,
float* __restrict__ pos_x_out,
float* __restrict__ pos_y_out,
float* __restrict__ vel_x_out,
float* __restrict__ vel_y_out,
const float* __restrict__ wall_ax,
const float* __restrict__ wall_ay,
const float* __restrict__ wall_bx,
const float* __restrict__ wall_by,
const unsigned int* __restrict__ cell_head,
const unsigned int* __restrict__ cell_next,
unsigned int n,
unsigned int n_walls,
float origin_x, float origin_y, float inv_cell_size,
unsigned int grid_w, unsigned int grid_h,
float cutoff_sq,
float tau,
float a_ped,
float b_ped,
float a_wall,
float b_wall,
float mass,
float max_speed,
float max_accel,
float arrival_radius,
float dt)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
float pxi = pos_x_in[i];
float pyi = pos_y_in[i];
float vxi = vel_x_in[i];
float vyi = vel_y_in[i];
float ri = radius[i];
float v0 = desired_speed[i];
float dxi = dest_x[i];
float dyi = dest_y[i];
float to_dest_x = dxi - pxi;
float to_dest_y = dyi - pyi;
float d = sqrtf(to_dest_x * to_dest_x + to_dest_y * to_dest_y);
float eff_v0 = v0;
if (arrival_radius > 0.0f && d < arrival_radius) {
eff_v0 = v0 * (d / arrival_radius);
}
float dir_x = 0.0f;
float dir_y = 0.0f;
if (d > 1.0e-12f) {
dir_x = to_dest_x / d;
dir_y = to_dest_y / d;
}
float fx = mass * (eff_v0 * dir_x - vxi) / tau;
float fy = mass * (eff_v0 * dir_y - vyi) / tau;
// Walk the 3x3 cell neighbourhood.
int cx = (int)floorf((pxi - origin_x) * inv_cell_size);
int cy = (int)floorf((pyi - origin_y) * inv_cell_size);
int gw = (int)grid_w;
int gh = (int)grid_h;
int x0 = cx - 1; if (x0 < 0) x0 = 0;
int x1 = cx + 1; if (x1 >= gw) x1 = gw - 1;
int y0 = cy - 1; if (y0 < 0) y0 = 0;
int y1 = cy + 1; if (y1 >= gh) y1 = gh - 1;
for (int ny = y0; ny <= y1; ++ny) {
for (int nx = x0; nx <= x1; ++nx) {
unsigned int cell = (unsigned int)ny * grid_w + (unsigned int)nx;
unsigned int j = cell_head[cell];
while (j != 0xFFFFFFFFu) {
if (j != i) {
float ddx = pxi - pos_x_in[j];
float ddy = pyi - pos_y_in[j];
float d2 = ddx * ddx + ddy * ddy;
if (d2 < cutoff_sq && d2 > 1.0e-24f) {
float dist = sqrtf(d2);
float r_sum = ri + radius[j];
float mag = a_ped * expf((r_sum - dist) / b_ped);
float inv_dist = 1.0f / dist;
fx += mag * ddx * inv_dist;
fy += mag * ddy * inv_dist;
}
}
j = cell_next[j];
}
}
}
for (unsigned int k = 0; k < n_walls; ++k) {
float ax = wall_ax[k];
float ay = wall_ay[k];
float bx = wall_bx[k];
float by = wall_by[k];
float abx = bx - ax;
float aby = by - ay;
float denom = abx * abx + aby * aby;
float t = 0.0f;
if (denom > 1.0e-18f) {
t = ((pxi - ax) * abx + (pyi - ay) * aby) / denom;
if (t < 0.0f) t = 0.0f;
if (t > 1.0f) t = 1.0f;
}
float cpx = ax + t * abx;
float cpy = ay + t * aby;
float ddx = pxi - cpx;
float ddy = pyi - cpy;
float dist = sqrtf(ddx * ddx + ddy * ddy);
if (dist < 1.0e-12f) continue;
float mag = a_wall * expf((ri - dist) / b_wall);
float inv_dist = 1.0f / dist;
fx += mag * ddx * inv_dist;
fy += mag * ddy * inv_dist;
}
float ax_acc = fx / mass;
float ay_acc = fy / mass;
float a_mag = sqrtf(ax_acc * ax_acc + ay_acc * ay_acc);
if (a_mag > max_accel && a_mag > 1.0e-12f) {
float s = max_accel / a_mag;
ax_acc *= s;
ay_acc *= s;
}
float vx_new = vxi + ax_acc * dt;
float vy_new = vyi + ay_acc * dt;
float v_mag = sqrtf(vx_new * vx_new + vy_new * vy_new);
if (v_mag > max_speed && v_mag > 1.0e-12f) {
float s = max_speed / v_mag;
vx_new *= s;
vy_new *= s;
}
float px_new = pxi + vx_new * dt;
float py_new = pyi + vy_new * dt;
pos_x_out[i] = px_new;
pos_y_out[i] = py_new;
vel_x_out[i] = vx_new;
vel_y_out[i] = vy_new;
}
"#;
pub struct CudaState {
ctx: Arc<CudaContext>,
stream: Arc<CudaStream>,
func: CudaFunction,
block_size: u32,
}
impl CudaState {
pub fn new() -> Result<Self, String> {
Self::with_block_size(256)
}
pub fn with_block_size(block_size: u32) -> Result<Self, String> {
if block_size == 0 {
return Err("block_size must be positive".to_string());
}
let ctx = super::new_context(0)?;
let stream = ctx.default_stream();
let ptx = compile_ptx(SFM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
let module = ctx
.load_module(ptx)
.map_err(|e| format!("module load failed: {e}"))?;
let func = module
.load_function("sfm_step")
.map_err(|e| format!("kernel lookup failed: {e}"))?;
Ok(Self {
ctx,
stream,
func,
block_size,
})
}
pub fn step(
&self,
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> Result<u128, String> {
let n = peds.len();
if n == 0 {
return Ok(0);
}
let stream = &self.stream;
let mut pos_x: Vec<f32> = Vec::with_capacity(n);
let mut pos_y: Vec<f32> = Vec::with_capacity(n);
let mut vel_x: Vec<f32> = Vec::with_capacity(n);
let mut vel_y: Vec<f32> = Vec::with_capacity(n);
let mut radius: Vec<f32> = Vec::with_capacity(n);
let mut desired_speed: Vec<f32> = Vec::with_capacity(n);
let mut dest_x: Vec<f32> = Vec::with_capacity(n);
let mut dest_y: Vec<f32> = Vec::with_capacity(n);
for p in peds.iter() {
pos_x.push(p.pos[0] as f32);
pos_y.push(p.pos[1] as f32);
vel_x.push(p.vel[0] as f32);
vel_y.push(p.vel[1] as f32);
radius.push(p.radius as f32);
desired_speed.push(p.desired_speed as f32);
dest_x.push(p.destination[0] as f32);
dest_y.push(p.destination[1] as f32);
}
let n_walls = walls.len();
let mut wall_ax: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_ay: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_bx: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_by: Vec<f32> = Vec::with_capacity(n_walls.max(1));
if n_walls == 0 {
wall_ax.push(0.0);
wall_ay.push(0.0);
wall_bx.push(0.0);
wall_by.push(0.0);
} else {
for w in walls {
wall_ax.push(w.a[0] as f32);
wall_ay.push(w.a[1] as f32);
wall_bx.push(w.b[0] as f32);
wall_by.push(w.b[1] as f32);
}
}
let d_pos_x: CudaSlice<f32> = stream
.clone_htod(&pos_x)
.map_err(|e| format!("htod pos_x failed: {e}"))?;
let d_pos_y: CudaSlice<f32> = stream
.clone_htod(&pos_y)
.map_err(|e| format!("htod pos_y failed: {e}"))?;
let d_vel_x: CudaSlice<f32> = stream
.clone_htod(&vel_x)
.map_err(|e| format!("htod vel_x failed: {e}"))?;
let d_vel_y: CudaSlice<f32> = stream
.clone_htod(&vel_y)
.map_err(|e| format!("htod vel_y failed: {e}"))?;
let d_radius: CudaSlice<f32> = stream
.clone_htod(&radius)
.map_err(|e| format!("htod radius failed: {e}"))?;
let d_desired_speed: CudaSlice<f32> = stream
.clone_htod(&desired_speed)
.map_err(|e| format!("htod desired_speed failed: {e}"))?;
let d_dest_x: CudaSlice<f32> = stream
.clone_htod(&dest_x)
.map_err(|e| format!("htod dest_x failed: {e}"))?;
let d_dest_y: CudaSlice<f32> = stream
.clone_htod(&dest_y)
.map_err(|e| format!("htod dest_y failed: {e}"))?;
let mut d_pos_x_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_x_out failed: {e}"))?;
let mut d_pos_y_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_y_out failed: {e}"))?;
let mut d_vel_x_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_x_out failed: {e}"))?;
let mut d_vel_y_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_y_out failed: {e}"))?;
let d_wall_ax: CudaSlice<f32> = stream
.clone_htod(&wall_ax)
.map_err(|e| format!("htod wall_ax failed: {e}"))?;
let d_wall_ay: CudaSlice<f32> = stream
.clone_htod(&wall_ay)
.map_err(|e| format!("htod wall_ay failed: {e}"))?;
let d_wall_bx: CudaSlice<f32> = stream
.clone_htod(&wall_bx)
.map_err(|e| format!("htod wall_bx failed: {e}"))?;
let d_wall_by: CudaSlice<f32> = stream
.clone_htod(&wall_by)
.map_err(|e| format!("htod wall_by failed: {e}"))?;
let n_u32 = n as u32;
let n_walls_u32 = n_walls as u32;
let tau = params.tau as f32;
let a_ped = params.a_ped as f32;
let b_ped = params.b_ped as f32;
let a_wall = params.a_wall as f32;
let b_wall = params.b_wall as f32;
let mass = params.mass as f32;
let max_speed = params.max_speed as f32;
let max_accel = params.max_accel as f32;
let arrival_radius = params.arrival_radius as f32;
let dt_f32 = dt as f32;
let grid = n.div_ceil(self.block_size as usize) as u32;
let cfg = LaunchConfig {
grid_dim: (grid.max(1), 1, 1),
block_dim: (self.block_size, 1, 1),
shared_mem_bytes: 0,
};
let t0 = std::time::Instant::now();
unsafe {
let mut b = stream.launch_builder(&self.func);
b.arg(&d_pos_x);
b.arg(&d_pos_y);
b.arg(&d_vel_x);
b.arg(&d_vel_y);
b.arg(&d_radius);
b.arg(&d_desired_speed);
b.arg(&d_dest_x);
b.arg(&d_dest_y);
b.arg(&mut d_pos_x_out);
b.arg(&mut d_pos_y_out);
b.arg(&mut d_vel_x_out);
b.arg(&mut d_vel_y_out);
b.arg(&d_wall_ax);
b.arg(&d_wall_ay);
b.arg(&d_wall_bx);
b.arg(&d_wall_by);
b.arg(&n_u32);
b.arg(&n_walls_u32);
b.arg(&tau);
b.arg(&a_ped);
b.arg(&b_ped);
b.arg(&a_wall);
b.arg(&b_wall);
b.arg(&mass);
b.arg(&max_speed);
b.arg(&max_accel);
b.arg(&arrival_radius);
b.arg(&dt_f32);
b.launch(cfg)
.map_err(|e| format!("kernel launch failed: {e}"))?;
}
stream
.synchronize()
.map_err(|e| format!("stream sync failed: {e}"))?;
let kernel_us = t0.elapsed().as_micros();
stream
.memcpy_dtoh(&d_pos_x_out, &mut pos_x)
.map_err(|e| format!("dtoh pos_x failed: {e}"))?;
stream
.memcpy_dtoh(&d_pos_y_out, &mut pos_y)
.map_err(|e| format!("dtoh pos_y failed: {e}"))?;
stream
.memcpy_dtoh(&d_vel_x_out, &mut vel_x)
.map_err(|e| format!("dtoh vel_x failed: {e}"))?;
stream
.memcpy_dtoh(&d_vel_y_out, &mut vel_y)
.map_err(|e| format!("dtoh vel_y failed: {e}"))?;
for (i, p) in peds.iter_mut().enumerate() {
p.pos = [pos_x[i] as f64, pos_y[i] as f64];
p.vel = [vel_x[i] as f64, vel_y[i] as f64];
}
let _ = &self.ctx;
Ok(kernel_us)
}
}
pub fn step(
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> Result<u128, String> {
let state = CudaState::new()?;
state.step(peds, walls, params, dt)
}
pub fn step_with_fallback(
state: &mut Option<CudaState>,
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> bool {
if state.is_none() {
match CudaState::new() {
Ok(s) => *state = Some(s),
Err(e) => {
eprintln!("rustsim-crowd CUDA init failed ({e}), using CPU path");
#[allow(deprecated)]
crate::social_force::step(peds, walls, params, dt);
return false;
}
}
}
match state.as_ref().unwrap().step(peds, walls, params, dt) {
Ok(_) => true,
Err(e) => {
eprintln!("rustsim-crowd CUDA step failed ({e}), falling back to CPU");
#[allow(deprecated)]
crate::social_force::step(peds, walls, params, dt);
false
}
}
}
pub struct CudaResident {
ctx: Arc<CudaContext>,
stream: Arc<CudaStream>,
func: CudaFunction,
block_size: u32,
n: usize,
n_walls: usize,
d_pos_x_a: CudaSlice<f32>,
d_pos_y_a: CudaSlice<f32>,
d_vel_x_a: CudaSlice<f32>,
d_vel_y_a: CudaSlice<f32>,
d_pos_x_b: CudaSlice<f32>,
d_pos_y_b: CudaSlice<f32>,
d_vel_x_b: CudaSlice<f32>,
d_vel_y_b: CudaSlice<f32>,
d_radius: CudaSlice<f32>,
d_desired_speed: CudaSlice<f32>,
d_dest_x: CudaSlice<f32>,
d_dest_y: CudaSlice<f32>,
d_wall_ax: CudaSlice<f32>,
d_wall_ay: CudaSlice<f32>,
d_wall_bx: CudaSlice<f32>,
d_wall_by: CudaSlice<f32>,
grid: Option<ResidentGrid>,
func_clear_heads: CudaFunction,
func_build_grid: CudaFunction,
func_step_grid: CudaFunction,
}
#[derive(Debug, Clone, Copy)]
pub struct GridConfig {
pub origin: [f64; 2],
pub cell_size: f64,
pub dims: (u32, u32),
pub cutoff_sq: f64,
}
struct ResidentGrid {
cfg: GridConfig,
d_cell_head: CudaSlice<u32>,
d_cell_next: CudaSlice<u32>,
ncells: usize,
}
impl CudaResident {
pub fn upload(peds: &[Pedestrian], walls: &[WallSegment]) -> Result<Self, String> {
Self::upload_with_block_size(peds, walls, 256)
}
pub fn upload_with_block_size(
peds: &[Pedestrian],
walls: &[WallSegment],
block_size: u32,
) -> Result<Self, String> {
if block_size == 0 {
return Err("block_size must be positive".to_string());
}
let ctx = super::new_context(0)?;
let stream = ctx.default_stream();
let ptx = compile_ptx(SFM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
let module = ctx
.load_module(ptx)
.map_err(|e| format!("module load failed: {e}"))?;
let func = module
.load_function("sfm_step")
.map_err(|e| format!("kernel lookup failed: {e}"))?;
let grid_ptx = compile_ptx(SFM_CUDA_GRID_SRC)
.map_err(|e| format!("NVRTC compile (grid) failed: {e}"))?;
let grid_module = ctx
.load_module(grid_ptx)
.map_err(|e| format!("module load (grid) failed: {e}"))?;
let func_clear_heads = grid_module
.load_function("grid_clear_heads")
.map_err(|e| format!("grid_clear_heads lookup failed: {e}"))?;
let func_build_grid = grid_module
.load_function("grid_build")
.map_err(|e| format!("grid_build lookup failed: {e}"))?;
let func_step_grid = grid_module
.load_function("sfm_step_grid")
.map_err(|e| format!("sfm_step_grid lookup failed: {e}"))?;
let n = peds.len();
if n == 0 {
return Err("CudaResident::upload requires at least one pedestrian".to_string());
}
let mut pos_x = Vec::with_capacity(n);
let mut pos_y = Vec::with_capacity(n);
let mut vel_x = Vec::with_capacity(n);
let mut vel_y = Vec::with_capacity(n);
let mut radius = Vec::with_capacity(n);
let mut desired_speed = Vec::with_capacity(n);
let mut dest_x = Vec::with_capacity(n);
let mut dest_y = Vec::with_capacity(n);
for p in peds.iter() {
pos_x.push(p.pos[0] as f32);
pos_y.push(p.pos[1] as f32);
vel_x.push(p.vel[0] as f32);
vel_y.push(p.vel[1] as f32);
radius.push(p.radius as f32);
desired_speed.push(p.desired_speed as f32);
dest_x.push(p.destination[0] as f32);
dest_y.push(p.destination[1] as f32);
}
let n_walls = walls.len();
let (wall_ax, wall_ay, wall_bx, wall_by) = if n_walls == 0 {
(vec![0.0f32], vec![0.0f32], vec![0.0f32], vec![0.0f32])
} else {
let mut ax = Vec::with_capacity(n_walls);
let mut ay = Vec::with_capacity(n_walls);
let mut bx = Vec::with_capacity(n_walls);
let mut by = Vec::with_capacity(n_walls);
for w in walls {
ax.push(w.a[0] as f32);
ay.push(w.a[1] as f32);
bx.push(w.b[0] as f32);
by.push(w.b[1] as f32);
}
(ax, ay, bx, by)
};
let d_pos_x_a = stream
.clone_htod(&pos_x)
.map_err(|e| format!("htod pos_x failed: {e}"))?;
let d_pos_y_a = stream
.clone_htod(&pos_y)
.map_err(|e| format!("htod pos_y failed: {e}"))?;
let d_vel_x_a = stream
.clone_htod(&vel_x)
.map_err(|e| format!("htod vel_x failed: {e}"))?;
let d_vel_y_a = stream
.clone_htod(&vel_y)
.map_err(|e| format!("htod vel_y failed: {e}"))?;
let d_pos_x_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_x_b failed: {e}"))?;
let d_pos_y_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_y_b failed: {e}"))?;
let d_vel_x_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_x_b failed: {e}"))?;
let d_vel_y_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_y_b failed: {e}"))?;
let d_radius = stream
.clone_htod(&radius)
.map_err(|e| format!("htod radius failed: {e}"))?;
let d_desired_speed = stream
.clone_htod(&desired_speed)
.map_err(|e| format!("htod desired_speed failed: {e}"))?;
let d_dest_x = stream
.clone_htod(&dest_x)
.map_err(|e| format!("htod dest_x failed: {e}"))?;
let d_dest_y = stream
.clone_htod(&dest_y)
.map_err(|e| format!("htod dest_y failed: {e}"))?;
let d_wall_ax = stream
.clone_htod(&wall_ax)
.map_err(|e| format!("htod wall_ax failed: {e}"))?;
let d_wall_ay = stream
.clone_htod(&wall_ay)
.map_err(|e| format!("htod wall_ay failed: {e}"))?;
let d_wall_bx = stream
.clone_htod(&wall_bx)
.map_err(|e| format!("htod wall_bx failed: {e}"))?;
let d_wall_by = stream
.clone_htod(&wall_by)
.map_err(|e| format!("htod wall_by failed: {e}"))?;
stream
.synchronize()
.map_err(|e| format!("initial sync failed: {e}"))?;
Ok(Self {
ctx,
stream,
func,
block_size,
n,
n_walls,
d_pos_x_a,
d_pos_y_a,
d_vel_x_a,
d_vel_y_a,
d_pos_x_b,
d_pos_y_b,
d_vel_x_b,
d_vel_y_b,
d_radius,
d_desired_speed,
d_dest_x,
d_dest_y,
d_wall_ax,
d_wall_ay,
d_wall_bx,
d_wall_by,
grid: None,
func_clear_heads,
func_build_grid,
func_step_grid,
})
}
#[inline]
pub fn len(&self) -> usize {
self.n
}
#[inline]
pub fn is_empty(&self) -> bool {
self.n == 0
}
pub fn step(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
let n_u32 = self.n as u32;
let n_walls_u32 = self.n_walls as u32;
let tau = params.tau as f32;
let a_ped = params.a_ped as f32;
let b_ped = params.b_ped as f32;
let a_wall = params.a_wall as f32;
let b_wall = params.b_wall as f32;
let mass = params.mass as f32;
let max_speed = params.max_speed as f32;
let max_accel = params.max_accel as f32;
let arrival_radius = params.arrival_radius as f32;
let dt_f32 = dt as f32;
let grid = self.n.div_ceil(self.block_size as usize) as u32;
let cfg = LaunchConfig {
grid_dim: (grid.max(1), 1, 1),
block_dim: (self.block_size, 1, 1),
shared_mem_bytes: 0,
};
let Self {
ref stream,
ref func,
ref d_radius,
ref d_desired_speed,
ref d_dest_x,
ref d_dest_y,
ref d_wall_ax,
ref d_wall_ay,
ref d_wall_bx,
ref d_wall_by,
ref d_pos_x_a,
ref d_pos_y_a,
ref d_vel_x_a,
ref d_vel_y_a,
ref mut d_pos_x_b,
ref mut d_pos_y_b,
ref mut d_vel_x_b,
ref mut d_vel_y_b,
..
} = *self;
let t0 = std::time::Instant::now();
unsafe {
let mut b = stream.launch_builder(func);
b.arg(d_pos_x_a);
b.arg(d_pos_y_a);
b.arg(d_vel_x_a);
b.arg(d_vel_y_a);
b.arg(d_radius);
b.arg(d_desired_speed);
b.arg(d_dest_x);
b.arg(d_dest_y);
b.arg(d_pos_x_b);
b.arg(d_pos_y_b);
b.arg(d_vel_x_b);
b.arg(d_vel_y_b);
b.arg(d_wall_ax);
b.arg(d_wall_ay);
b.arg(d_wall_bx);
b.arg(d_wall_by);
b.arg(&n_u32);
b.arg(&n_walls_u32);
b.arg(&tau);
b.arg(&a_ped);
b.arg(&b_ped);
b.arg(&a_wall);
b.arg(&b_wall);
b.arg(&mass);
b.arg(&max_speed);
b.arg(&max_accel);
b.arg(&arrival_radius);
b.arg(&dt_f32);
b.launch(cfg)
.map_err(|e| format!("kernel launch failed: {e}"))?;
}
stream
.synchronize()
.map_err(|e| format!("stream sync failed: {e}"))?;
let kernel_us = t0.elapsed().as_micros();
std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);
let _ = &self.ctx;
Ok(kernel_us)
}
pub fn enable_grid(&mut self, cfg: GridConfig) -> Result<(), String> {
if cfg.cell_size <= 0.0 || !cfg.cell_size.is_finite() {
return Err("GridConfig.cell_size must be positive and finite".to_string());
}
if !cfg.cutoff_sq.is_finite() || cfg.cutoff_sq < 0.0 {
return Err("GridConfig.cutoff_sq must be non-negative and finite".to_string());
}
if cfg.cell_size * cfg.cell_size < cfg.cutoff_sq - 1.0e-9 {
return Err(format!(
"GridConfig.cell_size ({}) is smaller than sqrt(cutoff_sq) ({}); the 3x3 neighbour walk would miss real pairs",
cfg.cell_size,
cfg.cutoff_sq.sqrt()
));
}
if cfg.dims.0 == 0 || cfg.dims.1 == 0 {
return Err("GridConfig.dims must have both extents > 0".to_string());
}
let ncells_u64 = (cfg.dims.0 as u64) * (cfg.dims.1 as u64);
if ncells_u64 > u32::MAX as u64 {
return Err(format!(
"GridConfig.dims product {ncells_u64} overflows u32 cell index"
));
}
let ncells = ncells_u64 as usize;
let d_cell_head: CudaSlice<u32> = self
.stream
.alloc_zeros(ncells)
.map_err(|e| format!("alloc cell_head failed: {e}"))?;
let d_cell_next: CudaSlice<u32> = self
.stream
.alloc_zeros(self.n)
.map_err(|e| format!("alloc cell_next failed: {e}"))?;
self.grid = Some(ResidentGrid {
cfg,
d_cell_head,
d_cell_next,
ncells,
});
Ok(())
}
#[inline]
pub fn has_grid(&self) -> bool {
self.grid.is_some()
}
pub fn step_grid(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
let n_u32 = self.n as u32;
let n_walls_u32 = self.n_walls as u32;
let tau = params.tau as f32;
let a_ped = params.a_ped as f32;
let b_ped = params.b_ped as f32;
let a_wall = params.a_wall as f32;
let b_wall = params.b_wall as f32;
let mass = params.mass as f32;
let max_speed = params.max_speed as f32;
let max_accel = params.max_accel as f32;
let arrival_radius = params.arrival_radius as f32;
let dt_f32 = dt as f32;
let (origin_x, origin_y, inv_cell_size, grid_w, grid_h, cutoff_sq, ncells) = {
let g = self
.grid
.as_ref()
.ok_or("step_grid: grid not enabled (call enable_grid first)")?;
let inv = 1.0_f32 / (g.cfg.cell_size as f32);
(
g.cfg.origin[0] as f32,
g.cfg.origin[1] as f32,
inv,
g.cfg.dims.0,
g.cfg.dims.1,
g.cfg.cutoff_sq as f32,
g.ncells,
)
};
let block = self.block_size;
let grid_n = self.n.div_ceil(block as usize) as u32;
let grid_cells = ncells.div_ceil(block as usize) as u32;
let cfg_n = LaunchConfig {
grid_dim: (grid_n.max(1), 1, 1),
block_dim: (block, 1, 1),
shared_mem_bytes: 0,
};
let cfg_cells = LaunchConfig {
grid_dim: (grid_cells.max(1), 1, 1),
block_dim: (block, 1, 1),
shared_mem_bytes: 0,
};
let Self {
ref stream,
ref func_clear_heads,
ref func_build_grid,
ref func_step_grid,
ref d_radius,
ref d_desired_speed,
ref d_dest_x,
ref d_dest_y,
ref d_wall_ax,
ref d_wall_ay,
ref d_wall_bx,
ref d_wall_by,
ref d_pos_x_a,
ref d_pos_y_a,
ref d_vel_x_a,
ref d_vel_y_a,
ref mut d_pos_x_b,
ref mut d_pos_y_b,
ref mut d_vel_x_b,
ref mut d_vel_y_b,
ref mut grid,
..
} = *self;
let g = grid.as_mut().expect("grid presence checked above");
let ncells_u32 = g.ncells as u32;
let t0 = std::time::Instant::now();
unsafe {
let mut b = stream.launch_builder(func_clear_heads);
b.arg(&mut g.d_cell_head);
b.arg(&ncells_u32);
b.launch(cfg_cells)
.map_err(|e| format!("grid_clear_heads launch failed: {e}"))?;
let mut b = stream.launch_builder(func_build_grid);
b.arg(d_pos_x_a);
b.arg(d_pos_y_a);
b.arg(&mut g.d_cell_head);
b.arg(&mut g.d_cell_next);
b.arg(&origin_x);
b.arg(&origin_y);
b.arg(&inv_cell_size);
b.arg(&grid_w);
b.arg(&grid_h);
b.arg(&n_u32);
b.launch(cfg_n)
.map_err(|e| format!("grid_build launch failed: {e}"))?;
let mut b = stream.launch_builder(func_step_grid);
b.arg(d_pos_x_a);
b.arg(d_pos_y_a);
b.arg(d_vel_x_a);
b.arg(d_vel_y_a);
b.arg(d_radius);
b.arg(d_desired_speed);
b.arg(d_dest_x);
b.arg(d_dest_y);
b.arg(d_pos_x_b);
b.arg(d_pos_y_b);
b.arg(d_vel_x_b);
b.arg(d_vel_y_b);
b.arg(d_wall_ax);
b.arg(d_wall_ay);
b.arg(d_wall_bx);
b.arg(d_wall_by);
b.arg(&g.d_cell_head);
b.arg(&g.d_cell_next);
b.arg(&n_u32);
b.arg(&n_walls_u32);
b.arg(&origin_x);
b.arg(&origin_y);
b.arg(&inv_cell_size);
b.arg(&grid_w);
b.arg(&grid_h);
b.arg(&cutoff_sq);
b.arg(&tau);
b.arg(&a_ped);
b.arg(&b_ped);
b.arg(&a_wall);
b.arg(&b_wall);
b.arg(&mass);
b.arg(&max_speed);
b.arg(&max_accel);
b.arg(&arrival_radius);
b.arg(&dt_f32);
b.launch(cfg_n)
.map_err(|e| format!("sfm_step_grid launch failed: {e}"))?;
}
stream
.synchronize()
.map_err(|e| format!("stream sync failed: {e}"))?;
let kernel_us = t0.elapsed().as_micros();
std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);
let _ = &self.ctx;
Ok(kernel_us)
}
pub fn download(&self, peds: &mut Vec<Pedestrian>) -> Result<(), String> {
let n = self.n;
peds.resize(
n,
Pedestrian {
pos: [0.0, 0.0],
vel: [0.0, 0.0],
radius: 0.0,
desired_speed: 0.0,
destination: [0.0, 0.0],
},
);
let (cur_px, cur_py, cur_vx, cur_vy) = (
&self.d_pos_x_a,
&self.d_pos_y_a,
&self.d_vel_x_a,
&self.d_vel_y_a,
);
let mut pos_x = vec![0.0f32; n];
let mut pos_y = vec![0.0f32; n];
let mut vel_x = vec![0.0f32; n];
let mut vel_y = vec![0.0f32; n];
let mut radius = vec![0.0f32; n];
let mut desired_speed = vec![0.0f32; n];
let mut dest_x = vec![0.0f32; n];
let mut dest_y = vec![0.0f32; n];
self.stream
.memcpy_dtoh(cur_px, &mut pos_x)
.map_err(|e| format!("dtoh pos_x failed: {e}"))?;
self.stream
.memcpy_dtoh(cur_py, &mut pos_y)
.map_err(|e| format!("dtoh pos_y failed: {e}"))?;
self.stream
.memcpy_dtoh(cur_vx, &mut vel_x)
.map_err(|e| format!("dtoh vel_x failed: {e}"))?;
self.stream
.memcpy_dtoh(cur_vy, &mut vel_y)
.map_err(|e| format!("dtoh vel_y failed: {e}"))?;
self.stream
.memcpy_dtoh(&self.d_radius, &mut radius)
.map_err(|e| format!("dtoh radius failed: {e}"))?;
self.stream
.memcpy_dtoh(&self.d_desired_speed, &mut desired_speed)
.map_err(|e| format!("dtoh desired_speed failed: {e}"))?;
self.stream
.memcpy_dtoh(&self.d_dest_x, &mut dest_x)
.map_err(|e| format!("dtoh dest_x failed: {e}"))?;
self.stream
.memcpy_dtoh(&self.d_dest_y, &mut dest_y)
.map_err(|e| format!("dtoh dest_y failed: {e}"))?;
for (i, p) in peds.iter_mut().enumerate().take(n) {
p.pos = [pos_x[i] as f64, pos_y[i] as f64];
p.vel = [vel_x[i] as f64, vel_y[i] as f64];
p.radius = radius[i] as f64;
p.desired_speed = desired_speed[i] as f64;
p.destination = [dest_x[i] as f64, dest_y[i] as f64];
}
Ok(())
}
}