use cudarc::driver::{
CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use cudarc::nvrtc::compile_ptx;
use std::sync::Arc;
use crate::anticipation_velocity::Params;
use crate::common::{Pedestrian, WallSegment};
const AVM_CUDA_SRC: &str = r#"
extern "C" __device__ float avm_anticipated_headroom(
unsigned int i,
float ex, float ey,
float pxi, float pyi, float ri, float v0_i,
const float* pos_x_in,
const float* pos_y_in,
const float* vel_x_in,
const float* vel_y_in,
const float* radius,
const float* wall_ax,
const float* wall_ay,
const float* wall_bx,
const float* wall_by,
unsigned int n,
unsigned int n_walls,
float anticipation_time,
float wall_range)
{
float s = 1.0e30f;
// Pedestrian neighbours: extrapolate by anticipation_time then test
// forward + lateral.
for (unsigned int j = 0; j < n; ++j) {
if (j == i) continue;
float qfx = pos_x_in[j] + vel_x_in[j] * anticipation_time;
float qfy = pos_y_in[j] + vel_y_in[j] * anticipation_time;
float relx = qfx - pxi;
float rely = qfy - pyi;
float forward = relx * ex + rely * ey;
if (forward <= 0.0f) continue;
float projx = ex * forward;
float projy = ey * forward;
float latx = relx - projx;
float laty = rely - projy;
float lat = sqrtf(latx * latx + laty * laty);
float r_sum = ri + radius[j];
if (lat > r_sum) continue;
float d = sqrtf(relx * relx + rely * rely);
if (d < s) s = d;
}
// Walls: Xu et al. 2021 Eq. 7 closest-point geometric form.
float horizon = v0_i * anticipation_time;
float probex = pxi + ex * horizon;
float probey = pyi + ey * horizon;
for (unsigned int k = 0; k < n_walls; ++k) {
float wax = wall_ax[k];
float way = wall_ay[k];
float wbx = wall_bx[k];
float wby = wall_by[k];
float abx = wbx - wax;
float aby = wby - way;
float denom = abx * abx + aby * aby;
float t = 0.0f;
if (denom > 1.0e-18f) {
t = ((probex - wax) * abx + (probey - way) * aby) / denom;
if (t < 0.0f) t = 0.0f;
if (t > 1.0f) t = 1.0f;
}
float cpx = wax + t * abx;
float cpy = way + t * aby;
float relx = cpx - pxi;
float rely = cpy - pyi;
float forward = relx * ex + rely * ey;
if (forward <= 0.0f) continue;
float projx = ex * forward;
float projy = ey * forward;
float latx = relx - projx;
float laty = rely - projy;
float lat = sqrtf(latx * latx + laty * laty);
if (lat > ri + wall_range) continue;
float d = sqrtf(relx * relx + rely * rely);
if (d < s) s = d;
}
return s;
}
extern "C" __global__ void avm_step(
const float* __restrict__ pos_x_in,
const float* __restrict__ pos_y_in,
const float* __restrict__ vel_x_in,
const float* __restrict__ vel_y_in,
const float* __restrict__ radius,
const float* __restrict__ desired_speed,
const float* __restrict__ dest_x,
const float* __restrict__ dest_y,
float* __restrict__ pos_x_out,
float* __restrict__ pos_y_out,
float* __restrict__ vel_x_out,
float* __restrict__ vel_y_out,
const float* __restrict__ wall_ax,
const float* __restrict__ wall_ay,
const float* __restrict__ wall_bx,
const float* __restrict__ wall_by,
unsigned int n,
unsigned int n_walls,
float time_gap,
float anticipation_time,
unsigned int num_directions,
float cone_half_angle,
float deviation_weight,
float wall_range,
float arrival_radius,
float dt)
{
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n) return;
float pxi = pos_x_in[i];
float pyi = pos_y_in[i];
float ri = radius[i];
float v0 = desired_speed[i];
float dxi = dest_x[i];
float dyi = dest_y[i];
// Desired direction toward destination + arrival taper.
float to_dx = dxi - pxi;
float to_dy = dyi - pyi;
float d_to_dest = sqrtf(to_dx * to_dx + to_dy * to_dy);
float edx = 0.0f;
float edy = 0.0f;
if (d_to_dest > 1.0e-12f) {
edx = to_dx / d_to_dest;
edy = to_dy / d_to_dest;
}
float eff_v0 = v0;
if (arrival_radius > 0.0f && d_to_dest < arrival_radius) {
eff_v0 = v0 * (d_to_dest / arrival_radius);
}
// Already-arrived (zero desired direction): write zero velocity, keep position.
if (edx == 0.0f && edy == 0.0f) {
pos_x_out[i] = pxi;
pos_y_out[i] = pyi;
vel_x_out[i] = 0.0f;
vel_y_out[i] = 0.0f;
return;
}
float base_angle = atan2f(edy, edx);
unsigned int m = num_directions;
if (m < 1) m = 1;
// Seed best with the desired direction itself (matches CPU).
float best_dx = edx;
float best_dy = edy;
float best_headroom = avm_anticipated_headroom(
i, edx, edy, pxi, pyi, ri, v0,
pos_x_in, pos_y_in, vel_x_in, vel_y_in, radius,
wall_ax, wall_ay, wall_bx, wall_by,
n, n_walls, anticipation_time, wall_range);
float best_score = best_headroom;
for (unsigned int k = 0; k < m; ++k) {
float t;
if (m == 1) {
t = 0.0f;
} else {
t = -1.0f + 2.0f * (float)k / (float)(m - 1);
}
float theta = base_angle + t * cone_half_angle;
float cdx = cosf(theta);
float cdy = sinf(theta);
float headroom = avm_anticipated_headroom(
i, cdx, cdy, pxi, pyi, ri, v0,
pos_x_in, pos_y_in, vel_x_in, vel_y_in, radius,
wall_ax, wall_ay, wall_bx, wall_by,
n, n_walls, anticipation_time, wall_range);
float t_abs = t < 0.0f ? -t : t;
float deviation = deviation_weight * t_abs * cone_half_angle;
float score = headroom - deviation;
if (score > best_score) {
best_score = score;
best_headroom = headroom;
best_dx = cdx;
best_dy = cdy;
}
}
// Speed cap from anticipated headroom.
float cap = (best_headroom - ri) / time_gap;
if (cap < 0.0f) cap = 0.0f;
float v = eff_v0 < cap ? eff_v0 : cap;
float vx_new = best_dx * v;
float vy_new = best_dy * v;
float px_new = pxi + vx_new * dt;
float py_new = pyi + vy_new * dt;
pos_x_out[i] = px_new;
pos_y_out[i] = py_new;
vel_x_out[i] = vx_new;
vel_y_out[i] = vy_new;
}
"#;
pub struct CudaState {
ctx: Arc<CudaContext>,
stream: Arc<CudaStream>,
func: CudaFunction,
block_size: u32,
}
impl CudaState {
pub fn new() -> Result<Self, String> {
Self::with_block_size(256)
}
pub fn with_block_size(block_size: u32) -> Result<Self, String> {
if block_size == 0 {
return Err("block_size must be positive".to_string());
}
let ctx = super::new_context(0)?;
let stream = ctx.default_stream();
let ptx = compile_ptx(AVM_CUDA_SRC).map_err(|e| format!("NVRTC compile failed: {e}"))?;
let module = ctx
.load_module(ptx)
.map_err(|e| format!("module load failed: {e}"))?;
let func = module
.load_function("avm_step")
.map_err(|e| format!("kernel lookup failed: {e}"))?;
Ok(Self {
ctx,
stream,
func,
block_size,
})
}
pub fn step(
&self,
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> Result<u128, String> {
let n = peds.len();
if n == 0 {
return Ok(0);
}
let stream = &self.stream;
let mut pos_x: Vec<f32> = Vec::with_capacity(n);
let mut pos_y: Vec<f32> = Vec::with_capacity(n);
let mut vel_x: Vec<f32> = Vec::with_capacity(n);
let mut vel_y: Vec<f32> = Vec::with_capacity(n);
let mut radius: Vec<f32> = Vec::with_capacity(n);
let mut desired_speed: Vec<f32> = Vec::with_capacity(n);
let mut dest_x: Vec<f32> = Vec::with_capacity(n);
let mut dest_y: Vec<f32> = Vec::with_capacity(n);
for p in peds.iter() {
pos_x.push(p.pos[0] as f32);
pos_y.push(p.pos[1] as f32);
vel_x.push(p.vel[0] as f32);
vel_y.push(p.vel[1] as f32);
radius.push(p.radius as f32);
desired_speed.push(p.desired_speed as f32);
dest_x.push(p.destination[0] as f32);
dest_y.push(p.destination[1] as f32);
}
let n_walls = walls.len();
let mut wall_ax: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_ay: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_bx: Vec<f32> = Vec::with_capacity(n_walls.max(1));
let mut wall_by: Vec<f32> = Vec::with_capacity(n_walls.max(1));
if n_walls == 0 {
wall_ax.push(0.0);
wall_ay.push(0.0);
wall_bx.push(0.0);
wall_by.push(0.0);
} else {
for w in walls {
wall_ax.push(w.a[0] as f32);
wall_ay.push(w.a[1] as f32);
wall_bx.push(w.b[0] as f32);
wall_by.push(w.b[1] as f32);
}
}
let d_pos_x: CudaSlice<f32> = stream
.clone_htod(&pos_x)
.map_err(|e| format!("htod pos_x failed: {e}"))?;
let d_pos_y: CudaSlice<f32> = stream
.clone_htod(&pos_y)
.map_err(|e| format!("htod pos_y failed: {e}"))?;
let d_vel_x: CudaSlice<f32> = stream
.clone_htod(&vel_x)
.map_err(|e| format!("htod vel_x failed: {e}"))?;
let d_vel_y: CudaSlice<f32> = stream
.clone_htod(&vel_y)
.map_err(|e| format!("htod vel_y failed: {e}"))?;
let d_radius: CudaSlice<f32> = stream
.clone_htod(&radius)
.map_err(|e| format!("htod radius failed: {e}"))?;
let d_desired_speed: CudaSlice<f32> = stream
.clone_htod(&desired_speed)
.map_err(|e| format!("htod desired_speed failed: {e}"))?;
let d_dest_x: CudaSlice<f32> = stream
.clone_htod(&dest_x)
.map_err(|e| format!("htod dest_x failed: {e}"))?;
let d_dest_y: CudaSlice<f32> = stream
.clone_htod(&dest_y)
.map_err(|e| format!("htod dest_y failed: {e}"))?;
let mut d_pos_x_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_x_out failed: {e}"))?;
let mut d_pos_y_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_y_out failed: {e}"))?;
let mut d_vel_x_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_x_out failed: {e}"))?;
let mut d_vel_y_out: CudaSlice<f32> = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_y_out failed: {e}"))?;
let d_wall_ax: CudaSlice<f32> = stream
.clone_htod(&wall_ax)
.map_err(|e| format!("htod wall_ax failed: {e}"))?;
let d_wall_ay: CudaSlice<f32> = stream
.clone_htod(&wall_ay)
.map_err(|e| format!("htod wall_ay failed: {e}"))?;
let d_wall_bx: CudaSlice<f32> = stream
.clone_htod(&wall_bx)
.map_err(|e| format!("htod wall_bx failed: {e}"))?;
let d_wall_by: CudaSlice<f32> = stream
.clone_htod(&wall_by)
.map_err(|e| format!("htod wall_by failed: {e}"))?;
let n_u32 = n as u32;
let n_walls_u32 = n_walls as u32;
let time_gap = params.time_gap as f32;
let anticipation_time = params.anticipation_time as f32;
let num_directions = params.num_directions as u32;
let cone_half_angle = params.cone_half_angle as f32;
let deviation_weight = params.deviation_weight as f32;
let wall_range = params.wall_range as f32;
let arrival_radius = params.arrival_radius as f32;
let dt_f32 = dt as f32;
let grid = n.div_ceil(self.block_size as usize) as u32;
let cfg = LaunchConfig {
grid_dim: (grid.max(1), 1, 1),
block_dim: (self.block_size, 1, 1),
shared_mem_bytes: 0,
};
let t0 = std::time::Instant::now();
unsafe {
let mut b = stream.launch_builder(&self.func);
b.arg(&d_pos_x);
b.arg(&d_pos_y);
b.arg(&d_vel_x);
b.arg(&d_vel_y);
b.arg(&d_radius);
b.arg(&d_desired_speed);
b.arg(&d_dest_x);
b.arg(&d_dest_y);
b.arg(&mut d_pos_x_out);
b.arg(&mut d_pos_y_out);
b.arg(&mut d_vel_x_out);
b.arg(&mut d_vel_y_out);
b.arg(&d_wall_ax);
b.arg(&d_wall_ay);
b.arg(&d_wall_bx);
b.arg(&d_wall_by);
b.arg(&n_u32);
b.arg(&n_walls_u32);
b.arg(&time_gap);
b.arg(&anticipation_time);
b.arg(&num_directions);
b.arg(&cone_half_angle);
b.arg(&deviation_weight);
b.arg(&wall_range);
b.arg(&arrival_radius);
b.arg(&dt_f32);
b.launch(cfg)
.map_err(|e| format!("kernel launch failed: {e}"))?;
}
stream
.synchronize()
.map_err(|e| format!("stream sync failed: {e}"))?;
let kernel_us = t0.elapsed().as_micros();
stream
.memcpy_dtoh(&d_pos_x_out, &mut pos_x)
.map_err(|e| format!("dtoh pos_x failed: {e}"))?;
stream
.memcpy_dtoh(&d_pos_y_out, &mut pos_y)
.map_err(|e| format!("dtoh pos_y failed: {e}"))?;
stream
.memcpy_dtoh(&d_vel_x_out, &mut vel_x)
.map_err(|e| format!("dtoh vel_x failed: {e}"))?;
stream
.memcpy_dtoh(&d_vel_y_out, &mut vel_y)
.map_err(|e| format!("dtoh vel_y failed: {e}"))?;
for (i, p) in peds.iter_mut().enumerate() {
p.pos = [pos_x[i] as f64, pos_y[i] as f64];
p.vel = [vel_x[i] as f64, vel_y[i] as f64];
}
let _ = &self.ctx;
Ok(kernel_us)
}
}
pub struct CudaResident {
ctx: Arc<CudaContext>,
stream: Arc<CudaStream>,
func: CudaFunction,
block_size: u32,
n: usize,
n_walls: usize,
d_pos_x_a: CudaSlice<f32>,
d_pos_y_a: CudaSlice<f32>,
d_vel_x_a: CudaSlice<f32>,
d_vel_y_a: CudaSlice<f32>,
d_pos_x_b: CudaSlice<f32>,
d_pos_y_b: CudaSlice<f32>,
d_vel_x_b: CudaSlice<f32>,
d_vel_y_b: CudaSlice<f32>,
d_radius: CudaSlice<f32>,
d_desired_speed: CudaSlice<f32>,
d_dest_x: CudaSlice<f32>,
d_dest_y: CudaSlice<f32>,
d_wall_ax: CudaSlice<f32>,
d_wall_ay: CudaSlice<f32>,
d_wall_bx: CudaSlice<f32>,
d_wall_by: CudaSlice<f32>,
}
impl CudaResident {
pub fn upload(peds: &[Pedestrian], walls: &[WallSegment]) -> Result<Self, String> {
Self::upload_with_block_size(peds, walls, 256)
}
pub fn upload_with_block_size(
peds: &[Pedestrian],
walls: &[WallSegment],
block_size: u32,
) -> Result<Self, String> {
if block_size == 0 {
return Err("block_size must be positive".to_string());
}
let n = peds.len();
if n == 0 {
return Err("CudaResident::upload requires at least one pedestrian".to_string());
}
let state = CudaState::with_block_size(block_size)?;
let stream = state.stream.clone();
let mut pos_x = Vec::with_capacity(n);
let mut pos_y = Vec::with_capacity(n);
let mut vel_x = Vec::with_capacity(n);
let mut vel_y = Vec::with_capacity(n);
let mut radius = Vec::with_capacity(n);
let mut desired_speed = Vec::with_capacity(n);
let mut dest_x = Vec::with_capacity(n);
let mut dest_y = Vec::with_capacity(n);
for p in peds {
pos_x.push(p.pos[0] as f32);
pos_y.push(p.pos[1] as f32);
vel_x.push(p.vel[0] as f32);
vel_y.push(p.vel[1] as f32);
radius.push(p.radius as f32);
desired_speed.push(p.desired_speed as f32);
dest_x.push(p.destination[0] as f32);
dest_y.push(p.destination[1] as f32);
}
let n_walls = walls.len();
let (wall_ax, wall_ay, wall_bx, wall_by) = wall_columns(walls);
let d_pos_x_a = stream
.clone_htod(&pos_x)
.map_err(|e| format!("htod pos_x failed: {e}"))?;
let d_pos_y_a = stream
.clone_htod(&pos_y)
.map_err(|e| format!("htod pos_y failed: {e}"))?;
let d_vel_x_a = stream
.clone_htod(&vel_x)
.map_err(|e| format!("htod vel_x failed: {e}"))?;
let d_vel_y_a = stream
.clone_htod(&vel_y)
.map_err(|e| format!("htod vel_y failed: {e}"))?;
let d_pos_x_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_x_b failed: {e}"))?;
let d_pos_y_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc pos_y_b failed: {e}"))?;
let d_vel_x_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_x_b failed: {e}"))?;
let d_vel_y_b = stream
.alloc_zeros(n)
.map_err(|e| format!("alloc vel_y_b failed: {e}"))?;
let d_radius = stream
.clone_htod(&radius)
.map_err(|e| format!("htod radius failed: {e}"))?;
let d_desired_speed = stream
.clone_htod(&desired_speed)
.map_err(|e| format!("htod desired_speed failed: {e}"))?;
let d_dest_x = stream
.clone_htod(&dest_x)
.map_err(|e| format!("htod dest_x failed: {e}"))?;
let d_dest_y = stream
.clone_htod(&dest_y)
.map_err(|e| format!("htod dest_y failed: {e}"))?;
let d_wall_ax = stream
.clone_htod(&wall_ax)
.map_err(|e| format!("htod wall_ax failed: {e}"))?;
let d_wall_ay = stream
.clone_htod(&wall_ay)
.map_err(|e| format!("htod wall_ay failed: {e}"))?;
let d_wall_bx = stream
.clone_htod(&wall_bx)
.map_err(|e| format!("htod wall_bx failed: {e}"))?;
let d_wall_by = stream
.clone_htod(&wall_by)
.map_err(|e| format!("htod wall_by failed: {e}"))?;
stream
.synchronize()
.map_err(|e| format!("initial sync failed: {e}"))?;
Ok(Self {
ctx: state.ctx,
stream,
func: state.func,
block_size,
n,
n_walls,
d_pos_x_a,
d_pos_y_a,
d_vel_x_a,
d_vel_y_a,
d_pos_x_b,
d_pos_y_b,
d_vel_x_b,
d_vel_y_b,
d_radius,
d_desired_speed,
d_dest_x,
d_dest_y,
d_wall_ax,
d_wall_ay,
d_wall_bx,
d_wall_by,
})
}
#[inline]
pub fn len(&self) -> usize {
self.n
}
#[inline]
pub fn is_empty(&self) -> bool {
self.n == 0
}
pub fn step(&mut self, params: &Params, dt: f64) -> Result<u128, String> {
let n_u32 = self.n as u32;
let n_walls_u32 = self.n_walls as u32;
let time_gap = params.time_gap as f32;
let anticipation_time = params.anticipation_time as f32;
let num_directions = params.num_directions as u32;
let cone_half_angle = params.cone_half_angle as f32;
let deviation_weight = params.deviation_weight as f32;
let wall_range = params.wall_range as f32;
let arrival_radius = params.arrival_radius as f32;
let dt_f32 = dt as f32;
let grid = self.n.div_ceil(self.block_size as usize) as u32;
let cfg = LaunchConfig {
grid_dim: (grid.max(1), 1, 1),
block_dim: (self.block_size, 1, 1),
shared_mem_bytes: 0,
};
let t0 = std::time::Instant::now();
unsafe {
let mut b = self.stream.launch_builder(&self.func);
b.arg(&self.d_pos_x_a);
b.arg(&self.d_pos_y_a);
b.arg(&self.d_vel_x_a);
b.arg(&self.d_vel_y_a);
b.arg(&self.d_radius);
b.arg(&self.d_desired_speed);
b.arg(&self.d_dest_x);
b.arg(&self.d_dest_y);
b.arg(&mut self.d_pos_x_b);
b.arg(&mut self.d_pos_y_b);
b.arg(&mut self.d_vel_x_b);
b.arg(&mut self.d_vel_y_b);
b.arg(&self.d_wall_ax);
b.arg(&self.d_wall_ay);
b.arg(&self.d_wall_bx);
b.arg(&self.d_wall_by);
b.arg(&n_u32);
b.arg(&n_walls_u32);
b.arg(&time_gap);
b.arg(&anticipation_time);
b.arg(&num_directions);
b.arg(&cone_half_angle);
b.arg(&deviation_weight);
b.arg(&wall_range);
b.arg(&arrival_radius);
b.arg(&dt_f32);
b.launch(cfg)
.map_err(|e| format!("kernel launch failed: {e}"))?;
}
self.stream
.synchronize()
.map_err(|e| format!("stream sync failed: {e}"))?;
let kernel_us = t0.elapsed().as_micros();
std::mem::swap(&mut self.d_pos_x_a, &mut self.d_pos_x_b);
std::mem::swap(&mut self.d_pos_y_a, &mut self.d_pos_y_b);
std::mem::swap(&mut self.d_vel_x_a, &mut self.d_vel_x_b);
std::mem::swap(&mut self.d_vel_y_a, &mut self.d_vel_y_b);
let _ = &self.ctx;
Ok(kernel_us)
}
pub fn download(&self, peds: &mut Vec<Pedestrian>) -> Result<(), String> {
download_resident(
&self.stream,
self.n,
&self.d_pos_x_a,
&self.d_pos_y_a,
&self.d_vel_x_a,
&self.d_vel_y_a,
&self.d_radius,
&self.d_desired_speed,
&self.d_dest_x,
&self.d_dest_y,
peds,
)
}
}
fn wall_columns(walls: &[WallSegment]) -> (Vec<f32>, Vec<f32>, Vec<f32>, Vec<f32>) {
if walls.is_empty() {
return (vec![0.0], vec![0.0], vec![0.0], vec![0.0]);
}
let mut ax = Vec::with_capacity(walls.len());
let mut ay = Vec::with_capacity(walls.len());
let mut bx = Vec::with_capacity(walls.len());
let mut by = Vec::with_capacity(walls.len());
for w in walls {
ax.push(w.a[0] as f32);
ay.push(w.a[1] as f32);
bx.push(w.b[0] as f32);
by.push(w.b[1] as f32);
}
(ax, ay, bx, by)
}
#[allow(clippy::too_many_arguments)]
fn download_resident(
stream: &Arc<CudaStream>,
n: usize,
d_pos_x: &CudaSlice<f32>,
d_pos_y: &CudaSlice<f32>,
d_vel_x: &CudaSlice<f32>,
d_vel_y: &CudaSlice<f32>,
d_radius: &CudaSlice<f32>,
d_desired_speed: &CudaSlice<f32>,
d_dest_x: &CudaSlice<f32>,
d_dest_y: &CudaSlice<f32>,
peds: &mut Vec<Pedestrian>,
) -> Result<(), String> {
peds.resize(
n,
Pedestrian {
pos: [0.0, 0.0],
vel: [0.0, 0.0],
radius: 0.0,
desired_speed: 0.0,
destination: [0.0, 0.0],
},
);
let mut pos_x = vec![0.0f32; n];
let mut pos_y = vec![0.0f32; n];
let mut vel_x = vec![0.0f32; n];
let mut vel_y = vec![0.0f32; n];
let mut radius = vec![0.0f32; n];
let mut desired_speed = vec![0.0f32; n];
let mut dest_x = vec![0.0f32; n];
let mut dest_y = vec![0.0f32; n];
stream
.memcpy_dtoh(d_pos_x, &mut pos_x)
.map_err(|e| format!("dtoh pos_x failed: {e}"))?;
stream
.memcpy_dtoh(d_pos_y, &mut pos_y)
.map_err(|e| format!("dtoh pos_y failed: {e}"))?;
stream
.memcpy_dtoh(d_vel_x, &mut vel_x)
.map_err(|e| format!("dtoh vel_x failed: {e}"))?;
stream
.memcpy_dtoh(d_vel_y, &mut vel_y)
.map_err(|e| format!("dtoh vel_y failed: {e}"))?;
stream
.memcpy_dtoh(d_radius, &mut radius)
.map_err(|e| format!("dtoh radius failed: {e}"))?;
stream
.memcpy_dtoh(d_desired_speed, &mut desired_speed)
.map_err(|e| format!("dtoh desired_speed failed: {e}"))?;
stream
.memcpy_dtoh(d_dest_x, &mut dest_x)
.map_err(|e| format!("dtoh dest_x failed: {e}"))?;
stream
.memcpy_dtoh(d_dest_y, &mut dest_y)
.map_err(|e| format!("dtoh dest_y failed: {e}"))?;
for (i, p) in peds.iter_mut().enumerate().take(n) {
p.pos = [pos_x[i] as f64, pos_y[i] as f64];
p.vel = [vel_x[i] as f64, vel_y[i] as f64];
p.radius = radius[i] as f64;
p.desired_speed = desired_speed[i] as f64;
p.destination = [dest_x[i] as f64, dest_y[i] as f64];
}
Ok(())
}
pub fn step(
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> Result<u128, String> {
let state = CudaState::new()?;
state.step(peds, walls, params, dt)
}
pub fn step_with_fallback(
state: &mut Option<CudaState>,
peds: &mut [Pedestrian],
walls: &[WallSegment],
params: &Params,
dt: f64,
) -> bool {
if state.is_none() {
match CudaState::new() {
Ok(s) => *state = Some(s),
Err(e) => {
eprintln!("rustsim-crowd CUDA init failed ({e}), using CPU AVM path");
#[allow(deprecated)]
crate::anticipation_velocity::step(peds, walls, params, dt);
return false;
}
}
}
match state.as_ref().unwrap().step(peds, walls, params, dt) {
Ok(_) => true,
Err(e) => {
eprintln!("rustsim-crowd CUDA AVM step failed ({e}), falling back to CPU");
#[allow(deprecated)]
crate::anticipation_velocity::step(peds, walls, params, dt);
false
}
}
}