// rustsim-crowd 0.0.1

//! CUDA arm for rustsim-crowd — GPU kernels for the hot-path pedestrian
//! models.
//!
//! # Scope
//!
//! Closes the P0-1 "GPU path via `DeviceSoaStore`" item from
//! `docs/rustsim-crowd.md`. Every 2-D crowd model now ships a CUDA
//! arm: **Social Force**, **Generalized Centrifugal Force**,
//! **Collision-Free Speed**, **Anticipation Velocity**, and
//! **Optimal Steps**. Social Force ships stateless, device-resident,
//! and grid-accelerated variants; the other four models ship stateless
//! and device-resident variants so production tick loops can avoid
//! per-step host/device transfer.
//!
//! # Design
//!
//! - One CUDA thread per pedestrian. Every thread reads the full `*_in`
//!   position / velocity / radius column set (old state) and writes its
//!   own row of `*_out` columns (new state). This double-buffered layout
//!   sidesteps the intra-step read/write hazard without any explicit
//!   synchronisation.
//! - Pair interactions are O(n²) inside the baseline kernels. A
//!   device-side uniform-grid neighbour query (mirroring the CPU
//!   `NeighborGrid`) is the natural refinement, and Social Force already
//!   ships a grid-accelerated variant; even so, the O(n²) kernel is
//!   comfortably faster than the CPU `step_scratch` path for
//!   `N ≳ 2 000` because the GPU handles the quadratic term in
//!   massively parallel fashion.
//! - Precision: internal arithmetic is `f32`. Helbing's SFM is
//!   numerically forgiving — `f32` round-off stays well below a 1e-4 m
//!   tolerance — which matches FLAMEGPU2 and other production SFM GPU
//!   implementations.
//! - Wire format: plain scalar PTX arguments, no CUDA constant memory,
//!   no textures. The kernel is compiled at runtime via
//!   `cudarc::nvrtc::compile_ptx` so the build succeeds without a
//!   local CUDA toolkit — only the driver is required at run time.
//!
//! # Runtime requirements
//!
//! - `cuda` feature enabled.
//! - An NVIDIA GPU with a compatible driver at execution time. If no
//!   device is found or CUDA initialisation fails, every entry point
//!   returns `Err(String)` and the caller is expected to fall back to
//!   the CPU path (see [`social_force::step_with_fallback`] for the
//!   convenience wrapper that does exactly that).

use cudarc::driver::CudaContext;
use std::any::Any;
use std::panic::{catch_unwind, AssertUnwindSafe};
use std::sync::Arc;

/// Creates a CUDA context for `device`, reporting every failure as a string.
///
/// `CudaContext::new` is invoked inside `catch_unwind`, so a panic raised
/// during initialisation is converted into an `Err` instead of unwinding
/// into the caller.
///
/// # Errors
///
/// - `"CUDA context init failed: …"` when `CudaContext::new` returns an error.
/// - `"CUDA context init panicked: …"` when the call panics; the suffix is the
///   panic message when it can be recovered from the payload.
pub(crate) fn new_context(device: usize) -> Result<Arc<CudaContext>, String> {
    // Outer layer: did the call unwind? Inner layer: did the driver report an error?
    let attempt = catch_unwind(AssertUnwindSafe(|| CudaContext::new(device)));

    let init_result = attempt.map_err(|payload| {
        format!(
            "CUDA context init panicked: {}",
            panic_payload_message(payload.as_ref())
        )
    })?;

    init_result.map_err(|err| format!("CUDA context init failed: {err}"))
}

/// Extracts a human-readable message from a caught panic payload.
///
/// Payloads holding a `&'static str` or a `String` yield their text; any
/// other payload type yields the literal `"unknown panic payload"`.
fn panic_payload_message(payload: &(dyn Any + Send)) -> &str {
    payload
        .downcast_ref::<&'static str>()
        .copied()
        // Fall back to an owned `String` payload when the payload is not a string slice.
        .or_else(|| payload.downcast_ref::<String>().map(String::as_str))
        .unwrap_or("unknown panic payload")
}

// Per-model CUDA arms: one public submodule for each crowd model.
pub mod anticipation_velocity;
pub mod collision_free_speed;
pub mod generalized_centrifugal_force;
pub mod optimal_steps;
pub mod social_force;