/// Compute backend a batch step can run on.
///
/// The type is a small fieldless enum, so it is cheap to copy, compare, and
/// use as a key in hash-based collections (e.g. per-backend statistics).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ComputeBackend {
    /// Host execution.
    Cpu,
    /// Execution through CUDA.
    Cuda,
}
/// Human-readable backend name: `"CPU"` or `"CUDA"`.
impl std::fmt::Display for ComputeBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Pick the label first, then emit it verbatim (no padding applied).
        let label = match *self {
            ComputeBackend::Cpu => "CPU",
            ComputeBackend::Cuda => "CUDA",
        };
        f.write_str(label)
    }
}
/// Chooses the compute backend for the current process.
///
/// Resolution order:
///
/// 1. The `RUSTSIM_BACKEND` environment variable. `cuda` or `gpu` selects
///    [`ComputeBackend::Cuda`], `cpu` selects [`ComputeBackend::Cpu`]. The
///    comparison is case-insensitive and ignores surrounding whitespace. An
///    empty value is treated as unset; any other value is reported with a
///    warning and then ignored.
/// 2. A device probe. With the `cuda` feature enabled this tries to create a
///    context on device 0. Without it, `nvidia-smi` is executed and a
///    successful exit status is taken as "a CUDA device is present".
/// 3. Otherwise [`ComputeBackend::Cpu`].
///
/// Note that without the `cuda` feature a `Cuda` result only reflects what
/// `nvidia-smi` reported.
pub fn detect_backend() -> ComputeBackend {
    if let Ok(raw) = std::env::var("RUSTSIM_BACKEND") {
        match raw.trim().to_ascii_lowercase().as_str() {
            "cuda" | "gpu" => {
                tracing::info!(backend = "CUDA", "backend override via RUSTSIM_BACKEND");
                return ComputeBackend::Cuda;
            }
            "cpu" => {
                tracing::info!(backend = "CPU", "backend override via RUSTSIM_BACKEND");
                return ComputeBackend::Cpu;
            }
            // An empty override behaves like an unset variable.
            "" => {}
            _ => {
                // Surface typos instead of silently auto-detecting.
                tracing::warn!(
                    value = %raw,
                    "unrecognized RUSTSIM_BACKEND value, falling back to auto-detection"
                );
            }
        }
    }

    #[cfg(feature = "cuda")]
    {
        if crate::cuda_context::new_context(0).is_ok() {
            tracing::info!("CUDA device detected");
            return ComputeBackend::Cuda;
        }
    }

    #[cfg(not(feature = "cuda"))]
    {
        let probe = std::process::Command::new("nvidia-smi")
            .arg("--query-gpu=name")
            .arg("--format=csv,noheader")
            .output();
        // A spawn failure (e.g. binary not installed) counts as "no device".
        let gpu_visible = probe
            .map(|output| output.status.success())
            .unwrap_or(false);
        if gpu_visible {
            tracing::info!("CUDA device detected via nvidia-smi");
            return ComputeBackend::Cuda;
        }
    }

    tracing::debug!("no CUDA device found, using CPU");
    ComputeBackend::Cpu
}
use rustsim_core::soa::{self, SoaExtractable, SoaExtractableF64};
use rustsim_core::store::AgentStore;
#[derive(Debug)]
pub struct AccelStepResult {
pub backend: ComputeBackend,
pub agent_count: usize,
pub kernel_us: u128,
}
impl AccelStepResult {
    /// Kernel time converted from microseconds to milliseconds.
    pub fn kernel_ms(&self) -> f64 {
        let micros = self.kernel_us as f64;
        micros / 1_000.0
    }

    /// Throughput in agents per second, derived from `agent_count` and
    /// `kernel_us`.
    ///
    /// Returns `0.0` when the recorded kernel time is zero, so no division
    /// by zero occurs.
    pub fn agents_per_second(&self) -> f64 {
        match self.kernel_us {
            0 => 0.0,
            micros => {
                let seconds = micros as f64 / 1_000_000.0;
                self.agent_count as f64 / seconds
            }
        }
    }
}
/// Runs one batch step on the CPU using `f32` columns.
///
/// The agents in `store` are pulled into column buffers with
/// `soa::extract_soa`, `kernel` is invoked exactly once with those columns and
/// the agent count, and the (possibly modified) columns are handed to
/// `soa::write_back_soa`.
///
/// Only the `kernel` call is timed; extraction and write-back are excluded
/// from `kernel_us`.
pub fn cpu_batch_step<A, S, F>(store: &S, mut kernel: F) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f32>], usize),
{
    let (agent_ids, mut soa_columns) = soa::extract_soa::<A, S>(store);
    let agent_count = agent_ids.len();

    // Time just the user kernel.
    let started = std::time::Instant::now();
    kernel(&mut soa_columns, agent_count);
    let elapsed_us = started.elapsed().as_micros();

    soa::write_back_soa::<A, S>(store, &agent_ids, &soa_columns);

    tracing::debug!(
        backend = "CPU",
        agents = agent_count,
        kernel_us = elapsed_us,
        "cpu_batch_step completed"
    );

    AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us: elapsed_us,
    }
}
/// Runs one batch step on the CPU using `f64` columns.
///
/// Double-precision counterpart of the `f32` CPU step: columns come from
/// `soa::extract_soa_f64`, `kernel` is called once with the columns and the
/// agent count, and results go back through `soa::write_back_soa_f64`.
///
/// `kernel_us` in the returned result covers the `kernel` call only.
pub fn cpu_batch_step_f64<A, S, F>(store: &S, mut kernel: F) -> AccelStepResult
where
    A: SoaExtractableF64,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f64>], usize),
{
    let (agent_ids, mut soa_columns) = soa::extract_soa_f64::<A, S>(store);
    let agent_count = agent_ids.len();

    let elapsed_us = {
        let stopwatch = std::time::Instant::now();
        kernel(&mut soa_columns, agent_count);
        stopwatch.elapsed().as_micros()
    };

    soa::write_back_soa_f64::<A, S>(store, &agent_ids, &soa_columns);

    tracing::debug!(
        backend = "CPU",
        precision = "f64",
        agents = agent_count,
        kernel_us = elapsed_us,
        "cpu_batch_step_f64 completed"
    );

    AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us: elapsed_us,
    }
}
/// Runs one batch step on the CPU, fanning the work out through
/// `crate::parallel::par_apply_chunks_multi`.
///
/// The `f32` columns extracted from `store` are exposed as mutable slices and
/// passed, together with `chunk_size` and `kernel`, to
/// `par_apply_chunks_multi`. The `usize` handed to `kernel` is whatever that
/// helper supplies (presumably the chunk's starting offset — confirm against
/// `crate::parallel`).
///
/// `kernel_us` covers building the slice views plus the parallel call;
/// extraction and write-back are not timed.
#[cfg(feature = "rayon")]
pub fn par_batch_step<A, S, F>(store: &S, chunk_size: usize, kernel: F) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: Fn(usize, &mut [&mut [f32]]) + Send + Sync,
{
    let (agent_ids, mut soa_columns) = soa::extract_soa::<A, S>(store);
    let agent_count = agent_ids.len();

    let started = std::time::Instant::now();
    {
        // Scope the mutable views so the columns are free again for write-back.
        let mut column_views: Vec<&mut [f32]> = soa_columns
            .iter_mut()
            .map(|column| column.as_mut_slice())
            .collect();
        crate::parallel::par_apply_chunks_multi(&mut column_views, chunk_size, kernel);
    }
    let elapsed_us = started.elapsed().as_micros();

    soa::write_back_soa::<A, S>(store, &agent_ids, &soa_columns);

    tracing::debug!(
        backend = "CPU",
        parallel = true,
        chunk_size,
        agents = agent_count,
        kernel_us = elapsed_us,
        "par_batch_step completed"
    );

    AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us: elapsed_us,
    }
}
/// Runs one batch step on CUDA device 0 using pageable host memory.
///
/// Steps: extract the `f32` columns from `store`, load `ptx_source` and look
/// up `kernel_name`, upload every column, launch a 1-D grid of
/// `ceil(n / block_size)` blocks, synchronise, download the columns and write
/// them back to `store`.
///
/// The kernel is launched with one device buffer per column (in column order)
/// followed by the agent count as a `u32`.
///
/// `kernel_us` spans the launch plus the stream synchronisation; uploads and
/// downloads are not included. `_module_name` is accepted but unused.
///
/// # Errors
///
/// Returns a descriptive `String` when `block_size` is zero, when the agent
/// count does not fit in the kernel's `u32` length argument, or when any CUDA
/// operation (context creation, PTX load, function lookup, copies, launch,
/// synchronisation) fails. The store is only written on success.
#[cfg(feature = "cuda")]
pub fn cuda_batch_step<A, S>(
    store: &S,
    ptx_source: &str,
    _module_name: &str,
    kernel_name: &str,
    block_size: u32,
) -> Result<AccelStepResult, String>
where
    A: SoaExtractable,
    S: AgentStore<A>,
{
    use cudarc::driver::{LaunchConfig, PushKernelArg};

    if block_size == 0 {
        return Err("block_size must be positive".to_string());
    }

    let ctx = crate::cuda_context::new_context(0)?;
    let stream = ctx.default_stream();

    let (ids, mut columns) = soa::extract_soa::<A, S>(store);
    let n = ids.len();
    if n == 0 {
        return Ok(AccelStepResult {
            backend: ComputeBackend::Cuda,
            agent_count: 0,
            kernel_us: 0,
        });
    }

    // The kernel receives the length as a u32. Refuse counts that would be
    // silently truncated (and would leave part of the data unprocessed).
    let n_u32 = u32::try_from(n).map_err(|_| {
        format!("agent count {n} does not fit in the kernel's u32 length argument")
    })?;

    let ptx = cudarc::nvrtc::Ptx::from_src(ptx_source);
    let module = ctx
        .load_module(ptx)
        .map_err(|e| format!("PTX load failed: {e}"))?;
    let func = module
        .load_function(kernel_name)
        .map_err(|e| format!("kernel '{kernel_name}' not found: {e}"))?;

    // Upload every column; the first failing copy aborts the step.
    let mut d_columns = Vec::with_capacity(columns.len());
    for col in &columns {
        let d_col = stream
            .clone_htod(col.as_slice())
            .map_err(|e| format!("htod failed: {e}"))?;
        d_columns.push(d_col);
    }

    // Cannot overflow: the quotient is at most `n_u32`.
    let grid_size = n_u32.div_ceil(block_size);
    let cfg = LaunchConfig {
        grid_dim: (grid_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let t0 = std::time::Instant::now();
    {
        // Scoped so the argument builder releases its borrows of `d_columns`
        // before the readback below.
        let mut builder = stream.launch_builder(&func);
        for d in d_columns.iter_mut() {
            builder.arg(d);
        }
        builder.arg(&n_u32);
        // SAFETY: the arguments are live device buffers of `n` f32 elements
        // each plus the element count. The caller is responsible for
        // supplying a kernel whose parameter list matches (one f32 pointer
        // per column, then a u32) and that stays within `n` elements.
        let launched = unsafe { builder.launch(cfg) };
        launched.map_err(|e| format!("kernel launch failed: {e}"))?;
    }
    stream
        .synchronize()
        .map_err(|e| format!("stream sync failed: {e}"))?;
    let kernel_us = t0.elapsed().as_micros();

    for (d_col, host_col) in d_columns.iter().zip(columns.iter_mut()) {
        stream
            .memcpy_dtoh(d_col, host_col)
            .map_err(|e| format!("dtoh failed: {e}"))?;
    }

    soa::write_back_soa::<A, S>(store, &ids, &columns);

    Ok(AccelStepResult {
        backend: ComputeBackend::Cuda,
        agent_count: n,
        kernel_us,
    })
}
/// Runs one batch step on CUDA device 0 using pinned host staging buffers and
/// separate copy / compute streams.
///
/// Steps: extract the `f32` columns from `store`, stage each column in a
/// pinned host buffer, upload on the copy stream, launch the kernel on the
/// compute stream once the uploads are ordered before it, download on the
/// copy stream once the kernel is ordered before it, then copy the pinned
/// buffers back into the columns and write them to `store`.
///
/// The kernel is launched with one device buffer per column (in column order)
/// followed by the agent count as a `u32`, on a 1-D grid of
/// `ceil(n / block_size)` blocks.
///
/// Unlike the non-pinned CUDA step, `kernel_us` here spans the kernel launch
/// *and* the device-to-host copies, because the only host-side
/// synchronisation happens after the downloads. `_module_name` is accepted
/// but unused.
///
/// # Errors
///
/// Returns a descriptive `String` when `block_size` is zero, when the agent
/// count does not fit in the kernel's `u32` length argument, or when any CUDA
/// operation fails. The store is only written on success.
#[cfg(feature = "cuda")]
pub fn cuda_batch_step_pinned<A, S>(
    store: &S,
    ptx_source: &str,
    _module_name: &str,
    kernel_name: &str,
    block_size: u32,
) -> Result<AccelStepResult, String>
where
    A: SoaExtractable,
    S: AgentStore<A>,
{
    use cudarc::driver::{LaunchConfig, PushKernelArg};

    if block_size == 0 {
        return Err("block_size must be positive".to_string());
    }

    let ctx = crate::cuda_context::new_context(0)?;
    let copy_stream = ctx
        .new_stream()
        .map_err(|e| format!("copy stream init failed: {e}"))?;
    let compute_stream = ctx
        .new_stream()
        .map_err(|e| format!("compute stream init failed: {e}"))?;

    let (ids, mut columns) = soa::extract_soa::<A, S>(store);
    let n = ids.len();
    if n == 0 {
        return Ok(AccelStepResult {
            backend: ComputeBackend::Cuda,
            agent_count: 0,
            kernel_us: 0,
        });
    }

    // The kernel receives the length as a u32. Refuse counts that would be
    // silently truncated (and would leave part of the data unprocessed).
    let n_u32 = u32::try_from(n).map_err(|_| {
        format!("agent count {n} does not fit in the kernel's u32 length argument")
    })?;

    let ptx = cudarc::nvrtc::Ptx::from_src(ptx_source);
    let module = ctx
        .load_module(ptx)
        .map_err(|e| format!("PTX load failed: {e}"))?;
    let func = module
        .load_function(kernel_name)
        .map_err(|e| format!("kernel '{kernel_name}' not found: {e}"))?;

    // Stage every column in page-locked host memory.
    let mut pinned: Vec<cudarc::driver::PinnedHostSlice<f32>> = Vec::with_capacity(columns.len());
    for col in &columns {
        // SAFETY: the buffer is requested with exactly `col.len()` elements
        // and `copy_from_slice` below overwrites all of them before anything
        // reads the allocation, so no unset memory is ever observed.
        let mut p = unsafe { ctx.alloc_pinned::<f32>(col.len()) }
            .map_err(|e| format!("pinned alloc failed: {e}"))?;
        p.as_mut_slice()
            .map_err(|e| format!("pinned access failed: {e}"))?
            .copy_from_slice(col);
        pinned.push(p);
    }

    let mut d_columns: Vec<cudarc::driver::CudaSlice<f32>> = Vec::with_capacity(pinned.len());
    for p in &pinned {
        let d = copy_stream
            .clone_htod(p)
            .map_err(|e| format!("htod failed: {e}"))?;
        d_columns.push(d);
    }

    // Order the uploads on the copy stream before the kernel on the compute
    // stream.
    compute_stream
        .join(&copy_stream)
        .map_err(|e| format!("compute.join(copy) failed: {e}"))?;

    // Cannot overflow: the quotient is at most `n_u32`.
    let grid_size = n_u32.div_ceil(block_size);
    let cfg = LaunchConfig {
        grid_dim: (grid_size, 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let t0 = std::time::Instant::now();
    {
        // Scoped so the argument builder releases its borrows of `d_columns`
        // before the readback below.
        let mut builder = compute_stream.launch_builder(&func);
        for d in d_columns.iter_mut() {
            builder.arg(d);
        }
        builder.arg(&n_u32);
        // SAFETY: the arguments are live device buffers of `n` f32 elements
        // each plus the element count. The caller is responsible for
        // supplying a kernel whose parameter list matches (one f32 pointer
        // per column, then a u32) and that stays within `n` elements.
        let launched = unsafe { builder.launch(cfg) };
        launched.map_err(|e| format!("kernel launch failed: {e}"))?;
    }

    // Order the kernel before the downloads issued on the copy stream.
    copy_stream
        .join(&compute_stream)
        .map_err(|e| format!("copy.join(compute) failed: {e}"))?;
    for (d_col, host_buf) in d_columns.iter().zip(pinned.iter_mut()) {
        copy_stream
            .memcpy_dtoh(d_col, host_buf)
            .map_err(|e| format!("dtoh failed: {e}"))?;
    }
    copy_stream
        .synchronize()
        .map_err(|e| format!("stream sync failed: {e}"))?;
    let kernel_us = t0.elapsed().as_micros();

    // Move the results out of the staging buffers into the host columns.
    for (col, p) in columns.iter_mut().zip(pinned.iter()) {
        col.copy_from_slice(
            p.as_slice()
                .map_err(|e| format!("pinned readback failed: {e}"))?,
        );
    }

    soa::write_back_soa::<A, S>(store, &ids, &columns);

    Ok(AccelStepResult {
        backend: ComputeBackend::Cuda,
        agent_count: n,
        kernel_us,
    })
}
/// Runs one batch step on the best available backend.
///
/// With the `cuda` feature enabled, the backend is detected on every call; if
/// CUDA is selected the CUDA batch step is attempted with the supplied PTX
/// parameters, and a failure is logged as a warning before falling back.
/// In every other case (feature disabled, CPU selected, or CUDA failure) the
/// step runs through the CPU batch step with `cpu_kernel`.
///
/// The PTX-related parameters only exist when the `cuda` feature is enabled.
pub fn auto_batch_step<A, S, F>(
    store: &S,
    cpu_kernel: F,
    #[cfg(feature = "cuda")] ptx_source: &str,
    #[cfg(feature = "cuda")] module_name: &str,
    #[cfg(feature = "cuda")] kernel_name: &str,
    #[cfg(feature = "cuda")] block_size: u32,
) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f32>], usize),
{
    #[cfg(feature = "cuda")]
    {
        let cuda_selected = matches!(detect_backend(), ComputeBackend::Cuda);
        if cuda_selected {
            let gpu_attempt =
                cuda_batch_step::<A, S>(store, ptx_source, module_name, kernel_name, block_size);
            // Success ends the step here; a failure drops through to the CPU path.
            let gpu_error = match gpu_attempt {
                Ok(done) => return done,
                Err(reason) => reason,
            };
            tracing::warn!(error = %gpu_error, "CUDA batch step failed, falling back to CPU");
        }
    }

    cpu_batch_step::<A, S, F>(store, cpu_kernel)
}
/// Steps a `DeviceSoaStore` on the best available backend.
///
/// With the `cuda` feature enabled, the backend is detected on every call; if
/// CUDA is selected `device.step_cuda` is attempted, and a failure is logged
/// as a warning before falling back. Otherwise (feature disabled, CPU
/// selected, or CUDA failure) `device.step_cpu` runs `cpu_kernel`.
///
/// The returned `agent_count` is read from `device.agent_count()` after the
/// step has run. The PTX-related parameters only exist when the `cuda`
/// feature is enabled.
pub fn auto_device_step(
    device: &mut crate::device_store::DeviceSoaStore,
    mut cpu_kernel: impl FnMut(&mut [Vec<f32>], usize),
    #[cfg(feature = "cuda")] ptx_source: &str,
    #[cfg(feature = "cuda")] module_name: &str,
    #[cfg(feature = "cuda")] kernel_name: &str,
    #[cfg(feature = "cuda")] block_size: u32,
) -> AccelStepResult {
    #[cfg(feature = "cuda")]
    {
        let cuda_selected = matches!(detect_backend(), ComputeBackend::Cuda);
        if cuda_selected {
            // Success ends the step here; a failure drops through to the CPU path.
            let gpu_error =
                match device.step_cuda(ptx_source, module_name, kernel_name, block_size) {
                    Ok(kernel_us) => {
                        return AccelStepResult {
                            backend: ComputeBackend::Cuda,
                            agent_count: device.agent_count(),
                            kernel_us,
                        };
                    }
                    Err(reason) => reason,
                };
            tracing::warn!(error = %gpu_error, "CUDA device step failed, falling back to CPU");
        }
    }

    let elapsed_us = device.step_cpu(&mut cpu_kernel);
    AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count: device.agent_count(),
        kernel_us: elapsed_us,
    }
}