#![allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct CudaBufferHandle(pub usize);
#[derive(Debug, Clone, Default)]
pub struct CudaDeviceInfo {
pub ordinal: u32,
pub name: String,
pub total_mem_bytes: u64,
pub compute_capability: (u32, u32),
pub multiprocessor_count: u32,
pub max_threads_per_block: u32,
pub warp_size: u32,
pub supports_unified_memory: bool,
pub supports_f64: bool,
pub driver_version: String,
}
#[derive(Debug, Clone)]
pub enum CudaInitError {
NotAvailable,
FeatureNotEnabled,
NoDevice,
DeviceOrdinalOutOfRange(u32),
DeviceError(String),
CompilationError(String),
}
impl std::fmt::Display for CudaInitError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Self::NotAvailable => write!(f, "CUDA is not available on this system"),
Self::FeatureNotEnabled => write!(f, "`cuda-backend` feature is not enabled"),
Self::NoDevice => write!(f, "no CUDA-capable device found"),
Self::DeviceOrdinalOutOfRange(n) => write!(f, "device ordinal {n} is out of range"),
Self::DeviceError(msg) => write!(f, "CUDA device error: {msg}"),
Self::CompilationError(msg) => write!(f, "NVRTC compile error: {msg}"),
}
}
}
impl std::error::Error for CudaInitError {}
pub const PTX_SPH_DENSITY: &str = r#"
// CUDA C source (compiled to PTX via nvcc -arch=sm_70 -ptx)
// extern "C" __global__ void sph_density(
// const float4* __restrict__ pos, // positions + mass in .w
// float* __restrict__ rho, // output density
// int n, // particle count
// float h, // smoothing length
// float h_inv // 1/h
// ) {
// int i = blockIdx.x * blockDim.x + threadIdx.x;
// if (i >= n) return;
//
// float xi = pos[i].x, yi = pos[i].y, zi = pos[i].z;
// float density = 0.0f;
// const float coeff = (315.0f / 64.0f) * __fdividef(1.0f, 3.14159265f * h*h*h);
//
// // tile-based neighbour loop (shared memory)
// __shared__ float4 tile[256];
// for (int t = 0; t < (n + 255) / 256; t++) {
// int j = t * 256 + threadIdx.x;
// tile[threadIdx.x] = (j < n) ? pos[j] : make_float4(1e30f, 1e30f, 1e30f, 0.0f);
// __syncthreads();
// for (int k = 0; k < 256; k++) {
// float dx = xi - tile[k].x, dy = yi - tile[k].y, dz = zi - tile[k].z;
// float r2 = dx*dx + dy*dy + dz*dz;
// float h2 = h * h;
// if (r2 < h2) {
// float q = 1.0f - r2 * __fdividef(1.0f, h2);
// density += tile[k].w * coeff * q * q * q;
// }
// }
// __syncthreads();
// }
// rho[i] = density;
// }
// --- actual PTX would be here ---
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;
pub const PTX_PARALLEL_SCAN: &str = r#"
// CUDA C source:
// extern "C" __global__ void exclusive_scan(
// const double* __restrict__ in,
// double* __restrict__ out,
// int n
// ) {
// extern __shared__ double shmem[];
// int tid = threadIdx.x;
// int gid = blockIdx.x * blockDim.x + tid;
//
// // Load into shared memory
// shmem[tid] = (gid < n) ? in[gid] : 0.0;
// __syncthreads();
//
// // Blelloch up-sweep
// for (int stride = 1; stride < blockDim.x; stride <<= 1) {
// int idx = (tid + 1) * stride * 2 - 1;
// if (idx < blockDim.x)
// shmem[idx] += shmem[idx - stride];
// __syncthreads();
// }
//
// // Set root to zero
// if (tid == blockDim.x - 1) shmem[tid] = 0.0;
// __syncthreads();
//
// // Blelloch down-sweep
// for (int stride = blockDim.x / 2; stride >= 1; stride >>= 1) {
// int idx = (tid + 1) * stride * 2 - 1;
// if (idx < blockDim.x) {
// double t = shmem[idx - stride];
// shmem[idx - stride] = shmem[idx];
// shmem[idx] = shmem[idx] + t;
// }
// __syncthreads();
// }
//
// if (gid < n) out[gid] = shmem[tid];
// }
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;
pub const PTX_CONSTRAINT_PGS: &str = r#"
// CUDA C source:
// extern "C" __global__ void constraint_pgs_iter(
// const GpuConstraint* __restrict__ constraints,
// float* __restrict__ lambda,
// float4* __restrict__ vel_lin, // xyz=vel, w=inv_mass
// float4* __restrict__ vel_ang,
// int n,
// float omega
// ) {
// int base = blockIdx.x * blockDim.x;
// if (threadIdx.x != 0) return;
//
// for (int ci = base; ci < min(base + (int)blockDim.x, n); ci++) {
// GpuConstraint c = constraints[ci];
// float3 vla = make_float3(0), wla = make_float3(0); float inv_ma = 0;
// float3 vlb = make_float3(0), wlb = make_float3(0); float inv_mb = 0;
//
// if (c.body_a != 0xFFFFFFFF) {
// float4 vl = vel_lin[c.body_a], vw = vel_ang[c.body_a];
// vla = make_float3(vl); wla = make_float3(vw); inv_ma = vl.w;
// }
// if (c.body_b != 0xFFFFFFFF) {
// float4 vl = vel_lin[c.body_b], vw = vel_ang[c.body_b];
// vlb = make_float3(vl); wlb = make_float3(vw); inv_mb = vl.w;
// }
//
// float3 n3 = make_float3(c.nx, c.ny, c.nz);
// float3 va = vla + cross(wla, make_float3(c.rax, c.ray, c.raz));
// float3 vb = vlb + cross(wlb, make_float3(c.rbx, c.rby, c.rbz));
// float rv = dot(n3, va - vb);
// float d = -(rv + c.bias) * c.em * omega;
// float old = lambda[ci];
// float neo = __saturatef((old + d - c.lambda_lo) / (c.lambda_hi - c.lambda_lo))
// * (c.lambda_hi - c.lambda_lo) + c.lambda_lo;
// lambda[ci] = neo;
// float dl = neo - old;
//
// float3 imp = n3 * dl;
// if (c.body_a != 0xFFFFFFFF) { /* update vel_lin/ang[body_a] */ }
// if (c.body_b != 0xFFFFFFFF) { /* update vel_lin/ang[body_b] */ }
// }
// }
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;
pub const CUDA_SPH_DENSITY_SRC: &str = r#"
extern "C" __global__ void sph_density_kernel(
const double* __restrict__ positions,
double* __restrict__ densities,
int n_particles,
double smoothing_length,
double particle_mass
) {
int i = blockIdx.x * blockDim.x + threadIdx.x;
if (i >= n_particles) return;
double px = positions[3*i], py = positions[3*i+1], pz = positions[3*i+2];
double rho = 0.0;
double h2 = smoothing_length * smoothing_length;
double coeff = 315.0 / (64.0 * 3.14159265358979 * smoothing_length
* smoothing_length * smoothing_length);
for (int j = 0; j < n_particles; j++) {
double dx = px - positions[3*j];
double dy = py - positions[3*j+1];
double dz = pz - positions[3*j+2];
double r2 = dx*dx + dy*dy + dz*dz;
if (r2 < h2) {
double q = 1.0 - r2 / h2;
rho += q * q * q;
}
}
densities[i] = particle_mass * coeff * rho;
}
"#;
#[derive(Debug, Clone)]
struct CudaBufferEntry {
len: usize,
shadow: Vec<f64>,
unified: bool,
}
#[cfg(feature = "cuda-backend")]
mod real_ctx {
use super::CudaInitError;
use std::collections::HashMap;
use std::sync::Arc;
use cudarc::driver::{CudaContext, CudaFunction, CudaModule, CudaSlice, CudaStream};
pub(super) struct CudaRealContext {
pub ctx: Arc<CudaContext>,
pub stream: Arc<CudaStream>,
pub real_buffers: Vec<CudaSlice<u8>>,
pub modules: HashMap<String, Arc<CudaModule>>,
pub functions: HashMap<String, CudaFunction>,
}
impl CudaRealContext {
pub fn new(ordinal: u32) -> Result<Self, CudaInitError> {
let ctx = CudaContext::new(ordinal as usize)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
let stream = ctx.default_stream();
Ok(Self {
ctx,
stream,
real_buffers: Vec::new(),
modules: HashMap::new(),
functions: HashMap::new(),
})
}
pub fn alloc_bytes(&mut self, len: usize) -> Result<usize, CudaInitError> {
let slice: CudaSlice<u8> = self
.stream
.alloc_zeros::<u8>(len)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
let idx = self.real_buffers.len();
self.real_buffers.push(slice);
Ok(idx)
}
pub fn write_f64_slice(&mut self, idx: usize, data: &[f64]) -> Result<(), CudaInitError> {
let byte_len = std::mem::size_of_val(data);
let byte_slice: &[u8] =
unsafe { std::slice::from_raw_parts(data.as_ptr().cast::<u8>(), byte_len) };
let dst = self
.real_buffers
.get_mut(idx)
.ok_or_else(|| CudaInitError::DeviceError("invalid buffer index".to_owned()))?;
let copy_len = byte_len.min(dst.len());
if copy_len == 0 {
return Ok(());
}
let src_trimmed = &byte_slice[..copy_len];
let mut dst_view = dst
.try_slice_mut(..copy_len)
.ok_or_else(|| CudaInitError::DeviceError("slice view failed".to_owned()))?;
self.stream
.memcpy_htod(src_trimmed, &mut dst_view)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))
}
pub fn read_f64_vec(&self, idx: usize) -> Result<Vec<f64>, CudaInitError> {
let src = self
.real_buffers
.get(idx)
.ok_or_else(|| CudaInitError::DeviceError("invalid buffer index".to_owned()))?;
let bytes: Vec<u8> = self
.stream
.clone_dtoh(src)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
Ok(bytes_to_f64_vec(bytes))
}
pub fn register_ptx(&mut self, name: &str, ptx_src: &str) -> Result<(), CudaInitError> {
use cudarc::nvrtc::Ptx;
let ptx = Ptx::from_src(ptx_src);
let module: Arc<CudaModule> = self
.ctx
.load_module(ptx)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
let func: CudaFunction = module
.load_function(name)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
self.modules.insert(name.to_owned(), module);
self.functions.insert(name.to_owned(), func);
Ok(())
}
pub fn compile_and_register(
&mut self,
name: &str,
cuda_c_src: &str,
) -> Result<(), CudaInitError> {
use cudarc::nvrtc::compile_ptx;
let ptx = compile_ptx(cuda_c_src)
.map_err(|e| CudaInitError::CompilationError(format!("{e:?}")))?;
let module: Arc<CudaModule> = self
.ctx
.load_module(ptx)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
let func: CudaFunction = module
.load_function(name)
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
self.modules.insert(name.to_owned(), module);
self.functions.insert(name.to_owned(), func);
Ok(())
}
pub fn synchronize(&self) -> Result<(), CudaInitError> {
self.stream
.synchronize()
.map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))
}
}
pub(super) fn bytes_to_f64_vec(bytes: Vec<u8>) -> Vec<f64> {
if !bytes.len().is_multiple_of(8) {
return Vec::new();
}
bytes
.chunks_exact(8)
.filter_map(|c| <[u8; 8]>::try_from(c).ok().map(f64::from_le_bytes))
.collect()
}
}
pub struct CudaBackend {
pub device_info: CudaDeviceInfo,
available: bool,
buffers: Vec<CudaBufferEntry>,
kernels: Vec<String>,
#[cfg(feature = "cuda-backend")]
real: Option<real_ctx::CudaRealContext>,
}
impl CudaBackend {
pub fn try_new(ordinal: u32) -> Result<Self, CudaInitError> {
#[cfg(feature = "cuda-backend")]
{
Self::try_new_real(ordinal)
}
#[cfg(not(feature = "cuda-backend"))]
{
let _ = ordinal;
Err(CudaInitError::FeatureNotEnabled)
}
}
pub fn new_stub() -> Self {
Self {
device_info: CudaDeviceInfo {
name: "CPU stub".into(),
..Default::default()
},
available: false,
buffers: Vec::new(),
kernels: Vec::new(),
#[cfg(feature = "cuda-backend")]
real: None,
}
}
pub fn is_available(&self) -> bool {
self.available
}
pub fn device_info(&self) -> &CudaDeviceInfo {
&self.device_info
}
pub fn create_buffer(&mut self, len: usize) -> CudaBufferHandle {
let handle = CudaBufferHandle(self.buffers.len());
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
let byte_len = len * std::mem::size_of::<f64>();
if ctx.alloc_bytes(byte_len).is_ok() {
self.buffers.push(CudaBufferEntry {
len,
shadow: Vec::new(), unified: false,
});
return handle;
}
}
self.buffers.push(CudaBufferEntry {
len,
shadow: vec![0.0; len],
unified: false,
});
handle
}
pub fn alloc_unified(&mut self, len: usize) -> CudaBufferHandle {
let handle = CudaBufferHandle(self.buffers.len());
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
let byte_len = len * std::mem::size_of::<f64>();
if ctx.alloc_bytes(byte_len).is_ok() {
self.buffers.push(CudaBufferEntry {
len,
shadow: Vec::new(),
unified: true,
});
return handle;
}
}
self.buffers.push(CudaBufferEntry {
len,
shadow: vec![0.0; len],
unified: true,
});
handle
}
pub fn write_buffer(&mut self, handle: CudaBufferHandle, data: &[f64]) {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
let _ = ctx.write_f64_slice(handle.0, data);
return;
}
if let Some(entry) = self.buffers.get_mut(handle.0) {
let len = data.len().min(entry.len);
if entry.shadow.len() < len {
entry.shadow.resize(entry.len, 0.0);
}
entry.shadow[..len].copy_from_slice(&data[..len]);
}
}
pub fn read_buffer(&self, handle: CudaBufferHandle) -> Vec<f64> {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_ref() {
return ctx.read_f64_vec(handle.0).unwrap_or_default();
}
self.buffers
.get(handle.0)
.map(|e| e.shadow.clone())
.unwrap_or_default()
}
pub fn register_kernel(&mut self, name: &str, ptx_source: &str) {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
let _ = ctx.register_ptx(name, ptx_source);
}
#[cfg(not(feature = "cuda-backend"))]
let _ = ptx_source;
if !self.kernels.contains(&name.to_owned()) {
self.kernels.push(name.to_string());
}
}
pub fn compile_and_register(
&mut self,
name: &str,
cuda_c_source: &str,
) -> Result<(), CudaInitError> {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
ctx.compile_and_register(name, cuda_c_source)?;
if !self.kernels.contains(&name.to_owned()) {
self.kernels.push(name.to_string());
}
return Ok(());
}
let _ = cuda_c_source;
if !self.kernels.contains(&name.to_owned()) {
self.kernels.push(name.to_string());
}
Ok(())
}
pub fn launch(&mut self, name: &str, buffers: &[CudaBufferHandle], grid_x: u32, block_x: u32) {
self.launch_with_scalars(name, buffers, &[], &[], grid_x, block_x);
}
pub fn launch_with_scalars(
&mut self,
name: &str,
buffers: &[CudaBufferHandle],
scalars_i32: &[i32],
scalars_f64: &[f64],
grid_x: u32,
block_x: u32,
) {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_mut() {
use cudarc::driver::{LaunchConfig, PushKernelArg};
let cfg = LaunchConfig {
grid_dim: (grid_x, 1, 1),
block_dim: (block_x, 1, 1),
shared_mem_bytes: 0,
};
let Some(func) = ctx.functions.get(name).cloned() else {
return;
};
if buffers.len() > 2 {
return;
}
for (i, h) in buffers.iter().enumerate() {
if h.0 >= ctx.real_buffers.len() {
return;
}
for h2 in &buffers[i + 1..] {
if h.0 == h2.0 {
return;
}
}
}
let real_ptr = ctx.real_buffers.as_mut_ptr();
let buf0 = buffers.first().map(|h| unsafe { &mut *real_ptr.add(h.0) });
let buf1 = buffers.get(1).map(|h| unsafe { &mut *real_ptr.add(h.0) });
let mut builder = ctx.stream.launch_builder(&func);
if let Some(b) = buf0 {
builder.arg(b);
}
if let Some(b) = buf1 {
builder.arg(b);
}
for v in scalars_i32 {
builder.arg(v);
}
for v in scalars_f64 {
builder.arg(v);
}
let _ = unsafe { builder.launch(cfg) };
return;
}
let _ = (name, buffers, scalars_i32, scalars_f64, grid_x, block_x);
}
pub fn synchronize(&mut self) {
#[cfg(feature = "cuda-backend")]
if let Some(ctx) = self.real.as_ref() {
let _ = ctx.synchronize();
}
}
pub fn device_count() -> u32 {
#[cfg(feature = "cuda-backend")]
{
let count = std::panic::catch_unwind(|| {
cudarc::driver::result::init()
.ok()
.and_then(|()| cudarc::driver::result::device::get_count().ok())
.map(|n| n as u32)
.unwrap_or(0)
});
count.unwrap_or(0)
}
#[cfg(not(feature = "cuda-backend"))]
{
0
}
}
pub fn query_device_info(ordinal: u32) -> Result<CudaDeviceInfo, CudaInitError> {
#[cfg(feature = "cuda-backend")]
{
use cudarc::driver::result;
result::init().map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))?;
let dev = result::device::get(ordinal as i32)
.map_err(|_| CudaInitError::DeviceOrdinalOutOfRange(ordinal))?;
let name = result::device::get_name(dev).unwrap_or_else(|_| "unknown".to_owned());
let total_mem = unsafe { result::device::total_mem(dev) }.unwrap_or(0);
Ok(CudaDeviceInfo {
ordinal,
name,
total_mem_bytes: total_mem as u64,
..Default::default()
})
}
#[cfg(not(feature = "cuda-backend"))]
{
let _ = ordinal;
Err(CudaInitError::NotAvailable)
}
}
}
#[cfg(feature = "cuda-backend")]
impl CudaBackend {
fn try_new_real(ordinal: u32) -> Result<Self, CudaInitError> {
use cudarc::driver::result;
let init_result = std::panic::catch_unwind(result::init);
match init_result {
Ok(Ok(())) => {}
Ok(Err(e)) => {
return Err(CudaInitError::DeviceError(format!("{e:?}")));
}
Err(_payload) => {
return Err(CudaInitError::NotAvailable);
}
}
let dev = result::device::get(ordinal as i32)
.map_err(|_| CudaInitError::DeviceOrdinalOutOfRange(ordinal))?;
let name = result::device::get_name(dev).unwrap_or_else(|_| "unknown".to_owned());
let total_mem = unsafe { result::device::total_mem(dev) }.unwrap_or(0);
let real = real_ctx::CudaRealContext::new(ordinal)?;
Ok(Self {
device_info: CudaDeviceInfo {
ordinal,
name,
total_mem_bytes: total_mem as u64,
..Default::default()
},
available: true,
buffers: Vec::new(),
kernels: Vec::new(),
real: Some(real),
})
}
}
impl std::fmt::Debug for CudaBackend {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("CudaBackend")
.field("device", &self.device_info.name)
.field("available", &self.available)
.field("buffers", &self.buffers.len())
.field("kernels", &self.kernels.len())
.finish()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_try_new_behaviour() {
let result = CudaBackend::try_new(0);
#[cfg(not(feature = "cuda-backend"))]
{
assert!(matches!(result, Err(CudaInitError::FeatureNotEnabled)));
}
#[cfg(feature = "cuda-backend")]
{
match result {
Ok(b) => assert!(b.is_available()),
Err(_) => { }
}
}
}
#[test]
fn test_stub_backend_buffer_roundtrip() {
let mut b = CudaBackend::new_stub();
let h = b.create_buffer(8);
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0_f64];
b.write_buffer(h, &data);
let out = b.read_buffer(h);
assert_eq!(out, data);
}
#[test]
fn test_stub_kernel_registration() {
let mut b = CudaBackend::new_stub();
b.register_kernel("sph_density", PTX_SPH_DENSITY);
assert_eq!(b.kernels.len(), 1);
assert_eq!(b.kernels[0], "sph_density");
}
#[test]
fn test_stub_unified_alloc() {
let mut b = CudaBackend::new_stub();
let h = b.alloc_unified(16);
b.write_buffer(h, &[std::f64::consts::PI; 16]);
let out = b.read_buffer(h);
assert!((out[0] - std::f64::consts::PI).abs() < 1e-10);
assert!(b.buffers[h.0].unified);
}
#[test]
fn test_device_count_environment_consistent() {
let count = CudaBackend::device_count();
#[cfg(not(feature = "cuda-backend"))]
{
assert_eq!(count, 0);
}
#[cfg(feature = "cuda-backend")]
{
let _ = count;
}
}
#[test]
fn test_compile_and_register() {
let mut b = CudaBackend::new_stub();
let result = b.compile_and_register("scan", PTX_PARALLEL_SCAN);
assert!(result.is_ok());
assert_eq!(b.kernels[0], "scan");
}
#[test]
fn test_error_display() {
let e = CudaInitError::CompilationError("undefined symbol 'foo'".into());
let s = format!("{e}");
assert!(s.contains("foo"));
}
#[test]
fn test_cuda_sph_density_src_not_empty() {
assert!(!CUDA_SPH_DENSITY_SRC.is_empty());
assert!(CUDA_SPH_DENSITY_SRC.contains("sph_density_kernel"));
}
#[test]
fn test_try_new_no_panic() {
let _ = CudaBackend::try_new(0);
}
}