oxiphysics-gpu 0.1.1

// Copyright 2026 COOLJAPAN OU (Team KitaSan)
// SPDX-License-Identifier: Apache-2.0

//! CUDA compute backend for the OxiPhysics GPU acceleration layer.
//!
//! This module provides [`CudaBackend`] which implements the compute-backend
//! interface using NVIDIA CUDA via the [`cudarc`](https://crates.io/crates/cudarc)
//! crate for type-safe PTX / CUDA kernel management.
//!
//! ## Feature flag
//!
//! This backend is gated behind the `cuda-backend` Cargo feature:
//!
//! ```toml
//! [dependencies]
//! oxiphysics-gpu = { features = ["cuda-backend"] }
//! ```
//!
//! When the feature is disabled the module compiles to a no-op stub and
//! [`CudaBackend::try_new`] returns [`CudaInitError::FeatureNotEnabled`].
//!
//! When the feature is enabled, cudarc uses dynamic-loading (`libloading`) so
//! the crate compiles on any platform; the CUDA driver is opened at runtime and
//! an error is returned if it is absent (e.g. macOS, headless Linux without an
//! NVIDIA driver).
//!
//! ## Architecture
//!
//! ```text
//!  CudaBackend
//!   ├── cudarc::CudaContext                  ← CUDA device context (Arc)
//!   ├── cudarc::CudaStream                   ← Default stream for kernel dispatch
//!   ├── cudarc::CudaSlice<u8>                ← Device-resident buffer slices
//!   ├── Vec<CudaBufferEntry>                 ← Registered buffer metadata
//!   └── KernelRegistry                       ← Compiled PTX / NVRTC modules
//!
//!  Compute pipeline:
//!    write_buffer [host→device memcpy via stream]
//!    → launch_kernel(grid, block, args)
//!    → read_buffer  [device→host memcpy via stream]
//! ```
//!
//! ## Kernels shipped with this backend
//!
//! | Source constant | Description |
//! |---|---|
//! | [`PTX_SPH_DENSITY`] | SPH density summation (poly6-form weight `(1 − r²/h²)³`), 256 threads/block |
//! | [`PTX_PARALLEL_SCAN`] | Blelloch exclusive prefix scan (shared-memory up-/down-sweep; PTX body is a stub) |
//! | [`PTX_CONSTRAINT_PGS`] | Block-PGS constraint solver, 64 threads/block |
//! | [`CUDA_SPH_DENSITY_SRC`] | CUDA C SPH density kernel (compiled at runtime via NVRTC) |
//!
//! ## Example (when `cuda-backend` feature enabled)
//!
//! ```ignore
//! use oxiphysics_gpu::compute::cuda_backend::CudaBackend;
//!
//! let mut backend = CudaBackend::try_new(0)?;          // device 0
//! let buf = backend.create_buffer(1024);               // 1024 f64 slots
//! backend.write_buffer(buf, &vec![1.0_f64; 1024]);
//! backend.launch("sph_density", &[buf], 16, 256);      // 16 blocks × 256 threads
//! let result = backend.read_buffer(buf);
//! ```

#![allow(dead_code)]

// ── CudaBufferHandle ──────────────────────────────────────────────────────────

/// Lightweight, copyable identifier for a buffer created by [`CudaBackend`].
///
/// The wrapped `usize` is the position of the buffer in the backend's internal
/// buffer list; callers should treat it as opaque.
#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq)]
pub struct CudaBufferHandle(pub usize);

// ── CudaDeviceInfo ────────────────────────────────────────────────────────────

/// Information about the selected CUDA device.
///
/// The constructors visible in this file (`query_device_info`, the real
/// `try_new` path and `new_stub`) populate only `ordinal`, `name` and
/// `total_mem_bytes`; every other field keeps its `Default` value (`0`,
/// `false`, empty string / tuple of zeros) and therefore must not be read as
/// a real device capability.
#[derive(Debug, Clone, Default)]
pub struct CudaDeviceInfo {
    /// CUDA device ordinal (0-indexed).
    pub ordinal: u32,
    /// Device name as reported by the driver (`"CPU stub"` for the stub
    /// backend, `"unknown"` when the name query fails).
    pub name: String,
    /// Total global memory in bytes (`0` when the query fails).
    pub total_mem_bytes: u64,
    /// Compute capability as `(major, minor)`.
    pub compute_capability: (u32, u32),
    /// Number of CUDA streaming multiprocessors.
    pub multiprocessor_count: u32,
    /// Maximum threads per block.
    pub max_threads_per_block: u32,
    /// Warp size (always 32 on current NVIDIA hardware).
    pub warp_size: u32,
    /// Whether the device supports unified memory (Compute Capability ≥ 3.0).
    pub supports_unified_memory: bool,
    /// Whether the device supports FP64.
    ///
    /// NOTE(review): the driver attribute originally cited here
    /// (`CUDA_DEVICE_ATTRIBUTE_DOUBLE`) could not be matched to a known
    /// driver-API attribute — confirm the intended source of this flag.
    pub supports_f64: bool,
    /// CUDA driver version string.
    pub driver_version: String,
}

// ── CudaInitError ─────────────────────────────────────────────────────────────

/// Failure modes reported by [`CudaBackend::try_new`] and the other fallible
/// backend entry points.
///
/// The [`Display`](std::fmt::Display) text of each variant is a short,
/// human-readable sentence; variants carrying a `String` append it verbatim.
#[derive(Debug, Clone)]
pub enum CudaInitError {
    /// The CUDA runtime / driver library is not installed on this system.
    NotAvailable,
    /// This build was compiled without the `cuda-backend` Cargo feature.
    FeatureNotEnabled,
    /// No CUDA-capable device was found.
    NoDevice,
    /// The requested device ordinal (payload) does not name a device.
    DeviceOrdinalOutOfRange(u32),
    /// A cudarc / driver call failed; the payload is the formatted driver error.
    DeviceError(String),
    /// NVRTC rejected a kernel source; the payload is the formatted compiler error.
    CompilationError(String),
}

impl std::fmt::Display for CudaInitError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            // Variants with a payload need real formatting.
            Self::DeviceOrdinalOutOfRange(n) => write!(f, "device ordinal {n} is out of range"),
            Self::DeviceError(msg) => write!(f, "CUDA device error: {msg}"),
            Self::CompilationError(msg) => write!(f, "NVRTC compile error: {msg}"),
            // Fixed messages are written straight through.
            Self::NotAvailable => f.write_str("CUDA is not available on this system"),
            Self::FeatureNotEnabled => f.write_str("`cuda-backend` feature is not enabled"),
            Self::NoDevice => f.write_str("no CUDA-capable device found"),
        }
    }
}

impl std::error::Error for CudaInitError {}

// ── PTX kernel sources (stub PTX for introspection / documentation) ────────────

/// Placeholder PTX for the single-precision SPH density summation kernel.
///
/// The string holds the reference CUDA C source as `//` comments followed by
/// a bare PTX header (`.version` / `.target` / `.address_size`).  It contains
/// no entry point, so it documents the kernel rather than implementing it.
///
/// Reference kernel, as written in the embedded comments:
///
/// - One thread per particle (`i = blockIdx.x * blockDim.x + threadIdx.x`,
///   threads with `i >= n` return).  Positions are staged through a
///   `__shared__ float4 tile[256]` array (4 KiB per block), so the source
///   assumes 256 threads per block and ⌈N/256⌉ blocks.
/// - Each neighbour with `r² < h²` contributes
///   `mass · (315 / 64π h³) · (1 − r²/h²)³`, with the mass read from
///   `pos[j].w`.  NOTE(review): this weight is the form usually called the
///   *poly6* kernel (support radius `h`), not a cubic spline with support
///   `2h` as earlier documentation stated — confirm the intended kernel.
/// - The `h_inv` parameter is declared but not used by the reference source.
///
/// [`CUDA_SPH_DENSITY_SRC`] is the double-precision variant written as real
/// CUDA C for runtime compilation via NVRTC; this constant is kept as
/// reference documentation and for `register_kernel` calls in the stub path.
pub const PTX_SPH_DENSITY: &str = r#"
// CUDA C source (compiled to PTX via nvcc -arch=sm_70 -ptx)
// extern "C" __global__ void sph_density(
//     const float4* __restrict__ pos,   // positions + mass in .w
//     float*        __restrict__ rho,   // output density
//     int                        n,     // particle count
//     float                      h,     // smoothing length
//     float                      h_inv  // 1/h
// ) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     if (i >= n) return;
//
//     float xi = pos[i].x, yi = pos[i].y, zi = pos[i].z;
//     float density = 0.0f;
//     const float coeff = (315.0f / 64.0f) * __fdividef(1.0f, 3.14159265f * h*h*h);
//
//     // tile-based neighbour loop (shared memory)
//     __shared__ float4 tile[256];
//     for (int t = 0; t < (n + 255) / 256; t++) {
//         int j = t * 256 + threadIdx.x;
//         tile[threadIdx.x] = (j < n) ? pos[j] : make_float4(1e30f, 1e30f, 1e30f, 0.0f);
//         __syncthreads();
//         for (int k = 0; k < 256; k++) {
//             float dx = xi - tile[k].x, dy = yi - tile[k].y, dz = zi - tile[k].z;
//             float r2 = dx*dx + dy*dy + dz*dz;
//             float h2 = h * h;
//             if (r2 < h2) {
//                 float q = 1.0f - r2 * __fdividef(1.0f, h2);
//                 density += tile[k].w * coeff * q * q * q;
//             }
//         }
//         __syncthreads();
//     }
//     rho[i] = density;
// }
// --- actual PTX would be here ---
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;

/// Placeholder PTX for a Blelloch exclusive parallel prefix scan.
///
/// Like the other `PTX_*` constants, the string contains the reference CUDA C
/// source as `//` comments plus a bare PTX header and no entry point.
///
/// Reference kernel, as written in the embedded comments:
///
/// - Each thread loads one `double` (`gid = blockIdx.x * blockDim.x + tid`,
///   zero-padded past `n`) into dynamically sized shared memory
///   (`extern __shared__ double shmem[]`), so every block scans `blockDim.x`
///   elements independently.
/// - An up-sweep, a root reset to zero, and a down-sweep are performed in
///   shared memory with `__syncthreads()` between steps; the result is
///   written to `out[gid]` for `gid < n`.
///
/// NOTE(review): earlier documentation described 512 elements per block and
/// an intra-warp `__shfl_up_sync` stage; neither appears in the embedded
/// source.  The source also relies on dynamic shared memory, while the launch
/// path in this file configures `shared_mem_bytes: 0` — confirm before wiring
/// a real PTX body to this name.
pub const PTX_PARALLEL_SCAN: &str = r#"
// CUDA C source:
// extern "C" __global__ void exclusive_scan(
//     const double* __restrict__ in,
//     double*       __restrict__ out,
//     int                        n
// ) {
//     extern __shared__ double shmem[];
//     int tid = threadIdx.x;
//     int gid = blockIdx.x * blockDim.x + tid;
//
//     // Load into shared memory
//     shmem[tid] = (gid < n) ? in[gid] : 0.0;
//     __syncthreads();
//
//     // Blelloch up-sweep
//     for (int stride = 1; stride < blockDim.x; stride <<= 1) {
//         int idx = (tid + 1) * stride * 2 - 1;
//         if (idx < blockDim.x)
//             shmem[idx] += shmem[idx - stride];
//         __syncthreads();
//     }
//
//     // Set root to zero
//     if (tid == blockDim.x - 1) shmem[tid] = 0.0;
//     __syncthreads();
//
//     // Blelloch down-sweep
//     for (int stride = blockDim.x / 2; stride >= 1; stride >>= 1) {
//         int idx = (tid + 1) * stride * 2 - 1;
//         if (idx < blockDim.x) {
//             double t    = shmem[idx - stride];
//             shmem[idx - stride] = shmem[idx];
//             shmem[idx]  = shmem[idx] + t;
//         }
//         __syncthreads();
//     }
//
//     if (gid < n) out[gid] = shmem[tid];
// }
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;

/// Placeholder PTX for one block-PGS (projected Gauss-Seidel) constraint
/// iteration.
///
/// The string contains the reference CUDA C source as `//` comments plus a
/// bare PTX header and no entry point.
///
/// Reference kernel, as written in the embedded comments:
///
/// - Intended launch: ⌈N/64⌉ blocks of 64 threads.  Only thread 0 of each
///   block does work (`if (threadIdx.x != 0) return;`); it walks the block's
///   constraint range `[base, min(base + blockDim.x, n))` sequentially.
/// - For each constraint the relative normal velocity is computed from the
///   two bodies' linear/angular velocities (a body index of `0xFFFFFFFF`
///   means "no body"), the multiplier `lambda[ci]` is updated with relaxation
///   factor `omega` and clamped to `[lambda_lo, lambda_hi]`.
/// - The velocity write-back for both bodies is only a placeholder comment in
///   the reference source, and the `GpuConstraint` layout is not defined here.
pub const PTX_CONSTRAINT_PGS: &str = r#"
// CUDA C source:
// extern "C" __global__ void constraint_pgs_iter(
//     const GpuConstraint* __restrict__ constraints,
//     float* __restrict__              lambda,
//     float4* __restrict__             vel_lin,   // xyz=vel, w=inv_mass
//     float4* __restrict__             vel_ang,
//     int                              n,
//     float                            omega
// ) {
//     int base = blockIdx.x * blockDim.x;
//     if (threadIdx.x != 0) return;
//
//     for (int ci = base; ci < min(base + (int)blockDim.x, n); ci++) {
//         GpuConstraint c = constraints[ci];
//         float3 vla = make_float3(0), wla = make_float3(0); float inv_ma = 0;
//         float3 vlb = make_float3(0), wlb = make_float3(0); float inv_mb = 0;
//
//         if (c.body_a != 0xFFFFFFFF) {
//             float4 vl = vel_lin[c.body_a], vw = vel_ang[c.body_a];
//             vla = make_float3(vl); wla = make_float3(vw); inv_ma = vl.w;
//         }
//         if (c.body_b != 0xFFFFFFFF) {
//             float4 vl = vel_lin[c.body_b], vw = vel_ang[c.body_b];
//             vlb = make_float3(vl); wlb = make_float3(vw); inv_mb = vl.w;
//         }
//
//         float3 n3 = make_float3(c.nx, c.ny, c.nz);
//         float3 va  = vla + cross(wla, make_float3(c.rax, c.ray, c.raz));
//         float3 vb  = vlb + cross(wlb, make_float3(c.rbx, c.rby, c.rbz));
//         float  rv  = dot(n3, va - vb);
//         float  d   = -(rv + c.bias) * c.em * omega;
//         float  old = lambda[ci];
//         float  neo = __saturatef((old + d - c.lambda_lo) / (c.lambda_hi - c.lambda_lo))
//                      * (c.lambda_hi - c.lambda_lo) + c.lambda_lo;
//         lambda[ci] = neo;
//         float  dl  = neo - old;
//
//         float3 imp = n3 * dl;
//         if (c.body_a != 0xFFFFFFFF) { /* update vel_lin/ang[body_a] */ }
//         if (c.body_b != 0xFFFFFFFF) { /* update vel_lin/ang[body_b] */ }
//     }
// }
.version 7.0
.target sm_70
.address_size 64
// (stub — replace with actual nvcc-compiled PTX)
"#;

/// CUDA C source for the double-precision SPH density kernel, intended for
/// runtime compilation via NVRTC.
///
/// The entry point is named `sph_density_kernel`.  For particle `i` it
/// computes
/// `densities[i] = particle_mass · (315 / 64π h³) · Σ_j (1 − r_ij²/h²)³`
/// over all `j` with `r_ij² < h²`, where `h` is `smoothing_length`.  The sum
/// runs over every particle, including `j == i`.  NOTE(review): this weight
/// is the form usually called the *poly6* kernel rather than a cubic spline —
/// confirm the intended naming.
///
/// Each thread handles one particle (`i = blockIdx.x * blockDim.x +
/// threadIdx.x`) and returns immediately when `i >= n_particles`, so the
/// launch must provide at least `n_particles` threads, e.g.
/// `ceil(n_particles / 256)` blocks × 256 threads.
///
/// Positions are stored as a flat interleaved array: `positions[3*i]` = x,
/// `positions[3*i+1]` = y, `positions[3*i+2]` = z.
///
/// The parameter list (two buffers, one `int`, two `double`s) matches the
/// argument order used by `launch_with_scalars`: buffers first, then `i32`
/// scalars, then `f64` scalars.
pub const CUDA_SPH_DENSITY_SRC: &str = r#"
extern "C" __global__ void sph_density_kernel(
    const double* __restrict__ positions,
    double* __restrict__ densities,
    int n_particles,
    double smoothing_length,
    double particle_mass
) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n_particles) return;
    double px = positions[3*i], py = positions[3*i+1], pz = positions[3*i+2];
    double rho = 0.0;
    double h2 = smoothing_length * smoothing_length;
    double coeff = 315.0 / (64.0 * 3.14159265358979 * smoothing_length
                            * smoothing_length * smoothing_length);
    for (int j = 0; j < n_particles; j++) {
        double dx = px - positions[3*j];
        double dy = py - positions[3*j+1];
        double dz = pz - positions[3*j+2];
        double r2 = dx*dx + dy*dy + dz*dz;
        if (r2 < h2) {
            double q = 1.0 - r2 / h2;
            rho += q * q * q;
        }
    }
    densities[i] = particle_mass * coeff * rho;
}
"#;

// ── Internal buffer entry ──────────────────────────────────────────────────────

/// Internal per-buffer record kept by `CudaBackend`: logical size, optional
/// CPU shadow copy and a unified-memory marker.
#[derive(Debug, Clone)]
struct CudaBufferEntry {
    /// Number of `f64` elements allocated.
    len: usize,
    /// CPU shadow data.  Holds `len` zero-initialised values for buffers that
    /// live on the CPU (stub path or failed device allocation); left empty
    /// for buffers that were successfully allocated on the device.
    shadow: Vec<f64>,
    /// Whether this buffer was requested through `alloc_unified`.  Recorded
    /// as metadata only; the allocation path is the same as a regular buffer.
    unified: bool,
}

// ── Real CUDA context (feature-gated) ─────────────────────────────────────────

#[cfg(feature = "cuda-backend")]
mod real_ctx {
    use super::CudaInitError;
    use std::collections::HashMap;
    use std::sync::Arc;

    use cudarc::driver::{CudaContext, CudaFunction, CudaModule, CudaSlice, CudaStream};

    /// Holds the live cudarc objects for the active CUDA device context:
    /// the context itself, its default stream, every device allocation and
    /// every loaded module / function.
    pub(super) struct CudaRealContext {
        /// The CUDA device context.
        pub ctx: Arc<CudaContext>,
        /// Default stream used for all memory operations and kernel launches.
        pub stream: Arc<CudaStream>,
        /// Device-resident byte buffers.  Indices are handed out by
        /// `alloc_bytes` in allocation order and are meant to line up with
        /// `CudaBackend::buffers`.
        pub real_buffers: Vec<CudaSlice<u8>>,
        /// Loaded modules keyed by the kernel name they were registered under
        /// (kept so the module stays referenced alongside its function).
        pub modules: HashMap<String, Arc<CudaModule>>,
        /// Kernel entry points keyed by name; looked up at launch time.
        pub functions: HashMap<String, CudaFunction>,
    }

    impl CudaRealContext {
        /// Open the CUDA context for device `ordinal` and take its default stream.
        ///
        /// A context-creation failure is reported as
        /// [`CudaInitError::DeviceError`] carrying the driver error's `Debug`
        /// text.  Fetching the default stream cannot fail in cudarc 0.19 (it
        /// hands back an `Arc<CudaStream>` directly).  The buffer list and the
        /// module / function registries start out empty.
        pub fn new(ordinal: u32) -> Result<Self, CudaInitError> {
            match CudaContext::new(ordinal as usize) {
                Ok(ctx) => {
                    // Must be taken before `ctx` is moved into the struct.
                    let stream = ctx.default_stream();
                    Ok(Self {
                        ctx,
                        stream,
                        real_buffers: Vec::new(),
                        modules: HashMap::new(),
                        functions: HashMap::new(),
                    })
                }
                Err(e) => Err(CudaInitError::DeviceError(format!("{e:?}"))),
            }
        }

        /// Allocate `len` zero-initialised bytes on the device.
        ///
        /// On success the new slice is appended to `real_buffers` and its
        /// index is returned; on failure nothing is stored and the driver
        /// error is wrapped in [`CudaInitError::DeviceError`].
        pub fn alloc_bytes(&mut self, len: usize) -> Result<usize, CudaInitError> {
            match self.stream.alloc_zeros::<u8>(len) {
                Ok(device_mem) => {
                    let slot = self.real_buffers.len();
                    self.real_buffers.push(device_mem);
                    Ok(slot)
                }
                Err(e) => Err(CudaInitError::DeviceError(format!("{e:?}"))),
            }
        }

        /// Copy `data` host→device into the buffer at `idx`.
        ///
        /// The values are uploaded as their in-memory byte representation.
        /// If `data` is larger than the device buffer only the leading bytes
        /// that fit are copied; an empty copy is a successful no-op.
        ///
        /// # Errors
        ///
        /// [`CudaInitError::DeviceError`] when `idx` is not a valid buffer
        /// index, when the destination sub-view cannot be created, or when
        /// the driver copy fails.
        pub fn write_f64_slice(&mut self, idx: usize, data: &[f64]) -> Result<(), CudaInitError> {
            let Some(device_buf) = self.real_buffers.get_mut(idx) else {
                return Err(CudaInitError::DeviceError("invalid buffer index".to_owned()));
            };

            // View the f64 payload as plain bytes for the memcpy.
            let host_len = std::mem::size_of_val(data);
            // SAFETY: `data` is a valid, initialised `[f64]` of exactly
            // `host_len` bytes, `u8` has alignment 1, and the byte view is
            // only read while `data` is borrowed.
            let host_bytes: &[u8] =
                unsafe { std::slice::from_raw_parts(data.as_ptr().cast::<u8>(), host_len) };

            // Clamp to the device allocation so the copy never overruns it.
            let n = host_len.min(device_buf.len());
            if n == 0 {
                return Ok(());
            }

            // memcpy_htod requires dst.len() >= src.len(), hence the sub-view.
            let Some(mut window) = device_buf.try_slice_mut(..n) else {
                return Err(CudaInitError::DeviceError("slice view failed".to_owned()));
            };

            self.stream
                .memcpy_htod(&host_bytes[..n], &mut window)
                .map_err(|e| CudaInitError::DeviceError(format!("{e:?}")))
        }

        /// Copy the whole device buffer at `idx` back to the host and decode
        /// it as `f64` values (see `bytes_to_f64_vec`).
        ///
        /// # Errors
        ///
        /// [`CudaInitError::DeviceError`] when `idx` is not a valid buffer
        /// index or the device→host copy fails.
        pub fn read_f64_vec(&self, idx: usize) -> Result<Vec<f64>, CudaInitError> {
            let Some(device_buf) = self.real_buffers.get(idx) else {
                return Err(CudaInitError::DeviceError("invalid buffer index".to_owned()));
            };
            match self.stream.clone_dtoh(device_buf) {
                Ok(raw) => Ok(bytes_to_f64_vec(raw)),
                Err(e) => Err(CudaInitError::DeviceError(format!("{e:?}"))),
            }
        }

        /// Load a module from raw PTX text and register its entry point `name`.
        ///
        /// The PTX is wrapped with `Ptx::from_src`, loaded into the context,
        /// and the function called `name` is looked up in it.  Both the module
        /// and the function are stored under `name`, replacing any previous
        /// registration with the same key.
        ///
        /// # Errors
        ///
        /// [`CudaInitError::DeviceError`] if the module cannot be loaded or
        /// does not export `name`; nothing is stored in that case.
        pub fn register_ptx(&mut self, name: &str, ptx_src: &str) -> Result<(), CudaInitError> {
            let image = cudarc::nvrtc::Ptx::from_src(ptx_src);

            let module: Arc<CudaModule> = match self.ctx.load_module(image) {
                Ok(m) => m,
                Err(e) => return Err(CudaInitError::DeviceError(format!("{e:?}"))),
            };
            let entry: CudaFunction = match module.load_function(name) {
                Ok(f) => f,
                Err(e) => return Err(CudaInitError::DeviceError(format!("{e:?}"))),
            };

            let key = name.to_owned();
            self.functions.insert(key.clone(), entry);
            self.modules.insert(key, module);
            Ok(())
        }

        /// Compile CUDA C source with NVRTC, load the result and register the
        /// entry point `name`.
        ///
        /// Both the module and the function are stored under `name`,
        /// replacing any previous registration with the same key.
        ///
        /// # Errors
        ///
        /// - [`CudaInitError::CompilationError`] if NVRTC rejects the source.
        /// - [`CudaInitError::DeviceError`] if the compiled module cannot be
        ///   loaded or does not export `name`.
        pub fn compile_and_register(
            &mut self,
            name: &str,
            cuda_c_src: &str,
        ) -> Result<(), CudaInitError> {
            let image = match cudarc::nvrtc::compile_ptx(cuda_c_src) {
                Ok(ptx) => ptx,
                Err(e) => return Err(CudaInitError::CompilationError(format!("{e:?}"))),
            };

            let module: Arc<CudaModule> = match self.ctx.load_module(image) {
                Ok(m) => m,
                Err(e) => return Err(CudaInitError::DeviceError(format!("{e:?}"))),
            };
            let entry: CudaFunction = match module.load_function(name) {
                Ok(f) => f,
                Err(e) => return Err(CudaInitError::DeviceError(format!("{e:?}"))),
            };

            let key = name.to_owned();
            self.functions.insert(key.clone(), entry);
            self.modules.insert(key, module);
            Ok(())
        }

        /// Block until all work queued on the default stream has finished.
        ///
        /// A driver failure is returned as [`CudaInitError::DeviceError`].
        pub fn synchronize(&self) -> Result<(), CudaInitError> {
            match self.stream.synchronize() {
                Ok(()) => Ok(()),
                Err(e) => Err(CudaInitError::DeviceError(format!("{e:?}"))),
            }
        }
    }

    /// Decode a byte buffer into `f64` values, eight little-endian bytes per
    /// value.
    ///
    /// Returns an empty vector when the byte count is not a multiple of eight.
    pub(super) fn bytes_to_f64_vec(bytes: Vec<u8>) -> Vec<f64> {
        const WIDTH: usize = std::mem::size_of::<f64>();

        if bytes.len() % WIDTH != 0 {
            return Vec::new();
        }

        let mut values = Vec::with_capacity(bytes.len() / WIDTH);
        for chunk in bytes.chunks_exact(WIDTH) {
            // `chunks_exact` guarantees every chunk is exactly WIDTH bytes.
            let mut raw = [0u8; WIDTH];
            raw.copy_from_slice(chunk);
            values.push(f64::from_le_bytes(raw));
        }
        values
    }
}

// ── CudaBackend ───────────────────────────────────────────────────────────────

/// CUDA compute backend.
///
/// **Without** `cuda-backend` feature: no-op stub; [`Self::try_new`] always returns
/// [`CudaInitError::FeatureNotEnabled`].  All buffer and kernel methods operate
/// on CPU shadows so unit tests compile and run on any platform (construct
/// the stub with [`Self::new_stub`]).
///
/// **With** `cuda-backend` feature: real cudarc device context; [`Self::try_new`]
/// calls `CudaContext::new(ordinal)` and returns an error if the CUDA driver is
/// absent (e.g. on macOS or a Linux machine without an NVIDIA driver).  Buffer
/// methods perform actual host↔device memcpy via the default stream.
///
/// Most methods are best-effort: device errors during writes, reads,
/// registration and launches are swallowed rather than reported.
pub struct CudaBackend {
    /// Device information.  Only `ordinal`, `name` and `total_mem_bytes` are
    /// filled in by the constructors visible in this file.
    pub device_info: CudaDeviceInfo,
    /// Whether a real CUDA device context is active.
    available: bool,
    /// Per-buffer records, indexed by `CudaBufferHandle.0`.  Entries carry a
    /// CPU shadow in the stub path and metadata only in the real path.
    buffers: Vec<CudaBufferEntry>,
    /// Names of registered kernels, without duplicates.  A name is recorded
    /// by `register_kernel` even if loading the PTX on the device failed.
    kernels: Vec<String>,
    /// Live cudarc context — present only when `cuda-backend` feature is enabled
    /// **and** device initialisation succeeded.
    #[cfg(feature = "cuda-backend")]
    real: Option<real_ctx::CudaRealContext>,
}

// ── Common constructor helpers ────────────────────────────────────────────────

impl CudaBackend {
    /// Try to bring up a CUDA backend on device `ordinal`.
    ///
    /// - **Without** the `cuda-backend` feature this always yields
    ///   `Err(CudaInitError::FeatureNotEnabled)`; `ordinal` is ignored.
    /// - **With** the feature the call is forwarded to the real constructor,
    ///   which reports `CudaInitError::NotAvailable` when the CUDA library
    ///   cannot be loaded, `CudaInitError::DeviceOrdinalOutOfRange` when the
    ///   device lookup for `ordinal` fails, and `CudaInitError::DeviceError`
    ///   for other driver failures.
    pub fn try_new(ordinal: u32) -> Result<Self, CudaInitError> {
        #[cfg(not(feature = "cuda-backend"))]
        {
            // Nothing to initialise in a build without CUDA support.
            let _ = ordinal;
            Err(CudaInitError::FeatureNotEnabled)
        }
        #[cfg(feature = "cuda-backend")]
        {
            Self::try_new_real(ordinal)
        }
    }

    /// Create a CPU-fallback stub (useful for unit testing without a GPU).
    pub fn new_stub() -> Self {
        Self {
            device_info: CudaDeviceInfo {
                name: "CPU stub".into(),
                ..Default::default()
            },
            available: false,
            buffers: Vec::new(),
            kernels: Vec::new(),
            #[cfg(feature = "cuda-backend")]
            real: None,
        }
    }

    /// True if a real CUDA device context is active.
    pub fn is_available(&self) -> bool {
        self.available
    }

    /// Device information.
    pub fn device_info(&self) -> &CudaDeviceInfo {
        &self.device_info
    }

    // ── Buffer management ────────────────────────────────────────────────────

    /// Allocate a device buffer that can hold `len` `f64` values.
    ///
    /// Real path: calls `CudaStream::alloc_zeros::<u8>(len * 8)` and stores
    /// the returned `CudaSlice<u8>`.  Falls back to a zero-filled CPU-shadow
    /// buffer when no real context is active or the device allocation fails.
    ///
    /// The returned handle is the index of the new entry in the backend's
    /// buffer list.
    pub fn create_buffer(&mut self, len: usize) -> CudaBufferHandle {
        self.push_buffer_entry(len, false)
    }

    /// Allocate a **unified memory** buffer (accessible from both CPU and GPU).
    ///
    /// In the current implementation unified memory is backed by the same
    /// `CudaSlice<u8>` path as a regular buffer; the only difference is that
    /// the entry is flagged as unified.  True UM page migration would require
    /// `UnifiedSlice` from cudarc which is gated on additional CUDA driver
    /// capabilities.  Falls back to a CPU-shadow buffer in the stub.
    pub fn alloc_unified(&mut self, len: usize) -> CudaBufferHandle {
        self.push_buffer_entry(len, true)
    }

    /// Shared allocation routine behind [`Self::create_buffer`] and
    /// [`Self::alloc_unified`].
    ///
    /// Tries a device allocation of `len * size_of::<f64>()` bytes when a real
    /// context is active; otherwise (or on failure) records a zero-filled CPU
    /// shadow of `len` elements.  The byte size is computed with
    /// `checked_mul`, so an element count whose byte size overflows `usize`
    /// is never turned into a wrapped, undersized device allocation.
    ///
    /// NOTE(review): when the device allocation fails while a real context is
    /// active, an entry is still appended to `self.buffers` but nothing is
    /// appended to the context's `real_buffers`.  Handles issued afterwards
    /// then no longer line up with the device-buffer indices that the
    /// read / write / launch paths use — fixing this needs a stored device
    /// index per entry.
    fn push_buffer_entry(&mut self, len: usize, unified: bool) -> CudaBufferHandle {
        let handle = CudaBufferHandle(self.buffers.len());

        #[cfg(feature = "cuda-backend")]
        let on_device = match (
            self.real.as_mut(),
            len.checked_mul(std::mem::size_of::<f64>()),
        ) {
            // If real allocation fails, degrade gracefully to CPU shadow.
            (Some(ctx), Some(byte_len)) => ctx.alloc_bytes(byte_len).is_ok(),
            _ => false,
        };
        #[cfg(not(feature = "cuda-backend"))]
        let on_device = false;

        // Device-resident buffers carry no CPU shadow.
        let shadow = if on_device { Vec::new() } else { vec![0.0; len] };
        self.buffers.push(CudaBufferEntry {
            len,
            shadow,
            unified,
        });
        handle
    }

    /// Copy `data` into the buffer identified by `handle`.
    ///
    /// Real path: forwards to the device context's host→device copy using
    /// `handle.0` as the device-buffer index; any error is discarded and the
    /// CPU shadow is not touched.
    ///
    /// Stub path: copies at most `entry.len` leading elements of `data` into
    /// the CPU shadow (growing the shadow to `entry.len` first if it is too
    /// short).  An unknown handle is silently ignored.
    pub fn write_buffer(&mut self, handle: CudaBufferHandle, data: &[f64]) {
        #[cfg(feature = "cuda-backend")]
        if let Some(ctx) = self.real.as_mut() {
            // Best-effort upload; failures are intentionally not reported.
            let _ = ctx.write_f64_slice(handle.0, data);
            return;
        }

        let Some(entry) = self.buffers.get_mut(handle.0) else {
            return;
        };
        let count = entry.len.min(data.len());
        if entry.shadow.len() < count {
            entry.shadow.resize(entry.len, 0.0);
        }
        entry.shadow[..count].copy_from_slice(&data[..count]);
    }

    /// Fetch the contents of the buffer identified by `handle`.
    ///
    /// Real path: downloads the device buffer at index `handle.0` into a new
    /// `Vec<f64>`; a failed download yields an empty vector.
    ///
    /// Stub path: returns a copy of the CPU shadow, or an empty vector for an
    /// unknown handle.
    pub fn read_buffer(&self, handle: CudaBufferHandle) -> Vec<f64> {
        #[cfg(feature = "cuda-backend")]
        if let Some(ctx) = self.real.as_ref() {
            return match ctx.read_f64_vec(handle.0) {
                Ok(values) => values,
                Err(_) => Vec::new(),
            };
        }

        match self.buffers.get(handle.0) {
            Some(entry) => entry.shadow.clone(),
            None => Vec::new(),
        }
    }

    // ── Kernel management ────────────────────────────────────────────────────

    /// Associate `name` with a PTX kernel source.
    ///
    /// Real path: asks the device context to load the PTX module and look up
    /// the function called `name`; a failure there is discarded.
    /// Stub path: the PTX text is ignored.
    ///
    /// In both paths `name` is then added to the list of known kernels unless
    /// it is already present.
    pub fn register_kernel(&mut self, name: &str, ptx_source: &str) {
        // Without CUDA support the source is deliberately unused.
        #[cfg(not(feature = "cuda-backend"))]
        let _ = ptx_source;

        #[cfg(feature = "cuda-backend")]
        if let Some(ctx) = self.real.as_mut() {
            let _ = ctx.register_ptx(name, ptx_source);
        }

        let already_known = self.kernels.iter().any(|known| known.as_str() == name);
        if !already_known {
            self.kernels.push(name.to_owned());
        }
    }

    /// Compile a CUDA C kernel at runtime and record it under `name`.
    ///
    /// Real path: the device context compiles the source via NVRTC and loads
    /// the resulting module; any error is returned and the name is **not**
    /// recorded.  Stub path (feature disabled or no live context): the source
    /// is ignored.
    ///
    /// On success `name` is added to the list of known kernels unless it is
    /// already present, and `Ok(())` is returned.
    pub fn compile_and_register(
        &mut self,
        name: &str,
        cuda_c_source: &str,
    ) -> Result<(), CudaInitError> {
        #[cfg(feature = "cuda-backend")]
        if let Some(ctx) = self.real.as_mut() {
            ctx.compile_and_register(name, cuda_c_source)?;
        }

        // Keeps the parameter "used" in builds where the branch above is absent.
        let _ = cuda_c_source;

        let already_known = self.kernels.iter().any(|known| known.as_str() == name);
        if !already_known {
            self.kernels.push(name.to_owned());
        }
        Ok(())
    }

    // ── Kernel launch ────────────────────────────────────────────────────────

    /// Launch a registered kernel whose parameters are device buffers only.
    ///
    /// This is [`Self::launch_with_scalars`] with empty scalar lists.
    ///
    /// # Parameters
    ///
    /// - `name` — kernel name as passed to [`Self::register_kernel`] or
    ///   [`Self::compile_and_register`]
    /// - `buffers` — buffer handles bound as kernel arguments (in order)
    /// - `grid_x` — number of thread blocks in X dimension
    /// - `block_x` — number of threads per block in X dimension
    ///
    /// Kernels that also take scalar parameters (e.g. a particle count or a
    /// smoothing length) must be launched through
    /// [`Self::launch_with_scalars`]: launching them from here leaves those
    /// parameter slots unset, which is undefined behaviour.
    ///
    /// Real path: at most two buffer handles are supported; with more than
    /// two the call returns without launching anything.
    ///
    /// Stub path: no-op.
    pub fn launch(&mut self, name: &str, buffers: &[CudaBufferHandle], grid_x: u32, block_x: u32) {
        let no_i32: &[i32] = &[];
        let no_f64: &[f64] = &[];
        self.launch_with_scalars(name, buffers, no_i32, no_f64, grid_x, block_x);
    }

    /// Launch a registered kernel passing buffer **and** scalar arguments.
    ///
    /// Scalars are appended to the kernel argument list after the buffer
    /// arguments in the order `i32` scalars then `f64` scalars; the kernel
    /// signature must match that ordering exactly.
    ///
    /// # Parameters
    ///
    /// - `name` — kernel name as passed to [`Self::register_kernel`] or
    ///   [`Self::compile_and_register`]
    /// - `buffers` — buffer handles bound as the leading kernel arguments
    /// - `scalars_i32` — `i32` scalars appended after the buffers
    /// - `scalars_f64` — `f64` scalars appended after the `i32` scalars
    /// - `grid_x` — number of thread blocks in X dimension
    /// - `block_x` — number of threads per block in X dimension
    ///
    /// # Silent no-ops (real path)
    ///
    /// The call returns without launching anything when:
    ///
    /// - no function is registered under `name`,
    /// - more than two buffer handles are supplied,
    /// - a handle index is outside the device-buffer list, or
    /// - the same handle appears twice.
    ///
    /// A launch error reported by the driver is also discarded.  The launch
    /// is configured with a 1-D grid / block and no dynamic shared memory.
    ///
    /// Stub path: no-op.
    pub fn launch_with_scalars(
        &mut self,
        name: &str,
        buffers: &[CudaBufferHandle],
        scalars_i32: &[i32],
        scalars_f64: &[f64],
        grid_x: u32,
        block_x: u32,
    ) {
        #[cfg(feature = "cuda-backend")]
        if let Some(ctx) = self.real.as_mut() {
            use cudarc::driver::{LaunchConfig, PushKernelArg};
            let cfg = LaunchConfig {
                grid_dim: (grid_x, 1, 1),
                block_dim: (block_x, 1, 1),
                shared_mem_bytes: 0,
            };
            // Clone the function handle so `ctx.functions` is not borrowed
            // while the launch builder is alive.
            let Some(func) = ctx.functions.get(name).cloned() else {
                return;
            };

            // Current support: up to two buffer arguments.  Validate indices
            // are in range and pairwise distinct (aliasing breaks the unsafe
            // split below).
            if buffers.len() > 2 {
                return;
            }
            for (i, h) in buffers.iter().enumerate() {
                if h.0 >= ctx.real_buffers.len() {
                    return;
                }
                for h2 in &buffers[i + 1..] {
                    if h.0 == h2.0 {
                        return;
                    }
                }
            }

            // Materialise the buffer references first (raw-pointer split),
            // then borrow ctx.stream immutably to build the launch.  The
            // raw-pointer derived references and ctx.stream live in disjoint
            // fields of ctx; the borrow checker cannot see this through the
            // pointer cast, so we rely on the manual validation above.
            let real_ptr = ctx.real_buffers.as_mut_ptr();
            // SAFETY: indices validated above; lifetimes do not outlive this
            // function and we do not call any &mut ctx.real_buffers method
            // between here and `.launch(cfg)`.
            let buf0 = buffers.first().map(|h| unsafe { &mut *real_ptr.add(h.0) });
            let buf1 = buffers.get(1).map(|h| unsafe { &mut *real_ptr.add(h.0) });

            // Argument order: buffers, then i32 scalars, then f64 scalars.
            let mut builder = ctx.stream.launch_builder(&func);
            if let Some(b) = buf0 {
                builder.arg(b);
            }
            if let Some(b) = buf1 {
                builder.arg(b);
            }
            for v in scalars_i32 {
                builder.arg(v);
            }
            for v in scalars_f64 {
                builder.arg(v);
            }
            // SAFETY (NOTE(review)): nothing here checks that the pushed
            // arguments match the kernel's parameter list in count, order and
            // type, nor that the grid covers the buffers safely; the caller
            // must guarantee both.  The launch result is deliberately dropped.
            let _ = unsafe { builder.launch(cfg) };
            return;
        }
        // Stub: no-op
        let _ = (name, buffers, scalars_i32, scalars_f64, grid_x, block_x);
    }

    /// Wait for all submitted device work to complete.
    ///
    /// Real path: synchronises the context's default stream; an error from
    /// the driver is discarded.
    /// Stub path: returns immediately.
    pub fn synchronize(&mut self) {
        #[cfg(feature = "cuda-backend")]
        {
            if let Some(ctx) = &self.real {
                let _ = ctx.synchronize();
            }
        }
    }

    // ── Device query ─────────────────────────────────────────────────────────

    /// Number of CUDA devices visible on this system.
    ///
    /// Real path: initialises the driver and asks it for the device count.
    /// Any failure — driver init error, count query error, or a panic raised
    /// while loading the CUDA library — is reported as `0`.
    /// Stub path: always returns `0`.
    pub fn device_count() -> u32 {
        #[cfg(not(feature = "cuda-backend"))]
        {
            0
        }
        #[cfg(feature = "cuda-backend")]
        {
            let probe = || -> u32 {
                if cudarc::driver::result::init().is_err() {
                    return 0;
                }
                match cudarc::driver::result::device::get_count() {
                    Ok(n) => n as u32,
                    Err(_) => 0,
                }
            };
            // cudarc panics on dlopen failure with dynamic-loading; catch it.
            match std::panic::catch_unwind(probe) {
                Ok(count) => count,
                Err(_) => 0,
            }
        }
    }

    /// Query device attributes for device `ordinal` without creating a backend.
    ///
    /// Stub path: always returns `Err(CudaInitError::NotAvailable)`.
    ///
    /// Real path: returns a [`CudaDeviceInfo`] with `ordinal`, `name` and
    /// `total_mem_bytes` filled in (`"unknown"` / `0` when the respective
    /// query fails); all other fields keep their default values.
    ///
    /// # Errors (real path)
    ///
    /// - [`CudaInitError::NotAvailable`] if loading the CUDA library panics
    ///   (no driver installed), matching the behaviour of the real
    ///   constructor and [`Self::device_count`].
    /// - [`CudaInitError::DeviceError`] if driver initialisation fails.
    /// - [`CudaInitError::DeviceOrdinalOutOfRange`] if `ordinal` does not fit
    ///   the driver's signed index type or the device lookup fails.
    pub fn query_device_info(ordinal: u32) -> Result<CudaDeviceInfo, CudaInitError> {
        #[cfg(feature = "cuda-backend")]
        {
            use cudarc::driver::result;

            // cudarc with dynamic loading panics when the CUDA shared library
            // is missing; turn that into an error instead of unwinding into
            // the caller.
            match std::panic::catch_unwind(result::init) {
                Ok(Ok(())) => {}
                Ok(Err(e)) => return Err(CudaInitError::DeviceError(format!("{e:?}"))),
                Err(_payload) => return Err(CudaInitError::NotAvailable),
            }

            // Reject ordinals that would wrap to a negative index.
            let index = i32::try_from(ordinal)
                .map_err(|_| CudaInitError::DeviceOrdinalOutOfRange(ordinal))?;
            let dev = result::device::get(index)
                .map_err(|_| CudaInitError::DeviceOrdinalOutOfRange(ordinal))?;

            let name = result::device::get_name(dev).unwrap_or_else(|_| "unknown".to_owned());
            // SAFETY: `dev` was returned by `result::device::get` above, so it
            // is a valid device handle for this query.
            let total_mem = unsafe { result::device::total_mem(dev) }.unwrap_or(0);

            Ok(CudaDeviceInfo {
                ordinal,
                name,
                total_mem_bytes: total_mem as u64,
                ..Default::default()
            })
        }
        #[cfg(not(feature = "cuda-backend"))]
        {
            let _ = ordinal;
            Err(CudaInitError::NotAvailable)
        }
    }
}

// ── Real-path constructor (feature-gated) ─────────────────────────────────────

#[cfg(feature = "cuda-backend")]
impl CudaBackend {
    /// Build a live CUDA backend bound to device `ordinal` using cudarc 0.19.
    ///
    /// Called by [`CudaBackend::try_new`] when the `cuda-backend` feature is
    /// active.
    ///
    /// # Errors
    ///
    /// * [`CudaInitError::NotAvailable`] if cudarc panics while loading the
    ///   CUDA driver library.
    /// * [`CudaInitError::DeviceError`] if driver initialisation reports an
    ///   error.
    /// * [`CudaInitError::DeviceOrdinalOutOfRange`] if the driver cannot
    ///   return a device for `ordinal`.
    /// * Whatever `real_ctx::CudaRealContext::new` returns on failure.
    fn try_new_real(ordinal: u32) -> Result<Self, CudaInitError> {
        use cudarc::driver::result;

        // With `dynamic-loading`, cudarc does not hand back an error when the
        // CUDA shared library is missing (macOS, hosts without an NVIDIA
        // driver): it **panics** during dlopen.  Trap the panic here so the
        // caller gets a plain `Err` instead of an unwinding test process.
        let init_outcome = match std::panic::catch_unwind(result::init) {
            Ok(outcome) => outcome,
            // cudarc panicked during dlopen — CUDA driver not present.
            Err(_panic) => return Err(CudaInitError::NotAvailable),
        };
        if let Err(e) = init_outcome {
            return Err(CudaInitError::DeviceError(format!("{e:?}")));
        }

        let dev = match result::device::get(ordinal as i32) {
            Ok(dev) => dev,
            Err(_) => return Err(CudaInitError::DeviceOrdinalOutOfRange(ordinal)),
        };

        // Collect the descriptive fields first; both queries are best-effort
        // and fall back to placeholder values when the driver refuses them.
        let name = result::device::get_name(dev).unwrap_or_else(|_| String::from("unknown"));
        // SAFETY: `dev` was returned by `result::device::get`, fulfilling the contract.
        let total_mem_bytes = unsafe { result::device::total_mem(dev) }.unwrap_or(0) as u64;

        let real = real_ctx::CudaRealContext::new(ordinal)?;

        let device_info = CudaDeviceInfo {
            ordinal,
            name,
            total_mem_bytes,
            ..Default::default()
        };

        Ok(Self {
            device_info,
            available: true,
            buffers: Vec::new(),
            kernels: Vec::new(),
            real: Some(real),
        })
    }
}

// ── Debug impl ────────────────────────────────────────────────────────────────

impl std::fmt::Debug for CudaBackend {
    /// Compact diagnostic view: the device name, the availability flag, and
    /// the *number* of registered buffers and kernels (not their contents).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let buffer_count = self.buffers.len();
        let kernel_count = self.kernels.len();

        let mut view = f.debug_struct("CudaBackend");
        view.field("device", &self.device_info.name);
        view.field("available", &self.available);
        view.field("buffers", &buffer_count);
        view.field("kernels", &kernel_count);
        view.finish()
    }
}

// ── tests ─────────────────────────────────────────────────────────────────────

#[cfg(test)]
mod tests {
    use super::*;

    /// `try_new(0)` must never panic, and its outcome must match the build
    /// configuration and the host:
    ///
    /// * feature off → `Err(FeatureNotEnabled)`;
    /// * feature on, no CUDA driver → some `Err` (`NotAvailable` / `DeviceError`);
    /// * feature on, working driver with at least one device → `Ok`, and the
    ///   backend reports itself as available.
    #[test]
    fn test_try_new_behaviour() {
        let outcome = CudaBackend::try_new(0);

        #[cfg(not(feature = "cuda-backend"))]
        {
            assert!(matches!(outcome, Err(CudaInitError::FeatureNotEnabled)));
        }
        #[cfg(feature = "cuda-backend")]
        {
            // An `Err` only means there is no CUDA driver / device on this
            // machine, which is an acceptable outcome.
            if let Ok(backend) = outcome {
                assert!(backend.is_available());
            }
        }
    }

    /// Data written to a stub buffer is read back unchanged.
    #[test]
    fn test_stub_backend_buffer_roundtrip() {
        let payload: Vec<f64> = (1..=8_u8).map(f64::from).collect();

        let mut backend = CudaBackend::new_stub();
        let handle = backend.create_buffer(8);
        backend.write_buffer(handle, &payload);

        assert_eq!(backend.read_buffer(handle), payload);
    }

    /// Registering a kernel on the stub records exactly one entry under its name.
    #[test]
    fn test_stub_kernel_registration() {
        let mut backend = CudaBackend::new_stub();
        backend.register_kernel("sph_density", PTX_SPH_DENSITY);

        assert_eq!(backend.kernels.len(), 1);
        assert_eq!(backend.kernels[0], "sph_density");
    }

    /// A unified allocation round-trips data and is flagged as unified.
    #[test]
    fn test_stub_unified_alloc() {
        use std::f64::consts::PI;

        let mut backend = CudaBackend::new_stub();
        let handle = backend.alloc_unified(16);
        backend.write_buffer(handle, &[PI; 16]);

        let readback = backend.read_buffer(handle);
        assert!((readback[0] - PI).abs() < 1e-10);
        // The buffer metadata must carry the unified marker.
        assert!(backend.buffers[handle.0].unified);
    }

    /// `device_count` must not panic.  Without the `cuda-backend` feature it
    /// is always 0; with the feature it mirrors the host (0 when there is no
    /// CUDA driver, >=1 when CUDA devices are present).
    #[test]
    fn test_device_count_environment_consistent() {
        let reported = CudaBackend::device_count();

        #[cfg(not(feature = "cuda-backend"))]
        {
            assert_eq!(reported, 0);
        }
        #[cfg(feature = "cuda-backend")]
        {
            // Exercising the call is the whole test; any value is acceptable.
            let _ = reported;
        }
    }

    /// `compile_and_register` succeeds on the stub and records the kernel name.
    #[test]
    fn test_compile_and_register() {
        let mut backend = CudaBackend::new_stub();

        assert!(backend
            .compile_and_register("scan", PTX_PARALLEL_SCAN)
            .is_ok());
        assert_eq!(backend.kernels[0], "scan");
    }

    /// The `Display` text of a compilation error includes the wrapped message.
    #[test]
    fn test_error_display() {
        let err = CudaInitError::CompilationError("undefined symbol 'foo'".into());
        let rendered = err.to_string();
        assert!(rendered.contains("foo"));
    }

    /// The bundled CUDA C source is present and names the expected kernel.
    #[test]
    fn test_cuda_sph_density_src_not_empty() {
        let src = CUDA_SPH_DENSITY_SRC;
        assert!(!src.is_empty());
        assert!(src.contains("sph_density_kernel"));
    }

    /// Whatever the feature flags or hardware, `try_new(0)` must not panic.
    #[test]
    fn test_try_new_no_panic() {
        let _ = CudaBackend::try_new(0);
    }
}