// rustsim 0.0.1 - Docs.rs

//! Compute backend detection and GPU-accelerated batch stepping.
//!
//! The CUDA accelerator works on SoA (Structure-of-Arrays) buffers extracted
//! from the ABM agent store. The flow is:
//!
//! 1. Extract agent fields into flat `Vec<f32>` columns via `SoaExtractable`
//! 2. Upload columns to GPU device memory
//! 3. Launch a user-provided PTX kernel that processes all agents in parallel
//! 4. Download results back to host
//! 5. Write columns back into the agent store
//!
//! When CUDA is unavailable (no `cuda` feature or no device), the same SoA
//! buffers are processed on CPU via a user-provided closure.
//!
//! # Determinism and backend selection
//!
//! `cpu_batch_step` is replayable when:
//! - SoA extraction order is deterministic for the chosen store and workload
//! - the supplied CPU kernel is itself deterministic
//!
//! `auto_batch_step` and `auto_device_step` do **not** guarantee a fixed backend
//! across machines or runs, because backend selection depends on:
//! - compile-time `cuda` support
//! - runtime device availability
//! - the `RUSTSIM_BACKEND` environment variable
//! - CUDA failure fallback to CPU
//!
//! Exact bitwise equivalence between CPU and CUDA results is not guaranteed.
//! Floating-point behavior, execution order, and kernel implementation details
//! may differ across backends.
//!
//! # CUDA safety and failure surfaces
//!
//! The `unsafe` operations in this module are CUDA kernel launches and pinned
//! host allocations (`alloc_pinned`), both via `cudarc`. The kernel launches
//! rely on the following invariants:
//! - `block_size > 0`
//! - the PTX kernel signature matches the launched argument tuple
//! - each device buffer points to a valid uploaded SoA column
//! - the kernel performs bounds checks for `idx < n`
//! - the kernel does not read or write out of bounds
//!
//! Failure surfaces are explicit `Err(String)` results from:
//! - CUDA device initialization
//! - PTX load/module lookup
//! - host-to-device transfer
//! - invalid launch configuration such as `block_size == 0`
//! - unsupported SoA arity outside `1..=8`
//! - kernel launch / synchronization
//! - device-to-host transfer
//!
//! `auto_batch_step` and `auto_device_step` treat those CUDA errors as runtime
//! fallback triggers and continue on CPU.
//!
//! # Persistent Device Store
//!
//! For multi-step runs, use [`DeviceSoaStore`](crate::device_store::DeviceSoaStore)
//! to avoid per-step SoA extraction overhead. This mirrors FlameGPU2's design
//! where agent data lives on the GPU across steps.

/// Compute backend a batch step can run on.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ComputeBackend {
    /// Host (CPU) execution; displayed as `CPU`.
    Cpu,
    /// CUDA execution; displayed as `CUDA`.
    Cuda,
}

impl std::fmt::Display for ComputeBackend {
    /// Writes the upper-case backend label (`CPU` or `CUDA`).
    ///
    /// The label is written verbatim: width, fill and alignment flags of the
    /// formatter are not applied.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = match *self {
            Self::Cpu => "CPU",
            Self::Cuda => "CUDA",
        };
        f.write_str(label)
    }
}

/// Decide which compute backend to report for this process.
///
/// # Backend selection precedence
///
/// The checks run in this fixed order; the first one that matches wins:
///
/// 1. **`RUSTSIM_BACKEND` environment variable** (compared case-insensitively):
///    - `cuda` / `gpu` → [`ComputeBackend::Cuda`]
///    - `cpu` → [`ComputeBackend::Cpu`]
///    - unset, non-Unicode, or any other value → ignored, detection continues.
///
///    A forced `cuda` / `gpu` is returned without probing for a device.
/// 2. **`cuda` feature enabled** (default): try to create a CUDA context on
///    device 0 via `crate::cuda_context::new_context`; on success return
///    [`ComputeBackend::Cuda`]. Hosts where context creation fails fall
///    through to step 4.
/// 3. **`cuda` feature disabled** (built with `--no-default-features`): run
///    `nvidia-smi --query-gpu=name --format=csv,noheader` from `PATH`; if it
///    exits successfully return [`ComputeBackend::Cuda`]. In this mode the
///    `Cuda` variant is advisory only — the CUDA call sites are feature-gated
///    out, so `auto_batch_step` / `auto_device_step` still run on the CPU.
/// 4. **Otherwise**: return [`ComputeBackend::Cpu`].
///
/// Callers who need a specific backend can call [`cpu_batch_step`] or
/// `cuda_batch_step` directly. Backend selection is **not** a determinism
/// guarantee — see
/// [`docs/determinism.md`](https://github.com/rustsim/rustsim/blob/main/docs/determinism.md).
pub fn detect_backend() -> ComputeBackend {
    // Step 1: explicit override. Anything unrecognized falls through to probing.
    let requested = std::env::var("RUSTSIM_BACKEND")
        .ok()
        .map(|raw| raw.to_lowercase());
    match requested.as_deref() {
        Some("cuda" | "gpu") => {
            tracing::info!(backend = "CUDA", "backend override via RUSTSIM_BACKEND");
            return ComputeBackend::Cuda;
        }
        Some("cpu") => {
            tracing::info!(backend = "CPU", "backend override via RUSTSIM_BACKEND");
            return ComputeBackend::Cpu;
        }
        _ => {}
    }

    // Step 2: with CUDA compiled in, a successfully created context is the probe.
    #[cfg(feature = "cuda")]
    {
        let context_ok = crate::cuda_context::new_context(0).is_ok();
        if context_ok {
            tracing::info!("CUDA device detected");
            return ComputeBackend::Cuda;
        }
    }

    // Step 3: without CUDA compiled in, ask the driver tooling instead.
    #[cfg(not(feature = "cuda"))]
    {
        let smi_ok = std::process::Command::new("nvidia-smi")
            .args(["--query-gpu=name", "--format=csv,noheader"])
            .output()
            .map(|out| out.status.success())
            .unwrap_or(false);
        if smi_ok {
            tracing::info!("CUDA device detected via nvidia-smi");
            return ComputeBackend::Cuda;
        }
    }

    // Step 4: nothing matched.
    tracing::debug!("no CUDA device found, using CPU");
    ComputeBackend::Cpu
}

// ---------------------------------------------------------------------------
// GPU Accelerator
// ---------------------------------------------------------------------------

use rustsim_core::soa::{self, SoaExtractable, SoaExtractableF64};
use rustsim_core::store::AgentStore;

/// Outcome of one batch step, whichever backend executed it.
#[derive(Debug)]
pub struct AccelStepResult {
    /// Backend that ran the step.
    pub backend: ComputeBackend,
    /// How many agents the step covered.
    pub agent_count: usize,
    /// Wall-clock microseconds measured around the kernel / CPU work
    /// (SoA extraction and write-back are not included).
    pub kernel_us: u128,
}

impl AccelStepResult {
    /// `kernel_us` expressed in milliseconds.
    pub fn kernel_ms(&self) -> f64 {
        let micros = self.kernel_us as f64;
        micros / 1_000.0
    }

    /// Approximate throughput in agents per second of kernel time.
    ///
    /// Returns `0.0` when `kernel_us` is zero, so the result is never a
    /// division by zero.
    pub fn agents_per_second(&self) -> f64 {
        match self.kernel_us {
            0 => 0.0,
            micros => {
                let seconds = micros as f64 / 1_000_000.0;
                self.agent_count as f64 / seconds
            }
        }
    }
}

/// Run one batch step on the CPU over `f32` SoA columns.
///
/// The store is extracted into SoA columns, `kernel(columns, n)` is called
/// once to update them in place, and the columns are written back to the
/// store. `n` is the number of extracted agent ids; each `columns[c]` is
/// expected to hold one value per agent.
///
/// The returned [`AccelStepResult`] always reports [`ComputeBackend::Cpu`];
/// its `kernel_us` covers only the `kernel` call.
pub fn cpu_batch_step<A, S, F>(store: &S, mut kernel: F) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f32>], usize),
{
    let (agent_ids, mut cols) = soa::extract_soa::<A, S>(store);
    let agent_count = agent_ids.len();

    // Only the kernel call sits inside the timed region.
    let started = std::time::Instant::now();
    kernel(&mut cols, agent_count);
    let kernel_us = started.elapsed().as_micros();

    soa::write_back_soa::<A, S>(store, &agent_ids, &cols);

    let outcome = AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us,
    };

    tracing::debug!(
        backend = "CPU",
        agents = agent_count,
        kernel_us,
        "cpu_batch_step completed"
    );

    outcome
}

/// Run one batch step on the CPU over **`f64`** SoA columns.
///
/// Double-precision counterpart of [`cpu_batch_step`]: extraction, the kernel
/// and write-back all operate on `f64`, so no narrowing to `f32` happens along
/// the way. Prefer it when `f32` rounding is unacceptable — e.g. long-horizon
/// integrators, stiff dynamics, or scientific workloads.
///
/// `kernel(columns, n)` is called once; `n` is the number of extracted agent
/// ids and each `columns[c]` is a `Vec<f64>` expected to hold one value per
/// agent. `kernel_us` in the result covers only the `kernel` call.
pub fn cpu_batch_step_f64<A, S, F>(store: &S, mut kernel: F) -> AccelStepResult
where
    A: SoaExtractableF64,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f64>], usize),
{
    let (agent_ids, mut cols) = soa::extract_soa_f64::<A, S>(store);
    let agent_count = agent_ids.len();

    // Only the kernel call sits inside the timed region.
    let started = std::time::Instant::now();
    kernel(&mut cols, agent_count);
    let kernel_us = started.elapsed().as_micros();

    soa::write_back_soa_f64::<A, S>(store, &agent_ids, &cols);

    let outcome = AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us,
    };

    tracing::debug!(
        backend = "CPU",
        precision = "f64",
        agents = agent_count,
        kernel_us,
        "cpu_batch_step_f64 completed"
    );

    outcome
}

/// Chunked, parallel CPU batch step over `f32` SoA columns.
///
/// Only compiled with the `rayon` feature. The store is extracted into SoA
/// columns, every column is borrowed as a mutable slice, and the slices are
/// handed to `crate::parallel::par_apply_chunks_multi` together with
/// `chunk_size` and `kernel`. Afterwards the columns are written back.
///
/// `kernel(chunk_start, columns)` receives, for one chunk, a mutable sub-slice
/// of every column covering the same `[chunk_start, chunk_start + chunk_len)`
/// index range. Chunks are disjoint, so kernel invocations do not alias each
/// other.
///
/// Worth using when per-agent work is non-trivial; for very small kernels the
/// serial [`cpu_batch_step`] may be faster due to lower overhead.
///
/// `kernel_us` in the result covers building the slice views and the parallel
/// apply call.
#[cfg(feature = "rayon")]
pub fn par_batch_step<A, S, F>(store: &S, chunk_size: usize, kernel: F) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: Fn(usize, &mut [&mut [f32]]) + Send + Sync,
{
    let (agent_ids, mut cols) = soa::extract_soa::<A, S>(store);
    let agent_count = agent_ids.len();

    let started = std::time::Instant::now();
    {
        // The views mutably borrow `cols`; the scope ends that borrow before
        // the write-back below reads the columns again.
        let mut views: Vec<&mut [f32]> = cols.iter_mut().map(Vec::as_mut_slice).collect();
        crate::parallel::par_apply_chunks_multi(&mut views, chunk_size, kernel);
    }
    let kernel_us = started.elapsed().as_micros();

    soa::write_back_soa::<A, S>(store, &agent_ids, &cols);

    let outcome = AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us,
    };

    tracing::debug!(
        backend = "CPU",
        parallel = true,
        chunk_size,
        agents = agent_count,
        kernel_us,
        "par_batch_step completed"
    );

    outcome
}

/// CUDA batch step over SoA columns.
///
/// Extracts the agent columns, uploads them to the GPU on the context's
/// default stream, launches the named kernel from the provided PTX source,
/// synchronizes, downloads the columns and writes them back into the store.
///
/// Every extracted column becomes one kernel argument, in order, followed by
/// the row count: `(col_0, col_1, …, col_{k-1}, n)` with `n` passed as `u32`.
/// The launch uses a 1-D grid of `ceil(n / block_size)` blocks.
///
/// An empty store short-circuits to `Ok` with `agent_count == 0` after the
/// context has been created; no PTX is loaded and nothing is launched.
///
/// The store is written only after every download has succeeded, so an `Err`
/// leaves the store untouched.
///
/// `kernel_us` in the result covers the kernel launch through stream
/// synchronization.
///
/// # Errors
///
/// Returns `Err(String)` for:
/// - `block_size == 0`
/// - CUDA context initialization failure
/// - an agent count that does not fit in the `u32` row count passed to the
///   kernel
/// - PTX load or kernel lookup failure
/// - host/device transfer failures
/// - kernel launch or stream synchronization failures
///
/// # Safety requirements for the PTX kernel
/// - the kernel signature must match the launched argument list
/// - the kernel must bounds-check against `n`
/// - the kernel must not read or write outside the provided column buffers
///
/// # Arguments
/// - `store` -- the agent store to extract from / write back to
/// - `ptx_source` -- PTX source string (compile your `.cu` to PTX offline or embed it)
/// - `module_name` -- name for the loaded module (unused with cudarc 0.19,
///   kept for source-compatibility with the previous API)
/// - `kernel_name` -- the `__global__` function name inside the PTX
/// - `block_size` -- CUDA threads per block (e.g. 256)
#[cfg(feature = "cuda")]
pub fn cuda_batch_step<A, S>(
    store: &S,
    ptx_source: &str,
    _module_name: &str,
    kernel_name: &str,
    block_size: u32,
) -> Result<AccelStepResult, String>
where
    A: SoaExtractable,
    S: AgentStore<A>,
{
    use cudarc::driver::{LaunchConfig, PushKernelArg};

    if block_size == 0 {
        return Err("block_size must be positive".to_string());
    }

    // CudaContext owns the device; streams are scheduled off it.
    let ctx = crate::cuda_context::new_context(0)?;
    let stream = ctx.default_stream();

    // Extract SoA
    let (ids, mut columns) = soa::extract_soa::<A, S>(store);
    let n = ids.len();
    if n == 0 {
        return Ok(AccelStepResult {
            backend: ComputeBackend::Cuda,
            agent_count: 0,
            kernel_us: 0,
        });
    }

    // The kernel receives the row count as a 32-bit value. A truncating cast
    // would make it process only a prefix of the agents while the store is
    // still overwritten, so reject oversized populations before any upload.
    let n_u32 = u32::try_from(n).map_err(|_| {
        format!("agent count {n} does not fit in the u32 row count passed to the kernel")
    })?;

    // Compile/load PTX module and look up the kernel.
    let ptx = cudarc::nvrtc::Ptx::from_src(ptx_source);
    let module = ctx
        .load_module(ptx)
        .map_err(|e| format!("PTX load failed: {e}"))?;
    let func = module
        .load_function(kernel_name)
        .map_err(|e| format!("kernel '{kernel_name}' not found: {e}"))?;

    // Upload columns to device on the compute stream.
    let mut d_columns = Vec::with_capacity(columns.len());
    for col in &columns {
        let d_col = stream
            .clone_htod(col.as_slice())
            .map_err(|e| format!("htod failed: {e}"))?;
        d_columns.push(d_col);
    }

    // Build launch config: one thread per agent, rounded up to whole blocks.
    // `block_size > 0` was checked above, so the division is well-defined.
    let cfg = LaunchConfig {
        grid_dim: (n_u32.div_ceil(block_size), 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let t0 = std::time::Instant::now();

    // SAFETY:
    // - each `d_columns[i]` is a valid device buffer allocated on `stream`
    // - `n_u32` is the logical row count passed to the kernel for bounds checking
    //   and is exactly the extracted agent count (checked conversion above)
    // - callers are responsible for providing a PTX kernel whose signature matches
    //   the argument list built below and whose implementation does not access out
    //   of bounds
    // - `block_size > 0` has been validated above
    // - the `launch_builder` / `launch` pair schedules work on `stream` only, and
    //   `stream.synchronize()` is called below before any host-visible read
    unsafe {
        let mut builder = stream.launch_builder(&func);
        for d in d_columns.iter_mut() {
            builder.arg(d);
        }
        builder.arg(&n_u32);
        builder
            .launch(cfg)
            .map_err(|e| format!("kernel launch failed: {e}"))?;
    }

    stream
        .synchronize()
        .map_err(|e| format!("stream sync failed: {e}"))?;
    let kernel_us = t0.elapsed().as_micros();

    // Download results: device column i goes back into host column i.
    for (d_col, host_col) in d_columns.iter().zip(columns.iter_mut()) {
        stream
            .memcpy_dtoh(d_col, host_col)
            .map_err(|e| format!("dtoh failed: {e}"))?;
    }

    // Write back
    soa::write_back_soa::<A, S>(store, &ids, &columns);

    Ok(AccelStepResult {
        backend: ComputeBackend::Cuda,
        agent_count: n,
        kernel_us,
    })
}

/// CUDA batch step over SoA columns using **pinned host memory and dedicated
/// non-default CUDA streams** for host/device transfer overlap.
///
/// Same contract as [`cuda_batch_step`] but:
/// - SoA columns are staged through page-locked (pinned) host buffers
///   allocated via [`cudarc::driver::CudaContext::alloc_pinned`], letting the
///   driver issue truly asynchronous `memcpy_htod` / `memcpy_dtoh`.
/// - Host-to-device uploads and device-to-host downloads run on a dedicated
///   *copy* stream, while the kernel launch runs on a dedicated *compute*
///   stream; the two are serialized via
///   [`cudarc::driver::CudaStream::join`]. This is the standard CUDA pattern
///   for overlapping transfer and compute across successive steps.
///
/// A single invocation still runs in-order on the host timeline; the benefit
/// materializes when multiple kernels are scheduled back-to-back and the
/// driver is free to overlap the download of step *N* with the upload of
/// step *N+1*. For persistent device-side data use
/// [`crate::device_store::DeviceSoaStore::step_cuda_pinned`].
///
/// `kernel_us` in the result covers the kernel launch, the device-to-host
/// copies and the final copy-stream synchronization.
///
/// The store is written only after the pinned buffers have been read back
/// successfully, so an `Err` leaves the store untouched.
///
/// # Errors
///
/// Returns `Err(String)` for the same conditions as [`cuda_batch_step`] —
/// including an agent count that does not fit in the `u32` row count passed
/// to the kernel — plus stream-creation, stream-join, pinned-allocation and
/// pinned-access failures.
///
/// # Safety requirements for the PTX kernel
/// - the kernel signature must match the launched argument list
/// - the kernel must bounds-check against `n`
/// - the kernel must not read or write outside the provided column buffers
///
/// # Arguments
/// Same as [`cuda_batch_step`].
#[cfg(feature = "cuda")]
pub fn cuda_batch_step_pinned<A, S>(
    store: &S,
    ptx_source: &str,
    _module_name: &str,
    kernel_name: &str,
    block_size: u32,
) -> Result<AccelStepResult, String>
where
    A: SoaExtractable,
    S: AgentStore<A>,
{
    use cudarc::driver::{LaunchConfig, PushKernelArg};

    if block_size == 0 {
        return Err("block_size must be positive".to_string());
    }

    let ctx = crate::cuda_context::new_context(0)?;
    let copy_stream = ctx
        .new_stream()
        .map_err(|e| format!("copy stream init failed: {e}"))?;
    let compute_stream = ctx
        .new_stream()
        .map_err(|e| format!("compute stream init failed: {e}"))?;

    let (ids, mut columns) = soa::extract_soa::<A, S>(store);
    let n = ids.len();
    if n == 0 {
        return Ok(AccelStepResult {
            backend: ComputeBackend::Cuda,
            agent_count: 0,
            kernel_us: 0,
        });
    }

    // The kernel receives the row count as a 32-bit value. A truncating cast
    // would make it process only a prefix of the agents while the store is
    // still overwritten, so reject oversized populations before any allocation.
    let n_u32 = u32::try_from(n).map_err(|_| {
        format!("agent count {n} does not fit in the u32 row count passed to the kernel")
    })?;

    let ptx = cudarc::nvrtc::Ptx::from_src(ptx_source);
    let module = ctx
        .load_module(ptx)
        .map_err(|e| format!("PTX load failed: {e}"))?;
    let func = module
        .load_function(kernel_name)
        .map_err(|e| format!("kernel '{kernel_name}' not found: {e}"))?;

    // Allocate pinned host staging and fill from extracted SoA columns.
    let mut pinned: Vec<cudarc::driver::PinnedHostSlice<f32>> = Vec::with_capacity(columns.len());
    for col in &columns {
        // SAFETY: `alloc_pinned` returns uninitialized pinned host memory.
        // We immediately fill the entire slice via `copy_from_slice` before
        // any read, so no uninitialized byte is ever observed.
        let mut p = unsafe { ctx.alloc_pinned::<f32>(col.len()) }
            .map_err(|e| format!("pinned alloc failed: {e}"))?;
        p.as_mut_slice()
            .map_err(|e| format!("pinned access failed: {e}"))?
            .copy_from_slice(col);
        pinned.push(p);
    }

    // Host -> device on the copy stream.
    let mut d_columns: Vec<cudarc::driver::CudaSlice<f32>> = Vec::with_capacity(pinned.len());
    for p in &pinned {
        let d = copy_stream
            .clone_htod(p)
            .map_err(|e| format!("htod failed: {e}"))?;
        d_columns.push(d);
    }

    // The kernel must not start before the uploads have completed.
    compute_stream
        .join(&copy_stream)
        .map_err(|e| format!("compute.join(copy) failed: {e}"))?;

    // One thread per agent, rounded up to whole blocks. `block_size > 0` was
    // checked above, so the division is well-defined.
    let cfg = LaunchConfig {
        grid_dim: (n_u32.div_ceil(block_size), 1, 1),
        block_dim: (block_size, 1, 1),
        shared_mem_bytes: 0,
    };

    let t0 = std::time::Instant::now();

    // SAFETY:
    // - each `d_columns[i]` is a valid device buffer allocated on `copy_stream`
    //   and made visible to `compute_stream` via `compute_stream.join(&copy_stream)`
    // - `n_u32` is the logical row count passed to the kernel for bounds checking
    //   and is exactly the extracted agent count (checked conversion above)
    // - callers are responsible for providing a PTX kernel whose signature matches
    //   the argument list built below and whose implementation does not access out
    //   of bounds
    // - `block_size > 0` has been validated above
    // - work is scheduled on `compute_stream`; the copy stream waits on it via
    //   `copy_stream.join(&compute_stream)` below before issuing dtoh, and the
    //   copy stream is then synchronized before any host-visible read
    unsafe {
        let mut builder = compute_stream.launch_builder(&func);
        for d in d_columns.iter_mut() {
            builder.arg(d);
        }
        builder.arg(&n_u32);
        builder
            .launch(cfg)
            .map_err(|e| format!("kernel launch failed: {e}"))?;
    }

    // Downloads must not start before the kernel has finished.
    copy_stream
        .join(&compute_stream)
        .map_err(|e| format!("copy.join(compute) failed: {e}"))?;

    // Device column i goes back into pinned staging buffer i.
    for (d_col, staging) in d_columns.iter().zip(pinned.iter_mut()) {
        copy_stream
            .memcpy_dtoh(d_col, staging)
            .map_err(|e| format!("dtoh failed: {e}"))?;
    }

    copy_stream
        .synchronize()
        .map_err(|e| format!("stream sync failed: {e}"))?;
    let kernel_us = t0.elapsed().as_micros();

    // Copy the staged results into the host columns used for write-back.
    for (host_col, staging) in columns.iter_mut().zip(pinned.iter()) {
        host_col.copy_from_slice(
            staging
                .as_slice()
                .map_err(|e| format!("pinned readback failed: {e}"))?,
        );
    }

    soa::write_back_soa::<A, S>(store, &ids, &columns);

    Ok(AccelStepResult {
        backend: ComputeBackend::Cuda,
        agent_count: n,
        kernel_us,
    })
}

/// Run a batch step on CUDA when possible, otherwise on the CPU.
///
/// With the `cuda` feature enabled, [`detect_backend`] is consulted; if it
/// reports [`ComputeBackend::Cuda`] the step is attempted via
/// `cuda_batch_step`. In every other case `cpu_kernel` is run through
/// [`cpu_batch_step`].
///
/// A CUDA attempt that returns an error (device init, PTX load, launch,
/// synchronization, transfer, or invalid CUDA configuration) is logged as a
/// warning and the step is then run on the CPU instead.
///
/// The `ptx_source`, `module_name`, `kernel_name` and `block_size` parameters
/// exist only when the `cuda` feature is enabled and are forwarded unchanged
/// to `cuda_batch_step`.
pub fn auto_batch_step<A, S, F>(
    store: &S,
    cpu_kernel: F,
    #[cfg(feature = "cuda")] ptx_source: &str,
    #[cfg(feature = "cuda")] module_name: &str,
    #[cfg(feature = "cuda")] kernel_name: &str,
    #[cfg(feature = "cuda")] block_size: u32,
) -> AccelStepResult
where
    A: SoaExtractable,
    S: AgentStore<A>,
    F: FnMut(&mut [Vec<f32>], usize),
{
    #[cfg(feature = "cuda")]
    {
        if detect_backend() == ComputeBackend::Cuda {
            // Success returns immediately; only a failure reaches the warning.
            let failure = match cuda_batch_step::<A, S>(
                store,
                ptx_source,
                module_name,
                kernel_name,
                block_size,
            ) {
                Ok(done) => return done,
                Err(err) => err,
            };
            tracing::warn!(error = %failure, "CUDA batch step failed, falling back to CPU");
        }
    }

    cpu_batch_step::<A, S, F>(store, cpu_kernel)
}

/// Step a [`DeviceSoaStore`](crate::device_store::DeviceSoaStore) using CUDA or CPU.
///
/// Unlike `auto_batch_step`, this operates on persistent SoA storage, so no
/// extract/write-back cycle is performed here.
///
/// With the `cuda` feature enabled and [`detect_backend`] reporting
/// [`ComputeBackend::Cuda`], the step is attempted via `device.step_cuda`.
/// If that returns an error, a warning is logged and the step is run through
/// `device.step_cpu` with `cpu_kernel` instead. Without the feature, or when
/// another backend is detected, the CPU path is taken directly.
///
/// The result carries the backend that completed the step, the device's agent
/// count, and the kernel time in microseconds reported by that backend's step
/// call.
///
/// The `ptx_source`, `module_name`, `kernel_name` and `block_size` parameters
/// exist only when the `cuda` feature is enabled.
pub fn auto_device_step(
    device: &mut crate::device_store::DeviceSoaStore,
    mut cpu_kernel: impl FnMut(&mut [Vec<f32>], usize),
    #[cfg(feature = "cuda")] ptx_source: &str,
    #[cfg(feature = "cuda")] module_name: &str,
    #[cfg(feature = "cuda")] kernel_name: &str,
    #[cfg(feature = "cuda")] block_size: u32,
) -> AccelStepResult {
    #[cfg(feature = "cuda")]
    {
        if detect_backend() == ComputeBackend::Cuda {
            // Success returns immediately; only a failure reaches the warning.
            let failure = match device.step_cuda(ptx_source, module_name, kernel_name, block_size)
            {
                Ok(kernel_us) => {
                    return AccelStepResult {
                        backend: ComputeBackend::Cuda,
                        agent_count: device.agent_count(),
                        kernel_us,
                    };
                }
                Err(err) => err,
            };
            tracing::warn!(error = %failure, "CUDA device step failed, falling back to CPU");
        }
    }

    // CPU path: either the chosen backend or the fallback after a CUDA error.
    let kernel_us = device.step_cpu(&mut cpu_kernel);
    let agent_count = device.agent_count();
    AccelStepResult {
        backend: ComputeBackend::Cpu,
        agent_count,
        kernel_us,
    }
}