// rustsim 0.0.1 - Docs.rs

//! End-to-end CUDA batch-step example.
//!
//! Requires the `cuda` feature *and* a working NVIDIA GPU + driver at runtime:
//!
//! ```text
//! cargo run -p rustsim --example cuda_particle --features cuda --release
//! ```
//!
//! The companion CUDA source lives next to this file at `cuda_particle.cu`
//! and is compiled to PTX at runtime via `cudarc::nvrtc::compile_ptx`, which
//! mirrors how FlameGPU2 ships agent functions.
//!
//! Pipeline:
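//!   1. populate 1 M particles with deterministic initial velocities
//!      (`vx = id * 1e-6`)
//!   2. compile the CUDA kernel to PTX via NVRTC
//!   3. run `N_STEPS` iterations through `cuda_batch_step` (extract SoA →
//!      upload → launch → download → write back), then repeat the same
//!      workload through `cuda_batch_step_pinned`,
//!      `DeviceSoaStore::step_cuda_pinned`, and the resident
//!      `init_cuda` / `step_cuda_resident` path
//!   4. report throughput (agent-steps / s) for each path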

/// Fallback entry point for builds without the `cuda` feature.
///
/// Nothing is simulated; the function only tells the user (on stderr) which
/// cargo invocation enables the real example, then returns normally.
#[cfg(not(feature = "cuda"))]
fn main() {
    // Assembled at compile time; the two pieces join into a single line.
    const HINT: &str = concat!(
        "This example requires the `cuda` feature. ",
        "Run with: cargo run -p rustsim --example cuda_particle --features cuda --release",
    );
    eprintln!("{HINT}");
}

/// Entry point for builds with the `cuda` feature enabled.
///
/// All of the work lives in `run`; this wrapper exists so that the
/// feature-gated and fallback builds both expose a `main`.
#[cfg(feature = "cuda")]
fn main() {
    run()
}

/// Converts a workload size and its measured wall time into agent-steps per
/// second.
///
/// The rate is derived from the full-resolution `Duration` rather than from a
/// value truncated to whole milliseconds, so short runs (a few milliseconds
/// or less) are not distorted by rounding.
///
/// * `agent_steps` - total number of agent updates performed.
/// * `elapsed` - wall-clock time those updates took.
///
/// Returns a finite, positive rate for any positive `agent_steps`; a zero
/// `elapsed` is clamped to one nanosecond to avoid dividing by zero.
// Pure std and independent of the `cuda` feature so it can be unit-tested
// without a GPU. Only the CUDA path calls it, hence the conditional
// `dead_code` allowance for builds without the feature.
#[cfg_attr(not(feature = "cuda"), allow(dead_code))]
fn agent_steps_per_sec(agent_steps: u128, elapsed: std::time::Duration) -> f64 {
    let secs = elapsed.as_secs_f64().max(1.0e-9);
    agent_steps as f64 / secs
}

/// Runs the CUDA particle benchmark and prints one report per execution path.
///
/// Steps, in order:
///   1. compile `cuda_particle.cu` to PTX with NVRTC,
///   2. fill a `HashMapStore` with `N` particles,
///   3. run `N_STEPS` iterations through each of four paths
///      (`cuda_batch_step`, `cuda_batch_step_pinned`,
///      `DeviceSoaStore::step_cuda_pinned`, and the resident
///      `init_cuda` / `step_cuda_resident` path),
///   4. print wall time, accumulated per-step timing, and throughput for
///      each path.
///
/// Any failure is reported on stderr and ends the run early by returning;
/// the function never panics on a CUDA error itself.
#[cfg(feature = "cuda")]
fn run() {
    use rustsim::prelude::*;
    use std::time::Instant;

    const N: u64 = 1_000_000;
    const N_STEPS: usize = 10;
    const BLOCK_SIZE: u32 = 256;

    /// Minimal agent: a position and a velocity along one axis.
    #[derive(Debug, Clone)]
    struct Particle {
        id: AgentId,
        x: f32,
        vx: f32,
    }

    impl Agent for Particle {
        fn id(&self) -> AgentId {
            self.id
        }
    }

    impl SoaExtractable for Particle {
        fn num_columns() -> usize {
            2
        }
        fn column_names() -> Vec<&'static str> {
            vec!["x", "vx"]
        }
        /// Appends this particle's fields to the columns, in the order given
        /// by `column_names` (column 0 = `x`, column 1 = `vx`).
        fn extract_row(&self, columns: &mut [Vec<f32>]) {
            columns[0].push(self.x);
            columns[1].push(self.vx);
        }
        /// Copies the updated position for `row` back into this particle.
        fn write_back_row(&mut self, columns: &[&[f32]], row: usize) {
            // NOTE(review): only `x` is restored; presumably the kernel never
            // modifies `vx` — confirm against `cuda_particle.cu`.
            self.x = columns[0][row];
        }
    }

    // Compile the shipped CUDA source to PTX at runtime.
    let cu_src = include_str!("cuda_particle.cu");
    let ptx = match cudarc::nvrtc::compile_ptx(cu_src) {
        Ok(p) => p,
        Err(e) => {
            eprintln!("NVRTC compile failed: {e}");
            return;
        }
    };
    // `Ptx::from_src` accepts the stringified PTX directly; `compile_ptx`
    // returns a `Ptx` whose string form is what `cuda_batch_step` expects.
    let ptx_source: String = ptx.to_src();

    // Populate 1M particles. Ids start at 1 and the velocity is a
    // deterministic function of the id.
    let mut store = HashMapStore::new();
    for i in 1..=N {
        store.insert(Particle {
            id: i,
            x: 0.0,
            vx: (i as f32) * 1.0e-6,
        });
    }

    // Every path below performs the same amount of work.
    let agent_steps = (N as u128) * (N_STEPS as u128);

    println!("CUDA particle demo: {N} agents, {N_STEPS} steps, block={BLOCK_SIZE}");

    // ------------------------------------------------------------------
    // Path 1: plain cuda_batch_step (pageable host memory, default stream)
    // ------------------------------------------------------------------
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match cuda_batch_step::<Particle, _>(
            &store,
            &ptx_source,
            "rustsim_cuda_particle",
            "advance",
            BLOCK_SIZE,
        ) {
            Ok(r) => {
                total_kernel_us += r.kernel_us;
                if step == 0 {
                    println!(
                        "  [plain]   step 0: backend={} agents={} kernel_us={}",
                        r.backend, r.agent_count, r.kernel_us
                    );
                }
            }
            Err(e) => {
                eprintln!("cuda_batch_step failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        "  [plain]   wall={wall_ms} ms  total_kernel_us={total_kernel_us}  throughput={:.2e} agent-steps/s",
        throughput
    );

    // ------------------------------------------------------------------
    // Path 2: cuda_batch_step_pinned (page-locked staging + dual streams)
    // ------------------------------------------------------------------
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match cuda_batch_step_pinned::<Particle, _>(
            &store,
            &ptx_source,
            "rustsim_cuda_particle",
            "advance",
            BLOCK_SIZE,
        ) {
            Ok(r) => {
                total_kernel_us += r.kernel_us;
                if step == 0 {
                    println!(
                        "  [pinned]  step 0: backend={} agents={} kernel_us={}",
                        r.backend, r.agent_count, r.kernel_us
                    );
                }
            }
            Err(e) => {
                eprintln!("cuda_batch_step_pinned failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        "  [pinned]  wall={wall_ms} ms  total_kernel_us={total_kernel_us}  throughput={:.2e} agent-steps/s",
        throughput
    );

    // ------------------------------------------------------------------
    // Path 3: DeviceSoaStore::step_cuda_pinned (persistent device store)
    // ------------------------------------------------------------------
    // The store is built before the timer starts, so its construction is not
    // part of the measured wall time.
    let mut device = DeviceSoaStore::upload::<Particle, _>(&store);
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match device.step_cuda_pinned(&ptx_source, "rustsim_cuda_particle", "advance", BLOCK_SIZE) {
            Ok(kernel_us) => {
                total_kernel_us += kernel_us;
                if step == 0 {
                    println!(
                        "  [device]  step 0: agents={} kernel_us={kernel_us}",
                        device.agent_count(),
                    );
                }
            }
            Err(e) => {
                eprintln!("step_cuda_pinned failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        "  [device]  wall={wall_ms} ms  total_kernel_us={total_kernel_us}  throughput={:.2e} agent-steps/s",
        throughput
    );

    // ------------------------------------------------------------------
    // Path 4: DeviceSoaStore + init_cuda / step_cuda_resident
    //
    // True persistent GPU state: one upload at init_cuda, many launches
    // with zero host/device transfers, one download at sync_to_host.
    // ------------------------------------------------------------------
    let mut resident = DeviceSoaStore::upload::<Particle, _>(&store);
    if let Err(e) = resident.init_cuda(&ptx_source, "advance") {
        eprintln!("init_cuda failed: {e}");
        return;
    }
    // The timer starts after `init_cuda`, so the measured interval covers the
    // launches plus the final `sync_to_host`, but not the initialisation.
    let t0 = Instant::now();
    let mut total_submit_us: u128 = 0;
    for step in 0..N_STEPS {
        match resident.step_cuda_resident(BLOCK_SIZE) {
            Ok(submit_us) => {
                total_submit_us += submit_us;
                if step == 0 {
                    println!(
                        "  [resident] step 0: agents={} submit_us={submit_us}",
                        resident.agent_count(),
                    );
                }
            }
            Err(e) => {
                eprintln!("step_cuda_resident failed at step {step}: {e}");
                return;
            }
        }
    }
    if let Err(e) = resident.sync_to_host() {
        eprintln!("sync_to_host failed: {e}");
        return;
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        "  [resident] wall={wall_ms} ms (incl. one sync_to_host)  total_submit_us={total_submit_us}  throughput={:.2e} agent-steps/s",
        throughput
    );
}