/// Fallback entry point used when the example is built without the `cuda`
/// feature: prints a hint on stderr explaining how to run it and returns.
#[cfg(not(feature = "cuda"))]
fn main() {
    // `concat!` joins the two halves at compile time, so the emitted message
    // is a single line.
    eprintln!(concat!(
        "This example requires the `cuda` feature. ",
        "Run with: cargo run -p rustsim --example cuda_particle --features cuda --release",
    ));
}
/// Entry point when the `cuda` feature is enabled; all of the work lives in
/// `run`, so this simply hands control over to it.
#[cfg(feature = "cuda")]
fn main() {
    run()
}
/// Times the same particle kernel through four different stepping APIs.
///
/// The kernel source in `cuda_particle.cu` is compiled to PTX with NVRTC once.
/// A `HashMapStore` is then filled with `N` particles and `N_STEPS` steps are
/// timed through each of the following, printing one summary line per path:
///
/// 1. `cuda_batch_step` (labelled `[plain]`),
/// 2. `cuda_batch_step_pinned` (`[pinned]`),
/// 3. `DeviceSoaStore::step_cuda_pinned` on an uploaded store (`[device]`),
/// 4. `DeviceSoaStore::step_cuda_resident` after a single `init_cuda`, with
///    one `sync_to_host` at the end that is included in the timing
///    (`[resident]`).
///
/// Any failure (NVRTC compilation, a step, or the final sync) is reported on
/// stderr and ends the demo early without panicking.
///
/// NOTE(review): what distinguishes the pinned / device / resident paths is
/// defined by the `rustsim` APIs, not by this file — consult their docs.
#[cfg(feature = "cuda")]
fn run() {
    use rustsim::prelude::*;
    use std::time::Instant;

    /// Number of agents created in the store.
    const N: u64 = 1_000_000;
    /// Number of steps timed for every stepping path.
    const N_STEPS: usize = 10;
    /// Block size handed to every kernel launch call.
    const BLOCK_SIZE: u32 = 256;

    /// Minimal agent: a position and a velocity along one axis.
    #[derive(Debug, Clone)]
    struct Particle {
        id: AgentId,
        x: f32,
        vx: f32,
    }

    impl Agent for Particle {
        fn id(&self) -> AgentId {
            self.id
        }
    }

    impl SoaExtractable for Particle {
        fn num_columns() -> usize {
            2
        }

        fn column_names() -> Vec<&'static str> {
            vec!["x", "vx"]
        }

        /// Appends this particle's fields to the columns, in the same order
        /// as `column_names` (`x` first, then `vx`).
        fn extract_row(&self, columns: &mut [Vec<f32>]) {
            columns[0].push(self.x);
            columns[1].push(self.vx);
        }

        /// Copies the stepped position back into the agent.
        ///
        /// Only `x` (column 0) is written back; `vx` is left untouched.
        /// NOTE(review): presumably the `advance` kernel never modifies the
        /// velocity column — confirm against `cuda_particle.cu`.
        fn write_back_row(&mut self, columns: &[&[f32]], row: usize) {
            self.x = columns[0][row];
        }
    }

    /// Converts a number of agent-steps and the wall time they took into
    /// agent-steps per second.
    ///
    /// Uses the full-resolution duration instead of whole milliseconds, so
    /// runs lasting only a few milliseconds are not distorted by truncation.
    fn agent_steps_per_sec(agent_steps: u128, elapsed: std::time::Duration) -> f64 {
        // Floor at one nanosecond so a zero reading cannot divide by zero.
        let secs = elapsed.as_secs_f64().max(1.0e-9);
        agent_steps as f64 / secs
    }

    // Compile the kernel once; every stepping path below reuses the PTX text.
    let cu_src = include_str!("cuda_particle.cu");
    let ptx = match cudarc::nvrtc::compile_ptx(cu_src) {
        Ok(p) => p,
        Err(e) => {
            eprintln!("NVRTC compile failed: {e}");
            return;
        }
    };
    let ptx_source: String = ptx.to_src();

    // Populate the host-side store: ids 1..=N, all starting at x = 0 with a
    // velocity proportional to the id.
    let mut store = HashMapStore::new();
    for i in 1..=N {
        store.insert(Particle {
            id: i,
            x: 0.0,
            vx: (i as f32) * 1.0e-6,
        });
    }

    println!("CUDA particle demo: {N} agents, {N_STEPS} steps, block={BLOCK_SIZE}");

    // Total work per path; shared by all four throughput figures.
    let agent_steps = (N as u128) * (N_STEPS as u128);

    // ---- Path 1: `cuda_batch_step` --------------------------------------
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match cuda_batch_step::<Particle, _>(
            &store,
            &ptx_source,
            "rustsim_cuda_particle",
            "advance",
            BLOCK_SIZE,
        ) {
            Ok(r) => {
                total_kernel_us += r.kernel_us;
                if step == 0 {
                    println!(
                        " [plain] step 0: backend={} agents={} kernel_us={}",
                        r.backend, r.agent_count, r.kernel_us
                    );
                }
            }
            Err(e) => {
                eprintln!("cuda_batch_step failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        " [plain] wall={wall_ms} ms total_kernel_us={total_kernel_us} throughput={:.2e} agent-steps/s",
        throughput
    );

    // ---- Path 2: `cuda_batch_step_pinned` -------------------------------
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match cuda_batch_step_pinned::<Particle, _>(
            &store,
            &ptx_source,
            "rustsim_cuda_particle",
            "advance",
            BLOCK_SIZE,
        ) {
            Ok(r) => {
                total_kernel_us += r.kernel_us;
                if step == 0 {
                    println!(
                        " [pinned] step 0: backend={} agents={} kernel_us={}",
                        r.backend, r.agent_count, r.kernel_us
                    );
                }
            }
            Err(e) => {
                eprintln!("cuda_batch_step_pinned failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        " [pinned] wall={wall_ms} ms total_kernel_us={total_kernel_us} throughput={:.2e} agent-steps/s",
        throughput
    );

    // ---- Path 3: `DeviceSoaStore::step_cuda_pinned` ---------------------
    // The upload happens before the timer starts, so it is not measured.
    let mut device = DeviceSoaStore::upload::<Particle, _>(&store);
    let t0 = Instant::now();
    let mut total_kernel_us: u128 = 0;
    for step in 0..N_STEPS {
        match device.step_cuda_pinned(&ptx_source, "rustsim_cuda_particle", "advance", BLOCK_SIZE) {
            Ok(kernel_us) => {
                total_kernel_us += kernel_us;
                if step == 0 {
                    println!(
                        " [device] step 0: agents={} kernel_us={kernel_us}",
                        device.agent_count(),
                    );
                }
            }
            Err(e) => {
                eprintln!("step_cuda_pinned failed at step {step}: {e}");
                return;
            }
        }
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        " [device] wall={wall_ms} ms total_kernel_us={total_kernel_us} throughput={:.2e} agent-steps/s",
        throughput
    );

    // ---- Path 4: `DeviceSoaStore::step_cuda_resident` -------------------
    // Upload and `init_cuda` run before the timer starts; the single
    // `sync_to_host` after the loop is inside the timed region.
    let mut resident = DeviceSoaStore::upload::<Particle, _>(&store);
    if let Err(e) = resident.init_cuda(&ptx_source, "advance") {
        eprintln!("init_cuda failed: {e}");
        return;
    }
    let t0 = Instant::now();
    let mut total_submit_us: u128 = 0;
    for step in 0..N_STEPS {
        match resident.step_cuda_resident(BLOCK_SIZE) {
            Ok(submit_us) => {
                total_submit_us += submit_us;
                if step == 0 {
                    println!(
                        " [resident] step 0: agents={} submit_us={submit_us}",
                        resident.agent_count(),
                    );
                }
            }
            Err(e) => {
                eprintln!("step_cuda_resident failed at step {step}: {e}");
                return;
            }
        }
    }
    if let Err(e) = resident.sync_to_host() {
        eprintln!("sync_to_host failed: {e}");
        return;
    }
    let elapsed = t0.elapsed();
    let wall_ms = elapsed.as_millis();
    let throughput = agent_steps_per_sec(agent_steps, elapsed);
    println!(
        " [resident] wall={wall_ms} ms (incl. one sync_to_host) total_submit_us={total_submit_us} throughput={:.2e} agent-steps/s",
        throughput
    );
}