use std::sync::Arc;
use cudarc::driver::{CudaContext, CudaFunction, CudaSlice, CudaStream, PushKernelArg};
use cudarc::nvrtc::compile_ptx;
use super::{BFS_KERNEL_SOURCE, SPMV_KERNEL_SOURCE};
use crate::models::CsrMatrix;
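/// Errors surfaced by the GPU graph primitives: CUDA driver failures,
/// NVRTC kernel-compilation failures, and invalid caller input.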
#[derive(Debug, thiserror::Error)]
pub enum GpuGraphError {
#[error("CUDA error: {0}")]
CudaError(String),
#[error("Compilation error: {0}")]
CompilationError(String),
#[error("Invalid input: {0}")]
InvalidInput(String),
}
type Result<T> = std::result::Result<T, GpuGraphError>;
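/// Multi-source breadth-first search over a CSR adjacency matrix, executed
/// on the GPU. Holds the CUDA context, its default stream, and the compiled
/// `bfs_init`, `bfs_advance`, and `bfs_copy` kernels.
///
/// A minimal usage sketch (assumes a CUDA-capable device at ordinal 0),
/// mirroring the simple-chain test below:
///
/// ```ignore
/// let matrix = CsrMatrix::from_edges(4, &[(0, 1), (1, 2), (2, 3)]);
/// let bfs = GpuBfs::new(0)?;
/// let distances = bfs.bfs(&matrix, &[0])?; // [0, 1, 2, 3]
/// ```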
pub struct GpuBfs {
context: Arc<CudaContext>,
stream: Arc<CudaStream>,
init_fn: CudaFunction,
advance_fn: CudaFunction,
copy_fn: CudaFunction,
}
impl GpuBfs {
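/// Creates a BFS engine on the given device: acquires a CUDA context,
/// compiles `BFS_KERNEL_SOURCE` with NVRTC, and loads the three kernels.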
pub fn new(device_ordinal: usize) -> Result<Self> {
let context = CudaContext::new(device_ordinal)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let stream = context.default_stream();
let ptx = compile_ptx(BFS_KERNEL_SOURCE)
.map_err(|e| GpuGraphError::CompilationError(e.to_string()))?;
let module = context
.load_module(ptx)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let init_fn = module
.load_function("bfs_init")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let advance_fn = module
.load_function("bfs_advance")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let copy_fn = module
.load_function("bfs_copy")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(Self {
context,
stream,
init_fn,
advance_fn,
copy_fn,
})
}
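/// Runs a level-synchronous BFS from `sources` and returns one distance per
/// node; nodes that are never reached end up with a distance of -1 (see the
/// unreachable test below). The frontier is advanced once per level until a
/// level updates no nodes, or at most `num_nodes` levels.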
pub fn bfs(&self, matrix: &CsrMatrix, sources: &[u32]) -> Result<Vec<i32>> {
if sources.is_empty() {
return Err(GpuGraphError::InvalidInput("No source nodes".to_string()));
}
let num_nodes = matrix.num_rows as u32;
let row_ptr_dev = self.htod_u64(&matrix.row_ptr)?;
let col_idx_dev = self.htod_u32(&matrix.col_idx)?;
let sources_dev = self.htod_u32(sources)?;
let mut distances = self.alloc_i32(num_nodes as usize)?;
let mut new_distances = self.alloc_i32(num_nodes as usize)?;
let mut frontier_size_dev = self.alloc_i32(1)?;
let block_size = 256u32;
let grid_size = num_nodes.div_ceil(block_size);
let num_sources = sources.len() as u32;
unsafe {
self.stream
.launch_builder(&self.init_fn)
.arg(&mut distances)
.arg(&sources_dev)
.arg(&num_nodes)
.arg(&num_sources)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
unsafe {
self.stream
.launch_builder(&self.copy_fn)
.arg(&distances)
.arg(&mut new_distances)
.arg(&num_nodes)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
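// Level-synchronous loop: read the previous level from `distances`, write
// updates into `new_distances`, and let the kernel count updated nodes in
// `frontier_size_dev` so the host can stop once a level changes nothing.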
let max_iterations = num_nodes;
for level in 0..max_iterations as i32 {
let zero = vec![0i32];
self.stream
.memcpy_htod(&zero, &mut frontier_size_dev)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
unsafe {
self.stream
.launch_builder(&self.advance_fn)
.arg(&row_ptr_dev)
.arg(&col_idx_dev)
.arg(&distances)
.arg(&mut new_distances)
.arg(&mut frontier_size_dev)
.arg(&num_nodes)
.arg(&level)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
let mut frontier_size = vec![0i32];
self.stream
.memcpy_dtoh(&frontier_size_dev, &mut frontier_size)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
if frontier_size[0] == 0 {
break;
}
unsafe {
self.stream
.launch_builder(&self.copy_fn)
.arg(&new_distances)
.arg(&mut distances)
.arg(&num_nodes)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
}
let mut result = vec![0i32; num_nodes as usize];
self.stream
.memcpy_dtoh(&new_distances, &mut result)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(result)
}
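/// Blocks until all work queued on this context has completed.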
pub fn synchronize(&self) -> Result<()> {
self.context
.synchronize()
.map_err(|e| GpuGraphError::CudaError(e.to_string()))
}
fn alloc_i32(&self, n: usize) -> Result<CudaSlice<i32>> {
unsafe {
self.stream
.alloc::<i32>(n)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))
}
}
fn htod_u64(&self, data: &[u64]) -> Result<CudaSlice<u64>> {
let mut slice = unsafe {
self.stream
.alloc::<u64>(data.len())
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
self.stream
.memcpy_htod(data, &mut slice)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(slice)
}
fn htod_u32(&self, data: &[u32]) -> Result<CudaSlice<u32>> {
let mut slice = unsafe {
self.stream
.alloc::<u32>(data.len())
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
self.stream
.memcpy_htod(data, &mut slice)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(slice)
}
}
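/// Sparse matrix-vector primitives on the GPU: SpMV, an axpby-style SpMV,
/// dot products, and a power-iteration eigensolver built on top of them.
/// The `vector_scale` and `vector_copy` kernels are loaded but not yet used,
/// hence the `allow(dead_code)` markers.
///
/// A minimal usage sketch (assumes a CUDA-capable device at ordinal 0):
///
/// ```ignore
/// let matrix = CsrMatrix::from_weighted_edges(3, &[(0, 1, 2.0), (0, 2, 3.0)]);
/// let spmv = GpuSpmv::new(0)?;
/// let y = spmv.spmv(&matrix, &[1.0, 1.0, 1.0], 1.0)?; // [5.0, 0.0, 0.0]
/// ```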
pub struct GpuSpmv {
context: Arc<CudaContext>,
stream: Arc<CudaStream>,
spmv_fn: CudaFunction,
spmv_axpby_fn: CudaFunction,
dot_fn: CudaFunction,
#[allow(dead_code)]
scale_fn: CudaFunction,
#[allow(dead_code)]
copy_fn: CudaFunction,
}
impl GpuSpmv {
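/// Creates a SpMV engine on the given device: acquires a CUDA context,
/// compiles `SPMV_KERNEL_SOURCE` with NVRTC, and loads the SpMV, axpby,
/// dot-product, scale, and copy kernels.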
pub fn new(device_ordinal: usize) -> Result<Self> {
let context = CudaContext::new(device_ordinal)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let stream = context.default_stream();
let ptx = compile_ptx(SPMV_KERNEL_SOURCE)
.map_err(|e| GpuGraphError::CompilationError(e.to_string()))?;
let module = context
.load_module(ptx)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let spmv_fn = module
.load_function("spmv_csr")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let spmv_axpby_fn = module
.load_function("spmv_csr_axpby")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let dot_fn = module
.load_function("dot_product")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let scale_fn = module
.load_function("vector_scale")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
let copy_fn = module
.load_function("vector_copy")
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(Self {
context,
stream,
spmv_fn,
spmv_axpby_fn,
dot_fn,
scale_fn,
copy_fn,
})
}
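/// Sparse matrix-vector product: returns `A * x` scaled by `alpha`. For an
/// unweighted matrix (no `values` array) every stored entry contributes 1.0.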
pub fn spmv(&self, matrix: &CsrMatrix, x: &[f64], alpha: f64) -> Result<Vec<f64>> {
if x.len() != matrix.num_cols {
return Err(GpuGraphError::InvalidInput(format!(
"Vector size {} doesn't match matrix columns {}",
x.len(),
matrix.num_cols
)));
}
let num_rows = matrix.num_rows as u32;
let row_ptr_dev = self.htod_u64(&matrix.row_ptr)?;
let col_idx_dev = self.htod_u32(&matrix.col_idx)?;
let x_dev = self.htod_f64(x)?;
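// Unweighted matrices carry no `values` array: pass a 1-element dummy
// buffer plus a flag telling the kernel to treat every stored entry as 1.0.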
let (values_dev, has_values) = if let Some(ref values) = matrix.values {
(self.htod_f64(values)?, 1i32)
} else {
let dummy = unsafe {
self.stream
.alloc::<f64>(1)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
(dummy, 0i32)
};
let mut y_dev = self.alloc_f64(matrix.num_rows)?;
let block_size = 256u32;
let grid_size = num_rows.div_ceil(block_size);
unsafe {
self.stream
.launch_builder(&self.spmv_fn)
.arg(&row_ptr_dev)
.arg(&col_idx_dev)
.arg(&values_dev)
.arg(&x_dev)
.arg(&mut y_dev)
.arg(&num_rows)
.arg(&alpha)
.arg(&has_values)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
let mut y = vec![0.0f64; matrix.num_rows];
self.stream
.memcpy_dtoh(&y_dev, &mut y)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(y)
}
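/// Axpby-style sparse update computed by the `spmv_csr_axpby` kernel
/// (`y = alpha * A * x + beta * y`); the result is copied back into the
/// caller's `y` slice.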
pub fn spmv_axpby(
&self,
matrix: &CsrMatrix,
x: &[f64],
y: &mut [f64],
alpha: f64,
beta: f64,
) -> Result<()> {
if x.len() != matrix.num_cols {
return Err(GpuGraphError::InvalidInput(format!(
"Input vector size {} doesn't match matrix columns {}",
x.len(),
matrix.num_cols
)));
}
if y.len() != matrix.num_rows {
return Err(GpuGraphError::InvalidInput(format!(
"Output vector size {} doesn't match matrix rows {}",
y.len(),
matrix.num_rows
)));
}
let num_rows = matrix.num_rows as u32;
let row_ptr_dev = self.htod_u64(&matrix.row_ptr)?;
let col_idx_dev = self.htod_u32(&matrix.col_idx)?;
let x_dev = self.htod_f64(x)?;
let mut y_dev = self.htod_f64(y)?;
let (values_dev, has_values) = if let Some(ref values) = matrix.values {
(self.htod_f64(values)?, 1i32)
} else {
let dummy = unsafe {
self.stream
.alloc::<f64>(1)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
(dummy, 0i32)
};
let block_size = 256u32;
let grid_size = num_rows.div_ceil(block_size);
unsafe {
self.stream
.launch_builder(&self.spmv_axpby_fn)
.arg(&row_ptr_dev)
.arg(&col_idx_dev)
.arg(&values_dev)
.arg(&x_dev)
.arg(&mut y_dev)
.arg(&num_rows)
.arg(&alpha)
.arg(&beta)
.arg(&has_values)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (grid_size, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
self.stream
.memcpy_dtoh(&y_dev, y)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(())
}
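/// Dot product of two equal-length vectors: the kernel produces one partial
/// sum per block and the partials are reduced on the host.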
pub fn dot(&self, x: &[f64], y: &[f64]) -> Result<f64> {
if x.len() != y.len() {
return Err(GpuGraphError::InvalidInput(
"Vector sizes don't match".to_string(),
));
}
let n = x.len() as u32;
let block_size = 256u32;
let num_blocks = n.div_ceil(block_size);
let x_dev = self.htod_f64(x)?;
let y_dev = self.htod_f64(y)?;
let mut partial_sums = self.alloc_f64(num_blocks as usize)?;
unsafe {
self.stream
.launch_builder(&self.dot_fn)
.arg(&x_dev)
.arg(&y_dev)
.arg(&mut partial_sums)
.arg(&n)
.launch(cudarc::driver::LaunchConfig {
grid_dim: (num_blocks, 1, 1),
block_dim: (block_size, 1, 1),
shared_mem_bytes: 0,
})
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
}
let mut sums = vec![0.0f64; num_blocks as usize];
self.stream
.memcpy_dtoh(&partial_sums, &mut sums)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(sums.iter().sum())
}
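/// Euclidean norm, computed as `sqrt(x . x)` via `dot`.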
pub fn norm2(&self, x: &[f64]) -> Result<f64> {
let dot_xx = self.dot(x, x)?;
Ok(dot_xx.sqrt())
}
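/// Estimates the dominant eigenpair by power iteration: repeatedly form
/// `y = A * x`, take the Rayleigh quotient `x . y` as the eigenvalue
/// estimate, and renormalize `x = y / ||y||`, stopping when successive
/// estimates differ by less than `tolerance` or after `max_iterations`
/// rounds.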
pub fn power_iteration(
&self,
matrix: &CsrMatrix,
max_iterations: usize,
tolerance: f64,
) -> Result<(Vec<f64>, f64)> {
if matrix.num_rows == 0 {
return Err(GpuGraphError::InvalidInput("Empty matrix".to_string()));
}
let n = matrix.num_rows;
let mut x: Vec<f64> = vec![1.0 / (n as f64).sqrt(); n];
let mut eigenvalue = 0.0;
for _ in 0..max_iterations {
let y = self.spmv(matrix, &x, 1.0)?;
let new_eigenvalue = self.dot(&x, &y)?;
let norm = self.norm2(&y)?;
if norm < 1e-10 {
break;
}
x = y.into_iter().map(|yi| yi / norm).collect();
if (new_eigenvalue - eigenvalue).abs() < tolerance {
return Ok((x, new_eigenvalue));
}
eigenvalue = new_eigenvalue;
}
Ok((x, eigenvalue))
}
pub fn synchronize(&self) -> Result<()> {
self.context
.synchronize()
.map_err(|e| GpuGraphError::CudaError(e.to_string()))
}
fn alloc_f64(&self, n: usize) -> Result<CudaSlice<f64>> {
unsafe {
self.stream
.alloc::<f64>(n)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))
}
}
fn htod_u64(&self, data: &[u64]) -> Result<CudaSlice<u64>> {
let mut slice = unsafe {
self.stream
.alloc::<u64>(data.len())
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
self.stream
.memcpy_htod(data, &mut slice)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(slice)
}
fn htod_u32(&self, data: &[u32]) -> Result<CudaSlice<u32>> {
let mut slice = unsafe {
self.stream
.alloc::<u32>(data.len())
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
self.stream
.memcpy_htod(data, &mut slice)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(slice)
}
fn htod_f64(&self, data: &[f64]) -> Result<CudaSlice<f64>> {
let mut slice = unsafe {
self.stream
.alloc::<f64>(data.len())
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?
};
self.stream
.memcpy_htod(data, &mut slice)
.map_err(|e| GpuGraphError::CudaError(e.to_string()))?;
Ok(slice)
}
}
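/// Returns true if at least one CUDA device is visible. The driver query is
/// wrapped in `catch_unwind` so a missing or broken CUDA installation
/// degrades to `false` instead of panicking.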
pub fn is_cuda_available() -> bool {
std::panic::catch_unwind(|| {
cudarc::driver::CudaContext::device_count()
.map(|c| c > 0)
.unwrap_or(false)
})
.unwrap_or(false)
}
#[cfg(test)]
mod tests {
use super::*;
fn skip_if_no_cuda() -> bool {
if !is_cuda_available() {
println!("Skipping test: CUDA not available");
return true;
}
false
}
#[test]
fn test_gpu_bfs_simple() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 1), (1, 2), (2, 3)];
let matrix = CsrMatrix::from_edges(4, &edges);
let gpu_bfs = GpuBfs::new(0).unwrap();
let distances = gpu_bfs.bfs(&matrix, &[0]).unwrap();
assert_eq!(distances[0], 0);
assert_eq!(distances[1], 1);
assert_eq!(distances[2], 2);
assert_eq!(distances[3], 3);
}
#[test]
fn test_gpu_bfs_unreachable() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 1)];
let matrix = CsrMatrix::from_edges(3, &edges);
let gpu_bfs = GpuBfs::new(0).unwrap();
let distances = gpu_bfs.bfs(&matrix, &[0]).unwrap();
assert_eq!(distances[0], 0);
assert_eq!(distances[1], 1);
assert_eq!(distances[2], -1);
}
#[test]
fn test_gpu_bfs_multi_source() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 1), (1, 2), (3, 4)];
let matrix = CsrMatrix::from_edges(5, &edges);
let gpu_bfs = GpuBfs::new(0).unwrap();
let distances = gpu_bfs.bfs(&matrix, &[0, 3]).unwrap();
assert_eq!(distances[0], 0);
assert_eq!(distances[1], 1);
assert_eq!(distances[2], 2);
assert_eq!(distances[3], 0);
assert_eq!(distances[4], 1);
}
#[test]
fn test_gpu_spmv_identity() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 0, 1.0), (1, 1, 1.0), (2, 2, 1.0)];
let matrix = CsrMatrix::from_weighted_edges(3, &edges);
let gpu_spmv = GpuSpmv::new(0).unwrap();
let x = vec![1.0, 2.0, 3.0];
let y = gpu_spmv.spmv(&matrix, &x, 1.0).unwrap();
assert!((y[0] - 1.0).abs() < 1e-10);
assert!((y[1] - 2.0).abs() < 1e-10);
assert!((y[2] - 3.0).abs() < 1e-10);
}
#[test]
fn test_gpu_spmv_weighted() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 1, 2.0), (0, 2, 3.0), (1, 2, 4.0)];
let matrix = CsrMatrix::from_weighted_edges(3, &edges);
let gpu_spmv = GpuSpmv::new(0).unwrap();
let x = vec![1.0, 1.0, 1.0];
let y = gpu_spmv.spmv(&matrix, &x, 1.0).unwrap();
assert!((y[0] - 5.0).abs() < 1e-10);
assert!((y[1] - 4.0).abs() < 1e-10);
assert!((y[2] - 0.0).abs() < 1e-10);
}
#[test]
fn test_gpu_spmv_unweighted() {
if skip_if_no_cuda() {
return;
}
let edges = [(0, 1), (0, 2), (1, 2)];
let matrix = CsrMatrix::from_edges(3, &edges);
let gpu_spmv = GpuSpmv::new(0).unwrap();
let x = vec![1.0, 1.0, 1.0];
let y = gpu_spmv.spmv(&matrix, &x, 1.0).unwrap();
assert!((y[0] - 2.0).abs() < 1e-10);
assert!((y[1] - 1.0).abs() < 1e-10);
assert!((y[2] - 0.0).abs() < 1e-10);
}
#[test]
fn test_gpu_dot_product() {
if skip_if_no_cuda() {
return;
}
let gpu_spmv = GpuSpmv::new(0).unwrap();
let x = vec![1.0, 2.0, 3.0];
let y = vec![4.0, 5.0, 6.0];
let dot = gpu_spmv.dot(&x, &y).unwrap();
assert!((dot - 32.0).abs() < 1e-10);
}
#[test]
fn test_gpu_norm2() {
if skip_if_no_cuda() {
return;
}
let gpu_spmv = GpuSpmv::new(0).unwrap();
let x = vec![3.0, 4.0];
let norm = gpu_spmv.norm2(&x).unwrap();
assert!((norm - 5.0).abs() < 1e-10);
}
#[test]
fn test_gpu_power_iteration() {
if skip_if_no_cuda() {
return;
}
let mut builder = crate::models::CsrMatrixBuilder::new(2);
builder.add_weighted_edge(0, 0, 2.0);
builder.add_weighted_edge(0, 1, 1.0);
builder.add_weighted_edge(1, 0, 1.0);
builder.add_weighted_edge(1, 1, 2.0);
let matrix = builder.build();
let gpu_spmv = GpuSpmv::new(0).unwrap();
let (eigenvector, eigenvalue) = gpu_spmv.power_iteration(&matrix, 100, 1e-6).unwrap();
assert!(
(eigenvalue - 3.0).abs() < 0.01,
"Eigenvalue {} far from 3.0",
eigenvalue
);
let expected = 1.0 / 2.0_f64.sqrt();
assert!(
(eigenvector[0].abs() - expected).abs() < 0.01,
"Eigenvector[0] = {} far from {}",
eigenvector[0],
expected
);
assert!(
(eigenvector[1].abs() - expected).abs() < 0.01,
"Eigenvector[1] = {} far from {}",
eigenvector[1],
expected
);
}
}