#![allow(dead_code)]
#[allow(unused_imports)]
use crate::gpu_ops::{GpuBackend, GpuBuffer, GpuBufferExt, GpuDevice, GpuError, GpuKernelHandle};
#[cfg(feature = "gpu")]
use scirs2_core::gpu::GpuContext;
use scirs2_core::numeric::{Float, SparseElement};
use scirs2_core::GpuDataType;
use std::fmt::Debug;
/// Tunable launch parameters for the sparse GPU kernels in this module.
#[derive(Debug, Clone)]
pub struct GpuKernelConfig {
/// Requested workgroup/threadgroup dimensions (x, y, z); backends may
/// clamp or align dimension 0 (see `calculate_optimal_dimensions`).
pub workgroup_size: [u32; 3],
/// Number of compute units to target; 0 means unspecified (currently
/// ignored by the dimension calculation).
pub compute_units: u32,
/// Whether kernels may use vectorized loads/stores.
pub vectorization: bool,
/// Preferred device-memory access strategy.
pub memory_strategy: MemoryStrategy,
}
impl Default for GpuKernelConfig {
fn default() -> Self {
Self {
workgroup_size: [256, 1, 1],
compute_units: 0, vectorization: true,
memory_strategy: MemoryStrategy::Coalesced,
}
}
}
/// Device-memory access strategies a kernel can be configured to use.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryStrategy {
/// Plain global-memory accesses.
Standard,
/// Arrange accesses so adjacent threads touch adjacent addresses.
Coalesced,
/// Stage data through on-chip shared/local/threadgroup memory.
SharedMemory,
/// Read through the texture/image path where the backend supports it.
TextureMemory,
}
/// Execute a CSR sparse matrix-vector product (y = A * x) on `device`.
///
/// Computes backend-appropriate launch dimensions, then dispatches to the
/// backend-specific launcher. ROCm and WebGPU have no dedicated launcher
/// yet and share the CPU fallback with the CPU backend.
///
/// * `rows` — number of matrix rows (length of `y`).
/// * `indptr_buffer` / `indices_buffer` / `data_buffer` — CSR arrays.
/// * `x_buffer` — input vector; `y_buffer` — output vector.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );
    match device.backend() {
        GpuBackend::Cuda => execute_cuda_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        GpuBackend::OpenCL => execute_opencl_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        GpuBackend::Metal => execute_metal_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        // ROCm and WebGPU have no dedicated path yet: use the CPU fallback.
        GpuBackend::Cpu | GpuBackend::Rocm | GpuBackend::Wgpu => execute_cpu_spmv_fallback(
            rows, indptr_buffer, indices_buffer, data_buffer, x_buffer, y_buffer,
        ),
    }
}
/// Execute a symmetric CSR sparse matrix-vector product on `device`.
///
/// Mirrors [`execute_spmv_kernel`] but routes to the symmetric-SpMV
/// launchers. ROCm and WebGPU share the CPU fallback with the CPU
/// backend because no dedicated launcher exists yet.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let (global_size, local_size) = calculate_optimal_dimensions(
        device.backend(),
        rows,
        config.workgroup_size,
        config.compute_units,
    );
    match device.backend() {
        GpuBackend::Cuda => execute_cuda_symmetric_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        GpuBackend::OpenCL => execute_opencl_symmetric_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        GpuBackend::Metal => execute_metal_symmetric_spmv(
            device, kernel, rows, indptr_buffer, indices_buffer, data_buffer, x_buffer,
            y_buffer, &global_size, &local_size, config,
        ),
        // ROCm and WebGPU have no dedicated path yet: use the CPU fallback.
        GpuBackend::Cpu | GpuBackend::Rocm | GpuBackend::Wgpu => {
            execute_cpu_symmetric_spmv_fallback(
                rows, indptr_buffer, indices_buffer, data_buffer, x_buffer, y_buffer,
            )
        }
    }
}
/// Compute 1-D launch dimensions (global, local) for `problem_size` work
/// items on the given backend.
///
/// Dimension 0 of the requested workgroup size is aligned to the
/// backend's execution width (32 lanes for CUDA warps and Metal
/// SIMD-groups, 64 for ROCm wavefronts) and clamped to the backend's
/// maximum; the global size is `problem_size` rounded up to a whole
/// number of workgroups. Only dimension 0 is used by the callers.
#[allow(dead_code)]
fn calculate_optimal_dimensions(
    backend: GpuBackend,
    problem_size: usize,
    workgroup_size: [u32; 3],
    _compute_units: u32,
) -> (Vec<usize>, Vec<usize>) {
    let width = match backend {
        // Warp / SIMD-group aligned (32 lanes), capped at 1024 threads.
        GpuBackend::Cuda | GpuBackend::Metal => (workgroup_size[0].div_ceil(32) * 32).min(1024),
        // Wavefront aligned (64 lanes), capped at 1024 threads.
        GpuBackend::Rocm => (workgroup_size[0].div_ceil(64) * 64).min(1024),
        GpuBackend::OpenCL | GpuBackend::Wgpu => workgroup_size[0].min(256),
        GpuBackend::Cpu => workgroup_size[0],
        #[cfg(not(feature = "gpu"))]
        GpuBackend::Vulkan => workgroup_size[0].min(256),
    };
    let local = width as usize;
    let global = problem_size.div_ceil(local) * local;
    (vec![global], vec![local])
}
/// Prepare launch parameters for a CUDA SpMV dispatch.
///
/// Computes grid/block sizes (one row per thread, blocks capped at
/// CUDA's 1024-thread limit), sizes optional shared memory, and boxes the
/// kernel arguments as type-erased values. The actual kernel launch is
/// not implemented yet, so this always returns
/// `BackendNotImplemented(Cuda)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_spmv<T>(
device: &GpuDevice,
kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
global_size: &[usize],
local_size: &[usize],
config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
// One row per thread; enough blocks to cover all rows.
let _warp_size = 32; let block_size = local_size[0].min(1024); let grid_size = rows.div_ceil(block_size);
// Shared memory is only reserved for the shared-memory strategy.
let shared_memory_size = match config.memory_strategy {
MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * block_size,
MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
};
// Kernel arguments, type-erased; `y` is passed as a mutable pointer
// since it is the output buffer.
let cuda_args = &[
Box::new(rows as u32) as Box<dyn std::any::Any>,
Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
Box::new(block_size as u32) as Box<dyn std::any::Any>,
Box::new(shared_memory_size as u32) as Box<dyn std::any::Any>,
];
let cuda_global_size = &[grid_size, 1, 1];
let cuda_local_size = &[block_size, 1, 1];
// TODO: submit the kernel; currently unimplemented.
Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}
/// Prepare launch parameters for an OpenCL SpMV dispatch.
///
/// Clamps the work-group size to 256, rounds the global size up to a
/// whole number of work-groups, sizes optional local memory, and boxes
/// the kernel arguments. The actual kernel enqueue is not implemented
/// yet, so this always returns `BackendNotImplemented(OpenCL)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_spmv<T>(
device: &GpuDevice,
kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
global_size: &[usize],
local_size: &[usize],
config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
let optimal_local_size = local_size[0].min(256);
// Global size must be a multiple of the local size in OpenCL.
let aligned_global_size = rows.div_ceil(optimal_local_size) * optimal_local_size;
// Local (on-chip) memory is only reserved for the shared-memory strategy.
let local_memory_size = match config.memory_strategy {
MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * optimal_local_size,
MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
};
// Kernel arguments, type-erased; `y` is passed as a mutable pointer.
let opencl_args = &[
Box::new(rows as u32) as Box<dyn std::any::Any>,
Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
Box::new(local_memory_size as u32) as Box<dyn std::any::Any>,
Box::new(config.vectorization as u32) as Box<dyn std::any::Any>,
];
let opencl_global_size = &[aligned_global_size, 1, 1];
let opencl_local_size = &[optimal_local_size, 1, 1];
// TODO: enqueue the kernel; currently unimplemented.
Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}
/// Prepare launch parameters for a Metal SpMV dispatch.
///
/// Aligns the threadgroup size to Metal's 32-wide SIMD-groups, sizes
/// optional threadgroup (shared) memory, and boxes the kernel arguments.
/// The actual kernel dispatch is not implemented yet, so this always
/// returns `BackendNotImplemented(Metal)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_spmv<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
    global_size: &[usize],
    local_size: &[usize],
    config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let simdgroup_size = 32;
    let optimal_threadgroup_size = local_size[0].min(1024);
    // Round up to a whole number of SIMD-groups; `div_ceil` matches the
    // ceiling-division idiom used by the other backends in this module
    // (the previous hand-rolled `(a + b - 1) / b` was inconsistent).
    let aligned_threadgroup_size =
        optimal_threadgroup_size.div_ceil(simdgroup_size) * simdgroup_size;
    let num_threadgroups = rows.div_ceil(aligned_threadgroup_size);
    // Threadgroup memory is only reserved for the shared-memory strategy.
    let threadgroup_memory_size = match config.memory_strategy {
        MemoryStrategy::SharedMemory => std::mem::size_of::<T>() * aligned_threadgroup_size,
        MemoryStrategy::Standard | MemoryStrategy::Coalesced | MemoryStrategy::TextureMemory => 0,
    };
    // Kernel arguments, type-erased; `y` is passed as a mutable pointer.
    let metal_args = &[
        Box::new(rows as u32) as Box<dyn std::any::Any>,
        Box::new(&raw const *indptr_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *indices_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *data_buffer) as Box<dyn std::any::Any>,
        Box::new(&raw const *x_buffer) as Box<dyn std::any::Any>,
        Box::new(std::ptr::addr_of!(*y_buffer) as *mut GpuBuffer<T>) as Box<dyn std::any::Any>,
        Box::new(aligned_threadgroup_size as u32) as Box<dyn std::any::Any>,
        Box::new(threadgroup_memory_size as u32) as Box<dyn std::any::Any>,
        Box::new(simdgroup_size as u32) as Box<dyn std::any::Any>,
    ];
    let metal_global_size = &[num_threadgroups * aligned_threadgroup_size, 1, 1];
    let metal_local_size = &[aligned_threadgroup_size, 1, 1];
    // TODO: dispatch the kernel; currently unimplemented.
    Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}
/// CPU fallback: computes y = A * x for CSR data entirely on the host.
///
/// Downloads all buffers, runs a standard row-wise CSR SpMV, and stores
/// the result in a host-side copy of `y`.
///
/// NOTE(review): `y` here is a host copy from `to_host()` and is never
/// written back to `y_buffer`, so the result appears to be discarded —
/// verify how callers retrieve it (GpuBufferExt may offer a write path).
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_spmv_fallback<T>(
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
let indptr = indptr_buffer.to_host()?;
let indices = indices_buffer.to_host()?;
let data = data_buffer.to_host()?;
let x = x_buffer.to_host()?;
let mut y = y_buffer.to_host()?;
for i in 0..rows {
// Row i spans data[indptr[i]..indptr[i+1]] in CSR layout.
let start = indptr[i] as usize;
let end = indptr[i + 1] as usize;
let mut sum = T::sparse_zero();
for j in start..end {
sum = sum + data[j] * x[indices[j] as usize];
}
y[i] = sum;
}
Ok(())
}
/// Symmetric SpMV on CUDA — not yet implemented; always returns
/// `BackendNotImplemented(Cuda)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cuda_symmetric_spmv<T>(
device: &GpuDevice,
kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
global_size: &[usize],
local_size: &[usize],
config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
Err(GpuError::BackendNotImplemented(GpuBackend::Cuda))
}
/// Symmetric SpMV on OpenCL — not yet implemented; always returns
/// `BackendNotImplemented(OpenCL)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_opencl_symmetric_spmv<T>(
device: &GpuDevice,
kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
global_size: &[usize],
local_size: &[usize],
config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
Err(GpuError::BackendNotImplemented(GpuBackend::OpenCL))
}
/// Symmetric SpMV on Metal — not yet implemented; always returns
/// `BackendNotImplemented(Metal)`.
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_metal_symmetric_spmv<T>(
device: &GpuDevice,
kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
global_size: &[usize],
local_size: &[usize],
config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
Err(GpuError::BackendNotImplemented(GpuBackend::Metal))
}
/// CPU fallback for symmetric SpMV over CSR data.
///
/// Downloads all buffers and runs a plain row-wise CSR SpMV on the host.
/// The original code branched on diagonal vs off-diagonal entries but
/// both arms performed the identical accumulation, so the dead branch has
/// been removed; behavior is unchanged.
///
/// NOTE(review): because every stored entry contributes only to its own
/// row, this treats the stored pattern as a full matrix. If callers store
/// only one triangle of the symmetric matrix, the transpose contribution
/// (`y[col] += val * x[row]`) is missing — confirm the storage convention.
/// NOTE(review): `y` is a host copy from `to_host()` and is never written
/// back to `y_buffer` — verify how callers retrieve the result.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_symmetric_spmv_fallback<T>(
    rows: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    y_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    let indptr = indptr_buffer.to_host()?;
    let indices = indices_buffer.to_host()?;
    let data = data_buffer.to_host()?;
    let x = x_buffer.to_host()?;
    let mut y = y_buffer.to_host()?;
    for i in 0..rows {
        let start = indptr[i] as usize;
        let end = indptr[i + 1] as usize;
        let mut sum = T::sparse_zero();
        for j in start..end {
            sum = sum + data[j] * x[indices[j] as usize];
        }
        y[i] = sum;
    }
    Ok(())
}
/// Dispatch a sparse triangular solve over CSR data.
///
/// Only the CPU fallback is implemented; every GPU backend currently
/// returns `BackendNotImplemented`. The per-backend launch dimensions
/// are placeholders for the future GPU paths; they are bound with
/// leading underscores so they no longer trigger unused-variable
/// warnings (the original bindings were never read).
#[cfg(feature = "gpu")]
#[allow(dead_code)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
    device: &GpuDevice,
    kernel: &GpuKernelHandle,
    n: usize,
    indptr_buffer: &GpuBuffer<u32>,
    indices_buffer: &GpuBuffer<u32>,
    data_buffer: &GpuBuffer<T>,
    b_buffer: &GpuBuffer<T>,
    x_buffer: &GpuBuffer<T>,
    _config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
    T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
    // Placeholder launch dimensions, unused until GPU paths exist.
    let (_global_size, _local_size) = match device.backend() {
        GpuBackend::Cuda => ([32], [32]),
        GpuBackend::OpenCL => ([64], [64]),
        GpuBackend::Metal => ([32], [32]),
        GpuBackend::Cpu => ([1], [1]),
        GpuBackend::Rocm | GpuBackend::Wgpu => ([64], [64]),
    };
    match device.backend() {
        GpuBackend::Cpu => execute_cpu_triangular_solve_fallback(
            n,
            indptr_buffer,
            indices_buffer,
            data_buffer,
            b_buffer,
            x_buffer,
        ),
        // No GPU triangular-solve kernels are implemented yet.
        _ => Err(GpuError::BackendNotImplemented(device.backend())),
    }
}
/// CPU fallback: forward-substitution solve of a lower-triangular CSR
/// system `L x = b` on the host.
///
/// Entries strictly above the diagonal are ignored; a zero (exactly
/// `sparse_zero`) diagonal entry is treated as a singular matrix.
///
/// NOTE(review): `x` is a host copy from `to_host()` and is never written
/// back to `x_buffer` — verify how callers retrieve the solution.
#[allow(dead_code)]
#[allow(unused_variables)]
fn execute_cpu_triangular_solve_fallback<T>(
n: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
b_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
let indptr = indptr_buffer.to_host()?;
let indices = indices_buffer.to_host()?;
let data = data_buffer.to_host()?;
let b = b_buffer.to_host()?;
let mut x = x_buffer.to_host()?;
for i in 0..n {
let start = indptr[i] as usize;
let end = indptr[i + 1] as usize;
// Accumulate L[i, :i] * x[:i] and capture the diagonal entry.
let mut sum = T::sparse_zero();
let mut diag_val = T::sparse_zero();
for j in start..end {
let col_idx = indices[j] as usize;
let val = data[j];
match col_idx.cmp(&i) {
std::cmp::Ordering::Equal => {
diag_val = val;
}
std::cmp::Ordering::Less => {
sum = sum + val * x[col_idx];
}
// Upper-triangle entries are ignored.
std::cmp::Ordering::Greater => {}
}
}
if diag_val != T::sparse_zero() {
x[i] = (b[i] - sum) / diag_val;
} else {
// The error constructor differs between the gpu and non-gpu builds.
#[cfg(feature = "gpu")]
return Err(GpuError::InvalidParameter(
"Singular matrix in triangular solve".to_string(),
));
#[cfg(not(feature = "gpu"))]
return Err(GpuError::invalid_parameter(
"Singular matrix in triangular solve".to_string(),
));
}
}
Ok(())
}
/// Pools GPU buffers and tracks allocation/transfer statistics for one
/// device.
pub struct GpuMemoryManager {
/// Device this manager allocates for.
device: GpuDevice,
#[cfg(feature = "gpu")]
/// Backend context used to create buffers; `None` for the CPU backend.
context: Option<GpuContext>,
/// Reusable buffers keyed by (element count, element type).
buffer_pool: std::collections::HashMap<(usize, std::any::TypeId), Vec<Box<dyn std::any::Any>>>,
/// Running allocation/transfer counters.
memory_stats: GpuMemoryStats,
/// Transfers deferred until `process_pending_transfers`.
transfer_queue: std::collections::VecDeque<TransferRequest>,
/// Byte alignment used when sizing allocations (backend dependent).
alignment_preference: usize,
/// Maximum number of pooled buffers per (size, type) key.
max_pool_size: usize,
}
/// Counters maintained by [`GpuMemoryManager`].
#[derive(Debug, Clone, Default)]
pub struct GpuMemoryStats {
/// Bytes currently tracked as allocated.
pub total_allocated: usize,
/// High-water mark of `total_allocated`.
pub peak_usage: usize,
/// Number of fresh (non-pooled) allocations performed.
pub allocation_count: u64,
/// Buffer requests satisfied from the pool.
pub pool_hits: u64,
/// Buffer requests that required a fresh allocation.
pub pool_misses: u64,
/// Exponential moving average of transfer bandwidth (bytes/sec).
pub avg_transfer_bandwidth: f64,
}
/// A queued host↔device transfer awaiting processing.
#[derive(Debug)]
struct TransferRequest {
/// Transfer size in bytes.
size: usize,
/// Scheduling priority; higher sorts later (see `Ord` on the enum).
priority: TransferPriority,
/// When the request was enqueued (used as a sort tiebreaker).
timestamp: std::time::Instant,
}
/// Urgency of a host↔device transfer. The derived `Ord` follows variant
/// order, so `Critical > High > Normal > Low`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum TransferPriority {
Low,
Normal,
High,
Critical,
}
/// How a buffer's elements should be laid out for device access.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum MemoryLayout {
/// No special padding.
Standard,
/// Pad to the backend's coalescing alignment.
Coalesced,
/// Pad for a fixed element stride.
Strided { stride: usize },
}
impl GpuMemoryManager {
/// Create a memory manager for `backend`.
///
/// Picks a backend-specific byte alignment for allocations and, when the
/// `gpu` feature is enabled and the backend is not CPU, tries to open a
/// `GpuContext` (failure leaves `context` as `None`).
pub fn new(backend: GpuBackend) -> Result<Self, GpuError> {
// Device construction differs between the gpu and non-gpu builds.
#[cfg(feature = "gpu")]
let device = GpuDevice::new(backend, 0);
#[cfg(not(feature = "gpu"))]
let device = GpuDevice::new(backend)?;
// Per-backend allocation alignment in bytes (all powers of two).
let alignment_preference = match backend {
GpuBackend::Cuda => 128, GpuBackend::OpenCL => 64, GpuBackend::Metal => 16, GpuBackend::Cpu => 8, GpuBackend::Rocm => 64, GpuBackend::Wgpu => 32, #[cfg(not(feature = "gpu"))]
GpuBackend::Vulkan => 32, };
#[cfg(feature = "gpu")]
let context = if device.backend() != GpuBackend::Cpu {
// Context creation failure is tolerated; callers get errors later.
GpuContext::new(device.backend()).ok()
} else {
None
};
Ok(Self {
device,
#[cfg(feature = "gpu")]
context,
buffer_pool: std::collections::HashMap::new(),
memory_stats: GpuMemoryStats::default(),
transfer_queue: std::collections::VecDeque::new(),
alignment_preference,
max_pool_size: 20, })
}
/// Obtain a zero-initialized buffer of at least `size` elements.
///
/// The requested byte size is rounded up to the manager's alignment; a
/// pooled buffer of the exact (aligned size, type) key is reused when
/// available, otherwise a fresh buffer is allocated and the stats
/// updated.
pub fn get_buffer<T>(&mut self, size: usize) -> Result<GpuBuffer<T>, GpuError>
where
T: GpuDataType + Default + 'static,
{
// Align in bytes, then convert back to an element count.
let aligned_size =
self.align_size(size * std::mem::size_of::<T>()) / std::mem::size_of::<T>();
let key = (aligned_size, std::any::TypeId::of::<T>());
// Fast path: reuse a pooled buffer of the same size and type.
if let Some(pool) = self.buffer_pool.get_mut(&key) {
if let Some(buffer) = pool.pop() {
if let Ok(typed_buffer) = buffer.downcast::<GpuBuffer<T>>() {
self.memory_stats.pool_hits += 1;
return Ok(*typed_buffer);
}
}
}
self.memory_stats.pool_misses += 1;
self.memory_stats.allocation_count += 1;
#[cfg(feature = "gpu")]
let buffer = if let Some(ref context) = self.context {
let buffer = context.create_buffer::<T>(aligned_size);
// Zero-fill so pooled reuse never leaks stale data.
let zeros = vec![T::default(); aligned_size];
buffer.copy_from_host(&zeros)?;
buffer
} else {
return Err(GpuError::BackendNotAvailable(
"No GPU context available".to_string(),
));
};
#[cfg(not(feature = "gpu"))]
let buffer = GpuBuffer::from_vec(vec![T::default(); aligned_size]);
let allocation_size = aligned_size * std::mem::size_of::<T>();
self.memory_stats.total_allocated += allocation_size;
if self.memory_stats.total_allocated > self.memory_stats.peak_usage {
self.memory_stats.peak_usage = self.memory_stats.total_allocated;
}
Ok(buffer)
}
/// Obtain a buffer sized for the requested memory `layout`.
///
/// Coalesced layouts pad to the backend's coalescing alignment; strided
/// layouts pad by the size remainder; standard layouts are unpadded.
pub fn get_buffer_with_layout<T>(
&mut self,
size: usize,
layout: MemoryLayout,
) -> Result<GpuBuffer<T>, GpuError>
where
T: GpuDataType + Default + 'static,
{
match layout {
MemoryLayout::Coalesced => {
let coalesced_size = self.calculate_coalesced_size(size);
self.get_buffer::<T>(coalesced_size)
}
MemoryLayout::Strided { stride } => {
// NOTE(review): `size + (size % stride)` does not round up to a
// multiple of `stride` (e.g. size=9, stride=4 gives 10, not 12);
// confirm the intended padding rule.
let strided_size = size + (size % stride);
self.get_buffer::<T>(strided_size)
}
MemoryLayout::Standard => self.get_buffer::<T>(size),
}
}
/// Hand a buffer back for potential reuse.
///
/// The buffer is pooled only when its (element count, type) pool is not
/// full and the allocation exceeds 1 KiB; otherwise it is simply
/// dropped. Accounting is updated either way, and oversized pools may be
/// trimmed afterwards.
pub fn return_buffer<T>(&mut self, buffer: GpuBuffer<T>)
where
    T: GpuDataType + 'static,
{
    let element_count = buffer.len();
    let byte_size = element_count * std::mem::size_of::<T>();
    let pool = self
        .buffer_pool
        .entry((element_count, std::any::TypeId::of::<T>()))
        .or_default();
    // Only pool allocations worth reusing (> 1 KiB) while there is room.
    if pool.len() < self.max_pool_size && byte_size > 1024 {
        pool.push(Box::new(buffer));
    }
    self.memory_stats.total_allocated = self
        .memory_stats
        .total_allocated
        .saturating_sub(byte_size);
    self.cleanup_old_buffers_if_needed();
}
/// Trim pooled buffers when tracked usage exceeds 3/4 of the peak.
///
/// Drops the oldest half of every pool. Uses a single `drain` call per
/// pool instead of the original repeated `remove(0)`, which shifted the
/// whole vector on every removal (O(n²) per pool).
fn cleanup_old_buffers_if_needed(&mut self) {
    if self.memory_stats.total_allocated > self.memory_stats.peak_usage * 3 / 4 {
        for pool in self.buffer_pool.values_mut() {
            let remove_count = pool.len() / 2;
            pool.drain(0..remove_count);
        }
    }
}
/// Upload `host_data` to a new device buffer, routing through a
/// backend-specific transfer helper and recording achieved bandwidth.
pub fn transfer_data_optimized<T>(
&mut self,
host_data: &[T],
_priority: TransferPriority,
) -> Result<GpuBuffer<T>, GpuError>
where
T: GpuDataType + Copy,
{
let transfer_size = std::mem::size_of_val(host_data);
let start_time = std::time::Instant::now();
let buffer = match self.device.backend() {
#[cfg(feature = "gpu")]
GpuBackend::Cuda => {
self.transfer_data_cuda_optimized(host_data, transfer_size, _priority)
}
#[cfg(feature = "gpu")]
GpuBackend::OpenCL => {
self.transfer_data_opencl_optimized(host_data, transfer_size, _priority)
}
#[cfg(feature = "gpu")]
GpuBackend::Metal => {
self.transfer_data_metal_optimized(host_data, transfer_size, _priority)
}
// Remaining backends use a plain context upload (gpu build) or a
// host-side Vec copy (non-gpu build).
_ => {
#[cfg(feature = "gpu")]
{
if let Some(ref context) = self.context {
let buffer = context.create_buffer_from_slice(host_data);
Ok(buffer)
} else {
Err(GpuError::BackendNotAvailable(
"No GPU context available".to_string(),
))
}
}
#[cfg(not(feature = "gpu"))]
Ok(GpuBuffer::from_vec(host_data.to_vec()))
}
}?;
// Fold the observed bandwidth into the running average.
let elapsed = start_time.elapsed().as_secs_f64();
if elapsed > 0.0 {
let bandwidth = transfer_size as f64 / elapsed;
self.update_bandwidth_stats(bandwidth);
}
Ok(buffer)
}
/// CUDA host→device transfer.
///
/// The original tiered dispatch (by priority and by 4 MiB / 64 KiB size
/// thresholds) was dead duplication — every tier performed the identical
/// `create_buffer_from_slice` call — so it has been collapsed to a
/// single path. Reintroduce the tiers when pinned/async staging is
/// actually implemented.
#[cfg(feature = "gpu")]
fn transfer_data_cuda_optimized<T>(
    &self,
    host_data: &[T],
    _transfer_size: usize,
    _priority: TransferPriority,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    match self.context {
        Some(ref context) => Ok(context.create_buffer_from_slice(host_data)),
        None => Err(GpuError::BackendNotAvailable(
            "No GPU context available".to_string(),
        )),
    }
}
/// OpenCL host→device transfer.
///
/// The original 1 MiB size split was dead duplication — both branches
/// performed the identical `create_buffer_from_slice` call — so it has
/// been collapsed to a single path. Reintroduce the split when a mapped
/// or pinned transfer path is actually implemented.
#[cfg(feature = "gpu")]
fn transfer_data_opencl_optimized<T>(
    &self,
    host_data: &[T],
    _transfer_size: usize,
    _priority: TransferPriority,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    match self.context {
        Some(ref context) => Ok(context.create_buffer_from_slice(host_data)),
        None => Err(GpuError::BackendNotAvailable(
            "No GPU context available".to_string(),
        )),
    }
}
/// Metal host→device transfer.
///
/// The original 2 MiB size split was dead duplication — both branches
/// performed the identical `create_buffer_from_slice` call — so it has
/// been collapsed to a single path. Reintroduce the split when a
/// unified-memory fast path is actually implemented.
#[cfg(feature = "gpu")]
fn transfer_data_metal_optimized<T>(
    &self,
    host_data: &[T],
    _transfer_size: usize,
    _priority: TransferPriority,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    match self.context {
        Some(ref context) => Ok(context.create_buffer_from_slice(host_data)),
        None => Err(GpuError::BackendNotAvailable(
            "No GPU context available".to_string(),
        )),
    }
}
/// Run a closure of GPU operations as one batch.
///
/// Pending transfers are flushed after the closure succeeds, and a
/// warning is printed to stderr when the whole batch takes longer than
/// 100 ms.
pub fn batch_operations<F, R>(&mut self, operations: F) -> Result<R, GpuError>
where
    F: FnOnce(&mut Self) -> Result<R, GpuError>,
{
    let started = std::time::Instant::now();
    let outcome = operations(self)?;
    self.process_pending_transfers()?;
    let batch_duration_ms = started.elapsed().as_millis();
    if batch_duration_ms > 100 {
        eprintln!("Warning: GPU batch operation took {batch_duration_ms}ms");
    }
    Ok(outcome)
}
fn process_pending_transfers(&mut self) -> Result<(), GpuError> {
let mut transfers: Vec<_> = self.transfer_queue.drain(..).collect();
transfers.sort_by_key(|t| (t.priority, t.timestamp));
for transfer in transfers {
if transfer.priority >= TransferPriority::High {
} else {
self.transfer_queue.push_back(transfer);
}
}
Ok(())
}
/// Fold one bandwidth sample (bytes/sec) into the running average.
///
/// Uses an exponential moving average with alpha = 0.1; the very first
/// sample seeds the average directly.
fn update_bandwidth_stats(&mut self, bandwidth: f64) {
    const ALPHA: f64 = 0.1;
    let avg = &mut self.memory_stats.avg_transfer_bandwidth;
    *avg = if *avg == 0.0 {
        bandwidth
    } else {
        ALPHA * bandwidth + (1.0 - ALPHA) * *avg
    };
}
/// Borrow the current allocation/transfer counters.
pub fn get_memory_stats(&self) -> &GpuMemoryStats {
&self.memory_stats
}
/// Fraction of buffer requests served from the pool (0.0 when no
/// requests have been made yet).
pub fn get_pool_efficiency(&self) -> f64 {
    let hits = self.memory_stats.pool_hits;
    match hits + self.memory_stats.pool_misses {
        0 => 0.0,
        total => hits as f64 / total as f64,
    }
}
/// Round `size` up to the next multiple of `alignment_preference`.
/// The bitmask trick requires the alignment to be a power of two; every
/// value assigned in `new` (8..128) satisfies this.
fn align_size(&self, size: usize) -> usize {
(size + self.alignment_preference - 1) & !(self.alignment_preference - 1)
}
/// Round an element count up so accesses can coalesce on the backend.
///
/// NOTE(review): the element width is taken as `size_of::<usize>()`
/// even though buffers are generic over `T` — confirm whether the
/// alignment should instead be divided by the actual element size.
fn calculate_coalesced_size(&self, size: usize) -> usize {
match self.device.backend() {
GpuBackend::Cuda => {
// 128-byte transaction size.
let alignment = 128 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
GpuBackend::OpenCL => {
let alignment = 64 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
GpuBackend::Metal => {
let alignment = 16 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
// Host memory needs no coalescing padding.
GpuBackend::Cpu => size,
GpuBackend::Rocm => {
let alignment = 64 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
GpuBackend::Wgpu => {
let alignment = 32 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
#[cfg(not(feature = "gpu"))]
GpuBackend::Vulkan => {
let alignment = 32 / std::mem::size_of::<usize>();
size.div_ceil(alignment) * alignment
}
}
}
}
/// Upload matrix data ahead of use, choosing transfer priority from the
/// expected access pattern: random and strided access prefetch at high
/// priority, sequential and blocked at normal priority.
#[allow(dead_code)]
pub fn prefetch_matrix_data<T>(
    memory_manager: &mut GpuMemoryManager,
    matrix_data: &[T],
    access_pattern: AccessPattern,
) -> Result<GpuBuffer<T>, GpuError>
where
    T: GpuDataType + Copy,
{
    let priority = if matches!(
        access_pattern,
        AccessPattern::Random | AccessPattern::Strided { .. }
    ) {
        TransferPriority::High
    } else {
        TransferPriority::Normal
    };
    memory_manager.transfer_data_optimized(matrix_data, priority)
}
/// Expected device-side access pattern for uploaded data.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum AccessPattern {
/// Contiguous front-to-back reads.
Sequential,
/// Unpredictable scattered reads.
Random,
/// Regular reads every `stride` elements.
Strided { stride: usize },
/// Tile/block-wise reads.
Blocked,
}
/// Pick a [`MemoryStrategy`] from backend, access pattern, and transfer
/// size; falls back to `Standard` when no special case applies.
#[allow(dead_code)]
pub fn optimize_memory_bandwidth(
    backend: GpuBackend,
    data_size: usize,
    access_pattern: AccessPattern,
) -> MemoryStrategy {
    const MIB: usize = 1024 * 1024;
    match backend {
        // Large sequential CUDA streams coalesce well.
        GpuBackend::Cuda if access_pattern == AccessPattern::Sequential && data_size > MIB => {
            MemoryStrategy::Coalesced
        }
        // Random CUDA access benefits from the texture cache.
        GpuBackend::Cuda if access_pattern == AccessPattern::Random => {
            MemoryStrategy::TextureMemory
        }
        GpuBackend::OpenCL if access_pattern == AccessPattern::Blocked => {
            MemoryStrategy::SharedMemory
        }
        GpuBackend::Metal if data_size > 512 * 1024 => MemoryStrategy::SharedMemory,
        _ => MemoryStrategy::Standard,
    }
}
/// Derive a kernel configuration from matrix shape statistics.
///
/// Denser rows (higher average nnz per row) get wider workgroups;
/// vectorization is enabled above 4 nnz/row, and the shared-memory
/// strategy is preferred when more than 512 MiB of device memory is
/// available.
#[allow(dead_code)]
pub fn calculate_adaptive_workgroup_size(
    backend: GpuBackend,
    matrix_rows: usize,
    matrix_nnz: usize,
    available_memory: usize,
) -> GpuKernelConfig {
    // checked_div guards against an empty matrix (matrix_rows == 0).
    let avg_nnz_per_row = matrix_nnz.checked_div(matrix_rows).unwrap_or(0);
    let workgroup_size = match backend {
        GpuBackend::Cuda => match avg_nnz_per_row {
            0..=9 => [128, 1, 1],
            10..=49 => [256, 1, 1],
            _ => [512, 1, 1],
        },
        GpuBackend::OpenCL => match avg_nnz_per_row {
            0..=19 => [64, 1, 1],
            _ => [128, 1, 1],
        },
        GpuBackend::Metal => match avg_nnz_per_row {
            0..=14 => [128, 1, 1],
            _ => [256, 1, 1],
        },
        GpuBackend::Cpu => [1, 1, 1],
        GpuBackend::Rocm => match avg_nnz_per_row {
            0..=9 => [64, 1, 1],
            10..=49 => [128, 1, 1],
            _ => [256, 1, 1],
        },
        GpuBackend::Wgpu => match avg_nnz_per_row {
            0..=19 => [32, 1, 1],
            _ => [64, 1, 1],
        },
        #[cfg(not(feature = "gpu"))]
        GpuBackend::Vulkan => match avg_nnz_per_row {
            0..=19 => [32, 1, 1],
            _ => [64, 1, 1],
        },
    };
    let memory_strategy = if available_memory > 512 * 1024 * 1024 {
        MemoryStrategy::SharedMemory
    } else {
        MemoryStrategy::Coalesced
    };
    GpuKernelConfig {
        workgroup_size,
        compute_units: 0,
        vectorization: avg_nnz_per_row > 4,
        memory_strategy,
    }
}
/// Non-gpu build of [`execute_spmv_kernel`]: always runs the host-side
/// CSR SpMV fallback; `device`, `kernel`, and `config` are ignored.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_spmv_kernel<T>(
_device: &GpuDevice,
_kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
_config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
execute_cpu_spmv_fallback(
rows,
indptr_buffer,
indices_buffer,
data_buffer,
x_buffer,
y_buffer,
)
}
/// Non-gpu build of [`execute_symmetric_spmv_kernel`]: always runs the
/// host-side symmetric fallback; `device`, `kernel`, and `config` are
/// ignored.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_symmetric_spmv_kernel<T>(
_device: &GpuDevice,
_kernel: &GpuKernelHandle,
rows: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
y_buffer: &GpuBuffer<T>,
_config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
execute_cpu_symmetric_spmv_fallback(
rows,
indptr_buffer,
indices_buffer,
data_buffer,
x_buffer,
y_buffer,
)
}
/// Non-gpu build of [`execute_triangular_solve_kernel`]: always runs the
/// host-side forward-substitution fallback; `device`, `kernel`, and
/// `config` are ignored.
#[cfg(not(feature = "gpu"))]
#[allow(dead_code)]
#[allow(unused_variables)]
#[allow(clippy::too_many_arguments)]
pub fn execute_triangular_solve_kernel<T>(
_device: &GpuDevice,
_kernel: &GpuKernelHandle,
n: usize,
indptr_buffer: &GpuBuffer<u32>,
indices_buffer: &GpuBuffer<u32>,
data_buffer: &GpuBuffer<T>,
b_buffer: &GpuBuffer<T>,
x_buffer: &GpuBuffer<T>,
_config: &GpuKernelConfig,
) -> Result<(), GpuError>
where
T: Float + SparseElement + Debug + Copy + 'static + GpuDataType,
{
execute_cpu_triangular_solve_fallback(
n,
indptr_buffer,
indices_buffer,
data_buffer,
b_buffer,
x_buffer,
)
}
/// Records per-operation GPU timings and derives tuning recommendations.
pub struct GpuPerformanceProfiler {
/// Backend the recommendations are phrased for.
backend: GpuBackend,
/// Recent elapsed-time samples (milliseconds) keyed by operation name.
timing_data: std::collections::HashMap<String, Vec<f64>>,
}
impl GpuPerformanceProfiler {
/// Create an empty profiler for `backend`.
pub fn new(backend: GpuBackend) -> Self {
Self {
backend,
timing_data: std::collections::HashMap::new(),
}
}
/// Run `operation`, record its elapsed time (milliseconds) under
/// `operationname`, and return its result. Only the 100 most recent
/// samples per operation are kept.
pub fn profile_operation<F, R>(
    &mut self,
    operationname: &str,
    operation: F,
) -> Result<R, GpuError>
where
    F: FnOnce() -> Result<R, GpuError>,
{
    let clock = std::time::Instant::now();
    let outcome = operation()?;
    let elapsed_ms = clock.elapsed().as_secs_f64() * 1000.0;
    let samples = self
        .timing_data
        .entry(operationname.to_string())
        .or_default();
    samples.push(elapsed_ms);
    // Bound the history to the most recent 100 samples.
    while samples.len() > 100 {
        samples.remove(0);
    }
    Ok(outcome)
}
/// Mean recorded time (milliseconds) for `operationname`, or `None` when
/// no samples exist.
pub fn get_average_time(&self, operationname: &str) -> Option<f64> {
    let samples = self.timing_data.get(operationname)?;
    if samples.is_empty() {
        return None;
    }
    Some(samples.iter().sum::<f64>() / samples.len() as f64)
}
/// Generate human-readable tuning suggestions from the recorded timings.
///
/// Thresholds (milliseconds) and wording are backend-specific; a final
/// backend-independent check flags any operation averaging over 100 ms.
/// The "variance" compared against is actually the standard deviation
/// (square root of the mean squared deviation).
pub fn get_recommendations(&self) -> Vec<String> {
let mut recommendations = Vec::new();
for (operation, timings) in &self.timing_data {
if let Some(avg_time) = self.get_average_time(operation) {
// Standard deviation of the samples (0 with fewer than 2 samples).
let variance = if timings.len() > 1 {
let mean = avg_time;
let var = timings.iter().map(|&t| (t - mean).powi(2)).sum::<f64>()
/ timings.len() as f64;
var.sqrt()
} else {
0.0
};
match self.backend {
GpuBackend::Cuda => {
if avg_time > 10.0 && operation.contains("spmv") {
recommendations.push(format!(
"Consider using larger workgroup sizes for {operation} (current avg: {avg_time:.2}ms, variance: {variance:.2})"
));
}
if variance > avg_time * 0.5 {
recommendations.push(format!(
"High timing variance for {operation} suggests memory bandwidth bottleneck"
));
}
}
GpuBackend::OpenCL => {
if avg_time > 15.0 {
recommendations.push(format!(
"OpenCL performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
));
}
if variance > 5.0 {
recommendations.push(format!(
"Consider using local memory optimization for {operation} to reduce timing variance"
));
}
}
GpuBackend::Metal => {
if avg_time > 8.0 && operation.contains("triangular") {
recommendations.push(format!(
"Metal triangular solve {operation} may benefit from simdgroup optimization (current avg: {avg_time:.2}ms)"
));
}
if operation.contains("spmv") && avg_time > 5.0 {
recommendations.push(format!(
"Consider using Metal's unified memory architecture for {operation} optimization"
));
}
}
GpuBackend::Cpu => {
if avg_time > 50.0 {
recommendations.push(format!(
"Consider enabling GPU acceleration for {operation} (CPU avg: {avg_time:.2}ms)"
));
}
if variance > 20.0 {
recommendations.push(format!(
"High CPU timing variance for {operation} suggests CPU scheduling issues"
));
}
}
GpuBackend::Rocm => {
if avg_time > 12.0 {
recommendations.push(format!(
"ROCm performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
));
}
}
GpuBackend::Wgpu => {
if avg_time > 15.0 {
recommendations.push(format!(
"WebGPU performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
));
} else if avg_time > 12.0 {
recommendations.push(format!(
"WebGPU performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
));
}
}
#[cfg(not(feature = "gpu"))]
GpuBackend::Vulkan => {
if avg_time > 15.0 {
recommendations.push(format!(
"Vulkan performance for {operation} could be improved with buffer optimization (current avg: {avg_time:.2}ms)"
));
} else if avg_time > 12.0 {
recommendations.push(format!(
"Vulkan performance for {operation} could be improved with memory optimization (current avg: {avg_time:.2}ms)"
));
}
}
}
// Backend-independent slow-operation warning.
if avg_time > 100.0 {
recommendations.push(format!(
"Operation {operation} is taking very long ({avg_time:.2}ms) - consider algorithm optimization"
));
}
}
}
recommendations
}
/// Summarize the recorded samples for `operationname`, or `None` when no
/// samples exist. The `variance` field holds the standard deviation and
/// `throughput` is calls per second (1000 / mean milliseconds).
pub fn get_operation_metrics(&self, operationname: &str) -> Option<OperationMetrics> {
    let timings = self.timing_data.get(operationname)?;
    if timings.is_empty() {
        return None;
    }
    let count = timings.len();
    let total_time: f64 = timings.iter().sum();
    let avg_time = total_time / count as f64;
    let min_time = timings.iter().copied().fold(f64::INFINITY, f64::min);
    let max_time = timings.iter().copied().fold(f64::NEG_INFINITY, f64::max);
    // Population standard deviation; zero with a single sample.
    let std_dev = if count > 1 {
        (timings.iter().map(|&t| (t - avg_time).powi(2)).sum::<f64>() / count as f64).sqrt()
    } else {
        0.0
    };
    Some(OperationMetrics {
        operationname: operationname.to_string(),
        call_count: count,
        total_time,
        avg_time,
        min_time,
        max_time,
        variance: std_dev,
        throughput: 1000.0 / avg_time,
    })
}
/// Discard all recorded timing samples.
pub fn reset_metrics(&mut self) {
self.timing_data.clear();
}
/// Collect summary metrics for every profiled operation (order follows
/// the underlying hash map and is unspecified).
pub fn export_metrics(&self) -> Vec<OperationMetrics> {
    let mut metrics = Vec::with_capacity(self.timing_data.len());
    for name in self.timing_data.keys() {
        if let Some(summary) = self.get_operation_metrics(name) {
            metrics.push(summary);
        }
    }
    metrics
}
}
/// Summary statistics for one profiled operation (all times in
/// milliseconds).
#[derive(Debug, Clone)]
pub struct OperationMetrics {
/// Operation name the samples were recorded under.
pub operationname: String,
/// Number of recorded samples.
pub call_count: usize,
/// Sum of all sample times.
pub total_time: f64,
/// Mean sample time.
pub avg_time: f64,
/// Fastest sample.
pub min_time: f64,
/// Slowest sample.
pub max_time: f64,
/// Standard deviation of the samples (despite the field name).
pub variance: f64,
/// Calls per second implied by the mean time.
pub throughput: f64, }
#[cfg(test)]
mod tests {
use super::*;
// Launch dimensions must cover the problem and be workgroup-aligned.
#[test]
fn test_calculate_optimal_dimensions() {
let (global, local) = calculate_optimal_dimensions(
GpuBackend::Cuda,
1000, [256, 1, 1], 0, );
assert_eq!(local[0], 256);
assert!(global[0] >= 1000);
assert_eq!(global[0] % local[0], 0);
}
// Dense-ish matrix with plenty of memory should enable vectorization
// and pick the shared-memory strategy.
#[test]
fn test_adaptive_workgroup_config() {
let config = calculate_adaptive_workgroup_size(
GpuBackend::Cuda,
10000, 50000, 1024 * 1024 * 1024, );
assert!(config.workgroup_size[0] > 0);
assert!(config.vectorization); assert_eq!(config.memory_strategy, MemoryStrategy::SharedMemory);
}
// Default config values must stay stable; other code relies on them.
#[test]
fn test_gpu_kernel_config_default() {
let config = GpuKernelConfig::default();
assert_eq!(config.workgroup_size, [256, 1, 1]);
assert_eq!(config.compute_units, 0);
assert!(config.vectorization);
assert_eq!(config.memory_strategy, MemoryStrategy::Coalesced);
}
}