use crate::error::{LinalgError, LinalgResult};
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2};
use scirs2_core::numeric::{Float, NumAssign, Zero};
use std::fmt::Debug;
pub mod acceleration;
pub mod advanced;
pub mod backends;
pub mod device_info;
pub mod memory;
pub mod operations;
pub use operations::{
AdvancedGpuOperations, BatchSizeOptimizer, GpuOperationDispatcher, DEFAULT_GPU_THRESHOLD,
};
pub use acceleration::{
get_global_gpu_framework, initialize_global_gpu_acceleration, AccelerationConfig,
GpuAccelerationFramework, GpuPerformanceProfiler,
};
/// GPU compute backends/APIs a device may be driven through.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuDeviceType {
    /// NVIDIA CUDA.
    Cuda,
    /// Khronos OpenCL.
    OpenCl,
    /// AMD ROCm.
    Rocm,
    /// Vulkan compute.
    Vulkan,
    /// Apple Metal.
    Metal,
    /// Intel oneAPI.
    OneApi,
    /// WebGPU.
    WebGpu,
}
/// Static capability/description record for a single GPU device,
/// as reported by the owning backend.
#[derive(Debug, Clone)]
pub struct GpuDeviceInfo {
    /// Backend/API this device is driven through.
    pub device_type: GpuDeviceType,
    /// Device name as reported by the driver.
    pub name: String,
    /// Total device memory in bytes (compared against byte sizes in
    /// `GpuManager::recommend_gpu_usage`).
    pub total_memory: usize,
    /// Number of compute units (SMs / CUs — terminology is backend-dependent).
    pub compute_units: u32,
    /// Core clock frequency (units backend-dependent — presumably MHz; TODO confirm).
    pub clock_frequency: u32,
    /// Whether double-precision (fp64) arithmetic is supported.
    pub supports_fp64: bool,
    /// Whether half-precision (fp16) arithmetic is supported.
    pub supports_fp16: bool,
    /// Maximum work-group / thread-block size.
    pub max_work_groupsize: usize,
    /// Peak memory bandwidth (units unspecified — presumably GB/s; TODO confirm).
    pub memory_bandwidth: f64,
    /// L2 cache size in bytes.
    pub l2_cachesize: usize,
    /// Shared/local memory available per block, in bytes.
    pub shared_memory_per_block: usize,
    /// Registers available per block.
    pub registers_per_block: u32,
    /// Warp / wavefront size.
    pub warpsize: u32,
    /// Maximum resident threads per multiprocessor.
    pub max_threads_per_mp: u32,
    /// Number of multiprocessors on the device.
    pub multiprocessor_count: u32,
    /// Whether dedicated matrix (tensor) cores are present.
    pub supports_tensor_cores: bool,
    /// Whether mixed-precision execution is supported.
    pub supports_mixed_precision: bool,
    /// Device vendor string.
    pub vendor: String,
}
/// Measured/derived performance characteristics for one device,
/// intended for tuning kernel launch parameters per operation.
#[derive(Debug, Clone)]
pub struct GpuPerformanceProfile {
    /// Peak single-precision throughput (FLOP/s).
    pub peak_flops_sp: f64,
    /// Peak double-precision throughput (FLOP/s).
    pub peak_flops_dp: f64,
    /// Achieved fraction of peak memory bandwidth (presumably 0.0–1.0; confirm with producers).
    pub memory_efficiency: f64,
    /// Achieved fraction of peak compute (presumably 0.0–1.0; confirm with producers).
    pub compute_efficiency: f64,
    /// Best-known work-group size keyed by operation kind.
    pub optimal_work_groupsizes: std::collections::HashMap<GpuOperation, usize>,
    /// Occupancy level the tuner aims for.
    pub target_occupancy: f64,
}
/// Categories of GPU operations; used as keys for per-operation tuning
/// (see [`GpuPerformanceProfile::optimal_work_groupsizes`]) and for
/// device-selection hints.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuOperation {
    MatrixMultiplication,
    ElementWise,
    Reduction,
    Transpose,
    Decomposition,
    IterativeSolver,
    FFT,
    Convolution,
}
/// How host/device memory placement and movement is organized.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuMemoryStrategy {
    /// Data kept resident on the GPU.
    GpuOnly,
    /// Unified/managed memory shared between host and device.
    Unified,
    /// Explicit host<->device copies.
    Explicit,
    /// Strategy chosen at runtime.
    Adaptive,
    /// Data streamed through the device in chunks.
    Streaming,
}
/// Numeric precision used for GPU computation.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuPrecisionMode {
    /// 32-bit floating point.
    Single,
    /// 64-bit floating point.
    Double,
    /// 16-bit floating point.
    Half,
    /// Mixed precision (e.g. low-precision compute with higher-precision accumulation).
    Mixed,
    /// Tensor-core accelerated paths.
    TensorCore,
}
/// Tunable policy controlling when and how GPU acceleration is applied.
///
/// Consumed by [`GpuManager::recommend_gpu_usage`].
#[derive(Debug, Clone)]
pub struct GpuAccelerationStrategy {
    /// Minimum matrix dimension before offloading to a GPU is considered.
    pub minsize_threshold: usize,
    /// Largest matrix dimension handled on a single GPU; above this,
    /// multi-GPU partitioning is recommended (when enabled).
    pub max_single_gpusize: usize,
    /// Host/device memory management approach.
    pub memory_strategy: GpuMemoryStrategy,
    /// Numeric precision for GPU kernels.
    pub precision_mode: GpuPrecisionMode,
    /// Allow splitting work across multiple GPUs.
    pub multi_gpu_enabled: bool,
    /// Overlap kernel execution with host<->device transfers.
    pub overlap_compute_transfer: bool,
    /// Use asynchronous streams/queues.
    pub use_streams: bool,
    /// Number of streams when `use_streams` is set.
    pub num_streams: usize,
}
impl Default for GpuAccelerationStrategy {
fn default() -> Self {
Self {
minsize_threshold: 512,
max_single_gpusize: 50000,
memory_strategy: GpuMemoryStrategy::Adaptive,
precision_mode: GpuPrecisionMode::Double,
multi_gpu_enabled: true,
overlap_compute_transfer: true,
use_streams: true,
num_streams: 4,
}
}
}
/// A typed, device-resident buffer wrapping a backend-specific allocation.
pub trait GpuBuffer<T>: Send + Sync + std::fmt::Debug {
    /// Number of `T` elements the buffer holds.
    fn len(&self) -> usize;
    /// True when the buffer holds no elements.
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
    /// Copies `data` from host memory into the buffer.
    fn copy_from_host(&mut self, data: &[T]) -> LinalgResult<()>;
    /// Copies the buffer's contents into host memory `data`.
    fn copy_to_host(&self, data: &mut [T]) -> LinalgResult<()>;
    /// Raw device pointer, for handing to backend APIs.
    fn device_ptr(&self) -> *mut std::ffi::c_void;
}
/// Execution context bound to a single GPU device.
pub trait GpuContext: Send + Sync + std::fmt::Debug {
    /// Static information about the underlying device.
    fn device_info(&self) -> &GpuDeviceInfo;
    /// Blocks until all queued device work has completed.
    fn synchronize(&self) -> LinalgResult<()>;
    /// Currently available device memory in bytes.
    fn available_memory(&self) -> LinalgResult<usize>;
    /// Total device memory in bytes (taken from `device_info`).
    fn total_memory(&self) -> usize {
        self.device_info().total_memory
    }
}
/// Extension of [`GpuContext`] that can allocate typed device buffers.
///
/// Kept separate so `GpuContext` itself stays object-safe (this method
/// is generic over `T` and would otherwise forbid `dyn GpuContext`).
pub trait GpuContextAlloc: GpuContext {
    /// Allocates a device buffer with room for `size` elements of `T`.
    fn allocate_buffer<T: Clone + Send + Sync + Copy + 'static + std::fmt::Debug>(
        &self,
        size: usize,
    ) -> LinalgResult<Box<dyn GpuBuffer<T>>>;
}
/// Low-level BLAS/LAPACK-style operations executed on device buffers.
///
/// Dimensions refer to logical matrix shapes; buffers must be large
/// enough to hold them. Storage layout is implementation-defined.
pub trait GpuLinearAlgebra<T>: Send + Sync
where
    T: Float + NumAssign + Zero + Send + Sync + Clone + Debug,
{
    /// General matrix multiply, BLAS GEMM convention:
    /// `c = alpha * a * b + beta * c`, with `a` m×k, `b` k×n, `c` m×n.
    fn gemm(
        &self,
        a: &dyn GpuBuffer<T>,
        b: &dyn GpuBuffer<T>,
        c: &mut dyn GpuBuffer<T>,
        m: usize,
        n: usize,
        k: usize,
        alpha: T,
        beta: T,
    ) -> LinalgResult<()>;
    /// General matrix-vector multiply, BLAS GEMV convention:
    /// `y = alpha * a * x + beta * y`, with `a` m×n.
    fn gemv(
        &self,
        a: &dyn GpuBuffer<T>,
        x: &dyn GpuBuffer<T>,
        y: &mut dyn GpuBuffer<T>,
        m: usize,
        n: usize,
        alpha: T,
        beta: T,
    ) -> LinalgResult<()>;
    /// Element-wise addition over the first `size` elements.
    fn elementwise_add(
        &self,
        a: &dyn GpuBuffer<T>,
        b: &dyn GpuBuffer<T>,
        result: &mut dyn GpuBuffer<T>,
        size: usize,
    ) -> LinalgResult<()>;
    /// Dot product over the first `size` elements of `a` and `b`.
    fn dot(&self, a: &dyn GpuBuffer<T>, b: &dyn GpuBuffer<T>, size: usize) -> LinalgResult<T>;
    /// Transposes a `rows`×`cols` matrix from `input` into `output`.
    fn transpose(
        &self,
        input: &dyn GpuBuffer<T>,
        output: &mut dyn GpuBuffer<T>,
        rows: usize,
        cols: usize,
    ) -> LinalgResult<()>;
    /// In-place Cholesky factorization of an n×n matrix.
    fn cholesky(&self, matrix: &mut dyn GpuBuffer<T>, n: usize) -> LinalgResult<()>;
    /// In-place LU factorization; pivot indices are written to `pivots`.
    fn lu_decomposition(
        &self,
        matrix: &mut dyn GpuBuffer<T>,
        pivots: &mut dyn GpuBuffer<i32>,
        n: usize,
    ) -> LinalgResult<()>;
    /// QR factorization of an m×n `matrix` into `q` and `r`.
    fn qr_decomposition(
        &self,
        matrix: &mut dyn GpuBuffer<T>,
        q: &mut dyn GpuBuffer<T>,
        r: &mut dyn GpuBuffer<T>,
        m: usize,
        n: usize,
    ) -> LinalgResult<()>;
    /// Eigenvalues of an n×n `matrix`, written to `eigenvals`.
    fn eigenvalues(
        &self,
        matrix: &dyn GpuBuffer<T>,
        eigenvals: &mut dyn GpuBuffer<T>,
        n: usize,
    ) -> LinalgResult<()>;
}
/// Registry of GPU device contexts plus the policy used to decide
/// where an operation should run.
pub struct GpuManager {
    /// Registered device contexts, indexed by registration order.
    devices: Vec<Box<dyn GpuContext>>,
    /// Current offload policy.
    strategy: GpuAccelerationStrategy,
    /// Per-device tuning profiles keyed by device index (not consumed yet).
    #[allow(dead_code)]
    performance_profiles: std::collections::HashMap<usize, GpuPerformanceProfile>,
}
impl GpuManager {
    /// Creates an empty manager with the default acceleration strategy
    /// and no registered devices.
    pub fn new() -> Self {
        Self {
            devices: Vec::new(),
            strategy: GpuAccelerationStrategy::default(),
            performance_profiles: std::collections::HashMap::new(),
        }
    }

    /// Registers a GPU device context with the manager.
    pub fn add_device(&mut self, device: Box<dyn GpuContext>) {
        self.devices.push(device);
    }

    /// Number of registered devices.
    pub fn device_count(&self) -> usize {
        self.devices.len()
    }

    /// Static device information for `device_id`, or `None` if out of range.
    pub fn device_info(&self, device_id: usize) -> Option<&GpuDeviceInfo> {
        self.devices.get(device_id).map(|d| d.device_info())
    }

    /// Replaces the current acceleration strategy.
    pub fn set_strategy(&mut self, strategy: GpuAccelerationStrategy) {
        self.strategy = strategy;
    }

    /// Recommends where to run `operation` for a square `matrixsize` ×
    /// `matrixsize` problem whose elements occupy `data_typesize` bytes.
    ///
    /// Returns `UseCpu` when the matrix is below the strategy threshold or
    /// no device has enough total memory; otherwise picks a single GPU, or
    /// a row-wise multi-GPU partition for problems above
    /// `max_single_gpusize` (when multi-GPU is enabled).
    pub fn recommend_gpu_usage(
        &self,
        operation: GpuOperation,
        matrixsize: usize,
        data_typesize: usize,
    ) -> GpuRecommendation {
        // Saturating math so a huge `matrixsize` cannot overflow on 32-bit
        // targets and wrap into a tiny value that passes the memory filter.
        let memory_required = matrixsize
            .saturating_mul(matrixsize)
            .saturating_mul(data_typesize);
        if matrixsize < self.strategy.minsize_threshold {
            return GpuRecommendation::UseCpu {
                reason: "Matrix too small for GPU acceleration".to_string(),
            };
        }
        // NOTE(review): this compares against *total* device memory, not the
        // currently available amount — a busy device may still be selected.
        let suitable_devices: Vec<usize> = self
            .devices
            .iter()
            .enumerate()
            .filter(|(_, device)| device.device_info().total_memory > memory_required)
            .map(|(idx, _)| idx)
            .collect();
        if suitable_devices.is_empty() {
            return GpuRecommendation::UseCpu {
                reason: "No GPU with sufficient memory".to_string(),
            };
        }
        let best_device = self.select_best_device(&suitable_devices, operation);
        if matrixsize > self.strategy.max_single_gpusize && self.strategy.multi_gpu_enabled {
            GpuRecommendation::UseMultiGpu {
                devices: suitable_devices,
                primary_device: best_device,
                partition_strategy: MultiGpuPartition::RowWise,
            }
        } else {
            GpuRecommendation::UseSingleGpu {
                device_id: best_device,
                memory_strategy: self.strategy.memory_strategy,
                precision_mode: self.strategy.precision_mode,
            }
        }
    }

    /// Picks the candidate device with the most total memory.
    ///
    /// `_operation` is currently unused (leading underscore silences the
    /// unused-parameter warning); it is kept so the selection can later
    /// consult per-operation performance profiles without an API change.
    fn select_best_device(&self, candidates: &[usize], _operation: GpuOperation) -> usize {
        candidates
            .iter()
            .max_by_key(|&&device_id| self.devices[device_id].device_info().total_memory)
            .copied()
            .unwrap_or(0)
    }

    /// Returns per-device statistics.
    ///
    /// NOTE(review): all rate/utilization fields are placeholder zeros
    /// until real telemetry is wired in.
    pub fn get_performance_stats(&self) -> Vec<GpuPerformanceStats> {
        self.devices
            .iter()
            .enumerate()
            .map(|(idx, device)| GpuPerformanceStats {
                device_id: idx,
                device_info: device.device_info().clone(),
                operations_per_second: 0.0,
                memory_utilization: 0.0,
                compute_utilization: 0.0,
                power_consumption: 0.0,
            })
            .collect()
    }
}
/// Outcome of [`GpuManager::recommend_gpu_usage`].
#[derive(Debug, Clone)]
pub enum GpuRecommendation {
    /// Run on the CPU; `reason` explains why the GPU was rejected.
    UseCpu { reason: String },
    /// Run on a single GPU with the given policies.
    UseSingleGpu {
        device_id: usize,
        memory_strategy: GpuMemoryStrategy,
        precision_mode: GpuPrecisionMode,
    },
    /// Split the work across several GPUs.
    UseMultiGpu {
        /// All device indices with sufficient memory.
        devices: Vec<usize>,
        /// Preferred device among `devices`.
        primary_device: usize,
        /// How the data is partitioned across devices.
        partition_strategy: MultiGpuPartition,
    },
}
/// Data-partitioning schemes for multi-GPU execution.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MultiGpuPartition {
    /// Split by rows.
    RowWise,
    /// Split by columns.
    ColumnWise,
    /// Split into two-dimensional blocks.
    Block2D,
    /// Full copy on every device.
    Replicated,
}
/// Snapshot of runtime statistics for one device.
///
/// NOTE(review): the current producer (`GpuManager::get_performance_stats`)
/// fills all rate/utilization fields with zero placeholders.
#[derive(Debug, Clone)]
pub struct GpuPerformanceStats {
    /// Index of the device within the manager.
    pub device_id: usize,
    /// Static device description.
    pub device_info: GpuDeviceInfo,
    /// Throughput in operations per second.
    pub operations_per_second: f64,
    /// Memory utilization fraction.
    pub memory_utilization: f64,
    /// Compute utilization fraction.
    pub compute_utilization: f64,
    /// Power consumption (units unspecified — TODO confirm once populated).
    pub power_consumption: f64,
}
impl Default for GpuManager {
fn default() -> Self {
Self::new()
}
}
/// High-level, ndarray-based GPU operations.
///
/// Unlike [`GpuLinearAlgebra`], these take host-side array views and
/// return owned host arrays; transfers are the implementation's concern.
pub trait GpuLinalgOps<T>: Send + Sync
where
    T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
    /// Matrix-vector product `a * x`.
    fn gpu_matvec(
        &self,
        ctx: &dyn GpuContext,
        a: &ArrayView2<T>,
        x: &ArrayView1<T>,
    ) -> LinalgResult<Array1<T>>;
    /// Matrix-matrix product `a * b`.
    fn gpu_matmul(
        &self,
        ctx: &dyn GpuContext,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
    ) -> LinalgResult<Array2<T>>;
    /// Dot product of two vectors.
    fn gpu_dot(
        &self,
        ctx: &dyn GpuContext,
        x: &ArrayView1<T>,
        y: &ArrayView1<T>,
    ) -> LinalgResult<T>;
    /// Vector norm (presumably Euclidean — confirm with implementations).
    fn gpu_norm(&self, ctx: &dyn GpuContext, x: &ArrayView1<T>) -> LinalgResult<T>;
    /// Element-wise matrix addition.
    fn gpu_elementwise_add(
        &self,
        ctx: &dyn GpuContext,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
    ) -> LinalgResult<Array2<T>>;
    /// Element-wise (Hadamard) matrix multiplication.
    fn gpu_elementwise_mul(
        &self,
        ctx: &dyn GpuContext,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
    ) -> LinalgResult<Array2<T>>;
}
/// A GPU backend (CUDA, OpenCL, ...) that can enumerate devices and
/// create execution contexts on them.
pub trait GpuBackend: Send + Sync {
    /// Backend name used for lookup (see `GpuBackendManager::get_backend`).
    fn name(&self) -> &str;
    /// Whether this backend's runtime is usable on the current machine.
    fn is_available(&self) -> bool;
    /// Enumerates the devices this backend can see.
    fn list_devices(&self) -> LinalgResult<Vec<GpuDeviceInfo>>;
    /// Creates a context on the device with the given index.
    fn create_context(&self, deviceid: usize) -> LinalgResult<Box<dyn GpuContext>>;
    /// Creates a context on the device with the most total memory.
    ///
    /// # Errors
    /// Returns `ComputationError` when no devices are available, and
    /// propagates any enumeration or context-creation failure.
    fn create_best_context(&self) -> LinalgResult<Box<dyn GpuContext>> {
        let devices = self.list_devices()?;
        // `max_by_key` yields `None` only for an empty device list, so the
        // previous `expect("Operation failed")` panic path is replaced with
        // the same proper error the empty check used to return.
        let best_device = devices
            .iter()
            .enumerate()
            .max_by_key(|(_, device)| device.total_memory)
            .map(|(idx, _)| idx)
            .ok_or_else(|| {
                LinalgError::ComputationError("No GPU devices available".to_string())
            })?;
        self.create_context(best_device)
    }
}
/// Collects the GPU backends that are actually available at runtime.
#[derive(Default)]
pub struct GpuBackendManager {
    /// Registered (available) backends, in registration order.
    backends: Vec<Box<dyn GpuBackend>>,
}
impl GpuBackendManager {
pub fn new() -> Self {
Self {
backends: Vec::new(),
}
}
pub fn register_backend(&mut self, backend: Box<dyn GpuBackend>) {
if backend.is_available() {
self.backends.push(backend);
}
}
pub fn available_backends(&self) -> &[Box<dyn GpuBackend>] {
&self.backends
}
pub fn get_backend(&self, name: &str) -> Option<&dyn GpuBackend> {
self.backends
.iter()
.find(|backend| backend.name() == name)
.map(|b| b.as_ref())
}
pub fn create_best_context(&self) -> LinalgResult<Box<dyn GpuContext>> {
if self.backends.is_empty() {
return Err(LinalgError::ComputationError(
"No GPU backends available".to_string(),
));
}
for backend in &self.backends {
if let Ok(context) = backend.create_best_context() {
return Ok(context);
}
}
Err(LinalgError::ComputationError(
"Failed to create GPU context with any backend".to_string(),
))
}
pub fn list_all_devices(&self) -> LinalgResult<Vec<(String, Vec<GpuDeviceInfo>)>> {
let mut all_devices = Vec::new();
for backend in &self.backends {
let devices = backend.list_devices()?;
all_devices.push((backend.name().to_string(), devices));
}
Ok(all_devices)
}
}
/// Builds a [`GpuBackendManager`] populated with every compiled-in
/// backend that constructs successfully.
///
/// Each backend is gated behind its Cargo feature (`cuda`, `opencl`,
/// `rocm`, `metal`); a backend whose constructor fails is skipped
/// silently (best-effort discovery). With no GPU features enabled this
/// returns an empty manager, never an error.
#[allow(dead_code)]
pub fn initialize_gpu_manager() -> LinalgResult<GpuBackendManager> {
    let mut manager = GpuBackendManager::new();
    #[cfg(feature = "cuda")]
    {
        if let Ok(cuda_backend) = backends::cuda::CudaBackend::new() {
            manager.register_backend(Box::new(cuda_backend));
        }
    }
    #[cfg(feature = "opencl")]
    {
        if let Ok(opencl_backend) = backends::opencl::OpenClBackend::new() {
            manager.register_backend(Box::new(opencl_backend));
        }
    }
    #[cfg(feature = "rocm")]
    {
        if let Ok(rocm_backend) = backends::rocm::RocmBackend::new() {
            manager.register_backend(Box::new(rocm_backend));
        }
    }
    #[cfg(feature = "metal")]
    {
        if let Ok(metal_backend) = backends::metal::MetalBackend::new() {
            manager.register_backend(Box::new(metal_backend));
        }
    }
    Ok(manager)
}
/// Simple offload heuristic: use the GPU only when a context exists and
/// the element count strictly exceeds `threshold`.
#[allow(dead_code)]
pub fn should_use_gpu(
    matrix_elements: usize,
    threshold: usize,
    gpu_context: Option<&dyn GpuContext>,
) -> bool {
    match gpu_context {
        Some(_) => matrix_elements > threshold,
        None => false,
    }
}
/// Operations that transparently choose CPU or GPU execution per call,
/// based on whether a `gpu_context` is supplied (and implementation-defined
/// size heuristics).
pub trait AutoGpuSelector<T>
where
    T: Float + NumAssign + Zero + Send + Sync + Debug + 'static,
{
    /// Matrix-vector product with automatic CPU/GPU selection.
    fn auto_matvec(
        &self,
        a: &ArrayView2<T>,
        x: &ArrayView1<T>,
        gpu_context: Option<&dyn GpuContext>,
    ) -> LinalgResult<Array1<T>>;
    /// Matrix-matrix product with automatic CPU/GPU selection.
    fn auto_matmul(
        &self,
        a: &ArrayView2<T>,
        b: &ArrayView2<T>,
        gpu_context: Option<&dyn GpuContext>,
    ) -> LinalgResult<Array2<T>>;
}
/// Default element-count threshold above which matrix-vector products prefer the GPU.
pub const DEFAULT_GPU_THRESHOLD_MATVEC: usize = 10_000;
/// Default element-count threshold above which matrix-matrix products prefer the GPU.
pub const DEFAULT_GPU_THRESHOLD_MATMUL: usize = 100_000;
#[cfg(test)]
mod tests {
    use super::*;
    /// A fresh manager starts with no registered backends.
    #[test]
    fn test_gpu_manager_creation() {
        let manager = GpuBackendManager::new();
        assert_eq!(manager.available_backends().len(), 0);
    }
    /// Without a GPU context the heuristic always declines — both below
    /// and above the threshold.
    #[test]
    fn test_should_use_gpu_threshold() {
        assert!(!should_use_gpu(100, 1000, None));
        assert!(!should_use_gpu(2000, 1000, None));
    }
    /// Derived `PartialEq` on the device-type enum distinguishes variants.
    #[test]
    fn test_gpu_device_type_equality() {
        assert_eq!(GpuDeviceType::Cuda, GpuDeviceType::Cuda);
        assert_ne!(GpuDeviceType::Cuda, GpuDeviceType::OpenCl);
    }
}