use crate::error::{IoError, Result};
use scirs2_core::gpu::{GpuBackend, GpuDevice, GpuDeviceInfo, GpuError};
use scirs2_core::simd_ops::PlatformCapabilities;
/// Smallest reported total memory (512 MiB) a CUDA device may have and still pass validation.
const CUDA_MIN_TOTAL_MEMORY_BYTES: u64 = 512 << 20;
/// Smallest reported total memory (256 MiB) a Metal device may have and still pass validation.
const METAL_MIN_TOTAL_MEMORY_BYTES: u64 = 256 << 20;
/// Smallest reported total memory (128 MiB) an OpenCL device may have and still pass validation.
const OPENCL_MIN_TOTAL_MEMORY_BYTES: u64 = 128 << 20;
/// Smallest reported maximum work-group size accepted for CUDA devices.
const CUDA_MIN_WORK_GROUP_SIZE: u32 = 32;
/// Smallest reported maximum work-group size accepted for Metal devices.
const METAL_MIN_WORK_GROUP_SIZE: u32 = 32;
/// Smallest reported maximum work-group size accepted for OpenCL devices.
const OPENCL_MIN_WORK_GROUP_SIZE: u32 = 32;
/// Number of bytes in one GiB, as a float, for converting byte counts to GiB.
const BYTES_PER_GIB: f64 = (1u64 << 30) as f64;
/// I/O processor bound to a single compute device.
///
/// The constructors in this file always open device index 0 of the chosen
/// backend; the backend may be `GpuBackend::Cpu` when no GPU backend is usable.
#[derive(Debug)]
pub struct GpuIoProcessor {
/// Device handle created with `GpuDevice::new(backend, 0)`.
pub device: GpuDevice,
/// Platform capability snapshot obtained from `PlatformCapabilities::detect()`
/// when the processor was constructed.
pub capabilities: PlatformCapabilities,
}
impl GpuIoProcessor {
/// Creates a processor on the backend chosen by [`Self::detect_optimal_backend`].
///
/// Device index 0 of the selected backend is used. When the selected backend
/// is not `GpuBackend::Cpu`, the device information is validated once more
/// before the processor is returned.
///
/// # Errors
///
/// * `IoError::Other` when platform detection reports no GPU, or when backend
///   detection fails.
/// * `IoError::ValidationError` when the selected non-CPU backend does not
///   pass device validation.
/// * Any error produced while querying the device information.
pub fn new() -> Result<Self> {
    let capabilities = PlatformCapabilities::detect();
    if !capabilities.gpu_available {
        let message = "GPU acceleration not available. Please ensure GPU drivers are installed and properly configured.";
        return Err(IoError::Other(message.to_string()));
    }

    let backend = Self::detect_optimal_backend()
        .map_err(|e| IoError::Other(format!("Failed to detect optimal GPU backend: {}", e)))?;
    let device = GpuDevice::new(backend, 0);

    // The CPU fallback has nothing to validate; only real GPU backends are checked.
    let is_gpu_backend = backend != GpuBackend::Cpu;
    if is_gpu_backend {
        let device_is_valid = Self::validate_device_info(&device)?;
        if !device_is_valid {
            return Err(IoError::ValidationError(format!(
                "GPU backend {backend} did not meet minimum capability requirements",
            )));
        }
    }

    Ok(Self {
        device,
        capabilities,
    })
}
/// Creates a processor on an explicitly requested backend (device index 0).
///
/// Only availability is checked here; the device information is not
/// validated against the minimum capability thresholds.
///
/// # Errors
///
/// Returns `IoError::Other` when `backend` is reported as unavailable.
pub fn with_backend(backend: GpuBackend) -> Result<Self> {
    if Self::is_backend_available(backend) {
        Ok(Self {
            device: GpuDevice::new(backend, 0),
            capabilities: PlatformCapabilities::detect(),
        })
    } else {
        Err(IoError::Other(format!(
            "GPU backend {} is not available",
            backend
        )))
    }
}
/// Picks the first backend, in the order CUDA, Metal, OpenCL, that is both
/// available and passes [`Self::validate_backend`].
///
/// A backend whose validation returns `Ok(false)` or an error is skipped.
/// When no candidate qualifies, `GpuBackend::Cpu` is returned, so the
/// function as written never yields `Err`.
pub fn detect_optimal_backend() -> Result<GpuBackend> {
    let preference_order = [GpuBackend::Cuda, GpuBackend::Metal, GpuBackend::OpenCL];

    let selected = preference_order
        .iter()
        .copied()
        .find(|&candidate| {
            // Validation errors are deliberately treated like a failed validation.
            Self::is_backend_available(candidate)
                && matches!(Self::validate_backend(candidate), Ok(true))
        })
        .unwrap_or(GpuBackend::Cpu);

    Ok(selected)
}
/// Checks whether device 0 of `backend` meets the minimum requirements.
///
/// Returns `Ok(false)` without touching the device when the backend is not
/// available; otherwise the result of validating the device information.
///
/// # Errors
///
/// Propagates the error raised when the device information cannot be queried.
pub fn validate_backend(backend: GpuBackend) -> Result<bool> {
    if backend.is_available() {
        let probe = GpuDevice::new(backend, 0);
        Self::validate_device_info(&probe)
    } else {
        Ok(false)
    }
}
fn validate_device_info(device: &GpuDevice) -> Result<bool> {
let info = device
.get_info()
.map_err(|e| IoError::Other(format!("Failed to query GPU device info: {e}")))?;
match device.backend() {
GpuBackend::Cuda => Self::validate_cuda_backend(&info),
GpuBackend::Metal => Self::validate_metal_backend(&info),
GpuBackend::OpenCL => Self::validate_opencl_backend(&info),
_ => Ok(false),
}
}
/// CUDA acceptance check: the info must describe a CUDA device, report
/// fp64 support, and satisfy the CUDA memory and work-group minimums.
///
/// Always returns `Ok`; the `Result` wrapper matches the other validators.
fn validate_cuda_backend(info: &GpuDeviceInfo) -> Result<bool> {
    let accepted = info.backend == GpuBackend::Cuda
        && info.supports_fp64
        && Self::meets_memory_requirement(info, CUDA_MIN_TOTAL_MEMORY_BYTES)
        && Self::meets_work_group_requirement(info, CUDA_MIN_WORK_GROUP_SIZE);
    Ok(accepted)
}
/// Metal acceptance check: the info must describe a Metal device, report
/// fp16 support, and satisfy the Metal memory and work-group minimums.
///
/// Always returns `Ok`; the `Result` wrapper matches the other validators.
fn validate_metal_backend(info: &GpuDeviceInfo) -> Result<bool> {
    let accepted = info.backend == GpuBackend::Metal
        && info.supports_fp16
        && Self::meets_memory_requirement(info, METAL_MIN_TOTAL_MEMORY_BYTES)
        && Self::meets_work_group_requirement(info, METAL_MIN_WORK_GROUP_SIZE);
    Ok(accepted)
}
/// OpenCL acceptance check: the info must describe an OpenCL device and
/// satisfy the OpenCL memory and work-group minimums. No floating-point
/// precision feature is required for this backend.
///
/// Always returns `Ok`; the `Result` wrapper matches the other validators.
fn validate_opencl_backend(info: &GpuDeviceInfo) -> Result<bool> {
    let accepted = info.backend == GpuBackend::OpenCL
        && Self::meets_memory_requirement(info, OPENCL_MIN_TOTAL_MEMORY_BYTES)
        && Self::meets_work_group_requirement(info, OPENCL_MIN_WORK_GROUP_SIZE);
    Ok(accepted)
}
/// Returns `true` when the reported total memory is at least `minimum`.
///
/// A reported value of zero always passes; it appears to be the
/// "not reported" sentinel, so the check is skipped rather than failed.
fn meets_memory_requirement(info: &GpuDeviceInfo, minimum: u64) -> bool {
    match info.total_memory {
        0 => true,
        reported => reported >= minimum,
    }
}
/// Returns `true` when the reported maximum work-group size is at least `minimum`.
///
/// A reported value of zero always passes; it appears to be the
/// "not reported" sentinel, so the check is skipped rather than failed.
fn meets_work_group_requirement(info: &GpuDeviceInfo, minimum: u32) -> bool {
    match info.max_work_group_size {
        0 => true,
        reported => reported >= minimum,
    }
}
/// Queries the underlying device and summarises it as [`BackendCapabilities`].
///
/// # Errors
///
/// Returns `IoError::Other` when the device information cannot be queried.
pub fn get_backend_capabilities(&self) -> Result<BackendCapabilities> {
    self.device
        .get_info()
        .map(|info| Self::capabilities_from_info(&info))
        .map_err(|e| IoError::Other(format!("Failed to query GPU device info: {e}")))
}
/// Converts raw device information into [`BackendCapabilities`].
///
/// Zero-valued fields in `info` are replaced by per-backend fallbacks:
/// * total memory of zero → a default size in GiB for the backend;
/// * maximum work-group size of zero → a default size for the backend;
/// * no known memory at all → a 1 GiB maximum allocation size.
///
/// The maximum allocation size prefers the available memory and falls back
/// to the total memory, saturating at `usize::MAX`. `compute_units` is an
/// estimate derived from the work-group size (one unit per 32 work items,
/// at least one) and `local_memory_size` is a fixed 48 KiB.
fn capabilities_from_info(info: &GpuDeviceInfo) -> BackendCapabilities {
    // (memory in GiB, work-group size) used when the device reports zero.
    let (fallback_memory_gb, fallback_work_group): (f64, usize) = match info.backend {
        GpuBackend::Cuda | GpuBackend::Rocm => (4.0, 1024),
        GpuBackend::Metal => (8.0, 1024),
        GpuBackend::OpenCL | GpuBackend::Wgpu => (2.0, 256),
        GpuBackend::Cpu => (1.0, 1),
    };

    let memory_gb = match info.total_memory {
        0 => fallback_memory_gb,
        total_bytes => total_bytes as f64 / BYTES_PER_GIB,
    };

    let max_work_group_size = match info.max_work_group_size {
        0 => fallback_work_group,
        reported => reported as usize,
    };

    // Prefer the currently available memory; fall back to the total.
    let usable_bytes = match info.available_memory {
        0 => info.total_memory,
        available => available,
    };
    let max_allocation_size = match usable_bytes {
        0 => 1usize << 30,
        bytes => usize::try_from(bytes).unwrap_or(usize::MAX),
    };

    let compute_units = (max_work_group_size / 32).max(1);

    BackendCapabilities {
        backend: info.backend,
        memory_gb,
        max_work_group_size,
        supports_fp64: info.supports_fp64,
        supports_fp16: info.supports_fp16,
        compute_units,
        max_allocation_size,
        local_memory_size: 48 * 1024,
    }
}
/// Returns the backend of the underlying device, as reported by the device handle.
pub fn backend(&self) -> GpuBackend {
self.device.backend()
}
/// Reports whether `backend` is available; a thin wrapper over `GpuBackend::is_available`.
pub fn is_backend_available(backend: GpuBackend) -> bool {
backend.is_available()
}
/// Lists the usable backends in preference order.
///
/// CUDA, Metal and OpenCL are included when available (in that order) and
/// `GpuBackend::Cpu` is always the final entry, so the list is never empty.
pub fn list_available_backends() -> Vec<GpuBackend> {
    let mut found = Vec::new();
    for candidate in [GpuBackend::Cuda, GpuBackend::Metal, GpuBackend::OpenCL] {
        if Self::is_backend_available(candidate) {
            found.push(candidate);
        }
    }
    // None of the GPU candidates is the CPU backend, so it is always appended.
    found.push(GpuBackend::Cpu);
    found
}
/// Suggests a backend for the given workload from the available backends.
///
/// Preferences per workload (first available match wins):
/// * `MachineLearning`, `Compression`: CUDA;
/// * `ImageProcessing`: Metal, then CUDA;
/// * `GeneralCompute`: no preference.
///
/// When no preferred backend is available, the first entry of
/// [`Self::list_available_backends`] is returned; an empty list yields
/// `GpuBackend::Cpu`. The function as written never returns `Err`.
pub fn get_optimal_backend_for_workload(workload: GpuWorkloadType) -> Result<GpuBackend> {
    let available_backends = Self::list_available_backends();
    let first_available = match available_backends.first() {
        Some(&backend) => backend,
        None => return Ok(GpuBackend::Cpu),
    };

    let preferred: &[GpuBackend] = match workload {
        GpuWorkloadType::MachineLearning | GpuWorkloadType::Compression => &[GpuBackend::Cuda],
        GpuWorkloadType::ImageProcessing => &[GpuBackend::Metal, GpuBackend::Cuda],
        GpuWorkloadType::GeneralCompute => &[],
    };

    let selected = preferred
        .iter()
        .copied()
        .find(|wanted| available_backends.contains(wanted))
        .unwrap_or(first_available);
    Ok(selected)
}
}
impl Default for GpuIoProcessor {
    /// Builds a processor with [`GpuIoProcessor::new`]; if that fails for any
    /// reason, falls back to a processor on device 0 of the CPU backend with
    /// freshly detected platform capabilities.
    fn default() -> Self {
        match Self::new() {
            Ok(processor) => processor,
            Err(_) => Self {
                device: GpuDevice::new(GpuBackend::Cpu, 0),
                capabilities: PlatformCapabilities::detect(),
            },
        }
    }
}
/// Summary of what a backend/device can do, as used for tuning decisions.
#[derive(Debug, Clone)]
pub struct BackendCapabilities {
/// Backend these capabilities describe.
pub backend: GpuBackend,
/// Device memory size in GiB.
pub memory_gb: f64,
/// Largest work-group size to use on this device.
pub max_work_group_size: usize,
/// Whether double-precision (fp64) arithmetic is supported.
pub supports_fp64: bool,
/// Whether half-precision (fp16) arithmetic is supported.
pub supports_fp16: bool,
/// Number of compute units (may be an estimate rather than a queried value).
pub compute_units: usize,
/// Largest single allocation size, in bytes.
pub max_allocation_size: usize,
/// Local memory size, in bytes.
pub local_memory_size: usize,
}
impl BackendCapabilities {
    /// Returns `true` when the backend supports double-precision (fp64) arithmetic.
    pub fn supports_high_precision(&self) -> bool {
        self.supports_fp64
    }

    /// Returns `true` when the backend supports half-precision (fp16) arithmetic.
    pub fn supports_half_precision(&self) -> bool {
        self.supports_fp16
    }

    /// Suggests a work-group size for a problem of `problem_size` elements.
    ///
    /// Starts from a per-backend base size (CUDA 256, OpenCL 128, Metal and
    /// everything else 64) and caps it by both `max_work_group_size` and
    /// `problem_size`.
    ///
    /// The result is never zero: an empty problem or a zero
    /// `max_work_group_size` yields 1, because a zero-sized work group is not
    /// a valid launch configuration and would make callers that divide by the
    /// returned size fail.
    pub fn get_optimal_work_group_size(&self, problem_size: usize) -> usize {
        let base_size: usize = match self.backend {
            GpuBackend::Cuda => 256,
            GpuBackend::Metal => 64,
            GpuBackend::OpenCL => 128,
            _ => 64,
        };
        base_size
            .min(self.max_work_group_size)
            .min(problem_size)
            // Clamp so that degenerate inputs never produce a zero-sized group.
            .max(1)
    }

    /// Rough bandwidth figure derived from the memory size.
    ///
    /// This is a heuristic, not a measurement: it scales `memory_gb` by a
    /// fixed per-backend factor (CUDA 0.8, Metal 0.7, OpenCL 0.6, others 0.4).
    pub fn estimate_memory_bandwidth(&self) -> f64 {
        let factor = match self.backend {
            GpuBackend::Cuda => 0.8,
            GpuBackend::Metal => 0.7,
            GpuBackend::OpenCL => 0.6,
            _ => 0.4,
        };
        self.memory_gb * factor
    }
}
/// Broad workload categories used to choose a preferred backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GpuWorkloadType {
/// Machine-learning workloads.
MachineLearning,
/// Image-processing workloads.
ImageProcessing,
/// General-purpose compute with no specific backend preference.
GeneralCompute,
/// Compression workloads.
Compression,
}
/// Heuristic performance characteristics of a backend, used for scoring.
#[derive(Debug, Clone)]
pub struct BackendPerformanceProfile {
/// Backend this profile describes.
pub backend: GpuBackend,
/// Throughput figure (named as GB/s); higher is better.
pub throughput_gbps: f64,
/// Latency figure (named as milliseconds); lower is better.
pub latency_ms: f64,
/// Relative power-efficiency factor; higher is better.
pub power_efficiency: f64,
/// Relative memory-efficiency factor; higher is better.
pub memory_efficiency: f64,
}
impl BackendPerformanceProfile {
    /// Builds a heuristic profile for `backend`.
    ///
    /// All figures come from a fixed per-backend table; only the throughput
    /// depends on the input, being `capabilities.memory_gb` scaled by the
    /// backend's factor. Backends other than CUDA, Metal and OpenCL share the
    /// most conservative row.
    pub fn new(backend: GpuBackend, capabilities: &BackendCapabilities) -> Self {
        // (throughput factor, latency, power efficiency, memory efficiency)
        let (throughput_factor, latency_ms, power_efficiency, memory_efficiency) = match backend {
            GpuBackend::Cuda => (0.8, 0.1, 0.7, 0.9),
            GpuBackend::Metal => (0.7, 0.15, 0.9, 0.8),
            GpuBackend::OpenCL => (0.6, 0.2, 0.6, 0.7),
            _ => (0.4, 0.5, 0.8, 0.5),
        };

        Self {
            backend,
            throughput_gbps: capabilities.memory_gb * throughput_factor,
            latency_ms,
            power_efficiency,
            memory_efficiency,
        }
    }

    /// Combines the profile into a single score; higher is better.
    ///
    /// Weighted sum: 40% throughput, 30% inverse latency, 20% power
    /// efficiency and 10% memory efficiency.
    pub fn performance_score(&self) -> f64 {
        let throughput_term = self.throughput_gbps * 0.4;
        let latency_term = (1.0 / self.latency_ms) * 0.3;
        let power_term = self.power_efficiency * 0.2;
        let memory_term = self.memory_efficiency * 0.1;
        throughput_term + latency_term + power_term + memory_term
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// GPU backends that have a dedicated validator.
    const GPU_BACKENDS: [GpuBackend; 3] =
        [GpuBackend::Cuda, GpuBackend::Metal, GpuBackend::OpenCL];

    /// Hand-built CUDA capabilities shared by the pure-computation tests.
    fn sample_cuda_capabilities() -> BackendCapabilities {
        BackendCapabilities {
            backend: GpuBackend::Cuda,
            memory_gb: 8.0,
            max_work_group_size: 1024,
            supports_fp64: true,
            supports_fp16: true,
            compute_units: 32,
            max_allocation_size: 1024 * 1024 * 1024,
            local_memory_size: 48 * 1024,
        }
    }

    // The backend list must contain at least one entry.
    #[test]
    fn test_backend_availability_detection() {
        let listed = GpuIoProcessor::list_available_backends();
        assert!(!listed.is_empty());
    }

    // Only meaningful when a processor can be built on this machine.
    #[test]
    fn test_backend_capabilities() {
        let processor = match GpuIoProcessor::new() {
            Ok(processor) => processor,
            Err(_) => return,
        };
        let capabilities = processor
            .get_backend_capabilities()
            .expect("Operation failed");
        assert!(capabilities.memory_gb > 0.0);
        assert!(capabilities.compute_units > 0);
    }

    // Workload-based selection must always yield some backend.
    #[test]
    fn test_optimal_backend_for_workload() {
        let choice =
            GpuIoProcessor::get_optimal_backend_for_workload(GpuWorkloadType::MachineLearning);
        assert!(choice.is_ok());
    }

    // Large problems use the CUDA base size; small problems cap the size.
    #[test]
    fn test_work_group_size_calculation() {
        let capabilities = sample_cuda_capabilities();
        assert_eq!(capabilities.get_optimal_work_group_size(10000), 256);
        assert_eq!(capabilities.get_optimal_work_group_size(100), 100);
    }

    // A CUDA profile built from sane capabilities scores positively.
    #[test]
    fn test_performance_profile_scoring() {
        let capabilities = sample_cuda_capabilities();
        let profile = BackendPerformanceProfile::new(GpuBackend::Cuda, &capabilities);
        assert!(profile.performance_score() > 0.0);
    }

    // Each GPU backend validates against its device info; the CPU does not.
    #[test]
    fn test_validate_device_info_per_backend() {
        for backend in GPU_BACKENDS {
            let device = GpuDevice::new(backend, 0);
            let valid = GpuIoProcessor::validate_device_info(&device)
                .expect("validate_device_info should not fail in the default build");
            assert!(
                valid,
                "expected backend {backend} to validate against its placeholder GpuDeviceInfo",
            );
        }

        let cpu_device = GpuDevice::new(GpuBackend::Cpu, 0);
        let cpu_valid = GpuIoProcessor::validate_device_info(&cpu_device)
            .expect("validate_device_info should not fail for CPU");
        assert!(!cpu_valid);
    }

    // Validation may answer true or false, but it must not error.
    #[test]
    fn test_validate_backend_returns_ok() {
        for backend in GPU_BACKENDS {
            let result = GpuIoProcessor::validate_backend(backend);
            assert!(
                result.is_ok(),
                "validate_backend({backend}) should return Ok, got {result:?}",
            );
        }
    }

    // A CPU-backed processor reports sensible, non-degenerate capabilities.
    #[test]
    fn test_get_backend_capabilities_is_well_formed() {
        let processor = GpuIoProcessor::with_backend(GpuBackend::Cpu)
            .expect("CPU backend is always constructible");
        let caps = processor
            .get_backend_capabilities()
            .expect("get_backend_capabilities should query device info and succeed");

        assert_eq!(caps.backend, GpuBackend::Cpu);
        assert!(
            caps.memory_gb > 0.0,
            "memory_gb fell back to a positive default"
        );
        assert!(caps.max_work_group_size >= 1);
        assert!(caps.compute_units >= 1);
        assert!(caps.max_allocation_size >= 1 << 20);
        assert!(caps.local_memory_size > 0);
    }

    // Unreported values skip the threshold checks; reported values are compared.
    #[test]
    fn test_unknown_capability_skips_check() {
        let memory_ok = |info: &GpuDeviceInfo| {
            GpuIoProcessor::meets_memory_requirement(info, CUDA_MIN_TOTAL_MEMORY_BYTES)
        };

        let mut info = GpuDeviceInfo::for_backend(GpuBackend::Cuda);
        assert!(memory_ok(&info));
        assert!(GpuIoProcessor::meets_work_group_requirement(
            &info,
            CUDA_MIN_WORK_GROUP_SIZE
        ));

        // One byte below the threshold fails.
        info.total_memory = CUDA_MIN_TOTAL_MEMORY_BYTES - 1;
        assert!(!memory_ok(&info));

        // Exactly at the threshold passes.
        info.total_memory = CUDA_MIN_TOTAL_MEMORY_BYTES;
        assert!(memory_ok(&info));
    }

    // CUDA devices without fp64 support are rejected.
    #[test]
    fn test_cuda_rejects_without_fp64() {
        let mut info = GpuDeviceInfo::for_backend(GpuBackend::Cuda);
        info.supports_fp64 = false;
        let accepted = GpuIoProcessor::validate_cuda_backend(&info)
            .expect("validate_cuda_backend should not fail");
        assert!(!accepted);
    }

    // Reported memory and work-group size take precedence over the defaults.
    #[test]
    fn test_capabilities_uses_known_memory() {
        let mut info = GpuDeviceInfo::for_backend(GpuBackend::OpenCL);
        info.total_memory = 8 * 1024 * 1024 * 1024;
        info.max_work_group_size = 512;

        let caps = GpuIoProcessor::capabilities_from_info(&info);
        assert!((caps.memory_gb - 8.0).abs() < 1e-9);
        assert_eq!(caps.max_work_group_size, 512);
    }
}