pub mod acceleration;
pub mod core;
pub mod distance;
pub mod kernels;
pub mod memory;
pub use core::{BackendContext, DeviceSelection, GpuBackend, GpuConfig, GpuContext, GpuDevice};
pub use distance::{DistanceMetric, GpuArray, GpuDistanceMatrix};
pub use memory::{
BandwidthMonitor, GpuMemoryBlock, GpuMemoryManager, MemoryStats, MemoryStrategy, MemoryTransfer,
};
pub use acceleration::{
detect_tensor_core_capabilities, AdvancedDeviceSelection, AdvancedGpuMemoryManager,
AdvancedMemoryStrategy, AllocationRecord, DeviceBenchmark, DeviceSelector,
GpuAccelerationConfig, GpuKMeans, GpuKMeansResult, KMeansMetrics, KernelOptimizations,
MemoryUsageStats, PrecisionMode, ProfilingRecord, TensorCoreCapabilities, TensorCoreConfig,
};
pub use kernels::{
calculate_kernel_config, generate_cuda_batch_distance_kernel,
generate_cuda_distance_matrix_kernel, generate_cuda_kmeans_assign_kernel,
generate_metal_distance_kernel, generate_opencl_distance_matrix_kernel,
generate_rocm_distance_kernel, get_kernel_source, get_kmeans_kernel_source, DistanceKernelType,
KernelConfig, KernelDataType,
};
/// Build a `GpuConfig` for the first available backend, trying hardware
/// backends in a fixed preference order (CUDA, OpenCL, ROCm, Metal, oneAPI).
///
/// Falls back to `GpuConfig::default()` when no hardware backend is
/// available on this system.
pub fn auto_config() -> GpuConfig {
    let preference_order = [
        GpuBackend::Cuda,
        GpuBackend::OpenCl,
        GpuBackend::Rocm,
        GpuBackend::Metal,
        GpuBackend::OneApi,
    ];
    preference_order
        .iter()
        .copied()
        .find(|&candidate| is_backend_available(candidate))
        .map(GpuConfig::new)
        .unwrap_or_default()
}
/// Report whether the given backend can be used on this system.
///
/// In this build only the CPU fallback is ever reported available;
/// every hardware backend returns `false`.
pub fn is_backend_available(backend: GpuBackend) -> bool {
    matches!(backend, GpuBackend::CpuFallback)
}
/// Enumerate GPU devices visible to this process.
///
/// Currently returns a single synthetic "Integrated GPU" entry backed by
/// the CPU fallback (4 GB total / 3.5 GB available, 512 compute units,
/// no double-precision support).
pub fn list_devices() -> Vec<GpuDevice> {
    let fallback_device = GpuDevice::new(
        0,
        String::from("Integrated GPU"),
        4_000_000_000,
        3_500_000_000,
        String::from("1.0"),
        512,
        GpuBackend::CpuFallback,
        false,
    );
    vec![fallback_device]
}
/// Pick the highest-scoring non-fallback device, or `None` when only the
/// CPU fallback is present.
///
/// Ties (and incomparable scores, e.g. NaN) keep the later device, matching
/// `Iterator::max_by` semantics.
pub fn get_best_device() -> Option<GpuDevice> {
    let mut best: Option<GpuDevice> = None;
    for candidate in list_devices() {
        if candidate.backend == GpuBackend::CpuFallback {
            continue;
        }
        let replaces_best = match best.as_ref() {
            None => true,
            Some(current) => {
                // `max_by` returns the last of equal maxima, so replace on
                // Greater OR Equal (NaN comparisons collapse to Equal).
                candidate
                    .get_device_score()
                    .partial_cmp(&current.get_device_score())
                    .unwrap_or(std::cmp::Ordering::Equal)
                    != std::cmp::Ordering::Less
            }
        };
        if replaces_best {
            best = Some(candidate);
        }
    }
    best
}
/// Benchmark distance-matrix computation on the forced CPU fallback versus
/// the auto-selected backend, over deterministic synthetic data of shape
/// `data_size` x `n_features`.
///
/// # Errors
/// Propagates any `ClusteringError` raised while constructing or running
/// either distance-matrix computation.
pub fn benchmark_gpu_vs_cpu(
    data_size: usize,
    n_features: usize,
    metric: DistanceMetric,
) -> Result<BenchmarkResult, crate::error::ClusteringError> {
    use scirs2_core::ndarray::Array2;
    use std::time::Instant;

    // Deterministic synthetic data so repeated runs are comparable.
    let data = Array2::from_shape_fn((data_size, n_features), |(i, j)| {
        (i * n_features + j) as f64 / 1000.0
    });

    // CPU timing: force the CPU fallback backend.
    let cpu_start = Instant::now();
    let cpu_config = GpuConfig::new(GpuBackend::CpuFallback);
    let cpu_matrix = GpuDistanceMatrix::new(cpu_config, metric, None)?;
    let _cpu_result = cpu_matrix.compute_distance_matrix_cpu(data.view())?;
    let cpu_duration = cpu_start.elapsed();

    // GPU timing: best available backend (may itself be the CPU fallback).
    let gpu_start = Instant::now();
    let gpu_config = auto_config();
    let mut gpu_matrix = GpuDistanceMatrix::new(gpu_config, metric, None)?;
    let _gpu_result = gpu_matrix.compute_distance_matrix(data.view())?;
    let gpu_duration = gpu_start.elapsed();

    // Guard against sub-resolution GPU timings: dividing by a zero duration
    // yields inf (or NaN when both durations are zero), and NaN silently
    // reads as "No Benefit" in `efficiency_rating`. Report a neutral 1.0
    // speedup when the GPU time is unmeasurably small and the division
    // would not be meaningful.
    let gpu_secs = gpu_duration.as_secs_f64();
    let speedup = if gpu_secs > 0.0 {
        cpu_duration.as_secs_f64() / gpu_secs
    } else {
        1.0
    };

    Ok(BenchmarkResult {
        cpu_duration_ms: cpu_duration.as_millis() as f64,
        gpu_duration_ms: gpu_duration.as_millis() as f64,
        speedup,
        data_size,
        n_features,
        metric,
    })
}
/// Outcome of a GPU-vs-CPU distance-matrix benchmark run
/// (produced by `benchmark_gpu_vs_cpu`).
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Wall-clock time of the CPU computation, in milliseconds.
    pub cpu_duration_ms: f64,
    /// Wall-clock time of the GPU computation, in milliseconds.
    pub gpu_duration_ms: f64,
    /// CPU time divided by GPU time; values above 1.0 mean the GPU was faster.
    pub speedup: f64,
    /// Number of samples (rows) in the benchmarked data.
    pub data_size: usize,
    /// Number of features (columns) in the benchmarked data.
    pub n_features: usize,
    /// Distance metric used for both runs.
    pub metric: DistanceMetric,
}
impl BenchmarkResult {
    /// Render a human-readable multi-line report of the benchmark outcome.
    pub fn summary(&self) -> String {
        let mut report = String::from("GPU vs CPU Benchmark Results:\n");
        report.push_str(&format!(
            "Data size: {} samples x {} features\n",
            self.data_size, self.n_features
        ));
        report.push_str(&format!("Distance metric: {}\n", self.metric));
        report.push_str(&format!("CPU time: {:.2} ms\n", self.cpu_duration_ms));
        report.push_str(&format!("GPU time: {:.2} ms\n", self.gpu_duration_ms));
        report.push_str(&format!("Speedup: {:.2}x", self.speedup));
        report
    }

    /// True when the GPU run beat the CPU run (speedup strictly above 1.0).
    pub fn gpu_is_faster(&self) -> bool {
        self.speedup > 1.0
    }

    /// Coarse qualitative label for the measured speedup.
    pub fn efficiency_rating(&self) -> &'static str {
        let s = self.speedup;
        if s >= 10.0 {
            "Excellent"
        } else if s >= 5.0 {
            "Very Good"
        } else if s >= 2.0 {
            "Good"
        } else if s >= 1.1 {
            "Marginal"
        } else {
            "No Benefit"
        }
    }
}
/// Snapshot of GPU capabilities detected on the current system
/// (built by `GpuCapabilities::detect`).
pub struct GpuCapabilities {
    /// Backends that reported themselves as available.
    pub available_backends: Vec<GpuBackend>,
    /// Best device found for each available backend.
    pub best_devices: std::collections::HashMap<GpuBackend, GpuDevice>,
    /// Sum of `total_memory` across the best devices, in bytes.
    pub total_gpu_memory: usize,
    /// True when any available backend is treated as unified-memory capable
    /// (currently only CUDA).
    pub supports_unified_memory: bool,
    /// True when any best device reports double-precision support.
    pub supports_double_precision: bool,
}
impl GpuCapabilities {
    /// Probe the candidate backends and aggregate per-backend best devices,
    /// total memory, and precision/unified-memory support flags.
    pub fn detect() -> Self {
        let candidates = [
            GpuBackend::Cuda,
            GpuBackend::OpenCl,
            GpuBackend::Rocm,
            GpuBackend::Metal,
            GpuBackend::OneApi,
        ];
        let available_backends: Vec<GpuBackend> = candidates
            .iter()
            .copied()
            .filter(|&candidate| is_backend_available(candidate))
            .collect();

        let mut best_devices = std::collections::HashMap::new();
        let mut total_gpu_memory = 0;
        let mut supports_unified_memory = false;
        let mut supports_double_precision = false;
        for &backend in &available_backends {
            if let Some(device) = Self::get_best_device_for_backend(backend) {
                total_gpu_memory += device.total_memory;
                // Only CUDA is treated as unified-memory capable here.
                if backend == GpuBackend::Cuda {
                    supports_unified_memory = true;
                }
                if device.supports_double_precision {
                    supports_double_precision = true;
                }
                best_devices.insert(backend, device);
            }
        }

        Self {
            available_backends,
            best_devices,
            total_gpu_memory,
            supports_unified_memory,
            supports_double_precision,
        }
    }

    /// Render a human-readable multi-line report of the detected capabilities.
    pub fn summary(&self) -> String {
        const BYTES_PER_GIB: f64 = 1024.0 * 1024.0 * 1024.0;
        let mut report = String::from("GPU Capabilities Summary:\n");
        report.push_str(&format!(
            "Available backends: {:?}\n",
            self.available_backends
        ));
        report.push_str(&format!(
            "Total GPU memory: {:.2} GB\n",
            self.total_gpu_memory as f64 / BYTES_PER_GIB
        ));
        report.push_str(&format!(
            "Unified memory support: {}\n",
            self.supports_unified_memory
        ));
        report.push_str(&format!(
            "Double precision support: {}\n",
            self.supports_double_precision
        ));
        for (backend, device) in &self.best_devices {
            report.push_str(&format!(
                "Best {} device: {} ({:.2} GB)\n",
                backend,
                device.name,
                device.total_memory as f64 / BYTES_PER_GIB
            ));
        }
        report
    }

    /// Synthesize the best device for a hardware backend; the CPU fallback
    /// has no dedicated device and yields `None`.
    fn get_best_device_for_backend(backend: GpuBackend) -> Option<GpuDevice> {
        if matches!(backend, GpuBackend::CpuFallback) {
            return None;
        }
        Some(GpuDevice::new(
            0,
            format!("{} Device", backend),
            8_000_000_000,
            7_000_000_000,
            "1.0".to_string(),
            1024,
            backend,
            true,
        ))
    }
}
/// Heuristic: recommend GPU acceleration only when the workload is large
/// enough (> 10 000 total elements AND > 100 samples) to amortize transfer
/// overhead.
///
/// Uses `saturating_mul` so pathologically large inputs saturate at
/// `usize::MAX` instead of overflowing, which would panic in debug builds.
pub fn is_gpu_recommended(n_samples: usize, n_features: usize) -> bool {
    let problem_size = n_samples.saturating_mul(n_features);
    problem_size > 10_000 && n_samples > 100
}
/// Recommend a square tile side length for tiled GPU computation.
///
/// Budgets 1/16th of the device's available memory for one tile, takes the
/// square root to get a side length, clamps it to [32, 1024], then rounds up
/// to a multiple of the device's compute-unit count (capped back at 1024).
///
/// Degenerate inputs (`element_size == 0` or `compute_units == 0`) are
/// clamped to 1 rather than panicking with a division by zero.
pub fn get_recommended_tile_size(device: &GpuDevice, element_size: usize) -> usize {
    let memory_per_tile = device.available_memory / 16;
    // Guard element_size == 0: would otherwise divide by zero.
    let elements_per_tile = memory_per_tile / element_size.max(1);
    let sqrt_elements = (elements_per_tile as f64).sqrt() as usize;
    let base_tile_size = sqrt_elements.clamp(32, 1024);
    // Guard compute_units == 0 before the round-up division.
    let units = (device.compute_units as usize).max(1);
    let compute_aligned = ((base_tile_size + units - 1) / units) * units;
    compute_aligned.min(1024)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The auto-selected config must keep automatic fallback enabled.
    #[test]
    fn test_auto_config() {
        assert!(auto_config().auto_fallback);
    }

    /// Only the CPU fallback is available in this build.
    #[test]
    fn test_backend_availability() {
        assert!(is_backend_available(GpuBackend::CpuFallback));
        assert!(!is_backend_available(GpuBackend::Cuda));
    }

    /// Device enumeration always yields at least one entry.
    #[test]
    fn test_list_devices() {
        assert!(!list_devices().is_empty());
    }

    /// Small workloads should not trigger a GPU recommendation; large ones should.
    #[test]
    fn test_gpu_recommendation() {
        assert!(!is_gpu_recommended(10, 10));
        assert!(is_gpu_recommended(1000, 100));
    }

    /// Capability detection should always produce a non-empty summary.
    #[test]
    fn test_capabilities_detection() {
        let capabilities = GpuCapabilities::detect();
        assert!(!capabilities.summary().is_empty());
    }

    /// Recommended tile size stays within the documented [32, 1024] bounds.
    #[test]
    fn test_recommended_tile_size() {
        let device = GpuDevice::new(
            0,
            "Test".to_string(),
            8_000_000_000,
            6_000_000_000,
            "1.0".to_string(),
            1024,
            GpuBackend::Cuda,
            true,
        );
        let tile_size = get_recommended_tile_size(&device, 8);
        assert!((32..=1024).contains(&tile_size));
    }

    /// A 5x speedup is faster-than-CPU and rates as "Very Good".
    #[test]
    fn test_benchmark_result() {
        let result = BenchmarkResult {
            cpu_duration_ms: 100.0,
            gpu_duration_ms: 20.0,
            speedup: 5.0,
            data_size: 1000,
            n_features: 10,
            metric: DistanceMetric::Euclidean,
        };
        assert!(result.gpu_is_faster());
        assert_eq!(result.efficiency_rating(), "Very Good");
        assert!(!result.summary().is_empty());
    }
}