use crate::cuda_detect::{detect_cuda_devices, CudaDeviceInfo};
use crate::device::Device;
/// Snapshot of detected GPUs plus a device recommendation derived from them.
#[derive(Debug, Clone)]
pub struct GpuReadinessReport {
    /// True when at least one CUDA device was detected.
    pub gpu_available: bool,
    /// Number of detected CUDA devices.
    pub gpu_count: usize,
    /// Per-GPU capability details, sorted best-first by `capability_score`.
    pub gpus: Vec<GpuCapability>,
    /// Device the assessment recommends (best GPU, or CPU when none found).
    pub recommended_device: Device,
    /// Human-readable explanations for the recommendation.
    pub recommendation_reasons: Vec<String>,
    /// Rough speedup estimate vs. CPU; `None` when no GPU is available.
    pub estimated_speedup: Option<f64>,
}
/// Capability profile for a single CUDA device.
#[derive(Debug, Clone)]
pub struct GpuCapability {
    /// Handle for this GPU (`Device::cuda(index)`).
    pub device: Device,
    /// Device name as reported by detection.
    pub name: String,
    /// Total device memory in MB.
    pub memory_mb: u64,
    /// Estimated peak memory bandwidth in GB/s (name-based heuristic).
    pub memory_bandwidth_gbs: f64,
    /// CUDA compute capability `(major, minor)`, if known.
    pub compute_capability: Option<(u32, u32)>,
    /// Estimated CUDA core count, if the model/architecture is recognized.
    pub cuda_cores: Option<u32>,
    /// Tensor Cores present (derived from compute capability).
    pub has_tensor_cores: bool,
    /// FP16 arithmetic supported (derived from compute capability).
    pub supports_fp16: bool,
    /// INT8 acceleration supported (derived from compute capability).
    pub supports_int8: bool,
    /// Set true for the best-scoring GPU in a readiness report.
    pub recommended: bool,
}
impl GpuCapability {
    /// Builds a capability profile from a detected CUDA device.
    ///
    /// Feature flags are derived from the compute capability:
    /// - Tensor Cores: Volta and newer (major >= 7).
    /// - FP16 arithmetic: Pascal and newer (major >= 6).
    /// - INT8 dot product (DP4A): compute capability 6.1 and newer. Note that
    ///   6.0 (GP100) does NOT have DP4A, so `major >= 6` alone is wrong here.
    ///
    /// When the compute capability is unknown, all feature flags are false.
    /// `recommended` always starts false; ranking sets it later.
    pub fn from_cuda_device(info: &CudaDeviceInfo) -> Self {
        let compute_capability = info.compute_capability;
        let has_tensor_cores = compute_capability
            .map(|(major, _minor)| major >= 7)
            .unwrap_or(false);
        let supports_fp16 = compute_capability
            .map(|(major, _)| major >= 6)
            .unwrap_or(false);
        // DP4A (INT8 dot-product) instructions were introduced in compute
        // capability 6.1, not 6.0 — require (6, 1) or newer.
        let supports_int8 = compute_capability
            .map(|(major, minor)| major > 6 || (major == 6 && minor >= 1))
            .unwrap_or(false);
        let memory_bandwidth_gbs = estimate_memory_bandwidth(&info.name, info.memory_mb);
        let cuda_cores = compute_capability
            .and_then(|(major, minor)| estimate_cuda_cores(&info.name, major, minor));
        Self {
            device: Device::cuda(info.index),
            name: info.name.clone(),
            memory_mb: info.memory_mb,
            memory_bandwidth_gbs,
            compute_capability,
            cuda_cores,
            has_tensor_cores,
            supports_fp16,
            supports_int8,
            recommended: false,
        }
    }

    /// Heuristic ranking score; higher is better.
    ///
    /// Combines bandwidth, memory size, compute capability, and feature flags
    /// with hand-tuned weights. Only meaningful for comparing GPUs against
    /// each other, not as an absolute measure.
    pub fn capability_score(&self) -> f64 {
        let mut score = 0.0;
        // Bandwidth dominates most memory-bound tensor workloads.
        score += self.memory_bandwidth_gbs * 0.5;
        // 2 points per GB of device memory.
        score += (self.memory_mb as f64 / 1024.0) * 2.0;
        if let Some((major, minor)) = self.compute_capability {
            score += (major as f64 * 100.0) + (minor as f64 * 10.0);
        }
        if self.has_tensor_cores {
            score += 200.0;
        }
        if self.supports_fp16 {
            score += 50.0;
        }
        if self.supports_int8 {
            score += 30.0;
        }
        score
    }
}
/// Estimates peak memory bandwidth (GB/s) for a GPU by model name.
///
/// Known models use published spec-sheet values; unknown models fall back to
/// a rough heuristic of 30 GB/s per GB of device memory.
fn estimate_memory_bandwidth(name: &str, memory_mb: u64) -> f64 {
    // (substring pattern, bandwidth in GB/s); first match wins.
    const KNOWN_MODELS: [(&str, f64); 7] = [
        ("a100", 1555.0),
        ("a6000", 768.0),
        ("rtx 3090", 936.0),
        ("rtx 3080", 760.0),
        ("rtx 3070", 448.0),
        ("v100", 900.0),
        ("h100", 3000.0),
    ];
    let needle = name.to_lowercase();
    KNOWN_MODELS
        .iter()
        .find(|(pattern, _)| needle.contains(pattern))
        .map(|&(_, gbs)| gbs)
        .unwrap_or_else(|| (memory_mb as f64 / 1024.0) * 30.0)
}
/// Estimates the CUDA core count for a GPU.
///
/// Known model names take precedence; otherwise a typical core count for the
/// compute capability is used, and `None` is returned for unrecognized
/// architectures.
fn estimate_cuda_cores(name: &str, major: u32, minor: u32) -> Option<u32> {
    // (substring pattern, core count); first match wins.
    const BY_NAME: [(&str, u32); 7] = [
        ("a100", 6912),
        ("a6000", 10752),
        ("rtx 3090", 10496),
        ("rtx 3080", 8704),
        ("rtx 3070", 5888),
        ("v100", 5120),
        ("h100", 14592),
    ];
    let needle = name.to_lowercase();
    if let Some(&(_, cores)) = BY_NAME.iter().find(|(pattern, _)| needle.contains(pattern)) {
        return Some(cores);
    }
    // Fallback: representative core counts per compute capability.
    match (major, minor) {
        (8, 6) => Some(8192),
        (8, 0) => Some(6912),
        (7, 5) => Some(4608),
        (7, 0) => Some(5120),
        _ => None,
    }
}
pub fn assess_gpu_readiness() -> GpuReadinessReport {
let cuda_devices = detect_cuda_devices();
let gpu_count = cuda_devices.len();
let gpu_available = gpu_count > 0;
let mut gpus: Vec<GpuCapability> = cuda_devices
.iter()
.map(GpuCapability::from_cuda_device)
.collect();
gpus.sort_by(|a, b| {
b.capability_score()
.partial_cmp(&a.capability_score())
.unwrap_or(std::cmp::Ordering::Equal)
});
if let Some(best_gpu) = gpus.first_mut() {
best_gpu.recommended = true;
}
let mut recommendation_reasons = Vec::new();
let recommended_device = if gpu_available {
let best = &gpus[0];
recommendation_reasons.push(format!(
"GPU {} has highest capability score: {:.1}",
best.name,
best.capability_score()
));
if best.has_tensor_cores {
recommendation_reasons
.push("GPU has Tensor Cores for accelerated matrix operations".to_string());
}
recommendation_reasons.push(format!(
"GPU memory: {} GB ({:.0} GB/s bandwidth)",
best.memory_mb / 1024,
best.memory_bandwidth_gbs
));
best.device.clone()
} else {
recommendation_reasons.push("No GPU detected, using CPU".to_string());
recommendation_reasons.push("CPU is currently the only supported backend".to_string());
Device::cpu()
};
let estimated_speedup = if gpu_available {
Some(estimate_gpu_speedup(&gpus[0]))
} else {
None
};
GpuReadinessReport {
gpu_available,
gpu_count,
gpus,
recommended_device,
recommendation_reasons,
estimated_speedup,
}
}
/// Rough estimate of GPU speedup over a CPU baseline, clamped to [1.0, 50.0].
///
/// Scales with memory bandwidth (relative to an assumed ~30 GB/s CPU memory
/// subsystem), compute-capability generation, and Tensor Core availability.
fn estimate_gpu_speedup(gpu: &GpuCapability) -> f64 {
    // Bandwidth ratio vs. the ~30 GB/s CPU baseline.
    let mut speedup = gpu.memory_bandwidth_gbs / 30.0;
    // Newer architectures get a +20% multiplier per major generation.
    if let Some((major, _)) = gpu.compute_capability {
        speedup *= 1.0 + f64::from(major) * 0.2;
    }
    if gpu.has_tensor_cores {
        speedup *= 1.5;
    }
    speedup.clamp(1.0, 50.0)
}
/// Characterization of a workload, used to tune GPU settings such as batch size.
#[derive(Debug, Clone)]
pub struct WorkloadProfile {
    /// Total number of operations in the workload.
    pub operation_count: usize,
    /// Average tensor size — assumed to be element count, not bytes; TODO confirm.
    pub avg_tensor_size: usize,
    /// Peak memory footprint per sample in MB (drives batch-size sizing).
    pub peak_memory_mb: u64,
    /// Compute intensity metric (presumably compute-to-memory-traffic ratio;
    /// values < 10.0 are treated as memory-bound — verify against callers).
    pub compute_intensity: f64,
}
/// Recommends a power-of-two batch size that fits in GPU memory.
///
/// Reserves 20% of device memory as headroom, divides the remainder by the
/// workload's per-sample footprint, caps the result (256 with Tensor Cores,
/// 128 without), and rounds DOWN to a power of two. Always returns at least 1.
pub fn recommend_batch_size(gpu: &GpuCapability, workload: &WorkloadProfile) -> usize {
    // Keep ~20% of device memory free for activations/fragmentation headroom.
    let available_memory_mb = (gpu.memory_mb as f64 * 0.8) as u64;
    let memory_per_sample_mb = workload.peak_memory_mb;
    if memory_per_sample_mb == 0 {
        // Unknown per-sample footprint: be conservative.
        return 1;
    }
    let max_batch = (available_memory_mb / memory_per_sample_mb).max(1) as usize;
    // Cap the batch: Tensor Core GPUs handle larger batches efficiently.
    let capped = if gpu.has_tensor_cores {
        max_batch.min(256)
    } else {
        max_batch.min(128)
    };
    // Largest power of two <= capped. The previous `next_power_of_two() / 2`
    // returned 0 when capped == 1 and needlessly halved exact powers of two.
    if capped.is_power_of_two() {
        capped
    } else {
        capped.next_power_of_two() / 2
    }
}
/// Produces human-readable tuning recommendations for the detected hardware.
///
/// With no GPU available, suggests CPU-side features (SIMD, parallelism).
/// With a GPU, recommends precision modes, memory strategies, and — when a
/// workload profile is supplied — a batch size.
pub fn generate_recommendations(
    report: &GpuReadinessReport,
    workload: Option<&WorkloadProfile>,
) -> Vec<String> {
    let mut recommendations = Vec::new();
    if !report.gpu_available {
        recommendations.push(
            "Consider using SIMD optimizations with the 'simd' feature for CPU acceleration"
                .to_string(),
        );
        recommendations.push("Use the 'parallel' feature for multi-threaded execution".to_string());
        return recommendations;
    }
    // Reports list GPUs best-first. Guard against an inconsistent report
    // (gpu_available set but `gpus` empty) instead of panicking on `gpus[0]`.
    let best_gpu = match report.gpus.first() {
        Some(gpu) => gpu,
        None => return recommendations,
    };
    if best_gpu.has_tensor_cores {
        recommendations.push(
            "Enable FP16 mixed precision to utilize Tensor Cores (future feature)".to_string(),
        );
    }
    if best_gpu.supports_int8 {
        recommendations.push(
            "Consider INT8 quantization for inference workloads (future feature)".to_string(),
        );
    }
    if best_gpu.memory_mb < 8192 {
        recommendations
            .push("GPU has <8GB memory: Use gradient checkpointing for training".to_string());
    } else if best_gpu.memory_mb >= 40960 {
        recommendations.push("Large GPU memory available: Can use larger batch sizes".to_string());
    }
    if let Some(wl) = workload {
        let batch_size = recommend_batch_size(best_gpu, wl);
        recommendations.push(format!(
            "Recommended batch size for GPU: {} (based on {} MB memory per sample)",
            batch_size, wl.peak_memory_mb
        ));
        if wl.compute_intensity < 10.0 {
            recommendations
                .push("Low compute intensity: Memory bandwidth is bottleneck".to_string());
        } else {
            recommendations.push("High compute intensity: Good for GPU acceleration".to_string());
        }
    }
    recommendations
}
#[cfg(test)]
mod tests {
    use super::*;

    // Known models return their spec-sheet bandwidth; unknown models use the
    // memory-based fallback heuristic (which must stay positive).
    #[test]
    fn test_estimate_memory_bandwidth() {
        assert_eq!(estimate_memory_bandwidth("NVIDIA A100", 40960), 1555.0);
        assert_eq!(
            estimate_memory_bandwidth("NVIDIA GeForce RTX 3090", 24576),
            936.0
        );
        assert!(estimate_memory_bandwidth("Unknown GPU", 16384) > 0.0);
    }

    // Name-based lookup takes precedence over the compute-capability fallback.
    #[test]
    fn test_estimate_cuda_cores() {
        assert_eq!(estimate_cuda_cores("NVIDIA A100", 8, 0), Some(6912));
        assert_eq!(
            estimate_cuda_cores("NVIDIA GeForce RTX 3090", 8, 6),
            Some(10496)
        );
    }

    // An A100-class device (sm_80) should score well above the baseline and
    // expose all derived feature flags.
    #[test]
    fn test_gpu_capability_score() {
        let cuda_info = CudaDeviceInfo {
            index: 0,
            name: "NVIDIA A100".to_string(),
            memory_mb: 40960,
            compute_capability: Some((8, 0)),
        };
        let cap = GpuCapability::from_cuda_device(&cuda_info);
        let score = cap.capability_score();
        assert!(score > 1000.0);
        assert!(cap.has_tensor_cores);
        assert!(cap.supports_fp16);
        assert!(cap.supports_int8);
    }

    // Runs against whatever hardware the test machine has, so both branches
    // (GPU present / absent) are covered conditionally.
    #[test]
    fn test_assess_gpu_readiness() {
        let report = assess_gpu_readiness();
        assert_eq!(report.gpu_count, report.gpus.len());
        assert_eq!(report.gpu_available, report.gpu_count > 0);
        if report.gpu_available {
            assert!(report.estimated_speedup.is_some());
            assert_ne!(report.recommended_device, Device::cpu());
            assert!(report.gpus.iter().any(|g| g.recommended));
        } else {
            assert_eq!(report.recommended_device, Device::cpu());
            assert!(report.estimated_speedup.is_none());
        }
        assert!(!report.recommendation_reasons.is_empty());
    }

    // Batch size must be a positive power of two within the Tensor-Core cap.
    #[test]
    fn test_recommend_batch_size() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "Test GPU".to_string(),
            memory_mb: 16384,
            memory_bandwidth_gbs: 500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(8192),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };
        let workload = WorkloadProfile {
            operation_count: 1000,
            avg_tensor_size: 100000,
            peak_memory_mb: 128,
            compute_intensity: 50.0,
        };
        let batch_size = recommend_batch_size(&gpu, &workload);
        assert!(batch_size > 0);
        assert!(batch_size <= 256);
        // Exactly one set bit <=> power of two.
        assert_eq!(batch_size.count_ones(), 1);
    }

    // The CPU-only path must still yield actionable suggestions.
    #[test]
    fn test_generate_recommendations() {
        let report = GpuReadinessReport {
            gpu_available: false,
            gpu_count: 0,
            gpus: vec![],
            recommended_device: Device::cpu(),
            recommendation_reasons: vec![],
            estimated_speedup: None,
        };
        let recommendations = generate_recommendations(&report, None);
        assert!(!recommendations.is_empty());
        assert!(recommendations
            .iter()
            .any(|r| r.contains("SIMD") || r.contains("parallel")));
    }

    // Speedup estimate must exceed 1x and respect the 50x clamp.
    #[test]
    fn test_estimate_gpu_speedup() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "High-end GPU".to_string(),
            memory_mb: 40960,
            memory_bandwidth_gbs: 1500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(10000),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };
        let speedup = estimate_gpu_speedup(&gpu);
        assert!(speedup > 1.0);
        assert!(speedup <= 50.0);
    }

    // Plain data struct: fields round-trip as assigned.
    #[test]
    fn test_workload_profile_creation() {
        let profile = WorkloadProfile {
            operation_count: 5000,
            avg_tensor_size: 250000,
            peak_memory_mb: 512,
            compute_intensity: 75.0,
        };
        assert_eq!(profile.operation_count, 5000);
        assert_eq!(profile.peak_memory_mb, 512);
        assert!(profile.compute_intensity > 50.0);
    }
}