tensorlogic_scirs_backend/gpu_readiness.rs

//! GPU Readiness Framework
//!
//! This module provides utilities for assessing GPU readiness and
//! planning for GPU execution. It helps determine optimal execution
//! strategies based on available hardware and workload characteristics.
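//!
//! # Example
//!
//! A minimal usage sketch (illustrative only; it assumes this module is
//! exposed as `tensorlogic_scirs_backend::gpu_readiness`):
//!
//! ```no_run
//! use tensorlogic_scirs_backend::gpu_readiness::assess_gpu_readiness;
//!
//! let report = assess_gpu_readiness();
//! println!("GPU available: {}", report.gpu_available);
//! println!("Recommended device: {:?}", report.recommended_device);
//! for reason in &report.recommendation_reasons {
//!     println!("  - {}", reason);
//! }
//! ```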

use crate::cuda_detect::{detect_cuda_devices, CudaDeviceInfo};
use crate::device::Device;

/// GPU readiness assessment result.
#[derive(Debug, Clone)]
pub struct GpuReadinessReport {
    /// Whether any GPU is available
    pub gpu_available: bool,

    /// Number of available GPUs
    pub gpu_count: usize,

    /// Detected GPU devices
    pub gpus: Vec<GpuCapability>,

    /// Recommended execution device
    pub recommended_device: Device,

    /// Reasons for the recommendation
    pub recommendation_reasons: Vec<String>,

    /// Estimated speedup over CPU (if a GPU is available)
    pub estimated_speedup: Option<f64>,
}

/// Detailed GPU capability information.
#[derive(Debug, Clone)]
pub struct GpuCapability {
    /// Device information
    pub device: Device,

    /// GPU name
    pub name: String,

    /// Total memory in MB
    pub memory_mb: u64,

    /// Memory bandwidth in GB/s (estimated)
    pub memory_bandwidth_gbs: f64,

    /// CUDA compute capability as (major, minor)
    pub compute_capability: Option<(u32, u32)>,

    /// CUDA cores (estimated based on architecture)
    pub cuda_cores: Option<u32>,

    /// Whether Tensor Cores are available
    pub has_tensor_cores: bool,

    /// FP16 support
    pub supports_fp16: bool,

    /// INT8 support
    pub supports_int8: bool,

    /// Whether this GPU is recommended for the workload
    pub recommended: bool,
}

impl GpuCapability {
    /// Create GPU capability from CUDA device info.
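    ///
    /// Feature flags are derived from the reported compute capability:
    /// Tensor Cores are assumed for major version >= 7 (Volta and newer),
    /// and FP16/INT8 support for major version >= 6 (Pascal and newer).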
    pub fn from_cuda_device(info: &CudaDeviceInfo) -> Self {
        let compute_capability = info.compute_capability;
        let has_tensor_cores = compute_capability
            .map(|(major, _minor)| major >= 7)
            .unwrap_or(false);

        let supports_fp16 = compute_capability
            .map(|(major, _)| major >= 6)
            .unwrap_or(false);

        let supports_int8 = compute_capability
            .map(|(major, _)| major >= 6)
            .unwrap_or(false);

        // Estimate memory bandwidth based on GPU name and memory size
        let memory_bandwidth_gbs = estimate_memory_bandwidth(&info.name, info.memory_mb);

        // Estimate CUDA cores based on GPU name and compute capability
        let cuda_cores = compute_capability
            .and_then(|(major, minor)| estimate_cuda_cores(&info.name, major, minor));

        Self {
            device: Device::cuda(info.index),
            name: info.name.clone(),
            memory_mb: info.memory_mb,
            memory_bandwidth_gbs,
            compute_capability,
            cuda_cores,
            has_tensor_cores,
            supports_fp16,
            supports_int8,
            recommended: false,
        }
    }

    /// Get a capability score (higher is better).
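    ///
    /// The score is a weighted sum of the capability fields, using the
    /// weights applied below:
    ///
    /// ```text
    /// score = 0.5 * bandwidth_gbs + 2.0 * memory_gb
    ///       + 100 * cc_major + 10 * cc_minor
    ///       + 200 (Tensor Cores) + 50 (FP16) + 30 (INT8)
    /// ```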
    pub fn capability_score(&self) -> f64 {
        let mut score = 0.0;

        // Memory bandwidth contribution
        score += self.memory_bandwidth_gbs * 0.5;

        // Memory size contribution (GB)
        score += (self.memory_mb as f64 / 1024.0) * 2.0;

        // Compute capability contribution
        if let Some((major, minor)) = self.compute_capability {
            score += (major as f64 * 100.0) + (minor as f64 * 10.0);
        }

        // Tensor cores bonus
        if self.has_tensor_cores {
            score += 200.0;
        }

        // FP16/INT8 support
        if self.supports_fp16 {
            score += 50.0;
        }
        if self.supports_int8 {
            score += 30.0;
        }

        score
    }
}

/// Estimate memory bandwidth based on GPU name and memory size.
fn estimate_memory_bandwidth(name: &str, memory_mb: u64) -> f64 {
    let name_lower = name.to_lowercase();

    // Known GPU families and their typical bandwidth
    if name_lower.contains("a100") {
        1555.0 // A100 40GB (the 80GB variant is higher)
    } else if name_lower.contains("a6000") {
        768.0 // RTX A6000
    } else if name_lower.contains("rtx 3090") {
        936.0 // RTX 3090
    } else if name_lower.contains("rtx 3080") {
        760.0 // RTX 3080
    } else if name_lower.contains("rtx 3070") {
        448.0 // RTX 3070
    } else if name_lower.contains("v100") {
        900.0 // V100
    } else if name_lower.contains("h100") {
        3000.0 // H100
    } else {
        // Rough estimate based on memory size:
        // assume ~30 GB/s per GB of memory (very rough heuristic)
        (memory_mb as f64 / 1024.0) * 30.0
    }
}

/// Estimate CUDA cores based on GPU name and compute capability.
fn estimate_cuda_cores(name: &str, major: u32, minor: u32) -> Option<u32> {
    let name_lower = name.to_lowercase();

    // Known GPU models
    if name_lower.contains("a100") {
        Some(6912)
    } else if name_lower.contains("a6000") {
        Some(10752)
    } else if name_lower.contains("rtx 3090") {
        Some(10496)
    } else if name_lower.contains("rtx 3080") {
        Some(8704)
    } else if name_lower.contains("rtx 3070") {
        Some(5888)
    } else if name_lower.contains("v100") {
        Some(5120)
    } else if name_lower.contains("h100") {
        Some(14592)
    } else {
        // Rough estimate based on compute capability
        match (major, minor) {
            (8, 6) => Some(8192), // Ampere
            (8, 0) => Some(6912), // Ampere
            (7, 5) => Some(4608), // Turing
            (7, 0) => Some(5120), // Volta
            _ => None,
        }
    }
}

/// Assess GPU readiness for TensorLogic execution.
pub fn assess_gpu_readiness() -> GpuReadinessReport {
    let cuda_devices = detect_cuda_devices();
    let gpu_count = cuda_devices.len();
    let gpu_available = gpu_count > 0;

    let mut gpus: Vec<GpuCapability> = cuda_devices
        .iter()
        .map(GpuCapability::from_cuda_device)
        .collect();

    // Rank GPUs by capability score (highest first)
    gpus.sort_by(|a, b| {
        b.capability_score()
            .partial_cmp(&a.capability_score())
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // Mark the best GPU as recommended
    if let Some(best_gpu) = gpus.first_mut() {
        best_gpu.recommended = true;
    }

    let mut recommendation_reasons = Vec::new();
    let recommended_device = if gpu_available {
        let best = &gpus[0];
        recommendation_reasons.push(format!(
            "GPU {} has highest capability score: {:.1}",
            best.name,
            best.capability_score()
        ));

        if best.has_tensor_cores {
            recommendation_reasons
                .push("GPU has Tensor Cores for accelerated matrix operations".to_string());
        }

        recommendation_reasons.push(format!(
            "GPU memory: {} GB ({:.0} GB/s bandwidth)",
            best.memory_mb / 1024,
            best.memory_bandwidth_gbs
        ));

        best.device.clone()
    } else {
        recommendation_reasons.push("No GPU detected, using CPU".to_string());
        recommendation_reasons.push("CPU is currently the only supported backend".to_string());
        Device::cpu()
    };

    let estimated_speedup = if gpu_available {
        Some(estimate_gpu_speedup(&gpus[0]))
    } else {
        None
    };

    GpuReadinessReport {
        gpu_available,
        gpu_count,
        gpus,
        recommended_device,
        recommendation_reasons,
        estimated_speedup,
    }
}

/// Estimate theoretical GPU speedup over CPU.
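///
/// The estimate is a rough heuristic. For example, an A100-class card
/// (~1555 GB/s, compute capability 8.x, Tensor Cores) yields raw factors of
/// 1555 / 30 ~ 51.8, then x2.6 for compute capability and x1.5 for Tensor
/// Cores, so the result is clamped to the 50x cap below.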
fn estimate_gpu_speedup(gpu: &GpuCapability) -> f64 {
    let mut speedup = 1.0;

    // Base speedup from memory bandwidth (GPU vs CPU ~30 GB/s)
    speedup *= gpu.memory_bandwidth_gbs / 30.0;

    // Compute capability contribution
    if let Some((major, _)) = gpu.compute_capability {
        speedup *= 1.0 + (major as f64 * 0.2);
    }

    // Tensor cores provide significant speedup for matrix operations
    if gpu.has_tensor_cores {
        speedup *= 1.5;
    }

    // Cap at realistic values (typical GPU speedup is 5-50x)
    speedup.clamp(1.0, 50.0)
}

/// Workload characteristics for optimization recommendations.
#[derive(Debug, Clone)]
pub struct WorkloadProfile {
    /// Number of tensor operations
    pub operation_count: usize,

    /// Average tensor size in elements
    pub avg_tensor_size: usize,

    /// Peak memory usage per sample in MB
    pub peak_memory_mb: u64,

    /// Compute intensity (FLOPs per byte)
    pub compute_intensity: f64,
}

/// Recommend an optimal batch size for GPU execution.
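///
/// For example (values are illustrative, not measured): a 16 GB GPU with
/// Tensor Cores and a workload of 128 MB per sample gives
/// 16384 * 0.8 = 13107 MB usable memory, a memory-bound maximum of
/// 13107 / 128 = 102 samples, capped at 256 and rounded down to a power of
/// two, i.e. a recommended batch size of 64.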
pub fn recommend_batch_size(gpu: &GpuCapability, workload: &WorkloadProfile) -> usize {
    let available_memory_mb = (gpu.memory_mb as f64 * 0.8) as u64; // Use 80% of GPU memory

    // Treat the workload's peak memory as the per-sample footprint
    let memory_per_sample_mb = workload.peak_memory_mb;

    if memory_per_sample_mb == 0 {
        return 1;
    }

    // Maximum batch size based on memory
    let max_batch = (available_memory_mb / memory_per_sample_mb).max(1) as usize;

    // Adjust based on hardware features
    let compute_adjusted = if gpu.has_tensor_cores {
        max_batch.min(256) // Tensor cores work well with medium batches
    } else {
        max_batch.min(128)
    };

    // Round down to the nearest power of 2 for optimal GPU utilization
    if compute_adjusted.is_power_of_two() {
        compute_adjusted
    } else {
        compute_adjusted.next_power_of_two() / 2
    }
}

/// Generate optimization recommendations based on GPU capabilities.
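///
/// # Example
///
/// An illustrative sketch (it assumes the same module path as the
/// crate-level example above):
///
/// ```no_run
/// use tensorlogic_scirs_backend::gpu_readiness::{assess_gpu_readiness, generate_recommendations};
///
/// let report = assess_gpu_readiness();
/// for tip in generate_recommendations(&report, None) {
///     println!("{}", tip);
/// }
/// ```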
pub fn generate_recommendations(
    report: &GpuReadinessReport,
    workload: Option<&WorkloadProfile>,
) -> Vec<String> {
    let mut recommendations = Vec::new();

    if !report.gpu_available {
        recommendations.push(
            "Consider using SIMD optimizations with the 'simd' feature for CPU acceleration"
                .to_string(),
        );
        recommendations.push("Use the 'parallel' feature for multi-threaded execution".to_string());
        return recommendations;
    }

    let best_gpu = &report.gpus[0];

    // GPU-specific recommendations
    if best_gpu.has_tensor_cores {
        recommendations.push(
            "Enable FP16 mixed precision to utilize Tensor Cores (future feature)".to_string(),
        );
    }

    if best_gpu.supports_int8 {
        recommendations.push(
            "Consider INT8 quantization for inference workloads (future feature)".to_string(),
        );
    }

    // Memory recommendations
    if best_gpu.memory_mb < 8192 {
        recommendations
            .push("GPU has <8GB memory: Use gradient checkpointing for training".to_string());
    } else if best_gpu.memory_mb >= 40960 {
        recommendations.push("Large GPU memory available: Can use larger batch sizes".to_string());
    }

    // Workload-specific recommendations
    if let Some(wl) = workload {
        let batch_size = recommend_batch_size(best_gpu, wl);
        recommendations.push(format!(
            "Recommended batch size for GPU: {} (based on {} MB memory per sample)",
            batch_size, wl.peak_memory_mb
        ));

        if wl.compute_intensity < 10.0 {
            recommendations
                .push("Low compute intensity: Memory bandwidth is bottleneck".to_string());
        } else {
            recommendations.push("High compute intensity: Good for GPU acceleration".to_string());
        }
    }

    recommendations
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_estimate_memory_bandwidth() {
        assert_eq!(estimate_memory_bandwidth("NVIDIA A100", 40960), 1555.0);
        assert_eq!(
            estimate_memory_bandwidth("NVIDIA GeForce RTX 3090", 24576),
            936.0
        );
        assert!(estimate_memory_bandwidth("Unknown GPU", 16384) > 0.0);
    }

    #[test]
    fn test_estimate_cuda_cores() {
        assert_eq!(estimate_cuda_cores("NVIDIA A100", 8, 0), Some(6912));
        assert_eq!(
            estimate_cuda_cores("NVIDIA GeForce RTX 3090", 8, 6),
            Some(10496)
        );
    }

    #[test]
    fn test_gpu_capability_score() {
        let cuda_info = CudaDeviceInfo {
            index: 0,
            name: "NVIDIA A100".to_string(),
            memory_mb: 40960,
            compute_capability: Some((8, 0)),
        };

        let cap = GpuCapability::from_cuda_device(&cuda_info);
        let score = cap.capability_score();

        // Should have high score due to tensor cores, memory, compute capability
        assert!(score > 1000.0);
        assert!(cap.has_tensor_cores);
        assert!(cap.supports_fp16);
        assert!(cap.supports_int8);
    }

    #[test]
    fn test_assess_gpu_readiness() {
        // Test GPU readiness assessment - behavior depends on actual hardware
        let report = assess_gpu_readiness();

        // Validate internal consistency regardless of GPU presence
        assert_eq!(report.gpu_count, report.gpus.len());
        assert_eq!(report.gpu_available, report.gpu_count > 0);

        if report.gpu_available {
            // If GPU is available, should have estimated speedup and recommend GPU
            assert!(report.estimated_speedup.is_some());
            assert_ne!(report.recommended_device, Device::cpu());
            // At least one GPU should be marked as recommended
            assert!(report.gpus.iter().any(|g| g.recommended));
        } else {
            // If no GPU, should recommend CPU and have no estimated speedup
            assert_eq!(report.recommended_device, Device::cpu());
            assert!(report.estimated_speedup.is_none());
        }

        // Should always have recommendation reasons
        assert!(!report.recommendation_reasons.is_empty());
    }

    #[test]
    fn test_recommend_batch_size() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "Test GPU".to_string(),
            memory_mb: 16384,
            memory_bandwidth_gbs: 500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(8192),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };

        let workload = WorkloadProfile {
            operation_count: 1000,
            avg_tensor_size: 100000,
            peak_memory_mb: 128,
            compute_intensity: 50.0,
        };

        let batch_size = recommend_batch_size(&gpu, &workload);

        // Should recommend reasonable batch size
        assert!(batch_size > 0);
        assert!(batch_size <= 256);
        // Should be a power of 2
        assert_eq!(batch_size.count_ones(), 1);
    }

    #[test]
    fn test_generate_recommendations() {
        let report = GpuReadinessReport {
            gpu_available: false,
            gpu_count: 0,
            gpus: vec![],
            recommended_device: Device::cpu(),
            recommendation_reasons: vec![],
            estimated_speedup: None,
        };

        let recommendations = generate_recommendations(&report, None);

        assert!(!recommendations.is_empty());
        assert!(recommendations
            .iter()
            .any(|r| r.contains("SIMD") || r.contains("parallel")));
    }

    #[test]
    fn test_estimate_gpu_speedup() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "High-end GPU".to_string(),
            memory_mb: 40960,
            memory_bandwidth_gbs: 1500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(10000),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };

        let speedup = estimate_gpu_speedup(&gpu);

        // Should estimate significant speedup
        assert!(speedup > 1.0);
        assert!(speedup <= 50.0); // Capped at 50x
    }

    #[test]
    fn test_workload_profile_creation() {
        let profile = WorkloadProfile {
            operation_count: 5000,
            avg_tensor_size: 250000,
            peak_memory_mb: 512,
            compute_intensity: 75.0,
        };

        assert_eq!(profile.operation_count, 5000);
        assert_eq!(profile.peak_memory_mb, 512);
        assert!(profile.compute_intensity > 50.0);
    }
}