tensorlogic_scirs_backend/gpu_readiness.rs

//! GPU readiness assessment: detects CUDA devices, scores their
//! capabilities, and recommends a device, batch size, and tuning steps.

use crate::cuda_detect::{detect_cuda_devices, CudaDeviceInfo};
use crate::device::Device;

/// Summary of GPU availability and recommendations for this machine.
#[derive(Debug, Clone)]
pub struct GpuReadinessReport {
    /// Whether at least one CUDA GPU was detected.
    pub gpu_available: bool,

    /// Number of detected GPUs.
    pub gpu_count: usize,

    /// Per-GPU capability details, sorted best-first.
    pub gpus: Vec<GpuCapability>,

    /// The device recommended for execution.
    pub recommended_device: Device,

    /// Human-readable reasons behind the recommendation.
    pub recommendation_reasons: Vec<String>,

    /// Rough speedup estimate over CPU, if a GPU is available.
    pub estimated_speedup: Option<f64>,
}

/// Capability profile for a single GPU.
#[derive(Debug, Clone)]
pub struct GpuCapability {
    /// Device handle for this GPU.
    pub device: Device,

    /// Device name as reported by the driver.
    pub name: String,

    /// Total device memory in MB.
    pub memory_mb: u64,

    /// Estimated peak memory bandwidth in GB/s.
    pub memory_bandwidth_gbs: f64,

    /// CUDA compute capability as (major, minor), if known.
    pub compute_capability: Option<(u32, u32)>,

    /// Estimated CUDA core count, if it can be inferred.
    pub cuda_cores: Option<u32>,

    /// Whether the GPU has Tensor Cores (compute capability 7.0+).
    pub has_tensor_cores: bool,

    /// Whether the GPU supports FP16 arithmetic.
    pub supports_fp16: bool,

    /// Whether the GPU supports INT8 arithmetic.
    pub supports_int8: bool,

    /// Whether this GPU is the recommended device.
    pub recommended: bool,
}

impl GpuCapability {
    /// Builds a capability profile from detected CUDA device info.
    pub fn from_cuda_device(info: &CudaDeviceInfo) -> Self {
        let compute_capability = info.compute_capability;

        // Tensor Cores arrived with Volta (compute capability 7.0).
        let has_tensor_cores = compute_capability
            .map(|(major, _minor)| major >= 7)
            .unwrap_or(false);

        // FP16 and INT8 support are assumed from Pascal (6.x) onward;
        // this is a heuristic, not a driver query.
        let supports_fp16 = compute_capability
            .map(|(major, _)| major >= 6)
            .unwrap_or(false);

        let supports_int8 = compute_capability
            .map(|(major, _)| major >= 6)
            .unwrap_or(false);

        let memory_bandwidth_gbs = estimate_memory_bandwidth(&info.name, info.memory_mb);

        let cuda_cores = compute_capability
            .and_then(|(major, minor)| estimate_cuda_cores(&info.name, major, minor));

        Self {
            device: Device::cuda(info.index),
            name: info.name.clone(),
            memory_mb: info.memory_mb,
            memory_bandwidth_gbs,
            compute_capability,
            cuda_cores,
            has_tensor_cores,
            supports_fp16,
            supports_int8,
            recommended: false,
        }
    }

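    /// Heuristic score used to rank GPUs (higher is better): memory
    /// bandwidth, memory capacity, compute capability, and feature
    /// flags each contribute.
    ///
    /// Worked example (the values follow from the weights below): an
    /// A100 with 1555 GB/s, 40 GB, compute capability 8.0, Tensor
    /// Cores, FP16, and INT8 scores
    /// 1555 * 0.5 + 40 * 2 + (8 * 100 + 0 * 10) + 200 + 50 + 30 = 1937.5.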
    pub fn capability_score(&self) -> f64 {
        let mut score = 0.0;

        // Memory bandwidth: 0.5 points per GB/s.
        score += self.memory_bandwidth_gbs * 0.5;

        // Memory capacity: 2 points per GB.
        score += (self.memory_mb as f64 / 1024.0) * 2.0;

        // Compute capability: 100 points per major, 10 per minor version.
        if let Some((major, minor)) = self.compute_capability {
            score += (major as f64 * 100.0) + (minor as f64 * 10.0);
        }

        // Flat bonuses for hardware features.
        if self.has_tensor_cores {
            score += 200.0;
        }
        if self.supports_fp16 {
            score += 50.0;
        }
        if self.supports_int8 {
            score += 30.0;
        }

        score
    }
}

/// Estimates peak memory bandwidth in GB/s from the device name,
/// falling back to a capacity-based guess for unknown GPUs.
fn estimate_memory_bandwidth(name: &str, memory_mb: u64) -> f64 {
    let name_lower = name.to_lowercase();

    // Approximate published peak bandwidths for well-known GPUs.
    if name_lower.contains("a100") {
        1555.0
    } else if name_lower.contains("a6000") {
        768.0
    } else if name_lower.contains("rtx 3090") {
        936.0
    } else if name_lower.contains("rtx 3080") {
        760.0
    } else if name_lower.contains("rtx 3070") {
        448.0
    } else if name_lower.contains("v100") {
        900.0
    } else if name_lower.contains("h100") {
        3000.0
    } else {
        // Unknown GPU: assume roughly 30 GB/s per GB of memory.
        (memory_mb as f64 / 1024.0) * 30.0
    }
}

/// Estimates the CUDA core count from the device name, falling back to
/// typical counts for the given compute capability.
fn estimate_cuda_cores(name: &str, major: u32, minor: u32) -> Option<u32> {
    let name_lower = name.to_lowercase();

    if name_lower.contains("a100") {
        Some(6912)
    } else if name_lower.contains("a6000") {
        Some(10752)
    } else if name_lower.contains("rtx 3090") {
        Some(10496)
    } else if name_lower.contains("rtx 3080") {
        Some(8704)
    } else if name_lower.contains("rtx 3070") {
        Some(5888)
    } else if name_lower.contains("v100") {
        Some(5120)
    } else if name_lower.contains("h100") {
        Some(14592)
    } else {
        // Typical core counts per architecture generation.
        match (major, minor) {
            (8, 6) => Some(8192),
            (8, 0) => Some(6912),
            (7, 5) => Some(4608),
            (7, 0) => Some(5120),
            _ => None,
        }
    }
}

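/// Detects available CUDA GPUs, ranks them by capability score, and
/// returns a report with the recommended device and reasoning.
///
/// A minimal usage sketch (the crate path is assumed from the file
/// location; actual output depends on the hardware detected at runtime):
///
/// ```no_run
/// use tensorlogic_scirs_backend::gpu_readiness::assess_gpu_readiness;
///
/// let report = assess_gpu_readiness();
/// println!("recommended device: {:?}", report.recommended_device);
/// for reason in &report.recommendation_reasons {
///     println!("  - {}", reason);
/// }
/// if let Some(speedup) = report.estimated_speedup {
///     println!("estimated speedup over CPU: {:.1}x", speedup);
/// }
/// ```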
pub fn assess_gpu_readiness() -> GpuReadinessReport {
    let cuda_devices = detect_cuda_devices();
    let gpu_count = cuda_devices.len();
    let gpu_available = gpu_count > 0;

    let mut gpus: Vec<GpuCapability> = cuda_devices
        .iter()
        .map(GpuCapability::from_cuda_device)
        .collect();

    // Sort best-first by capability score.
    gpus.sort_by(|a, b| {
        b.capability_score()
            .partial_cmp(&a.capability_score())
            .unwrap_or(std::cmp::Ordering::Equal)
    });

    // Mark the top-ranked GPU as recommended.
    if let Some(best_gpu) = gpus.first_mut() {
        best_gpu.recommended = true;
    }

    let mut recommendation_reasons = Vec::new();
    let recommended_device = if gpu_available {
        let best = &gpus[0];
        recommendation_reasons.push(format!(
            "GPU {} has highest capability score: {:.1}",
            best.name,
            best.capability_score()
        ));

        if best.has_tensor_cores {
            recommendation_reasons
                .push("GPU has Tensor Cores for accelerated matrix operations".to_string());
        }

        recommendation_reasons.push(format!(
            "GPU memory: {} GB ({:.0} GB/s bandwidth)",
            best.memory_mb / 1024,
            best.memory_bandwidth_gbs
        ));

        best.device.clone()
    } else {
        recommendation_reasons.push("No GPU detected, using CPU".to_string());
        recommendation_reasons.push("CPU is currently the only supported backend".to_string());
        Device::cpu()
    };

    let estimated_speedup = if gpu_available {
        Some(estimate_gpu_speedup(&gpus[0]))
    } else {
        None
    };

    GpuReadinessReport {
        gpu_available,
        gpu_count,
        gpus,
        recommended_device,
        recommendation_reasons,
        estimated_speedup,
    }
}

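/// Rough estimate of end-to-end speedup over CPU, clamped to [1, 50].
///
/// Worked example (follows directly from the formula below): a GPU with
/// 1500 GB/s bandwidth, compute capability 8.0, and Tensor Cores gives
/// (1500 / 30) * (1 + 8 * 0.2) * 1.5 = 195, which clamps to 50.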
fn estimate_gpu_speedup(gpu: &GpuCapability) -> f64 {
    let mut speedup = 1.0;

    // Bandwidth ratio vs. an assumed ~30 GB/s of CPU memory bandwidth.
    speedup *= gpu.memory_bandwidth_gbs / 30.0;

    // Newer architectures get a per-major-version bonus.
    if let Some((major, _)) = gpu.compute_capability {
        speedup *= 1.0 + (major as f64 * 0.2);
    }

    if gpu.has_tensor_cores {
        speedup *= 1.5;
    }

    // Keep the estimate within a plausible range.
    speedup.clamp(1.0, 50.0)
}

/// Coarse description of a workload, used for batch-size recommendations.
#[derive(Debug, Clone)]
pub struct WorkloadProfile {
    /// Total number of tensor operations in the workload.
    pub operation_count: usize,

    /// Average number of elements per tensor.
    pub avg_tensor_size: usize,

    /// Peak memory required per sample, in MB.
    pub peak_memory_mb: u64,

    /// Relative compute intensity; higher means more compute-bound.
    pub compute_intensity: f64,
}

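/// Recommends a power-of-two batch size that fits in GPU memory.
///
/// Worked example (follows from the code below): a 16384 MB GPU with
/// Tensor Cores and 128 MB per sample has 13107 MB usable (80%), so at
/// most 102 samples fit; rounding down to a power of two yields 64.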
pub fn recommend_batch_size(gpu: &GpuCapability, workload: &WorkloadProfile) -> usize {
    // Reserve 20% of GPU memory as headroom.
    let available_memory_mb = (gpu.memory_mb as f64 * 0.8) as u64;
    let memory_per_sample_mb = workload.peak_memory_mb;

    if memory_per_sample_mb == 0 {
        return 1;
    }

    let max_batch = (available_memory_mb / memory_per_sample_mb).max(1) as usize;

    // Cap the batch size; Tensor Core GPUs tolerate larger batches.
    let compute_adjusted = if gpu.has_tensor_cores {
        max_batch.min(256)
    } else {
        max_batch.min(128)
    };

    // Round down to a power of two for alignment-friendly batches. The
    // final max(1) guards against returning 0 when only one sample fits.
    (compute_adjusted.next_power_of_two() / 2).max(1)
}

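/// Produces human-readable tuning recommendations from a readiness
/// report and an optional workload profile.
///
/// A minimal usage sketch (crate path assumed from the file location):
///
/// ```no_run
/// use tensorlogic_scirs_backend::gpu_readiness::{
///     assess_gpu_readiness, generate_recommendations,
/// };
///
/// let report = assess_gpu_readiness();
/// for tip in generate_recommendations(&report, None) {
///     println!("{}", tip);
/// }
/// ```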
pub fn generate_recommendations(
    report: &GpuReadinessReport,
    workload: Option<&WorkloadProfile>,
) -> Vec<String> {
    let mut recommendations = Vec::new();

    // Without a GPU, suggest the CPU-side acceleration features.
    if !report.gpu_available {
        recommendations.push(
            "Consider using SIMD optimizations with the 'simd' feature for CPU acceleration"
                .to_string(),
        );
        recommendations.push("Use the 'parallel' feature for multi-threaded execution".to_string());
        return recommendations;
    }

    let best_gpu = &report.gpus[0];

    if best_gpu.has_tensor_cores {
        recommendations.push(
            "Enable FP16 mixed precision to utilize Tensor Cores (future feature)".to_string(),
        );
    }

    if best_gpu.supports_int8 {
        recommendations.push(
            "Consider INT8 quantization for inference workloads (future feature)".to_string(),
        );
    }

    // Memory-based advice.
    if best_gpu.memory_mb < 8192 {
        recommendations
            .push("GPU has <8GB memory: Use gradient checkpointing for training".to_string());
    } else if best_gpu.memory_mb >= 40960 {
        recommendations.push("Large GPU memory available: Can use larger batch sizes".to_string());
    }

    if let Some(wl) = workload {
        let batch_size = recommend_batch_size(best_gpu, wl);
        recommendations.push(format!(
            "Recommended batch size for GPU: {} (based on {} MB memory per sample)",
            batch_size, wl.peak_memory_mb
        ));

        if wl.compute_intensity < 10.0 {
            recommendations
                .push("Low compute intensity: Memory bandwidth is bottleneck".to_string());
        } else {
            recommendations.push("High compute intensity: Good for GPU acceleration".to_string());
        }
    }

    recommendations
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_estimate_memory_bandwidth() {
        assert_eq!(estimate_memory_bandwidth("NVIDIA A100", 40960), 1555.0);
        assert_eq!(
            estimate_memory_bandwidth("NVIDIA GeForce RTX 3090", 24576),
            936.0
        );
        assert!(estimate_memory_bandwidth("Unknown GPU", 16384) > 0.0);
    }

    #[test]
    fn test_estimate_cuda_cores() {
        assert_eq!(estimate_cuda_cores("NVIDIA A100", 8, 0), Some(6912));
        assert_eq!(
            estimate_cuda_cores("NVIDIA GeForce RTX 3090", 8, 6),
            Some(10496)
        );
    }

    #[test]
    fn test_gpu_capability_score() {
        let cuda_info = CudaDeviceInfo {
            index: 0,
            name: "NVIDIA A100".to_string(),
            memory_mb: 40960,
            compute_capability: Some((8, 0)),
        };

        let cap = GpuCapability::from_cuda_device(&cuda_info);
        let score = cap.capability_score();

        assert!(score > 1000.0);
        assert!(cap.has_tensor_cores);
        assert!(cap.supports_fp16);
        assert!(cap.supports_int8);
    }

    #[test]
    fn test_assess_gpu_readiness() {
        let report = assess_gpu_readiness();

        assert_eq!(report.gpu_count, report.gpus.len());
        assert_eq!(report.gpu_available, report.gpu_count > 0);

        if report.gpu_available {
            assert!(report.estimated_speedup.is_some());
            assert_ne!(report.recommended_device, Device::cpu());
            assert!(report.gpus.iter().any(|g| g.recommended));
        } else {
            assert_eq!(report.recommended_device, Device::cpu());
            assert!(report.estimated_speedup.is_none());
        }

        assert!(!report.recommendation_reasons.is_empty());
    }

    #[test]
    fn test_recommend_batch_size() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "Test GPU".to_string(),
            memory_mb: 16384,
            memory_bandwidth_gbs: 500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(8192),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };

        let workload = WorkloadProfile {
            operation_count: 1000,
            avg_tensor_size: 100000,
            peak_memory_mb: 128,
            compute_intensity: 50.0,
        };

        let batch_size = recommend_batch_size(&gpu, &workload);

        assert!(batch_size > 0);
        assert!(batch_size <= 256);
        assert_eq!(batch_size.count_ones(), 1);
    }

    #[test]
    fn test_generate_recommendations() {
        let report = GpuReadinessReport {
            gpu_available: false,
            gpu_count: 0,
            gpus: vec![],
            recommended_device: Device::cpu(),
            recommendation_reasons: vec![],
            estimated_speedup: None,
        };

        let recommendations = generate_recommendations(&report, None);

        assert!(!recommendations.is_empty());
        assert!(recommendations
            .iter()
            .any(|r| r.contains("SIMD") || r.contains("parallel")));
    }

    #[test]
    fn test_estimate_gpu_speedup() {
        let gpu = GpuCapability {
            device: Device::cuda(0),
            name: "High-end GPU".to_string(),
            memory_mb: 40960,
            memory_bandwidth_gbs: 1500.0,
            compute_capability: Some((8, 0)),
            cuda_cores: Some(10000),
            has_tensor_cores: true,
            supports_fp16: true,
            supports_int8: true,
            recommended: true,
        };

        let speedup = estimate_gpu_speedup(&gpu);

        assert!(speedup > 1.0);
        assert!(speedup <= 50.0);
    }

    #[test]
    fn test_workload_profile_creation() {
        let profile = WorkloadProfile {
            operation_count: 5000,
            avg_tensor_size: 250000,
            peak_memory_mb: 512,
            compute_intensity: 75.0,
        };

        assert_eq!(profile.operation_count, 5000);
        assert_eq!(profile.peak_memory_mb, 512);
        assert!(profile.compute_intensity > 50.0);
    }
}