use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::time::Instant;

#[derive(Debug, Clone)]
pub struct GpuDevice {
    pub id: u32,
    pub name: String,
    pub memory_total: u64,
    pub memory_available: u64,
    pub compute_capability: (u32, u32),
    pub cores: u32,
    pub clock_rate: u32,
    pub memory_bandwidth: u64,
    pub is_integrated: bool,
}

#[derive(Debug, Clone)]
pub struct GpuMemoryAllocation {
    pub ptr: u64,
    pub size: u64,
    pub device_id: u32,
    pub allocated_at: Instant,
    pub name: String,
}

#[derive(Debug, Clone)]
pub struct GpuKernelExecution {
    pub kernel_name: String,
    pub device_id: u32,
    pub grid_size: (u32, u32, u32),
    pub block_size: (u32, u32, u32),
    pub shared_memory: u32,
    pub execution_time: f64,
    pub parameters: HashMap<String, String>,
}

#[derive(Debug)]
pub struct GpuUtils {
    devices: Vec<GpuDevice>,
    allocations: Arc<RwLock<HashMap<u64, GpuMemoryAllocation>>>,
    kernel_executions: Arc<RwLock<Vec<GpuKernelExecution>>>,
    performance_counters: Arc<RwLock<HashMap<String, f64>>>,
}

impl GpuUtils {
    pub fn new() -> Self {
        Self {
            devices: Vec::new(),
            allocations: Arc::new(RwLock::new(HashMap::new())),
            kernel_executions: Arc::new(RwLock::new(Vec::new())),
            performance_counters: Arc::new(RwLock::new(HashMap::new())),
        }
    }

    pub fn init_devices(&mut self) -> Result<(), GpuError> {
        // Mock device discovery: one discrete and one integrated adapter.
        let mock_devices = vec![
            GpuDevice {
                id: 0,
                name: "NVIDIA GeForce RTX 3080".to_string(),
                memory_total: 10_737_418_240,
                memory_available: 9_663_676_416,
                compute_capability: (8, 6),
                cores: 8704,
                clock_rate: 1710,
                memory_bandwidth: 760_000_000_000,
                is_integrated: false,
            },
            GpuDevice {
                id: 1,
                name: "Intel UHD Graphics 770".to_string(),
                memory_total: 2_147_483_648,
                memory_available: 1_610_612_736,
                compute_capability: (0, 0),
                cores: 256,
                clock_rate: 1550,
                memory_bandwidth: 68_000_000_000,
                is_integrated: true,
            },
        ];

        self.devices = mock_devices;
        Ok(())
    }

    pub fn get_devices(&self) -> &[GpuDevice] {
        &self.devices
    }

    pub fn get_device(&self, id: u32) -> Option<&GpuDevice> {
        self.devices.iter().find(|d| d.id == id)
    }

    pub fn get_best_device(&self) -> Option<&GpuDevice> {
        // Prefer discrete devices; widen to u64 so the score cannot overflow.
        self.devices
            .iter()
            .filter(|d| !d.is_integrated)
            .max_by_key(|d| d.cores as u64 * d.clock_rate as u64)
            .or_else(|| self.devices.first())
    }

    pub fn allocate_memory(&self, size: u64, device_id: u32, name: &str) -> Result<u64, GpuError> {
        let device = self.get_device(device_id).ok_or(GpuError::DeviceNotFound)?;

        if size > device.memory_available {
            return Err(GpuError::OutOfMemory);
        }

        // Mock pointer: place the allocation just past the highest existing
        // one so distinct allocations never collide as map keys.
        let ptr = self
            .allocations
            .read()
            .unwrap()
            .values()
            .map(|a| a.ptr + a.size)
            .max()
            .unwrap_or(0x1000);
        let allocation = GpuMemoryAllocation {
            ptr,
            size,
            device_id,
            allocated_at: Instant::now(),
            name: name.to_string(),
        };

        self.allocations.write().unwrap().insert(ptr, allocation);
        Ok(ptr)
    }

    pub fn free_memory(&self, ptr: u64) -> Result<(), GpuError> {
        let mut allocations = self.allocations.write().unwrap();
        allocations.remove(&ptr).ok_or(GpuError::InvalidPointer)?;
        Ok(())
    }

    pub fn get_memory_stats(&self) -> HashMap<u32, MemoryStats> {
        let allocations = self.allocations.read().unwrap();
        let mut stats = HashMap::new();

        for device in &self.devices {
            let device_allocations: Vec<_> = allocations
                .values()
                .filter(|a| a.device_id == device.id)
                .collect();

            let total_allocated: u64 = device_allocations.iter().map(|a| a.size).sum();
            let num_allocations = device_allocations.len();

            stats.insert(
                device.id,
                MemoryStats {
                    total_memory: device.memory_total,
                    available_memory: device.memory_available,
                    allocated_memory: total_allocated,
                    // Saturate rather than underflow if bookkeeping drifts.
                    free_memory: device.memory_available.saturating_sub(total_allocated),
                    num_allocations,
                    largest_allocation: device_allocations
                        .iter()
                        .map(|a| a.size)
                        .max()
                        .unwrap_or(0),
                    // Allocations per allocated KiB; guard the division.
                    fragmentation_ratio: if total_allocated > 0 {
                        (num_allocations as f64) / (total_allocated as f64 / 1024.0)
                    } else {
                        0.0
                    },
                },
            );
        }

        stats
    }

    pub fn execute_kernel(&self, kernel: &GpuKernelInfo) -> Result<GpuKernelExecution, GpuError> {
        let _device = self
            .get_device(kernel.device_id)
            .ok_or(GpuError::DeviceNotFound)?;

        let start_time = Instant::now();

        // Simulate kernel work; a real backend would launch and synchronize.
        std::thread::sleep(std::time::Duration::from_millis(1));

        let execution_time = start_time.elapsed().as_secs_f64() * 1000.0;
        let execution = GpuKernelExecution {
            kernel_name: kernel.name.clone(),
            device_id: kernel.device_id,
            grid_size: kernel.grid_size,
            block_size: kernel.block_size,
            shared_memory: kernel.shared_memory,
            execution_time,
            parameters: kernel.parameters.clone(),
        };

        self.kernel_executions
            .write()
            .unwrap()
            .push(execution.clone());
        Ok(execution)
    }

    pub fn get_kernel_history(&self) -> Vec<GpuKernelExecution> {
        self.kernel_executions.read().unwrap().clone()
    }

    pub fn get_performance_counters(&self) -> HashMap<String, f64> {
        self.performance_counters.read().unwrap().clone()
    }

    pub fn update_counter(&self, name: &str, value: f64) {
        self.performance_counters
            .write()
            .unwrap()
            .insert(name.to_string(), value);
    }

    pub fn estimate_throughput(&self, device_id: u32, array_size: usize, operation: &str) -> f64 {
        let device = match self.get_device(device_id) {
            Some(d) => d,
            None => return 0.0,
        };

        let base_throughput = match operation {
            "add" | "subtract" | "multiply" => device.memory_bandwidth as f64 * 0.8,
            "divide" | "sqrt" | "exp" | "log" => device.memory_bandwidth as f64 * 0.6,
            "matrix_multiply" => (device.cores as f64 * device.clock_rate as f64 * 1e6) * 0.5,
            "fft" => (device.cores as f64 * device.clock_rate as f64 * 1e6) * 0.3,
            _ => device.memory_bandwidth as f64 * 0.5,
        };

        // Crude size penalty, capped at 50%; the clamp also keeps degenerate
        // sizes (0 or 1 elements) from producing a factor outside [0, 0.5].
        let array_factor = ((array_size as f64).log2() / 20.0).clamp(0.0, 0.5);
        base_throughput * (1.0 - array_factor)
    }

    pub fn should_use_gpu(&self, array_size: usize, operation: &str) -> bool {
        if self.devices.is_empty() {
            return false;
        }

        let min_size = match operation {
            "add" | "subtract" | "multiply" | "divide" => 1000,
            "matrix_multiply" => 100,
            "fft" | "conv" => 512,
            _ => 1000,
        };

        array_size >= min_size
    }

    pub fn get_utilization(&self) -> HashMap<u32, f64> {
        let mut utilization = HashMap::new();

        for device in &self.devices {
            // Heuristic: treat ten recorded executions as full utilization.
            let recent_executions = self
                .kernel_executions
                .read()
                .unwrap()
                .iter()
                .filter(|e| e.device_id == device.id)
                .filter(|e| e.execution_time > 0.0)
                .count();

            let util = (recent_executions as f64 / 10.0).min(1.0);
            utilization.insert(device.id, util);
        }

        utilization
    }

    pub fn cleanup(&self) -> Result<(), GpuError> {
        let allocations = self.allocations.read().unwrap();
        if !allocations.is_empty() {
            return Err(GpuError::ResourcesNotFreed);
        }

        self.kernel_executions.write().unwrap().clear();
        self.performance_counters.write().unwrap().clear();

        Ok(())
    }
}
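
// A minimal end-to-end sketch of `GpuUtils` (illustrative, not part of the
// public API contract): initialize the mock devices, pick the best discrete
// one, allocate a buffer, run a placeholder kernel, and clean up. The kernel
// and buffer names below are made up for the example.
pub fn example_gpu_utils_roundtrip() -> Result<(), GpuError> {
    let mut gpu = GpuUtils::new();
    gpu.init_devices()?;

    // Prefer a discrete device; fall back to device 0.
    let device_id = gpu.get_best_device().map(|d| d.id).unwrap_or(0);

    let ptr = gpu.allocate_memory(1 << 20, device_id, "example_buffer")?;

    let kernel = GpuKernelInfo {
        name: "example_kernel".to_string(),
        device_id,
        grid_size: (64, 1, 1),
        block_size: (256, 1, 1),
        shared_memory: 0,
        parameters: HashMap::new(),
    };
    let execution = gpu.execute_kernel(&kernel)?;
    gpu.update_counter("last_kernel_ms", execution.execution_time);

    // Cleanup fails with ResourcesNotFreed unless every allocation is freed.
    gpu.free_memory(ptr)?;
    gpu.cleanup()
}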

#[derive(Debug, Clone)]
pub struct GpuKernelInfo {
    pub name: String,
    pub device_id: u32,
    pub grid_size: (u32, u32, u32),
    pub block_size: (u32, u32, u32),
    pub shared_memory: u32,
    pub parameters: HashMap<String, String>,
}

#[derive(Debug, Clone)]
pub struct MemoryStats {
    pub total_memory: u64,
    pub available_memory: u64,
    pub allocated_memory: u64,
    pub free_memory: u64,
    pub num_allocations: usize,
    pub largest_allocation: u64,
    pub fragmentation_ratio: f64,
}
pub struct GpuArrayOps;

impl GpuArrayOps {
    pub fn add_arrays(a: &[f32], b: &[f32], _device_id: u32) -> Result<Vec<f32>, GpuError> {
        if a.len() != b.len() {
            return Err(GpuError::ShapeMismatch);
        }

        let result: Vec<f32> = a.iter().zip(b.iter()).map(|(x, y)| x + y).collect();
        Ok(result)
    }

    pub fn multiply_arrays(a: &[f32], b: &[f32], _device_id: u32) -> Result<Vec<f32>, GpuError> {
        if a.len() != b.len() {
            return Err(GpuError::ShapeMismatch);
        }

        let result: Vec<f32> = a.iter().zip(b.iter()).map(|(x, y)| x * y).collect();
        Ok(result)
    }

    pub fn matrix_multiply(
        a: &[f32],
        b: &[f32],
        m: usize,
        n: usize,
        k: usize,
        _device_id: u32,
    ) -> Result<Vec<f32>, GpuError> {
        if a.len() != m * k || b.len() != k * n {
            return Err(GpuError::ShapeMismatch);
        }

        // Naive row-major triple loop: the result is m x n.
        let mut result = vec![0.0f32; m * n];

        for i in 0..m {
            for j in 0..n {
                for l in 0..k {
                    result[i * n + j] += a[i * k + l] * b[l * n + j];
                }
            }
        }

        Ok(result)
    }

    pub fn apply_activation(
        input: &[f32],
        activation: ActivationFunction,
        _device_id: u32,
    ) -> Result<Vec<f32>, GpuError> {
        // Softmax normalizes over the whole slice, so it cannot be computed
        // with a per-element map; handle it separately, with the usual
        // max-subtraction trick for numerical stability.
        if let ActivationFunction::Softmax = activation {
            let max = input.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
            let exps: Vec<f32> = input.iter().map(|&x| (x - max).exp()).collect();
            let sum: f32 = exps.iter().sum();
            if sum == 0.0 || !sum.is_finite() {
                return Err(GpuError::ComputationError);
            }
            return Ok(exps.iter().map(|&e| e / sum).collect());
        }

        let result: Vec<f32> = input
            .iter()
            .map(|&x| match activation {
                ActivationFunction::ReLU => x.max(0.0),
                ActivationFunction::Sigmoid => 1.0 / (1.0 + (-x).exp()),
                ActivationFunction::Tanh => x.tanh(),
                ActivationFunction::Softmax => unreachable!("handled above"),
            })
            .collect();

        Ok(result)
    }

    pub fn reduce_sum(input: &[f32], _device_id: u32) -> Result<f32, GpuError> {
        Ok(input.iter().sum())
    }

    pub fn reduce_max(input: &[f32], _device_id: u32) -> Result<f32, GpuError> {
        // A single fold; an empty input leaves NEG_INFINITY, which we reject.
        let max = input.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
        if max.is_finite() {
            Ok(max)
        } else {
            Err(GpuError::ComputationError)
        }
    }
}
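
// Sketch of the array ops, which in this mock run on the CPU and ignore the
// device id argument. Inputs are interpreted as row-major matrices for the
// matmul step; values are illustrative.
pub fn example_array_ops() -> Result<(), GpuError> {
    let a = vec![1.0_f32, 2.0, 3.0, 4.0];
    let b = vec![5.0_f32, 6.0, 7.0, 8.0];

    let sum = GpuArrayOps::add_arrays(&a, &b, 0)?;
    debug_assert_eq!(sum, vec![6.0, 8.0, 10.0, 12.0]);

    // Treat `a` and `b` as 2x2 matrices: [[1, 2], [3, 4]] * [[5, 6], [7, 8]].
    let product = GpuArrayOps::matrix_multiply(&a, &b, 2, 2, 2, 0)?;
    debug_assert_eq!(product, vec![19.0, 22.0, 43.0, 50.0]);

    let total = GpuArrayOps::reduce_sum(&product, 0)?;
    debug_assert!(total > 0.0);
    Ok(())
}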

#[derive(Debug, Clone, Copy)]
pub enum ActivationFunction {
    ReLU,
    Sigmoid,
    Tanh,
    Softmax,
}

#[derive(Debug, thiserror::Error)]
pub enum GpuError {
    #[error("GPU device not found")]
    DeviceNotFound,
    #[error("Out of GPU memory")]
    OutOfMemory,
    #[error("Invalid GPU pointer")]
    InvalidPointer,
    #[error("GPU computation error")]
    ComputationError,
    #[error("Array shape mismatch")]
    ShapeMismatch,
    #[error("GPU resources not freed")]
    ResourcesNotFreed,
    #[error("GPU initialization failed: {0}")]
    InitializationFailed(String),
}

#[derive(Debug)]
pub struct GpuProfiler {
    kernel_times: HashMap<String, Vec<f64>>,
    memory_transfers: Vec<(Instant, u64, String)>,
    device_utilization: HashMap<u32, Vec<(Instant, f64)>>,
}

impl GpuProfiler {
    pub fn new() -> Self {
        Self {
            kernel_times: HashMap::new(),
            memory_transfers: Vec::new(),
            device_utilization: HashMap::new(),
        }
    }

    pub fn record_kernel_time(&mut self, kernel_name: &str, time_ms: f64) {
        self.kernel_times
            .entry(kernel_name.to_string())
            .or_default()
            .push(time_ms);
    }

    pub fn record_memory_transfer(&mut self, size: u64, direction: &str) {
        self.memory_transfers
            .push((Instant::now(), size, direction.to_string()));
    }

    pub fn record_utilization(&mut self, device_id: u32, utilization: f64) {
        self.device_utilization
            .entry(device_id)
            .or_default()
            .push((Instant::now(), utilization));
    }

    pub fn get_kernel_stats(&self) -> HashMap<String, KernelStats> {
        let mut stats = HashMap::new();

        for (kernel_name, times) in &self.kernel_times {
            let count = times.len();
            let total_time: f64 = times.iter().sum();
            let avg_time = total_time / count as f64;
            let min_time = times.iter().fold(f64::INFINITY, |a, &b| a.min(b));
            let max_time = times.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));

            stats.insert(
                kernel_name.clone(),
                KernelStats {
                    count,
                    total_time,
                    avg_time,
                    min_time,
                    max_time,
                },
            );
        }

        stats
    }

    pub fn get_memory_transfer_stats(&self) -> MemoryTransferStats {
        let total_transfers = self.memory_transfers.len();
        let total_bytes: u64 = self.memory_transfers.iter().map(|(_, size, _)| size).sum();

        let host_to_device = self
            .memory_transfers
            .iter()
            .filter(|(_, _, dir)| dir == "host_to_device")
            .count();

        let device_to_host = self
            .memory_transfers
            .iter()
            .filter(|(_, _, dir)| dir == "device_to_host")
            .count();

        MemoryTransferStats {
            total_transfers,
            total_bytes,
            host_to_device_transfers: host_to_device,
            device_to_host_transfers: device_to_host,
        }
    }

    pub fn clear(&mut self) {
        self.kernel_times.clear();
        self.memory_transfers.clear();
        self.device_utilization.clear();
    }
}
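
// Sketch of a profiling session with synthetic numbers: record a couple of
// kernel timings, one transfer, and one utilization sample, then read the
// aggregates back. The kernel name and values are illustrative.
pub fn example_profiler_session() {
    let mut profiler = GpuProfiler::new();
    profiler.record_kernel_time("vector_add", 1.2);
    profiler.record_kernel_time("vector_add", 1.4);
    profiler.record_memory_transfer(4096, "host_to_device");
    profiler.record_utilization(0, 0.85);

    let stats = profiler.get_kernel_stats();
    if let Some(s) = stats.get("vector_add") {
        debug_assert_eq!(s.count, 2);
        debug_assert!(s.min_time <= s.avg_time && s.avg_time <= s.max_time);
    }
}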

#[derive(Debug, Clone)]
pub struct KernelStats {
    pub count: usize,
    pub total_time: f64,
    pub avg_time: f64,
    pub min_time: f64,
    pub max_time: f64,
}

#[derive(Debug, Clone)]
pub struct MemoryTransferStats {
    pub total_transfers: usize,
    pub total_bytes: u64,
    pub host_to_device_transfers: usize,
    pub device_to_host_transfers: usize,
}

impl Default for GpuUtils {
    fn default() -> Self {
        Self::new()
    }
}

impl Default for GpuProfiler {
    fn default() -> Self {
        Self::new()
    }
}

pub struct MultiGpuCoordinator {
    gpus: HashMap<u32, GpuUtils>,
    load_balancer: LoadBalancer,
    #[allow(dead_code)]
    communication_topology: CommunicationTopology,
    #[allow(dead_code)]
    synchronization_barriers: Vec<SynchronizationBarrier>,
}

impl Default for MultiGpuCoordinator {
    fn default() -> Self {
        Self::new()
    }
}

impl MultiGpuCoordinator {
    pub fn new() -> Self {
        Self {
            gpus: HashMap::new(),
            load_balancer: LoadBalancer::new(),
            communication_topology: CommunicationTopology::Ring,
            synchronization_barriers: Vec::new(),
        }
    }

    pub fn init_all_gpus(&mut self) -> Result<(), GpuError> {
        // Probe up to eight GPU slots, keeping only those whose id matches a
        // device the mock discovery actually exposes; otherwise later kernel
        // launches against that id would fail with DeviceNotFound.
        for gpu_id in 0..8 {
            let mut gpu = GpuUtils::new();
            if gpu.init_devices().is_ok() && gpu.get_device(gpu_id).is_some() {
                self.gpus.insert(gpu_id, gpu);
            }
        }

        if self.gpus.is_empty() {
            return Err(GpuError::InitializationFailed("No GPUs found".to_string()));
        }

        Ok(())
    }

    pub fn get_optimal_assignment(&self, workload: &DistributedWorkload) -> Vec<GpuAssignment> {
        self.load_balancer.assign_workload(workload, &self.gpus)
    }

    pub fn execute_distributed(
        &self,
        operation: &DistributedOperation,
    ) -> Result<DistributedResult, GpuError> {
        let assignments = self.get_optimal_assignment(&operation.workload);
        let mut results = Vec::new();

        for assignment in assignments {
            let gpu = self
                .gpus
                .get(&assignment.gpu_id)
                .ok_or(GpuError::DeviceNotFound)?;

            let kernel_info = GpuKernelInfo {
                name: operation.kernel_name.clone(),
                device_id: assignment.gpu_id,
                grid_size: assignment.grid_size,
                block_size: assignment.block_size,
                shared_memory: assignment.shared_memory,
                parameters: assignment.parameters.clone(),
            };

            let execution = gpu.execute_kernel(&kernel_info)?;
            results.push(execution);
        }

        let total_time: f64 = results.iter().map(|e| e.execution_time).sum();
        Ok(DistributedResult {
            executions: results,
            total_time,
            communication_overhead: 0.0, // not modeled in this mock
        })
    }

    pub fn synchronize_all(&self) -> Result<(), GpuError> {
        // Mock barrier: a real backend would wait on every device's queue.
        std::thread::sleep(std::time::Duration::from_millis(1));
        Ok(())
    }

    pub fn get_cluster_memory_stats(&self) -> ClusterMemoryStats {
        let mut total_memory = 0;
        let mut total_allocated = 0;
        let mut device_stats = HashMap::new();

        for (gpu_id, gpu) in &self.gpus {
            let stats = gpu.get_memory_stats();
            if let Some(stat) = stats.get(gpu_id) {
                total_memory += stat.total_memory;
                total_allocated += stat.allocated_memory;
                device_stats.insert(*gpu_id, stat.clone());
            }
        }

        ClusterMemoryStats {
            total_memory,
            total_allocated,
            total_free: total_memory - total_allocated,
            num_devices: self.gpus.len(),
            device_stats,
        }
    }
}
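
// Sketch of the distributed path, assuming `init_all_gpus` finds at least
// one usable coordinator entry in the mock probe. The kernel name and
// workload numbers are illustrative.
pub fn example_distributed_add() -> Result<DistributedResult, GpuError> {
    let mut coordinator = MultiGpuCoordinator::new();
    coordinator.init_all_gpus()?;

    let operation = DistributedOperation {
        kernel_name: "distributed_add".to_string(),
        workload: DistributedWorkload {
            total_elements: 1 << 16,
            operation_type: "add".to_string(),
            memory_requirement: (1 << 16) * 4, // f32 elements
            computation_complexity: 1.0,
        },
    };

    let result = coordinator.execute_distributed(&operation)?;
    coordinator.synchronize_all()?;
    Ok(result)
}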

pub struct GpuMemoryPool {
    pools: HashMap<u32, Vec<MemoryBlock>>,
    #[allow(dead_code)]
    allocation_strategy: AllocationStrategy,
    #[allow(dead_code)]
    fragmentation_threshold: f64,
}

impl GpuMemoryPool {
    pub fn new(strategy: AllocationStrategy) -> Self {
        Self {
            pools: HashMap::new(),
            allocation_strategy: strategy,
            fragmentation_threshold: 0.3,
        }
    }

    pub fn allocate(&mut self, size: u64, device_id: u32) -> Result<u64, GpuError> {
        let pool = self.pools.entry(device_id).or_default();

        // First fit over the free list. Find the index first, then mutate,
        // so the search borrow ends before the pool is modified.
        if let Some(i) = pool
            .iter()
            .position(|block| !block.is_allocated && block.size >= size)
        {
            // Split blocks that are much larger than the request so the
            // remainder stays available for later allocations.
            if pool[i].size > size * 2 {
                let new_block = MemoryBlock {
                    ptr: pool[i].ptr + size,
                    size: pool[i].size - size,
                    is_allocated: false,
                    allocation_time: None,
                };
                pool[i].size = size;
                pool.push(new_block);
            }

            pool[i].is_allocated = true;
            pool[i].allocation_time = Some(Instant::now());
            return Ok(pool[i].ptr);
        }

        // No free block fits: extend the pool with a fresh block.
        let ptr = self.allocate_new_block(size, device_id)?;

        let pool = self.pools.entry(device_id).or_default();
        pool.push(MemoryBlock {
            ptr,
            size,
            is_allocated: true,
            allocation_time: Some(Instant::now()),
        });

        Ok(ptr)
    }

    pub fn free(&mut self, ptr: u64, device_id: u32) -> Result<(), GpuError> {
        let pool = self
            .pools
            .get_mut(&device_id)
            .ok_or(GpuError::DeviceNotFound)?;

        // Mark the block free first; merging reborrows `self`, so it has to
        // happen after the pool borrow ends.
        let block = pool
            .iter_mut()
            .find(|block| block.ptr == ptr)
            .ok_or(GpuError::InvalidPointer)?;
        block.is_allocated = false;
        block.allocation_time = None;

        self.try_merge_blocks(device_id);
        Ok(())
    }

    pub fn defragment(&mut self, device_id: u32) -> Result<DefragmentationResult, GpuError> {
        let before_fragmentation = self.calculate_fragmentation(device_id);
        let before_blocks = self
            .pools
            .get(&device_id)
            .ok_or(GpuError::DeviceNotFound)?
            .len();

        // Coalescing adjacent free blocks is exactly the merge pass used by
        // `free`, so reuse it rather than duplicating the loop.
        self.try_merge_blocks(device_id);

        let after_blocks = self.pools.get(&device_id).map_or(0, |p| p.len());
        let after_fragmentation = self.calculate_fragmentation(device_id);

        Ok(DefragmentationResult {
            blocks_before: before_blocks,
            blocks_after: after_blocks,
            fragmentation_before: before_fragmentation,
            fragmentation_after: after_fragmentation,
        })
    }

    fn allocate_new_block(&self, _size: u64, device_id: u32) -> Result<u64, GpuError> {
        // Mock address assignment: place the block just past the highest
        // existing block so pointers stay unique within a device pool.
        let base = self
            .pools
            .get(&device_id)
            .and_then(|pool| pool.iter().map(|b| b.ptr + b.size).max())
            .unwrap_or(0x1000);
        Ok(base)
    }

    fn try_merge_blocks(&mut self, device_id: u32) {
        if let Some(pool) = self.pools.get_mut(&device_id) {
            pool.sort_by_key(|b| b.ptr);

            // Merge runs of adjacent free blocks; `i + 1 < len` also guards
            // against the empty-pool underflow of `len() - 1`.
            let mut i = 0;
            while i + 1 < pool.len() {
                if !pool[i].is_allocated
                    && !pool[i + 1].is_allocated
                    && pool[i].ptr + pool[i].size == pool[i + 1].ptr
                {
                    pool[i].size += pool[i + 1].size;
                    pool.remove(i + 1);
                } else {
                    i += 1;
                }
            }
        }
    }

    fn calculate_fragmentation(&self, device_id: u32) -> f64 {
        let empty_pool = Vec::new();
        let pool = self.pools.get(&device_id).unwrap_or(&empty_pool);
        let free_blocks = pool.iter().filter(|b| !b.is_allocated).count();
        let total_blocks = pool.len();

        if total_blocks == 0 {
            0.0
        } else {
            free_blocks as f64 / total_blocks as f64
        }
    }
}
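
// Sketch of the pool lifecycle under the first-fit strategy: allocate two
// blocks, free one, and run an explicit defragmentation pass. Device id 0
// and the sizes are arbitrary for illustration.
pub fn example_pool_lifecycle() -> Result<(), GpuError> {
    let mut pool = GpuMemoryPool::new(AllocationStrategy::FirstFit);

    let a = pool.allocate(1024, 0)?;
    let b = pool.allocate(4096, 0)?;
    pool.free(a, 0)?;

    let report = pool.defragment(0)?;
    debug_assert!(report.fragmentation_after <= report.fragmentation_before);

    pool.free(b, 0)
}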

pub struct AsyncGpuOps {
    streams: HashMap<u32, Vec<GpuStream>>,
    pending_operations: Vec<AsyncOperation>,
}

impl Default for AsyncGpuOps {
    fn default() -> Self {
        Self::new()
    }
}

impl AsyncGpuOps {
    pub fn new() -> Self {
        Self {
            streams: HashMap::new(),
            pending_operations: Vec::new(),
        }
    }

    pub fn create_stream(&mut self, device_id: u32) -> Result<u32, GpuError> {
        // Stream ids are per-device indices: the next id is the current count.
        let stream_id = self.streams.get(&device_id).map_or(0, |s| s.len() as u32);
        let stream = GpuStream {
            id: stream_id,
            device_id,
            is_busy: false,
            priority: StreamPriority::Normal,
        };

        self.streams.entry(device_id).or_default().push(stream);
        Ok(stream_id)
    }

    pub fn launch_kernel_async(
        &mut self,
        kernel: &GpuKernelInfo,
        stream_id: u32,
    ) -> Result<AsyncOperationHandle, GpuError> {
        let operation = AsyncOperation {
            id: self.pending_operations.len() as u32,
            kernel_info: kernel.clone(),
            stream_id,
            start_time: Instant::now(),
            status: OperationStatus::Pending,
        };

        let handle = AsyncOperationHandle {
            operation_id: operation.id,
            device_id: kernel.device_id,
        };

        self.pending_operations.push(operation);
        Ok(handle)
    }

    pub fn wait_for_completion(
        &mut self,
        handle: &AsyncOperationHandle,
    ) -> Result<GpuKernelExecution, GpuError> {
        // Simulate the device finishing the work.
        std::thread::sleep(std::time::Duration::from_millis(1));

        if let Some(op) = self
            .pending_operations
            .iter_mut()
            .find(|op| op.id == handle.operation_id)
        {
            op.status = OperationStatus::Completed;

            Ok(GpuKernelExecution {
                kernel_name: op.kernel_info.name.clone(),
                device_id: op.kernel_info.device_id,
                grid_size: op.kernel_info.grid_size,
                block_size: op.kernel_info.block_size,
                shared_memory: op.kernel_info.shared_memory,
                execution_time: op.start_time.elapsed().as_secs_f64() * 1000.0,
                parameters: op.kernel_info.parameters.clone(),
            })
        } else {
            Err(GpuError::ComputationError)
        }
    }

    pub fn is_complete(&self, handle: &AsyncOperationHandle) -> bool {
        self.pending_operations
            .iter()
            .find(|op| op.id == handle.operation_id)
            .is_some_and(|op| matches!(op.status, OperationStatus::Completed))
    }
}
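
// Sketch of the async launch/wait handshake. Completion here is simulated
// by `wait_for_completion` rather than polled from a real device queue; the
// kernel below is a placeholder.
pub fn example_async_launch() -> Result<GpuKernelExecution, GpuError> {
    let mut async_ops = AsyncGpuOps::new();
    let stream_id = async_ops.create_stream(0)?;

    let kernel = GpuKernelInfo {
        name: "async_example".to_string(),
        device_id: 0,
        grid_size: (8, 1, 1),
        block_size: (128, 1, 1),
        shared_memory: 0,
        parameters: HashMap::new(),
    };

    let handle = async_ops.launch_kernel_async(&kernel, stream_id)?;
    async_ops.wait_for_completion(&handle)
}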

pub struct GpuOptimizationAdvisor {
    performance_history: HashMap<String, Vec<PerformanceMetric>>,
    optimization_rules: Vec<OptimizationRule>,
}

impl Default for GpuOptimizationAdvisor {
    fn default() -> Self {
        Self::new()
    }
}

impl GpuOptimizationAdvisor {
    pub fn new() -> Self {
        let mut advisor = Self {
            performance_history: HashMap::new(),
            optimization_rules: Vec::new(),
        };

        advisor.init_default_rules();
        advisor
    }

    pub fn analyze_performance(
        &mut self,
        kernel_name: &str,
        execution: &GpuKernelExecution,
        workload_size: usize,
    ) -> Vec<OptimizationRecommendation> {
        let metric = PerformanceMetric {
            execution_time: execution.execution_time,
            throughput: workload_size as f64 / execution.execution_time,
            memory_bandwidth: 0.0, // not measured in this mock
            occupancy: self.calculate_occupancy(execution),
        };

        self.performance_history
            .entry(kernel_name.to_string())
            .or_default()
            .push(metric.clone());

        let mut recommendations = Vec::new();

        for rule in &self.optimization_rules {
            if let Some(recommendation) = rule.evaluate(&metric, execution) {
                recommendations.push(recommendation);
            }
        }

        recommendations
    }

    fn init_default_rules(&mut self) {
        self.optimization_rules.push(OptimizationRule {
            name: "Low Occupancy".to_string(),
            condition: Box::new(|metric, _| metric.occupancy < 0.5),
            recommendation: "Consider increasing block size or reducing register usage".to_string(),
            priority: RecommendationPriority::High,
        });

        self.optimization_rules.push(OptimizationRule {
            name: "Memory Bandwidth".to_string(),
            condition: Box::new(|metric, _| metric.memory_bandwidth < 0.7),
            recommendation: "Optimize memory access patterns for better coalescing".to_string(),
            priority: RecommendationPriority::Medium,
        });

        self.optimization_rules.push(OptimizationRule {
            name: "Small Grid Size".to_string(),
            condition: Box::new(|_, execution| {
                // Widen to u64 so large launches cannot overflow the product.
                let total_threads = execution.grid_size.0 as u64
                    * execution.grid_size.1 as u64
                    * execution.grid_size.2 as u64
                    * execution.block_size.0 as u64
                    * execution.block_size.1 as u64
                    * execution.block_size.2 as u64;
                total_threads < 1024
            }),
            recommendation: "Consider increasing grid size to better utilize GPU cores".to_string(),
            priority: RecommendationPriority::Low,
        });
    }

    fn calculate_occupancy(&self, execution: &GpuKernelExecution) -> f64 {
        // Crude stand-in for real occupancy: assume 2048 resident threads per
        // SM and up to 32 resident blocks, then normalize.
        let threads_per_block =
            execution.block_size.0 * execution.block_size.1 * execution.block_size.2;
        let blocks_per_sm = 2048 / threads_per_block.max(1);
        (blocks_per_sm as f64 / 32.0).min(1.0)
    }
}
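
// Sketch: feed the advisor a deliberately tiny launch so the "Small Grid
// Size" rule (and, since this mock never measures bandwidth, the "Memory
// Bandwidth" rule) fires. All numbers are illustrative.
pub fn example_advisor_analysis() -> Vec<OptimizationRecommendation> {
    let mut advisor = GpuOptimizationAdvisor::new();
    let execution = GpuKernelExecution {
        kernel_name: "tiny_kernel".to_string(),
        device_id: 0,
        grid_size: (1, 1, 1),
        block_size: (64, 1, 1),
        shared_memory: 0,
        execution_time: 0.5,
        parameters: HashMap::new(),
    };
    advisor.analyze_performance("tiny_kernel", &execution, 64)
}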

#[derive(Debug, Clone)]
pub struct DistributedWorkload {
    pub total_elements: usize,
    pub operation_type: String,
    pub memory_requirement: u64,
    pub computation_complexity: f64,
}

#[derive(Debug, Clone)]
pub struct DistributedOperation {
    pub kernel_name: String,
    pub workload: DistributedWorkload,
}

#[derive(Debug, Clone)]
pub struct DistributedResult {
    pub executions: Vec<GpuKernelExecution>,
    pub total_time: f64,
    pub communication_overhead: f64,
}

#[derive(Debug, Clone)]
pub struct GpuAssignment {
    pub gpu_id: u32,
    pub grid_size: (u32, u32, u32),
    pub block_size: (u32, u32, u32),
    pub shared_memory: u32,
    pub parameters: HashMap<String, String>,
}

#[derive(Debug, Clone)]
pub struct LoadBalancer {
    #[allow(dead_code)]
    strategy: LoadBalancingStrategy,
}

impl Default for LoadBalancer {
    fn default() -> Self {
        Self::new()
    }
}

impl LoadBalancer {
    pub fn new() -> Self {
        Self {
            strategy: LoadBalancingStrategy::WorkloadProportional,
        }
    }

    pub fn assign_workload(
        &self,
        workload: &DistributedWorkload,
        gpus: &HashMap<u32, GpuUtils>,
    ) -> Vec<GpuAssignment> {
        let mut assignments = Vec::new();
        let num_gpus = gpus.len() as u32;

        if num_gpus == 0 {
            return assignments;
        }

        let elements_per_gpu = workload.total_elements / num_gpus as usize;

        for (gpu_id, _) in gpus.iter() {
            // Round the grid up and keep at least one block so small
            // workloads never produce a zero-sized launch.
            let assignment = GpuAssignment {
                gpu_id: *gpu_id,
                grid_size: ((elements_per_gpu as u32).div_ceil(256).max(1), 1, 1),
                block_size: (256, 1, 1),
                shared_memory: 0,
                parameters: HashMap::new(),
            };
            assignments.push(assignment);
        }

        assignments
    }
}
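
// Sketch of an even split across two mock GPUs: each assignment gets a
// 256-thread block and a grid rounded up with `div_ceil`, so no GPU ends up
// with a zero-sized launch. Workload numbers are illustrative.
pub fn example_even_split() -> Vec<GpuAssignment> {
    let mut gpus = HashMap::new();
    for id in 0..2u32 {
        let mut gpu = GpuUtils::new();
        let _ = gpu.init_devices();
        gpus.insert(id, gpu);
    }

    let workload = DistributedWorkload {
        total_elements: 10_000,
        operation_type: "add".to_string(),
        memory_requirement: 40_000,
        computation_complexity: 1.0,
    };

    LoadBalancer::new().assign_workload(&workload, &gpus)
}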

#[derive(Debug, Clone)]
pub enum LoadBalancingStrategy {
    RoundRobin,
    WorkloadProportional,
    MemoryAware,
    PerformanceBased,
}

#[derive(Debug, Clone)]
pub enum CommunicationTopology {
    Ring,
    Tree,
    AllToAll,
    Custom(Vec<Vec<u32>>),
}

#[derive(Debug, Clone)]
pub struct SynchronizationBarrier {
    pub id: u32,
    pub participating_gpus: Vec<u32>,
    pub barrier_type: BarrierType,
}

#[derive(Debug, Clone)]
pub enum BarrierType {
    Global,
    Local(Vec<u32>),
    Hierarchical,
}

#[derive(Debug, Clone)]
pub struct ClusterMemoryStats {
    pub total_memory: u64,
    pub total_allocated: u64,
    pub total_free: u64,
    pub num_devices: usize,
    pub device_stats: HashMap<u32, MemoryStats>,
}

#[derive(Debug, Clone)]
pub struct MemoryBlock {
    pub ptr: u64,
    pub size: u64,
    pub is_allocated: bool,
    pub allocation_time: Option<Instant>,
}

#[derive(Debug, Clone)]
pub enum AllocationStrategy {
    FirstFit,
    BestFit,
    WorstFit,
    BuddySystem,
}

#[derive(Debug, Clone)]
pub struct DefragmentationResult {
    pub blocks_before: usize,
    pub blocks_after: usize,
    pub fragmentation_before: f64,
    pub fragmentation_after: f64,
}

#[derive(Debug, Clone)]
pub struct GpuStream {
    pub id: u32,
    pub device_id: u32,
    pub is_busy: bool,
    pub priority: StreamPriority,
}

#[derive(Debug, Clone)]
pub enum StreamPriority {
    Low,
    Normal,
    High,
}

#[derive(Debug, Clone)]
pub struct AsyncOperation {
    pub id: u32,
    pub kernel_info: GpuKernelInfo,
    pub stream_id: u32,
    pub start_time: Instant,
    pub status: OperationStatus,
}

#[derive(Debug, Clone)]
pub enum OperationStatus {
    Pending,
    Running,
    Completed,
    Failed,
}

#[derive(Debug, Clone)]
pub struct AsyncOperationHandle {
    pub operation_id: u32,
    pub device_id: u32,
}

#[derive(Debug, Clone)]
pub struct PerformanceMetric {
    pub execution_time: f64,
    pub throughput: f64,
    pub memory_bandwidth: f64,
    pub occupancy: f64,
}

type OptimizationCondition =
    Box<dyn Fn(&PerformanceMetric, &GpuKernelExecution) -> bool + Send + Sync>;

pub struct OptimizationRule {
    pub name: String,
    pub condition: OptimizationCondition,
    pub recommendation: String,
    pub priority: RecommendationPriority,
}

impl std::fmt::Debug for OptimizationRule {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("OptimizationRule")
            .field("name", &self.name)
            .field("condition", &"<function>")
            .field("recommendation", &self.recommendation)
            .field("priority", &self.priority)
            .finish()
    }
}

impl Clone for OptimizationRule {
    fn clone(&self) -> Self {
        OptimizationRule {
            name: self.name.clone(),
            // Boxed closures cannot be cloned, so a cloned rule gets a
            // condition that never fires; the clone is descriptive only.
            condition: Box::new(|_metric, _execution| false),
            recommendation: self.recommendation.clone(),
            priority: self.priority.clone(),
        }
    }
}

impl OptimizationRule {
    pub fn evaluate(
        &self,
        metric: &PerformanceMetric,
        execution: &GpuKernelExecution,
    ) -> Option<OptimizationRecommendation> {
        if (self.condition)(metric, execution) {
            Some(OptimizationRecommendation {
                rule_name: self.name.clone(),
                recommendation: self.recommendation.clone(),
                priority: self.priority.clone(),
                estimated_improvement: 0.0, // not estimated in this mock
            })
        } else {
            None
        }
    }
}

#[derive(Debug, Clone)]
pub struct OptimizationRecommendation {
    pub rule_name: String,
    pub recommendation: String,
    pub priority: RecommendationPriority,
    pub estimated_improvement: f64,
}

#[derive(Debug, Clone)]
pub enum RecommendationPriority {
    Low,
    Medium,
    High,
    Critical,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_gpu_utils_creation() {
        let utils = GpuUtils::new();
        assert!(utils.devices.is_empty());
        assert!(utils.allocations.read().unwrap().is_empty());
    }

    #[test]
    fn test_device_initialization() {
        let mut utils = GpuUtils::new();
        assert!(utils.init_devices().is_ok());
        assert!(!utils.devices.is_empty());
    }

    #[test]
    fn test_device_selection() {
        let mut utils = GpuUtils::new();
        utils.init_devices().unwrap();

        let best_device = utils.get_best_device();
        assert!(best_device.is_some());
        assert!(!best_device.unwrap().is_integrated);
    }

    #[test]
    fn test_memory_allocation() {
        let mut utils = GpuUtils::new();
        utils.init_devices().unwrap();

        let ptr = utils.allocate_memory(1024, 0, "test").unwrap();
        assert!(ptr > 0);

        assert!(utils.free_memory(ptr).is_ok());
    }

    #[test]
    fn test_kernel_execution() {
        let mut utils = GpuUtils::new();
        utils.init_devices().unwrap();

        let kernel_info = GpuKernelInfo {
            name: "test_kernel".to_string(),
            device_id: 0,
            grid_size: (1, 1, 1),
            block_size: (256, 1, 1),
            shared_memory: 0,
            parameters: HashMap::new(),
        };

        let execution = utils.execute_kernel(&kernel_info).unwrap();
        assert_eq!(execution.kernel_name, "test_kernel");
        assert!(execution.execution_time > 0.0);
    }

    #[test]
    fn test_array_operations() {
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];

        let result = GpuArrayOps::add_arrays(&a, &b, 0).unwrap();
        assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);

        let result = GpuArrayOps::multiply_arrays(&a, &b, 0).unwrap();
        assert_eq!(result, vec![5.0, 12.0, 21.0, 32.0]);
    }

    #[test]
    fn test_matrix_multiplication() {
        // Row-major 2x2 matrices: [[1, 2], [3, 4]] * [[5, 6], [7, 8]].
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        let result = GpuArrayOps::matrix_multiply(&a, &b, 2, 2, 2, 0).unwrap();
        assert_eq!(result, vec![19.0, 22.0, 43.0, 50.0]);
    }

    #[test]
    fn test_activation_functions() {
        let input = vec![-1.0, 0.0, 1.0, 2.0];

        let result = GpuArrayOps::apply_activation(&input, ActivationFunction::ReLU, 0).unwrap();
        assert_eq!(result, vec![0.0, 0.0, 1.0, 2.0]);

        let result = GpuArrayOps::apply_activation(&input, ActivationFunction::Sigmoid, 0).unwrap();
        assert!(result.iter().all(|&x| (0.0..=1.0).contains(&x)));
    }
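
    // Added alongside the normalized softmax path in `apply_activation`:
    // softmax outputs form a probability distribution, so they should sum
    // to one and stay within [0, 1].
    #[test]
    fn test_softmax_normalization() {
        let input = vec![-1.0, 0.0, 1.0, 2.0];
        let result =
            GpuArrayOps::apply_activation(&input, ActivationFunction::Softmax, 0).unwrap();

        let sum: f32 = result.iter().sum();
        assert!((sum - 1.0).abs() < 1e-5);
        assert!(result.iter().all(|&x| (0.0..=1.0).contains(&x)));
    }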

    #[test]
    fn test_reduction_operations() {
        let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];

        let sum = GpuArrayOps::reduce_sum(&input, 0).unwrap();
        assert_eq!(sum, 15.0);

        let max = GpuArrayOps::reduce_max(&input, 0).unwrap();
        assert_eq!(max, 5.0);
    }

    #[test]
    fn test_gpu_profiler() {
        let mut profiler = GpuProfiler::new();

        profiler.record_kernel_time("test_kernel", 1.5);
        profiler.record_kernel_time("test_kernel", 2.0);
        profiler.record_memory_transfer(1024, "host_to_device");

        let stats = profiler.get_kernel_stats();
        assert!(stats.contains_key("test_kernel"));
        assert_eq!(stats["test_kernel"].count, 2);
        assert_eq!(stats["test_kernel"].avg_time, 1.75);

        let mem_stats = profiler.get_memory_transfer_stats();
        assert_eq!(mem_stats.total_transfers, 1);
        assert_eq!(mem_stats.total_bytes, 1024);
    }

    #[test]
    fn test_throughput_estimation() {
        let mut utils = GpuUtils::new();
        utils.init_devices().unwrap();

        let throughput = utils.estimate_throughput(0, 1000, "add");
        assert!(throughput > 0.0);

        let should_use = utils.should_use_gpu(1000, "add");
        assert!(should_use);

        let should_not_use = utils.should_use_gpu(100, "add");
        assert!(!should_not_use);
    }

    #[test]
    fn test_memory_stats() {
        let mut utils = GpuUtils::new();
        utils.init_devices().unwrap();

        let _ptr = utils.allocate_memory(1024, 0, "test").unwrap();
        let stats = utils.get_memory_stats();

        assert!(stats.contains_key(&0));
        assert_eq!(stats[&0].allocated_memory, 1024);
        assert_eq!(stats[&0].num_allocations, 1);
    }

    #[test]
    fn test_error_handling() {
        let utils = GpuUtils::new();

        let result = utils.allocate_memory(1024, 999, "test");
        assert!(matches!(result, Err(GpuError::DeviceNotFound)));

        let result = utils.free_memory(0);
        assert!(matches!(result, Err(GpuError::InvalidPointer)));

        let a = vec![1.0, 2.0];
        let b = vec![3.0, 4.0, 5.0];
        let result = GpuArrayOps::add_arrays(&a, &b, 0);
        assert!(matches!(result, Err(GpuError::ShapeMismatch)));
    }

    #[test]
    fn test_multi_gpu_coordinator() {
        let mut coordinator = MultiGpuCoordinator::new();

        let result = coordinator.init_all_gpus();
        assert!(result.is_ok() || matches!(result, Err(GpuError::InitializationFailed(_))));

        let workload = DistributedWorkload {
            total_elements: 10_000,
            operation_type: "matrix_multiply".to_string(),
            memory_requirement: 1024 * 1024,
            computation_complexity: 1.0,
        };

        let assignments = coordinator.get_optimal_assignment(&workload);
        assert!(!assignments.is_empty() || coordinator.gpus.is_empty());
    }

    #[test]
    fn test_distributed_operation() {
        let mut coordinator = MultiGpuCoordinator::new();
        let init_result = coordinator.init_all_gpus();

        let operation = DistributedOperation {
            kernel_name: "test_kernel".to_string(),
            workload: DistributedWorkload {
                total_elements: 1000,
                operation_type: "add".to_string(),
                memory_requirement: 4000,
                computation_complexity: 0.5,
            },
        };

        if init_result.is_ok() && !coordinator.gpus.is_empty() {
            let result = coordinator.execute_distributed(&operation);

            if let Ok(dist_result) = result {
                assert!(!dist_result.executions.is_empty());
                assert!(dist_result.total_time >= 0.0);
            } else {
                assert!(!coordinator.gpus.is_empty());
            }
        } else {
            assert!(coordinator.gpus.is_empty());
        }
    }

    #[test]
    fn test_cluster_memory_stats() {
        let mut coordinator = MultiGpuCoordinator::new();
        let _ = coordinator.init_all_gpus();

        let stats = coordinator.get_cluster_memory_stats();
        assert_eq!(stats.num_devices, coordinator.gpus.len());
        assert_eq!(stats.total_free, stats.total_memory - stats.total_allocated);
    }

    #[test]
    fn test_gpu_memory_pool() {
        let mut pool = GpuMemoryPool::new(AllocationStrategy::FirstFit);

        let ptr1 = pool.allocate(1024, 0);
        assert!(ptr1.is_ok());

        let ptr2 = pool.allocate(2048, 0);
        assert!(ptr2.is_ok());

        let free_result = pool.free(ptr1.unwrap(), 0);
        assert!(free_result.is_ok());

        let defrag_result = pool.defragment(0);
        assert!(defrag_result.is_ok());

        let defrag = defrag_result.unwrap();
        assert!(defrag.fragmentation_after <= defrag.fragmentation_before);
    }
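
    // Added check for first-fit reuse: after freeing a large block, a
    // smaller request should be carved out of it (splitting off the
    // remainder) rather than extending the pool, so the pointer is reused.
    #[test]
    fn test_memory_pool_block_reuse() {
        let mut pool = GpuMemoryPool::new(AllocationStrategy::FirstFit);

        let big = pool.allocate(4096, 0).unwrap();
        pool.free(big, 0).unwrap();

        let small = pool.allocate(1024, 0).unwrap();
        assert_eq!(small, big);
    }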

    #[test]
    fn test_memory_pool_strategies() {
        let strategies = vec![
            AllocationStrategy::FirstFit,
            AllocationStrategy::BestFit,
            AllocationStrategy::WorstFit,
            AllocationStrategy::BuddySystem,
        ];

        for strategy in strategies {
            let mut pool = GpuMemoryPool::new(strategy);
            let ptr = pool.allocate(1024, 0);
            assert!(ptr.is_ok());
        }
    }

    #[test]
    fn test_async_gpu_operations() {
        let mut async_ops = AsyncGpuOps::new();

        let stream_id = async_ops.create_stream(0);
        assert!(stream_id.is_ok());

        let kernel_info = GpuKernelInfo {
            name: "async_test".to_string(),
            device_id: 0,
            grid_size: (1, 1, 1),
            block_size: (256, 1, 1),
            shared_memory: 0,
            parameters: HashMap::new(),
        };

        let handle = async_ops.launch_kernel_async(&kernel_info, stream_id.unwrap());
        assert!(handle.is_ok());

        let operation_handle = handle.unwrap();

        let _is_complete_before = async_ops.is_complete(&operation_handle);

        let execution = async_ops.wait_for_completion(&operation_handle);
        assert!(execution.is_ok());

        let is_complete_after = async_ops.is_complete(&operation_handle);
        assert!(is_complete_after);
    }

    #[test]
    fn test_gpu_optimization_advisor() {
        let mut advisor = GpuOptimizationAdvisor::new();

        // A deliberately small launch (320 threads total) so the
        // "Small Grid Size" rule triggers.
        let execution = GpuKernelExecution {
            kernel_name: "test_kernel".to_string(),
            device_id: 0,
            grid_size: (10, 1, 1),
            block_size: (32, 1, 1),
            shared_memory: 0,
            execution_time: 5.0,
            parameters: HashMap::new(),
        };

        let recommendations = advisor.analyze_performance("test_kernel", &execution, 1000);
        assert!(!recommendations.is_empty());

        let has_grid_size_recommendation = recommendations
            .iter()
            .any(|r| r.rule_name.contains("Grid Size"));
        assert!(has_grid_size_recommendation);
    }

    #[test]
    fn test_load_balancer() {
        let balancer = LoadBalancer::new();
        let mut gpus = HashMap::new();

        let mut gpu1 = GpuUtils::new();
        let mut gpu2 = GpuUtils::new();
        let _ = gpu1.init_devices();
        let _ = gpu2.init_devices();

        gpus.insert(0, gpu1);
        gpus.insert(1, gpu2);

        let workload = DistributedWorkload {
            total_elements: 10_000,
            operation_type: "matrix_multiply".to_string(),
            memory_requirement: 1024 * 1024,
            computation_complexity: 1.0,
        };

        let assignments = balancer.assign_workload(&workload, &gpus);
        assert_eq!(assignments.len(), gpus.len());

        let total_elements: u32 = assignments
            .iter()
            .map(|a| a.grid_size.0 * a.block_size.0)
            .sum();
        assert!(total_elements > 0);
    }

    #[test]
    fn test_stream_priorities() {
        let mut async_ops = AsyncGpuOps::new();
        let _stream_id = async_ops.create_stream(0).unwrap();

        let streams = async_ops.streams.get(&0).unwrap();
        assert_eq!(streams.len(), 1);
        assert!(matches!(streams[0].priority, StreamPriority::Normal));
    }

    #[test]
    fn test_memory_block_operations() {
        let block1 = MemoryBlock {
            ptr: 1000,
            size: 1024,
            is_allocated: false,
            allocation_time: None,
        };

        let block2 = MemoryBlock {
            ptr: 2024,
            size: 2048,
            is_allocated: true,
            allocation_time: Some(Instant::now()),
        };

        assert!(!block1.is_allocated);
        assert!(block2.is_allocated);
        assert!(block1.allocation_time.is_none());
        assert!(block2.allocation_time.is_some());
    }

    #[test]
    fn test_distributed_workload() {
        let workload = DistributedWorkload {
            total_elements: 1_000_000,
            operation_type: "fft".to_string(),
            memory_requirement: 8 * 1_000_000, // 8 bytes per element
            computation_complexity: 2.5,
        };

        assert_eq!(workload.total_elements, 1_000_000);
        assert_eq!(workload.operation_type, "fft");
        assert!(workload.computation_complexity > 1.0);
    }

    #[test]
    fn test_communication_topology() {
        let ring_topology = CommunicationTopology::Ring;
        let tree_topology = CommunicationTopology::Tree;
        let all_to_all_topology = CommunicationTopology::AllToAll;
        let custom_topology =
            CommunicationTopology::Custom(vec![vec![1, 2], vec![0, 3], vec![0, 3], vec![1, 2]]);

        assert!(matches!(ring_topology, CommunicationTopology::Ring));
        assert!(matches!(tree_topology, CommunicationTopology::Tree));
        assert!(matches!(all_to_all_topology, CommunicationTopology::AllToAll));
        assert!(matches!(custom_topology, CommunicationTopology::Custom(_)));
    }

    #[test]
    fn test_synchronization_barrier() {
        let barrier = SynchronizationBarrier {
            id: 1,
            participating_gpus: vec![0, 1, 2, 3],
            barrier_type: BarrierType::Global,
        };

        assert_eq!(barrier.id, 1);
        assert_eq!(barrier.participating_gpus.len(), 4);
        assert!(matches!(barrier.barrier_type, BarrierType::Global));
    }

    #[test]
    fn test_optimization_recommendation_priorities() {
        let low_priority = RecommendationPriority::Low;
        let medium_priority = RecommendationPriority::Medium;
        let high_priority = RecommendationPriority::High;
        let critical_priority = RecommendationPriority::Critical;

        assert!(matches!(low_priority, RecommendationPriority::Low));
        assert!(matches!(medium_priority, RecommendationPriority::Medium));
        assert!(matches!(high_priority, RecommendationPriority::High));
        assert!(matches!(critical_priority, RecommendationPriority::Critical));
    }

    #[test]
    fn test_performance_metric_calculations() {
        let metric = PerformanceMetric {
            execution_time: 10.0,  // ms
            throughput: 1000.0,    // elements per ms
            memory_bandwidth: 0.8, // fraction of peak
            occupancy: 0.75,
        };

        assert!(metric.execution_time > 0.0);
        assert!(metric.throughput > 0.0);
        assert!(metric.memory_bandwidth <= 1.0);
        assert!(metric.occupancy <= 1.0);
    }

    #[test]
    fn test_operation_status_transitions() {
        let mut operation = AsyncOperation {
            id: 0,
            kernel_info: GpuKernelInfo {
                name: "test".to_string(),
                device_id: 0,
                grid_size: (1, 1, 1),
                block_size: (1, 1, 1),
                shared_memory: 0,
                parameters: HashMap::new(),
            },
            stream_id: 0,
            start_time: Instant::now(),
            status: OperationStatus::Pending,
        };

        assert!(matches!(operation.status, OperationStatus::Pending));

        operation.status = OperationStatus::Running;
        assert!(matches!(operation.status, OperationStatus::Running));

        operation.status = OperationStatus::Completed;
        assert!(matches!(operation.status, OperationStatus::Completed));
    }
}