amari_gpu/
performance.rs

//! GPU performance optimization and profiling infrastructure
//!
//! This module provides comprehensive GPU performance profiling, workgroup optimization,
//! and adaptive tuning capabilities for all mathematical operations in the Amari library.

6use crate::{SharedGpuContext, UnifiedGpuResult};
7use std::collections::HashMap;
8use std::time::Instant;
9
/// GPU performance profiler with timestamp queries and metrics collection
///
/// Owns a wgpu timestamp query set and tracks in-flight profiling sessions
/// by name, accumulating completed [`GpuProfile`] records for reporting.
pub struct GpuProfiler {
    /// Shared device/queue handle; also queried for buffer-pool statistics.
    context: SharedGpuContext,
    /// Timestamp query set; each profile reserves a start/end pair of slots.
    query_set: wgpu::QuerySet,
    #[allow(dead_code)] // Used in timestamp calculations
    /// Nanoseconds per timestamp tick, from `Queue::get_timestamp_period`.
    timestamp_period: f32,
    /// Sessions begun but not yet ended, keyed by operation name.
    active_profiles: HashMap<String, ProfileSession>,
    /// Finished profiles, in completion order.
    completed_profiles: Vec<GpuProfile>,
    /// Next free slot index in `query_set`.
    current_query_idx: u32,
}
20
/// Individual profiling session tracking compute operations
///
/// Created by `GpuProfiler::begin_profile` and consumed when the matching
/// `ProfileScope` is dropped.
#[derive(Debug)]
pub struct ProfileSession {
    #[allow(dead_code)] // Used in profiling infrastructure
    /// Operation name; also the key in the profiler's active-session map.
    name: String,
    /// Wall-clock start used to compute the CPU-side elapsed time.
    start_time: Instant,
    #[allow(dead_code)] // Used in GPU timestamp queries
    /// First of the two query-set slots reserved for this session.
    start_query_idx: u32,
    /// Second (end) slot; filled in when the session ends.
    end_query_idx: Option<u32>,
    /// Dispatch dimensions, used to estimate workgroup utilization.
    workgroup_count: (u32, u32, u32),
    /// Sizes of the buffers involved, for memory-bandwidth estimates.
    buffer_sizes: Vec<u64>,
}
33
/// Completed GPU profile with timing and performance metrics
#[derive(Debug, Clone)]
pub struct GpuProfile {
    /// Name of the profiled operation.
    pub name: String,
    /// Wall-clock time measured on the CPU side, in milliseconds.
    pub cpu_time_ms: f32,
    /// GPU time from timestamp queries; currently always `None` because
    /// query results are not yet read back from the GPU.
    pub gpu_time_ms: Option<f32>,
    /// Estimated bandwidth (read + write) derived from CPU time, in GB/s.
    pub memory_bandwidth_gb_s: f32,
    /// Occupancy estimate; currently a fixed placeholder value.
    pub compute_efficiency_percent: f32,
    /// Dispatched threads as a fraction of an assumed 4096-thread capacity.
    pub workgroup_utilization_percent: f32,
    /// Buffer-pool hit rate reported by the shared context when the
    /// profile completed.
    pub buffer_pool_hit_rate: f32,
}
45
/// Workgroup configuration optimizer
///
/// Caches the best-known `WorkgroupConfig` per operation type and retains
/// the raw calibration measurements for later analysis.
pub struct WorkgroupOptimizer {
    /// Best configuration found per operation type (calibrated or defaulted).
    optimal_configs: HashMap<String, WorkgroupConfig>,
    /// All measurements gathered during calibration, per operation type.
    calibration_results: HashMap<String, Vec<CalibrationResult>>,
}
51
/// Optimized workgroup configuration for specific operation types
#[derive(Debug, Clone, Copy)]
pub struct WorkgroupConfig {
    /// Workgroup dimensions (x, y, z).
    pub size: (u32, u32, u32),
    /// Shared (workgroup) memory to allocate per workgroup, in bytes.
    pub shared_memory_bytes: u32,
    /// Preferred dispatch granularity for this configuration.
    pub optimal_dispatch_size: u32,
}
59
/// Single measurement from a workgroup calibration run.
#[derive(Debug, Clone)]
pub struct CalibrationResult {
    /// Configuration that was benchmarked.
    pub config: WorkgroupConfig,
    /// Throughput reported by the test function, in giga-operations/s.
    pub throughput_gops: f32,
    /// Wall-clock latency of the test call, in milliseconds.
    pub latency_ms: f32,
    /// Selection score: throughput divided by latency.
    pub efficiency_percent: f32,
}
67
/// Performance optimization recommendations
///
/// Aggregate view over all completed profiles, produced by
/// `GpuProfiler::generate_report`.
#[derive(Debug, Clone)]
pub struct PerformanceReport {
    /// Sum of GPU times (estimated from CPU time when not measured), ms.
    pub total_gpu_time_ms: f32,
    /// Sum of CPU-side wall-clock times across all profiles, ms.
    pub total_cpu_time_ms: f32,
    /// Average compute-efficiency figure across profiles.
    pub gpu_utilization_percent: f32,
    /// Average bandwidth relative to an assumed 100 GB/s peak, in percent.
    pub memory_bandwidth_utilization: f32,
    /// Buffer-pool hit rate at report time.
    pub buffer_pool_efficiency: f32,
    /// Detected performance problems, if any.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Suggested fixes paired with the detected bottlenecks.
    pub recommendations: Vec<OptimizationRecommendation>,
}
79
/// Categories of detected performance bottlenecks.
#[derive(Debug, Clone)]
pub enum PerformanceBottleneck {
    /// Memory bandwidth below the assumed theoretical peak.
    MemoryBandwidth { utilization_percent: f32 },
    /// Compute units underutilized (low average efficiency).
    ComputeUnits { utilization_percent: f32 },
    /// Buffer allocation overhead (low pool hit rate).
    BufferAllocation { avg_allocation_time_ms: f32 },
    /// Shader compilation overhead.
    ShaderCompilation { avg_compilation_time_ms: f32 },
    /// GPU-to-CPU synchronization overhead.
    GpuToProcessorSync { avg_sync_time_ms: f32 },
}
88
/// Actionable suggestions paired with detected bottlenecks.
#[derive(Debug, Clone)]
pub enum OptimizationRecommendation {
    /// Dispatch with larger workgroups to raise occupancy.
    IncreaseWorkgroupSize {
        current: u32,
        recommended: u32,
    },
    /// Enable buffer pooling to cut allocation overhead.
    EnableBufferPooling {
        potential_speedup: f32,
    },
    /// Restructure data layout for better bandwidth utilization.
    OptimizeMemoryLayout {
        current_efficiency: f32,
        potential_efficiency: f32,
    },
    /// Split work into smaller batches.
    ReduceBatchSize {
        current: usize,
        recommended: usize,
    },
    /// Merge work into larger batches to amortize dispatch overhead.
    IncreaseBatchSize {
        current: usize,
        recommended: usize,
    },
    /// Use workgroup shared memory to reduce global-memory traffic.
    UseSharedMemory {
        potential_speedup: f32,
    },
}
114
115impl GpuProfiler {
116    /// Create a new GPU profiler
117    pub async fn new() -> UnifiedGpuResult<Self> {
118        let context = SharedGpuContext::global().await?.clone();
119
120        // Create timestamp query set
121        let query_set = context
122            .device()
123            .create_query_set(&wgpu::QuerySetDescriptor {
124                label: Some("GPU Profiler Timestamps"),
125                ty: wgpu::QueryType::Timestamp,
126                count: 1024, // Support up to 512 overlapping profiles
127            });
128
129        // Get timestamp period for converting to nanoseconds
130        let timestamp_period = context.queue().get_timestamp_period();
131
132        Ok(Self {
133            context,
134            query_set,
135            timestamp_period,
136            active_profiles: HashMap::new(),
137            completed_profiles: Vec::new(),
138            current_query_idx: 0,
139        })
140    }
141
142    /// Begin profiling a GPU operation
143    pub fn begin_profile(
144        &mut self,
145        name: &str,
146        workgroup_count: (u32, u32, u32),
147        buffer_sizes: &[u64],
148    ) -> ProfileScope<'_> {
149        let start_query_idx = self.current_query_idx;
150        self.current_query_idx += 2; // Reserve start and end timestamps
151
152        let session = ProfileSession {
153            name: name.to_string(),
154            start_time: Instant::now(),
155            start_query_idx,
156            end_query_idx: None,
157            workgroup_count,
158            buffer_sizes: buffer_sizes.to_vec(),
159        };
160
161        self.active_profiles.insert(name.to_string(), session);
162
163        ProfileScope {
164            profiler: self,
165            name: name.to_string(),
166            start_query_idx,
167        }
168    }
169
170    /// End profiling and compute metrics
171    fn end_profile(&mut self, name: &str, start_query_idx: u32) {
172        let end_query_idx = start_query_idx + 1;
173
174        if let Some(mut session) = self.active_profiles.remove(name) {
175            session.end_query_idx = Some(end_query_idx);
176            let cpu_time_ms = session.start_time.elapsed().as_secs_f32() * 1000.0;
177
178            // Calculate memory bandwidth (simplified)
179            let total_memory_bytes: u64 = session.buffer_sizes.iter().sum();
180            let memory_bandwidth_gb_s = if cpu_time_ms > 0.0 {
181                (total_memory_bytes as f32 * 2.0) / (cpu_time_ms / 1000.0) / 1e9
182            // Read + write
183            } else {
184                0.0
185            };
186
187            // Estimate workgroup utilization (simplified - would need GPU capabilities)
188            let total_threads =
189                session.workgroup_count.0 * session.workgroup_count.1 * session.workgroup_count.2;
190            let workgroup_utilization = (total_threads.min(4096) as f32 / 4096.0) * 100.0;
191
192            // Get buffer pool stats
193            let buffer_pool_stats = self.context.buffer_pool_stats();
194
195            let profile = GpuProfile {
196                name: name.to_string(),
197                cpu_time_ms,
198                gpu_time_ms: None, // Would be filled in from timestamp query results
199                memory_bandwidth_gb_s,
200                compute_efficiency_percent: 85.0, // Placeholder - would compute from occupancy
201                workgroup_utilization_percent: workgroup_utilization,
202                buffer_pool_hit_rate: buffer_pool_stats.hit_rate_percent,
203            };
204
205            self.completed_profiles.push(profile);
206        }
207    }
208
209    /// Generate comprehensive performance report
210    pub fn generate_report(&self) -> PerformanceReport {
211        let total_cpu_time: f32 = self.completed_profiles.iter().map(|p| p.cpu_time_ms).sum();
212        let total_gpu_time: f32 = self
213            .completed_profiles
214            .iter()
215            .map(|p| p.gpu_time_ms.unwrap_or(p.cpu_time_ms * 0.8))
216            .sum();
217
218        let avg_gpu_utilization: f32 = if !self.completed_profiles.is_empty() {
219            self.completed_profiles
220                .iter()
221                .map(|p| p.compute_efficiency_percent)
222                .sum::<f32>()
223                / self.completed_profiles.len() as f32
224        } else {
225            0.0
226        };
227
228        let avg_memory_bandwidth: f32 = if !self.completed_profiles.is_empty() {
229            self.completed_profiles
230                .iter()
231                .map(|p| p.memory_bandwidth_gb_s)
232                .sum::<f32>()
233                / self.completed_profiles.len() as f32
234        } else {
235            0.0
236        };
237
238        let buffer_pool_stats = self.context.buffer_pool_stats();
239
240        // Identify bottlenecks
241        let mut bottlenecks = Vec::new();
242        let mut recommendations = Vec::new();
243
244        if avg_memory_bandwidth < 100.0 {
245            // Assuming 100 GB/s theoretical max
246            bottlenecks.push(PerformanceBottleneck::MemoryBandwidth {
247                utilization_percent: (avg_memory_bandwidth / 100.0) * 100.0,
248            });
249            recommendations.push(OptimizationRecommendation::OptimizeMemoryLayout {
250                current_efficiency: avg_memory_bandwidth / 100.0,
251                potential_efficiency: 0.8,
252            });
253        }
254
255        if avg_gpu_utilization < 70.0 {
256            bottlenecks.push(PerformanceBottleneck::ComputeUnits {
257                utilization_percent: avg_gpu_utilization,
258            });
259            recommendations.push(OptimizationRecommendation::IncreaseWorkgroupSize {
260                current: 64,
261                recommended: 256,
262            });
263        }
264
265        if buffer_pool_stats.hit_rate_percent < 50.0 {
266            bottlenecks.push(PerformanceBottleneck::BufferAllocation {
267                avg_allocation_time_ms: 5.0, // Estimate
268            });
269            recommendations.push(OptimizationRecommendation::EnableBufferPooling {
270                potential_speedup: 1.3,
271            });
272        }
273
274        PerformanceReport {
275            total_gpu_time_ms: total_gpu_time,
276            total_cpu_time_ms: total_cpu_time,
277            gpu_utilization_percent: avg_gpu_utilization,
278            memory_bandwidth_utilization: (avg_memory_bandwidth / 100.0) * 100.0,
279            buffer_pool_efficiency: buffer_pool_stats.hit_rate_percent,
280            bottlenecks,
281            recommendations,
282        }
283    }
284
285    /// Get all completed profiles
286    pub fn profiles(&self) -> &[GpuProfile] {
287        &self.completed_profiles
288    }
289
290    /// Clear all profiles
291    pub fn clear_profiles(&mut self) {
292        self.completed_profiles.clear();
293        self.active_profiles.clear();
294        self.current_query_idx = 0;
295    }
296}
297
298/// RAII profiling scope that automatically ends profiling when dropped
/// RAII profiling scope that automatically ends profiling when dropped
pub struct ProfileScope<'a> {
    /// Owning profiler; exclusively borrowed for the scope's lifetime.
    profiler: &'a mut GpuProfiler,
    /// Session key, matching the entry in the profiler's active map.
    name: String,
    /// First of the two query-set slots reserved for this scope.
    start_query_idx: u32,
}
304
305impl<'a> ProfileScope<'a> {
306    /// Add timestamp query to command encoder (call this in your compute pass)
307    pub fn write_timestamp(&self, encoder: &mut wgpu::CommandEncoder, stage: TimestampStage) {
308        match stage {
309            TimestampStage::Start => {
310                encoder.write_timestamp(&self.profiler.query_set, self.start_query_idx);
311            }
312            TimestampStage::End => {
313                encoder.write_timestamp(&self.profiler.query_set, self.start_query_idx + 1);
314            }
315        }
316    }
317}
318
// Dropping the scope finalizes the session: the profiler computes
// CPU-side metrics and records the completed profile.
impl<'a> Drop for ProfileScope<'a> {
    fn drop(&mut self) {
        self.profiler.end_profile(&self.name, self.start_query_idx);
    }
}
324
/// Which of a profile's two timestamp slots to write.
#[derive(Debug, Clone, Copy)]
pub enum TimestampStage {
    /// Before the profiled work is encoded.
    Start,
    /// After the profiled work is encoded.
    End,
}
330
331impl WorkgroupOptimizer {
332    /// Create a new workgroup optimizer
333    pub fn new() -> Self {
334        Self {
335            optimal_configs: HashMap::new(),
336            calibration_results: HashMap::new(),
337        }
338    }
339
340    /// Get optimal workgroup configuration for operation type
341    pub fn get_optimal_config(&self, operation_type: &str) -> WorkgroupConfig {
342        self.optimal_configs
343            .get(operation_type)
344            .cloned()
345            .unwrap_or({
346                // Default configurations based on operation type
347                match operation_type {
348                    "matrix_multiply" => WorkgroupConfig {
349                        size: (16, 16, 1),
350                        shared_memory_bytes: 8192,
351                        optimal_dispatch_size: 1024,
352                    },
353                    "vector_operation" => WorkgroupConfig {
354                        size: (256, 1, 1),
355                        shared_memory_bytes: 0,
356                        optimal_dispatch_size: 256,
357                    },
358                    "reduction" => WorkgroupConfig {
359                        size: (128, 1, 1),
360                        shared_memory_bytes: 4096,
361                        optimal_dispatch_size: 128,
362                    },
363                    "cellular_automata" => WorkgroupConfig {
364                        size: (256, 1, 1),
365                        shared_memory_bytes: 0,
366                        optimal_dispatch_size: 256,
367                    },
368                    "fisher_information" => WorkgroupConfig {
369                        size: (256, 1, 1),
370                        shared_memory_bytes: 0,
371                        optimal_dispatch_size: 256,
372                    },
373                    "tropical_operations" => WorkgroupConfig {
374                        size: (128, 1, 1),
375                        shared_memory_bytes: 2048,
376                        optimal_dispatch_size: 128,
377                    },
378                    _ => WorkgroupConfig {
379                        size: (64, 1, 1),
380                        shared_memory_bytes: 0,
381                        optimal_dispatch_size: 64,
382                    },
383                }
384            })
385    }
386
387    /// Calibrate optimal workgroup size for a specific operation
388    pub async fn calibrate_operation(
389        &mut self,
390        operation_type: &str,
391        test_function: impl Fn(WorkgroupConfig) -> f32,
392    ) -> UnifiedGpuResult<WorkgroupConfig> {
393        let test_configs = vec![
394            WorkgroupConfig {
395                size: (32, 1, 1),
396                shared_memory_bytes: 0,
397                optimal_dispatch_size: 32,
398            },
399            WorkgroupConfig {
400                size: (64, 1, 1),
401                shared_memory_bytes: 0,
402                optimal_dispatch_size: 64,
403            },
404            WorkgroupConfig {
405                size: (128, 1, 1),
406                shared_memory_bytes: 0,
407                optimal_dispatch_size: 128,
408            },
409            WorkgroupConfig {
410                size: (256, 1, 1),
411                shared_memory_bytes: 0,
412                optimal_dispatch_size: 256,
413            },
414            WorkgroupConfig {
415                size: (16, 16, 1),
416                shared_memory_bytes: 4096,
417                optimal_dispatch_size: 256,
418            },
419            WorkgroupConfig {
420                size: (32, 8, 1),
421                shared_memory_bytes: 2048,
422                optimal_dispatch_size: 256,
423            },
424        ];
425
426        let mut results = Vec::new();
427
428        for config in test_configs {
429            let start = Instant::now();
430            let throughput = test_function(config);
431            let latency = start.elapsed().as_secs_f32() * 1000.0;
432
433            let efficiency = if latency > 0.0 {
434                throughput / latency
435            } else {
436                0.0
437            };
438
439            results.push(CalibrationResult {
440                config,
441                throughput_gops: throughput,
442                latency_ms: latency,
443                efficiency_percent: efficiency,
444            });
445        }
446
447        // Find best configuration
448        let best_config = results
449            .iter()
450            .max_by(|a, b| {
451                a.efficiency_percent
452                    .partial_cmp(&b.efficiency_percent)
453                    .unwrap()
454            })
455            .map(|r| r.config)
456            .unwrap_or(WorkgroupConfig {
457                size: (128, 1, 1),
458                shared_memory_bytes: 0,
459                optimal_dispatch_size: 128,
460            });
461
462        self.optimal_configs
463            .insert(operation_type.to_string(), best_config);
464        self.calibration_results
465            .insert(operation_type.to_string(), results);
466
467        Ok(best_config)
468    }
469
470    /// Get calibration results for analysis
471    pub fn get_calibration_results(&self, operation_type: &str) -> Option<&[CalibrationResult]> {
472        self.calibration_results
473            .get(operation_type)
474            .map(|v| v.as_slice())
475    }
476}
477
// `Default` delegates to `new`: an optimizer with empty caches.
impl Default for WorkgroupOptimizer {
    fn default() -> Self {
        Self::new()
    }
}
483
/// Adaptive dispatch policy that learns optimal CPU/GPU thresholds
///
/// Maintains per-operation crossover sizes above which work is sent to the
/// GPU, adjusted multiplicatively from observed benchmark results.
pub struct AdaptiveDispatchPolicy {
    #[allow(dead_code)] // Used in performance learning
    /// Rough model of CPU throughput/overhead (seeded, not yet updated).
    cpu_performance_profile: PerformanceProfile,
    #[allow(dead_code)] // Used in performance learning
    /// Rough model of GPU throughput/overhead (seeded, not yet updated).
    gpu_performance_profile: PerformanceProfile,
    /// Learned minimum data size per operation type for GPU dispatch.
    crossover_points: HashMap<String, usize>,
    /// Recent benchmark observations; entries older than an hour are pruned.
    calibration_history: Vec<DispatchBenchmark>,
}
493
/// Coarse performance model for one execution backend (CPU or GPU).
#[derive(Debug, Clone)]
pub struct PerformanceProfile {
    /// Estimated sustained operation throughput.
    pub operations_per_second: f32,
    /// Fixed per-dispatch overhead, in milliseconds.
    pub setup_overhead_ms: f32,
    /// Estimated memory bandwidth, in GB/s.
    pub memory_bandwidth_gb_s: f32,
    /// When this profile was last refreshed.
    pub last_updated: Instant,
}
501
/// One observed CPU-vs-GPU timing comparison for a dispatch decision.
#[derive(Debug, Clone)]
pub struct DispatchBenchmark {
    /// Operation category this measurement applies to.
    pub operation_type: String,
    /// Number of elements processed in the benchmark.
    pub data_size: usize,
    /// Measured CPU execution time, in milliseconds.
    pub cpu_time_ms: f32,
    /// Measured GPU execution time, in milliseconds.
    pub gpu_time_ms: f32,
    /// When the benchmark was taken; used to expire old history.
    pub timestamp: Instant,
}
510
511impl AdaptiveDispatchPolicy {
512    pub fn new() -> Self {
513        Self {
514            cpu_performance_profile: PerformanceProfile {
515                operations_per_second: 1000.0,
516                setup_overhead_ms: 0.1,
517                memory_bandwidth_gb_s: 25.0,
518                last_updated: Instant::now(),
519            },
520            gpu_performance_profile: PerformanceProfile {
521                operations_per_second: 10000.0,
522                setup_overhead_ms: 5.0,
523                memory_bandwidth_gb_s: 500.0,
524                last_updated: Instant::now(),
525            },
526            crossover_points: HashMap::new(),
527            calibration_history: Vec::new(),
528        }
529    }
530
531    /// Determine if GPU should be used for this operation
532    pub fn should_use_gpu(&mut self, operation_type: &str, data_size: usize) -> bool {
533        if let Some(&crossover) = self.crossover_points.get(operation_type) {
534            data_size >= crossover
535        } else {
536            // Conservative default - require substantial work for GPU
537            data_size >= 1000
538        }
539    }
540
541    /// Update performance profile based on benchmark results
542    pub fn update_from_benchmark(&mut self, benchmark: DispatchBenchmark) {
543        // Simple learning: if GPU was faster, lower the crossover point; if slower, raise it
544        let gpu_advantage = benchmark.cpu_time_ms / benchmark.gpu_time_ms.max(0.1);
545
546        let current_crossover = self
547            .crossover_points
548            .get(&benchmark.operation_type)
549            .cloned()
550            .unwrap_or(1000);
551
552        let new_crossover = if gpu_advantage > 1.1 {
553            // GPU was significantly faster, lower threshold
554            (current_crossover as f32 * 0.8) as usize
555        } else if gpu_advantage < 0.9 {
556            // GPU was slower, raise threshold
557            (current_crossover as f32 * 1.2) as usize
558        } else {
559            current_crossover
560        };
561
562        self.crossover_points.insert(
563            benchmark.operation_type.clone(),
564            new_crossover.clamp(10, 100000),
565        );
566        self.calibration_history.push(benchmark);
567
568        // Keep only recent history
569        self.calibration_history
570            .retain(|b| b.timestamp.elapsed().as_secs() < 3600);
571    }
572
573    /// Get current crossover points
574    pub fn get_crossover_points(&self) -> &HashMap<String, usize> {
575        &self.crossover_points
576    }
577}
578
// `Default` delegates to `new`: seeded profiles, no learned thresholds.
impl Default for AdaptiveDispatchPolicy {
    fn default() -> Self {
        Self::new()
    }
}