1use crate::{SharedGpuContext, UnifiedGpuResult};
7use std::collections::HashMap;
8use std::time::Instant;
9
/// Collects per-operation GPU performance data by combining CPU wall-clock
/// timing with wgpu timestamp queries reserved from a shared query set.
pub struct GpuProfiler {
    // Shared device/queue handle; also queried for buffer-pool statistics.
    context: SharedGpuContext,
    // Timestamp query set; each profile reserves a (start, end) pair of slots.
    query_set: wgpu::QuerySet,
    // Nanoseconds per timestamp tick reported by the queue; retained for
    // future GPU-time resolution but not read yet (hence dead_code).
    #[allow(dead_code)] timestamp_period: f32,
    // Sessions currently in flight, keyed by profile name.
    active_profiles: HashMap<String, ProfileSession>,
    // Finished profiles, consumed by `generate_report`.
    completed_profiles: Vec<GpuProfile>,
    // Next free slot in `query_set`; advanced by 2 per `begin_profile`.
    current_query_idx: u32,
}
20
/// Bookkeeping for a single in-flight profiling session, created by
/// `GpuProfiler::begin_profile` and consumed when the owning scope ends.
#[derive(Debug)]
pub struct ProfileSession {
    #[allow(dead_code)] name: String,
    // CPU-side start time; its elapsed value becomes `GpuProfile::cpu_time_ms`.
    start_time: Instant,
    #[allow(dead_code)] start_query_idx: u32,
    // Filled in when the session ends (always start slot + 1).
    end_query_idx: Option<u32>,
    // Dispatch dimensions, used to estimate workgroup utilization.
    workgroup_count: (u32, u32, u32),
    // Sizes of the buffers touched by the operation, for bandwidth estimates.
    buffer_sizes: Vec<u64>,
}
33
/// Final metrics for one completed profiling session.
#[derive(Debug, Clone)]
pub struct GpuProfile {
    /// Profile name as passed to `begin_profile`.
    pub name: String,
    /// CPU wall-clock time spent inside the profiling scope, in milliseconds.
    pub cpu_time_ms: f32,
    /// GPU time from timestamp queries; `None` when queries were not resolved.
    pub gpu_time_ms: Option<f32>,
    /// Estimated memory bandwidth (assumes each buffer is read and written).
    pub memory_bandwidth_gb_s: f32,
    /// Estimated compute-unit efficiency, 0-100.
    pub compute_efficiency_percent: f32,
    /// Dispatched workgroups relative to an assumed full occupancy, 0-100.
    pub workgroup_utilization_percent: f32,
    /// Buffer-pool hit rate at the time the profile completed, 0-100.
    pub buffer_pool_hit_rate: f32,
}
45
/// Picks workgroup configurations per operation type, either from built-in
/// defaults or from calibration sweeps performed at runtime.
pub struct WorkgroupOptimizer {
    // Best known configuration per operation type (filled by calibration).
    optimal_configs: HashMap<String, WorkgroupConfig>,
    // Raw measurements from each calibration sweep, keyed by operation type.
    calibration_results: HashMap<String, Vec<CalibrationResult>>,
}
51
/// A candidate or chosen compute-shader launch configuration.
#[derive(Debug, Clone, Copy)]
pub struct WorkgroupConfig {
    /// Workgroup dimensions (x, y, z).
    pub size: (u32, u32, u32),
    /// Workgroup shared memory to request; 0 when none is used.
    pub shared_memory_bytes: u32,
    /// Preferred number of workgroups per dispatch.
    pub optimal_dispatch_size: u32,
}
59
/// One measurement from a calibration sweep: a config and how it performed.
#[derive(Debug, Clone)]
pub struct CalibrationResult {
    /// The configuration that was benchmarked.
    pub config: WorkgroupConfig,
    /// Throughput reported by the test function, in GOPS.
    pub throughput_gops: f32,
    /// CPU wall-clock latency of the test run, in milliseconds.
    pub latency_ms: f32,
    /// Efficiency proxy (throughput per millisecond of latency).
    pub efficiency_percent: f32,
}
67
/// Aggregate view over all completed profiles, with detected bottlenecks and
/// suggested remediations. Produced by `GpuProfiler::generate_report`.
#[derive(Debug, Clone)]
pub struct PerformanceReport {
    /// Sum of GPU times (estimated as 80% of CPU time when unresolved).
    pub total_gpu_time_ms: f32,
    /// Sum of CPU wall-clock times across all profiles.
    pub total_cpu_time_ms: f32,
    /// Mean compute efficiency across profiles, 0-100.
    pub gpu_utilization_percent: f32,
    /// Mean bandwidth relative to a 100 GB/s reference, 0-100.
    pub memory_bandwidth_utilization: f32,
    /// Current buffer-pool hit rate, 0-100.
    pub buffer_pool_efficiency: f32,
    /// Bottlenecks detected from the heuristic thresholds.
    pub bottlenecks: Vec<PerformanceBottleneck>,
    /// Recommendations paired with the detected bottlenecks.
    pub recommendations: Vec<OptimizationRecommendation>,
}
79
/// Categories of performance limiters a report can flag, each carrying the
/// metric that triggered it.
#[derive(Debug, Clone)]
pub enum PerformanceBottleneck {
    /// Memory bandwidth below the reference threshold.
    MemoryBandwidth { utilization_percent: f32 },
    /// Compute units underutilized.
    ComputeUnits { utilization_percent: f32 },
    /// Buffer allocation dominating (low pool hit rate).
    BufferAllocation { avg_allocation_time_ms: f32 },
    /// Shader compilation overhead.
    ShaderCompilation { avg_compilation_time_ms: f32 },
    /// Synchronization stalls between GPU and host processing.
    GpuToProcessorSync { avg_sync_time_ms: f32 },
}
88
/// Actionable tuning suggestions emitted alongside detected bottlenecks.
#[derive(Debug, Clone)]
pub enum OptimizationRecommendation {
    /// Larger workgroups to raise occupancy.
    IncreaseWorkgroupSize {
        current: u32,
        recommended: u32,
    },
    /// Turn on buffer pooling to cut allocation overhead.
    EnableBufferPooling {
        potential_speedup: f32,
    },
    /// Restructure buffer layout for better bandwidth (efficiencies are 0-1).
    OptimizeMemoryLayout {
        current_efficiency: f32,
        potential_efficiency: f32,
    },
    /// Batch is too large for the device; shrink it.
    ReduceBatchSize {
        current: usize,
        recommended: usize,
    },
    /// Batch is too small to amortize dispatch overhead; grow it.
    IncreaseBatchSize {
        current: usize,
        recommended: usize,
    },
    /// Exploit workgroup shared memory.
    UseSharedMemory {
        potential_speedup: f32,
    },
}
114
115impl GpuProfiler {
116 pub async fn new() -> UnifiedGpuResult<Self> {
118 let context = SharedGpuContext::global().await?.clone();
119
120 let query_set = context
122 .device()
123 .create_query_set(&wgpu::QuerySetDescriptor {
124 label: Some("GPU Profiler Timestamps"),
125 ty: wgpu::QueryType::Timestamp,
126 count: 1024, });
128
129 let timestamp_period = context.queue().get_timestamp_period();
131
132 Ok(Self {
133 context,
134 query_set,
135 timestamp_period,
136 active_profiles: HashMap::new(),
137 completed_profiles: Vec::new(),
138 current_query_idx: 0,
139 })
140 }
141
142 pub fn begin_profile(
144 &mut self,
145 name: &str,
146 workgroup_count: (u32, u32, u32),
147 buffer_sizes: &[u64],
148 ) -> ProfileScope<'_> {
149 let start_query_idx = self.current_query_idx;
150 self.current_query_idx += 2; let session = ProfileSession {
153 name: name.to_string(),
154 start_time: Instant::now(),
155 start_query_idx,
156 end_query_idx: None,
157 workgroup_count,
158 buffer_sizes: buffer_sizes.to_vec(),
159 };
160
161 self.active_profiles.insert(name.to_string(), session);
162
163 ProfileScope {
164 profiler: self,
165 name: name.to_string(),
166 start_query_idx,
167 }
168 }
169
170 fn end_profile(&mut self, name: &str, start_query_idx: u32) {
172 let end_query_idx = start_query_idx + 1;
173
174 if let Some(mut session) = self.active_profiles.remove(name) {
175 session.end_query_idx = Some(end_query_idx);
176 let cpu_time_ms = session.start_time.elapsed().as_secs_f32() * 1000.0;
177
178 let total_memory_bytes: u64 = session.buffer_sizes.iter().sum();
180 let memory_bandwidth_gb_s = if cpu_time_ms > 0.0 {
181 (total_memory_bytes as f32 * 2.0) / (cpu_time_ms / 1000.0) / 1e9
182 } else {
184 0.0
185 };
186
187 let total_threads =
189 session.workgroup_count.0 * session.workgroup_count.1 * session.workgroup_count.2;
190 let workgroup_utilization = (total_threads.min(4096) as f32 / 4096.0) * 100.0;
191
192 let buffer_pool_stats = self.context.buffer_pool_stats();
194
195 let profile = GpuProfile {
196 name: name.to_string(),
197 cpu_time_ms,
198 gpu_time_ms: None, memory_bandwidth_gb_s,
200 compute_efficiency_percent: 85.0, workgroup_utilization_percent: workgroup_utilization,
202 buffer_pool_hit_rate: buffer_pool_stats.hit_rate_percent,
203 };
204
205 self.completed_profiles.push(profile);
206 }
207 }
208
209 pub fn generate_report(&self) -> PerformanceReport {
211 let total_cpu_time: f32 = self.completed_profiles.iter().map(|p| p.cpu_time_ms).sum();
212 let total_gpu_time: f32 = self
213 .completed_profiles
214 .iter()
215 .map(|p| p.gpu_time_ms.unwrap_or(p.cpu_time_ms * 0.8))
216 .sum();
217
218 let avg_gpu_utilization: f32 = if !self.completed_profiles.is_empty() {
219 self.completed_profiles
220 .iter()
221 .map(|p| p.compute_efficiency_percent)
222 .sum::<f32>()
223 / self.completed_profiles.len() as f32
224 } else {
225 0.0
226 };
227
228 let avg_memory_bandwidth: f32 = if !self.completed_profiles.is_empty() {
229 self.completed_profiles
230 .iter()
231 .map(|p| p.memory_bandwidth_gb_s)
232 .sum::<f32>()
233 / self.completed_profiles.len() as f32
234 } else {
235 0.0
236 };
237
238 let buffer_pool_stats = self.context.buffer_pool_stats();
239
240 let mut bottlenecks = Vec::new();
242 let mut recommendations = Vec::new();
243
244 if avg_memory_bandwidth < 100.0 {
245 bottlenecks.push(PerformanceBottleneck::MemoryBandwidth {
247 utilization_percent: (avg_memory_bandwidth / 100.0) * 100.0,
248 });
249 recommendations.push(OptimizationRecommendation::OptimizeMemoryLayout {
250 current_efficiency: avg_memory_bandwidth / 100.0,
251 potential_efficiency: 0.8,
252 });
253 }
254
255 if avg_gpu_utilization < 70.0 {
256 bottlenecks.push(PerformanceBottleneck::ComputeUnits {
257 utilization_percent: avg_gpu_utilization,
258 });
259 recommendations.push(OptimizationRecommendation::IncreaseWorkgroupSize {
260 current: 64,
261 recommended: 256,
262 });
263 }
264
265 if buffer_pool_stats.hit_rate_percent < 50.0 {
266 bottlenecks.push(PerformanceBottleneck::BufferAllocation {
267 avg_allocation_time_ms: 5.0, });
269 recommendations.push(OptimizationRecommendation::EnableBufferPooling {
270 potential_speedup: 1.3,
271 });
272 }
273
274 PerformanceReport {
275 total_gpu_time_ms: total_gpu_time,
276 total_cpu_time_ms: total_cpu_time,
277 gpu_utilization_percent: avg_gpu_utilization,
278 memory_bandwidth_utilization: (avg_memory_bandwidth / 100.0) * 100.0,
279 buffer_pool_efficiency: buffer_pool_stats.hit_rate_percent,
280 bottlenecks,
281 recommendations,
282 }
283 }
284
285 pub fn profiles(&self) -> &[GpuProfile] {
287 &self.completed_profiles
288 }
289
290 pub fn clear_profiles(&mut self) {
292 self.completed_profiles.clear();
293 self.active_profiles.clear();
294 self.current_query_idx = 0;
295 }
296}
297
/// RAII guard for one profiling session; finalizes the profile on drop.
pub struct ProfileScope<'a> {
    // Owning profiler, borrowed mutably for the scope's lifetime.
    profiler: &'a mut GpuProfiler,
    // Session key in the profiler's `active_profiles` map.
    name: String,
    // First of the two query slots reserved for this scope.
    start_query_idx: u32,
}
304
305impl<'a> ProfileScope<'a> {
306 pub fn write_timestamp(&self, encoder: &mut wgpu::CommandEncoder, stage: TimestampStage) {
308 match stage {
309 TimestampStage::Start => {
310 encoder.write_timestamp(&self.profiler.query_set, self.start_query_idx);
311 }
312 TimestampStage::End => {
313 encoder.write_timestamp(&self.profiler.query_set, self.start_query_idx + 1);
314 }
315 }
316 }
317}
318
impl<'a> Drop for ProfileScope<'a> {
    /// Finalizes the session when the scope exits, so profiles are recorded
    /// even on early returns or panics that unwind through the scope.
    fn drop(&mut self) {
        self.profiler.end_profile(&self.name, self.start_query_idx);
    }
}
324
/// Which end of a profiled region a timestamp marks.
#[derive(Debug, Clone, Copy)]
pub enum TimestampStage {
    /// Written before the profiled GPU work (start query slot).
    Start,
    /// Written after the profiled GPU work (start slot + 1).
    End,
}
330
331impl WorkgroupOptimizer {
332 pub fn new() -> Self {
334 Self {
335 optimal_configs: HashMap::new(),
336 calibration_results: HashMap::new(),
337 }
338 }
339
340 pub fn get_optimal_config(&self, operation_type: &str) -> WorkgroupConfig {
342 self.optimal_configs
343 .get(operation_type)
344 .cloned()
345 .unwrap_or({
346 match operation_type {
348 "matrix_multiply" => WorkgroupConfig {
349 size: (16, 16, 1),
350 shared_memory_bytes: 8192,
351 optimal_dispatch_size: 1024,
352 },
353 "vector_operation" => WorkgroupConfig {
354 size: (256, 1, 1),
355 shared_memory_bytes: 0,
356 optimal_dispatch_size: 256,
357 },
358 "reduction" => WorkgroupConfig {
359 size: (128, 1, 1),
360 shared_memory_bytes: 4096,
361 optimal_dispatch_size: 128,
362 },
363 "cellular_automata" => WorkgroupConfig {
364 size: (256, 1, 1),
365 shared_memory_bytes: 0,
366 optimal_dispatch_size: 256,
367 },
368 "fisher_information" => WorkgroupConfig {
369 size: (256, 1, 1),
370 shared_memory_bytes: 0,
371 optimal_dispatch_size: 256,
372 },
373 "tropical_operations" => WorkgroupConfig {
374 size: (128, 1, 1),
375 shared_memory_bytes: 2048,
376 optimal_dispatch_size: 128,
377 },
378 _ => WorkgroupConfig {
379 size: (64, 1, 1),
380 shared_memory_bytes: 0,
381 optimal_dispatch_size: 64,
382 },
383 }
384 })
385 }
386
387 pub async fn calibrate_operation(
389 &mut self,
390 operation_type: &str,
391 test_function: impl Fn(WorkgroupConfig) -> f32,
392 ) -> UnifiedGpuResult<WorkgroupConfig> {
393 let test_configs = vec![
394 WorkgroupConfig {
395 size: (32, 1, 1),
396 shared_memory_bytes: 0,
397 optimal_dispatch_size: 32,
398 },
399 WorkgroupConfig {
400 size: (64, 1, 1),
401 shared_memory_bytes: 0,
402 optimal_dispatch_size: 64,
403 },
404 WorkgroupConfig {
405 size: (128, 1, 1),
406 shared_memory_bytes: 0,
407 optimal_dispatch_size: 128,
408 },
409 WorkgroupConfig {
410 size: (256, 1, 1),
411 shared_memory_bytes: 0,
412 optimal_dispatch_size: 256,
413 },
414 WorkgroupConfig {
415 size: (16, 16, 1),
416 shared_memory_bytes: 4096,
417 optimal_dispatch_size: 256,
418 },
419 WorkgroupConfig {
420 size: (32, 8, 1),
421 shared_memory_bytes: 2048,
422 optimal_dispatch_size: 256,
423 },
424 ];
425
426 let mut results = Vec::new();
427
428 for config in test_configs {
429 let start = Instant::now();
430 let throughput = test_function(config);
431 let latency = start.elapsed().as_secs_f32() * 1000.0;
432
433 let efficiency = if latency > 0.0 {
434 throughput / latency
435 } else {
436 0.0
437 };
438
439 results.push(CalibrationResult {
440 config,
441 throughput_gops: throughput,
442 latency_ms: latency,
443 efficiency_percent: efficiency,
444 });
445 }
446
447 let best_config = results
449 .iter()
450 .max_by(|a, b| {
451 a.efficiency_percent
452 .partial_cmp(&b.efficiency_percent)
453 .unwrap()
454 })
455 .map(|r| r.config)
456 .unwrap_or(WorkgroupConfig {
457 size: (128, 1, 1),
458 shared_memory_bytes: 0,
459 optimal_dispatch_size: 128,
460 });
461
462 self.optimal_configs
463 .insert(operation_type.to_string(), best_config);
464 self.calibration_results
465 .insert(operation_type.to_string(), results);
466
467 Ok(best_config)
468 }
469
470 pub fn get_calibration_results(&self, operation_type: &str) -> Option<&[CalibrationResult]> {
472 self.calibration_results
473 .get(operation_type)
474 .map(|v| v.as_slice())
475 }
476}
477
478impl Default for WorkgroupOptimizer {
479 fn default() -> Self {
480 Self::new()
481 }
482}
483
/// Decides per-operation whether work should run on CPU or GPU, learning
/// size thresholds ("crossover points") from observed benchmarks.
pub struct AdaptiveDispatchPolicy {
    // Rough CPU capability model; not consulted yet (hence dead_code).
    #[allow(dead_code)] cpu_performance_profile: PerformanceProfile,
    // Rough GPU capability model; not consulted yet (hence dead_code).
    #[allow(dead_code)] gpu_performance_profile: PerformanceProfile,
    // Minimum data size at which GPU wins, per operation type.
    crossover_points: HashMap<String, usize>,
    // Recent benchmarks; entries older than one hour are pruned on update.
    calibration_history: Vec<DispatchBenchmark>,
}
493
/// Coarse capability model for one execution target (CPU or GPU).
#[derive(Debug, Clone)]
pub struct PerformanceProfile {
    /// Sustained throughput estimate.
    pub operations_per_second: f32,
    /// Fixed per-dispatch overhead in milliseconds.
    pub setup_overhead_ms: f32,
    /// Memory bandwidth estimate in GB/s.
    pub memory_bandwidth_gb_s: f32,
    /// When this profile was last refreshed.
    pub last_updated: Instant,
}
501
/// One measured CPU-vs-GPU comparison for a specific operation and size.
#[derive(Debug, Clone)]
pub struct DispatchBenchmark {
    /// Operation this benchmark applies to (crossover-point key).
    pub operation_type: String,
    /// Number of elements processed in the benchmark.
    pub data_size: usize,
    /// Measured CPU execution time in milliseconds.
    pub cpu_time_ms: f32,
    /// Measured GPU execution time in milliseconds.
    pub gpu_time_ms: f32,
    /// When the benchmark ran; used for the one-hour history window.
    pub timestamp: Instant,
}
510
511impl AdaptiveDispatchPolicy {
512 pub fn new() -> Self {
513 Self {
514 cpu_performance_profile: PerformanceProfile {
515 operations_per_second: 1000.0,
516 setup_overhead_ms: 0.1,
517 memory_bandwidth_gb_s: 25.0,
518 last_updated: Instant::now(),
519 },
520 gpu_performance_profile: PerformanceProfile {
521 operations_per_second: 10000.0,
522 setup_overhead_ms: 5.0,
523 memory_bandwidth_gb_s: 500.0,
524 last_updated: Instant::now(),
525 },
526 crossover_points: HashMap::new(),
527 calibration_history: Vec::new(),
528 }
529 }
530
531 pub fn should_use_gpu(&mut self, operation_type: &str, data_size: usize) -> bool {
533 if let Some(&crossover) = self.crossover_points.get(operation_type) {
534 data_size >= crossover
535 } else {
536 data_size >= 1000
538 }
539 }
540
541 pub fn update_from_benchmark(&mut self, benchmark: DispatchBenchmark) {
543 let gpu_advantage = benchmark.cpu_time_ms / benchmark.gpu_time_ms.max(0.1);
545
546 let current_crossover = self
547 .crossover_points
548 .get(&benchmark.operation_type)
549 .cloned()
550 .unwrap_or(1000);
551
552 let new_crossover = if gpu_advantage > 1.1 {
553 (current_crossover as f32 * 0.8) as usize
555 } else if gpu_advantage < 0.9 {
556 (current_crossover as f32 * 1.2) as usize
558 } else {
559 current_crossover
560 };
561
562 self.crossover_points.insert(
563 benchmark.operation_type.clone(),
564 new_crossover.clamp(10, 100000),
565 );
566 self.calibration_history.push(benchmark);
567
568 self.calibration_history
570 .retain(|b| b.timestamp.elapsed().as_secs() < 3600);
571 }
572
573 pub fn get_crossover_points(&self) -> &HashMap<String, usize> {
575 &self.crossover_points
576 }
577}
578
579impl Default for AdaptiveDispatchPolicy {
580 fn default() -> Self {
581 Self::new()
582 }
583}