1use anyhow::Result;
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
/// Rule-based performance tuner: records runtime snapshots and produces
/// prioritized, confidence-filtered optimization recommendations via
/// `analyze()`.
#[derive(Debug)]
pub struct PerformanceTuner {
    /// Which analysis passes run and how recommendations are filtered.
    config: TunerConfig,
    /// Rolling window of recent snapshots (capped at 100 entries by
    /// `record_snapshot`, oldest evicted first).
    history: Vec<PerformanceSnapshot>,
}
18
/// Configuration for [`PerformanceTuner`]: toggles for each analysis pass
/// plus the confidence cutoff applied to the resulting recommendations.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerConfig {
    /// Run the memory pass (fragmentation / footprint checks).
    pub enable_memory_tuning: bool,
    /// Run the compute pass (GPU utilization check).
    pub enable_compute_tuning: bool,
    /// Run the batch-size pass (small-batch check).
    pub enable_batch_tuning: bool,
    /// Run the per-layer pass (dominant-layer check).
    pub enable_layer_tuning: bool,
    /// Recommendations with confidence below this value are dropped
    /// from the report (default 0.7).
    pub confidence_threshold: f64,
    /// Hardware the workload targets; some passes are skipped or
    /// specialized depending on this value.
    pub target_hardware: HardwareType,
}
35
36impl Default for TunerConfig {
37 fn default() -> Self {
38 Self {
39 enable_memory_tuning: true,
40 enable_compute_tuning: true,
41 enable_batch_tuning: true,
42 enable_layer_tuning: true,
43 confidence_threshold: 0.7,
44 target_hardware: HardwareType::Auto,
45 }
46 }
47}
48
/// Target hardware class used to specialize recommendations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HardwareType {
    /// Hardware not pinned by the user — presumably detected elsewhere;
    /// this file never resolves `Auto` to a concrete type.
    Auto,
    /// NVIDIA GPU (CUDA-class device).
    NvidiaGpu,
    /// AMD GPU.
    AmdGpu,
    /// Apple Silicon (M-series) accelerator.
    AppleSilicon,
    /// CPU-only execution.
    Cpu,
    /// Tensor Processing Unit.
    Tpu,
}
65
/// One measured data point fed into the tuner via
/// [`PerformanceTuner::record_snapshot`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSnapshot {
    /// Capture time; the tests use small integers, but `analyze()` stamps
    /// reports with Unix seconds — assumed Unix seconds here (TODO confirm).
    pub timestamp: u64,
    /// Total wall-clock time of the measured step, in milliseconds.
    pub total_time_ms: f64,
    /// Memory in use during the step, in megabytes.
    pub memory_usage_mb: f64,
    /// Peak memory observed during the step, in megabytes.
    pub peak_memory_mb: f64,
    /// GPU utilization as a percentage (0–100).
    pub gpu_utilization: f64,
    /// Items processed per unit time; exact unit not fixed by this file
    /// (the efficiency score scales it by 1/10 and caps at 100).
    pub throughput: f64,
    /// Batch size used for this step.
    pub batch_size: usize,
    /// Per-layer execution time in milliseconds, keyed by layer name.
    pub layer_timings: HashMap<String, f64>,
    /// Per-layer memory usage, keyed by layer name (unused by the
    /// analysis passes in this file).
    pub layer_memory: HashMap<String, f64>,
}
88
/// A single optimization suggestion produced by an analysis pass.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    /// Which aspect of performance this addresses.
    pub category: RecommendationCategory,
    /// Urgency; reports are sorted highest-priority first.
    pub priority: Priority,
    /// Confidence in [0, 1]; compared against
    /// `TunerConfig::confidence_threshold` during filtering.
    pub confidence: f64,
    /// Short human-readable headline.
    pub title: String,
    /// Longer explanation including the measured evidence.
    pub description: String,
    /// Estimated benefit if the recommendation is applied.
    pub expected_impact: ImpactEstimate,
    /// Implementation effort required.
    pub difficulty: Difficulty,
    /// Concrete steps the user can take.
    pub actions: Vec<String>,
    /// Optional illustrative code snippet (PyTorch-flavored in this file).
    pub code_example: Option<String>,
}
111
/// Broad area a [`Recommendation`] targets.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RecommendationCategory {
    /// Memory footprint / fragmentation.
    Memory,
    /// Compute utilization (e.g. GPU occupancy).
    Compute,
    /// Batch-size selection.
    BatchSize,
    /// A specific layer's cost.
    Layer,
    /// Hardware choice (declared but not emitted by any pass in this file).
    Hardware,
    /// Data-loading pipeline (declared but not emitted in this file).
    DataLoading,
    /// Model architecture changes (declared but not emitted in this file).
    Architecture,
}
130
/// Recommendation urgency.
///
/// NOTE: the derived `Ord` follows declaration order
/// (`Low < Medium < High < Critical`); `analyze()` relies on this when
/// sorting recommendations with `Reverse(priority)`. Do not reorder
/// the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Priority {
    Low,
    Medium,
    High,
    Critical,
}
143
/// Estimated effort to implement a [`Recommendation`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Difficulty {
    /// Config/flag-level change.
    Easy,
    /// Some code changes required.
    Moderate,
    /// Significant rework.
    Hard,
}
154
/// Projected benefit of a single recommendation. Values are heuristic
/// estimates hard-coded by the analysis passes, not measurements.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactEstimate {
    /// Multiplicative speedup (1.0 = no change; passes emit >= 1.1).
    pub speedup: f64,
    /// Expected memory savings in megabytes.
    pub memory_reduction_mb: f64,
    /// Expected throughput gain in percent.
    pub throughput_improvement: f64,
}
165
/// Output of [`PerformanceTuner::analyze`]: filtered, priority-sorted
/// recommendations plus before/after performance summaries.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TuningReport {
    /// Recommendations passing the confidence threshold, sorted by
    /// descending priority.
    pub recommendations: Vec<Recommendation>,
    /// Averages computed from the recorded history.
    pub current_performance: PerformanceSummary,
    /// Optimistic projection assuming all recommendations are applied.
    pub estimated_performance: PerformanceSummary,
    /// Report creation time, Unix seconds.
    pub timestamp: u64,
}
178
/// Aggregated view of the snapshot history (simple arithmetic means).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSummary {
    /// Mean step time in milliseconds.
    pub avg_time_ms: f64,
    /// Mean memory usage in megabytes.
    pub avg_memory_mb: f64,
    /// Mean throughput (same unit as `PerformanceSnapshot::throughput`).
    pub avg_throughput: f64,
    /// Mean GPU utilization percentage.
    pub gpu_utilization: f64,
    /// Heuristic 0–100 score: mean of capped GPU utilization and
    /// throughput/10 capped at 100.
    pub efficiency_score: f64,
}
193
194impl PerformanceTuner {
195 pub fn new(config: TunerConfig) -> Self {
197 Self {
198 config,
199 history: Vec::new(),
200 }
201 }
202
203 pub fn record_snapshot(&mut self, snapshot: PerformanceSnapshot) {
205 self.history.push(snapshot);
206
207 if self.history.len() > 100 {
209 self.history.remove(0);
210 }
211 }
212
213 pub fn analyze(&self) -> Result<TuningReport> {
215 let mut recommendations = Vec::new();
216
217 if self.history.is_empty() {
218 anyhow::bail!("No performance data available");
219 }
220
221 if self.config.enable_memory_tuning {
223 recommendations.extend(self.analyze_memory());
224 }
225
226 if self.config.enable_compute_tuning {
227 recommendations.extend(self.analyze_compute());
228 }
229
230 if self.config.enable_batch_tuning {
231 recommendations.extend(self.analyze_batch_size());
232 }
233
234 if self.config.enable_layer_tuning {
235 recommendations.extend(self.analyze_layers());
236 }
237
238 recommendations.retain(|r| r.confidence >= self.config.confidence_threshold);
240
241 recommendations.sort_by_key(|item| std::cmp::Reverse(item.priority));
243
244 let current_perf = self.compute_current_performance();
245 let estimated_perf = self.estimate_improved_performance(&recommendations);
246
247 Ok(TuningReport {
248 recommendations,
249 current_performance: current_perf,
250 estimated_performance: estimated_perf,
251 timestamp: std::time::SystemTime::now()
252 .duration_since(std::time::UNIX_EPOCH)
253 .expect("SystemTime should be after UNIX_EPOCH")
254 .as_secs(),
255 })
256 }
257
258 fn analyze_memory(&self) -> Vec<Recommendation> {
260 let mut recommendations = Vec::new();
261
262 let avg_memory =
263 self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / self.history.len() as f64;
264
265 let peak_memory = self.history.iter().map(|s| s.peak_memory_mb).fold(0.0, f64::max);
266
267 if peak_memory > avg_memory * 1.5 {
269 recommendations.push(Recommendation {
270 category: RecommendationCategory::Memory,
271 priority: Priority::High,
272 confidence: 0.85,
273 title: "Reduce memory fragmentation".to_string(),
274 description: format!(
275 "Peak memory ({:.1}MB) is significantly higher than average ({:.1}MB). \
276 This indicates memory fragmentation.",
277 peak_memory, avg_memory
278 ),
279 expected_impact: ImpactEstimate {
280 speedup: 1.1,
281 memory_reduction_mb: (peak_memory - avg_memory) * 0.5,
282 throughput_improvement: 5.0,
283 },
284 difficulty: Difficulty::Moderate,
285 actions: vec![
286 "Enable gradient checkpointing to reduce activation memory".to_string(),
287 "Use torch.cuda.empty_cache() or equivalent after large operations".to_string(),
288 "Consider using mixed precision training (FP16/BF16)".to_string(),
289 ],
290 code_example: Some(
291 "# Enable gradient checkpointing\n\
292 model.gradient_checkpointing_enable()\n\
293 \n\
294 # Use automatic mixed precision\n\
295 with torch.cuda.amp.autocast():\n\
296 \u{00a0}\u{00a0}\u{00a0}\u{00a0}output = model(input)"
297 .to_string(),
298 ),
299 });
300 }
301
302 if avg_memory > 8000.0 && self.config.target_hardware == HardwareType::Cpu {
304 recommendations.push(Recommendation {
305 category: RecommendationCategory::Memory,
306 priority: Priority::High,
307 confidence: 0.9,
308 title: "Reduce memory footprint for CPU execution".to_string(),
309 description: format!(
310 "Average memory usage ({:.1}GB) is high for CPU execution. \
311 Consider model compression techniques.",
312 avg_memory / 1024.0
313 ),
314 expected_impact: ImpactEstimate {
315 speedup: 1.3,
316 memory_reduction_mb: avg_memory * 0.4,
317 throughput_improvement: 15.0,
318 },
319 difficulty: Difficulty::Moderate,
320 actions: vec![
321 "Apply 8-bit or 4-bit quantization".to_string(),
322 "Use dynamic quantization for linear layers".to_string(),
323 "Consider model distillation to a smaller model".to_string(),
324 ],
325 code_example: Some(
326 "# Apply 8-bit quantization\n\
327 quantized_model = torch.quantization.quantize_dynamic(\n\
328 \u{00a0}\u{00a0}\u{00a0}\u{00a0}model, {torch.nn.Linear}, dtype=torch.qint8\n\
329 )"
330 .to_string(),
331 ),
332 });
333 }
334
335 recommendations
336 }
337
338 fn analyze_compute(&self) -> Vec<Recommendation> {
340 let mut recommendations = Vec::new();
341
342 let avg_gpu_util =
343 self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / self.history.len() as f64;
344
345 if avg_gpu_util < 50.0 && self.config.target_hardware != HardwareType::Cpu {
347 recommendations.push(Recommendation {
348 category: RecommendationCategory::Compute,
349 priority: Priority::High,
350 confidence: 0.88,
351 title: "Improve GPU utilization".to_string(),
352 description: format!(
353 "Average GPU utilization ({:.1}%) is low. GPU is underutilized.",
354 avg_gpu_util
355 ),
356 expected_impact: ImpactEstimate {
357 speedup: 1.8,
358 memory_reduction_mb: 0.0,
359 throughput_improvement: 40.0,
360 },
361 difficulty: Difficulty::Easy,
362 actions: vec![
363 "Increase batch size to maximize GPU occupancy".to_string(),
364 "Use DataLoader with num_workers > 0 to prevent CPU bottleneck".to_string(),
365 "Enable pin_memory for faster host-to-device transfers".to_string(),
366 "Use compiled models (torch.compile)".to_string(),
367 ],
368 code_example: Some(
369 "# Optimize data loading\n\
370 dataloader = DataLoader(\n\
371 \u{00a0}\u{00a0}\u{00a0}\u{00a0}dataset,\n\
372 \u{00a0}\u{00a0}\u{00a0}\u{00a0}batch_size=32,\n\
373 \u{00a0}\u{00a0}\u{00a0}\u{00a0}num_workers=4, # Parallel data loading\n\
374 \u{00a0}\u{00a0}\u{00a0}\u{00a0}pin_memory=True # Faster transfers\n\
375 )"
376 .to_string(),
377 ),
378 });
379 }
380
381 recommendations
382 }
383
384 fn analyze_batch_size(&self) -> Vec<Recommendation> {
386 let mut recommendations = Vec::new();
387
388 if let Some(last_snapshot) = self.history.last() {
389 let batch_size = last_snapshot.batch_size;
390
391 if batch_size < 16 && self.config.target_hardware != HardwareType::Cpu {
393 recommendations.push(Recommendation {
394 category: RecommendationCategory::BatchSize,
395 priority: Priority::Medium,
396 confidence: 0.75,
397 title: "Increase batch size".to_string(),
398 description: format!(
399 "Current batch size ({}) is small. Larger batches improve GPU utilization.",
400 batch_size
401 ),
402 expected_impact: ImpactEstimate {
403 speedup: 1.5,
404 memory_reduction_mb: 0.0,
405 throughput_improvement: 30.0,
406 },
407 difficulty: Difficulty::Easy,
408 actions: vec![
409 format!("Increase batch size to {} or higher", batch_size * 2),
410 "Monitor memory usage to find optimal batch size".to_string(),
411 "Use gradient accumulation if memory is limited".to_string(),
412 ],
413 code_example: Some(
414 "# Gradient accumulation for effective larger batch\n\
415 accumulation_steps = 4\n\
416 for i, batch in enumerate(dataloader):\n\
417 \u{00a0}\u{00a0}\u{00a0}\u{00a0}loss = model(batch) / accumulation_steps\n\
418 \u{00a0}\u{00a0}\u{00a0}\u{00a0}loss.backward()\n\
419 \u{00a0}\u{00a0}\u{00a0}\u{00a0}if (i + 1) % accumulation_steps == 0:\n\
420 \u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}optimizer.step()\n\
421 \u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}\u{00a0}optimizer.zero_grad()"
422 .to_string()
423 ),
424 });
425 }
426 }
427
428 recommendations
429 }
430
431 fn analyze_layers(&self) -> Vec<Recommendation> {
433 let mut recommendations = Vec::new();
434
435 if let Some(snapshot) = self.history.last() {
436 let total_time: f64 = snapshot.layer_timings.values().sum();
437
438 for (layer_name, &time) in &snapshot.layer_timings {
440 let percentage = (time / total_time) * 100.0;
441
442 if percentage > 20.0 {
443 recommendations.push(Recommendation {
444 category: RecommendationCategory::Layer,
445 priority: Priority::Medium,
446 confidence: 0.8,
447 title: format!("Optimize {} layer", layer_name),
448 description: format!(
449 "Layer '{}' takes {:.1}% of total execution time ({:.2}ms). \
450 Consider layer-specific optimizations.",
451 layer_name, percentage, time
452 ),
453 expected_impact: ImpactEstimate {
454 speedup: 1.2,
455 memory_reduction_mb: 0.0,
456 throughput_improvement: 15.0,
457 },
458 difficulty: Difficulty::Moderate,
459 actions: vec![
460 "Use fused operations for this layer type".to_string(),
461 "Check if layer can benefit from Flash Attention".to_string(),
462 "Consider layer pruning if accuracy allows".to_string(),
463 ],
464 code_example: None,
465 });
466 }
467 }
468 }
469
470 recommendations
471 }
472
473 fn compute_current_performance(&self) -> PerformanceSummary {
475 let count = self.history.len() as f64;
476
477 let avg_time = self.history.iter().map(|s| s.total_time_ms).sum::<f64>() / count;
478
479 let avg_memory = self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / count;
480
481 let avg_throughput = self.history.iter().map(|s| s.throughput).sum::<f64>() / count;
482
483 let avg_gpu = self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / count;
484
485 let efficiency = (avg_gpu.min(100.0) + (avg_throughput / 10.0).min(100.0)) / 2.0;
487
488 PerformanceSummary {
489 avg_time_ms: avg_time,
490 avg_memory_mb: avg_memory,
491 avg_throughput,
492 gpu_utilization: avg_gpu,
493 efficiency_score: efficiency,
494 }
495 }
496
497 fn estimate_improved_performance(
499 &self,
500 recommendations: &[Recommendation],
501 ) -> PerformanceSummary {
502 let current = self.compute_current_performance();
503
504 let total_speedup: f64 =
506 recommendations.iter().map(|r| r.expected_impact.speedup - 1.0).sum::<f64>() + 1.0;
507
508 let total_memory_reduction: f64 =
509 recommendations.iter().map(|r| r.expected_impact.memory_reduction_mb).sum();
510
511 let total_throughput_improvement: f64 =
512 recommendations.iter().map(|r| r.expected_impact.throughput_improvement).sum();
513
514 PerformanceSummary {
515 avg_time_ms: current.avg_time_ms / total_speedup,
516 avg_memory_mb: (current.avg_memory_mb - total_memory_reduction).max(0.0),
517 avg_throughput: current.avg_throughput * (1.0 + total_throughput_improvement / 100.0),
518 gpu_utilization: (current.gpu_utilization * 1.2).min(95.0),
519 efficiency_score: (current.efficiency_score * 1.3).min(100.0),
520 }
521 }
522}
523
#[cfg(test)]
mod tests {
    use super::*;

    /// Constructing a tuner from the default config must not panic.
    #[test]
    fn test_tuner_creation() {
        let _tuner = PerformanceTuner::new(TunerConfig::default());
    }

    /// A recorded snapshot lands in the history window.
    #[test]
    fn test_snapshot_recording() {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        tuner.record_snapshot(PerformanceSnapshot {
            timestamp: 0,
            total_time_ms: 100.0,
            memory_usage_mb: 500.0,
            peak_memory_mb: 600.0,
            gpu_utilization: 75.0,
            throughput: 50.0,
            batch_size: 16,
            layer_timings: HashMap::new(),
            layer_memory: HashMap::new(),
        });

        assert_eq!(tuner.history.len(), 1);
    }

    /// Analysis over deliberately "bad" snapshots (peak memory far above
    /// average, low GPU utilization, small batch, one dominant layer)
    /// yields at least one recommendation and sane summaries.
    #[test]
    fn test_analysis_with_data() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        for step in 0..10u64 {
            let timings = HashMap::from([
                ("attention".to_string(), 60.0),
                ("ffn".to_string(), 30.0),
                ("other".to_string(), 10.0),
            ]);

            tuner.record_snapshot(PerformanceSnapshot {
                timestamp: step,
                total_time_ms: 100.0,
                memory_usage_mb: 1000.0,
                peak_memory_mb: 2000.0,
                gpu_utilization: 40.0,
                throughput: 20.0,
                batch_size: 8,
                layer_timings: timings,
                layer_memory: HashMap::new(),
            });
        }

        let report = tuner.analyze()?;

        assert!(!report.recommendations.is_empty());
        assert!(report.current_performance.avg_time_ms > 0.0);
        assert!(report.estimated_performance.avg_time_ms > 0.0);

        Ok(())
    }
}