use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[derive(Debug, Clone)]
pub enum PerformanceModel {
    RuleBased {
        multipliers: HashMap<RecommendationCategory, f64>,
    },
}

impl Default for PerformanceModel {
    fn default() -> Self {
        let mut multipliers = HashMap::new();
        multipliers.insert(RecommendationCategory::Memory, 1.0);
        multipliers.insert(RecommendationCategory::Compute, 1.0);
        multipliers.insert(RecommendationCategory::BatchSize, 1.0);
        multipliers.insert(RecommendationCategory::Layer, 1.0);
        multipliers.insert(RecommendationCategory::Hardware, 1.0);
        multipliers.insert(RecommendationCategory::DataLoading, 1.0);
        multipliers.insert(RecommendationCategory::Architecture, 1.0);
        PerformanceModel::RuleBased { multipliers }
    }
}
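
// Worked example (editorial sketch) of how these multipliers feed
// `estimate_improved_performance` below: speedup deltas are scaled per
// category and summed, so two surviving recommendations with expected
// speedups of 1.2 and 1.5 under the default unit multipliers combine to
// 1.0 + 1.0 * (1.2 - 1.0) + 1.0 * (1.5 - 1.0) = 1.7x estimated speedup.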

#[derive(Debug)]
pub struct PerformanceTuner {
    config: TunerConfig,
    history: Vec<PerformanceSnapshot>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerConfig {
    pub enable_memory_tuning: bool,
    pub enable_compute_tuning: bool,
    pub enable_batch_tuning: bool,
    pub enable_layer_tuning: bool,
    pub confidence_threshold: f64,
    pub target_hardware: HardwareType,
    pub data_loading_window: usize,
    #[serde(skip)]
    pub performance_model: PerformanceModel,
}

impl Default for TunerConfig {
    fn default() -> Self {
        Self {
            enable_memory_tuning: true,
            enable_compute_tuning: true,
            enable_batch_tuning: true,
            enable_layer_tuning: true,
            confidence_threshold: 0.7,
            target_hardware: HardwareType::Auto,
            data_loading_window: 10,
            performance_model: PerformanceModel::default(),
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HardwareType {
    Auto,
    NvidiaGpu,
    AmdGpu,
    AppleSilicon,
    Cpu,
    Tpu,
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceSnapshot {
    pub timestamp: u64,
    pub total_time_ms: f64,
    pub memory_usage_mb: f64,
    pub peak_memory_mb: f64,
    pub gpu_utilization: f64,
    pub throughput: f64,
    pub batch_size: usize,
    pub layer_timings: HashMap<String, f64>,
    pub layer_memory: HashMap<String, f64>,
    pub hardware_type: Option<HardwareType>,
    /// Fraction of wall-clock time spent waiting on I/O, in `[0.0, 1.0]`
    /// (a ratio rather than a percentage, despite the `pct` suffix).
    pub io_wait_pct: Option<f32>,
    pub batch_throughput_per_sec: Option<f32>,
    pub gpu_peak_throughput: Option<f32>,
    pub model_depth: Option<usize>,
    pub num_heads: Option<usize>,
    pub kv_cache_bytes: Option<u64>,
    pub seq_len: Option<usize>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    pub category: RecommendationCategory,
    pub priority: Priority,
    pub confidence: f64,
    pub title: String,
    pub description: String,
    pub expected_impact: ImpactEstimate,
    pub difficulty: Difficulty,
    pub actions: Vec<String>,
    pub code_example: Option<String>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RecommendationCategory {
    Memory,
    Compute,
    BatchSize,
    Layer,
    Hardware,
    DataLoading,
    Architecture,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Priority {
    Low,
    Medium,
    High,
    Critical,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Difficulty {
    Easy,
    Moderate,
    Hard,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactEstimate {
    pub speedup: f64,
    pub memory_reduction_mb: f64,
    pub throughput_improvement: f64,
}
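
// Illustrative reading of an `ImpactEstimate` (editorial sketch): a value of
// { speedup: 1.5, memory_reduction_mb: 512.0, throughput_improvement: 30.0 }
// means roughly "1.5x faster, ~512 MB less resident memory, ~30% more
// items per second".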

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TuningReport {
    pub recommendations: Vec<Recommendation>,
    pub current_performance: PerformanceSummary,
    pub estimated_performance: PerformanceSummary,
    pub timestamp: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSummary {
    pub avg_time_ms: f64,
    pub avg_memory_mb: f64,
    pub avg_throughput: f64,
    pub gpu_utilization: f64,
    pub efficiency_score: f64,
}

impl PerformanceTuner {
    pub fn new(config: TunerConfig) -> Self {
        Self {
            config,
            history: Vec::new(),
        }
    }

    pub fn record_snapshot(&mut self, snapshot: PerformanceSnapshot) {
        self.history.push(snapshot);

        // Keep a bounded window of the 100 most recent snapshots.
        // (`Vec::remove(0)` is O(n); a `VecDeque` would avoid the shift,
        // but at 100 elements the cost is negligible.)
        if self.history.len() > 100 {
            self.history.remove(0);
        }
    }

    fn detect_hardware(&self) -> HardwareType {
        // Compile-time detection: macOS targets are assumed to be Apple
        // Silicon; otherwise the first enabled accelerator feature wins,
        // falling back to plain CPU.
        #[cfg(target_os = "macos")]
        return HardwareType::AppleSilicon;

        #[cfg(all(not(target_os = "macos"), feature = "cuda"))]
        return HardwareType::NvidiaGpu;

        #[cfg(all(not(target_os = "macos"), not(feature = "cuda"), feature = "rocm"))]
        return HardwareType::AmdGpu;

        #[cfg(all(
            not(target_os = "macos"),
            not(feature = "cuda"),
            not(feature = "rocm"),
            feature = "tpu"
        ))]
        return HardwareType::Tpu;

        #[cfg(all(
            not(target_os = "macos"),
            not(feature = "cuda"),
            not(feature = "rocm"),
            not(feature = "tpu")
        ))]
        HardwareType::Cpu
    }

    pub fn detected_hardware(&self) -> HardwareType {
        self.detect_hardware()
    }

    pub fn analyze(&self) -> Result<TuningReport> {
        let mut recommendations = Vec::new();

        if self.history.is_empty() {
            anyhow::bail!("No performance data available");
        }

        if self.config.enable_memory_tuning {
            recommendations.extend(self.analyze_memory());
        }

        if self.config.enable_compute_tuning {
            recommendations.extend(self.analyze_compute());
        }

        if self.config.enable_batch_tuning {
            recommendations.extend(self.analyze_batch_size());
        }

        if self.config.enable_layer_tuning {
            recommendations.extend(self.analyze_layers());
        }

        recommendations.extend(self.analyze_hardware(&self.history));

        if self.history.iter().any(|s| s.io_wait_pct.is_some()) {
            recommendations.extend(self.analyze_data_loading(&self.history));
        }

        if self.history.iter().any(|s| s.seq_len.is_some()) {
            recommendations.extend(self.analyze_architecture(&self.history));
        }

        recommendations.retain(|r| r.confidence >= self.config.confidence_threshold);

        recommendations.sort_by_key(|item| std::cmp::Reverse(item.priority));

        let current_perf = self.compute_current_performance();
        let estimated_perf = self.estimate_improved_performance(&recommendations);

        Ok(TuningReport {
            recommendations,
            current_performance: current_perf,
            estimated_performance: estimated_perf,
            timestamp: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .expect("SystemTime should be after UNIX_EPOCH")
                .as_secs(),
        })
    }

    fn analyze_memory(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let avg_memory =
            self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / self.history.len() as f64;

        let peak_memory = self.history.iter().map(|s| s.peak_memory_mb).fold(0.0, f64::max);

        if peak_memory > avg_memory * 1.5 {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Memory,
                priority: Priority::High,
                confidence: 0.85,
                title: "Reduce memory fragmentation".to_string(),
                description: format!(
                    "Peak memory ({:.1}MB) is significantly higher than average ({:.1}MB). \
                     This often indicates memory fragmentation or transient allocation spikes.",
                    peak_memory, avg_memory
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.1,
                    memory_reduction_mb: (peak_memory - avg_memory) * 0.5,
                    throughput_improvement: 5.0,
                },
                difficulty: Difficulty::Moderate,
                actions: vec![
                    "Enable gradient checkpointing to reduce activation memory".to_string(),
                    "Use torch.cuda.empty_cache() or equivalent after large operations".to_string(),
                    "Consider using mixed precision training (FP16/BF16)".to_string(),
                ],
                code_example: Some(
                    // `\x20` escapes keep the Python indentation as real spaces
                    // (the `\` line continuation strips literal leading whitespace).
                    "# Enable gradient checkpointing\n\
                     model.gradient_checkpointing_enable()\n\
                     \n\
                     # Use automatic mixed precision\n\
                     with torch.cuda.amp.autocast():\n\
                     \x20\x20\x20\x20output = model(input)"
                        .to_string(),
                ),
            });
        }

        if avg_memory > 8000.0 && self.config.target_hardware == HardwareType::Cpu {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Memory,
                priority: Priority::High,
                confidence: 0.9,
                title: "Reduce memory footprint for CPU execution".to_string(),
                description: format!(
                    "Average memory usage ({:.1}GB) is high for CPU execution. \
                     Consider model compression techniques.",
                    avg_memory / 1024.0
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.3,
                    memory_reduction_mb: avg_memory * 0.4,
                    throughput_improvement: 15.0,
                },
                difficulty: Difficulty::Moderate,
                actions: vec![
                    "Apply 8-bit or 4-bit quantization".to_string(),
                    "Use dynamic quantization for linear layers".to_string(),
                    "Consider model distillation to a smaller model".to_string(),
                ],
                code_example: Some(
                    "# Apply 8-bit quantization\n\
                     quantized_model = torch.quantization.quantize_dynamic(\n\
                     \x20\x20\x20\x20model, {torch.nn.Linear}, dtype=torch.qint8\n\
                     )"
                        .to_string(),
                ),
            });
        }

        recommendations
    }

    fn analyze_compute(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let avg_gpu_util =
            self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / self.history.len() as f64;

        if avg_gpu_util < 50.0 && self.config.target_hardware != HardwareType::Cpu {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Compute,
                priority: Priority::High,
                confidence: 0.88,
                title: "Improve GPU utilization".to_string(),
                description: format!(
                    "Average GPU utilization ({:.1}%) is low. The GPU is underutilized.",
                    avg_gpu_util
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.8,
                    memory_reduction_mb: 0.0,
                    throughput_improvement: 40.0,
                },
                difficulty: Difficulty::Easy,
                actions: vec![
                    "Increase batch size to maximize GPU occupancy".to_string(),
                    "Use DataLoader with num_workers > 0 to prevent a CPU bottleneck".to_string(),
                    "Enable pin_memory for faster host-to-device transfers".to_string(),
                    "Use compiled models (torch.compile)".to_string(),
                ],
                code_example: Some(
                    "# Optimize data loading\n\
                     dataloader = DataLoader(\n\
                     \x20\x20\x20\x20dataset,\n\
                     \x20\x20\x20\x20batch_size=32,\n\
                     \x20\x20\x20\x20num_workers=4,  # Parallel data loading\n\
                     \x20\x20\x20\x20pin_memory=True,  # Faster transfers\n\
                     )"
                        .to_string(),
                ),
            });
        }

        recommendations
    }

    fn analyze_batch_size(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        if let Some(last_snapshot) = self.history.last() {
            let batch_size = last_snapshot.batch_size;

            if batch_size < 16 && self.config.target_hardware != HardwareType::Cpu {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::BatchSize,
                    priority: Priority::Medium,
                    confidence: 0.75,
                    title: "Increase batch size".to_string(),
                    description: format!(
                        "Current batch size ({}) is small. Larger batches improve GPU utilization.",
                        batch_size
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.5,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 30.0,
                    },
                    difficulty: Difficulty::Easy,
                    actions: vec![
                        format!("Increase batch size to {} or higher", batch_size * 2),
                        "Monitor memory usage to find the optimal batch size".to_string(),
                        "Use gradient accumulation if memory is limited".to_string(),
                    ],
                    code_example: Some(
                        "# Gradient accumulation for an effectively larger batch\n\
                         accumulation_steps = 4\n\
                         for i, batch in enumerate(dataloader):\n\
                         \x20\x20\x20\x20loss = model(batch) / accumulation_steps\n\
                         \x20\x20\x20\x20loss.backward()\n\
                         \x20\x20\x20\x20if (i + 1) % accumulation_steps == 0:\n\
                         \x20\x20\x20\x20\x20\x20\x20\x20optimizer.step()\n\
                         \x20\x20\x20\x20\x20\x20\x20\x20optimizer.zero_grad()"
                            .to_string(),
                    ),
                });
            }
        }

        recommendations
    }

    fn analyze_layers(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        if let Some(snapshot) = self.history.last() {
            let total_time: f64 = snapshot.layer_timings.values().sum();
            // Guard against empty or all-zero timings (avoids NaN percentages).
            if total_time <= 0.0 {
                return recommendations;
            }

            for (layer_name, &time) in &snapshot.layer_timings {
                let percentage = (time / total_time) * 100.0;

                if percentage > 20.0 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Layer,
                        priority: Priority::Medium,
                        confidence: 0.8,
                        title: format!("Optimize {} layer", layer_name),
                        description: format!(
                            "Layer '{}' takes {:.1}% of total execution time ({:.2}ms). \
                             Consider layer-specific optimizations.",
                            layer_name, percentage, time
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.2,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 15.0,
                        },
                        difficulty: Difficulty::Moderate,
                        actions: vec![
                            "Use fused operations for this layer type".to_string(),
                            "Check if the layer can benefit from Flash Attention".to_string(),
                            "Consider layer pruning if accuracy allows".to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn analyze_hardware(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let effective_hw = snapshots.last().and_then(|s| s.hardware_type).unwrap_or_else(|| {
            if self.config.target_hardware == HardwareType::Auto {
                self.detect_hardware()
            } else {
                self.config.target_hardware
            }
        });

        let hw_recs: &[(&str, &str)] = match effective_hw {
            HardwareType::NvidiaGpu => &[
                (
                    "Enable TF32 matmul for Ampere+ GPUs",
                    "Enable TF32 matmul for Ampere+ GPUs (torch.backends.cuda.matmul.allow_tf32)",
                ),
                (
                    "cuDNN deterministic algorithms",
                    "Consider cuDNN deterministic algorithms for reproducibility (may reduce throughput)",
                ),
            ],
            HardwareType::AmdGpu => &[
                (
                    "ROCm hipBLAS strided-batched GEMM",
                    "Enable ROCm hipBLAS strided-batched GEMM for batch inference",
                ),
                (
                    "bf16 precision with ROCm MI200+",
                    "Use bf16 precision with ROCm MI200+ for 2x throughput",
                ),
            ],
            HardwareType::AppleSilicon => &[
                (
                    "Metal Performance Shaders fused kernels",
                    "Enable Metal Performance Shaders fused kernels via trustformers metal backend",
                ),
                (
                    "f16 precision on Apple Silicon",
                    "Use f16 precision on Apple Silicon where model accuracy permits",
                ),
            ],
            HardwareType::Cpu => &[
                (
                    "Enable AVX-512 via scirs2-core SIMD",
                    "Enable AVX-512 via scirs2-core SIMD features if not already active",
                ),
                (
                    "AMX acceleration on Apple M-series",
                    "Use AMX acceleration on Apple M-series CPUs for matrix operations",
                ),
            ],
            HardwareType::Tpu => &[
                (
                    "bf16 precision for TPU v4+",
                    "Use bf16 precision with matmul_precision=highest for TPU v4+",
                ),
                (
                    "XLA sharding for tensor parallelism",
                    "Enable XLA sharding for tensor parallelism across TPU cores",
                ),
            ],
            HardwareType::Auto => {
                // Resolve Auto once, stamp the detected hardware onto cloned
                // snapshots, and re-run the analysis against the concrete type.
                let resolved = self.detect_hardware();
                let resolved_snap: Vec<PerformanceSnapshot> = snapshots
                    .iter()
                    .cloned()
                    .map(|mut s| {
                        s.hardware_type = Some(resolved);
                        s
                    })
                    .collect();
                return self.analyze_hardware(&resolved_snap);
            }
        };

        for (title, description) in hw_recs {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Hardware,
                priority: Priority::Medium,
                confidence: 0.8,
                title: (*title).to_string(),
                description: (*description).to_string(),
                expected_impact: ImpactEstimate {
                    speedup: 1.15,
                    memory_reduction_mb: 0.0,
                    throughput_improvement: 10.0,
                },
                difficulty: Difficulty::Easy,
                actions: vec![(*description).to_string()],
                code_example: None,
            });
        }

        recommendations
    }

    fn analyze_data_loading(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let window = self.config.data_loading_window.min(snapshots.len());
        if window == 0 {
            return recommendations;
        }
        let recent = &snapshots[snapshots.len() - window..];

        let (io_sum, io_count) = recent.iter().fold((0.0_f64, 0_usize), |(acc, n), s| {
            if let Some(pct) = s.io_wait_pct {
                (acc + pct as f64, n + 1)
            } else {
                (acc, n)
            }
        });

        if io_count > 0 {
            let avg_io_wait = io_sum / io_count as f64;

            // io_wait_pct is a fraction, so 0.15 corresponds to 15% I/O wait.
            if avg_io_wait > 0.15 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::DataLoading,
                    priority: Priority::High,
                    confidence: 0.82,
                    title: "Increase data-loader worker count".to_string(),
                    description: format!(
                        "Increase data-loader worker count (current I/O wait {:.1}% suggests a bottleneck)",
                        avg_io_wait * 100.0
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.2,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 20.0,
                    },
                    difficulty: Difficulty::Easy,
                    actions: vec![
                        format!(
                            "Increase data-loader worker count (current I/O wait {:.1}% suggests a bottleneck)",
                            avg_io_wait * 100.0
                        ),
                        "Enable dataset prefetch to overlap data loading with model computation"
                            .to_string(),
                        "Memory-map large weight files to reduce I/O overhead".to_string(),
                    ],
                    code_example: None,
                });
            }
        }

        // If observed batch throughput is below half of the GPU's peak,
        // CPU-side preprocessing is the likely bottleneck.
        if let Some(snap) = snapshots.last() {
            if let (Some(bt), Some(gpt)) = (snap.batch_throughput_per_sec, snap.gpu_peak_throughput)
            {
                if gpt > 0.0 && (bt / gpt) < 0.5 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::DataLoading,
                        priority: Priority::Medium,
                        confidence: 0.75,
                        title: "Parallelise tokenization across CPU workers".to_string(),
                        description:
                            "Parallelise tokenization across CPU workers to reduce the preprocessing bottleneck"
                                .to_string(),
                        expected_impact: ImpactEstimate {
                            speedup: 1.25,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 25.0,
                        },
                        difficulty: Difficulty::Moderate,
                        actions: vec![
                            "Parallelise tokenization across CPU workers to reduce the preprocessing bottleneck"
                                .to_string(),
                            "Move data preprocessing to a CPU worker pool to better utilise the GPU"
                                .to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn analyze_architecture(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        // Work from the most recent snapshot that reports a sequence length.
        let seq_snap = snapshots.iter().rev().find(|s| s.seq_len.is_some());

        if let Some(snap) = seq_snap {
            let seq_len = snap.seq_len.unwrap_or(0);

            if seq_len > 1024 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::Architecture,
                    priority: Priority::High,
                    confidence: 0.88,
                    title: "Enable Flash Attention for long sequences".to_string(),
                    description: format!(
                        "Enable Flash Attention for seq_len={} (reduces attention memory from O(n^2) to O(n))",
                        seq_len
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.6,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 30.0,
                    },
                    difficulty: Difficulty::Moderate,
                    actions: vec![format!(
                        "Enable Flash Attention for seq_len={} (reduces attention memory from O(n^2) to O(n))",
                        seq_len
                    )],
                    code_example: None,
                });
            }

            if let (Some(kv_bytes), Some(gpu_peak)) =
                (snap.kv_cache_bytes, snap.gpu_peak_throughput)
            {
                let kv_mb = kv_bytes as f64 / (1024.0 * 1024.0);
                // NOTE: this heuristic compares the KV-cache size in bytes
                // against `gpu_peak_throughput`, effectively treating that
                // field as a memory budget in bytes rather than a throughput.
                if kv_bytes as f64 > gpu_peak as f64 * 0.5 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::High,
                        confidence: 0.78,
                        title: "Reduce KV heads using Grouped-Query Attention".to_string(),
                        description: format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA) — current KV cache {:.0} MB exceeds 50% of GPU memory budget",
                            kv_mb
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.3,
                            memory_reduction_mb: kv_mb * 0.5,
                            throughput_improvement: 20.0,
                        },
                        difficulty: Difficulty::Hard,
                        actions: vec![format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA) — current KV cache {:.0} MB exceeds 50% of GPU memory budget",
                            kv_mb
                        )],
                        code_example: None,
                    });
                }
            }

            if seq_len > 4096 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::Architecture,
                    priority: Priority::Medium,
                    confidence: 0.82,
                    title: "Consider sliding-window attention".to_string(),
                    description:
                        "Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
                            .to_string(),
                    expected_impact: ImpactEstimate {
                        speedup: 1.4,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 25.0,
                    },
                    difficulty: Difficulty::Hard,
                    actions: vec![
                        "Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
                            .to_string(),
                    ],
                    code_example: None,
                });
            }

            if let Some(num_heads) = snap.num_heads {
                if num_heads > 32 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::Medium,
                        confidence: 0.76,
                        title: "Multi-Query Attention to reduce KV memory".to_string(),
                        description: format!(
                            "Multi-Query Attention (single KV head) could reduce KV-cache memory by ~{}x while retaining most accuracy",
                            num_heads
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.2,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 15.0,
                        },
                        difficulty: Difficulty::Hard,
                        actions: vec![format!(
                            "Multi-Query Attention (single KV head) could reduce KV-cache memory by ~{}x while retaining most accuracy",
                            num_heads
                        )],
                        code_example: None,
                    });
                }
            }
        }

        let depth_snap = snapshots.iter().rev().find(|s| s.model_depth.is_some());
        if let Some(snap) = depth_snap {
            if let Some(depth) = snap.model_depth {
                if depth > 48 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::Medium,
                        confidence: 0.84,
                        title: "Enable gradient checkpointing for deep models".to_string(),
                        description: format!(
                            "Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead (current depth: {})",
                            depth
                        ),
                        expected_impact: ImpactEstimate {
                            // Checkpointing trades compute for memory, hence the <1.0 speedup.
                            speedup: 0.85,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 0.0,
                        },
                        difficulty: Difficulty::Easy,
                        actions: vec![
                            "Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead"
                                .to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn compute_current_performance(&self) -> PerformanceSummary {
        // Callers guarantee a non-empty history (see `analyze`).
        let count = self.history.len() as f64;

        let avg_time = self.history.iter().map(|s| s.total_time_ms).sum::<f64>() / count;

        let avg_memory = self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / count;

        let avg_throughput = self.history.iter().map(|s| s.throughput).sum::<f64>() / count;

        let avg_gpu = self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / count;

        // Heuristic score: mean of GPU utilization (capped at 100) and
        // throughput scaled so that 1000 items/sec maps to 100 points.
        let efficiency = (avg_gpu.min(100.0) + (avg_throughput / 10.0).min(100.0)) / 2.0;

        PerformanceSummary {
            avg_time_ms: avg_time,
            avg_memory_mb: avg_memory,
            avg_throughput,
            gpu_utilization: avg_gpu,
            efficiency_score: efficiency,
        }
    }

    fn estimate_improved_performance(
        &self,
        recommendations: &[Recommendation],
    ) -> PerformanceSummary {
        let current = self.compute_current_performance();

        // Additive model: each recommendation contributes its speedup delta,
        // scaled by the per-category multiplier.
        let total_speedup: f64 = match &self.config.performance_model {
            PerformanceModel::RuleBased { multipliers } => {
                recommendations
                    .iter()
                    .map(|r| {
                        let m = multipliers.get(&r.category).copied().unwrap_or(1.0);
                        m * (r.expected_impact.speedup - 1.0)
                    })
                    .sum::<f64>()
                    + 1.0
            }
        };

        let total_memory_reduction: f64 =
            recommendations.iter().map(|r| r.expected_impact.memory_reduction_mb).sum();

        let total_throughput_improvement: f64 =
            recommendations.iter().map(|r| r.expected_impact.throughput_improvement).sum();

        PerformanceSummary {
            avg_time_ms: current.avg_time_ms / total_speedup,
            avg_memory_mb: (current.avg_memory_mb - total_memory_reduction).max(0.0),
            avg_throughput: current.avg_throughput * (1.0 + total_throughput_improvement / 100.0),
            gpu_utilization: (current.gpu_utilization * 1.2).min(95.0),
            efficiency_score: (current.efficiency_score * 1.3).min(100.0),
        }
    }
}
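
// Minimal end-to-end usage sketch (editorial illustration; the snapshot
// values are invented and a real caller would feed profiler measurements):
//
//     let mut tuner = PerformanceTuner::new(TunerConfig::default());
//     tuner.record_snapshot(PerformanceSnapshot {
//         total_time_ms: 120.0,
//         gpu_utilization: 35.0,
//         batch_size: 8,
//         ..Default::default()
//     });
//     let report = tuner.analyze()?;
//     for rec in &report.recommendations {
//         println!("[{:?}] {} ({:.0}% confident)", rec.priority, rec.title, rec.confidence * 100.0);
//     }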

#[cfg(test)]
mod tests {
    use super::*;

    fn base_snapshot() -> PerformanceSnapshot {
        PerformanceSnapshot {
            timestamp: 0,
            total_time_ms: 100.0,
            memory_usage_mb: 1000.0,
            peak_memory_mb: 2000.0,
            gpu_utilization: 40.0,
            throughput: 20.0,
            batch_size: 8,
            layer_timings: {
                let mut t = HashMap::new();
                t.insert("attention".to_string(), 60.0);
                t.insert("ffn".to_string(), 30.0);
                t.insert("other".to_string(), 10.0);
                t
            },
            ..Default::default()
        }
    }

    #[test]
    fn test_tuner_creation() {
        let config = TunerConfig::default();
        let _tuner = PerformanceTuner::new(config);
    }

    #[test]
    fn test_snapshot_recording() {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            timestamp: 0,
            total_time_ms: 100.0,
            memory_usage_mb: 500.0,
            peak_memory_mb: 600.0,
            gpu_utilization: 75.0,
            throughput: 50.0,
            batch_size: 16,
            layer_timings: HashMap::new(),
            layer_memory: HashMap::new(),
            ..Default::default()
        };

        tuner.record_snapshot(snapshot);
        assert_eq!(tuner.history.len(), 1);
    }

    #[test]
    fn test_analysis_with_data() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        for i in 0..10 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                ..base_snapshot()
            };

            tuner.record_snapshot(snapshot);
        }

        let report = tuner.analyze()?;

        assert!(!report.recommendations.is_empty());

        assert!(report.current_performance.avg_time_ms > 0.0);
        assert!(report.estimated_performance.avg_time_ms > 0.0);

        Ok(())
    }

    #[test]
    fn test_analyze_hardware_nvidia_produces_hardware_recommendation() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            hardware_type: Some(HardwareType::NvidiaGpu),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let hw_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::Hardware)
            .collect();

        assert!(
            !hw_recs.is_empty(),
            "Expected at least one Hardware recommendation for NvidiaGpu"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_hardware_apple_silicon() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            hardware_type: Some(HardwareType::AppleSilicon),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let hw_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::Hardware)
            .collect();

        assert!(
            !hw_recs.is_empty(),
            "Expected at least one Hardware recommendation for AppleSilicon"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_data_loading_high_io_wait() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            io_wait_pct: Some(0.3),
            gpu_peak_throughput: Some(100.0),
            batch_throughput_per_sec: Some(20.0),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let dl_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::DataLoading)
            .collect();

        assert!(
            !dl_recs.is_empty(),
            "Expected at least one DataLoading recommendation for high I/O wait"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_architecture_long_context() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            seq_len: Some(4096),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let arch_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| {
                r.category == RecommendationCategory::Architecture
                    && r.description.contains("Flash Attention")
            })
            .collect();

        assert!(
            !arch_recs.is_empty(),
            "Expected at least one Architecture recommendation mentioning Flash Attention"
        );
        Ok(())
    }

    #[test]
    fn test_detected_hardware_returns_non_auto() {
        let tuner = PerformanceTuner::new(TunerConfig::default());
        let hw = tuner.detected_hardware();
        assert_ne!(
            hw,
            HardwareType::Auto,
            "detected_hardware should never return Auto"
        );
    }

    #[test]
    fn test_performance_model_rule_based_configurable() -> Result<()> {
        let mut multipliers = HashMap::new();
        for cat in &[
            RecommendationCategory::Memory,
            RecommendationCategory::Compute,
            RecommendationCategory::BatchSize,
            RecommendationCategory::Layer,
            RecommendationCategory::Hardware,
            RecommendationCategory::DataLoading,
            RecommendationCategory::Architecture,
        ] {
            multipliers.insert(*cat, 1.0);
        }
        multipliers.insert(RecommendationCategory::Memory, 2.0);

        let config = TunerConfig {
            performance_model: PerformanceModel::RuleBased { multipliers },
            ..TunerConfig::default()
        };

        let mut tuner = PerformanceTuner::new(config);

        for i in 0..5 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                memory_usage_mb: 1000.0,
                peak_memory_mb: 3000.0,
                ..base_snapshot()
            };
            tuner.record_snapshot(snapshot);
        }

        let report = tuner.analyze()?;

        let default_config = TunerConfig::default();
        let mut default_tuner = PerformanceTuner::new(default_config);
        for i in 0..5 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                memory_usage_mb: 1000.0,
                peak_memory_mb: 3000.0,
                ..base_snapshot()
            };
            default_tuner.record_snapshot(snapshot);
        }
        let default_report = default_tuner.analyze()?;

        assert!(
            report.estimated_performance.avg_time_ms
                <= default_report.estimated_performance.avg_time_ms + 1e-6,
            "Custom multiplier=2.0 should produce at least as optimistic a time estimate as default"
        );

        Ok(())
    }
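
    #[test]
    fn test_priority_ordering_matches_declaration_order() {
        // Added as an editorial sketch: the derived `Ord` follows declaration
        // order, so sorting by `Reverse(priority)` in `analyze` surfaces
        // Critical recommendations first.
        assert!(Priority::Critical > Priority::High);
        assert!(Priority::High > Priority::Medium);
        assert!(Priority::Medium > Priority::Low);
    }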
}