use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

#[derive(Debug, Clone)]
pub enum PerformanceModel {
    RuleBased {
        multipliers: HashMap<RecommendationCategory, f64>,
    },
}

impl Default for PerformanceModel {
    fn default() -> Self {
        let mut multipliers = HashMap::new();
        multipliers.insert(RecommendationCategory::Memory, 1.0);
        multipliers.insert(RecommendationCategory::Compute, 1.0);
        multipliers.insert(RecommendationCategory::BatchSize, 1.0);
        multipliers.insert(RecommendationCategory::Layer, 1.0);
        multipliers.insert(RecommendationCategory::Hardware, 1.0);
        multipliers.insert(RecommendationCategory::DataLoading, 1.0);
        multipliers.insert(RecommendationCategory::Architecture, 1.0);
        PerformanceModel::RuleBased { multipliers }
    }
}
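
// Worked example (editorial sketch) of how these multipliers feed
// `estimate_improved_performance` below: speedup deltas are scaled per
// category and summed, so two surviving recommendations with expected
// speedups of 1.2 and 1.5 under the default unit multipliers combine to
// 1.0 + 1.0 * (1.2 - 1.0) + 1.0 * (1.5 - 1.0) = 1.7x estimated speedup.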

#[derive(Debug)]
pub struct PerformanceTuner {
    config: TunerConfig,
    history: Vec<PerformanceSnapshot>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerConfig {
    pub enable_memory_tuning: bool,
    pub enable_compute_tuning: bool,
    pub enable_batch_tuning: bool,
    pub enable_layer_tuning: bool,
    pub confidence_threshold: f64,
    pub target_hardware: HardwareType,
    pub data_loading_window: usize,
    #[serde(skip)]
    pub performance_model: PerformanceModel,
}

impl Default for TunerConfig {
    fn default() -> Self {
        Self {
            enable_memory_tuning: true,
            enable_compute_tuning: true,
            enable_batch_tuning: true,
            enable_layer_tuning: true,
            confidence_threshold: 0.7,
            target_hardware: HardwareType::Auto,
            data_loading_window: 10,
            performance_model: PerformanceModel::default(),
        }
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HardwareType {
    Auto,
    NvidiaGpu,
    AmdGpu,
    AppleSilicon,
    Cpu,
    Tpu,
}

#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceSnapshot {
    pub timestamp: u64,
    pub total_time_ms: f64,
    pub memory_usage_mb: f64,
    pub peak_memory_mb: f64,
    pub gpu_utilization: f64,
    pub throughput: f64,
    pub batch_size: usize,
    pub layer_timings: HashMap<String, f64>,
    pub layer_memory: HashMap<String, f64>,
    pub hardware_type: Option<HardwareType>,
    /// Fraction of wall-clock time spent waiting on I/O, in `[0.0, 1.0]`
    /// (a ratio rather than a percentage, despite the `pct` suffix).
    pub io_wait_pct: Option<f32>,
    pub batch_throughput_per_sec: Option<f32>,
    pub gpu_peak_throughput: Option<f32>,
    pub model_depth: Option<usize>,
    pub num_heads: Option<usize>,
    pub kv_cache_bytes: Option<u64>,
    pub seq_len: Option<usize>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
    pub category: RecommendationCategory,
    pub priority: Priority,
    pub confidence: f64,
    pub title: String,
    pub description: String,
    pub expected_impact: ImpactEstimate,
    pub difficulty: Difficulty,
    pub actions: Vec<String>,
    pub code_example: Option<String>,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RecommendationCategory {
    Memory,
    Compute,
    BatchSize,
    Layer,
    Hardware,
    DataLoading,
    Architecture,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Priority {
    Low,
    Medium,
    High,
    Critical,
}

#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Difficulty {
    Easy,
    Moderate,
    Hard,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactEstimate {
    pub speedup: f64,
    pub memory_reduction_mb: f64,
    pub throughput_improvement: f64,
}
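
// Illustrative reading of an `ImpactEstimate` (editorial sketch): a value of
// { speedup: 1.5, memory_reduction_mb: 512.0, throughput_improvement: 30.0 }
// means roughly "1.5x faster, ~512 MB less resident memory, ~30% more
// items per second".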

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TuningReport {
    pub recommendations: Vec<Recommendation>,
    pub current_performance: PerformanceSummary,
    pub estimated_performance: PerformanceSummary,
    pub timestamp: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSummary {
    pub avg_time_ms: f64,
    pub avg_memory_mb: f64,
    pub avg_throughput: f64,
    pub gpu_utilization: f64,
    pub efficiency_score: f64,
}

impl PerformanceTuner {
    pub fn new(config: TunerConfig) -> Self {
        Self {
            config,
            history: Vec::new(),
        }
    }

    pub fn record_snapshot(&mut self, snapshot: PerformanceSnapshot) {
        self.history.push(snapshot);

        // Keep a bounded window of the 100 most recent snapshots.
        // (`Vec::remove(0)` is O(n); a `VecDeque` would avoid the shift,
        // but at 100 elements the cost is negligible.)
        if self.history.len() > 100 {
            self.history.remove(0);
        }
    }

    fn detect_hardware(&self) -> HardwareType {
        // Compile-time detection: macOS targets are assumed to be Apple
        // Silicon; otherwise the first enabled accelerator feature wins,
        // falling back to plain CPU.
        #[cfg(target_os = "macos")]
        return HardwareType::AppleSilicon;

        #[cfg(all(not(target_os = "macos"), feature = "cuda"))]
        return HardwareType::NvidiaGpu;

        #[cfg(all(not(target_os = "macos"), not(feature = "cuda"), feature = "rocm"))]
        return HardwareType::AmdGpu;

        #[cfg(all(
            not(target_os = "macos"),
            not(feature = "cuda"),
            not(feature = "rocm"),
            feature = "tpu"
        ))]
        return HardwareType::Tpu;

        #[cfg(all(
            not(target_os = "macos"),
            not(feature = "cuda"),
            not(feature = "rocm"),
            not(feature = "tpu")
        ))]
        HardwareType::Cpu
    }

    pub fn detected_hardware(&self) -> HardwareType {
        self.detect_hardware()
    }

    pub fn analyze(&self) -> Result<TuningReport> {
        let mut recommendations = Vec::new();

        if self.history.is_empty() {
            anyhow::bail!("No performance data available");
        }

        if self.config.enable_memory_tuning {
            recommendations.extend(self.analyze_memory());
        }

        if self.config.enable_compute_tuning {
            recommendations.extend(self.analyze_compute());
        }

        if self.config.enable_batch_tuning {
            recommendations.extend(self.analyze_batch_size());
        }

        if self.config.enable_layer_tuning {
            recommendations.extend(self.analyze_layers());
        }

        recommendations.extend(self.analyze_hardware(&self.history));

        if self.history.iter().any(|s| s.io_wait_pct.is_some()) {
            recommendations.extend(self.analyze_data_loading(&self.history));
        }

        if self.history.iter().any(|s| s.seq_len.is_some()) {
            recommendations.extend(self.analyze_architecture(&self.history));
        }

        recommendations.retain(|r| r.confidence >= self.config.confidence_threshold);

        recommendations.sort_by_key(|item| std::cmp::Reverse(item.priority));

        let current_perf = self.compute_current_performance();
        let estimated_perf = self.estimate_improved_performance(&recommendations);

        Ok(TuningReport {
            recommendations,
            current_performance: current_perf,
            estimated_performance: estimated_perf,
            timestamp: std::time::SystemTime::now()
                .duration_since(std::time::UNIX_EPOCH)
                .expect("SystemTime should be after UNIX_EPOCH")
                .as_secs(),
        })
    }

    fn analyze_memory(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let avg_memory =
            self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / self.history.len() as f64;

        let peak_memory = self.history.iter().map(|s| s.peak_memory_mb).fold(0.0, f64::max);

        if peak_memory > avg_memory * 1.5 {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Memory,
                priority: Priority::High,
                confidence: 0.85,
                title: "Reduce memory fragmentation".to_string(),
                description: format!(
                    "Peak memory ({:.1}MB) is significantly higher than average ({:.1}MB). \
                     This often indicates memory fragmentation or transient allocation spikes.",
                    peak_memory, avg_memory
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.1,
                    memory_reduction_mb: (peak_memory - avg_memory) * 0.5,
                    throughput_improvement: 5.0,
                },
                difficulty: Difficulty::Moderate,
                actions: vec![
                    "Enable gradient checkpointing to reduce activation memory".to_string(),
                    "Use torch.cuda.empty_cache() or equivalent after large operations".to_string(),
                    "Consider using mixed precision training (FP16/BF16)".to_string(),
                ],
                code_example: Some(
                    // `\x20` escapes keep the Python indentation as real spaces
                    // (the `\` line continuation strips literal leading whitespace).
                    "# Enable gradient checkpointing\n\
                     model.gradient_checkpointing_enable()\n\
                     \n\
                     # Use automatic mixed precision\n\
                     with torch.cuda.amp.autocast():\n\
                     \x20\x20\x20\x20output = model(input)"
                        .to_string(),
                ),
            });
        }

        if avg_memory > 8000.0 && self.config.target_hardware == HardwareType::Cpu {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Memory,
                priority: Priority::High,
                confidence: 0.9,
                title: "Reduce memory footprint for CPU execution".to_string(),
                description: format!(
                    "Average memory usage ({:.1}GB) is high for CPU execution. \
                     Consider model compression techniques.",
                    avg_memory / 1024.0
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.3,
                    memory_reduction_mb: avg_memory * 0.4,
                    throughput_improvement: 15.0,
                },
                difficulty: Difficulty::Moderate,
                actions: vec![
                    "Apply 8-bit or 4-bit quantization".to_string(),
                    "Use dynamic quantization for linear layers".to_string(),
                    "Consider model distillation to a smaller model".to_string(),
                ],
                code_example: Some(
                    "# Apply 8-bit quantization\n\
                     quantized_model = torch.quantization.quantize_dynamic(\n\
                     \x20\x20\x20\x20model, {torch.nn.Linear}, dtype=torch.qint8\n\
                     )"
                        .to_string(),
                ),
            });
        }

        recommendations
    }

    fn analyze_compute(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let avg_gpu_util =
            self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / self.history.len() as f64;

        if avg_gpu_util < 50.0 && self.config.target_hardware != HardwareType::Cpu {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Compute,
                priority: Priority::High,
                confidence: 0.88,
                title: "Improve GPU utilization".to_string(),
                description: format!(
                    "Average GPU utilization ({:.1}%) is low. The GPU is underutilized.",
                    avg_gpu_util
                ),
                expected_impact: ImpactEstimate {
                    speedup: 1.8,
                    memory_reduction_mb: 0.0,
                    throughput_improvement: 40.0,
                },
                difficulty: Difficulty::Easy,
                actions: vec![
                    "Increase batch size to maximize GPU occupancy".to_string(),
                    "Use DataLoader with num_workers > 0 to prevent a CPU bottleneck".to_string(),
                    "Enable pin_memory for faster host-to-device transfers".to_string(),
                    "Use compiled models (torch.compile)".to_string(),
                ],
                code_example: Some(
                    "# Optimize data loading\n\
                     dataloader = DataLoader(\n\
                     \x20\x20\x20\x20dataset,\n\
                     \x20\x20\x20\x20batch_size=32,\n\
                     \x20\x20\x20\x20num_workers=4,  # Parallel data loading\n\
                     \x20\x20\x20\x20pin_memory=True,  # Faster transfers\n\
                     )"
                        .to_string(),
                ),
            });
        }

        recommendations
    }

    fn analyze_batch_size(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        if let Some(last_snapshot) = self.history.last() {
            let batch_size = last_snapshot.batch_size;

            if batch_size < 16 && self.config.target_hardware != HardwareType::Cpu {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::BatchSize,
                    priority: Priority::Medium,
                    confidence: 0.75,
                    title: "Increase batch size".to_string(),
                    description: format!(
                        "Current batch size ({}) is small. Larger batches improve GPU utilization.",
                        batch_size
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.5,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 30.0,
                    },
                    difficulty: Difficulty::Easy,
                    actions: vec![
                        format!("Increase batch size to {} or higher", batch_size * 2),
                        "Monitor memory usage to find the optimal batch size".to_string(),
                        "Use gradient accumulation if memory is limited".to_string(),
                    ],
                    code_example: Some(
                        "# Gradient accumulation for an effectively larger batch\n\
                         accumulation_steps = 4\n\
                         for i, batch in enumerate(dataloader):\n\
                         \x20\x20\x20\x20loss = model(batch) / accumulation_steps\n\
                         \x20\x20\x20\x20loss.backward()\n\
                         \x20\x20\x20\x20if (i + 1) % accumulation_steps == 0:\n\
                         \x20\x20\x20\x20\x20\x20\x20\x20optimizer.step()\n\
                         \x20\x20\x20\x20\x20\x20\x20\x20optimizer.zero_grad()"
                            .to_string(),
                    ),
                });
            }
        }

        recommendations
    }

    fn analyze_layers(&self) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        if let Some(snapshot) = self.history.last() {
            let total_time: f64 = snapshot.layer_timings.values().sum();
            // Guard against empty or all-zero timings (avoids NaN percentages).
            if total_time <= 0.0 {
                return recommendations;
            }

            for (layer_name, &time) in &snapshot.layer_timings {
                let percentage = (time / total_time) * 100.0;

                if percentage > 20.0 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Layer,
                        priority: Priority::Medium,
                        confidence: 0.8,
                        title: format!("Optimize {} layer", layer_name),
                        description: format!(
                            "Layer '{}' takes {:.1}% of total execution time ({:.2}ms). \
                             Consider layer-specific optimizations.",
                            layer_name, percentage, time
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.2,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 15.0,
                        },
                        difficulty: Difficulty::Moderate,
                        actions: vec![
                            "Use fused operations for this layer type".to_string(),
                            "Check if the layer can benefit from Flash Attention".to_string(),
                            "Consider layer pruning if accuracy allows".to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn analyze_hardware(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let effective_hw = snapshots.last().and_then(|s| s.hardware_type).unwrap_or_else(|| {
            if self.config.target_hardware == HardwareType::Auto {
                self.detect_hardware()
            } else {
                self.config.target_hardware
            }
        });

        let hw_recs: &[(&str, &str)] = match effective_hw {
            HardwareType::NvidiaGpu => &[
                (
                    "Enable TF32 matmul for Ampere+ GPUs",
                    "Enable TF32 matmul for Ampere+ GPUs (torch.backends.cuda.matmul.allow_tf32)",
                ),
                (
                    "cuDNN deterministic algorithms",
                    "Consider cuDNN deterministic algorithms for reproducibility (may reduce throughput)",
                ),
            ],
            HardwareType::AmdGpu => &[
                (
                    "ROCm hipBLAS strided-batched GEMM",
                    "Enable ROCm hipBLAS strided-batched GEMM for batch inference",
                ),
                (
                    "bf16 precision with ROCm MI200+",
                    "Use bf16 precision with ROCm MI200+ for 2x throughput",
                ),
            ],
            HardwareType::AppleSilicon => &[
                (
                    "Metal Performance Shaders fused kernels",
                    "Enable Metal Performance Shaders fused kernels via trustformers metal backend",
                ),
                (
                    "f16 precision on Apple Silicon",
                    "Use f16 precision on Apple Silicon where model accuracy permits",
                ),
            ],
            HardwareType::Cpu => &[
                (
                    "Enable AVX-512 via scirs2-core SIMD",
                    "Enable AVX-512 via scirs2-core SIMD features if not already active",
                ),
                (
                    "AMX acceleration on Apple M-series",
                    "Use AMX acceleration on Apple M-series CPUs for matrix operations",
                ),
            ],
            HardwareType::Tpu => &[
                (
                    "bf16 precision for TPU v4+",
                    "Use bf16 precision with matmul_precision=highest for TPU v4+",
                ),
                (
                    "XLA sharding for tensor parallelism",
                    "Enable XLA sharding for tensor parallelism across TPU cores",
                ),
            ],
            HardwareType::Auto => {
                // Resolve Auto once, stamp the detected hardware onto cloned
                // snapshots, and re-run the analysis against the concrete type.
                let resolved = self.detect_hardware();
                let resolved_snap: Vec<PerformanceSnapshot> = snapshots
                    .iter()
                    .cloned()
                    .map(|mut s| {
                        s.hardware_type = Some(resolved);
                        s
                    })
                    .collect();
                return self.analyze_hardware(&resolved_snap);
            }
        };

        for (title, description) in hw_recs {
            recommendations.push(Recommendation {
                category: RecommendationCategory::Hardware,
                priority: Priority::Medium,
                confidence: 0.8,
                title: (*title).to_string(),
                description: (*description).to_string(),
                expected_impact: ImpactEstimate {
                    speedup: 1.15,
                    memory_reduction_mb: 0.0,
                    throughput_improvement: 10.0,
                },
                difficulty: Difficulty::Easy,
                actions: vec![(*description).to_string()],
                code_example: None,
            });
        }

        recommendations
    }

    fn analyze_data_loading(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        let window = self.config.data_loading_window.min(snapshots.len());
        if window == 0 {
            return recommendations;
        }
        let recent = &snapshots[snapshots.len() - window..];

        let (io_sum, io_count) = recent.iter().fold((0.0_f64, 0_usize), |(acc, n), s| {
            if let Some(pct) = s.io_wait_pct {
                (acc + pct as f64, n + 1)
            } else {
                (acc, n)
            }
        });

        if io_count > 0 {
            let avg_io_wait = io_sum / io_count as f64;

            // io_wait_pct is a fraction, so 0.15 corresponds to 15% I/O wait.
            if avg_io_wait > 0.15 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::DataLoading,
                    priority: Priority::High,
                    confidence: 0.82,
                    title: "Increase data-loader worker count".to_string(),
                    description: format!(
                        "Increase data-loader worker count (current I/O wait {:.1}% suggests a bottleneck)",
                        avg_io_wait * 100.0
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.2,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 20.0,
                    },
                    difficulty: Difficulty::Easy,
                    actions: vec![
                        format!(
                            "Increase data-loader worker count (current I/O wait {:.1}% suggests a bottleneck)",
                            avg_io_wait * 100.0
                        ),
                        "Enable dataset prefetch to overlap data loading with model computation"
                            .to_string(),
                        "Memory-map large weight files to reduce I/O overhead".to_string(),
                    ],
                    code_example: None,
                });
            }
        }

        // If observed batch throughput is below half of the GPU's peak,
        // CPU-side preprocessing is the likely bottleneck.
        if let Some(snap) = snapshots.last() {
            if let (Some(bt), Some(gpt)) = (snap.batch_throughput_per_sec, snap.gpu_peak_throughput)
            {
                if gpt > 0.0 && (bt / gpt) < 0.5 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::DataLoading,
                        priority: Priority::Medium,
                        confidence: 0.75,
                        title: "Parallelise tokenization across CPU workers".to_string(),
                        description:
                            "Parallelise tokenization across CPU workers to reduce the preprocessing bottleneck"
                                .to_string(),
                        expected_impact: ImpactEstimate {
                            speedup: 1.25,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 25.0,
                        },
                        difficulty: Difficulty::Moderate,
                        actions: vec![
                            "Parallelise tokenization across CPU workers to reduce the preprocessing bottleneck"
                                .to_string(),
                            "Move data preprocessing to a CPU worker pool to better utilise the GPU"
                                .to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn analyze_architecture(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
        let mut recommendations = Vec::new();

        // Work from the most recent snapshot that reports a sequence length.
        let seq_snap = snapshots.iter().rev().find(|s| s.seq_len.is_some());

        if let Some(snap) = seq_snap {
            let seq_len = snap.seq_len.unwrap_or(0);

            if seq_len > 1024 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::Architecture,
                    priority: Priority::High,
                    confidence: 0.88,
                    title: "Enable Flash Attention for long sequences".to_string(),
                    description: format!(
                        "Enable Flash Attention for seq_len={} (reduces attention memory from O(n^2) to O(n))",
                        seq_len
                    ),
                    expected_impact: ImpactEstimate {
                        speedup: 1.6,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 30.0,
                    },
                    difficulty: Difficulty::Moderate,
                    actions: vec![format!(
                        "Enable Flash Attention for seq_len={} (reduces attention memory from O(n^2) to O(n))",
                        seq_len
                    )],
                    code_example: None,
                });
            }

            if let (Some(kv_bytes), Some(gpu_peak)) =
                (snap.kv_cache_bytes, snap.gpu_peak_throughput)
            {
                let kv_mb = kv_bytes as f64 / (1024.0 * 1024.0);
                // NOTE: this heuristic compares the KV-cache size in bytes
                // against `gpu_peak_throughput`, effectively treating that
                // field as a memory budget in bytes rather than a throughput.
                if kv_bytes as f64 > gpu_peak as f64 * 0.5 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::High,
                        confidence: 0.78,
                        title: "Reduce KV heads using Grouped-Query Attention".to_string(),
                        description: format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA) — current KV cache {:.0} MB exceeds 50% of GPU memory budget",
                            kv_mb
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.3,
                            memory_reduction_mb: kv_mb * 0.5,
                            throughput_improvement: 20.0,
                        },
                        difficulty: Difficulty::Hard,
                        actions: vec![format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA) — current KV cache {:.0} MB exceeds 50% of GPU memory budget",
                            kv_mb
                        )],
                        code_example: None,
                    });
                }
            }

            if seq_len > 4096 {
                recommendations.push(Recommendation {
                    category: RecommendationCategory::Architecture,
                    priority: Priority::Medium,
                    confidence: 0.82,
                    title: "Consider sliding-window attention".to_string(),
                    description:
                        "Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
                            .to_string(),
                    expected_impact: ImpactEstimate {
                        speedup: 1.4,
                        memory_reduction_mb: 0.0,
                        throughput_improvement: 25.0,
                    },
                    difficulty: Difficulty::Hard,
                    actions: vec![
                        "Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
                            .to_string(),
                    ],
                    code_example: None,
                });
            }

            if let Some(num_heads) = snap.num_heads {
                if num_heads > 32 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::Medium,
                        confidence: 0.76,
                        title: "Multi-Query Attention to reduce KV memory".to_string(),
                        description: format!(
                            "Multi-Query Attention (single KV head) could reduce KV-cache memory by ~{}x while retaining most accuracy",
                            num_heads
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.2,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 15.0,
                        },
                        difficulty: Difficulty::Hard,
                        actions: vec![format!(
                            "Multi-Query Attention (single KV head) could reduce KV-cache memory by ~{}x while retaining most accuracy",
                            num_heads
                        )],
                        code_example: None,
                    });
                }
            }
        }

        let depth_snap = snapshots.iter().rev().find(|s| s.model_depth.is_some());
        if let Some(snap) = depth_snap {
            if let Some(depth) = snap.model_depth {
                if depth > 48 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::Medium,
                        confidence: 0.84,
                        title: "Enable gradient checkpointing for deep models".to_string(),
                        description: format!(
                            "Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead (current depth: {})",
                            depth
                        ),
                        expected_impact: ImpactEstimate {
                            // Checkpointing trades compute for memory, hence the <1.0 speedup.
                            speedup: 0.85,
                            memory_reduction_mb: 0.0,
                            throughput_improvement: 0.0,
                        },
                        difficulty: Difficulty::Easy,
                        actions: vec![
                            "Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead"
                                .to_string(),
                        ],
                        code_example: None,
                    });
                }
            }
        }

        recommendations
    }

    fn compute_current_performance(&self) -> PerformanceSummary {
        // Callers guarantee a non-empty history (see `analyze`).
        let count = self.history.len() as f64;

        let avg_time = self.history.iter().map(|s| s.total_time_ms).sum::<f64>() / count;

        let avg_memory = self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / count;

        let avg_throughput = self.history.iter().map(|s| s.throughput).sum::<f64>() / count;

        let avg_gpu = self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / count;

        // Heuristic score: mean of GPU utilization (capped at 100) and
        // throughput scaled so that 1000 items/sec maps to 100 points.
        let efficiency = (avg_gpu.min(100.0) + (avg_throughput / 10.0).min(100.0)) / 2.0;

        PerformanceSummary {
            avg_time_ms: avg_time,
            avg_memory_mb: avg_memory,
            avg_throughput,
            gpu_utilization: avg_gpu,
            efficiency_score: efficiency,
        }
    }

    fn estimate_improved_performance(
        &self,
        recommendations: &[Recommendation],
    ) -> PerformanceSummary {
        let current = self.compute_current_performance();

        // Additive model: each recommendation contributes its speedup delta,
        // scaled by the per-category multiplier.
        let total_speedup: f64 = match &self.config.performance_model {
            PerformanceModel::RuleBased { multipliers } => {
                recommendations
                    .iter()
                    .map(|r| {
                        let m = multipliers.get(&r.category).copied().unwrap_or(1.0);
                        m * (r.expected_impact.speedup - 1.0)
                    })
                    .sum::<f64>()
                    + 1.0
            }
        };

        let total_memory_reduction: f64 =
            recommendations.iter().map(|r| r.expected_impact.memory_reduction_mb).sum();

        let total_throughput_improvement: f64 =
            recommendations.iter().map(|r| r.expected_impact.throughput_improvement).sum();

        PerformanceSummary {
            avg_time_ms: current.avg_time_ms / total_speedup,
            avg_memory_mb: (current.avg_memory_mb - total_memory_reduction).max(0.0),
            avg_throughput: current.avg_throughput * (1.0 + total_throughput_improvement / 100.0),
            gpu_utilization: (current.gpu_utilization * 1.2).min(95.0),
            efficiency_score: (current.efficiency_score * 1.3).min(100.0),
        }
    }
}
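
// Minimal end-to-end usage sketch (editorial illustration; the snapshot
// values are invented and a real caller would feed profiler measurements):
//
//     let mut tuner = PerformanceTuner::new(TunerConfig::default());
//     tuner.record_snapshot(PerformanceSnapshot {
//         total_time_ms: 120.0,
//         gpu_utilization: 35.0,
//         batch_size: 8,
//         ..Default::default()
//     });
//     let report = tuner.analyze()?;
//     for rec in &report.recommendations {
//         println!("[{:?}] {} ({:.0}% confident)", rec.priority, rec.title, rec.confidence * 100.0);
//     }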

#[cfg(test)]
mod tests {
    use super::*;

    fn base_snapshot() -> PerformanceSnapshot {
        PerformanceSnapshot {
            timestamp: 0,
            total_time_ms: 100.0,
            memory_usage_mb: 1000.0,
            peak_memory_mb: 2000.0,
            gpu_utilization: 40.0,
            throughput: 20.0,
            batch_size: 8,
            layer_timings: {
                let mut t = HashMap::new();
                t.insert("attention".to_string(), 60.0);
                t.insert("ffn".to_string(), 30.0);
                t.insert("other".to_string(), 10.0);
                t
            },
            ..Default::default()
        }
    }

    #[test]
    fn test_tuner_creation() {
        let config = TunerConfig::default();
        let _tuner = PerformanceTuner::new(config);
    }

    #[test]
    fn test_snapshot_recording() {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            timestamp: 0,
            total_time_ms: 100.0,
            memory_usage_mb: 500.0,
            peak_memory_mb: 600.0,
            gpu_utilization: 75.0,
            throughput: 50.0,
            batch_size: 16,
            layer_timings: HashMap::new(),
            layer_memory: HashMap::new(),
            ..Default::default()
        };

        tuner.record_snapshot(snapshot);
        assert_eq!(tuner.history.len(), 1);
    }

    #[test]
    fn test_analysis_with_data() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        for i in 0..10 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                ..base_snapshot()
            };

            tuner.record_snapshot(snapshot);
        }

        let report = tuner.analyze()?;

        assert!(!report.recommendations.is_empty());

        assert!(report.current_performance.avg_time_ms > 0.0);
        assert!(report.estimated_performance.avg_time_ms > 0.0);

        Ok(())
    }

    #[test]
    fn test_analyze_hardware_nvidia_produces_hardware_recommendation() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            hardware_type: Some(HardwareType::NvidiaGpu),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let hw_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::Hardware)
            .collect();

        assert!(
            !hw_recs.is_empty(),
            "Expected at least one Hardware recommendation for NvidiaGpu"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_hardware_apple_silicon() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            hardware_type: Some(HardwareType::AppleSilicon),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let hw_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::Hardware)
            .collect();

        assert!(
            !hw_recs.is_empty(),
            "Expected at least one Hardware recommendation for AppleSilicon"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_data_loading_high_io_wait() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            io_wait_pct: Some(0.3),
            gpu_peak_throughput: Some(100.0),
            batch_throughput_per_sec: Some(20.0),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let dl_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| r.category == RecommendationCategory::DataLoading)
            .collect();

        assert!(
            !dl_recs.is_empty(),
            "Expected at least one DataLoading recommendation for high I/O wait"
        );
        Ok(())
    }

    #[test]
    fn test_analyze_architecture_long_context() -> Result<()> {
        let mut tuner = PerformanceTuner::new(TunerConfig::default());

        let snapshot = PerformanceSnapshot {
            seq_len: Some(4096),
            ..base_snapshot()
        };
        tuner.record_snapshot(snapshot);

        let report = tuner.analyze()?;
        let arch_recs: Vec<_> = report
            .recommendations
            .iter()
            .filter(|r| {
                r.category == RecommendationCategory::Architecture
                    && r.description.contains("Flash Attention")
            })
            .collect();

        assert!(
            !arch_recs.is_empty(),
            "Expected at least one Architecture recommendation mentioning Flash Attention"
        );
        Ok(())
    }

    #[test]
    fn test_detected_hardware_returns_non_auto() {
        let tuner = PerformanceTuner::new(TunerConfig::default());
        let hw = tuner.detected_hardware();
        assert_ne!(
            hw,
            HardwareType::Auto,
            "detected_hardware should never return Auto"
        );
    }

    #[test]
    fn test_performance_model_rule_based_configurable() -> Result<()> {
        let mut multipliers = HashMap::new();
        for cat in &[
            RecommendationCategory::Memory,
            RecommendationCategory::Compute,
            RecommendationCategory::BatchSize,
            RecommendationCategory::Layer,
            RecommendationCategory::Hardware,
            RecommendationCategory::DataLoading,
            RecommendationCategory::Architecture,
        ] {
            multipliers.insert(*cat, 1.0);
        }
        multipliers.insert(RecommendationCategory::Memory, 2.0);

        let config = TunerConfig {
            performance_model: PerformanceModel::RuleBased { multipliers },
            ..TunerConfig::default()
        };

        let mut tuner = PerformanceTuner::new(config);

        for i in 0..5 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                memory_usage_mb: 1000.0,
                peak_memory_mb: 3000.0,
                ..base_snapshot()
            };
            tuner.record_snapshot(snapshot);
        }

        let report = tuner.analyze()?;

        let default_config = TunerConfig::default();
        let mut default_tuner = PerformanceTuner::new(default_config);
        for i in 0..5 {
            let snapshot = PerformanceSnapshot {
                timestamp: i,
                memory_usage_mb: 1000.0,
                peak_memory_mb: 3000.0,
                ..base_snapshot()
            };
            default_tuner.record_snapshot(snapshot);
        }
        let default_report = default_tuner.analyze()?;

        assert!(
            report.estimated_performance.avg_time_ms
                <= default_report.estimated_performance.avg_time_ms + 1e-6,
            "Custom multiplier=2.0 should produce at least as optimistic a time estimate as default"
        );

        Ok(())
    }
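
    #[test]
    fn test_priority_ordering_matches_declaration_order() {
        // Added as an editorial sketch: the derived `Ord` follows declaration
        // order, so sorting by `Reverse(priority)` in `analyze` surfaces
        // Critical recommendations first.
        assert!(Priority::Critical > Priority::High);
        assert!(Priority::High > Priority::Medium);
        assert!(Priority::Medium > Priority::Low);
    }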
}