use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
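/// Model used to turn per-recommendation impact estimates into an overall
/// performance estimate. The rule-based variant scales each recommendation's
/// speedup gain by a per-category multiplier; see
/// `estimate_improved_performance` for how the gains are combined.
///
/// A minimal sketch of a custom model (hypothetical values):
///
/// ```ignore
/// let mut multipliers = HashMap::new();
/// // Weight memory recommendations twice as heavily as the defaults.
/// multipliers.insert(RecommendationCategory::Memory, 2.0);
/// let model = PerformanceModel::RuleBased { multipliers };
/// ```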
#[derive(Debug, Clone)]
pub enum PerformanceModel {
RuleBased {
multipliers: HashMap<RecommendationCategory, f64>,
},
}
impl Default for PerformanceModel {
fn default() -> Self {
let mut multipliers = HashMap::new();
multipliers.insert(RecommendationCategory::Memory, 1.0);
multipliers.insert(RecommendationCategory::Compute, 1.0);
multipliers.insert(RecommendationCategory::BatchSize, 1.0);
multipliers.insert(RecommendationCategory::Layer, 1.0);
multipliers.insert(RecommendationCategory::Hardware, 1.0);
multipliers.insert(RecommendationCategory::DataLoading, 1.0);
multipliers.insert(RecommendationCategory::Architecture, 1.0);
PerformanceModel::RuleBased { multipliers }
}
}
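/// Heuristic performance advisor: feed it `PerformanceSnapshot`s, then call
/// `analyze` to get a `TuningReport` of prioritized recommendations.
///
/// A minimal usage sketch (marked `ignore` since the import path depends on
/// the surrounding crate layout):
///
/// ```ignore
/// let mut tuner = PerformanceTuner::new(TunerConfig::default());
/// tuner.record_snapshot(PerformanceSnapshot {
///     total_time_ms: 100.0,
///     gpu_utilization: 40.0,
///     batch_size: 8,
///     ..Default::default()
/// });
/// let report = tuner.analyze()?;
/// for rec in &report.recommendations {
///     println!("[{:?}] {}", rec.priority, rec.title);
/// }
/// ```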
#[derive(Debug)]
pub struct PerformanceTuner {
config: TunerConfig,
history: Vec<PerformanceSnapshot>,
}
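/// Switches and thresholds controlling which analyzers run and which
/// recommendations survive filtering. `performance_model` is skipped by
/// serde and falls back to `PerformanceModel::default()` on deserialization.
///
/// Overriding individual fields, a minimal sketch:
///
/// ```ignore
/// let config = TunerConfig {
///     confidence_threshold: 0.8,
///     target_hardware: HardwareType::NvidiaGpu,
///     ..TunerConfig::default()
/// };
/// ```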
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TunerConfig {
pub enable_memory_tuning: bool,
pub enable_compute_tuning: bool,
pub enable_batch_tuning: bool,
pub enable_layer_tuning: bool,
pub confidence_threshold: f64,
pub target_hardware: HardwareType,
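    /// Number of most-recent snapshots averaged when checking I/O wait in the
    /// data-loading analysis.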
pub data_loading_window: usize,
#[serde(skip)]
pub performance_model: PerformanceModel,
}
impl Default for TunerConfig {
fn default() -> Self {
Self {
enable_memory_tuning: true,
enable_compute_tuning: true,
enable_batch_tuning: true,
enable_layer_tuning: true,
confidence_threshold: 0.7,
target_hardware: HardwareType::Auto,
data_loading_window: 10,
performance_model: PerformanceModel::default(),
}
}
}
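/// Execution target. `Auto` is resolved at analysis time via compile-time
/// feature detection (see `PerformanceTuner::detect_hardware`).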
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum HardwareType {
Auto,
NvidiaGpu,
AmdGpu,
AppleSilicon,
Cpu,
Tpu,
}
#[derive(Debug, Clone, Serialize, Deserialize, Default)]
pub struct PerformanceSnapshot {
pub timestamp: u64,
pub total_time_ms: f64,
pub memory_usage_mb: f64,
pub peak_memory_mb: f64,
pub gpu_utilization: f64,
pub throughput: f64,
pub batch_size: usize,
pub layer_timings: HashMap<String, f64>,
pub layer_memory: HashMap<String, f64>,
pub hardware_type: Option<HardwareType>,
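    /// Fraction of wall-clock time spent waiting on I/O, in `0.0..=1.0`
    /// (despite the `_pct` suffix); values above 0.15 trigger a data-loading
    /// recommendation and are displayed as percentages.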
pub io_wait_pct: Option<f32>,
pub batch_throughput_per_sec: Option<f32>,
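    /// Peak achievable batch throughput of the device, assumed to be in the
    /// same units as `batch_throughput_per_sec`; a measured-to-peak ratio
    /// below 0.5 flags a preprocessing bottleneck.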
pub gpu_peak_throughput: Option<f32>,
pub model_depth: Option<usize>,
pub num_heads: Option<usize>,
pub kv_cache_bytes: Option<u64>,
pub seq_len: Option<usize>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Recommendation {
pub category: RecommendationCategory,
pub priority: Priority,
pub confidence: f64,
pub title: String,
pub description: String,
pub expected_impact: ImpactEstimate,
pub difficulty: Difficulty,
pub actions: Vec<String>,
pub code_example: Option<String>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum RecommendationCategory {
Memory,
Compute,
BatchSize,
Layer,
Hardware,
DataLoading,
Architecture,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
pub enum Priority {
Low,
Medium,
High,
Critical,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum Difficulty {
Easy,
Moderate,
Hard,
}
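/// Estimated effect of applying a recommendation: `speedup` is a
/// multiplicative factor (1.5 means 50% faster), `memory_reduction_mb` is in
/// megabytes, and `throughput_improvement` is a percentage.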
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImpactEstimate {
pub speedup: f64,
pub memory_reduction_mb: f64,
pub throughput_improvement: f64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TuningReport {
pub recommendations: Vec<Recommendation>,
pub current_performance: PerformanceSummary,
pub estimated_performance: PerformanceSummary,
pub timestamp: u64,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceSummary {
pub avg_time_ms: f64,
pub avg_memory_mb: f64,
pub avg_throughput: f64,
pub gpu_utilization: f64,
pub efficiency_score: f64,
}
impl PerformanceTuner {
pub fn new(config: TunerConfig) -> Self {
Self {
config,
history: Vec::new(),
}
}
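    /// Records a snapshot, keeping only the 100 most recent entries.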
pub fn record_snapshot(&mut self, snapshot: PerformanceSnapshot) {
self.history.push(snapshot);
if self.history.len() > 100 {
self.history.remove(0);
}
}
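    // Compile-time hardware detection. Note the simplifying assumption that
    // any macOS build runs on Apple Silicon; Intel Macs are not distinguished.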
fn detect_hardware(&self) -> HardwareType {
#[cfg(target_os = "macos")]
return HardwareType::AppleSilicon;
#[cfg(all(not(target_os = "macos"), feature = "cuda"))]
return HardwareType::NvidiaGpu;
#[cfg(all(not(target_os = "macos"), not(feature = "cuda"), feature = "rocm"))]
return HardwareType::AmdGpu;
#[cfg(all(
not(target_os = "macos"),
not(feature = "cuda"),
not(feature = "rocm"),
feature = "tpu"
))]
return HardwareType::Tpu;
#[cfg(all(
not(target_os = "macos"),
not(feature = "cuda"),
not(feature = "rocm"),
not(feature = "tpu")
))]
HardwareType::Cpu
}
pub fn detected_hardware(&self) -> HardwareType {
self.detect_hardware()
}
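    /// Runs all enabled analyzers over the recorded history. Recommendations
    /// below `confidence_threshold` are dropped and the survivors are sorted
    /// by priority (Critical first). Fails if no snapshots have been recorded.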
pub fn analyze(&self) -> Result<TuningReport> {
let mut recommendations = Vec::new();
if self.history.is_empty() {
anyhow::bail!("No performance data available");
}
if self.config.enable_memory_tuning {
recommendations.extend(self.analyze_memory());
}
if self.config.enable_compute_tuning {
recommendations.extend(self.analyze_compute());
}
if self.config.enable_batch_tuning {
recommendations.extend(self.analyze_batch_size());
}
if self.config.enable_layer_tuning {
recommendations.extend(self.analyze_layers());
}
recommendations.extend(self.analyze_hardware(&self.history));
if self.history.iter().any(|s| s.io_wait_pct.is_some()) {
recommendations.extend(self.analyze_data_loading(&self.history));
}
if self.history.iter().any(|s| s.seq_len.is_some()) {
recommendations.extend(self.analyze_architecture(&self.history));
}
recommendations.retain(|r| r.confidence >= self.config.confidence_threshold);
recommendations.sort_by_key(|item| std::cmp::Reverse(item.priority));
let current_perf = self.compute_current_performance();
let estimated_perf = self.estimate_improved_performance(&recommendations);
Ok(TuningReport {
recommendations,
current_performance: current_perf,
estimated_performance: estimated_perf,
timestamp: std::time::SystemTime::now()
.duration_since(std::time::UNIX_EPOCH)
.expect("SystemTime should be after UNIX_EPOCH")
.as_secs(),
})
}
fn analyze_memory(&self) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
let avg_memory =
self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / self.history.len() as f64;
let peak_memory = self.history.iter().map(|s| s.peak_memory_mb).fold(0.0, f64::max);
if peak_memory > avg_memory * 1.5 {
recommendations.push(Recommendation {
category: RecommendationCategory::Memory,
priority: Priority::High,
confidence: 0.85,
title: "Reduce memory fragmentation".to_string(),
description: format!(
"Peak memory ({:.1}MB) is significantly higher than average ({:.1}MB). \
This indicates memory fragmentation.",
peak_memory, avg_memory
),
expected_impact: ImpactEstimate {
speedup: 1.1,
memory_reduction_mb: (peak_memory - avg_memory) * 0.5,
throughput_improvement: 5.0,
},
difficulty: Difficulty::Moderate,
actions: vec![
"Enable gradient checkpointing to reduce activation memory".to_string(),
"Use torch.cuda.empty_cache() or equivalent after large operations".to_string(),
"Consider using mixed precision training (FP16/BF16)".to_string(),
],
                // Indent with \u{20} (plain spaces) so the emitted Python is
                // copy-paste safe; non-breaking spaces are rejected by the Python lexer.
                code_example: Some(
                    "# Enable gradient checkpointing\n\
                     model.gradient_checkpointing_enable()\n\
                     \n\
                     # Use automatic mixed precision\n\
                     with torch.cuda.amp.autocast():\n\
                     \u{20}\u{20}\u{20}\u{20}output = model(input)"
                        .to_string(),
                ),
});
}
if avg_memory > 8000.0 && self.config.target_hardware == HardwareType::Cpu {
recommendations.push(Recommendation {
category: RecommendationCategory::Memory,
priority: Priority::High,
confidence: 0.9,
title: "Reduce memory footprint for CPU execution".to_string(),
description: format!(
"Average memory usage ({:.1}GB) is high for CPU execution. \
Consider model compression techniques.",
avg_memory / 1024.0
),
expected_impact: ImpactEstimate {
speedup: 1.3,
memory_reduction_mb: avg_memory * 0.4,
throughput_improvement: 15.0,
},
difficulty: Difficulty::Moderate,
actions: vec![
"Apply 8-bit or 4-bit quantization".to_string(),
"Use dynamic quantization for linear layers".to_string(),
"Consider model distillation to a smaller model".to_string(),
],
                code_example: Some(
                    "# Apply 8-bit quantization\n\
                     quantized_model = torch.quantization.quantize_dynamic(\n\
                     \u{20}\u{20}\u{20}\u{20}model, {torch.nn.Linear}, dtype=torch.qint8\n\
                     )"
                        .to_string(),
                ),
});
}
recommendations
}
fn analyze_compute(&self) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
let avg_gpu_util =
self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / self.history.len() as f64;
if avg_gpu_util < 50.0 && self.config.target_hardware != HardwareType::Cpu {
recommendations.push(Recommendation {
category: RecommendationCategory::Compute,
priority: Priority::High,
confidence: 0.88,
title: "Improve GPU utilization".to_string(),
description: format!(
"Average GPU utilization ({:.1}%) is low. GPU is underutilized.",
avg_gpu_util
),
expected_impact: ImpactEstimate {
speedup: 1.8,
memory_reduction_mb: 0.0,
throughput_improvement: 40.0,
},
difficulty: Difficulty::Easy,
actions: vec![
"Increase batch size to maximize GPU occupancy".to_string(),
"Use DataLoader with num_workers > 0 to prevent CPU bottleneck".to_string(),
"Enable pin_memory for faster host-to-device transfers".to_string(),
"Use compiled models (torch.compile)".to_string(),
],
                code_example: Some(
                    "# Optimize data loading\n\
                     dataloader = DataLoader(\n\
                     \u{20}\u{20}\u{20}\u{20}dataset,\n\
                     \u{20}\u{20}\u{20}\u{20}batch_size=32,\n\
                     \u{20}\u{20}\u{20}\u{20}num_workers=4,  # Parallel data loading\n\
                     \u{20}\u{20}\u{20}\u{20}pin_memory=True,  # Faster host-to-device transfers\n\
                     )"
                        .to_string(),
                ),
});
}
recommendations
}
fn analyze_batch_size(&self) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
if let Some(last_snapshot) = self.history.last() {
let batch_size = last_snapshot.batch_size;
if batch_size < 16 && self.config.target_hardware != HardwareType::Cpu {
recommendations.push(Recommendation {
category: RecommendationCategory::BatchSize,
priority: Priority::Medium,
confidence: 0.75,
title: "Increase batch size".to_string(),
description: format!(
"Current batch size ({}) is small. Larger batches improve GPU utilization.",
batch_size
),
expected_impact: ImpactEstimate {
speedup: 1.5,
memory_reduction_mb: 0.0,
throughput_improvement: 30.0,
},
difficulty: Difficulty::Easy,
actions: vec![
format!("Increase batch size to {} or higher", batch_size * 2),
"Monitor memory usage to find optimal batch size".to_string(),
"Use gradient accumulation if memory is limited".to_string(),
],
                    code_example: Some(
                        "# Gradient accumulation for an effectively larger batch\n\
                         accumulation_steps = 4\n\
                         for i, batch in enumerate(dataloader):\n\
                         \u{20}\u{20}\u{20}\u{20}loss = model(batch) / accumulation_steps\n\
                         \u{20}\u{20}\u{20}\u{20}loss.backward()\n\
                         \u{20}\u{20}\u{20}\u{20}if (i + 1) % accumulation_steps == 0:\n\
                         \u{20}\u{20}\u{20}\u{20}\u{20}\u{20}\u{20}\u{20}optimizer.step()\n\
                         \u{20}\u{20}\u{20}\u{20}\u{20}\u{20}\u{20}\u{20}optimizer.zero_grad()"
                            .to_string(),
                    ),
});
}
}
recommendations
}
fn analyze_layers(&self) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
if let Some(snapshot) = self.history.last() {
let total_time: f64 = snapshot.layer_timings.values().sum();
for (layer_name, &time) in &snapshot.layer_timings {
let percentage = (time / total_time) * 100.0;
if percentage > 20.0 {
recommendations.push(Recommendation {
category: RecommendationCategory::Layer,
priority: Priority::Medium,
confidence: 0.8,
title: format!("Optimize {} layer", layer_name),
description: format!(
"Layer '{}' takes {:.1}% of total execution time ({:.2}ms). \
Consider layer-specific optimizations.",
layer_name, percentage, time
),
expected_impact: ImpactEstimate {
speedup: 1.2,
memory_reduction_mb: 0.0,
throughput_improvement: 15.0,
},
difficulty: Difficulty::Moderate,
actions: vec![
"Use fused operations for this layer type".to_string(),
"Check if layer can benefit from Flash Attention".to_string(),
"Consider layer pruning if accuracy allows".to_string(),
],
code_example: None,
});
}
}
}
recommendations
}
fn analyze_hardware(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
        let mut effective_hw = snapshots.last().and_then(|s| s.hardware_type).unwrap_or_else(|| {
            if self.config.target_hardware == HardwareType::Auto {
                self.detect_hardware()
            } else {
                self.config.target_hardware
            }
        });
        // A snapshot may still carry `Auto`; resolve it once up front instead
        // of cloning the snapshot history and recursing (detect_hardware()
        // never returns Auto).
        if effective_hw == HardwareType::Auto {
            effective_hw = self.detect_hardware();
        }
let hw_recs: &[(&str, &str)] = match effective_hw {
HardwareType::NvidiaGpu => &[
(
"Enable TF32 matmul for Ampere+ GPUs",
"Enable TF32 matmul for Ampere+ GPUs (torch.backends.cuda.matmul.allow_tf32)",
),
(
"cuDNN deterministic algorithms",
"Consider cuDNN deterministic algorithms for reproducibility (may reduce throughput)",
),
],
HardwareType::AmdGpu => &[
(
"ROCm hipBLAS strided-batched GEMM",
"Enable ROCm hipBLAS strided-batched GEMM for batch inference",
),
(
"bf16 precision with ROCm MI200+",
"Use bf16 precision with ROCm MI200+ for 2x throughput",
),
],
HardwareType::AppleSilicon => &[
(
"Metal Performance Shaders fused kernels",
"Enable Metal Performance Shaders fused kernels via trustformers metal backend",
),
(
"f16 precision on Apple Silicon",
"Use f16 precision on Apple Silicon where model accuracy permits",
),
],
HardwareType::Cpu => &[
(
"Enable AVX-512 via scirs2-core SIMD",
"Enable AVX-512 via scirs2-core SIMD features if not already active",
),
(
"AMX acceleration on Apple M-series",
"Use AMX acceleration on Apple M-series CPUs for matrix operations",
),
],
HardwareType::Tpu => &[
(
"bf16 precision for TPU v4+",
"Use bf16 precision with matmul_precision=highest for TPU v4+",
),
(
"XLA sharding for tensor parallelism",
"Enable XLA sharding for tensor parallelism across TPU cores",
),
],
            // Resolved before the match; detect_hardware() never returns Auto.
            HardwareType::Auto => unreachable!("Auto hardware is resolved above"),
};
for (title, description) in hw_recs {
recommendations.push(Recommendation {
category: RecommendationCategory::Hardware,
priority: Priority::Medium,
confidence: 0.8,
title: (*title).to_string(),
description: (*description).to_string(),
expected_impact: ImpactEstimate {
speedup: 1.15,
memory_reduction_mb: 0.0,
throughput_improvement: 10.0,
},
difficulty: Difficulty::Easy,
actions: vec![(*description).to_string()],
code_example: None,
});
}
recommendations
}
fn analyze_data_loading(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
let window = self.config.data_loading_window.min(snapshots.len());
if window == 0 {
return recommendations;
}
let recent = &snapshots[snapshots.len() - window..];
let (io_sum, io_count) = recent.iter().fold((0.0_f64, 0_usize), |(acc, n), s| {
if let Some(pct) = s.io_wait_pct {
(acc + pct as f64, n + 1)
} else {
(acc, n)
}
});
if io_count > 0 {
let avg_io_wait = io_sum / io_count as f64;
if avg_io_wait > 0.15 {
recommendations.push(Recommendation {
category: RecommendationCategory::DataLoading,
priority: Priority::High,
confidence: 0.82,
title: "Increase data-loader worker count".to_string(),
description: format!(
"Increase data-loader worker count (current I/O wait {:.1}% suggests bottleneck)",
avg_io_wait * 100.0
),
expected_impact: ImpactEstimate {
speedup: 1.2,
memory_reduction_mb: 0.0,
throughput_improvement: 20.0,
},
difficulty: Difficulty::Easy,
actions: vec![
format!(
"Increase data-loader worker count (current I/O wait {:.1}% suggests bottleneck)",
avg_io_wait * 100.0
),
"Enable dataset prefetch to overlap data loading with model computation"
.to_string(),
"Memory-map large weight files to reduce I/O overhead".to_string(),
],
code_example: None,
});
}
}
let last = snapshots.last();
if let Some(snap) = last {
if let (Some(bt), Some(gpt)) = (snap.batch_throughput_per_sec, snap.gpu_peak_throughput)
{
if gpt > 0.0 && (bt / gpt) < 0.5 {
recommendations.push(Recommendation {
category: RecommendationCategory::DataLoading,
priority: Priority::Medium,
confidence: 0.75,
title: "Parallelise tokenization across CPU workers".to_string(),
description:
"Parallelise tokenization across CPU workers to reduce preprocessing bottleneck"
.to_string(),
expected_impact: ImpactEstimate {
speedup: 1.25,
memory_reduction_mb: 0.0,
throughput_improvement: 25.0,
},
difficulty: Difficulty::Moderate,
actions: vec![
"Parallelise tokenization across CPU workers to reduce preprocessing bottleneck"
.to_string(),
"Move data preprocessing to CPU worker pool to better utilise GPU"
.to_string(),
],
code_example: None,
});
}
}
}
recommendations
}
fn analyze_architecture(&self, snapshots: &[PerformanceSnapshot]) -> Vec<Recommendation> {
let mut recommendations = Vec::new();
let seq_snap = snapshots.iter().rev().find(|s| s.seq_len.is_some());
if let Some(snap) = seq_snap {
let seq_len = snap.seq_len.unwrap_or(0);
if seq_len > 1024 {
recommendations.push(Recommendation {
category: RecommendationCategory::Architecture,
priority: Priority::High,
confidence: 0.88,
title: "Enable Flash Attention for long sequences".to_string(),
description: format!(
"Enable Flash Attention for seq_len={} (reduces memory O(n^2) to O(n))",
seq_len
),
expected_impact: ImpactEstimate {
speedup: 1.6,
memory_reduction_mb: 0.0,
throughput_improvement: 30.0,
},
difficulty: Difficulty::Moderate,
actions: vec![format!(
"Enable Flash Attention for seq_len={} (reduces memory O(n^2) to O(n))",
seq_len
)],
code_example: None,
});
}
            if let Some(kv_bytes) = snap.kv_cache_bytes {
                let kv_mb = kv_bytes as f64 / (1024.0 * 1024.0);
                // Compare memory with memory: use observed peak memory (MB) as
                // a proxy for the device memory budget rather than the
                // dimensionally unrelated peak-throughput figure.
                if snap.peak_memory_mb > 0.0 && kv_mb > snap.peak_memory_mb * 0.5 {
                    recommendations.push(Recommendation {
                        category: RecommendationCategory::Architecture,
                        priority: Priority::High,
                        confidence: 0.78,
                        title: "Reduce KV heads using Grouped-Query Attention".to_string(),
                        description: format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA): current KV cache {:.0} MB exceeds 50% of observed peak memory",
                            kv_mb
                        ),
                        expected_impact: ImpactEstimate {
                            speedup: 1.3,
                            memory_reduction_mb: kv_mb * 0.5,
                            throughput_improvement: 20.0,
                        },
                        difficulty: Difficulty::Hard,
                        actions: vec![format!(
                            "Reduce KV heads using Grouped-Query Attention (GQA): current KV cache {:.0} MB exceeds 50% of observed peak memory",
                            kv_mb
                        )],
                        code_example: None,
                    });
                }
            }
if seq_len > 4096 {
recommendations.push(Recommendation {
category: RecommendationCategory::Architecture,
priority: Priority::Medium,
confidence: 0.82,
title: "Consider sliding-window attention".to_string(),
description:
"Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
.to_string(),
expected_impact: ImpactEstimate {
speedup: 1.4,
memory_reduction_mb: 0.0,
throughput_improvement: 25.0,
},
difficulty: Difficulty::Hard,
actions: vec![
"Consider sliding-window attention (Mistral-style) to reduce quadratic memory growth at very long contexts"
.to_string(),
],
code_example: None,
});
}
if let Some(num_heads) = snap.num_heads {
if num_heads > 32 {
recommendations.push(Recommendation {
category: RecommendationCategory::Architecture,
priority: Priority::Medium,
confidence: 0.76,
title: "Multi-Query Attention to reduce KV memory".to_string(),
description: format!(
"Multi-Query Attention (single KV head) could reduce memory by {}x while retaining most accuracy",
num_heads
),
expected_impact: ImpactEstimate {
speedup: 1.2,
memory_reduction_mb: 0.0,
throughput_improvement: 15.0,
},
difficulty: Difficulty::Hard,
actions: vec![format!(
"Multi-Query Attention (single KV head) could reduce memory by {}x while retaining most accuracy",
num_heads
)],
code_example: None,
});
}
}
}
let depth_snap = snapshots.iter().rev().find(|s| s.model_depth.is_some());
if let Some(snap) = depth_snap {
if let Some(depth) = snap.model_depth {
if depth > 48 {
recommendations.push(Recommendation {
category: RecommendationCategory::Architecture,
priority: Priority::Medium,
confidence: 0.84,
title: "Enable gradient checkpointing for deep models".to_string(),
description: format!(
"Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead (current depth: {})",
depth
),
expected_impact: ImpactEstimate {
                            speedup: 0.85,
                            memory_reduction_mb: 0.0,
throughput_improvement: 0.0,
},
difficulty: Difficulty::Easy,
actions: vec![
"Enable gradient checkpointing for models with >48 layers — reduces activation memory at ~33% compute overhead"
.to_string(),
],
code_example: None,
});
}
}
}
recommendations
}
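    // Efficiency score: the mean of GPU utilization (capped at 100) and
    // throughput scaled by 1/10 (capped at 100), so full utilization and
    // 1000 units/s of throughput both contribute a perfect sub-score.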
fn compute_current_performance(&self) -> PerformanceSummary {
let count = self.history.len() as f64;
let avg_time = self.history.iter().map(|s| s.total_time_ms).sum::<f64>() / count;
let avg_memory = self.history.iter().map(|s| s.memory_usage_mb).sum::<f64>() / count;
let avg_throughput = self.history.iter().map(|s| s.throughput).sum::<f64>() / count;
let avg_gpu = self.history.iter().map(|s| s.gpu_utilization).sum::<f64>() / count;
let efficiency = (avg_gpu.min(100.0) + (avg_throughput / 10.0).min(100.0)) / 2.0;
PerformanceSummary {
avg_time_ms: avg_time,
avg_memory_mb: avg_memory,
avg_throughput,
gpu_utilization: avg_gpu,
efficiency_score: efficiency,
}
}
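    /// Combines recommendation impacts into an estimated summary. Under the
    /// rule-based model the overall speedup is additive in the gains,
    /// `1 + sum(multiplier_c * (speedup_r - 1))`; memory reductions subtract
    /// directly, throughput improvements compound as a single percentage, and
    /// GPU utilization and efficiency are bumped by fixed, capped heuristics.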
fn estimate_improved_performance(
&self,
recommendations: &[Recommendation],
) -> PerformanceSummary {
let current = self.compute_current_performance();
let total_speedup: f64 = match &self.config.performance_model {
PerformanceModel::RuleBased { multipliers } => {
recommendations
.iter()
.map(|r| {
let m = multipliers.get(&r.category).copied().unwrap_or(1.0);
m * (r.expected_impact.speedup - 1.0)
})
.sum::<f64>()
+ 1.0
},
};
let total_memory_reduction: f64 =
recommendations.iter().map(|r| r.expected_impact.memory_reduction_mb).sum();
let total_throughput_improvement: f64 =
recommendations.iter().map(|r| r.expected_impact.throughput_improvement).sum();
PerformanceSummary {
avg_time_ms: current.avg_time_ms / total_speedup,
avg_memory_mb: (current.avg_memory_mb - total_memory_reduction).max(0.0),
avg_throughput: current.avg_throughput * (1.0 + total_throughput_improvement / 100.0),
gpu_utilization: (current.gpu_utilization * 1.2).min(95.0),
efficiency_score: (current.efficiency_score * 1.3).min(100.0),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
fn base_snapshot() -> PerformanceSnapshot {
PerformanceSnapshot {
timestamp: 0,
total_time_ms: 100.0,
memory_usage_mb: 1000.0,
peak_memory_mb: 2000.0,
gpu_utilization: 40.0,
throughput: 20.0,
batch_size: 8,
layer_timings: {
let mut t = HashMap::new();
t.insert("attention".to_string(), 60.0);
t.insert("ffn".to_string(), 30.0);
t.insert("other".to_string(), 10.0);
t
},
..Default::default()
}
}
#[test]
fn test_tuner_creation() {
let config = TunerConfig::default();
let _tuner = PerformanceTuner::new(config);
}
#[test]
fn test_snapshot_recording() {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
let snapshot = PerformanceSnapshot {
timestamp: 0,
total_time_ms: 100.0,
memory_usage_mb: 500.0,
peak_memory_mb: 600.0,
gpu_utilization: 75.0,
throughput: 50.0,
batch_size: 16,
layer_timings: HashMap::new(),
layer_memory: HashMap::new(),
..Default::default()
};
tuner.record_snapshot(snapshot);
assert_eq!(tuner.history.len(), 1);
}
#[test]
fn test_analysis_with_data() -> Result<()> {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
for i in 0..10 {
let snapshot = PerformanceSnapshot {
timestamp: i,
..base_snapshot()
};
tuner.record_snapshot(snapshot);
}
let report = tuner.analyze()?;
assert!(!report.recommendations.is_empty());
assert!(report.current_performance.avg_time_ms > 0.0);
assert!(report.estimated_performance.avg_time_ms > 0.0);
Ok(())
}
#[test]
fn test_analyze_hardware_nvidia_produces_hardware_recommendation() -> Result<()> {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
let snapshot = PerformanceSnapshot {
hardware_type: Some(HardwareType::NvidiaGpu),
..base_snapshot()
};
tuner.record_snapshot(snapshot);
let report = tuner.analyze()?;
let hw_recs: Vec<_> = report
.recommendations
.iter()
.filter(|r| r.category == RecommendationCategory::Hardware)
.collect();
assert!(
!hw_recs.is_empty(),
"Expected at least one Hardware recommendation for NvidiaGpu"
);
Ok(())
}
#[test]
fn test_analyze_hardware_apple_silicon() -> Result<()> {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
let snapshot = PerformanceSnapshot {
hardware_type: Some(HardwareType::AppleSilicon),
..base_snapshot()
};
tuner.record_snapshot(snapshot);
let report = tuner.analyze()?;
let hw_recs: Vec<_> = report
.recommendations
.iter()
.filter(|r| r.category == RecommendationCategory::Hardware)
.collect();
assert!(
!hw_recs.is_empty(),
"Expected at least one Hardware recommendation for AppleSilicon"
);
Ok(())
}
#[test]
fn test_analyze_data_loading_high_io_wait() -> Result<()> {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
let snapshot = PerformanceSnapshot {
io_wait_pct: Some(0.3),
gpu_peak_throughput: Some(100.0),
batch_throughput_per_sec: Some(20.0),
..base_snapshot()
};
tuner.record_snapshot(snapshot);
let report = tuner.analyze()?;
let dl_recs: Vec<_> = report
.recommendations
.iter()
.filter(|r| r.category == RecommendationCategory::DataLoading)
.collect();
assert!(
!dl_recs.is_empty(),
"Expected at least one DataLoading recommendation for high I/O wait"
);
Ok(())
}
#[test]
fn test_analyze_architecture_long_context() -> Result<()> {
let mut tuner = PerformanceTuner::new(TunerConfig::default());
let snapshot = PerformanceSnapshot {
seq_len: Some(4096),
..base_snapshot()
};
tuner.record_snapshot(snapshot);
let report = tuner.analyze()?;
let arch_recs: Vec<_> = report
.recommendations
.iter()
.filter(|r| {
r.category == RecommendationCategory::Architecture
&& r.description.contains("Flash Attention")
})
.collect();
assert!(
!arch_recs.is_empty(),
"Expected at least one Architecture recommendation mentioning Flash Attention"
);
Ok(())
}
#[test]
fn test_detected_hardware_returns_non_auto() {
let tuner = PerformanceTuner::new(TunerConfig::default());
let hw = tuner.detected_hardware();
assert_ne!(
hw,
HardwareType::Auto,
"detected_hardware should never return Auto"
);
}
#[test]
fn test_performance_model_rule_based_configurable() -> Result<()> {
let mut multipliers = HashMap::new();
for cat in &[
RecommendationCategory::Memory,
RecommendationCategory::Compute,
RecommendationCategory::BatchSize,
RecommendationCategory::Layer,
RecommendationCategory::Hardware,
RecommendationCategory::DataLoading,
RecommendationCategory::Architecture,
] {
multipliers.insert(*cat, 1.0);
}
multipliers.insert(RecommendationCategory::Memory, 2.0);
let config = TunerConfig {
performance_model: PerformanceModel::RuleBased { multipliers },
..TunerConfig::default()
};
let mut tuner = PerformanceTuner::new(config);
for i in 0..5 {
let snapshot = PerformanceSnapshot {
timestamp: i,
memory_usage_mb: 1000.0,
                peak_memory_mb: 3000.0,
                ..base_snapshot()
};
tuner.record_snapshot(snapshot);
}
let report = tuner.analyze()?;
let default_config = TunerConfig::default();
let mut default_tuner = PerformanceTuner::new(default_config);
for i in 0..5 {
let snapshot = PerformanceSnapshot {
timestamp: i,
memory_usage_mb: 1000.0,
peak_memory_mb: 3000.0,
..base_snapshot()
};
default_tuner.record_snapshot(snapshot);
}
let default_report = default_tuner.analyze()?;
assert!(
report.estimated_performance.avg_time_ms
<= default_report.estimated_performance.avg_time_ms + 1e-6,
"Custom multiplier=2.0 should produce at least as optimistic a time estimate as default"
);
Ok(())
}
}