use crate::error::ModelResult;
use crate::AutoregressiveModel;
use scirs2_core::ndarray::Array1;
use std::time::{Duration, Instant};
/// Aggregate latency statistics computed from a series of per-step timings.
///
/// All latency fields are in microseconds. `Default` is the all-zero result
/// returned for an empty timing set.
#[derive(Debug, Clone, Default)]
pub struct ProfilingResults {
    /// Number of timed steps.
    pub num_steps: usize,
    /// Sum of all step durations.
    pub total_duration: Duration,
    /// Mean per-step latency.
    pub avg_latency_us: f64,
    /// Fastest observed step.
    pub min_latency_us: f64,
    /// Slowest observed step.
    pub max_latency_us: f64,
    /// Median (average of middle two for even counts).
    pub median_latency_us: f64,
    /// 95th-percentile latency.
    pub p95_latency_us: f64,
    /// 99th-percentile latency.
    pub p99_latency_us: f64,
    /// Steps per second over the whole run (0.0 when total time is zero).
    pub throughput_steps_per_sec: f64,
    /// Population standard deviation of the per-step latencies.
    pub std_dev_us: f64,
}

impl ProfilingResults {
    /// Computes summary statistics from raw per-step durations.
    ///
    /// Returns `Self::default()` (all zeros) for an empty slice.
    pub fn from_timings(timings: &[Duration]) -> Self {
        let num_steps = timings.len();
        if num_steps == 0 {
            return Self::default();
        }
        let mut latencies_us: Vec<f64> = timings
            .iter()
            .map(|d| d.as_secs_f64() * 1_000_000.0)
            .collect();
        // partial_cmp can only fail on NaN, which cannot come from a Duration;
        // the Equal fallback is just defensive.
        latencies_us.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let total_duration: Duration = timings.iter().sum();
        let total_us = total_duration.as_secs_f64() * 1_000_000.0;
        let avg_latency_us = total_us / num_steps as f64;
        let min_latency_us = latencies_us[0];
        let max_latency_us = latencies_us[num_steps - 1];
        let median_latency_us = if num_steps % 2 == 0 {
            (latencies_us[num_steps / 2 - 1] + latencies_us[num_steps / 2]) / 2.0
        } else {
            latencies_us[num_steps / 2]
        };
        // Nearest-index percentiles, clamped so a small sample stays in bounds.
        let p95_idx = ((num_steps as f64 * 0.95) as usize).min(num_steps - 1);
        let p95_latency_us = latencies_us[p95_idx];
        let p99_idx = ((num_steps as f64 * 0.99) as usize).min(num_steps - 1);
        let p99_latency_us = latencies_us[p99_idx];
        // Guard the division: on coarse clocks every timing can be zero, which
        // previously produced an infinite throughput.
        let total_secs = total_duration.as_secs_f64();
        let throughput_steps_per_sec = if total_secs > 0.0 {
            num_steps as f64 / total_secs
        } else {
            0.0
        };
        // Population variance (divide by N, not N-1).
        let variance = latencies_us
            .iter()
            .map(|&x| (x - avg_latency_us).powi(2))
            .sum::<f64>()
            / num_steps as f64;
        let std_dev_us = variance.sqrt();
        Self {
            num_steps,
            total_duration,
            avg_latency_us,
            min_latency_us,
            max_latency_us,
            median_latency_us,
            p95_latency_us,
            p99_latency_us,
            throughput_steps_per_sec,
            std_dev_us,
        }
    }

    /// Renders the statistics as a human-readable multi-line report.
    pub fn format_report(&self) -> String {
        format!(
            "Profiling Results:\n\
             ==================\n\
             Steps: {}\n\
             Total Duration: {:.2}ms\n\
             Average Latency: {:.2}μs\n\
             Min Latency: {:.2}μs\n\
             Max Latency: {:.2}μs\n\
             Median Latency: {:.2}μs\n\
             P95 Latency: {:.2}μs\n\
             P99 Latency: {:.2}μs\n\
             Std Dev: {:.2}μs\n\
             Throughput: {:.2} steps/sec\n",
            self.num_steps,
            self.total_duration.as_secs_f64() * 1000.0,
            self.avg_latency_us,
            self.min_latency_us,
            self.max_latency_us,
            self.median_latency_us,
            self.p95_latency_us,
            self.p99_latency_us,
            self.std_dev_us,
            self.throughput_steps_per_sec,
        )
    }
}
/// Wraps a model and measures its single-step inference performance.
///
/// The trait bound lives on the impl block rather than the struct definition,
/// per Rust API guidelines (bounds on structs force them onto every mention
/// of the type without adding expressiveness).
pub struct ModelProfiler<M> {
    model: M,
    // Untimed steps executed before measurement to warm caches/allocations.
    warmup_steps: usize,
}

impl<M: AutoregressiveModel> ModelProfiler<M> {
    /// Creates a profiler with a default of 10 warmup steps.
    pub fn new(model: M) -> Self {
        Self {
            model,
            warmup_steps: 10,
        }
    }

    /// Builder-style setter for the number of untimed warmup steps.
    pub fn warmup_steps(mut self, steps: usize) -> Self {
        self.warmup_steps = steps;
        self
    }

    /// Times `num_steps` calls to `model.step` with a constant all-ones input
    /// of width `input_dim`, after an untimed warmup and a state reset.
    ///
    /// # Errors
    /// Propagates any error returned by `model.step`.
    pub fn profile_inference(
        &mut self,
        num_steps: usize,
        input_dim: usize,
    ) -> ModelResult<ProfilingResults> {
        // Warm up with a zero input, then reset so the measured run starts
        // from fresh model state.
        let warmup_input = Array1::zeros(input_dim);
        for _ in 0..self.warmup_steps {
            let _ = self.model.step(&warmup_input)?;
        }
        self.model.reset();
        let mut timings = Vec::with_capacity(num_steps);
        let input = Array1::from_elem(input_dim, 1.0);
        for _ in 0..num_steps {
            let start = Instant::now();
            let _ = self.model.step(&input)?;
            timings.push(start.elapsed());
        }
        Ok(ProfilingResults::from_timings(&timings))
    }

    /// Profiles the model once per input dimension in `input_dims`, resetting
    /// model state between runs. Returns `(dim, results)` pairs in order.
    pub fn profile_input_scaling(
        &mut self,
        input_dims: &[usize],
        steps_per_dim: usize,
    ) -> ModelResult<Vec<(usize, ProfilingResults)>> {
        let mut results = Vec::with_capacity(input_dims.len());
        for &dim in input_dims {
            self.model.reset();
            let profile = self.profile_inference(steps_per_dim, dim)?;
            results.push((dim, profile));
        }
        Ok(results)
    }

    /// Rough static memory estimate derived from the model's dimensions.
    ///
    /// NOTE(review): assumes 4 bytes per parameter (f32) and ~5 hidden×hidden
    /// weight matrices per layer — a heuristic, not a measurement; confirm
    /// against the actual model layouts.
    pub fn estimate_memory_usage(&self) -> MemoryProfile {
        let hidden_dim = self.model.hidden_dim();
        let state_dim = self.model.state_dim();
        let num_layers = self.model.num_layers();
        let state_bytes_per_layer = hidden_dim * state_dim * 4;
        let total_state_bytes = state_bytes_per_layer * num_layers;
        let weight_estimate = hidden_dim * hidden_dim * 4 * num_layers * 5;
        MemoryProfile {
            hidden_dim,
            state_dim,
            num_layers,
            state_memory_bytes: total_state_bytes,
            estimated_weight_memory_bytes: weight_estimate,
            total_estimated_bytes: total_state_bytes + weight_estimate,
        }
    }

    /// Shared access to the wrapped model.
    pub fn model(&self) -> &M {
        &self.model
    }

    /// Mutable access to the wrapped model.
    pub fn model_mut(&mut self) -> &mut M {
        &mut self.model
    }

    /// Consumes the profiler, returning the wrapped model.
    pub fn into_model(self) -> M {
        self.model
    }
}
/// Static memory-footprint estimate for a profiled model.
///
/// Sizes are raw byte counts; `format_report` converts them to MiB for display.
#[derive(Debug, Clone)]
pub struct MemoryProfile {
    pub hidden_dim: usize,
    pub state_dim: usize,
    pub num_layers: usize,
    pub state_memory_bytes: usize,
    pub estimated_weight_memory_bytes: usize,
    pub total_estimated_bytes: usize,
}

impl MemoryProfile {
    /// Renders the estimate as a human-readable multi-line report.
    pub fn format_report(&self) -> String {
        const MIB: f64 = 1_048_576.0;
        let state_mb = self.state_memory_bytes as f64 / MIB;
        let weight_mb = self.estimated_weight_memory_bytes as f64 / MIB;
        let total_mb = self.total_estimated_bytes as f64 / MIB;
        format!(
            "Memory Profile:\n\
             ===============\n\
             Hidden Dim: {}\n\
             State Dim: {}\n\
             Num Layers: {}\n\
             State Memory: {:.2} MB\n\
             Weight Memory: {:.2} MB (estimated)\n\
             Total Memory: {:.2} MB (estimated)\n",
            self.hidden_dim, self.state_dim, self.num_layers, state_mb, weight_mb, total_mb,
        )
    }
}
/// Builder-style configuration for running a standard inference benchmark.
pub struct BenchmarkSuite {
    num_steps: usize,
    warmup_steps: usize,
    input_dim: usize,
}

impl BenchmarkSuite {
    /// Creates a suite with defaults: 1000 timed steps, 10 warmup steps,
    /// input dimension 1.
    pub fn new() -> Self {
        Self {
            num_steps: 1000,
            warmup_steps: 10,
            input_dim: 1,
        }
    }

    /// Sets how many timed steps to run.
    pub fn num_steps(mut self, steps: usize) -> Self {
        self.num_steps = steps;
        self
    }

    /// Sets how many untimed warmup steps to run first.
    pub fn warmup_steps(mut self, steps: usize) -> Self {
        self.warmup_steps = steps;
        self
    }

    /// Sets the model input dimension used during the benchmark.
    pub fn input_dim(mut self, dim: usize) -> Self {
        self.input_dim = dim;
        self
    }

    /// Profiles `model` with the configured parameters and returns the
    /// resulting latency statistics.
    pub fn benchmark<M: AutoregressiveModel>(&self, model: M) -> ModelResult<ProfilingResults> {
        ModelProfiler::new(model)
            .warmup_steps(self.warmup_steps)
            .profile_inference(self.num_steps, self.input_dim)
    }
}

impl Default for BenchmarkSuite {
    fn default() -> Self {
        Self::new()
    }
}
/// Severity ranking for a detected performance bottleneck.
///
/// The derived `Ord` follows declaration order: `Low < Medium < High < Critical`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
pub enum BottleneckSeverity {
    Low,
    Medium,
    High,
    Critical,
}
/// One detected bottleneck plus suggested remediations.
#[derive(Debug, Clone)]
pub struct BottleneckInfo {
    // Short human-readable label.
    pub name: String,
    // Longer explanation, usually including the measured value.
    pub description: String,
    // Time attributed to this bottleneck (0.0 for non-timing issues such as memory).
    pub estimated_time_us: f64,
    // Share of total step time attributed to this bottleneck (0.0 when not applicable).
    pub percentage_of_total: f64,
    pub severity: BottleneckSeverity,
    // Actionable suggestions for mitigating the bottleneck.
    pub recommendations: Vec<String>,
}
/// Full per-model performance analysis: timing statistics, memory estimate,
/// heuristic bottleneck detection, and a composite score.
#[derive(Debug, Clone)]
pub struct ModelBottleneckAnalysis {
    pub model_name: String,
    pub results: ProfilingResults,
    pub memory: MemoryProfile,
    pub bottlenecks: Vec<BottleneckInfo>,
    /// Composite 0-100 score combining latency, memory, and stability.
    pub performance_score: f64,
}

impl ModelBottleneckAnalysis {
    /// Profiles `model` for `num_steps` steps (input dimension 1) and derives
    /// bottlenecks and a performance score from the measurements.
    ///
    /// # Errors
    /// Propagates any error returned while stepping the model.
    pub fn analyze<M: AutoregressiveModel>(
        model: M,
        model_name: String,
        num_steps: usize,
    ) -> ModelResult<Self> {
        let mut profiler = ModelProfiler::new(model).warmup_steps(10);
        let results = profiler.profile_inference(num_steps, 1)?;
        let memory = profiler.estimate_memory_usage();
        let bottlenecks = Self::identify_bottlenecks(&results, &memory, &model_name);
        let performance_score = Self::calculate_performance_score(&results, &memory);
        Ok(Self {
            model_name,
            results,
            memory,
            bottlenecks,
            performance_score,
        })
    }

    /// Coefficient of variation (std-dev / mean) of the step latency.
    ///
    /// Returns 0.0 when the mean is zero so callers never see NaN (the old
    /// `std_dev / avg` produced NaN for an all-zero timing run, which poisoned
    /// the performance score).
    fn coefficient_of_variation(results: &ProfilingResults) -> f64 {
        if results.avg_latency_us > 0.0 {
            results.std_dev_us / results.avg_latency_us
        } else {
            0.0
        }
    }

    /// Applies threshold heuristics to flag latency, jitter, memory, and
    /// architecture-specific issues.
    fn identify_bottlenecks(
        results: &ProfilingResults,
        memory: &MemoryProfile,
        model_name: &str,
    ) -> Vec<BottleneckInfo> {
        let mut bottlenecks = Vec::new();
        // Latency: flag anything above the 200μs per-step target.
        if results.avg_latency_us > 200.0 {
            let severity = if results.avg_latency_us > 1000.0 {
                BottleneckSeverity::Critical
            } else if results.avg_latency_us > 500.0 {
                BottleneckSeverity::High
            } else {
                BottleneckSeverity::Medium
            };
            bottlenecks.push(BottleneckInfo {
                name: "High average latency".to_string(),
                description: format!(
                    "Average latency of {:.2}μs exceeds target of 200μs",
                    results.avg_latency_us
                ),
                estimated_time_us: results.avg_latency_us,
                percentage_of_total: 100.0,
                severity,
                recommendations: vec![
                    "Consider using SIMD optimizations".to_string(),
                    "Enable parallel processing for multi-head operations".to_string(),
                    "Use cache-friendly memory layouts".to_string(),
                ],
            });
        }
        // Stability: a large coefficient of variation means jittery latency.
        let cv = Self::coefficient_of_variation(results);
        if cv > 0.5 {
            bottlenecks.push(BottleneckInfo {
                name: "High latency variance".to_string(),
                description: format!(
                    "Standard deviation {:.2}μs is {:.1}% of mean, indicating unstable performance",
                    results.std_dev_us,
                    cv * 100.0
                ),
                estimated_time_us: results.std_dev_us,
                percentage_of_total: cv * 100.0,
                severity: if cv > 1.0 {
                    BottleneckSeverity::High
                } else {
                    BottleneckSeverity::Medium
                },
                recommendations: vec![
                    "Investigate cache misses and memory allocation patterns".to_string(),
                    "Use memory pooling to reduce allocation variance".to_string(),
                ],
            });
        }
        // Memory footprint thresholds (estimated, in MiB).
        let memory_mb = memory.total_estimated_bytes as f64 / (1024.0 * 1024.0);
        if memory_mb > 100.0 {
            bottlenecks.push(BottleneckInfo {
                name: "High memory usage".to_string(),
                description: format!("Estimated memory usage of {:.2}MB is high", memory_mb),
                estimated_time_us: 0.0,
                percentage_of_total: 0.0,
                severity: if memory_mb > 500.0 {
                    BottleneckSeverity::High
                } else {
                    BottleneckSeverity::Medium
                },
                recommendations: vec![
                    "Consider quantization (INT8/FP16) to reduce memory footprint".to_string(),
                    "Use sparse representations where applicable".to_string(),
                ],
            });
        }
        // Architecture-specific heuristic for attention-based models.
        if model_name.contains("Transformer") && results.avg_latency_us > 500.0 {
            bottlenecks.push(BottleneckInfo {
                name: "Quadratic attention complexity".to_string(),
                description: "Transformer attention has O(N²) complexity per step".to_string(),
                estimated_time_us: results.avg_latency_us * 0.7,
                percentage_of_total: 70.0,
                severity: BottleneckSeverity::High,
                recommendations: vec![
                    "Consider using linear attention variants (e.g., Performers)".to_string(),
                    "Use Flash Attention for memory-efficient attention".to_string(),
                    "Switch to SSM-based models (Mamba, RWKV) for O(1) inference".to_string(),
                ],
            });
        }
        bottlenecks
    }

    /// Combines latency (50%), memory (30%), and stability (20%) into a
    /// 0-100 score. Each component is a saturating ratio that approaches 100
    /// as the underlying metric approaches zero.
    fn calculate_performance_score(results: &ProfilingResults, memory: &MemoryProfile) -> f64 {
        let latency_score = 100.0 * (100.0 / (results.avg_latency_us + 100.0));
        let memory_mb = memory.total_estimated_bytes as f64 / (1024.0 * 1024.0);
        let memory_score = 100.0 * (50.0 / (memory_mb + 50.0));
        let cv = Self::coefficient_of_variation(results);
        let stability_score = 100.0 * (1.0 / (1.0 + cv));
        (latency_score * 0.5 + memory_score * 0.3 + stability_score * 0.2).min(100.0)
    }

    /// Renders the full analysis (score, timing report, memory report, and
    /// bottleneck list) as a human-readable report.
    pub fn format_report(&self) -> String {
        let mut report = String::new();
        report.push_str("\n═══════════════════════════════════════\n");
        report.push_str(&format!(" {} Analysis Report\n", self.model_name));
        report.push_str("═══════════════════════════════════════\n\n");
        report.push_str(&format!(
            "Performance Score: {:.1}/100\n\n",
            self.performance_score
        ));
        report.push_str(&self.results.format_report());
        report.push('\n');
        report.push_str(&self.memory.format_report());
        report.push('\n');
        if self.bottlenecks.is_empty() {
            report.push_str("✓ No significant bottlenecks identified!\n");
        } else {
            report.push_str(&format!(
                "⚠ {} Bottleneck(s) Identified:\n\n",
                self.bottlenecks.len()
            ));
            for (i, bottleneck) in self.bottlenecks.iter().enumerate() {
                let severity_icon = match bottleneck.severity {
                    BottleneckSeverity::Low => "ℹ",
                    BottleneckSeverity::Medium => "⚠",
                    BottleneckSeverity::High => "⚠⚠",
                    BottleneckSeverity::Critical => "🔥",
                };
                report.push_str(&format!(
                    "{}. {} {}\n",
                    i + 1,
                    severity_icon,
                    bottleneck.name
                ));
                report.push_str(&format!("   {}\n", bottleneck.description));
                // Timing details only make sense for time-attributed bottlenecks.
                if bottleneck.estimated_time_us > 0.0 {
                    report.push_str(&format!(
                        "   Time: {:.2}μs ({:.1}% of total)\n",
                        bottleneck.estimated_time_us, bottleneck.percentage_of_total
                    ));
                }
                report.push_str("   Recommendations:\n");
                for rec in &bottleneck.recommendations {
                    report.push_str(&format!("   • {}\n", rec));
                }
                report.push('\n');
            }
        }
        report
    }
}
/// Side-by-side comparison of several model analyses plus category winners.
#[derive(Debug, Clone)]
pub struct ComprehensiveComparison {
    pub analyses: Vec<ModelBottleneckAnalysis>,
    pub fastest_model: String,
    pub most_memory_efficient: String,
    pub best_overall: String,
}

impl ComprehensiveComparison {
    /// Renders the comparison table, the category winners, and every model's
    /// detailed bottleneck report as one string.
    pub fn format_report(&self) -> String {
        let mut out = String::from("\n");
        // Header banner and table top.
        out.push_str("╔═══════════════════════════════════════════════════════════════╗\n");
        out.push_str("║          COMPREHENSIVE MODEL PERFORMANCE COMPARISON           ║\n");
        out.push_str("╚═══════════════════════════════════════════════════════════════╝\n\n");
        out.push_str("┌─────────────┬──────────────┬──────────────┬──────────────┬────────┐\n");
        out.push_str("│ Model       │ Avg Latency  │ Throughput   │ Memory (MB)  │ Score  │\n");
        out.push_str("├─────────────┼──────────────┼──────────────┼──────────────┼────────┤\n");
        // One table row per analyzed model.
        for analysis in &self.analyses {
            let memory_mb = analysis.memory.total_estimated_bytes as f64 / (1024.0 * 1024.0);
            let row = format!(
                "│ {:11} │ {:9.2} μs │ {:9.1} /s │ {:11.2} │ {:5.1} │\n",
                analysis.model_name,
                analysis.results.avg_latency_us,
                analysis.results.throughput_steps_per_sec,
                memory_mb,
                analysis.performance_score
            );
            out.push_str(&row);
        }
        out.push_str("└─────────────┴──────────────┴──────────────┴──────────────┴────────┘\n\n");
        // Category winners.
        out.push_str(&format!("🏆 Fastest Model: {}\n", self.fastest_model));
        out.push_str(&format!(
            "💾 Most Memory Efficient: {}\n",
            self.most_memory_efficient
        ));
        out.push_str(&format!("⭐ Best Overall: {}\n\n", self.best_overall));
        // Per-model detail section.
        out.push_str("═══════════════════════════════════════════════════════════════\n");
        out.push_str("  DETAILED BOTTLENECK ANALYSES\n");
        out.push_str("═══════════════════════════════════════════════════════════════\n");
        for analysis in &self.analyses {
            out.push_str(&analysis.format_report());
        }
        out
    }
}
/// Runs the full benchmark + bottleneck analysis across every model family
/// in the crate and picks category winners.
pub struct ComprehensiveProfiler {
    // Timed steps per model.
    num_steps: usize,
}

impl ComprehensiveProfiler {
    /// Creates a profiler that times 1000 steps per model.
    pub fn new() -> Self {
        Self { num_steps: 1000 }
    }

    /// Builder-style setter for the number of timed steps per model.
    pub fn num_steps(mut self, steps: usize) -> Self {
        self.num_steps = steps;
        self
    }

    /// Builds one instance of each model family with a shared size
    /// (hidden 256, 4 layers, state 64), analyzes each, and selects the
    /// fastest, most memory-efficient, and best-overall models.
    ///
    /// # Errors
    /// Propagates any model-construction or inference error.
    pub fn profile_all_models(&self) -> ModelResult<ComprehensiveComparison> {
        use crate::{mamba::*, mamba2::*, rwkv::*, s4::*, s5::*, transformer::*};
        let mut analyses = Vec::new();
        // Shared architecture hyperparameters so the comparison is apples-to-apples.
        let hidden_dim = 256;
        let num_layers = 4;
        let state_dim = 64;
        // Mamba
        let mamba_config = MambaConfig::default()
            .hidden_dim(hidden_dim)
            .state_dim(state_dim)
            .num_layers(num_layers);
        let mamba = Mamba::new(mamba_config)?;
        let mamba_analysis =
            ModelBottleneckAnalysis::analyze(mamba, "Mamba".to_string(), self.num_steps)?;
        analyses.push(mamba_analysis);
        // Mamba2 (multi-head variant)
        let mamba2_config = Mamba2Config::default()
            .hidden_dim(hidden_dim)
            .state_dim(state_dim)
            .num_layers(num_layers)
            .num_heads(4);
        let mamba2 = Mamba2::new(mamba2_config)?;
        let mamba2_analysis =
            ModelBottleneckAnalysis::analyze(mamba2, "Mamba2".to_string(), self.num_steps)?;
        analyses.push(mamba2_analysis);
        // RWKV (no explicit state_dim in its config)
        let rwkv_config = RwkvConfig::default()
            .hidden_dim(hidden_dim)
            .num_layers(num_layers)
            .num_heads(4);
        let rwkv = Rwkv::new(rwkv_config)?;
        let rwkv_analysis =
            ModelBottleneckAnalysis::analyze(rwkv, "RWKV".to_string(), self.num_steps)?;
        analyses.push(rwkv_analysis);
        // S4 (diagonal variant)
        let s4_config = S4Config::default()
            .hidden_dim(hidden_dim)
            .state_dim(state_dim)
            .num_layers(num_layers);
        let s4 = S4D::new(s4_config)?;
        let s4_analysis = ModelBottleneckAnalysis::analyze(s4, "S4D".to_string(), self.num_steps)?;
        analyses.push(s4_analysis);
        // S5 (constructor-style config; input dim 1)
        let s5_config = S5Config::new(1, hidden_dim, num_layers);
        let s5 = S5::new(s5_config)?;
        let s5_analysis = ModelBottleneckAnalysis::analyze(s5, "S5".to_string(), self.num_steps)?;
        analyses.push(s5_analysis);
        // Transformer baseline
        let transformer_config = TransformerConfig::default()
            .hidden_dim(hidden_dim)
            .num_heads(4)
            .num_layers(num_layers);
        let transformer = Transformer::new(transformer_config)?;
        let transformer_analysis = ModelBottleneckAnalysis::analyze(
            transformer,
            "Transformer".to_string(),
            self.num_steps,
        )?;
        analyses.push(transformer_analysis);
        // Category winners; NaN-safe comparisons fall back to Equal.
        let fastest_model = analyses
            .iter()
            .min_by(|a, b| {
                a.results
                    .avg_latency_us
                    .partial_cmp(&b.results.avg_latency_us)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|a| a.model_name.clone())
            .unwrap_or_default();
        let most_memory_efficient = analyses
            .iter()
            .min_by(|a, b| {
                a.memory
                    .total_estimated_bytes
                    .cmp(&b.memory.total_estimated_bytes)
            })
            .map(|a| a.model_name.clone())
            .unwrap_or_default();
        let best_overall = analyses
            .iter()
            .max_by(|a, b| {
                a.performance_score
                    .partial_cmp(&b.performance_score)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .map(|a| a.model_name.clone())
            .unwrap_or_default();
        Ok(ComprehensiveComparison {
            analyses,
            fastest_model,
            most_memory_efficient,
            best_overall,
        })
    }
}

impl Default for ComprehensiveProfiler {
    fn default() -> Self {
        Self::new()
    }
}
/// Running total/min/max over a stream of recorded durations.
#[derive(Debug, Default, Clone)]
pub struct TimingAccumulator {
    /// Sum of all recorded durations.
    pub total: Duration,
    /// Number of recordings.
    pub count: u64,
    /// Smallest recorded duration, `None` before any recording.
    pub min: Option<Duration>,
    /// Largest recorded duration, `None` before any recording.
    pub max: Option<Duration>,
}

impl TimingAccumulator {
    /// Folds one measurement into the running statistics.
    pub fn record(&mut self, elapsed: Duration) {
        self.total += elapsed;
        self.count += 1;
        self.min = Some(self.min.map_or(elapsed, |m| m.min(elapsed)));
        self.max = Some(self.max.map_or(elapsed, |m| m.max(elapsed)));
    }

    /// Mean recorded duration (truncated to whole nanoseconds), or `None`
    /// before anything was recorded.
    pub fn mean(&self) -> Option<Duration> {
        if self.count == 0 {
            return None;
        }
        // Divide in u128 nanoseconds: the previous `self.count as u32` cast
        // wrapped for counts above u32::MAX (and could even wrap to zero,
        // panicking on division).
        let nanos = self.total.as_nanos() / u128::from(self.count);
        Some(Duration::new(
            (nanos / 1_000_000_000) as u64,
            (nanos % 1_000_000_000) as u32,
        ))
    }

    /// Items processed per second over the accumulated time, or 0.0 when no
    /// time has been recorded.
    pub fn throughput_per_sec(&self, items: u64) -> f64 {
        let secs = self.total.as_secs_f64();
        if secs == 0.0 {
            return 0.0;
        }
        items as f64 / secs
    }
}

/// Named timing accumulators gated behind an enable flag, so instrumented
/// code pays almost nothing while profiling is off.
#[derive(Debug, Default)]
pub struct ProfilingRegistry {
    timings: std::collections::HashMap<String, TimingAccumulator>,
    // Disabled by default; `record` is a no-op until `enable` is called.
    enabled: bool,
}

impl ProfilingRegistry {
    /// Creates a disabled, empty registry.
    pub fn new() -> Self {
        Self::default()
    }

    /// Turns recording on.
    pub fn enable(&mut self) {
        self.enabled = true;
    }

    /// Turns recording off (existing data is kept).
    pub fn disable(&mut self) {
        self.enabled = false;
    }

    /// Whether `record` currently stores measurements.
    pub fn is_enabled(&self) -> bool {
        self.enabled
    }

    /// Adds `elapsed` to the accumulator named `name`; no-op when disabled.
    pub fn record(&mut self, name: &str, elapsed: Duration) {
        if !self.enabled {
            return;
        }
        self.timings
            .entry(name.to_string())
            .or_default()
            .record(elapsed);
    }

    /// Looks up the accumulator for `name`, if any measurements were recorded.
    pub fn get(&self, name: &str) -> Option<&TimingAccumulator> {
        self.timings.get(name)
    }

    /// Discards all recorded data (the enabled flag is unchanged).
    pub fn reset(&mut self) {
        self.timings.clear();
    }

    /// Snapshot of all accumulators, sorted by name for stable output.
    pub fn summary(&self) -> Vec<(String, TimingAccumulator)> {
        let mut entries: Vec<_> = self
            .timings
            .iter()
            .map(|(k, v)| (k.clone(), v.clone()))
            .collect();
        entries.sort_by(|(a, _), (b, _)| a.cmp(b));
        entries
    }
}
/// Scope timer: starts on construction and records the elapsed time into the
/// registry when dropped.
pub struct TimingGuard<'a> {
    registry: &'a mut ProfilingRegistry,
    name: String,
    start: Instant,
}

impl<'a> TimingGuard<'a> {
    /// Begins timing an operation called `name` against `registry`.
    pub fn new(registry: &'a mut ProfilingRegistry, name: impl Into<String>) -> Self {
        let start = Instant::now();
        let name = name.into();
        Self {
            registry,
            name,
            start,
        }
    }
}

impl Drop for TimingGuard<'_> {
    fn drop(&mut self) {
        // Recording respects the registry's enabled flag.
        self.registry.record(&self.name, self.start.elapsed());
    }
}
/// Thread-safe, cheaply clonable handle around a `ProfilingRegistry`.
///
/// All operations silently do nothing if the inner mutex is poisoned.
#[derive(Clone, Default)]
pub struct SharedProfilingRegistry(std::sync::Arc<std::sync::Mutex<ProfilingRegistry>>);

impl SharedProfilingRegistry {
    /// Creates a new, disabled shared registry.
    pub fn new() -> Self {
        Self::default()
    }

    /// Runs `f` on the locked registry; returns `None` on a poisoned lock.
    fn with<R>(&self, f: impl FnOnce(&mut ProfilingRegistry) -> R) -> Option<R> {
        self.0.lock().ok().map(|mut guard| f(&mut guard))
    }

    /// Turns recording on.
    pub fn enable(&self) {
        let _ = self.with(|r| r.enable());
    }

    /// Turns recording off.
    pub fn disable(&self) {
        let _ = self.with(|r| r.disable());
    }

    /// Records one measurement under `name` (no-op while disabled).
    pub fn record(&self, name: &str, elapsed: Duration) {
        let _ = self.with(|r| r.record(name, elapsed));
    }

    /// Name-sorted snapshot of all accumulators (empty on a poisoned lock).
    pub fn summary(&self) -> Vec<(String, TimingAccumulator)> {
        self.with(|r| r.summary()).unwrap_or_default()
    }

    /// Discards all recorded data.
    pub fn reset(&self) {
        let _ = self.with(|r| r.reset());
    }
}

impl std::fmt::Debug for SharedProfilingRegistry {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Intentionally opaque: the inner registry would require taking the lock.
        f.debug_tuple("SharedProfilingRegistry").finish()
    }
}
/// Times `$block` and records the elapsed duration under `$name` into
/// `$registry` (anything with a `record(&str, Duration)` method, e.g.
/// `ProfilingRegistry` or `SharedProfilingRegistry`).
///
/// Expands to an expression that evaluates to the value of `$block`.
#[macro_export]
macro_rules! time_op {
    ($registry:expr, $name:expr, $block:expr) => {{
        let _start = std::time::Instant::now();
        let result = $block;
        $registry.record($name, _start.elapsed());
        result
    }};
}
#[cfg(test)]
mod tests {
    //! Unit tests. Tests that build full Mamba models are `#[ignore]`d because
    //! model construction/inference is too slow for the default test run.
    use super::*;
    use crate::mamba::{Mamba, MambaConfig};

    // Statistics from a small, hand-checkable set of timings.
    #[test]
    fn test_profiling_results() {
        let timings = vec![
            Duration::from_micros(100),
            Duration::from_micros(150),
            Duration::from_micros(120),
            Duration::from_micros(200),
            Duration::from_micros(110),
        ];
        let results = ProfilingResults::from_timings(&timings);
        assert_eq!(results.num_steps, 5);
        assert!(results.avg_latency_us > 0.0);
        assert!(results.min_latency_us <= results.avg_latency_us);
        assert!(results.avg_latency_us <= results.max_latency_us);
        assert!(results.throughput_steps_per_sec > 0.0);
    }

    // Ignored: profiles a real Mamba model (slow).
    #[test]
    #[ignore]
    fn test_model_profiler() {
        let config = MambaConfig::default().hidden_dim(64).num_layers(2);
        let model = Mamba::new(config).expect("Failed to create Mamba model");
        let mut profiler = ModelProfiler::new(model).warmup_steps(5);
        let results = profiler
            .profile_inference(100, 1)
            .expect("Failed to profile inference");
        assert_eq!(results.num_steps, 100);
        assert!(results.avg_latency_us > 0.0);
        assert!(results.throughput_steps_per_sec > 0.0);
    }

    // Ignored: builds several Mamba models of increasing size (slow).
    #[test]
    #[ignore]
    fn test_input_scaling() {
        let hidden_dims = vec![32, 64, 128];
        let mut results = Vec::new();
        for hidden_dim in hidden_dims {
            let config = MambaConfig::default().hidden_dim(hidden_dim).num_layers(2);
            let model = Mamba::new(config).expect("Failed to create Mamba model");
            let mut profiler = ModelProfiler::new(model).warmup_steps(5);
            let profile = profiler
                .profile_inference(50, 1)
                .expect("Failed to profile inference");
            results.push((hidden_dim, profile));
        }
        assert_eq!(results.len(), 3);
        for (dim, result) in &results {
            assert_eq!(result.num_steps, 50);
            assert!(*dim > 0);
            assert!(result.avg_latency_us > 0.0);
        }
    }

    // Memory estimation is pure arithmetic on model dimensions — no inference needed.
    #[test]
    fn test_memory_profile() {
        let config = MambaConfig::default().hidden_dim(256).num_layers(4);
        let model = Mamba::new(config).expect("Failed to create Mamba model");
        let profiler = ModelProfiler::new(model);
        let memory = profiler.estimate_memory_usage();
        assert_eq!(memory.hidden_dim, 256);
        assert_eq!(memory.num_layers, 4);
        assert!(memory.total_estimated_bytes > 0);
    }

    // Ignored: runs a full benchmark against a real model (slow).
    #[test]
    #[ignore]
    fn test_benchmark_suite() {
        let config = MambaConfig::default().hidden_dim(64).num_layers(2);
        let model = Mamba::new(config).expect("Failed to create Mamba model");
        let suite = BenchmarkSuite::new().num_steps(100).warmup_steps(5);
        let results = suite.benchmark(model).expect("Failed to run benchmark");
        assert_eq!(results.num_steps, 100);
        assert!(results.avg_latency_us > 0.0);
    }

    #[test]
    fn test_format_report() {
        let timings = vec![Duration::from_micros(100); 10];
        let results = ProfilingResults::from_timings(&timings);
        let report = results.format_report();
        assert!(report.contains("Profiling Results"));
        assert!(report.contains("Average Latency"));
        assert!(report.contains("Throughput"));
    }

    #[test]
    fn test_timing_accumulator_record() {
        let mut acc = TimingAccumulator::default();
        acc.record(Duration::from_millis(10));
        acc.record(Duration::from_millis(20));
        assert_eq!(acc.count, 2);
        assert_eq!(acc.total, Duration::from_millis(30));
        assert_eq!(acc.mean(), Some(Duration::from_millis(15)));
    }

    #[test]
    fn test_timing_accumulator_empty_mean() {
        let acc = TimingAccumulator::default();
        assert_eq!(acc.mean(), None);
    }

    #[test]
    fn test_timing_accumulator_min_max() {
        let mut acc = TimingAccumulator::default();
        acc.record(Duration::from_millis(5));
        acc.record(Duration::from_millis(15));
        acc.record(Duration::from_millis(10));
        assert_eq!(acc.min, Some(Duration::from_millis(5)));
        assert_eq!(acc.max, Some(Duration::from_millis(15)));
    }

    // Recording before enable() must be a silent no-op.
    #[test]
    fn test_profiling_registry_disabled_by_default() {
        let mut reg = ProfilingRegistry::new();
        reg.record("op", Duration::from_millis(1));
        assert!(reg.get("op").is_none());
    }

    #[test]
    fn test_profiling_registry_enabled() {
        let mut reg = ProfilingRegistry::new();
        reg.enable();
        reg.record("op", Duration::from_millis(5));
        assert!(reg.get("op").is_some());
    }

    #[test]
    fn test_profiling_registry_reset() {
        let mut reg = ProfilingRegistry::new();
        reg.enable();
        reg.record("op", Duration::from_millis(5));
        assert!(reg.get("op").is_some());
        reg.reset();
        assert!(reg.get("op").is_none());
    }

    // summary() sorts by name regardless of insertion order.
    #[test]
    fn test_profiling_registry_summary_sorted() {
        let mut reg = ProfilingRegistry::new();
        reg.enable();
        reg.record("b_op", Duration::from_millis(1));
        reg.record("a_op", Duration::from_millis(2));
        let summary = reg.summary();
        assert_eq!(summary.len(), 2);
        assert_eq!(summary[0].0, "a_op");
        assert_eq!(summary[1].0, "b_op");
    }

    #[test]
    fn test_shared_profiling_registry() {
        let registry = SharedProfilingRegistry::new();
        registry.enable();
        registry.record("test_op", Duration::from_micros(100));
        let summary = registry.summary();
        assert!(!summary.is_empty());
    }

    #[test]
    fn test_shared_registry_disabled_by_default() {
        let registry = SharedProfilingRegistry::new();
        registry.record("op", Duration::from_millis(1));
        assert!(registry.summary().is_empty());
    }

    // The guard records exactly once, when it goes out of scope.
    #[test]
    fn test_timing_guard_records_on_drop() {
        let mut reg = ProfilingRegistry::new();
        reg.enable();
        {
            let _guard = TimingGuard::new(&mut reg, "guarded_op");
            std::thread::sleep(Duration::from_millis(1));
        }
        assert!(reg.get("guarded_op").is_some());
        let acc = reg.get("guarded_op").expect("accumulator must exist");
        assert_eq!(acc.count, 1);
    }
}