tenflowers_core/
simplified_benchmarks.rs

// Simplified performance benchmarks for TenfloweRS.
// Validates performance optimizations using only the capabilities the current codebase provides.
3
4use crate::{Device, Result, Tensor};
5use std::collections::HashMap;
6use std::time::Instant;
7
/// Configuration for the simplified benchmark suite.
///
/// Implements `PartialEq`/`Eq` so that configurations can be compared directly,
/// for example in tests or when checking whether a custom setup differs from
/// the default.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SimpleBenchmarkConfig {
    /// Number of untimed iterations executed before each measurement.
    pub warmup_iterations: usize,
    /// Number of timed iterations; reported times are averaged over this count.
    pub benchmark_iterations: usize,
    /// Tensor shapes to benchmark. The suite only benchmarks entries with
    /// exactly two dimensions (`[rows, cols]`); other entries are skipped.
    pub test_sizes: Vec<Vec<usize>>,
}

impl Default for SimpleBenchmarkConfig {
    /// Returns 5 warmup iterations, 20 timed iterations and three square
    /// shapes with 256, 512 and 1024 elements per side.
    fn default() -> Self {
        Self {
            warmup_iterations: 5,
            benchmark_iterations: 20,
            test_sizes: vec![
                vec![256, 256],   // Small tensors
                vec![512, 512],   // Medium tensors
                vec![1024, 1024], // Large tensors
            ],
        }
    }
}
29
/// Measurements recorded for a single benchmark case.
///
/// Implements `PartialEq` so results can be compared in tests and reports.
#[derive(Debug, Clone, PartialEq)]
pub struct SimpleBenchmarkResult {
    /// Name of the benchmark case, e.g. `MatMul_512x512`; also used as the
    /// key under which the result is stored.
    pub benchmark_name: String,
    /// Average wall-clock time of one timed iteration, in milliseconds.
    pub execution_time_ms: f64,
    /// Throughput of the case.
    ///
    /// NOTE(review): the unit depends on which benchmark produced the value:
    /// tensor creation stores tensors created per second, element-wise add
    /// stores elements per second, and matmul stores GFLOP/s. Values from
    /// different benchmark kinds are therefore not directly comparable.
    pub throughput_ops_per_sec: f64,
    /// Size of the tensors involved, in units of 2^20 bytes. This is computed
    /// from the shape assuming 4-byte elements; it is not a measured value.
    pub memory_usage_mb: f64,
}
38
39/// Simple benchmarking suite
40pub struct SimpleBenchmarkSuite {
41    config: SimpleBenchmarkConfig,
42    results: HashMap<String, SimpleBenchmarkResult>,
43}
44
45impl SimpleBenchmarkSuite {
46    /// Create new benchmark suite
47    pub fn new(config: SimpleBenchmarkConfig) -> Self {
48        Self {
49            config,
50            results: HashMap::new(),
51        }
52    }
53
54    /// Run comprehensive benchmarks
55    pub fn run_benchmarks(&mut self) -> Result<BenchmarkReport> {
56        println!("🚀 Running TenfloweRS Performance Benchmarks...");
57
58        // Test basic tensor operations
59        self.benchmark_tensor_creation()?;
60        self.benchmark_element_wise_operations()?;
61        self.benchmark_matrix_operations()?;
62
63        // Generate report
64        Ok(BenchmarkReport {
65            results: self.results.clone(),
66            summary: self.generate_summary(),
67        })
68    }
69
70    /// Benchmark tensor creation performance
71    fn benchmark_tensor_creation(&mut self) -> Result<()> {
72        println!("  📊 Benchmarking tensor creation...");
73
74        for size in &self.config.test_sizes {
75            if size.len() == 2 {
76                let rows = size[0];
77                let cols = size[1];
78
79                // Warmup
80                for _ in 0..self.config.warmup_iterations {
81                    let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
82                }
83
84                // Benchmark
85                let start = Instant::now();
86                for _ in 0..self.config.benchmark_iterations {
87                    let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
88                }
89                let elapsed = start.elapsed();
90
91                let avg_time_ms =
92                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
93                let ops_per_sec = 1000.0 / avg_time_ms;
94                let memory_mb = (rows * cols * 4) as f64 / (1024.0 * 1024.0); // Assuming f32
95
96                let result = SimpleBenchmarkResult {
97                    benchmark_name: format!("TensorCreation_{}x{}", rows, cols),
98                    execution_time_ms: avg_time_ms,
99                    throughput_ops_per_sec: ops_per_sec,
100                    memory_usage_mb: memory_mb,
101                };
102
103                self.results.insert(result.benchmark_name.clone(), result);
104            }
105        }
106
107        Ok(())
108    }
109
110    /// Benchmark element-wise operations
111    fn benchmark_element_wise_operations(&mut self) -> Result<()> {
112        println!("  ⚡ Benchmarking element-wise operations...");
113
114        for size in &self.config.test_sizes {
115            if size.len() == 2 {
116                let rows = size[0];
117                let cols = size[1];
118
119                let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
120                let b: Tensor<f32> = Tensor::ones(&[rows, cols]);
121
122                // Warmup
123                for _ in 0..self.config.warmup_iterations {
124                    let _ = a.add(&b)?;
125                }
126
127                // Benchmark addition
128                let start = Instant::now();
129                for _ in 0..self.config.benchmark_iterations {
130                    let _ = a.add(&b)?;
131                }
132                let elapsed = start.elapsed();
133
134                let avg_time_ms =
135                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
136                let elements_per_sec = (rows * cols) as f64 / (avg_time_ms / 1000.0);
137                let memory_mb = (rows * cols * 3 * 4) as f64 / (1024.0 * 1024.0); // 3 tensors * f32
138
139                let result = SimpleBenchmarkResult {
140                    benchmark_name: format!("ElementwiseAdd_{}x{}", rows, cols),
141                    execution_time_ms: avg_time_ms,
142                    throughput_ops_per_sec: elements_per_sec,
143                    memory_usage_mb: memory_mb,
144                };
145
146                self.results.insert(result.benchmark_name.clone(), result);
147            }
148        }
149
150        Ok(())
151    }
152
153    /// Benchmark matrix operations
154    fn benchmark_matrix_operations(&mut self) -> Result<()> {
155        println!("  🔢 Benchmarking matrix operations...");
156
157        for size in &self.config.test_sizes {
158            if size.len() == 2 {
159                let rows = size[0];
160                let cols = size[1];
161
162                let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
163                let b: Tensor<f32> = Tensor::ones(&[cols, rows]);
164
165                // Warmup
166                for _ in 0..self.config.warmup_iterations {
167                    let _ = a.matmul(&b)?;
168                }
169
170                // Benchmark matrix multiplication
171                let start = Instant::now();
172                for _ in 0..self.config.benchmark_iterations {
173                    let _ = a.matmul(&b)?;
174                }
175                let elapsed = start.elapsed();
176
177                let avg_time_ms =
178                    elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
179                let flops = (rows * cols * cols * 2) as f64; // Multiply-accumulate operations
180                let gflops_per_sec = flops / (avg_time_ms / 1000.0) / 1e9;
181                let memory_mb = (rows * cols * 2 + rows * rows) as f64 * 4.0 / (1024.0 * 1024.0);
182
183                let result = SimpleBenchmarkResult {
184                    benchmark_name: format!("MatMul_{}x{}", rows, cols),
185                    execution_time_ms: avg_time_ms,
186                    throughput_ops_per_sec: gflops_per_sec,
187                    memory_usage_mb: memory_mb,
188                };
189
190                self.results.insert(result.benchmark_name.clone(), result);
191            }
192        }
193
194        Ok(())
195    }
196
197    /// Generate benchmark summary
198    fn generate_summary(&self) -> BenchmarkSummary {
199        let mut total_time = 0.0f64;
200        let mut max_throughput = 0.0f64;
201        let mut total_memory = 0.0f64;
202
203        for result in self.results.values() {
204            total_time += result.execution_time_ms;
205            max_throughput = max_throughput.max(result.throughput_ops_per_sec);
206            total_memory += result.memory_usage_mb;
207        }
208
209        BenchmarkSummary {
210            total_benchmarks: self.results.len(),
211            average_execution_time_ms: total_time / self.results.len() as f64,
212            peak_throughput: max_throughput,
213            total_memory_usage_mb: total_memory,
214            performance_score: Self::calculate_performance_score(&self.results),
215        }
216    }
217
218    /// Calculate overall performance score
219    fn calculate_performance_score(results: &HashMap<String, SimpleBenchmarkResult>) -> f64 {
220        let mut score = 0.0;
221        let mut count = 0;
222
223        for result in results.values() {
224            // Simple scoring based on throughput (higher is better)
225            let normalized_score = (result.throughput_ops_per_sec / 1e6).min(10.0); // Cap at 10
226            score += normalized_score;
227            count += 1;
228        }
229
230        if count > 0 {
231            score / count as f64
232        } else {
233            0.0
234        }
235    }
236}
237
238/// Benchmark report
239#[derive(Debug, Clone)]
240pub struct BenchmarkReport {
241    pub results: HashMap<String, SimpleBenchmarkResult>,
242    pub summary: BenchmarkSummary,
243}
244
/// Aggregate figures computed over all results of a benchmark run.
///
/// Implements `PartialEq` so summaries can be compared in tests.
#[derive(Debug, Clone, PartialEq)]
pub struct BenchmarkSummary {
    /// Number of individual benchmark results that were aggregated.
    pub total_benchmarks: usize,
    /// Mean of the per-benchmark average execution times, in milliseconds.
    pub average_execution_time_ms: f64,
    /// Largest throughput value found among the results.
    pub peak_throughput: f64,
    /// Sum of the per-benchmark memory figures, in MB.
    pub total_memory_usage_mb: f64,
    /// Overall score; the report prints it on a scale of 10.
    pub performance_score: f64,
}
254
255impl BenchmarkReport {
256    /// Print comprehensive report
257    pub fn print_report(&self) {
258        println!("\n🎯 === TENFLOWERS PERFORMANCE BENCHMARK REPORT ===");
259
260        println!("\n📊 Individual Benchmark Results:");
261        for (name, result) in &self.results {
262            println!(
263                "  {} - {:.2}ms, {:.1} ops/s, {:.1} MB",
264                name,
265                result.execution_time_ms,
266                result.throughput_ops_per_sec,
267                result.memory_usage_mb
268            );
269        }
270
271        println!("\n📈 Summary:");
272        println!("  Total Benchmarks: {}", self.summary.total_benchmarks);
273        println!(
274            "  Average Execution Time: {:.2}ms",
275            self.summary.average_execution_time_ms
276        );
277        println!(
278            "  Peak Throughput: {:.1} ops/s",
279            self.summary.peak_throughput
280        );
281        println!(
282            "  Total Memory Usage: {:.1} MB",
283            self.summary.total_memory_usage_mb
284        );
285        println!(
286            "  Performance Score: {:.2}/10",
287            self.summary.performance_score
288        );
289
290        // Performance evaluation
291        match self.summary.performance_score {
292            score if score >= 8.0 => println!("  ✅ Excellent Performance!"),
293            score if score >= 6.0 => println!("  ✅ Good Performance"),
294            score if score >= 4.0 => println!("  ⚠️  Moderate Performance"),
295            _ => println!("  ❌ Performance Needs Improvement"),
296        }
297    }
298}
299
300/// Run simple benchmarks (convenience function)
301pub fn run_simple_benchmarks() -> Result<BenchmarkReport> {
302    let config = SimpleBenchmarkConfig::default();
303    let mut suite = SimpleBenchmarkSuite::new(config);
304    suite.run_benchmarks()
305}
306
307/// Validate optimization effectiveness
308pub fn validate_optimizations() -> Result<()> {
309    println!("🔍 === OPTIMIZATION VALIDATION ===");
310
311    // Test CPU performance
312    println!("⚡ Testing CPU Performance...");
313    let _cpu_device = Device::Cpu;
314    let a: Tensor<f32> = Tensor::ones(&[1000, 1000]);
315    let b: Tensor<f32> = Tensor::ones(&[1000, 1000]);
316
317    let start = Instant::now();
318    let _result = a.matmul(&b)?;
319    let cpu_time = start.elapsed();
320
321    println!(
322        "  CPU MatMul (1000x1000): {:.2}ms",
323        cpu_time.as_secs_f64() * 1000.0
324    );
325
326    // Test SIMD effectiveness (simplified)
327    println!("📊 Testing SIMD Effectiveness...");
328    let large_tensor: Tensor<f32> = Tensor::ones(&[10000]);
329    let another_tensor: Tensor<f32> = Tensor::ones(&[10000]);
330
331    let start = Instant::now();
332    let _result = large_tensor.add(&another_tensor)?;
333    let simd_time = start.elapsed();
334
335    println!(
336        "  Element-wise Add (10k elements): {:.2}ms",
337        simd_time.as_secs_f64() * 1000.0
338    );
339
340    // Memory efficiency validation
341    println!("💾 Testing Memory Efficiency...");
342    let memory_test_size = 5000;
343    let start = Instant::now();
344    let _large_matrix: Tensor<f32> = Tensor::zeros(&[memory_test_size, memory_test_size]);
345    let memory_time = start.elapsed();
346
347    let memory_mb = (memory_test_size * memory_test_size * 4) as f64 / (1024.0 * 1024.0);
348    let allocation_rate = memory_mb / memory_time.as_secs_f64();
349
350    println!(
351        "  Memory Allocation ({}MB): {:.2}ms, {:.1} MB/s",
352        memory_mb,
353        memory_time.as_secs_f64() * 1000.0,
354        allocation_rate
355    );
356
357    println!("✅ Optimization validation complete!");
358
359    Ok(())
360}
tenflowers_core/simplified_benchmarks.rs

tenflowers_core/
simplified_benchmarks.rs