tenflowers_core/ops/
performance_benchmark.rs

1//! Performance benchmarking for optimized operations
2//!
3//! This module provides benchmarking utilities to measure and compare
4//! the performance of optimized vs original tensor operations.
5
6use crate::{Result, Tensor};
7use std::time::{Duration, Instant};
8
/// Settings that control how each benchmark in this module is run.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of untimed calls made before each timed loop.
    pub warmup_iterations: usize,
    /// Number of timed calls; reported times are averaged over this count.
    pub measurement_iterations: usize,
    /// Element counts of the 1-D input tensors to benchmark.
    pub sizes: Vec<usize>,
    /// Whether original and optimized outputs are compared element-wise.
    pub verify_correctness: bool,
}

impl Default for BenchmarkConfig {
    /// Returns the stock configuration: 5 warmup calls, 10 measured calls,
    /// four sizes from one thousand to one million elements, and correctness
    /// verification switched on.
    fn default() -> Self {
        let sizes = vec![1_000, 10_000, 100_000, 1_000_000];

        BenchmarkConfig {
            sizes,
            verify_correctness: true,
            measurement_iterations: 10,
            warmup_iterations: 5,
        }
    }
}
28
/// Timing comparison of one operation at one input size.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Name of the benchmarked operation.
    pub operation: String,
    /// Element count; the numerator of both throughput figures.
    pub size: usize,
    /// Time recorded for the original implementation.
    pub original_time: Duration,
    /// Time recorded for the optimized implementation.
    pub optimized_time: Duration,
    /// `original_time / optimized_time`; above 1.0 means the optimized
    /// implementation was faster.
    pub speedup: f64,
    pub throughput_original: f64,  // elements per second
    pub throughput_optimized: f64, // elements per second
    /// Outcome of the correctness check, as supplied by the caller.
    pub correctness_verified: bool,
}

impl BenchmarkResult {
    /// Builds a result from raw timings, deriving `speedup` and both
    /// throughput figures from them.
    ///
    /// Zero durations are not special-cased: dividing by a zero duration
    /// yields an infinite or NaN value in the corresponding derived field.
    pub fn new(
        operation: String,
        size: usize,
        original_time: Duration,
        optimized_time: Duration,
        correctness_verified: bool,
    ) -> Self {
        // Small local helpers so each derived field reads as a single expression.
        let nanos = |elapsed: Duration| elapsed.as_nanos() as f64;
        let elements_per_second = |elapsed: Duration| size as f64 / elapsed.as_secs_f64();

        Self {
            speedup: nanos(original_time) / nanos(optimized_time),
            throughput_original: elements_per_second(original_time),
            throughput_optimized: elements_per_second(optimized_time),
            operation,
            size,
            original_time,
            optimized_time,
            correctness_verified,
        }
    }
}
66
67/// Binary operation benchmark suite
68pub fn benchmark_binary_operations(config: BenchmarkConfig) -> Result<Vec<BenchmarkResult>> {
69    let mut results = Vec::new();
70
71    for &size in &config.sizes {
72        println!("Benchmarking size: {size}");
73
74        // Test addition
75        if let Ok(result) = benchmark_add_f32(size, &config) {
76            results.push(result);
77        }
78
79        // Test multiplication
80        if let Ok(_result) = benchmark_mul_f32(size, &config) {}
81
82        // Test subtraction
83        if let Ok(result) = benchmark_sub_f32(size, &config) {
84            results.push(result);
85        }
86
87        // Test division
88        if let Ok(result) = benchmark_div_f32(size, &config) {
89            results.push(result);
90        }
91    }
92
93    Ok(results)
94}
95
96/// Benchmark addition operation
97fn benchmark_add_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
98    // Create test data
99    let a_data: Vec<f32> = (0..size).map(|i| i as f32).collect();
100    let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
101
102    let a = Tensor::from_vec(a_data, &[size])?;
103    let b = Tensor::from_vec(b_data, &[size])?;
104
105    // Warmup for original implementation
106    for _ in 0..config.warmup_iterations {
107        let _ = super::binary::add(&a, &b)?;
108    }
109
110    // Benchmark original implementation
111    let start = Instant::now();
112    for _ in 0..config.measurement_iterations {
113        let _ = super::binary::add(&a, &b)?;
114    }
115    let original_time = start.elapsed() / config.measurement_iterations as u32;
116
117    // Warmup for optimized implementation
118    for _ in 0..config.warmup_iterations {
119        let _ = super::optimized_binary::optimized_add(&a, &b)?;
120    }
121
122    // Benchmark optimized implementation
123    let start = Instant::now();
124    for _ in 0..config.measurement_iterations {
125        let _ = super::optimized_binary::optimized_add(&a, &b)?;
126    }
127    let optimized_time = start.elapsed() / config.measurement_iterations as u32;
128
129    // Verify correctness
130    let correctness_verified = if config.verify_correctness {
131        let original_result = super::binary::add(&a, &b)?;
132        let optimized_result = super::optimized_binary::optimized_add(&a, &b)?;
133
134        // Compare results element-wise with small tolerance for floating point
135        let orig_data = original_result.to_vec()?;
136        let opt_data = optimized_result.to_vec()?;
137
138        orig_data
139            .iter()
140            .zip(opt_data.iter())
141            .all(|(o, p)| (o - p).abs() < 1e-6)
142    } else {
143        true
144    };
145
146    Ok(BenchmarkResult::new(
147        "Add".to_string(),
148        size,
149        original_time,
150        optimized_time,
151        correctness_verified,
152    ))
153}
154
155/// Benchmark multiplication operation
156fn benchmark_mul_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
157    // Create test data
158    let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
159    let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 2.0).collect();
160
161    let a = Tensor::from_vec(a_data, &[size])?;
162    let b = Tensor::from_vec(b_data, &[size])?;
163
164    // Warmup and benchmark original
165    for _ in 0..config.warmup_iterations {
166        let _ = super::binary::mul(&a, &b)?;
167    }
168
169    let start = Instant::now();
170    for _ in 0..config.measurement_iterations {
171        let _ = super::binary::mul(&a, &b)?;
172    }
173    let original_time = start.elapsed() / config.measurement_iterations as u32;
174
175    // Warmup and benchmark optimized
176    for _ in 0..config.warmup_iterations {
177        let _ = super::optimized_binary::optimized_mul(&a, &b)?;
178    }
179
180    let start = Instant::now();
181    for _ in 0..config.measurement_iterations {
182        let _ = super::optimized_binary::optimized_mul(&a, &b)?;
183    }
184    let optimized_time = start.elapsed() / config.measurement_iterations as u32;
185
186    let correctness_verified = if config.verify_correctness {
187        let original_result = super::binary::mul(&a, &b)?;
188        let optimized_result = super::optimized_binary::optimized_mul(&a, &b)?;
189
190        let orig_data = original_result.to_vec()?;
191        let opt_data = optimized_result.to_vec()?;
192
193        orig_data
194            .iter()
195            .zip(opt_data.iter())
196            .all(|(o, p)| (o - p).abs() < 1e-6)
197    } else {
198        true
199    };
200
201    Ok(BenchmarkResult::new(
202        "Mul".to_string(),
203        size,
204        original_time,
205        optimized_time,
206        correctness_verified,
207    ))
208}
209
210/// Benchmark subtraction operation
211fn benchmark_sub_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
212    let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 5.0).collect();
213    let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
214
215    let a = Tensor::from_vec(a_data, &[size])?;
216    let b = Tensor::from_vec(b_data, &[size])?;
217
218    // Warmup and benchmark original
219    for _ in 0..config.warmup_iterations {
220        let _ = super::binary::sub(&a, &b)?;
221    }
222
223    let start = Instant::now();
224    for _ in 0..config.measurement_iterations {
225        let _ = super::binary::sub(&a, &b)?;
226    }
227    let original_time = start.elapsed() / config.measurement_iterations as u32;
228
229    // Warmup and benchmark optimized
230    for _ in 0..config.warmup_iterations {
231        let _ = super::optimized_binary::optimized_sub(&a, &b)?;
232    }
233
234    let start = Instant::now();
235    for _ in 0..config.measurement_iterations {
236        let _ = super::optimized_binary::optimized_sub(&a, &b)?;
237    }
238    let optimized_time = start.elapsed() / config.measurement_iterations as u32;
239
240    let correctness_verified = if config.verify_correctness {
241        let original_result = super::binary::sub(&a, &b)?;
242        let optimized_result = super::optimized_binary::optimized_sub(&a, &b)?;
243
244        let orig_data = original_result.to_vec()?;
245        let opt_data = optimized_result.to_vec()?;
246
247        orig_data
248            .iter()
249            .zip(opt_data.iter())
250            .all(|(o, p)| (o - p).abs() < 1e-6)
251    } else {
252        true
253    };
254
255    Ok(BenchmarkResult::new(
256        "Sub".to_string(),
257        size,
258        original_time,
259        optimized_time,
260        correctness_verified,
261    ))
262}
263
264/// Benchmark division operation
265fn benchmark_div_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
266    let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 10.0).collect();
267    let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 2.0).collect();
268
269    let a = Tensor::from_vec(a_data, &[size])?;
270    let b = Tensor::from_vec(b_data, &[size])?;
271
272    // Warmup and benchmark original
273    for _ in 0..config.warmup_iterations {
274        let _ = super::binary::div(&a, &b)?;
275    }
276
277    let start = Instant::now();
278    for _ in 0..config.measurement_iterations {
279        let _ = super::binary::div(&a, &b)?;
280    }
281    let original_time = start.elapsed() / config.measurement_iterations as u32;
282
283    // Warmup and benchmark optimized
284    for _ in 0..config.warmup_iterations {
285        let _ = super::optimized_binary::optimized_div(&a, &b)?;
286    }
287
288    let start = Instant::now();
289    for _ in 0..config.measurement_iterations {
290        let _ = super::optimized_binary::optimized_div(&a, &b)?;
291    }
292    let optimized_time = start.elapsed() / config.measurement_iterations as u32;
293
294    let correctness_verified = if config.verify_correctness {
295        let original_result = super::binary::div(&a, &b)?;
296        let optimized_result = super::optimized_binary::optimized_div(&a, &b)?;
297
298        let orig_data = original_result.to_vec()?;
299        let opt_data = optimized_result.to_vec()?;
300
301        orig_data
302            .iter()
303            .zip(opt_data.iter())
304            .all(|(o, p)| (o - p).abs() < 1e-6)
305    } else {
306        true
307    };
308
309    Ok(BenchmarkResult::new(
310        "Div".to_string(),
311        size,
312        original_time,
313        optimized_time,
314        correctness_verified,
315    ))
316}
317
318/// Print benchmark results in a formatted table
319pub fn print_benchmark_results(results: &[BenchmarkResult]) {
320    println!("\n{:-<100}", "");
321    println!(
322        "| {:^12} | {:^12} | {:^12} | {:^12} | {:^10} | {:^15} | {:^15} |",
323        "Operation",
324        "Size",
325        "Original (μs)",
326        "Optimized (μs)",
327        "Speedup",
328        "Orig Throughput",
329        "Opt Throughput"
330    );
331    println!("{:-<100}", "");
332
333    for result in results {
334        let orig_us = result.original_time.as_micros();
335        let opt_us = result.optimized_time.as_micros();
336        let orig_throughput = format!("{:.1e}", result.throughput_original);
337        let opt_throughput = format!("{:.1e}", result.throughput_optimized);
338
339        println!(
340            "| {:^12} | {:^12} | {:^12} | {:^12} | {:^10.2} | {:^15} | {:^15} |",
341            result.operation,
342            result.size,
343            orig_us,
344            opt_us,
345            result.speedup,
346            orig_throughput,
347            opt_throughput
348        );
349
350        if !result.correctness_verified {
351            println!(
352                "  ⚠️  WARNING: Correctness verification failed for {} size {}",
353                result.operation, result.size
354            );
355        }
356    }
357    println!("{:-<100}", "");
358
359    // Summary statistics
360    let avg_speedup: f64 = results.iter().map(|r| r.speedup).sum::<f64>() / results.len() as f64;
361    let max_speedup = results.iter().map(|r| r.speedup).fold(0.0, f64::max);
362    let min_speedup = results
363        .iter()
364        .map(|r| r.speedup)
365        .fold(f64::INFINITY, f64::min);
366
367    println!("Summary:");
368    println!("  Average speedup: {avg_speedup:.2}x");
369    println!("  Maximum speedup: {max_speedup:.2}x");
370    println!("  Minimum speedup: {min_speedup:.2}x");
371
372    let correctness_issues = results.iter().filter(|r| !r.correctness_verified).count();
373    if correctness_issues > 0 {
374        println!("  ⚠️  {correctness_issues} correctness verification failures");
375    } else {
376        println!("  ✅ All correctness verifications passed");
377    }
378}
379
380/// Run a complete benchmark suite and return results
381pub fn run_performance_benchmark() -> Result<Vec<BenchmarkResult>> {
382    println!("Running TenfloweRS CPU Performance Benchmark");
383    println!("Testing optimized vs original binary operations...\n");
384
385    let config = BenchmarkConfig::default();
386    let results = benchmark_binary_operations(config)?;
387
388    print_benchmark_results(&results);
389
390    Ok(results)
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the binary-operation suite for a single tensor size with one
    /// warmup iteration and correctness verification enabled.
    fn run_suite(size: usize, measurement_iterations: usize) -> Vec<BenchmarkResult> {
        benchmark_binary_operations(BenchmarkConfig {
            warmup_iterations: 1,
            measurement_iterations,
            sizes: vec![size],
            verify_correctness: true,
        })
        .expect("test: benchmark_binary_operations should succeed")
    }

    #[test]
    fn test_benchmark_correctness() {
        let results = run_suite(1000, 1);

        // Every benchmark that produced a result must agree with the
        // original implementation's output.
        for entry in &results {
            assert!(
                entry.correctness_verified,
                "Correctness verification failed for {}",
                entry.operation
            );
        }

        // At least one operation must have produced a result.
        assert!(!results.is_empty());
    }

    #[test]
    fn test_small_benchmark() {
        let results = run_suite(100, 2);
        assert!(!results.is_empty());

        // Print results for manual inspection
        print_benchmark_results(&results);
    }
}
tenflowers_core/ops/performance_benchmark.rs

tenflowers_core/ops/
performance_benchmark.rs