1use crate::{Result, Tensor};
7use std::time::{Duration, Instant};
8
/// Settings that control how the binary-operation benchmarks are run.
#[derive(Debug, Clone)]
pub struct BenchmarkConfig {
    /// Number of untimed runs executed before each timed loop.
    pub warmup_iterations: usize,
    /// Number of timed runs whose total elapsed time is averaged.
    pub measurement_iterations: usize,
    /// Element counts to benchmark; each one is used as the length of the 1-D inputs.
    pub sizes: Vec<usize>,
    /// Whether the original and optimized outputs are compared element by element.
    pub verify_correctness: bool,
}

impl Default for BenchmarkConfig {
    /// Returns the standard configuration: 5 warm-up runs, 10 measured runs, four input
    /// sizes from one thousand to one million elements, and correctness checks enabled.
    fn default() -> Self {
        let sizes = vec![1_000, 10_000, 100_000, 1_000_000];

        BenchmarkConfig {
            warmup_iterations: 5,
            measurement_iterations: 10,
            sizes,
            verify_correctness: true,
        }
    }
}
28
/// Outcome of benchmarking one operation at one input size.
#[derive(Debug, Clone)]
pub struct BenchmarkResult {
    /// Label of the benchmarked operation.
    pub operation: String,
    /// Number of elements processed per run.
    pub size: usize,
    /// Time recorded for the original implementation.
    pub original_time: Duration,
    /// Time recorded for the optimized implementation.
    pub optimized_time: Duration,
    /// Ratio `original_time / optimized_time`; values above 1.0 mean the optimized path
    /// was faster.
    pub speedup: f64,
    /// `size` divided by `original_time` in seconds.
    pub throughput_original: f64,
    /// `size` divided by `optimized_time` in seconds.
    pub throughput_optimized: f64,
    /// Result of the output comparison (or `true` when the comparison was skipped).
    pub correctness_verified: bool,
}

impl BenchmarkResult {
    /// Builds a result and derives `speedup` and both throughput figures from the timings.
    ///
    /// All derived values use plain floating-point division, so a zero duration yields
    /// an infinite (or, for `0 / 0`, NaN) value rather than an error.
    pub fn new(
        operation: String,
        size: usize,
        original_time: Duration,
        optimized_time: Duration,
        correctness_verified: bool,
    ) -> Self {
        // Elements processed per second for a given elapsed time.
        let elements_per_second = |elapsed: Duration| size as f64 / elapsed.as_secs_f64();

        let original_nanos = original_time.as_nanos() as f64;
        let optimized_nanos = optimized_time.as_nanos() as f64;

        BenchmarkResult {
            operation,
            size,
            original_time,
            optimized_time,
            speedup: original_nanos / optimized_nanos,
            throughput_original: elements_per_second(original_time),
            throughput_optimized: elements_per_second(optimized_time),
            correctness_verified,
        }
    }
}
66
67pub fn benchmark_binary_operations(config: BenchmarkConfig) -> Result<Vec<BenchmarkResult>> {
69 let mut results = Vec::new();
70
71 for &size in &config.sizes {
72 println!("Benchmarking size: {size}");
73
74 if let Ok(result) = benchmark_add_f32(size, &config) {
76 results.push(result);
77 }
78
79 if let Ok(_result) = benchmark_mul_f32(size, &config) {}
81
82 if let Ok(result) = benchmark_sub_f32(size, &config) {
84 results.push(result);
85 }
86
87 if let Ok(result) = benchmark_div_f32(size, &config) {
89 results.push(result);
90 }
91 }
92
93 Ok(results)
94}
95
96fn benchmark_add_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
98 let a_data: Vec<f32> = (0..size).map(|i| i as f32).collect();
100 let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
101
102 let a = Tensor::from_vec(a_data, &[size])?;
103 let b = Tensor::from_vec(b_data, &[size])?;
104
105 for _ in 0..config.warmup_iterations {
107 let _ = super::binary::add(&a, &b)?;
108 }
109
110 let start = Instant::now();
112 for _ in 0..config.measurement_iterations {
113 let _ = super::binary::add(&a, &b)?;
114 }
115 let original_time = start.elapsed() / config.measurement_iterations as u32;
116
117 for _ in 0..config.warmup_iterations {
119 let _ = super::optimized_binary::optimized_add(&a, &b)?;
120 }
121
122 let start = Instant::now();
124 for _ in 0..config.measurement_iterations {
125 let _ = super::optimized_binary::optimized_add(&a, &b)?;
126 }
127 let optimized_time = start.elapsed() / config.measurement_iterations as u32;
128
129 let correctness_verified = if config.verify_correctness {
131 let original_result = super::binary::add(&a, &b)?;
132 let optimized_result = super::optimized_binary::optimized_add(&a, &b)?;
133
134 let orig_data = original_result.to_vec()?;
136 let opt_data = optimized_result.to_vec()?;
137
138 orig_data
139 .iter()
140 .zip(opt_data.iter())
141 .all(|(o, p)| (o - p).abs() < 1e-6)
142 } else {
143 true
144 };
145
146 Ok(BenchmarkResult::new(
147 "Add".to_string(),
148 size,
149 original_time,
150 optimized_time,
151 correctness_verified,
152 ))
153}
154
155fn benchmark_mul_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
157 let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
159 let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 2.0).collect();
160
161 let a = Tensor::from_vec(a_data, &[size])?;
162 let b = Tensor::from_vec(b_data, &[size])?;
163
164 for _ in 0..config.warmup_iterations {
166 let _ = super::binary::mul(&a, &b)?;
167 }
168
169 let start = Instant::now();
170 for _ in 0..config.measurement_iterations {
171 let _ = super::binary::mul(&a, &b)?;
172 }
173 let original_time = start.elapsed() / config.measurement_iterations as u32;
174
175 for _ in 0..config.warmup_iterations {
177 let _ = super::optimized_binary::optimized_mul(&a, &b)?;
178 }
179
180 let start = Instant::now();
181 for _ in 0..config.measurement_iterations {
182 let _ = super::optimized_binary::optimized_mul(&a, &b)?;
183 }
184 let optimized_time = start.elapsed() / config.measurement_iterations as u32;
185
186 let correctness_verified = if config.verify_correctness {
187 let original_result = super::binary::mul(&a, &b)?;
188 let optimized_result = super::optimized_binary::optimized_mul(&a, &b)?;
189
190 let orig_data = original_result.to_vec()?;
191 let opt_data = optimized_result.to_vec()?;
192
193 orig_data
194 .iter()
195 .zip(opt_data.iter())
196 .all(|(o, p)| (o - p).abs() < 1e-6)
197 } else {
198 true
199 };
200
201 Ok(BenchmarkResult::new(
202 "Mul".to_string(),
203 size,
204 original_time,
205 optimized_time,
206 correctness_verified,
207 ))
208}
209
210fn benchmark_sub_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
212 let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 5.0).collect();
213 let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 1.0).collect();
214
215 let a = Tensor::from_vec(a_data, &[size])?;
216 let b = Tensor::from_vec(b_data, &[size])?;
217
218 for _ in 0..config.warmup_iterations {
220 let _ = super::binary::sub(&a, &b)?;
221 }
222
223 let start = Instant::now();
224 for _ in 0..config.measurement_iterations {
225 let _ = super::binary::sub(&a, &b)?;
226 }
227 let original_time = start.elapsed() / config.measurement_iterations as u32;
228
229 for _ in 0..config.warmup_iterations {
231 let _ = super::optimized_binary::optimized_sub(&a, &b)?;
232 }
233
234 let start = Instant::now();
235 for _ in 0..config.measurement_iterations {
236 let _ = super::optimized_binary::optimized_sub(&a, &b)?;
237 }
238 let optimized_time = start.elapsed() / config.measurement_iterations as u32;
239
240 let correctness_verified = if config.verify_correctness {
241 let original_result = super::binary::sub(&a, &b)?;
242 let optimized_result = super::optimized_binary::optimized_sub(&a, &b)?;
243
244 let orig_data = original_result.to_vec()?;
245 let opt_data = optimized_result.to_vec()?;
246
247 orig_data
248 .iter()
249 .zip(opt_data.iter())
250 .all(|(o, p)| (o - p).abs() < 1e-6)
251 } else {
252 true
253 };
254
255 Ok(BenchmarkResult::new(
256 "Sub".to_string(),
257 size,
258 original_time,
259 optimized_time,
260 correctness_verified,
261 ))
262}
263
264fn benchmark_div_f32(size: usize, config: &BenchmarkConfig) -> Result<BenchmarkResult> {
266 let a_data: Vec<f32> = (0..size).map(|i| (i as f32) + 10.0).collect();
267 let b_data: Vec<f32> = (0..size).map(|i| (i as f32) + 2.0).collect();
268
269 let a = Tensor::from_vec(a_data, &[size])?;
270 let b = Tensor::from_vec(b_data, &[size])?;
271
272 for _ in 0..config.warmup_iterations {
274 let _ = super::binary::div(&a, &b)?;
275 }
276
277 let start = Instant::now();
278 for _ in 0..config.measurement_iterations {
279 let _ = super::binary::div(&a, &b)?;
280 }
281 let original_time = start.elapsed() / config.measurement_iterations as u32;
282
283 for _ in 0..config.warmup_iterations {
285 let _ = super::optimized_binary::optimized_div(&a, &b)?;
286 }
287
288 let start = Instant::now();
289 for _ in 0..config.measurement_iterations {
290 let _ = super::optimized_binary::optimized_div(&a, &b)?;
291 }
292 let optimized_time = start.elapsed() / config.measurement_iterations as u32;
293
294 let correctness_verified = if config.verify_correctness {
295 let original_result = super::binary::div(&a, &b)?;
296 let optimized_result = super::optimized_binary::optimized_div(&a, &b)?;
297
298 let orig_data = original_result.to_vec()?;
299 let opt_data = optimized_result.to_vec()?;
300
301 orig_data
302 .iter()
303 .zip(opt_data.iter())
304 .all(|(o, p)| (o - p).abs() < 1e-6)
305 } else {
306 true
307 };
308
309 Ok(BenchmarkResult::new(
310 "Div".to_string(),
311 size,
312 original_time,
313 optimized_time,
314 correctness_verified,
315 ))
316}
317
318pub fn print_benchmark_results(results: &[BenchmarkResult]) {
320 println!("\n{:-<100}", "");
321 println!(
322 "| {:^12} | {:^12} | {:^12} | {:^12} | {:^10} | {:^15} | {:^15} |",
323 "Operation",
324 "Size",
325 "Original (μs)",
326 "Optimized (μs)",
327 "Speedup",
328 "Orig Throughput",
329 "Opt Throughput"
330 );
331 println!("{:-<100}", "");
332
333 for result in results {
334 let orig_us = result.original_time.as_micros();
335 let opt_us = result.optimized_time.as_micros();
336 let orig_throughput = format!("{:.1e}", result.throughput_original);
337 let opt_throughput = format!("{:.1e}", result.throughput_optimized);
338
339 println!(
340 "| {:^12} | {:^12} | {:^12} | {:^12} | {:^10.2} | {:^15} | {:^15} |",
341 result.operation,
342 result.size,
343 orig_us,
344 opt_us,
345 result.speedup,
346 orig_throughput,
347 opt_throughput
348 );
349
350 if !result.correctness_verified {
351 println!(
352 " ⚠️ WARNING: Correctness verification failed for {} size {}",
353 result.operation, result.size
354 );
355 }
356 }
357 println!("{:-<100}", "");
358
359 let avg_speedup: f64 = results.iter().map(|r| r.speedup).sum::<f64>() / results.len() as f64;
361 let max_speedup = results.iter().map(|r| r.speedup).fold(0.0, f64::max);
362 let min_speedup = results
363 .iter()
364 .map(|r| r.speedup)
365 .fold(f64::INFINITY, f64::min);
366
367 println!("Summary:");
368 println!(" Average speedup: {avg_speedup:.2}x");
369 println!(" Maximum speedup: {max_speedup:.2}x");
370 println!(" Minimum speedup: {min_speedup:.2}x");
371
372 let correctness_issues = results.iter().filter(|r| !r.correctness_verified).count();
373 if correctness_issues > 0 {
374 println!(" ⚠️ {correctness_issues} correctness verification failures");
375 } else {
376 println!(" ✅ All correctness verifications passed");
377 }
378}
379
380pub fn run_performance_benchmark() -> Result<Vec<BenchmarkResult>> {
382 println!("Running TenfloweRS CPU Performance Benchmark");
383 println!("Testing optimized vs original binary operations...\n");
384
385 let config = BenchmarkConfig::default();
386 let results = benchmark_binary_operations(config)?;
387
388 print_benchmark_results(&results);
389
390 Ok(results)
391}
392
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a single-size configuration with one warm-up run and correctness checks on.
    fn single_size_config(measurement_iterations: usize, size: usize) -> BenchmarkConfig {
        BenchmarkConfig {
            warmup_iterations: 1,
            measurement_iterations,
            sizes: vec![size],
            verify_correctness: true,
        }
    }

    /// Every benchmark that ran must report matching original and optimized outputs.
    #[test]
    fn test_benchmark_correctness() {
        let results = benchmark_binary_operations(single_size_config(1, 1000))
            .expect("test: benchmark_binary_operations should succeed");

        results.iter().for_each(|outcome| {
            assert!(
                outcome.correctness_verified,
                "Correctness verification failed for {}",
                outcome.operation
            );
        });

        // At least one benchmark must have produced a result.
        assert!(!results.is_empty());
    }

    /// A tiny run produces results and the report printer handles them.
    #[test]
    fn test_small_benchmark() {
        let results = benchmark_binary_operations(single_size_config(2, 100))
            .expect("test: benchmark_binary_operations should succeed");
        assert!(!results.is_empty());

        print_benchmark_results(&results);
    }
}