1use crate::{Device, Result, Tensor};
5use std::collections::HashMap;
6use std::time::Instant;
7
/// Settings that control how the simple benchmark suite runs.
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkConfig {
    /// Number of untimed iterations executed before each measurement.
    pub warmup_iterations: usize,
    /// Number of timed iterations; reported times are averaged over this count.
    pub benchmark_iterations: usize,
    /// Tensor shapes to benchmark. The benchmarks in this file only use entries with exactly
    /// two dimensions (`[rows, cols]`); entries of any other length are skipped.
    pub test_sizes: Vec<Vec<usize>>,
}
15
16impl Default for SimpleBenchmarkConfig {
17 fn default() -> Self {
18 Self {
19 warmup_iterations: 5,
20 benchmark_iterations: 20,
21 test_sizes: vec![
22 vec![256, 256], vec![512, 512], vec![1024, 1024], ],
26 }
27 }
28}
29
/// Outcome of a single benchmark case.
#[derive(Debug, Clone)]
pub struct SimpleBenchmarkResult {
    /// Identifier of the case; also used as the key in the suite's result map.
    pub benchmark_name: String,
    /// Average wall-clock time of one timed iteration, in milliseconds.
    pub execution_time_ms: f64,
    /// Throughput figure for the case. What is counted per second depends on the benchmark
    /// that produced the result.
    pub throughput_ops_per_sec: f64,
    /// Estimated memory footprint in MiB, derived from the tensor shapes rather than measured.
    pub memory_usage_mb: f64,
}
38
39pub struct SimpleBenchmarkSuite {
41 config: SimpleBenchmarkConfig,
42 results: HashMap<String, SimpleBenchmarkResult>,
43}
44
45impl SimpleBenchmarkSuite {
46 pub fn new(config: SimpleBenchmarkConfig) -> Self {
48 Self {
49 config,
50 results: HashMap::new(),
51 }
52 }
53
54 pub fn run_benchmarks(&mut self) -> Result<BenchmarkReport> {
56 println!("š Running TenfloweRS Performance Benchmarks...");
57
58 self.benchmark_tensor_creation()?;
60 self.benchmark_element_wise_operations()?;
61 self.benchmark_matrix_operations()?;
62
63 Ok(BenchmarkReport {
65 results: self.results.clone(),
66 summary: self.generate_summary(),
67 })
68 }
69
70 fn benchmark_tensor_creation(&mut self) -> Result<()> {
72 println!(" š Benchmarking tensor creation...");
73
74 for size in &self.config.test_sizes {
75 if size.len() == 2 {
76 let rows = size[0];
77 let cols = size[1];
78
79 for _ in 0..self.config.warmup_iterations {
81 let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
82 }
83
84 let start = Instant::now();
86 for _ in 0..self.config.benchmark_iterations {
87 let _: Tensor<f32> = Tensor::zeros(&[rows, cols]);
88 }
89 let elapsed = start.elapsed();
90
91 let avg_time_ms =
92 elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
93 let ops_per_sec = 1000.0 / avg_time_ms;
94 let memory_mb = (rows * cols * 4) as f64 / (1024.0 * 1024.0); let result = SimpleBenchmarkResult {
97 benchmark_name: format!("TensorCreation_{}x{}", rows, cols),
98 execution_time_ms: avg_time_ms,
99 throughput_ops_per_sec: ops_per_sec,
100 memory_usage_mb: memory_mb,
101 };
102
103 self.results.insert(result.benchmark_name.clone(), result);
104 }
105 }
106
107 Ok(())
108 }
109
110 fn benchmark_element_wise_operations(&mut self) -> Result<()> {
112 println!(" ā” Benchmarking element-wise operations...");
113
114 for size in &self.config.test_sizes {
115 if size.len() == 2 {
116 let rows = size[0];
117 let cols = size[1];
118
119 let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
120 let b: Tensor<f32> = Tensor::ones(&[rows, cols]);
121
122 for _ in 0..self.config.warmup_iterations {
124 let _ = a.add(&b)?;
125 }
126
127 let start = Instant::now();
129 for _ in 0..self.config.benchmark_iterations {
130 let _ = a.add(&b)?;
131 }
132 let elapsed = start.elapsed();
133
134 let avg_time_ms =
135 elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
136 let elements_per_sec = (rows * cols) as f64 / (avg_time_ms / 1000.0);
137 let memory_mb = (rows * cols * 3 * 4) as f64 / (1024.0 * 1024.0); let result = SimpleBenchmarkResult {
140 benchmark_name: format!("ElementwiseAdd_{}x{}", rows, cols),
141 execution_time_ms: avg_time_ms,
142 throughput_ops_per_sec: elements_per_sec,
143 memory_usage_mb: memory_mb,
144 };
145
146 self.results.insert(result.benchmark_name.clone(), result);
147 }
148 }
149
150 Ok(())
151 }
152
153 fn benchmark_matrix_operations(&mut self) -> Result<()> {
155 println!(" š¢ Benchmarking matrix operations...");
156
157 for size in &self.config.test_sizes {
158 if size.len() == 2 {
159 let rows = size[0];
160 let cols = size[1];
161
162 let a: Tensor<f32> = Tensor::ones(&[rows, cols]);
163 let b: Tensor<f32> = Tensor::ones(&[cols, rows]);
164
165 for _ in 0..self.config.warmup_iterations {
167 let _ = a.matmul(&b)?;
168 }
169
170 let start = Instant::now();
172 for _ in 0..self.config.benchmark_iterations {
173 let _ = a.matmul(&b)?;
174 }
175 let elapsed = start.elapsed();
176
177 let avg_time_ms =
178 elapsed.as_secs_f64() * 1000.0 / self.config.benchmark_iterations as f64;
179 let flops = (rows * cols * cols * 2) as f64; let gflops_per_sec = flops / (avg_time_ms / 1000.0) / 1e9;
181 let memory_mb = (rows * cols * 2 + rows * rows) as f64 * 4.0 / (1024.0 * 1024.0);
182
183 let result = SimpleBenchmarkResult {
184 benchmark_name: format!("MatMul_{}x{}", rows, cols),
185 execution_time_ms: avg_time_ms,
186 throughput_ops_per_sec: gflops_per_sec,
187 memory_usage_mb: memory_mb,
188 };
189
190 self.results.insert(result.benchmark_name.clone(), result);
191 }
192 }
193
194 Ok(())
195 }
196
197 fn generate_summary(&self) -> BenchmarkSummary {
199 let mut total_time = 0.0f64;
200 let mut max_throughput = 0.0f64;
201 let mut total_memory = 0.0f64;
202
203 for result in self.results.values() {
204 total_time += result.execution_time_ms;
205 max_throughput = max_throughput.max(result.throughput_ops_per_sec);
206 total_memory += result.memory_usage_mb;
207 }
208
209 BenchmarkSummary {
210 total_benchmarks: self.results.len(),
211 average_execution_time_ms: total_time / self.results.len() as f64,
212 peak_throughput: max_throughput,
213 total_memory_usage_mb: total_memory,
214 performance_score: Self::calculate_performance_score(&self.results),
215 }
216 }
217
218 fn calculate_performance_score(results: &HashMap<String, SimpleBenchmarkResult>) -> f64 {
220 let mut score = 0.0;
221 let mut count = 0;
222
223 for result in results.values() {
224 let normalized_score = (result.throughput_ops_per_sec / 1e6).min(10.0); score += normalized_score;
227 count += 1;
228 }
229
230 if count > 0 {
231 score / count as f64
232 } else {
233 0.0
234 }
235 }
236}
237
/// Full output of a benchmark run: every individual result plus an aggregate summary.
#[derive(Debug, Clone)]
pub struct BenchmarkReport {
    /// Individual results keyed by benchmark name.
    pub results: HashMap<String, SimpleBenchmarkResult>,
    /// Aggregate figures computed over `results`.
    pub summary: BenchmarkSummary,
}
244
/// Aggregate figures computed over all recorded benchmark results.
#[derive(Debug, Clone)]
pub struct BenchmarkSummary {
    /// Number of results that were aggregated.
    pub total_benchmarks: usize,
    /// Mean of the per-benchmark execution times, in milliseconds.
    pub average_execution_time_ms: f64,
    /// Largest throughput value among the results (never below zero).
    pub peak_throughput: f64,
    /// Sum of the per-benchmark memory estimates, in MiB.
    pub total_memory_usage_mb: f64,
    /// Mean of each result's throughput divided by one million, with each contribution capped
    /// at 10; the report prints it as a score out of 10.
    pub performance_score: f64,
}
254
255impl BenchmarkReport {
256 pub fn print_report(&self) {
258 println!("\nšÆ === TENFLOWERS PERFORMANCE BENCHMARK REPORT ===");
259
260 println!("\nš Individual Benchmark Results:");
261 for (name, result) in &self.results {
262 println!(
263 " {} - {:.2}ms, {:.1} ops/s, {:.1} MB",
264 name,
265 result.execution_time_ms,
266 result.throughput_ops_per_sec,
267 result.memory_usage_mb
268 );
269 }
270
271 println!("\nš Summary:");
272 println!(" Total Benchmarks: {}", self.summary.total_benchmarks);
273 println!(
274 " Average Execution Time: {:.2}ms",
275 self.summary.average_execution_time_ms
276 );
277 println!(
278 " Peak Throughput: {:.1} ops/s",
279 self.summary.peak_throughput
280 );
281 println!(
282 " Total Memory Usage: {:.1} MB",
283 self.summary.total_memory_usage_mb
284 );
285 println!(
286 " Performance Score: {:.2}/10",
287 self.summary.performance_score
288 );
289
290 match self.summary.performance_score {
292 score if score >= 8.0 => println!(" ā
Excellent Performance!"),
293 score if score >= 6.0 => println!(" ā
Good Performance"),
294 score if score >= 4.0 => println!(" ā ļø Moderate Performance"),
295 _ => println!(" ā Performance Needs Improvement"),
296 }
297 }
298}
299
300pub fn run_simple_benchmarks() -> Result<BenchmarkReport> {
302 let config = SimpleBenchmarkConfig::default();
303 let mut suite = SimpleBenchmarkSuite::new(config);
304 suite.run_benchmarks()
305}
306
307pub fn validate_optimizations() -> Result<()> {
309 println!("š === OPTIMIZATION VALIDATION ===");
310
311 println!("ā” Testing CPU Performance...");
313 let _cpu_device = Device::Cpu;
314 let a: Tensor<f32> = Tensor::ones(&[1000, 1000]);
315 let b: Tensor<f32> = Tensor::ones(&[1000, 1000]);
316
317 let start = Instant::now();
318 let _result = a.matmul(&b)?;
319 let cpu_time = start.elapsed();
320
321 println!(
322 " CPU MatMul (1000x1000): {:.2}ms",
323 cpu_time.as_secs_f64() * 1000.0
324 );
325
326 println!("š Testing SIMD Effectiveness...");
328 let large_tensor: Tensor<f32> = Tensor::ones(&[10000]);
329 let another_tensor: Tensor<f32> = Tensor::ones(&[10000]);
330
331 let start = Instant::now();
332 let _result = large_tensor.add(&another_tensor)?;
333 let simd_time = start.elapsed();
334
335 println!(
336 " Element-wise Add (10k elements): {:.2}ms",
337 simd_time.as_secs_f64() * 1000.0
338 );
339
340 println!("š¾ Testing Memory Efficiency...");
342 let memory_test_size = 5000;
343 let start = Instant::now();
344 let _large_matrix: Tensor<f32> = Tensor::zeros(&[memory_test_size, memory_test_size]);
345 let memory_time = start.elapsed();
346
347 let memory_mb = (memory_test_size * memory_test_size * 4) as f64 / (1024.0 * 1024.0);
348 let allocation_rate = memory_mb / memory_time.as_secs_f64();
349
350 println!(
351 " Memory Allocation ({}MB): {:.2}ms, {:.1} MB/s",
352 memory_mb,
353 memory_time.as_secs_f64() * 1000.0,
354 allocation_rate
355 );
356
357 println!("ā
Optimization validation complete!");
358
359 Ok(())
360}