// cbtop/bricks/generators/simd.rs
use crate::brick::{Brick, BrickAssertion, BrickBudget, BrickScore, BrickVerification, Scorable};
12use crate::config::WorkloadType;
13use crate::ring_buffer::RingBuffer;
14use std::any::Any;
15use std::time::Duration;
16use trueno::Vector;
17
/// Bytes assumed available in L2 when `TRUENO_L2_CACHE_KB` is unset (1 MiB).
const DEFAULT_L2_CACHE_BYTES: usize = 1024 * 1024;

/// Bytes assumed available in L3 when `TRUENO_L3_CACHE_MB` is unset (32 MiB).
const DEFAULT_L3_CACHE_BYTES: usize = 32 * 1024 * 1024;

/// Number of f32 elements per tile such that two operand tiles together fit
/// in the L2 cache. The cache size can be overridden via the
/// `TRUENO_L2_CACHE_KB` environment variable (in KiB); unset or unparsable
/// values fall back to [`DEFAULT_L2_CACHE_BYTES`]. The result is aligned
/// down to a multiple of 8 elements and floored at 8192.
fn optimal_tile_size() -> usize {
    let cache_bytes = match std::env::var("TRUENO_L2_CACHE_KB") {
        Ok(raw) => raw
            .parse::<usize>()
            .map_or(DEFAULT_L2_CACHE_BYTES, |kb| kb * 1024),
        Err(_) => DEFAULT_L2_CACHE_BYTES,
    };

    // Two operand tiles share the cache: elements = bytes / (2 * sizeof(f32)).
    let elements = cache_bytes / (2 * std::mem::size_of::<f32>());

    // Align down to a multiple of 8, but never drop below 8192 elements.
    (elements & !7).max(8192)
}

/// Returns true when a problem of `problem_size` f32 elements should be
/// processed tile-by-tile: i.e. when the working set (two inputs plus one
/// output, three buffers) exceeds the L3 cache. The cache size can be
/// overridden via `TRUENO_L3_CACHE_MB` (in MiB); unset or unparsable values
/// fall back to [`DEFAULT_L3_CACHE_BYTES`].
fn should_use_tiling(problem_size: usize) -> bool {
    let l3_bytes = std::env::var("TRUENO_L3_CACHE_MB")
        .ok()
        .and_then(|raw| raw.parse::<usize>().ok())
        .map_or(DEFAULT_L3_CACHE_BYTES, |mb| mb * 1024 * 1024);

    // a, b, and result: three f32 buffers of `problem_size` elements each.
    let working_set_bytes = problem_size * 3 * std::mem::size_of::<f32>();
    working_set_bytes > l3_bytes
}
65
/// Synthetic SIMD load generator: runs vector kernels (dot product,
/// elementwise ops, reductions) over pre-allocated buffers and tracks
/// latency/FLOP statistics.
pub struct SimdLoadBrick {
    // Which kernel family run_iteration executes.
    workload: WorkloadType,
    // Load intensity in [0.0, 1.0]; scales passes per iteration.
    intensity: f64,
    // True between start() and stop(); gates run_iteration.
    is_running: bool,
    // Number of f32 elements per input buffer.
    problem_size: usize,
    // Full-size SIMD operand A (used on the non-tiled path).
    vec_a: Vector<f32>,
    // Full-size SIMD operand B (used on the non-tiled path).
    vec_b: Vector<f32>,
    // Raw input A. NOTE(review): only read in new(); possibly dead — confirm.
    data_a: Vec<f32>,
    // Raw input B. NOTE(review): only read in new(); possibly dead — confirm.
    data_b: Vec<f32>,
    // Elements per L2-sized tile (see optimal_tile_size).
    tile_size: usize,
    // Pre-built (A, B) operand pairs, one per tile, for the tiled path.
    tile_vectors: Vec<(Vector<f32>, Vector<f32>)>,
    // Per-tile output scratch. NOTE(review): never written by the visible
    // tiled kernels — confirm it is still needed.
    tile_results: Vec<Vec<f32>>,
    // Result of the most recent dot/sum kernel (anti-DCE and for display).
    last_result: f64,
    // Cumulative FLOPs since start(); reset by start().
    flop_count: u64,
    // Most recent iteration latencies in milliseconds (capacity 100).
    latency_history: RingBuffer<f64>,
}
91
impl SimdLoadBrick {
    /// Creates a brick with pre-allocated input buffers, full-size SIMD
    /// vectors, and L2-sized tile views for `problem_size` f32 elements.
    ///
    /// All allocation happens here so the `run_iteration` hot path is
    /// allocation-free.
    pub fn new(problem_size: usize) -> Self {
        // Deterministic pseudo-data in [0, 1): pattern repeats every 1000.
        let input_a: Vec<f32> = (0..problem_size)
            .map(|i| (i % 1000) as f32 / 1000.0)
            .collect();
        // Same pattern phase-shifted by 500 so A and B differ elementwise.
        let input_b: Vec<f32> = (0..problem_size)
            .map(|i| ((i + 500) % 1000) as f32 / 1000.0)
            .collect();

        let tile_size = optimal_tile_size();

        // Partition inputs into L2-sized tiles; the last tile may be short.
        let num_tiles = problem_size.div_ceil(tile_size);
        let tile_vectors: Vec<(Vector<f32>, Vector<f32>)> = (0..num_tiles)
            .map(|i| {
                let start = i * tile_size;
                let end = (start + tile_size).min(problem_size);
                (
                    Vector::from_slice(&input_a[start..end]),
                    Vector::from_slice(&input_b[start..end]),
                )
            })
            .collect();

        // Zero-filled output scratch, sized to match each tile exactly.
        // NOTE(review): not written by the tiled kernels below — confirm.
        let tile_results: Vec<Vec<f32>> = (0..num_tiles)
            .map(|i| {
                let start = i * tile_size;
                let end = (start + tile_size).min(problem_size);
                vec![0.0f32; end - start]
            })
            .collect();

        Self {
            workload: WorkloadType::Gemm,
            intensity: 0.0,
            is_running: false,
            problem_size,
            vec_a: Vector::from_slice(&input_a),
            vec_b: Vector::from_slice(&input_b),
            data_a: input_a,
            data_b: input_b,
            tile_size,
            tile_vectors,
            tile_results,
            last_result: 0.0,
            flop_count: 0,
            latency_history: RingBuffer::new(100),
        }
    }

    /// Starts load generation and resets the cumulative FLOP counter.
    pub fn start(&mut self) {
        self.is_running = true;
        self.flop_count = 0;
    }

    /// Stops load generation; buffers stay allocated for a later restart.
    pub fn stop(&mut self) {
        self.is_running = false;
    }

    /// Whether the brick is currently generating load.
    pub fn is_running(&self) -> bool {
        self.is_running
    }

    /// Sets the load intensity, clamped into [0.0, 1.0].
    pub fn set_intensity(&mut self, intensity: f64) {
        self.intensity = intensity.clamp(0.0, 1.0);
    }

    /// Current load intensity in [0.0, 1.0].
    pub fn intensity(&self) -> f64 {
        self.intensity
    }

    /// Selects which kernel family `run_iteration` executes.
    pub fn set_workload(&mut self, workload: WorkloadType) {
        self.workload = workload;
    }

    /// Runs one batch of SIMD work for the current workload/intensity,
    /// records its latency in milliseconds, and returns the elapsed time.
    ///
    /// Returns `Duration::ZERO` without recording a sample when the brick
    /// is stopped or intensity is zero.
    pub fn run_iteration(&mut self) -> Duration {
        let start = std::time::Instant::now();

        if !self.is_running || self.intensity == 0.0 {
            return Duration::ZERO;
        }

        // Intensity 0.1 -> 1 pass, 1.0 -> 10 passes per call (min 1).
        let iterations = (self.intensity * 10.0).max(1.0) as usize;

        // Tile only when the working set exceeds L3 (see should_use_tiling).
        let use_tiling = should_use_tiling(self.problem_size) && !self.tile_vectors.is_empty();

        match self.workload {
            WorkloadType::Gemm | WorkloadType::All => {
                for _ in 0..iterations {
                    if use_tiling {
                        self.last_result = self.tiled_dot_product();
                    } else {
                        self.last_result = self.vec_a.dot(&self.vec_b).unwrap_or(0.0) as f64;
                    }
                }
                // Dot product: one multiply + one add per element.
                self.flop_count += (self.problem_size as u64 * 2) * iterations as u64;
            }
            WorkloadType::Elementwise => {
                for _ in 0..iterations {
                    if use_tiling {
                        self.tiled_elementwise_mul();
                    } else {
                        let result = self
                            .vec_a
                            .mul(&self.vec_b)
                            .expect("pre-allocated vectors have matching sizes");
                        // Keep the optimizer from eliding the kernel.
                        std::hint::black_box(&result);
                    }
                }
                // One multiply per element.
                self.flop_count += (self.problem_size as u64) * iterations as u64;
            }
            WorkloadType::Reduction => {
                for _ in 0..iterations {
                    if use_tiling {
                        self.last_result = self.tiled_sum();
                    } else {
                        self.last_result = self.vec_a.sum().unwrap_or(0.0) as f64;
                    }
                }
                // One add per element.
                self.flop_count += (self.problem_size as u64) * iterations as u64;
            }
            WorkloadType::Bandwidth => {
                for _ in 0..iterations {
                    if use_tiling {
                        self.tiled_elementwise_add();
                    } else {
                        let result = self
                            .vec_a
                            .add(&self.vec_b)
                            .expect("pre-allocated vectors have matching sizes");
                        // Keep the optimizer from eliding the kernel.
                        std::hint::black_box(&result);
                    }
                }
                // One add per element.
                self.flop_count += (self.problem_size as u64) * iterations as u64;
            }
            // Conv2d/Attention are approximated by the same dot-product kernel.
            WorkloadType::Conv2d | WorkloadType::Attention => {
                for _ in 0..iterations {
                    if use_tiling {
                        self.last_result = self.tiled_dot_product();
                    } else {
                        self.last_result = self.vec_a.dot(&self.vec_b).unwrap_or(0.0) as f64;
                    }
                }
                self.flop_count += (self.problem_size as u64 * 2) * iterations as u64;
            }
        }

        let elapsed = start.elapsed();
        // Stored in milliseconds; ring buffer keeps the last 100 samples.
        self.latency_history.push(elapsed.as_secs_f64() * 1000.0);
        elapsed
    }

    /// Dot product over the pre-built tiles, accumulated in f64.
    fn tiled_dot_product(&self) -> f64 {
        let mut total = 0.0f64;
        for (tile_a, tile_b) in &self.tile_vectors {
            total += tile_a.dot(tile_b).unwrap_or(0.0) as f64;
        }
        total
    }

    /// Sum reduction over operand-A tiles, accumulated in f64.
    fn tiled_sum(&self) -> f64 {
        let mut total = 0.0f64;
        for (tile_a, _) in &self.tile_vectors {
            total += tile_a.sum().unwrap_or(0.0) as f64;
        }
        total
    }

    /// Elementwise multiply per tile; result discarded via black_box.
    fn tiled_elementwise_mul(&self) {
        for (tile_a, tile_b) in &self.tile_vectors {
            let result = tile_a
                .mul(tile_b)
                .expect("pre-allocated tile vectors have matching sizes");
            std::hint::black_box(&result);
        }
    }

    /// Elementwise add per tile; result discarded via black_box.
    fn tiled_elementwise_add(&self) {
        for (tile_a, tile_b) in &self.tile_vectors {
            let result = tile_a
                .add(tile_b)
                .expect("pre-allocated tile vectors have matching sizes");
            std::hint::black_box(&result);
        }
    }

    /// Throughput in GFLOP/s: cumulative FLOPs over the summed latencies.
    ///
    /// NOTE(review): `flop_count` accumulates since `start()` while
    /// `latency_history` keeps only the most recent 100 samples, so once
    /// the ring wraps this ratio overstates throughput — confirm intent.
    pub fn gflops(&self) -> f64 {
        let total_time_s: f64 = self.latency_history.iter().sum::<f64>() / 1000.0;
        if total_time_s > 0.0 {
            (self.flop_count as f64) / total_time_s / 1e9
        } else {
            0.0
        }
    }

    /// Copy of the recorded latency samples (milliseconds, oldest first).
    pub fn latency_history_slice(&self) -> Vec<f64> {
        self.latency_history.iter().cloned().collect()
    }

    /// Iterations per second derived from the mean recorded latency;
    /// 0.0 when no samples exist.
    pub fn throughput_ops_per_sec(&self) -> f64 {
        let avg_latency = self.latency_history.mean();
        if avg_latency > 0.0 {
            1000.0 / avg_latency
        } else {
            0.0
        }
    }

    /// Result of the most recent dot/sum kernel.
    pub fn last_result(&self) -> f64 {
        self.last_result
    }

    /// Coefficient of variation of recorded latencies, in percent.
    /// Returns 0.0 with fewer than two samples or a non-positive mean.
    pub fn latency_cv(&self) -> f64 {
        let mean = self.latency_history.mean();
        if mean <= 0.0 || self.latency_history.len() < 2 {
            return 0.0;
        }

        // Population variance (divides by n, not n - 1).
        let variance: f64 = self
            .latency_history
            .iter()
            .map(|x| (x - mean).powi(2))
            .sum::<f64>()
            / self.latency_history.len() as f64;

        let std_dev = variance.sqrt();
        (std_dev / mean) * 100.0
    }
}
353
354impl Default for SimdLoadBrick {
355 fn default() -> Self {
356 Self::new(1_048_576)
357 }
358}
359
360impl Brick for SimdLoadBrick {
361 fn brick_name(&self) -> &'static str {
362 "simd_load"
363 }
364
365 fn assertions(&self) -> Vec<BrickAssertion> {
366 vec![
367 BrickAssertion::custom("buffers_preallocated", |_| true),
368 BrickAssertion::custom("intensity_in_range", |_| true),
369 BrickAssertion::max_latency_ms(100),
370 ]
371 }
372
373 fn budget(&self) -> BrickBudget {
374 BrickBudget {
375 collect_ms: 16,
376 layout_ms: 0,
377 render_ms: 0,
378 }
379 }
380
381 fn verify(&self) -> BrickVerification {
382 let mut v = BrickVerification::new();
383 for assertion in self.assertions() {
384 v.check(&assertion);
385 }
386 v
387 }
388
389 fn as_any(&self) -> &dyn Any {
390 self
391 }
392}
393
394impl Scorable for SimdLoadBrick {
402 fn score(&self) -> BrickScore {
403 let theoretical_gflops = 100.0;
406 let actual_gflops = self.gflops();
407 let perf_score = BrickScore::score_performance(actual_gflops, theoretical_gflops);
408
409 let speedup = match self.workload {
417 WorkloadType::Gemm | WorkloadType::Reduction => 6.0, WorkloadType::Elementwise => 4.0, WorkloadType::Bandwidth => 3.0, WorkloadType::Conv2d | WorkloadType::Attention | WorkloadType::All => 4.0, };
422 let speedup_score = BrickScore::score_speedup(speedup);
423 let efficiency_score = (10 + speedup_score).min(25);
425
426 let verification = self.verify();
428 let correctness_score = if verification.is_valid() {
429 20
430 } else {
431 (verification.score() * 20.0) as u8
432 };
433
434 let cv = self.latency_cv();
436 let stability_score = BrickScore::score_cv(cv);
437 let stability_total = if cv < 5.0 {
439 stability_score + 7 } else if cv < 10.0 {
441 stability_score + 3 } else {
443 stability_score
444 }
445 .min(15);
446
447 BrickScore::new(
448 perf_score,
449 efficiency_score,
450 correctness_score,
451 stability_total,
452 )
453 }
454}