cbtop/workload_characterization/
/// Kernel families a workload can be assigned to.
///
/// `Unknown` is the value the classifier in this module starts from, so it is
/// what callers see when no prototype matched.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WorkloadCategory {
    /// Labelled `"gemm"`; flagged as compute-bound.
    Gemm,
    /// Labelled `"bandwidth"`; flagged as memory-bound.
    Bandwidth,
    /// Labelled `"attention"`; flagged as neither compute- nor memory-bound.
    Attention,
    /// Labelled `"conv2d"`; flagged as compute-bound.
    Conv2d,
    /// Labelled `"elementwise"`; flagged as memory-bound.
    Elementwise,
    /// Labelled `"reduction"`; flagged as neither compute- nor memory-bound.
    Reduction,
    /// Labelled `"unknown"`; flagged as neither compute- nor memory-bound.
    Unknown,
}

impl WorkloadCategory {
    /// Returns the stable lowercase label for this category.
    pub fn name(&self) -> &'static str {
        match *self {
            WorkloadCategory::Gemm => "gemm",
            WorkloadCategory::Bandwidth => "bandwidth",
            WorkloadCategory::Attention => "attention",
            WorkloadCategory::Conv2d => "conv2d",
            WorkloadCategory::Elementwise => "elementwise",
            WorkloadCategory::Reduction => "reduction",
            WorkloadCategory::Unknown => "unknown",
        }
    }

    /// Returns `true` for the categories treated as compute-bound
    /// (`Gemm` and `Conv2d`).
    pub fn is_compute_bound(&self) -> bool {
        // Spelled out per variant so that adding a variant forces a decision here.
        match *self {
            WorkloadCategory::Gemm | WorkloadCategory::Conv2d => true,
            WorkloadCategory::Bandwidth
            | WorkloadCategory::Attention
            | WorkloadCategory::Elementwise
            | WorkloadCategory::Reduction
            | WorkloadCategory::Unknown => false,
        }
    }

    /// Returns `true` for the categories treated as memory-bound
    /// (`Bandwidth` and `Elementwise`).
    pub fn is_memory_bound(&self) -> bool {
        match *self {
            WorkloadCategory::Bandwidth | WorkloadCategory::Elementwise => true,
            WorkloadCategory::Gemm
            | WorkloadCategory::Attention
            | WorkloadCategory::Conv2d
            | WorkloadCategory::Reduction
            | WorkloadCategory::Unknown => false,
        }
    }

    /// Returns the `(low, high)` arithmetic-intensity bounds associated with
    /// this category.
    ///
    /// NOTE(review): the unit is presumably FLOPs per byte, matching how
    /// intensity is derived elsewhere in this module; confirm with the author.
    pub fn typical_intensity_range(&self) -> (f64, f64) {
        let (low, high) = match *self {
            WorkloadCategory::Gemm => (10.0, 100.0),
            WorkloadCategory::Bandwidth => (0.1, 1.0),
            WorkloadCategory::Attention => (1.0, 20.0),
            WorkloadCategory::Conv2d => (5.0, 50.0),
            WorkloadCategory::Elementwise => (0.1, 0.5),
            WorkloadCategory::Reduction => (0.5, 5.0),
            WorkloadCategory::Unknown => (0.0, 100.0),
        };
        (low, high)
    }
}
72
/// Execution backend suggested for a workload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecommendedBackend {
    /// Labelled `"cpu_simd"`.
    CpuSimd,
    /// Labelled `"gpu"`.
    Gpu,
    /// Labelled `"either"`; no clear preference between the two backends.
    Either,
}

impl RecommendedBackend {
    /// Returns the stable lowercase label for this backend.
    pub fn name(&self) -> &'static str {
        match *self {
            RecommendedBackend::CpuSimd => "cpu_simd",
            RecommendedBackend::Gpu => "gpu",
            RecommendedBackend::Either => "either",
        }
    }
}
94
/// Numeric feature vector describing one workload.
///
/// The `with_*` builders sanitise their inputs (see each method), but every
/// field is public, so values assigned directly are not range-checked.
#[derive(Debug, Clone)]
pub struct WorkloadFeatures {
    /// Arithmetic intensity; the builder keeps it non-negative.
    pub arithmetic_intensity: f64,
    /// Total memory footprint.
    /// NOTE(review): presumably bytes; the unit is not established in this file.
    pub memory_footprint: usize,
    /// Working-set size, in the same (unconfirmed) unit as `memory_footprint`.
    pub working_set: usize,
    /// Access-pattern score; the builder clamps it to `[0.0, 1.0]`.
    pub access_pattern: f64,
    /// Compute density; the builder keeps it non-negative.
    pub compute_density: f64,
    /// Branch rate; the builder clamps it to `[0.0, 1.0]`.
    pub branch_rate: f64,
    /// Data-reuse factor; the builder keeps it at `1.0` or above.
    pub data_reuse: f64,
}

impl Default for WorkloadFeatures {
    /// Neutral starting point: unit intensity/density/reuse, a mid-range
    /// access pattern, no branching and no memory usage.
    fn default() -> Self {
        Self {
            arithmetic_intensity: 1.0,
            memory_footprint: 0,
            working_set: 0,
            access_pattern: 0.5,
            compute_density: 1.0,
            branch_rate: 0.0,
            data_reuse: 1.0,
        }
    }
}

impl WorkloadFeatures {
    /// Number of scalar components in the flattened feature vector.
    const DIMENSIONS: usize = 7;

    /// Standard deviations at or below this value are treated as "no spread".
    const MIN_STD: f64 = 1e-10;

    /// Vector norms below this value are treated as zero length.
    const MIN_NORM: f64 = 1e-10;

    /// Creates a feature set populated with the [`Default`] values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the arithmetic intensity; negative (and NaN) inputs become `0.0`.
    pub fn with_intensity(mut self, intensity: f64) -> Self {
        self.arithmetic_intensity = intensity.max(0.0);
        self
    }

    /// Sets the memory footprint and working-set size.
    pub fn with_memory(mut self, footprint: usize, working_set: usize) -> Self {
        self.memory_footprint = footprint;
        self.working_set = working_set;
        self
    }

    /// Sets the access-pattern score, clamped to `[0.0, 1.0]`.
    ///
    /// A NaN input is stored as NaN (`f64::clamp` propagates it).
    pub fn with_access_pattern(mut self, pattern: f64) -> Self {
        self.access_pattern = pattern.clamp(0.0, 1.0);
        self
    }

    /// Sets the compute density; negative (and NaN) inputs become `0.0`.
    pub fn with_compute_density(mut self, density: f64) -> Self {
        self.compute_density = density.max(0.0);
        self
    }

    /// Sets the branch rate, clamped to `[0.0, 1.0]`.
    ///
    /// A NaN input is stored as NaN (`f64::clamp` propagates it).
    pub fn with_branch_rate(mut self, rate: f64) -> Self {
        self.branch_rate = rate.clamp(0.0, 1.0);
        self
    }

    /// Sets the data-reuse factor; inputs below `1.0` (and NaN) become `1.0`.
    pub fn with_data_reuse(mut self, reuse: f64) -> Self {
        self.data_reuse = reuse.max(1.0);
        self
    }

    /// Returns the z-score of every component: `(value - mean) / std`.
    ///
    /// `means` and `stds` are indexed in the same order as [`Self::to_vec`].
    /// A component yields `0.0` when its standard deviation is not greater
    /// than `1e-10`, or when either slice has no entry for it. The result
    /// therefore always has seven elements and the call never panics, even if
    /// the statistics slices are shorter than the feature vector.
    pub fn normalize(&self, means: &[f64], stds: &[f64]) -> Vec<f64> {
        self.components()
            .iter()
            .enumerate()
            .map(|(index, &value)| match (means.get(index), stds.get(index)) {
                (Some(&mean), Some(&std)) if std > Self::MIN_STD => (value - mean) / std,
                // Missing statistics or (near-)zero spread: nothing to scale by.
                _ => 0.0,
            })
            .collect()
    }

    /// Flattens the features into a vector in declaration order:
    /// intensity, footprint, working set, access pattern, compute density,
    /// branch rate, data reuse.
    pub fn to_vec(&self) -> Vec<f64> {
        self.components().to_vec()
    }

    /// Euclidean distance between the two raw (un-normalised) feature vectors.
    pub fn distance(&self, other: &Self) -> f64 {
        let lhs = self.components();
        let rhs = other.components();
        lhs.iter()
            .zip(rhs.iter())
            .map(|(x, y)| (x - y).powi(2))
            .sum::<f64>()
            .sqrt()
    }

    /// Cosine similarity of the two raw feature vectors, clamped to
    /// `[-1.0, 1.0]`.
    ///
    /// Returns `0.0` when either vector has a norm below `1e-10`, since the
    /// angle is undefined for a zero-length vector.
    pub fn cosine_similarity(&self, other: &Self) -> f64 {
        let lhs = self.components();
        let rhs = other.components();

        let dot: f64 = lhs.iter().zip(rhs.iter()).map(|(x, y)| x * y).sum();
        let norm_lhs = lhs.iter().map(|x| x.powi(2)).sum::<f64>().sqrt();
        let norm_rhs = rhs.iter().map(|x| x.powi(2)).sum::<f64>().sqrt();

        if norm_lhs < Self::MIN_NORM || norm_rhs < Self::MIN_NORM {
            0.0
        } else {
            (dot / (norm_lhs * norm_rhs)).clamp(-1.0, 1.0)
        }
    }

    /// Fixed-size view of the feature vector, used internally so the metric
    /// helpers do not need a heap allocation per call.
    fn components(&self) -> [f64; Self::DIMENSIONS] {
        [
            self.arithmetic_intensity,
            // `usize -> f64` loses precision above 2^53; acceptable for a metric.
            self.memory_footprint as f64,
            self.working_set as f64,
            self.access_pattern,
            self.compute_density,
            self.branch_rate,
            self.data_reuse,
        ]
    }
}
227
228#[derive(Debug, Clone)]
230pub struct ClassificationResult {
231 pub category: WorkloadCategory,
233 pub confidence: f64,
235 pub distance: f64,
237 pub recommended_backend: RecommendedBackend,
239 pub gpu_crossover_size: Option<usize>,
241}
242
243impl ClassificationResult {
244 pub fn is_confident(&self) -> bool {
246 self.confidence > 0.7
247 }
248}
249
250#[derive(Debug)]
252pub struct WorkloadCharacterizer {
253 prototypes: Vec<(WorkloadCategory, WorkloadFeatures)>,
255 gpu_thresholds: Vec<(WorkloadCategory, usize)>,
257}
258
259impl Default for WorkloadCharacterizer {
260 fn default() -> Self {
261 Self::new()
262 }
263}
264
265impl WorkloadCharacterizer {
266 pub fn new() -> Self {
268 let prototypes = vec![
269 (
271 WorkloadCategory::Gemm,
272 WorkloadFeatures::new()
273 .with_intensity(50.0)
274 .with_compute_density(8.0)
275 .with_access_pattern(0.8)
276 .with_data_reuse(32.0)
277 .with_branch_rate(0.01),
278 ),
279 (
281 WorkloadCategory::Bandwidth,
282 WorkloadFeatures::new()
283 .with_intensity(0.25)
284 .with_compute_density(0.5)
285 .with_access_pattern(1.0)
286 .with_data_reuse(1.0)
287 .with_branch_rate(0.0),
288 ),
289 (
291 WorkloadCategory::Attention,
292 WorkloadFeatures::new()
293 .with_intensity(5.0)
294 .with_compute_density(4.0)
295 .with_access_pattern(0.6)
296 .with_data_reuse(4.0)
297 .with_branch_rate(0.05),
298 ),
299 (
301 WorkloadCategory::Conv2d,
302 WorkloadFeatures::new()
303 .with_intensity(20.0)
304 .with_compute_density(6.0)
305 .with_access_pattern(0.7)
306 .with_data_reuse(9.0)
307 .with_branch_rate(0.02),
308 ),
309 (
311 WorkloadCategory::Elementwise,
312 WorkloadFeatures::new()
313 .with_intensity(0.125)
314 .with_compute_density(1.0)
315 .with_access_pattern(1.0)
316 .with_data_reuse(1.0)
317 .with_branch_rate(0.0),
318 ),
319 (
321 WorkloadCategory::Reduction,
322 WorkloadFeatures::new()
323 .with_intensity(1.0)
324 .with_compute_density(2.0)
325 .with_access_pattern(0.5)
326 .with_data_reuse(2.0)
327 .with_branch_rate(0.1),
328 ),
329 ];
330
331 let gpu_thresholds = vec![
332 (WorkloadCategory::Gemm, 10_000), (WorkloadCategory::Bandwidth, 1_000_000), (WorkloadCategory::Attention, 50_000), (WorkloadCategory::Conv2d, 100_000), (WorkloadCategory::Elementwise, 500_000), (WorkloadCategory::Reduction, 100_000), ];
339
340 Self {
341 prototypes,
342 gpu_thresholds,
343 }
344 }
345
346 pub fn extract_features(
348 &self,
349 flops: f64,
350 bytes_accessed: f64,
351 memory_footprint: usize,
352 working_set: usize,
353 ) -> WorkloadFeatures {
354 let intensity = if bytes_accessed > 0.0 {
355 flops / bytes_accessed
356 } else {
357 0.0
358 };
359
360 WorkloadFeatures::new()
361 .with_intensity(intensity)
362 .with_memory(memory_footprint, working_set)
363 }
364
365 pub fn classify(&self, features: &WorkloadFeatures) -> ClassificationResult {
367 let mut best_category = WorkloadCategory::Unknown;
368 let mut best_distance = f64::MAX;
369 let mut second_best_distance = f64::MAX;
370
371 for (category, prototype) in &self.prototypes {
372 let distance = features.distance(prototype);
373 if distance < best_distance {
374 second_best_distance = best_distance;
375 best_distance = distance;
376 best_category = *category;
377 } else if distance < second_best_distance {
378 second_best_distance = distance;
379 }
380 }
381
382 let confidence = if second_best_distance > 1e-10 {
384 (1.0 - best_distance / second_best_distance).clamp(0.0, 1.0)
385 } else {
386 1.0
387 };
388
389 let recommended_backend = self.recommend_backend(best_category, features.memory_footprint);
391
392 let gpu_crossover_size = self
394 .gpu_thresholds
395 .iter()
396 .find(|(c, _)| *c == best_category)
397 .map(|(_, t)| *t);
398
399 ClassificationResult {
400 category: best_category,
401 confidence,
402 distance: best_distance,
403 recommended_backend,
404 gpu_crossover_size,
405 }
406 }
407
408 pub fn workload_similarity(&self, a: &WorkloadFeatures, b: &WorkloadFeatures) -> f64 {
410 (a.cosine_similarity(b) + 1.0) / 2.0
412 }
413
414 pub fn recommend_backend(&self, category: WorkloadCategory, size: usize) -> RecommendedBackend {
416 let threshold = self
417 .gpu_thresholds
418 .iter()
419 .find(|(c, _)| *c == category)
420 .map(|(_, t)| *t)
421 .unwrap_or(100_000);
422
423 if size < threshold / 2 {
424 RecommendedBackend::CpuSimd
425 } else if size > threshold * 2 {
426 RecommendedBackend::Gpu
427 } else {
428 RecommendedBackend::Either
429 }
430 }
431
432 pub fn predict_crossover(&self, category: WorkloadCategory) -> Option<usize> {
434 self.gpu_thresholds
435 .iter()
436 .find(|(c, _)| *c == category)
437 .map(|(_, t)| *t)
438 }
439
440 pub fn add_prototype(&mut self, category: WorkloadCategory, features: WorkloadFeatures) {
442 self.prototypes.push((category, features));
443 }
444
445 pub fn get_prototypes(&self) -> &[(WorkloadCategory, WorkloadFeatures)] {
447 &self.prototypes
448 }
449}
450
451#[cfg(test)]
452mod tests;