Skip to main content

cbtop/workload_characterization/
mod.rs

1//! Workload Characterization System (PMAT-035)
2//!
3//! Automatic workload classification based on runtime metrics.
4//!
5//! # Features
6//!
7//! - Feature extraction from workload metrics
8//! - Workload classification (GEMM, Bandwidth, Attention, etc.)
9//! - Similarity computation between workloads
10//! - Backend recommendation based on classification
11//!
12//! # Falsification Criteria (F1271-F1280)
13//!
14//! See `tests/workload_characterization_f1271.rs` for falsification tests.
15
/// Workload categories that a workload can be classified into.
///
/// Each variant notes whether it is considered compute-bound, memory-bound or mixed;
/// the same grouping is exposed programmatically through
/// [`WorkloadCategory::is_compute_bound`] and [`WorkloadCategory::is_memory_bound`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WorkloadCategory {
    /// Matrix multiplication (compute-bound)
    Gemm,
    /// Memory bandwidth test (memory-bound)
    Bandwidth,
    /// Attention mechanism (mixed)
    Attention,
    /// Convolution (compute-bound)
    Conv2d,
    /// Elementwise operations (memory-bound)
    Elementwise,
    /// Reduction operations (mixed)
    Reduction,
    /// Unknown workload type
    Unknown,
}

impl WorkloadCategory {
    /// Returns the lower-case string identifier of this category.
    pub fn name(&self) -> &'static str {
        match *self {
            WorkloadCategory::Gemm => "gemm",
            WorkloadCategory::Bandwidth => "bandwidth",
            WorkloadCategory::Attention => "attention",
            WorkloadCategory::Conv2d => "conv2d",
            WorkloadCategory::Elementwise => "elementwise",
            WorkloadCategory::Reduction => "reduction",
            WorkloadCategory::Unknown => "unknown",
        }
    }

    /// Returns `true` for the compute-bound categories (`Gemm` and `Conv2d`).
    pub fn is_compute_bound(&self) -> bool {
        [WorkloadCategory::Gemm, WorkloadCategory::Conv2d].contains(self)
    }

    /// Returns `true` for the memory-bound categories (`Bandwidth` and `Elementwise`).
    ///
    /// The mixed categories and `Unknown` are neither compute- nor memory-bound.
    pub fn is_memory_bound(&self) -> bool {
        [WorkloadCategory::Bandwidth, WorkloadCategory::Elementwise].contains(self)
    }

    /// Returns the typical arithmetic-intensity range of this category as `(low, high)`.
    ///
    /// `Unknown` spans the widest range, `(0.0, 100.0)`.
    pub fn typical_intensity_range(&self) -> (f64, f64) {
        match *self {
            WorkloadCategory::Gemm => (10.0, 100.0),
            WorkloadCategory::Bandwidth => (0.1, 1.0),
            WorkloadCategory::Attention => (1.0, 20.0),
            WorkloadCategory::Conv2d => (5.0, 50.0),
            WorkloadCategory::Elementwise => (0.1, 0.5),
            WorkloadCategory::Reduction => (0.5, 5.0),
            WorkloadCategory::Unknown => (0.0, 100.0),
        }
    }
}
72
/// Compute backend suggested for running a workload.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RecommendedBackend {
    /// CPU with SIMD (small sizes)
    CpuSimd,
    /// GPU (large sizes, parallel workloads)
    Gpu,
    /// Either CPU or GPU acceptable
    Either,
}

impl RecommendedBackend {
    /// Returns the snake_case string identifier of this backend.
    pub fn name(&self) -> &'static str {
        match *self {
            RecommendedBackend::CpuSimd => "cpu_simd",
            RecommendedBackend::Gpu => "gpu",
            RecommendedBackend::Either => "either",
        }
    }
}
94
/// Workload feature vector.
///
/// The seven fields are flattened, in declaration order, by [`WorkloadFeatures::to_vec`];
/// that order is also the index order expected by [`WorkloadFeatures::normalize`].
#[derive(Debug, Clone)]
pub struct WorkloadFeatures {
    /// Arithmetic intensity (FLOPs / Bytes)
    pub arithmetic_intensity: f64,
    /// Memory footprint in bytes
    pub memory_footprint: usize,
    /// Working set size in bytes
    pub working_set: usize,
    /// Access pattern score (0 = random, 1 = sequential)
    pub access_pattern: f64,
    /// Compute density (ops per cycle)
    pub compute_density: f64,
    /// Branch rate (branches per operation)
    pub branch_rate: f64,
    /// Data reuse factor
    pub data_reuse: f64,
}

impl Default for WorkloadFeatures {
    /// Defaults: intensity, compute density and data reuse of `1.0`, an access pattern of
    /// `0.5`, and zero for the memory sizes and the branch rate.
    fn default() -> Self {
        Self {
            arithmetic_intensity: 1.0,
            memory_footprint: 0,
            working_set: 0,
            access_pattern: 0.5,
            compute_density: 1.0,
            branch_rate: 0.0,
            data_reuse: 1.0,
        }
    }
}

impl WorkloadFeatures {
    /// Creates a feature vector holding the [`Default`] values.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the arithmetic intensity; negative values are raised to `0.0`.
    pub fn with_intensity(mut self, intensity: f64) -> Self {
        self.arithmetic_intensity = intensity.max(0.0);
        self
    }

    /// Sets the memory footprint and the working-set size (both in bytes).
    pub fn with_memory(mut self, footprint: usize, working_set: usize) -> Self {
        self.memory_footprint = footprint;
        self.working_set = working_set;
        self
    }

    /// Sets the access pattern (0 = random, 1 = sequential), clamped to `[0.0, 1.0]`.
    pub fn with_access_pattern(mut self, pattern: f64) -> Self {
        self.access_pattern = pattern.clamp(0.0, 1.0);
        self
    }

    /// Sets the compute density; negative values are raised to `0.0`.
    pub fn with_compute_density(mut self, density: f64) -> Self {
        self.compute_density = density.max(0.0);
        self
    }

    /// Sets the branch rate, clamped to `[0.0, 1.0]`.
    pub fn with_branch_rate(mut self, rate: f64) -> Self {
        self.branch_rate = rate.clamp(0.0, 1.0);
        self
    }

    /// Sets the data reuse factor; values below `1.0` are raised to `1.0`.
    pub fn with_data_reuse(mut self, reuse: f64) -> Self {
        self.data_reuse = reuse.max(1.0);
        self
    }

    /// Normalizes the features to Z-scores: `(value - mean) / std` per feature.
    ///
    /// `means` and `stds` are indexed in the same order as [`WorkloadFeatures::to_vec`].
    /// The result always has seven entries. An entry is `0.0` when its standard deviation
    /// is not greater than `1e-10` (or is NaN), and also when `means` or `stds` is too
    /// short to supply that feature's statistics, so short slices never panic.
    pub fn normalize(&self, means: &[f64], stds: &[f64]) -> Vec<f64> {
        let features = self.as_array();
        features
            .iter()
            .enumerate()
            .map(|(i, &value)| match (means.get(i), stds.get(i)) {
                // Only divide when the spread is meaningfully non-zero.
                (Some(&mean), Some(&sigma)) if sigma > 1e-10 => (value - mean) / sigma,
                // Degenerate spread or missing statistic: report "no deviation".
                _ => 0.0,
            })
            .collect()
    }

    /// Converts the features to a `Vec<f64>` in field declaration order.
    ///
    /// The two byte counts are converted with `as f64`.
    pub fn to_vec(&self) -> Vec<f64> {
        self.as_array().to_vec()
    }

    /// Computes the Euclidean distance to another feature vector.
    ///
    /// The distance is taken over the raw, unscaled values, so the byte-valued fields
    /// (`memory_footprint`, `working_set`) contribute in bytes alongside the other features.
    pub fn distance(&self, other: &Self) -> f64 {
        let a = self.as_array();
        let b = other.as_array();
        a.iter()
            .zip(b.iter())
            .map(|(x, y)| (x - y).powi(2))
            .sum::<f64>()
            .sqrt()
    }

    /// Computes the cosine similarity with another feature vector, clamped to `[-1.0, 1.0]`.
    ///
    /// Returns `0.0` when either vector has a norm below `1e-10`.
    pub fn cosine_similarity(&self, other: &Self) -> f64 {
        let a = self.as_array();
        let b = other.as_array();

        let dot: f64 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        let norm_a: f64 = a.iter().map(|x| x.powi(2)).sum::<f64>().sqrt();
        let norm_b: f64 = b.iter().map(|x| x.powi(2)).sum::<f64>().sqrt();

        // A (near-)zero vector has no direction; avoid dividing by zero.
        if norm_a < 1e-10 || norm_b < 1e-10 {
            return 0.0;
        }

        (dot / (norm_a * norm_b)).clamp(-1.0, 1.0)
    }

    /// Flattens the features into a fixed-size array in field declaration order.
    ///
    /// Shared by the numeric helpers so they do not allocate a `Vec` on every call.
    fn as_array(&self) -> [f64; 7] {
        [
            self.arithmetic_intensity,
            self.memory_footprint as f64,
            self.working_set as f64,
            self.access_pattern,
            self.compute_density,
            self.branch_rate,
            self.data_reuse,
        ]
    }
}
227
228/// Classification result
229#[derive(Debug, Clone)]
230pub struct ClassificationResult {
231    /// Predicted category
232    pub category: WorkloadCategory,
233    /// Confidence score (0-1)
234    pub confidence: f64,
235    /// Distance to nearest prototype
236    pub distance: f64,
237    /// Recommended backend
238    pub recommended_backend: RecommendedBackend,
239    /// Size threshold for GPU crossover
240    pub gpu_crossover_size: Option<usize>,
241}
242
243impl ClassificationResult {
244    /// Check if classification is confident
245    pub fn is_confident(&self) -> bool {
246        self.confidence > 0.7
247    }
248}
249
250/// Workload characterization system
251#[derive(Debug)]
252pub struct WorkloadCharacterizer {
253    /// Prototype features for known workloads
254    prototypes: Vec<(WorkloadCategory, WorkloadFeatures)>,
255    /// GPU crossover thresholds by category
256    gpu_thresholds: Vec<(WorkloadCategory, usize)>,
257}
258
259impl Default for WorkloadCharacterizer {
260    fn default() -> Self {
261        Self::new()
262    }
263}
264
265impl WorkloadCharacterizer {
266    /// Create new characterizer with default prototypes
267    pub fn new() -> Self {
268        let prototypes = vec![
269            // GEMM: High intensity, high compute density, high reuse
270            (
271                WorkloadCategory::Gemm,
272                WorkloadFeatures::new()
273                    .with_intensity(50.0)
274                    .with_compute_density(8.0)
275                    .with_access_pattern(0.8)
276                    .with_data_reuse(32.0)
277                    .with_branch_rate(0.01),
278            ),
279            // Bandwidth: Low intensity, sequential access
280            (
281                WorkloadCategory::Bandwidth,
282                WorkloadFeatures::new()
283                    .with_intensity(0.25)
284                    .with_compute_density(0.5)
285                    .with_access_pattern(1.0)
286                    .with_data_reuse(1.0)
287                    .with_branch_rate(0.0),
288            ),
289            // Attention: Medium intensity, mixed access
290            (
291                WorkloadCategory::Attention,
292                WorkloadFeatures::new()
293                    .with_intensity(5.0)
294                    .with_compute_density(4.0)
295                    .with_access_pattern(0.6)
296                    .with_data_reuse(4.0)
297                    .with_branch_rate(0.05),
298            ),
299            // Conv2D: High intensity, sliding window access
300            (
301                WorkloadCategory::Conv2d,
302                WorkloadFeatures::new()
303                    .with_intensity(20.0)
304                    .with_compute_density(6.0)
305                    .with_access_pattern(0.7)
306                    .with_data_reuse(9.0)
307                    .with_branch_rate(0.02),
308            ),
309            // Elementwise: Very low intensity, sequential
310            (
311                WorkloadCategory::Elementwise,
312                WorkloadFeatures::new()
313                    .with_intensity(0.125)
314                    .with_compute_density(1.0)
315                    .with_access_pattern(1.0)
316                    .with_data_reuse(1.0)
317                    .with_branch_rate(0.0),
318            ),
319            // Reduction: Low intensity, tree pattern
320            (
321                WorkloadCategory::Reduction,
322                WorkloadFeatures::new()
323                    .with_intensity(1.0)
324                    .with_compute_density(2.0)
325                    .with_access_pattern(0.5)
326                    .with_data_reuse(2.0)
327                    .with_branch_rate(0.1),
328            ),
329        ];
330
331        let gpu_thresholds = vec![
332            (WorkloadCategory::Gemm, 10_000),         // GPU wins at ~100x100
333            (WorkloadCategory::Bandwidth, 1_000_000), // GPU wins at 1M elements
334            (WorkloadCategory::Attention, 50_000),    // GPU wins at ~224 seq len
335            (WorkloadCategory::Conv2d, 100_000),      // GPU wins at moderate sizes
336            (WorkloadCategory::Elementwise, 500_000), // GPU wins at 500K elements
337            (WorkloadCategory::Reduction, 100_000),   // GPU wins at 100K elements
338        ];
339
340        Self {
341            prototypes,
342            gpu_thresholds,
343        }
344    }
345
346    /// Extract features from workload metrics
347    pub fn extract_features(
348        &self,
349        flops: f64,
350        bytes_accessed: f64,
351        memory_footprint: usize,
352        working_set: usize,
353    ) -> WorkloadFeatures {
354        let intensity = if bytes_accessed > 0.0 {
355            flops / bytes_accessed
356        } else {
357            0.0
358        };
359
360        WorkloadFeatures::new()
361            .with_intensity(intensity)
362            .with_memory(memory_footprint, working_set)
363    }
364
365    /// Classify workload based on features
366    pub fn classify(&self, features: &WorkloadFeatures) -> ClassificationResult {
367        let mut best_category = WorkloadCategory::Unknown;
368        let mut best_distance = f64::MAX;
369        let mut second_best_distance = f64::MAX;
370
371        for (category, prototype) in &self.prototypes {
372            let distance = features.distance(prototype);
373            if distance < best_distance {
374                second_best_distance = best_distance;
375                best_distance = distance;
376                best_category = *category;
377            } else if distance < second_best_distance {
378                second_best_distance = distance;
379            }
380        }
381
382        // Confidence based on distance ratio
383        let confidence = if second_best_distance > 1e-10 {
384            (1.0 - best_distance / second_best_distance).clamp(0.0, 1.0)
385        } else {
386            1.0
387        };
388
389        // Recommend backend
390        let recommended_backend = self.recommend_backend(best_category, features.memory_footprint);
391
392        // Get GPU crossover threshold
393        let gpu_crossover_size = self
394            .gpu_thresholds
395            .iter()
396            .find(|(c, _)| *c == best_category)
397            .map(|(_, t)| *t);
398
399        ClassificationResult {
400            category: best_category,
401            confidence,
402            distance: best_distance,
403            recommended_backend,
404            gpu_crossover_size,
405        }
406    }
407
408    /// Compute similarity between two workloads
409    pub fn workload_similarity(&self, a: &WorkloadFeatures, b: &WorkloadFeatures) -> f64 {
410        // Normalize cosine similarity to 0-1 range
411        (a.cosine_similarity(b) + 1.0) / 2.0
412    }
413
414    /// Recommend backend for workload
415    pub fn recommend_backend(&self, category: WorkloadCategory, size: usize) -> RecommendedBackend {
416        let threshold = self
417            .gpu_thresholds
418            .iter()
419            .find(|(c, _)| *c == category)
420            .map(|(_, t)| *t)
421            .unwrap_or(100_000);
422
423        if size < threshold / 2 {
424            RecommendedBackend::CpuSimd
425        } else if size > threshold * 2 {
426            RecommendedBackend::Gpu
427        } else {
428            RecommendedBackend::Either
429        }
430    }
431
432    /// Predict GPU crossover size
433    pub fn predict_crossover(&self, category: WorkloadCategory) -> Option<usize> {
434        self.gpu_thresholds
435            .iter()
436            .find(|(c, _)| *c == category)
437            .map(|(_, t)| *t)
438    }
439
440    /// Add custom prototype
441    pub fn add_prototype(&mut self, category: WorkloadCategory, features: WorkloadFeatures) {
442        self.prototypes.push((category, features));
443    }
444
445    /// Get all prototypes
446    pub fn get_prototypes(&self) -> &[(WorkloadCategory, WorkloadFeatures)] {
447        &self.prototypes
448    }
449}
450
451#[cfg(test)]
452mod tests;