// cbtop/cache_analysis.rs

//! Cache Efficiency Analysis Module (PMAT-025)
//!
//! Implements cache efficiency analysis for L1/L2/L3 cache behavior prediction
//! and optimization recommendations based on working set size.
//!
//! # Motivation
//!
//! §31.2 identifies memory bandwidth cliff at 4M elements (32MB) due to L3 overflow.
//! This module predicts and recommends optimal problem sizes.
//!
//! # Components
//!
//! | Component | Description | Use Case |
//! |-----------|-------------|----------|
//! | Working Set Estimator | Bytes = elements × sizeof(T) × factor | Predict cache fit |
//! | Cache Level Classifier | L1/L2/L3/RAM based on size | Identify bottleneck |
//! | Tiling Recommender | Optimal tile size for cache | Loop blocking advice |
//! | Bandwidth Estimator | Theoretical vs achieved BW | Efficiency score |
19
/// Cache level classification.
///
/// Variants are ordered from fastest/smallest (L1) to slowest/largest (Ram);
/// typical latency and bandwidth figures are exposed via the methods on this
/// type.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheLevel {
    /// L1 data cache (typically 32KB-64KB per core)
    L1,
    /// L2 cache (typically 256KB-1MB per core)
    L2,
    /// L3 cache (typically 4MB-64MB shared)
    L3,
    /// Main memory (RAM)
    Ram,
}
32
33impl CacheLevel {
34    /// Get human-readable name
35    pub fn name(&self) -> &'static str {
36        match self {
37            CacheLevel::L1 => "L1 cache",
38            CacheLevel::L2 => "L2 cache",
39            CacheLevel::L3 => "L3 cache",
40            CacheLevel::Ram => "main memory",
41        }
42    }
43
44    /// Get typical latency in CPU cycles
45    pub fn typical_latency_cycles(&self) -> u32 {
46        match self {
47            CacheLevel::L1 => 4,
48            CacheLevel::L2 => 12,
49            CacheLevel::L3 => 40,
50            CacheLevel::Ram => 200,
51        }
52    }
53
54    /// Get typical bandwidth relative to L1 (as fraction)
55    pub fn relative_bandwidth(&self) -> f64 {
56        match self {
57            CacheLevel::L1 => 1.0,
58            CacheLevel::L2 => 0.8,
59            CacheLevel::L3 => 0.5,
60            CacheLevel::Ram => 0.1,
61        }
62    }
63}
64
/// Cache hierarchy configuration.
///
/// All sizes are in bytes. Presets for common CPUs are available via
/// `zen4`, `sapphire_rapids`, and `apple_m2`; `Default` gives a generic
/// desktop-class hierarchy.
#[derive(Debug, Clone)]
pub struct CacheConfig {
    /// L1 data cache size in bytes per core
    pub l1_size: usize,
    /// L2 cache size in bytes per core
    pub l2_size: usize,
    /// L3 cache size in bytes (shared); 0 means the part has no L3
    pub l3_size: usize,
    /// Number of cores sharing L3
    pub l3_sharing: usize,
    /// Cache line size in bytes
    pub line_size: usize,
}
79
80impl Default for CacheConfig {
81    fn default() -> Self {
82        Self {
83            l1_size: 32 * 1024,        // 32KB
84            l2_size: 512 * 1024,       // 512KB
85            l3_size: 32 * 1024 * 1024, // 32MB
86            l3_sharing: 8,             // 8 cores share L3
87            line_size: 64,             // 64 bytes
88        }
89    }
90}
91
92impl CacheConfig {
93    /// Create config for AMD Zen4 (Ryzen 7000 series)
94    pub fn zen4() -> Self {
95        Self {
96            l1_size: 32 * 1024,        // 32KB per core
97            l2_size: 1024 * 1024,      // 1MB per core
98            l3_size: 32 * 1024 * 1024, // 32MB per CCD
99            l3_sharing: 8,
100            line_size: 64,
101        }
102    }
103
104    /// Create config for Intel Sapphire Rapids
105    pub fn sapphire_rapids() -> Self {
106        Self {
107            l1_size: 48 * 1024,        // 48KB per core
108            l2_size: 2 * 1024 * 1024,  // 2MB per core
109            l3_size: 60 * 1024 * 1024, // 60MB shared
110            l3_sharing: 16,
111            line_size: 64,
112        }
113    }
114
115    /// Create config for Apple M2
116    pub fn apple_m2() -> Self {
117        Self {
118            l1_size: 128 * 1024,       // 128KB per P-core
119            l2_size: 16 * 1024 * 1024, // 16MB shared L2
120            l3_size: 0,                // No L3
121            l3_sharing: 1,
122            line_size: 128, // 128-byte cache lines
123        }
124    }
125
126    /// Classify working set size to cache level
127    pub fn classify(&self, working_set_bytes: usize) -> CacheLevel {
128        if working_set_bytes <= self.l1_size * 3 / 4 {
129            CacheLevel::L1
130        } else if working_set_bytes <= self.l2_size * 3 / 4 {
131            CacheLevel::L2
132        } else if self.l3_size > 0 && working_set_bytes <= self.l3_size * 3 / 4 {
133            CacheLevel::L3
134        } else {
135            CacheLevel::Ram
136        }
137    }
138
139    /// Get effective L3 size per core (accounting for sharing)
140    pub fn l3_per_core(&self) -> usize {
141        if self.l3_sharing > 0 {
142            self.l3_size / self.l3_sharing
143        } else {
144            self.l3_size
145        }
146    }
147}
148
/// Working set analysis result.
///
/// Produced by `WorkingSetAnalysis::analyze`; bundles the classified cache
/// level with a utilization figure and a tiling recommendation.
#[derive(Debug, Clone)]
pub struct WorkingSetAnalysis {
    /// Total working set in bytes
    pub working_set_bytes: usize,
    /// Cache level where working set fits
    pub cache_level: CacheLevel,
    /// Cache utilization percentage (working_set / cache_size * 100);
    /// reported as a flat 100.0 when the set spills to main memory
    pub utilization_percent: f64,
    /// Expected bandwidth efficiency (relative to peak)
    pub expected_efficiency: f64,
    /// Whether tiling is recommended (set when working set exceeds L2)
    pub tiling_recommended: bool,
    /// Recommended tile size if tiling is recommended
    pub recommended_tile_bytes: Option<usize>,
}
165
166impl WorkingSetAnalysis {
167    /// Analyze working set against cache hierarchy
168    pub fn analyze(
169        elements: usize,
170        element_size: usize,
171        access_factor: f64,
172        config: &CacheConfig,
173    ) -> Self {
174        let working_set_bytes = (elements as f64 * element_size as f64 * access_factor) as usize;
175        let cache_level = config.classify(working_set_bytes);
176
177        let (utilization_percent, _cache_size) = match cache_level {
178            CacheLevel::L1 => (
179                (working_set_bytes as f64 / config.l1_size as f64) * 100.0,
180                config.l1_size,
181            ),
182            CacheLevel::L2 => (
183                (working_set_bytes as f64 / config.l2_size as f64) * 100.0,
184                config.l2_size,
185            ),
186            CacheLevel::L3 => (
187                (working_set_bytes as f64 / config.l3_size as f64) * 100.0,
188                config.l3_size,
189            ),
190            CacheLevel::Ram => (100.0, working_set_bytes),
191        };
192
193        let expected_efficiency = cache_level.relative_bandwidth();
194
195        // Recommend tiling if working set exceeds L2
196        let tiling_recommended = working_set_bytes > config.l2_size;
197
198        let recommended_tile_bytes = if tiling_recommended {
199            // Recommend tile size that fits 75% of L2
200            Some(config.l2_size * 3 / 4)
201        } else {
202            None
203        };
204
205        Self {
206            working_set_bytes,
207            cache_level,
208            utilization_percent,
209            expected_efficiency,
210            tiling_recommended,
211            recommended_tile_bytes,
212        }
213    }
214
215    /// Get recommendation string
216    pub fn recommendation(&self) -> String {
217        if self.tiling_recommended {
218            format!(
219                "Working set ({} bytes) exceeds L2. Recommend tiling with {} byte tiles for {} cache.",
220                self.working_set_bytes,
221                self.recommended_tile_bytes.unwrap_or(0),
222                CacheLevel::L2.name()
223            )
224        } else {
225            format!(
226                "Working set ({} bytes) fits in {}. No tiling needed.",
227                self.working_set_bytes,
228                self.cache_level.name()
229            )
230        }
231    }
232}
233
/// Working-set size in bytes for a dense matmul C = A × B
/// (A is m×k, B is k×n, C is m×n).
///
/// Uses saturating arithmetic so huge dimensions return `usize::MAX`
/// instead of panicking in debug builds (or silently wrapping in release).
pub fn matrix_working_set(m: usize, n: usize, k: usize, element_size: usize) -> usize {
    let a_size = m.saturating_mul(k).saturating_mul(element_size);
    let b_size = k.saturating_mul(n).saturating_mul(element_size);
    let c_size = m.saturating_mul(n).saturating_mul(element_size);
    a_size.saturating_add(b_size).saturating_add(c_size)
}
242
243/// Calculate optimal tile size for matrix multiply
244pub fn optimal_matmul_tile(config: &CacheConfig, element_size: usize) -> usize {
245    // For tiled matmul, need 3 tiles: A_tile, B_tile, C_tile
246    // Each tile is tile_size × tile_size
247    // Total: 3 × tile_size² × element_size ≤ L2 × 0.75
248    let target_bytes = config.l2_size * 3 / 4;
249    let max_tile_elements = target_bytes / (3 * element_size);
250    let tile_size = (max_tile_elements as f64).sqrt() as usize;
251
252    // Round down to multiple of cache line for alignment
253    let elements_per_line = config.line_size / element_size;
254    (tile_size / elements_per_line) * elements_per_line
255}
256
/// Working-set size in bytes for an elementwise operation that streams
/// `inputs` input arrays and `outputs` output arrays, each holding
/// `elements` items of `element_size` bytes.
///
/// Uses saturating arithmetic so extreme sizes return `usize::MAX` instead
/// of panicking in debug builds (or wrapping in release).
pub fn elementwise_working_set(
    elements: usize,
    inputs: usize,
    outputs: usize,
    element_size: usize,
) -> usize {
    elements
        .saturating_mul(inputs.saturating_add(outputs))
        .saturating_mul(element_size)
}
266
/// Streaming vs reuse pattern detection.
///
/// Coarse memory-access-pattern categories used to scale expected
/// bandwidth; see `AccessPattern::efficiency_factor`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AccessPattern {
    /// Data accessed once and discarded (streaming)
    Streaming,
    /// Data reused multiple times (cache-friendly)
    Reuse,
    /// Random access pattern (cache-unfriendly)
    Random,
}
277
278impl AccessPattern {
279    /// Estimate based on working set vs cache size
280    pub fn estimate(working_set: usize, iterations: usize, config: &CacheConfig) -> Self {
281        if iterations == 1 {
282            AccessPattern::Streaming
283        } else if working_set <= config.l2_size {
284            AccessPattern::Reuse
285        } else {
286            AccessPattern::Random
287        }
288    }
289
290    /// Get name
291    pub fn name(&self) -> &'static str {
292        match self {
293            AccessPattern::Streaming => "streaming",
294            AccessPattern::Reuse => "reuse",
295            AccessPattern::Random => "random",
296        }
297    }
298
299    /// Get expected efficiency multiplier
300    pub fn efficiency_factor(&self) -> f64 {
301        match self {
302            AccessPattern::Streaming => 0.5, // 50% - prefetching helps
303            AccessPattern::Reuse => 1.0,     // 100% - cache hits
304            AccessPattern::Random => 0.1,    // 10% - cache misses
305        }
306    }
307}
308
/// Bandwidth prediction result.
///
/// Produced by `BandwidthPrediction::predict`; combines the cache-level
/// and access-pattern penalties into an overall efficiency estimate.
#[derive(Debug, Clone)]
pub struct BandwidthPrediction {
    /// Peak theoretical bandwidth (GB/s)
    pub peak_bandwidth_gbps: f64,
    /// Predicted achievable bandwidth (GB/s)
    pub predicted_bandwidth_gbps: f64,
    /// Efficiency percentage
    pub efficiency_percent: f64,
    /// Limiting factor description
    pub limiting_factor: String,
}
321
322impl BandwidthPrediction {
323    /// Predict bandwidth for given access pattern
324    pub fn predict(
325        peak_bandwidth_gbps: f64,
326        working_set: usize,
327        access_pattern: AccessPattern,
328        config: &CacheConfig,
329    ) -> Self {
330        let cache_level = config.classify(working_set);
331        let cache_efficiency = cache_level.relative_bandwidth();
332        let pattern_efficiency = access_pattern.efficiency_factor();
333
334        let overall_efficiency = cache_efficiency * pattern_efficiency;
335        let predicted_bandwidth_gbps = peak_bandwidth_gbps * overall_efficiency;
336
337        let limiting_factor = if pattern_efficiency < cache_efficiency {
338            format!("{} access pattern", access_pattern.name())
339        } else {
340            format!("{} bandwidth", cache_level.name())
341        };
342
343        Self {
344            peak_bandwidth_gbps,
345            predicted_bandwidth_gbps,
346            efficiency_percent: overall_efficiency * 100.0,
347            limiting_factor,
348        }
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cache_level_classification() {
        let cfg = CacheConfig::default();
        assert_eq!(cfg.classify(1024), CacheLevel::L1);
        assert_eq!(cfg.classify(100 * 1024), CacheLevel::L2);
        assert_eq!(cfg.classify(10 * 1024 * 1024), CacheLevel::L3);
        assert_eq!(cfg.classify(100 * 1024 * 1024), CacheLevel::Ram);
    }

    #[test]
    fn test_working_set_analysis() {
        // 1000 elements × 4 bytes × 2.0 factor = 8KB → comfortably in L1.
        let result = WorkingSetAnalysis::analyze(1000, 4, 2.0, &CacheConfig::default());
        assert_eq!(result.cache_level, CacheLevel::L1);
        assert!(!result.tiling_recommended);
    }

    #[test]
    fn test_matrix_working_set() {
        // Three 1024² matrices of 4-byte elements: 12MB total.
        assert_eq!(matrix_working_set(1024, 1024, 1024, 4), 3 * 1024 * 1024 * 4);
    }

    #[test]
    fn test_optimal_tile_size() {
        let tile = optimal_matmul_tile(&CacheConfig::default(), 4);
        assert!(tile > 0);
        assert!(tile <= 512); // sane upper bound for typical L2 sizes
    }

    #[test]
    fn test_access_pattern() {
        let cfg = CacheConfig::default();
        assert_eq!(
            AccessPattern::estimate(1024, 1, &cfg),
            AccessPattern::Streaming
        );
        assert_eq!(
            AccessPattern::estimate(1024, 10, &cfg),
            AccessPattern::Reuse
        );
        assert_eq!(
            AccessPattern::estimate(100 * 1024 * 1024, 10, &cfg),
            AccessPattern::Random
        );
    }

    #[test]
    fn test_bandwidth_prediction() {
        // A 1KB working set in L1 with reuse should land near peak.
        let prediction =
            BandwidthPrediction::predict(100.0, 1024, AccessPattern::Reuse, &CacheConfig::default());
        assert!(prediction.efficiency_percent > 90.0);
    }
}