/// A tier of the memory hierarchy that a working set can be served from.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheLevel {
    /// First-level cache.
    L1,
    /// Second-level cache.
    L2,
    /// Third-level cache.
    L3,
    /// Main memory, i.e. no cache level applies.
    Ram,
}

impl CacheLevel {
    /// Single table of per-level constants, laid out as
    /// `(label, typical latency in cycles, relative bandwidth)`.
    ///
    /// Keeping the three values side by side makes it hard to update one
    /// accessor and forget the others.
    fn profile(self) -> (&'static str, u32, f64) {
        match self {
            Self::L1 => ("L1 cache", 4, 1.0),
            Self::L2 => ("L2 cache", 12, 0.8),
            Self::L3 => ("L3 cache", 40, 0.5),
            Self::Ram => ("main memory", 200, 0.1),
        }
    }

    /// Returns the human-readable label of this level
    /// (for example `"L2 cache"` or `"main memory"`).
    pub fn name(&self) -> &'static str {
        let (label, _, _) = self.profile();
        label
    }

    /// Returns the typical access latency of this level, in cycles.
    pub fn typical_latency_cycles(&self) -> u32 {
        let (_, cycles, _) = self.profile();
        cycles
    }

    /// Returns the bandwidth of this level on a scale where L1 is `1.0`.
    pub fn relative_bandwidth(&self) -> f64 {
        let (_, _, bandwidth) = self.profile();
        bandwidth
    }
}
64
65#[derive(Debug, Clone)]
67pub struct CacheConfig {
68 pub l1_size: usize,
70 pub l2_size: usize,
72 pub l3_size: usize,
74 pub l3_sharing: usize,
76 pub line_size: usize,
78}
79
80impl Default for CacheConfig {
81 fn default() -> Self {
82 Self {
83 l1_size: 32 * 1024, l2_size: 512 * 1024, l3_size: 32 * 1024 * 1024, l3_sharing: 8, line_size: 64, }
89 }
90}
91
92impl CacheConfig {
93 pub fn zen4() -> Self {
95 Self {
96 l1_size: 32 * 1024, l2_size: 1024 * 1024, l3_size: 32 * 1024 * 1024, l3_sharing: 8,
100 line_size: 64,
101 }
102 }
103
104 pub fn sapphire_rapids() -> Self {
106 Self {
107 l1_size: 48 * 1024, l2_size: 2 * 1024 * 1024, l3_size: 60 * 1024 * 1024, l3_sharing: 16,
111 line_size: 64,
112 }
113 }
114
115 pub fn apple_m2() -> Self {
117 Self {
118 l1_size: 128 * 1024, l2_size: 16 * 1024 * 1024, l3_size: 0, l3_sharing: 1,
122 line_size: 128, }
124 }
125
126 pub fn classify(&self, working_set_bytes: usize) -> CacheLevel {
128 if working_set_bytes <= self.l1_size * 3 / 4 {
129 CacheLevel::L1
130 } else if working_set_bytes <= self.l2_size * 3 / 4 {
131 CacheLevel::L2
132 } else if self.l3_size > 0 && working_set_bytes <= self.l3_size * 3 / 4 {
133 CacheLevel::L3
134 } else {
135 CacheLevel::Ram
136 }
137 }
138
139 pub fn l3_per_core(&self) -> usize {
141 if self.l3_sharing > 0 {
142 self.l3_size / self.l3_sharing
143 } else {
144 self.l3_size
145 }
146 }
147}
148
149#[derive(Debug, Clone)]
151pub struct WorkingSetAnalysis {
152 pub working_set_bytes: usize,
154 pub cache_level: CacheLevel,
156 pub utilization_percent: f64,
158 pub expected_efficiency: f64,
160 pub tiling_recommended: bool,
162 pub recommended_tile_bytes: Option<usize>,
164}
165
166impl WorkingSetAnalysis {
167 pub fn analyze(
169 elements: usize,
170 element_size: usize,
171 access_factor: f64,
172 config: &CacheConfig,
173 ) -> Self {
174 let working_set_bytes = (elements as f64 * element_size as f64 * access_factor) as usize;
175 let cache_level = config.classify(working_set_bytes);
176
177 let (utilization_percent, _cache_size) = match cache_level {
178 CacheLevel::L1 => (
179 (working_set_bytes as f64 / config.l1_size as f64) * 100.0,
180 config.l1_size,
181 ),
182 CacheLevel::L2 => (
183 (working_set_bytes as f64 / config.l2_size as f64) * 100.0,
184 config.l2_size,
185 ),
186 CacheLevel::L3 => (
187 (working_set_bytes as f64 / config.l3_size as f64) * 100.0,
188 config.l3_size,
189 ),
190 CacheLevel::Ram => (100.0, working_set_bytes),
191 };
192
193 let expected_efficiency = cache_level.relative_bandwidth();
194
195 let tiling_recommended = working_set_bytes > config.l2_size;
197
198 let recommended_tile_bytes = if tiling_recommended {
199 Some(config.l2_size * 3 / 4)
201 } else {
202 None
203 };
204
205 Self {
206 working_set_bytes,
207 cache_level,
208 utilization_percent,
209 expected_efficiency,
210 tiling_recommended,
211 recommended_tile_bytes,
212 }
213 }
214
215 pub fn recommendation(&self) -> String {
217 if self.tiling_recommended {
218 format!(
219 "Working set ({} bytes) exceeds L2. Recommend tiling with {} byte tiles for {} cache.",
220 self.working_set_bytes,
221 self.recommended_tile_bytes.unwrap_or(0),
222 CacheLevel::L2.name()
223 )
224 } else {
225 format!(
226 "Working set ({} bytes) fits in {}. No tiling needed.",
227 self.working_set_bytes,
228 self.cache_level.name()
229 )
230 }
231 }
232}
233
/// Returns the combined size in bytes of the three matrices of a matrix
/// multiply: `A` (`m x k`), `B` (`k x n`) and `C` (`m x n`).
///
/// The arithmetic saturates at `usize::MAX`. Unchecked multiplication would
/// panic in debug builds and silently wrap to a small, misleading size in
/// release builds when the dimensions are very large.
pub fn matrix_working_set(m: usize, n: usize, k: usize, element_size: usize) -> usize {
    let matrix_bytes =
        |rows: usize, cols: usize| rows.saturating_mul(cols).saturating_mul(element_size);

    matrix_bytes(m, k)
        .saturating_add(matrix_bytes(k, n))
        .saturating_add(matrix_bytes(m, n))
}
242
243pub fn optimal_matmul_tile(config: &CacheConfig, element_size: usize) -> usize {
245 let target_bytes = config.l2_size * 3 / 4;
249 let max_tile_elements = target_bytes / (3 * element_size);
250 let tile_size = (max_tile_elements as f64).sqrt() as usize;
251
252 let elements_per_line = config.line_size / element_size;
254 (tile_size / elements_per_line) * elements_per_line
255}
256
/// Returns the size in bytes of an element-wise operation's working set:
/// `elements * (inputs + outputs) * element_size`.
///
/// The arithmetic saturates at `usize::MAX`. Unchecked arithmetic would panic
/// in debug builds and silently wrap to a small, misleading size in release
/// builds for very large inputs.
pub fn elementwise_working_set(
    elements: usize,
    inputs: usize,
    outputs: usize,
    element_size: usize,
) -> usize {
    let arrays = inputs.saturating_add(outputs);
    elements.saturating_mul(arrays).saturating_mul(element_size)
}
266
267#[derive(Debug, Clone, Copy, PartialEq, Eq)]
269pub enum AccessPattern {
270 Streaming,
272 Reuse,
274 Random,
276}
277
278impl AccessPattern {
279 pub fn estimate(working_set: usize, iterations: usize, config: &CacheConfig) -> Self {
281 if iterations == 1 {
282 AccessPattern::Streaming
283 } else if working_set <= config.l2_size {
284 AccessPattern::Reuse
285 } else {
286 AccessPattern::Random
287 }
288 }
289
290 pub fn name(&self) -> &'static str {
292 match self {
293 AccessPattern::Streaming => "streaming",
294 AccessPattern::Reuse => "reuse",
295 AccessPattern::Random => "random",
296 }
297 }
298
299 pub fn efficiency_factor(&self) -> f64 {
301 match self {
302 AccessPattern::Streaming => 0.5, AccessPattern::Reuse => 1.0, AccessPattern::Random => 0.1, }
306 }
307}
308
309#[derive(Debug, Clone)]
311pub struct BandwidthPrediction {
312 pub peak_bandwidth_gbps: f64,
314 pub predicted_bandwidth_gbps: f64,
316 pub efficiency_percent: f64,
318 pub limiting_factor: String,
320}
321
322impl BandwidthPrediction {
323 pub fn predict(
325 peak_bandwidth_gbps: f64,
326 working_set: usize,
327 access_pattern: AccessPattern,
328 config: &CacheConfig,
329 ) -> Self {
330 let cache_level = config.classify(working_set);
331 let cache_efficiency = cache_level.relative_bandwidth();
332 let pattern_efficiency = access_pattern.efficiency_factor();
333
334 let overall_efficiency = cache_efficiency * pattern_efficiency;
335 let predicted_bandwidth_gbps = peak_bandwidth_gbps * overall_efficiency;
336
337 let limiting_factor = if pattern_efficiency < cache_efficiency {
338 format!("{} access pattern", access_pattern.name())
339 } else {
340 format!("{} bandwidth", cache_level.name())
341 };
342
343 Self {
344 peak_bandwidth_gbps,
345 predicted_bandwidth_gbps,
346 efficiency_percent: overall_efficiency * 100.0,
347 limiting_factor,
348 }
349 }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    const KIB: usize = 1024;
    const MIB: usize = 1024 * KIB;

    /// One probe per level of the default hierarchy.
    #[test]
    fn test_cache_level_classification() {
        let config = CacheConfig::default();
        let cases = [
            (KIB, CacheLevel::L1),
            (100 * KIB, CacheLevel::L2),
            (10 * MIB, CacheLevel::L3),
            (100 * MIB, CacheLevel::Ram),
        ];

        for &(bytes, expected) in &cases {
            assert_eq!(config.classify(bytes), expected);
        }
    }

    /// 1000 four-byte elements with an access factor of 2.0 stay in L1.
    #[test]
    fn test_working_set_analysis() {
        let analysis = WorkingSetAnalysis::analyze(1000, 4, 2.0, &CacheConfig::default());

        assert_eq!(analysis.cache_level, CacheLevel::L1);
        assert!(!analysis.tiling_recommended);
    }

    /// Three square 1024 x 1024 matrices of four-byte elements.
    #[test]
    fn test_matrix_working_set() {
        let expected = 3 * 1024 * 1024 * 4;
        assert_eq!(matrix_working_set(1024, 1024, 1024, 4), expected);
    }

    /// The default config must yield a non-empty tile of at most 512 elements.
    #[test]
    fn test_optimal_tile_size() {
        let tile = optimal_matmul_tile(&CacheConfig::default(), 4);

        assert!((1..=512).contains(&tile));
    }

    /// One case per pattern: single pass, small reused set, large reused set.
    #[test]
    fn test_access_pattern() {
        let config = CacheConfig::default();
        let cases = [
            (KIB, 1, AccessPattern::Streaming),
            (KIB, 10, AccessPattern::Reuse),
            (100 * MIB, 10, AccessPattern::Random),
        ];

        for &(working_set, iterations, expected) in &cases {
            assert_eq!(
                AccessPattern::estimate(working_set, iterations, &config),
                expected
            );
        }
    }

    /// A small, reused working set should run at close to peak bandwidth.
    #[test]
    fn test_bandwidth_prediction() {
        let config = CacheConfig::default();
        let prediction =
            BandwidthPrediction::predict(100.0, KIB, AccessPattern::Reuse, &config);

        assert!(prediction.efficiency_percent > 90.0);
    }
}