numrs2/memory_optimize/
cache_layout.rs

1//! Memory layout optimization for cache efficiency
2//!
3//! This module provides functions for reorganizing data in memory to improve
4//! cache efficiency, taking advantage of both spatial and temporal locality.
5
6#[cfg(target_arch = "x86_64")]
7use std::arch::x86_64::__cpuid;
8use std::cmp;
9use std::mem;
10use std::ptr;
11
12/// Cache information for optimal layout decisions
13#[derive(Debug, Clone)]
14struct CacheInfo {
15    line_size: usize,
16    l1_size: usize,
17    l2_size: usize,
18    l3_size: usize,
19    #[allow(dead_code)]
20    associativity: usize,
21}
22
23lazy_static::lazy_static! {
24    static ref CACHE_DATA: CacheInfo = detect_cache_info();
25}
26
27/// Strategy for optimizing memory layout
28#[derive(Debug, Copy, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
29pub enum LayoutStrategy {
30    /// Row-major order (C-style) - optimized for row-wise operations
31    RowMajor,
32    /// Column-major order (Fortran-style) - optimized for column-wise operations
33    ColumnMajor,
34    /// Morton order (Z-order curve) - good for 2D traversal
35    Morton,
36    /// Hilbert curve order - better locality than Morton
37    Hilbert,
38    /// Cache-oblivious layout - adapts to any cache size
39    CacheOblivious,
40    /// Blocked layout for optimizing matrix operations
41    Blocked(usize), // block size
42}
43
44/// Optimize the memory layout of a slice of data
45///
46/// # Arguments
47///
48/// * `data` - The data to optimize
49/// * `strategy` - The layout strategy to use
50///
51/// This function reorganizes the data in memory according to the specified strategy
52/// to improve cache efficiency. It uses in-place algorithms when possible to
53/// minimize additional memory usage.
54pub fn optimize_layout<T: Copy>(data: &mut [T], strategy: LayoutStrategy) {
55    match strategy {
56        LayoutStrategy::RowMajor => {
57            // Data is already in row-major order in most cases
58            // But we can ensure optimal alignment
59            align_for_cache_line(data);
60        }
61        LayoutStrategy::ColumnMajor => {
62            // For 1D data, no transpose needed. For actual multidimensional data,
63            // this would require shape information
64            // This optimization assumes data will be accessed column-wise
65            optimize_for_column_access(data);
66        }
67        LayoutStrategy::Morton => {
68            // Reorder data along a Z-order curve for 2D locality
69            apply_morton_order(data);
70        }
71        LayoutStrategy::Hilbert => {
72            // Reorder data along a Hilbert curve for better 2D locality than Morton
73            apply_hilbert_order(data);
74        }
75        LayoutStrategy::CacheOblivious => {
76            // Use recursive layout that works well regardless of cache size
77            apply_cache_oblivious_layout(data);
78        }
79        LayoutStrategy::Blocked(block_size) => {
80            // Reorganize data into blocks for better cache usage in matrix operations
81            apply_blocked_layout(data, block_size);
82        }
83    }
84}
85
86/// Align data to cache line boundaries for better cache efficiency
87///
88/// This function ensures that the start of the data is aligned to the cache line size
89/// of the CPU, which can significantly improve memory access performance.
90fn align_for_cache_line<T: Copy>(data: &mut [T]) {
91    // Get the cache line size (typical values are 64 or 128 bytes)
92    let cache_line_size = get_cache_line_size();
93
94    // Calculate the current alignment
95    let data_ptr = data.as_ptr() as usize;
96    let misalignment = data_ptr % cache_line_size;
97
98    if misalignment == 0 {
99        // Already aligned
100        return;
101    }
102
103    // Realign by shifting data
104    // This is a simplification; real implementation would be more sophisticated
105    // and would handle edge cases better
106    let shift = cache_line_size - misalignment;
107    if shift < std::mem::size_of_val(data) {
108        unsafe {
109            let src = data.as_ptr();
110            let dst = (data.as_mut_ptr() as *mut u8).add(shift) as *mut T;
111            ptr::copy(src, dst, data.len());
112        }
113    }
114}
115
116/// Get the CPU's cache line size
117///
118/// This function queries the CPU for its actual cache line size.
119/// If it cannot be determined, it returns a sensible default.
120fn get_cache_line_size() -> usize {
121    get_cache_info().line_size
122}
123
124/// Get comprehensive CPU cache information
125fn get_cache_info() -> &'static CacheInfo {
126    &CACHE_DATA
127}
128
129/// Detect CPU cache information using CPUID
130fn detect_cache_info() -> CacheInfo {
131    #[cfg(target_arch = "x86_64")]
132    {
133        detect_x86_cache_info()
134    }
135
136    #[cfg(not(target_arch = "x86_64"))]
137    {
138        // Default values for non-x86_64 architectures
139        CacheInfo {
140            line_size: 64,
141            l1_size: 32 * 1024,
142            l2_size: 256 * 1024,
143            l3_size: 8 * 1024 * 1024,
144            associativity: 8,
145        }
146    }
147}
148
149#[cfg(target_arch = "x86_64")]
150fn detect_x86_cache_info() -> CacheInfo {
151    #[cfg(target_arch = "x86_64")]
152    use std::arch::x86_64::__cpuid;
153
154    let mut info = CacheInfo {
155        line_size: 64,
156        l1_size: 32 * 1024,
157        l2_size: 256 * 1024,
158        l3_size: 8 * 1024 * 1024,
159        associativity: 8,
160    };
161
162    // Check if CPUID leaf 0x80000006 is available (cache info).
163    // __cpuid is a safe fn on x86_64 in current Rust.
164    let cpuid_result = __cpuid(0x80000000);
165    if cpuid_result.eax >= 0x80000006 {
166        let cache_result = __cpuid(0x80000006);
167
168        // L1 data cache info from ECX register
169        info.l1_size = ((cache_result.ecx >> 24) & 0xFF) as usize * 1024;
170        info.line_size = (cache_result.ecx & 0xFF) as usize;
171        info.associativity = ((cache_result.ecx >> 16) & 0xFF) as usize;
172
173        // L2 cache info from ECX register
174        info.l2_size = ((cache_result.ecx >> 16) & 0xFFFF) as usize * 1024;
175
176        // L3 cache info from EDX register
177        info.l3_size = ((cache_result.edx >> 18) & 0x3FFF) as usize * 512 * 1024;
178    }
179
180    // Intel-specific cache detection
181    let vendor_result = __cpuid(0);
182    if vendor_result.ebx == 0x756e6547 && // "Genu"
183       vendor_result.edx == 0x49656e69 && // "ineI"
184       vendor_result.ecx == 0x6c65746e
185    {
186        // "ntel"
187        detect_intel_cache_info(&mut info);
188    }
189
190    // AMD-specific cache detection
191    if vendor_result.ebx == 0x68747541 && // "Auth"
192       vendor_result.edx == 0x69746e65 && // "enti"
193       vendor_result.ecx == 0x444d4163
194    {
195        // "cAMD"
196        detect_amd_cache_info(&mut info);
197    }
198
199    info
200}
201
202#[cfg(target_arch = "x86_64")]
203fn detect_intel_cache_info(info: &mut CacheInfo) {
204    // Intel cache detection via CPUID leaf 4
205    unsafe {
206        let mut cache_level = 0;
207        loop {
208            let cache_info = __cpuid_count(4, cache_level);
209
210            // No more cache levels
211            if cache_info.eax & 0x1F == 0 {
212                break;
213            }
214
215            let cache_type = cache_info.eax & 0x1F;
216            let level = (cache_info.eax >> 5) & 0x7;
217            let line_size = ((cache_info.ebx & 0xFFF) + 1) as usize;
218            let partitions = (((cache_info.ebx >> 12) & 0x3FF) + 1) as usize;
219            let ways = (((cache_info.ebx >> 22) & 0x3FF) + 1) as usize;
220            let sets = (cache_info.ecx + 1) as usize;
221
222            let size = line_size * partitions * ways * sets;
223
224            // Data cache or unified cache
225            if cache_type == 1 || cache_type == 3 {
226                match level {
227                    1 => {
228                        info.l1_size = size;
229                        info.line_size = line_size;
230                        info.associativity = ways;
231                    }
232                    2 => info.l2_size = size,
233                    3 => info.l3_size = size,
234                    _ => {}
235                }
236            }
237
238            cache_level += 1;
239            if cache_level > 10 {
240                // Safety check
241                break;
242            }
243        }
244    }
245}
246
247#[cfg(target_arch = "x86_64")]
248fn detect_amd_cache_info(info: &mut CacheInfo) {
249    // AMD cache detection via CPUID leaves 0x80000005 and 0x80000006.
250    // __cpuid is a safe fn on x86_64 in current Rust.
251    // L1 cache info
252    let l1_info = __cpuid(0x80000005);
253    info.l1_size = ((l1_info.ecx >> 24) & 0xFF) as usize * 1024;
254    info.line_size = (l1_info.ecx & 0xFF) as usize;
255    info.associativity = ((l1_info.ecx >> 16) & 0xFF) as usize;
256
257    // L2/L3 cache info
258    let l23_info = __cpuid(0x80000006);
259    info.l2_size = ((l23_info.ecx >> 16) & 0xFFFF) as usize * 1024;
260    info.l3_size = ((l23_info.edx >> 18) & 0x3FFF) as usize * 512 * 1024;
261}
262
263#[cfg(target_arch = "x86_64")]
264unsafe fn __cpuid_count(leaf: u32, sub_leaf: u32) -> std::arch::x86_64::CpuidResult {
265    let mut eax = leaf;
266    let mut ecx = sub_leaf;
267    let mut edx = 0;
268
269    // Use a workaround for rbx register constraint issue
270    let ebx: u32;
271    std::arch::asm!(
272        "push rbx",      // Save rbx
273        "cpuid",         // Execute cpuid
274        "mov {0:e}, ebx", // Copy ebx to output (32-bit)
275        "pop rbx",       // Restore rbx
276        out(reg) ebx,
277        inout("eax") eax,
278        inout("ecx") ecx,
279        inout("edx") edx,
280    );
281
282    std::arch::x86_64::CpuidResult { eax, ebx, ecx, edx }
283}
284
285/// Calculate the optimal block size for the current CPU's cache
286///
287/// This function estimates the best block size for blocked algorithms based on
288/// the CPU's cache size and the data type size.
289pub fn calculate_optimal_block_size<T>() -> usize {
290    // Get the L1 data cache size
291    let l1_cache_size = get_l1_cache_size();
292    let type_size = mem::size_of::<T>();
293
294    // A simple heuristic: we want the block to fit in L1 cache
295    // Square root because we're typically dealing with 2D blocks
296    let elements_per_cache = l1_cache_size / type_size;
297    let block_size = (elements_per_cache as f64).sqrt() as usize;
298
299    // Ensure the block size is at least 1 and reasonable
300    block_size.clamp(1, 1024)
301}
302
303/// Optimize data layout for column-wise access patterns
304fn optimize_for_column_access<T: Copy>(data: &mut [T]) {
305    // For 1D data, we can prefetch data in patterns that will be accessed
306    // In a real implementation, this would reorganize multidimensional data
307    // For now, we apply cache-friendly prefetch patterns
308    prefetch_data_pattern(data, get_cache_line_size());
309}
310
311/// Apply Morton (Z-order) curve ordering to data
312fn apply_morton_order<T: Copy>(data: &mut [T]) {
313    let len = data.len();
314    if len < 4 {
315        return; // Too small to benefit from reordering
316    }
317
318    // For simplicity, assume we're working with a power-of-2 sized array
319    // that represents a 2D grid
320    let side = (len as f64).sqrt() as usize;
321    if side * side != len {
322        // Not a perfect square, fall back to blocked layout
323        apply_blocked_layout(data, calculate_optimal_block_size::<T>());
324        return;
325    }
326
327    // Create a temporary buffer for reordered data
328    let mut temp = vec![data[0]; len];
329
330    // Reorder according to Morton curve
331    for (i, temp_item) in temp.iter_mut().enumerate().take(len) {
332        let (x, y) = morton_decode(i, side);
333        if x < side && y < side {
334            let linear_index = y * side + x;
335            if linear_index < len {
336                *temp_item = data[linear_index];
337            }
338        }
339    }
340
341    // Copy back to original array
342    data.copy_from_slice(&temp);
343}
344
345/// Apply Hilbert curve ordering to data  
346fn apply_hilbert_order<T: Copy>(data: &mut [T]) {
347    let len = data.len();
348    if len < 4 {
349        return; // Too small to benefit from reordering
350    }
351
352    // For simplicity, assume we're working with a power-of-2 sized array
353    let side = (len as f64).sqrt() as usize;
354    if side * side != len || !side.is_power_of_two() {
355        // Not a perfect square power of 2, fall back to Morton order
356        apply_morton_order(data);
357        return;
358    }
359
360    // Create a temporary buffer for reordered data
361    let mut temp = vec![data[0]; len];
362
363    // Reorder according to Hilbert curve
364    for (i, temp_item) in temp.iter_mut().enumerate().take(len) {
365        let (x, y) = hilbert_decode(i, side);
366        if x < side && y < side {
367            let linear_index = y * side + x;
368            if linear_index < len {
369                *temp_item = data[linear_index];
370            }
371        }
372    }
373
374    // Copy back to original array
375    data.copy_from_slice(&temp);
376}
377
378/// Apply cache-oblivious recursive layout
379fn apply_cache_oblivious_layout<T: Copy>(data: &mut [T]) {
380    if data.len() <= get_cache_line_size() / mem::size_of::<T>() {
381        return; // Small enough to fit in cache line
382    }
383
384    // Divide and conquer approach
385    cache_oblivious_recursive(data, 0, data.len());
386}
387
388/// Recursive helper for cache-oblivious layout
389fn cache_oblivious_recursive<T: Copy>(data: &mut [T], start: usize, end: usize) {
390    let len = end - start;
391    if len <= 1 {
392        return;
393    }
394
395    let cache_size = get_cache_info().l1_size / mem::size_of::<T>();
396    if len <= cache_size {
397        return; // Fits in cache
398    }
399
400    // Split in half and recursively optimize
401    let mid = start + len / 2;
402    cache_oblivious_recursive(data, start, mid);
403    cache_oblivious_recursive(data, mid, end);
404
405    // Interleave the two halves for better locality
406    interleave_data(&mut data[start..end]);
407}
408
409/// Apply blocked layout for matrix operations
410fn apply_blocked_layout<T: Copy>(data: &mut [T], block_size: usize) {
411    let len = data.len();
412    if len < block_size * block_size {
413        return; // Too small to benefit from blocking
414    }
415
416    // Assume square matrix layout
417    let side = (len as f64).sqrt() as usize;
418    if side * side != len {
419        return; // Not a square matrix
420    }
421
422    // Create temporary buffer for blocked data
423    let mut temp = vec![data[0]; len];
424    let mut temp_idx = 0;
425
426    // Copy data in blocks
427    for block_row in (0..side).step_by(block_size) {
428        for block_col in (0..side).step_by(block_size) {
429            let max_row = cmp::min(block_row + block_size, side);
430            let max_col = cmp::min(block_col + block_size, side);
431
432            for row in block_row..max_row {
433                for col in block_col..max_col {
434                    let linear_idx = row * side + col;
435                    if linear_idx < len && temp_idx < len {
436                        temp[temp_idx] = data[linear_idx];
437                        temp_idx += 1;
438                    }
439                }
440            }
441        }
442    }
443
444    // Copy back to original array
445    data.copy_from_slice(&temp);
446}
447
448/// Prefetch data in a cache-friendly pattern
449fn prefetch_data_pattern<T: Copy>(data: &mut [T], cache_line_size: usize) {
450    let elements_per_line = cache_line_size / mem::size_of::<T>();
451
452    // Touch every cache line to ensure it's loaded
453    for i in (0..data.len()).step_by(elements_per_line) {
454        // Prefetch hint for the next cache line
455        if i + elements_per_line < data.len() {
456            #[cfg(target_arch = "x86_64")]
457            unsafe {
458                {
459                    let ptr = data.as_ptr().add(i + elements_per_line);
460                    std::arch::x86_64::_mm_prefetch(
461                        ptr as *const i8,
462                        std::arch::x86_64::_MM_HINT_T0,
463                    );
464                }
465            }
466        }
467    }
468}
469
470/// Decode Morton index to 2D coordinates
471fn morton_decode(morton: usize, side: usize) -> (usize, usize) {
472    let mut x = 0;
473    let mut y = 0;
474    let mut bit = 0;
475    let mut m = morton;
476
477    while m > 0 && bit < 32 {
478        if (m & 1) != 0 {
479            x |= 1 << (bit / 2);
480        }
481        m >>= 1;
482
483        if (m & 1) != 0 {
484            y |= 1 << (bit / 2);
485        }
486        m >>= 1;
487
488        bit += 2;
489    }
490
491    (x % side, y % side)
492}
493
494/// Decode Hilbert index to 2D coordinates
495fn hilbert_decode(h: usize, n: usize) -> (usize, usize) {
496    let mut t = h;
497    let mut x = 0;
498    let mut y = 0;
499    let mut s = 1;
500
501    while s < n {
502        let rx = 1 & (t / 2);
503        let ry = 1 & (t ^ rx);
504
505        if ry == 0 {
506            if rx == 1 {
507                x = s - 1 - x;
508                y = s - 1 - y;
509            }
510
511            // Swap x and y
512            std::mem::swap(&mut x, &mut y);
513        }
514
515        x += s * rx;
516        y += s * ry;
517        t /= 4;
518        s *= 2;
519    }
520
521    (x % n, y % n)
522}
523
524/// Interleave two halves of data for better cache locality
525fn interleave_data<T: Copy>(data: &mut [T]) {
526    let len = data.len();
527    if len < 2 {
528        return;
529    }
530
531    let mid = len / 2;
532    let mut temp = vec![data[0]; len];
533
534    // Interleave first and second half
535    for i in 0..mid {
536        temp[2 * i] = data[i];
537        if 2 * i + 1 < len && i + mid < len {
538            temp[2 * i + 1] = data[i + mid];
539        }
540    }
541
542    // Handle odd lengths
543    if len % 2 == 1 {
544        temp[len - 1] = data[len - 1];
545    }
546
547    data.copy_from_slice(&temp);
548}
549
550/// Get the CPU's L1 data cache size
551fn get_l1_cache_size() -> usize {
552    get_cache_info().l1_size
553}
554
555/// Get the CPU's L2 cache size
556#[allow(dead_code)]
557fn get_l2_cache_size() -> usize {
558    get_cache_info().l2_size
559}
560
561/// Get the CPU's L3 cache size
562#[allow(dead_code)]
563fn get_l3_cache_size() -> usize {
564    get_cache_info().l3_size
565}
566
567#[cfg(test)]
568mod tests {
569    use super::*;
570
571    /// Regression test for issue #11: verify that detect_cache_info() compiles and runs
572    /// without requiring an `unsafe` block at the call site on x86_64 targets.
573    ///
574    /// Before the fix, `detect_x86_cache_info()` and `detect_amd_cache_info()` called
575    /// `std::arch::x86_64::__cpuid` (an unsafe function) without an enclosing `unsafe {}`
576    /// block, triggering E0133 on x86_64 builds.
577    #[test]
578    fn test_issue_11_cpuid_safe() {
579        // Calling detect_cache_info() exercises the x86_64 CPUID paths when compiled
580        // for that target. On any other architecture the fallback defaults are returned.
581        // If the bug were present this crate would fail to compile on x86_64 and this
582        // test could never execute.
583        let info = detect_cache_info();
584        // Sanity-check: cache sizes must be non-zero (defaults guarantee this).
585        assert!(info.line_size > 0, "cache line size should be non-zero");
586        assert!(info.l1_size > 0, "L1 cache size should be non-zero");
587        assert!(info.l2_size > 0, "L2 cache size should be non-zero");
588        assert!(info.l3_size > 0, "L3 cache size should be non-zero");
589    }
590}
numrs2/memory_optimize/cache_layout.rs

numrs2/memory_optimize/
cache_layout.rs