scirs2_core/performance/cache_optimization.rs

//! Cache-Aware Algorithms and Optimization
//!
//! This module provides cache-aware implementations of fundamental algorithms
//! optimized for modern CPU cache hierarchies. It includes adaptive algorithms
//! that choose optimal strategies based on data size and cache characteristics.

/// Cache-aware matrix multiplication with adaptive blocking.
///
/// Computes `c += a * b` for row-major matrices, where `a` is `m x k`,
/// `b` is `k x n`, and `c` is `m x n`; `c` is expected to be zero-initialized
/// by the caller.
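///
/// # Examples
///
/// A minimal usage sketch (the module path shown is assumed from this file's
/// location and may differ in the actual crate layout):
///
/// ```ignore
/// use scirs2_core::performance::cache_optimization::matrix_multiply_cache_aware;
///
/// let a = vec![1.0, 2.0, 3.0, 4.0]; // 2x2, row-major
/// let b = vec![5.0, 6.0, 7.0, 8.0]; // 2x2, row-major
/// let mut c = vec![0.0_f64; 4];     // zero-initialized output
/// matrix_multiply_cache_aware(&a, &b, &mut c, 2, 2, 2);
/// assert_eq!(c, vec![19.0, 22.0, 43.0, 50.0]);
/// ```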
pub fn matrix_multiply_cache_aware<T>(a: &[T], b: &[T], c: &mut [T], m: usize, n: usize, k: usize)
where
    T: Copy + std::ops::Add<Output = T> + std::ops::Mul<Output = T> + Default,
{
    // Detect optimal block size based on cache hierarchy
    let block_size = detect_optimal_block_size::<T>();

    // Cache-blocked matrix multiplication
    for ii in (0..m).step_by(block_size) {
        for jj in (0..n).step_by(block_size) {
            for kk in (0..k).step_by(block_size) {
                let m_block = (ii + block_size).min(m);
                let n_block = (jj + block_size).min(n);
                let k_block = (kk + block_size).min(k);

                // Micro-kernel for the block
                for i in ii..m_block {
                    // Prefetch next cache line
                    if i + 1 < m_block {
                        crate::performance_optimization::PerformanceHints::prefetch_read(
                            &a[(i + 1) * k + kk],
                        );
                    }

                    for j in jj..n_block {
                        let mut sum = T::default();

                        // Unroll inner loop for better instruction scheduling
                        let mut l = kk;
                        while l + 4 <= k_block {
                            sum = sum + a[i * k + l] * b[l * n + j];
                            sum = sum + a[i * k + l + 1] * b[(l + 1) * n + j];
                            sum = sum + a[i * k + l + 2] * b[(l + 2) * n + j];
                            sum = sum + a[i * k + l + 3] * b[(l + 3) * n + j];
                            l += 4;
                        }

                        // Handle remaining elements
                        while l < k_block {
                            sum = sum + a[i * k + l] * b[l * n + j];
                            l += 1;
                        }

                        // Accumulate the partial result for this k-block
                        // (overwriting here would discard earlier k-blocks)
                        c[i * n + j] = c[i * n + j] + sum;
                    }
                }
            }
        }
    }
}

/// Adaptive sorting algorithm that chooses the best strategy based on data characteristics
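///
/// # Examples
///
/// A minimal usage sketch (module path assumed from this file's location):
///
/// ```ignore
/// use scirs2_core::performance::cache_optimization::adaptive_sort;
///
/// let mut data = vec![4, 2, 7, 1, 3];
/// adaptive_sort(&mut data);
/// assert_eq!(data, vec![1, 2, 3, 4, 7]);
/// ```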
pub fn adaptive_sort<T: Ord + Copy>(data: &mut [T]) {
    let len = data.len();

    if len <= 1 {
        return;
    }

    // Choose algorithm based on size and cache characteristics
    if len < 64 {
        // Use insertion sort for small arrays (cache-friendly)
        cache_aware_insertion_sort(data);
    } else if len < 2048 {
        // Use cache-optimized quicksort for medium arrays
        cache_aware_quicksort(data, 0, len - 1);
    } else {
        // Use cache-oblivious merge sort for large arrays
        cache_oblivious_merge_sort(data);
    }
}

/// Cache-aware insertion sort optimized for modern cache hierarchies
fn cache_aware_insertion_sort<T: Ord + Copy>(data: &mut [T]) {
    for i in 1..data.len() {
        let key = data[i];
        let mut j = i;

        // Prefetch next elements to improve cache utilization
        if i + 1 < data.len() {
            crate::performance_optimization::PerformanceHints::prefetch_read(&data[i + 1]);
        }

        while j > 0 && data[j - 1] > key {
            data[j] = data[j - 1];
            j -= 1;
        }
        data[j] = key;
    }
}

/// Cache-optimized quicksort with adaptive pivot selection
fn cache_aware_quicksort<T: Ord + Copy>(data: &mut [T], low: usize, high: usize) {
    if low < high {
        // Use median-of-3 for better pivot selection
        let pivot = partition_with_prefetch(data, low, high);

        if pivot > 0 {
            cache_aware_quicksort(data, low, pivot - 1);
        }
        cache_aware_quicksort(data, pivot + 1, high);
    }
}

/// Partitioning with prefetching for better cache utilization
fn partition_with_prefetch<T: Ord + Copy>(data: &mut [T], low: usize, high: usize) -> usize {
    // Median-of-3 pivot selection
    let mid = low + (high - low) / 2;
    if data[mid] < data[low] {
        data.swap(low, mid);
    }
    if data[high] < data[low] {
        data.swap(low, high);
    }
    if data[high] < data[mid] {
        data.swap(mid, high);
    }
    data.swap(mid, high);

    let pivot = data[high];
    let mut i = low;

    for j in low..high {
        // Prefetch next iteration
        if j + 8 < high {
            crate::performance_optimization::PerformanceHints::prefetch_read(&data[j + 8]);
        }

        if data[j] <= pivot {
            data.swap(i, j);
            i += 1;
        }
    }
    data.swap(i, high);
    i
}

/// Cache-oblivious merge sort for optimal cache performance on large datasets
fn cache_oblivious_merge_sort<T: Ord + Copy>(data: &mut [T]) {
    let len = data.len();
    if len <= 1 {
        return;
    }

    let mut temp = vec![data[0]; len];
    cache_oblivious_merge_sort_recursive(data, &mut temp, 0, len - 1);
}

fn cache_oblivious_merge_sort_recursive<T: Ord + Copy>(
    data: &mut [T],
    temp: &mut [T],
    left: usize,
    right: usize,
) {
    if left >= right {
        return;
    }

    let mid = left + (right - left) / 2;
    cache_oblivious_merge_sort_recursive(data, temp, left, mid);
    cache_oblivious_merge_sort_recursive(data, temp, mid + 1, right);
    cache_aware_merge(data, temp, left, mid, right);
}

/// Cache-aware merge operation with prefetching
fn cache_aware_merge<T: Ord + Copy>(
    data: &mut [T],
    temp: &mut [T],
    left: usize,
    mid: usize,
    right: usize,
) {
    // Copy to temporary array
    temp[left..(right + 1)].copy_from_slice(&data[left..(right + 1)]);

    let mut i = left;
    let mut j = mid + 1;
    let mut k = left;

    while i <= mid && j <= right {
        // Prefetch ahead in both arrays
        if i + 8 <= mid {
            crate::performance_optimization::PerformanceHints::prefetch_read(&temp[i + 8]);
        }
        if j + 8 <= right {
            crate::performance_optimization::PerformanceHints::prefetch_read(&temp[j + 8]);
        }

        if temp[i] <= temp[j] {
            data[k] = temp[i];
            i += 1;
        } else {
            data[k] = temp[j];
            j += 1;
        }
        k += 1;
    }

    // Copy remaining elements
    while i <= mid {
        data[k] = temp[i];
        i += 1;
        k += 1;
    }

    while j <= right {
        data[k] = temp[j];
        j += 1;
        k += 1;
    }
}

/// Detect optimal block size for cache-aware algorithms
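///
/// As a rough worked example of the heuristic below: for 8-byte `f64`
/// elements and the assumed 32 KiB L1 cache, the budget is about 4096
/// elements; dividing by 3 (three operand arrays) gives ~1365, whose square
/// root (~36) rounds up to a 64-element block, well under the 512 cap.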
fn detect_optimal_block_size<T>() -> usize {
    // Estimate based on L1 cache size and element size
    let l1_cache_size = 32 * 1024; // 32KB typical L1 cache
    let element_size = std::mem::size_of::<T>();
    let cache_lines = l1_cache_size / 64; // 64-byte cache lines
    let elements_per_line = 64 / element_size.max(1);

    // Use square root of cache capacity for 2D blocking
    let block_elements = (cache_lines * elements_per_line / 3) as f64; // Divide by 3 for 3 arrays
    (block_elements.sqrt() as usize)
        .next_power_of_two()
        .min(512)
}

/// Cache-aware vector reduction with optimal memory access patterns
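///
/// # Examples
///
/// A minimal usage sketch (module path assumed from this file's location):
///
/// ```ignore
/// use scirs2_core::performance::cache_optimization::cache_aware_reduce;
///
/// let data = vec![1, 2, 3, 4, 5];
/// let sum = cache_aware_reduce(&data, 0, |acc, x| acc + x);
/// assert_eq!(sum, 15);
/// ```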
pub fn cache_aware_reduce<T, F>(data: &[T], init: T, op: F) -> T
where
    T: Copy,
    F: Fn(T, T) -> T,
{
    if data.is_empty() {
        return init;
    }

    let block_size = 64; // Process in cache-line-sized blocks
    let mut result = init;

    // Process in blocks to maintain cache locality
    for chunk in data.chunks(block_size) {
        // Prefetch next chunk
        if chunk.as_ptr() as usize + std::mem::size_of_val(chunk)
            < data.as_ptr() as usize + std::mem::size_of_val(data)
        {
            let next_chunk_start = unsafe { chunk.as_ptr().add(chunk.len()) };
            crate::performance_optimization::PerformanceHints::prefetch_read(unsafe {
                &*next_chunk_start
            });
        }

        // Reduce within the chunk
        for &item in chunk {
            result = op(result, item);
        }
    }

    result
}

/// Adaptive memory copy with optimal strategy selection
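///
/// # Examples
///
/// A minimal usage sketch (module path assumed from this file's location):
///
/// ```ignore
/// use scirs2_core::performance::cache_optimization::adaptive_memcpy;
///
/// let src = vec![1, 2, 3, 4, 5];
/// let mut dst = vec![0; 5];
/// adaptive_memcpy(&src, &mut dst);
/// assert_eq!(src, dst);
/// ```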
pub fn adaptive_memcpy<T: Copy>(src: &[T], dst: &mut [T]) {
    debug_assert_eq!(src.len(), dst.len());

    let size_bytes = std::mem::size_of_val(src);

    // Choose strategy based on size
    if size_bytes <= 64 {
        // Small copy - a direct slice copy suffices
        dst.copy_from_slice(src);
    } else if size_bytes <= 4096 {
        // Medium copy - use cache-optimized copy with prefetching
        cache_optimized_copy(src, dst);
    } else {
        // Large copy - use streaming copy to avoid cache pollution
        streaming_copy(src, dst);
    }
}

/// Cache-optimized copy with prefetching
fn cache_optimized_copy<T: Copy>(src: &[T], dst: &mut [T]) {
    // One cache line worth of elements; guard against zero-sized or oversized T
    let chunk_size = (64 / std::mem::size_of::<T>().max(1)).max(1);

    for (src_chunk, dst_chunk) in src.chunks(chunk_size).zip(dst.chunks_mut(chunk_size)) {
        // Prefetch next source chunk
        if src_chunk.as_ptr() as usize + std::mem::size_of_val(src_chunk)
            < src.as_ptr() as usize + std::mem::size_of_val(src)
        {
            let next_src = unsafe { src_chunk.as_ptr().add(chunk_size) };
            crate::performance_optimization::PerformanceHints::prefetch_read(unsafe { &*next_src });
        }

        dst_chunk.copy_from_slice(src_chunk);
    }
}

/// Streaming copy for large data to avoid cache pollution
fn streaming_copy<T: Copy>(src: &[T], dst: &mut [T]) {
    // Use non-temporal stores for large copies to bypass cache
    // For now, fall back to regular copy as non-temporal intrinsics are unstable
    dst.copy_from_slice(src);
}

/// Cache-aware 2D array transpose
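///
/// # Examples
///
/// A minimal usage sketch (module path assumed from this file's location):
///
/// ```ignore
/// use scirs2_core::performance::cache_optimization::cache_aware_transpose;
///
/// let src = vec![1, 2, 3, 4]; // 2x2 row-major: [[1, 2], [3, 4]]
/// let mut dst = vec![0; 4];
/// cache_aware_transpose(&src, &mut dst, 2, 2);
/// assert_eq!(dst, vec![1, 3, 2, 4]);
/// ```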
pub fn cache_aware_transpose<T: Copy>(src: &[T], dst: &mut [T], rows: usize, cols: usize) {
    debug_assert_eq!(src.len(), rows * cols);
    debug_assert_eq!(dst.len(), rows * cols);

    let block_size = detect_optimal_block_size::<T>().min(32);

    // Block-wise transpose for better cache locality
    for i in (0..rows).step_by(block_size) {
        for j in (0..cols).step_by(block_size) {
            let max_i = (i + block_size).min(rows);
            let max_j = (j + block_size).min(cols);

            // Transpose within the block
            for ii in i..max_i {
                // Prefetch next row
                if ii + 1 < max_i {
                    crate::performance_optimization::PerformanceHints::prefetch_read(
                        &src[(ii + 1) * cols + j],
                    );
                }

                for jj in j..max_j {
                    dst[jj * rows + ii] = src[ii * cols + jj];
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cache_aware_matrix_multiply() {
        let a = vec![1.0, 2.0, 3.0, 4.0]; // 2x2 matrix
        let b = vec![5.0, 6.0, 7.0, 8.0]; // 2x2 matrix
        let mut c = vec![0.0; 4]; // 2x2 result matrix

        matrix_multiply_cache_aware(&a, &b, &mut c, 2, 2, 2);

        // Result should be [[19, 22], [43, 50]]
        assert!((c[0] - 19.0_f64).abs() < 1e-10_f64);
        assert!((c[1] - 22.0_f64).abs() < 1e-10_f64);
        assert!((c[2] - 43.0_f64).abs() < 1e-10_f64);
        assert!((c[3] - 50.0_f64).abs() < 1e-10_f64);
    }

    #[test]
    fn test_adaptive_sort() {
        let mut small_data = vec![4, 2, 7, 1, 3];
        adaptive_sort(&mut small_data);
        assert_eq!(small_data, vec![1, 2, 3, 4, 7]);

        let mut large_data = (0..1000).rev().collect::<Vec<_>>();
        adaptive_sort(&mut large_data);
        assert_eq!(large_data, (0..1000).collect::<Vec<_>>());
    }

    #[test]
    fn test_cache_aware_reduce() {
        let data = vec![1, 2, 3, 4, 5];
        let sum = cache_aware_reduce(&data, 0, |acc, x| acc + x);
        assert_eq!(sum, 15);

        let product = cache_aware_reduce(&data, 1, |acc, x| acc * x);
        assert_eq!(product, 120);
    }

    #[test]
    fn test_adaptive_memcpy() {
        let src = vec![1, 2, 3, 4, 5];
        let mut dst = vec![0; 5];

        adaptive_memcpy(&src, &mut dst);
        assert_eq!(src, dst);
    }

    #[test]
    fn test_cache_aware_transpose() {
        let src = vec![1, 2, 3, 4]; // 2x2 matrix: [[1,2],[3,4]]
        let mut dst = vec![0; 4];

        cache_aware_transpose(&src, &mut dst, 2, 2);

        // Should be transposed to [[1,3],[2,4]]
        assert_eq!(dst, vec![1, 3, 2, 4]);
    }
}