/// Blocked (cache-tiled) matrix multiply for row-major matrices:
/// accumulates `a * b` into `c`, where `a` is `m x k`, `b` is `k x n`, and
/// `c` is `m x n`. The caller provides `c` initialized (typically to
/// `T::default()`).
pub fn matrix_multiply_cache_aware<T>(a: &[T], b: &[T], c: &mut [T], m: usize, n: usize, k: usize)
where
    T: Copy + std::ops::Add<Output = T> + std::ops::Mul<Output = T> + Default,
{
    let block_size = detect_optimal_block_size::<T>();

    // Tile over all three dimensions so each block's working set stays cache resident.
    for ii in (0..m).step_by(block_size) {
        for jj in (0..n).step_by(block_size) {
            for kk in (0..k).step_by(block_size) {
                let m_block = (ii + block_size).min(m);
                let n_block = (jj + block_size).min(n);
                let k_block = (kk + block_size).min(k);

                for i in ii..m_block {
                    if i + 1 < m_block {
                        crate::performance_optimization::PerformanceHints::prefetch_read(
                            &a[(i + 1) * k + kk],
                        );
                    }

                    for j in jj..n_block {
                        let mut sum = T::default();

                        // Unroll the inner product by four over the shared dimension.
                        let mut l = kk;
                        while l + 4 <= k_block {
                            sum = sum + a[i * k + l] * b[l * n + j];
                            sum = sum + a[i * k + l + 1] * b[(l + 1) * n + j];
                            sum = sum + a[i * k + l + 2] * b[(l + 2) * n + j];
                            sum = sum + a[i * k + l + 3] * b[(l + 3) * n + j];
                            l += 4;
                        }

                        while l < k_block {
                            sum = sum + a[i * k + l] * b[l * n + j];
                            l += 1;
                        }

                        // Accumulate rather than assign, so partial sums from
                        // earlier k-blocks are not overwritten.
                        c[i * n + j] = c[i * n + j] + sum;
                    }
                }
            }
        }
    }
}

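/// Sorts `data` in place, choosing a strategy by length: insertion sort
/// below 64 elements, quicksort below 2048 elements, and merge sort
/// otherwise.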
pub fn adaptive_sort<T: Ord + Copy>(data: &mut [T]) {
    let len = data.len();

    if len <= 1 {
        return;
    }

    if len < 64 {
        cache_aware_insertion_sort(data);
    } else if len < 2048 {
        cache_aware_quicksort(data, 0, len - 1);
    } else {
        cache_oblivious_merge_sort(data);
    }
}

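/// Insertion sort for small slices, with a prefetch hint on the element
/// that will be inserted next.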
fn cache_aware_insertion_sort<T: Ord + Copy>(data: &mut [T]) {
    for i in 1..data.len() {
        let key = data[i];
        let mut j = i;

        if i + 1 < data.len() {
            crate::performance_optimization::PerformanceHints::prefetch_read(&data[i + 1]);
        }

        while j > 0 && data[j - 1] > key {
            data[j] = data[j - 1];
            j -= 1;
        }
        data[j] = key;
    }
}

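/// Recursive quicksort over the inclusive range `data[low..=high]`.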
fn cache_aware_quicksort<T: Ord + Copy>(data: &mut [T], low: usize, high: usize) {
    if low < high {
        let pivot = partition_with_prefetch(data, low, high);

        if pivot > 0 {
            cache_aware_quicksort(data, low, pivot - 1);
        }
        cache_aware_quicksort(data, pivot + 1, high);
    }
}

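/// Lomuto-style partition with median-of-three pivot selection; prefetches
/// a few elements ahead of the scan and returns the pivot's final index.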
fn partition_with_prefetch<T: Ord + Copy>(data: &mut [T], low: usize, high: usize) -> usize {
    let mid = low + (high - low) / 2;
    if data[mid] < data[low] {
        data.swap(low, mid);
    }
    if data[high] < data[low] {
        data.swap(low, high);
    }
    if data[high] < data[mid] {
        data.swap(mid, high);
    }
    data.swap(mid, high);

    let pivot = data[high];
    let mut i = low;

    for j in low..high {
        if j + 8 < high {
            crate::performance_optimization::PerformanceHints::prefetch_read(&data[j + 8]);
        }

        if data[j] <= pivot {
            data.swap(i, j);
            i += 1;
        }
    }
    data.swap(i, high);
    i
}

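/// Merge sort entry point: allocates a single scratch buffer and delegates
/// to the recursive splitter.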
fn cache_oblivious_merge_sort<T: Ord + Copy>(data: &mut [T]) {
    let len = data.len();
    if len <= 1 {
        return;
    }

    let mut temp = vec![data[0]; len];
    cache_oblivious_merge_sort_recursive(data, &mut temp, 0, len - 1);
}

fn cache_oblivious_merge_sort_recursive<T: Ord + Copy>(
    data: &mut [T],
    temp: &mut [T],
    left: usize,
    right: usize,
) {
    if left >= right {
        return;
    }

    let mid = left + (right - left) / 2;
    cache_oblivious_merge_sort_recursive(data, temp, left, mid);
    cache_oblivious_merge_sort_recursive(data, temp, mid + 1, right);
    cache_aware_merge(data, temp, left, mid, right);
}

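/// Merges the sorted runs `data[left..=mid]` and `data[mid + 1..=right]`
/// through the scratch buffer, prefetching ahead in both runs.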
fn cache_aware_merge<T: Ord + Copy>(
    data: &mut [T],
    temp: &mut [T],
    left: usize,
    mid: usize,
    right: usize,
) {
    temp[left..=right].copy_from_slice(&data[left..=right]);

    let mut i = left;
    let mut j = mid + 1;
    let mut k = left;

    while i <= mid && j <= right {
        if i + 8 <= mid {
            crate::performance_optimization::PerformanceHints::prefetch_read(&temp[i + 8]);
        }
        if j + 8 <= right {
            crate::performance_optimization::PerformanceHints::prefetch_read(&temp[j + 8]);
        }

        if temp[i] <= temp[j] {
            data[k] = temp[i];
            i += 1;
        } else {
            data[k] = temp[j];
            j += 1;
        }
        k += 1;
    }

    while i <= mid {
        data[k] = temp[i];
        i += 1;
        k += 1;
    }

    while j <= right {
        data[k] = temp[j];
        j += 1;
        k += 1;
    }
}

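/// Heuristic tile size in elements, assuming a 32 KiB L1 data cache with
/// 64-byte lines; the result is a power of two capped at 512.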
fn detect_optimal_block_size<T>() -> usize {
    let l1_cache_size = 32 * 1024; // assume a 32 KiB L1 data cache
    let element_size = std::mem::size_of::<T>();
    let cache_lines = l1_cache_size / 64; // 64-byte cache lines
    let elements_per_line = 64 / element_size.max(1);

    // Budget roughly a third of L1 per matrix block, then take the square
    // root for the side length of a square tile, rounded up to a power of
    // two and capped at 512 elements.
    let block_elements = (cache_lines * elements_per_line / 3) as f64;
    (block_elements.sqrt() as usize)
        .next_power_of_two()
        .min(512)
}

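/// Folds `data` with `op`, walking it in 64-element chunks and issuing a
/// prefetch for the next chunk before reducing the current one.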
pub fn cache_aware_reduce<T, F>(data: &[T], init: T, op: F) -> T
where
    T: Copy,
    F: Fn(T, T) -> T,
{
    if data.is_empty() {
        return init;
    }

    let block_size = 64;
    let mut result = init;

    for chunk in data.chunks(block_size) {
        // If this is not the last chunk, prefetch the first element of the
        // next chunk; the bounds check keeps the pointer inside `data`.
        if chunk.as_ptr() as usize + std::mem::size_of_val(chunk)
            < data.as_ptr() as usize + std::mem::size_of_val(data)
        {
            let next_chunk_start = unsafe { chunk.as_ptr().add(chunk.len()) };
            crate::performance_optimization::PerformanceHints::prefetch_read(unsafe {
                &*next_chunk_start
            });
        }

        for &item in chunk {
            result = op(result, item);
        }
    }

    result
}

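/// Copies `src` into `dst`, choosing a strategy by total size: a plain copy
/// up to 64 bytes, a chunked copy with prefetching up to 4 KiB, and a
/// streaming copy beyond that.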
pub fn adaptive_memcpy<T: Copy>(src: &[T], dst: &mut [T]) {
    debug_assert_eq!(src.len(), dst.len());

    let size_bytes = std::mem::size_of_val(src);

    if size_bytes <= 64 {
        // Fits in a single cache line; copy directly.
        dst.copy_from_slice(src);
    } else if size_bytes <= 4096 {
        cache_optimized_copy(src, dst);
    } else {
        streaming_copy(src, dst);
    }
}

fn cache_optimized_copy<T: Copy>(src: &[T], dst: &mut [T]) {
    // Copy one cache line's worth of elements at a time; guard against
    // zero-sized or oversized element types so `chunks` never sees 0.
    let chunk_size = (64 / std::mem::size_of::<T>().max(1)).max(1);

    for (src_chunk, dst_chunk) in src.chunks(chunk_size).zip(dst.chunks_mut(chunk_size)) {
        // Only non-final chunks pass this check, so the prefetch pointer
        // stays inside `src`.
        if src_chunk.as_ptr() as usize + std::mem::size_of_val(src_chunk)
            < src.as_ptr() as usize + std::mem::size_of_val(src)
        {
            let next_src = unsafe { src_chunk.as_ptr().add(src_chunk.len()) };
            crate::performance_optimization::PerformanceHints::prefetch_read(unsafe { &*next_src });
        }

        dst_chunk.copy_from_slice(src_chunk);
    }
}

fn streaming_copy<T: Copy>(src: &[T], dst: &mut [T]) {
    // A true streaming copy would use non-temporal stores; this currently
    // falls back to a plain slice copy.
    dst.copy_from_slice(src);
}

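/// Blocked transpose of the row-major `rows x cols` matrix in `src` into
/// the `cols x rows` matrix `dst`.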
pub fn cache_aware_transpose<T: Copy>(src: &[T], dst: &mut [T], rows: usize, cols: usize) {
    debug_assert_eq!(src.len(), rows * cols);
    debug_assert_eq!(dst.len(), rows * cols);

    let block_size = detect_optimal_block_size::<T>().min(32);

    for i in (0..rows).step_by(block_size) {
        for j in (0..cols).step_by(block_size) {
            let max_i = (i + block_size).min(rows);
            let max_j = (j + block_size).min(cols);

            for ii in i..max_i {
                if ii + 1 < max_i {
                    crate::performance_optimization::PerformanceHints::prefetch_read(
                        &src[(ii + 1) * cols + j],
                    );
                }

                for jj in j..max_j {
                    dst[jj * rows + ii] = src[ii * cols + jj];
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_cache_aware_matrix_multiply() {
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![5.0, 6.0, 7.0, 8.0];
        let mut c = vec![0.0; 4];
        matrix_multiply_cache_aware(&a, &b, &mut c, 2, 2, 2);

        assert!((c[0] - 19.0_f64).abs() < 1e-10_f64);
        assert!((c[1] - 22.0_f64).abs() < 1e-10_f64);
        assert!((c[2] - 43.0_f64).abs() < 1e-10_f64);
        assert!((c[3] - 50.0_f64).abs() < 1e-10_f64);
    }

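    #[test]
    fn test_matrix_multiply_accumulates_across_k_blocks() {
        // k = 600 exceeds the maximum detected block size (512), so the k
        // loop is split into several tiles; with all-ones inputs every
        // output entry must equal k if the partial sums accumulate correctly.
        let k = 600;
        let a = vec![1.0_f64; 2 * k];
        let b = vec![1.0_f64; k * 2];
        let mut c = vec![0.0_f64; 4];
        matrix_multiply_cache_aware(&a, &b, &mut c, 2, 2, k);

        for value in c {
            assert!((value - k as f64).abs() < 1e-9);
        }
    }
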
    #[test]
    fn test_adaptive_sort() {
        let mut small_data = vec![4, 2, 7, 1, 3];
        adaptive_sort(&mut small_data);
        assert_eq!(small_data, vec![1, 2, 3, 4, 7]);

        let mut large_data = (0..1000).rev().collect::<Vec<_>>();
        adaptive_sort(&mut large_data);
        assert_eq!(large_data, (0..1000).collect::<Vec<_>>());
    }

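    #[test]
    fn test_adaptive_sort_with_duplicates() {
        // 300 elements lands in the quicksort range (64..2048), and the
        // repeated keys exercise the partition's handling of equal elements.
        let mut data: Vec<i32> = (0..300).map(|i| i % 10).collect();
        let mut expected = data.clone();
        expected.sort();

        adaptive_sort(&mut data);
        assert_eq!(data, expected);
    }
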
    #[test]
    fn test_cache_aware_reduce() {
        let data = vec![1, 2, 3, 4, 5];
        let sum = cache_aware_reduce(&data, 0, |acc, x| acc + x);
        assert_eq!(sum, 15);

        let product = cache_aware_reduce(&data, 1, |acc, x| acc * x);
        assert_eq!(product, 120);
    }

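    #[test]
    fn test_cache_aware_reduce_empty() {
        // An empty input returns the initial accumulator unchanged.
        let data: Vec<i32> = Vec::new();
        let sum = cache_aware_reduce(&data, 42, |acc, x| acc + x);
        assert_eq!(sum, 42);
    }
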
    #[test]
    fn test_adaptive_memcpy() {
        let src = vec![1, 2, 3, 4, 5];
        let mut dst = vec![0; 5];

        adaptive_memcpy(&src, &mut dst);
        assert_eq!(src, dst);
    }

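    #[test]
    fn test_adaptive_memcpy_large() {
        // 4096 i32 values are 16 KiB, well past the 4 KiB threshold, so this
        // exercises the streaming_copy path.
        let src: Vec<i32> = (0..4096).collect();
        let mut dst = vec![0; src.len()];

        adaptive_memcpy(&src, &mut dst);
        assert_eq!(src, dst);
    }
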
    #[test]
    fn test_cache_aware_transpose() {
        let src = vec![1, 2, 3, 4];
        let mut dst = vec![0; 4];

        cache_aware_transpose(&src, &mut dst, 2, 2);

        assert_eq!(dst, vec![1, 3, 2, 4]);
    }
}