use crate::common::IntegrateFloat;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
use std::collections::VecDeque;

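/// A size-keyed pool of reusable `Vec<F>` buffers with a soft cap on the
/// total memory it tracks. Buffers longer than a quarter of the cap are
/// never pooled.
///
/// A minimal usage sketch (marked `ignore` because it assumes a crate
/// context in which `IntegrateFloat` is implemented for `f64`):
///
/// ```ignore
/// let mut pool = MemoryPool::<f64>::new(1 << 20); // 1 MiB soft cap
/// let mut buf = pool.allocate(256);
/// buf.as_mut_slice()[0] = 1.0;
/// pool.deallocate(buf.into_vec()); // hand the storage back for reuse
/// ```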
#[derive(Debug)]
pub struct MemoryPool<F: IntegrateFloat> {
    /// Free buffers, keyed by their length in elements.
    buffers: std::collections::HashMap<usize, VecDeque<Vec<F>>>,
    /// Largest buffer length (in elements) eligible for pooling.
    max_buffer_size: usize,
    /// Bytes of live allocations currently tracked by the pool.
    total_allocated: usize,
    /// Soft cap, in bytes, on tracked allocations.
    max_total_memory: usize,
}

impl<F: IntegrateFloat> MemoryPool<F> {
    /// Create a pool with the given soft memory cap. Buffers longer than a
    /// quarter of the cap bypass pooling entirely.
    pub fn new(max_total_memory: usize) -> Self {
        Self {
            buffers: std::collections::HashMap::new(),
            max_buffer_size: max_total_memory / 4,
            total_allocated: 0,
            max_total_memory,
        }
    }

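    /// Hand out a zeroed buffer of `size` elements, reusing a pooled buffer
    /// of exactly that length when one is available. Requests beyond the
    /// soft cap are still served, just without accounting.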
    pub fn allocate(&mut self, size: usize) -> PooledBuffer<F> {
        // Fast path: recycle a pooled buffer of exactly this length.
        if size <= self.max_buffer_size {
            if let Some(queue) = self.buffers.get_mut(&size) {
                if let Some(mut buffer) = queue.pop_front() {
                    buffer.clear();
                    buffer.resize(size, F::zero());
                    return PooledBuffer::new(buffer, Some(size));
                }
            }
        }

        // Slow path: allocate fresh and track it against the cap when it fits.
        if self.total_allocated + size * std::mem::size_of::<F>() <= self.max_total_memory {
            let buffer = vec![F::zero(); size];
            self.total_allocated += size * std::mem::size_of::<F>();

            if size <= self.max_buffer_size {
                PooledBuffer::new(buffer, Some(size))
            } else {
                PooledBuffer::new(buffer, None)
            }
        } else {
            // Over the cap: still satisfy the request without tracking it.
            PooledBuffer::new(vec![F::zero(); size], None)
        }
    }

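    /// Return a buffer to the pool for reuse, or release its accounting if
    /// it is too large to pool (or the pool already holds 100 distinct size
    /// classes).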
    pub fn deallocate(&mut self, buffer: Vec<F>) {
        let size = buffer.len();
        if size <= self.max_buffer_size && self.buffers.len() < 100 {
            self.buffers.entry(size).or_default().push_back(buffer);
        } else {
            self.total_allocated = self
                .total_allocated
                .saturating_sub(size * std::mem::size_of::<F>());
        }
    }

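    /// Snapshot the pool's current accounting.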
    pub fn memory_usage(&self) -> MemoryUsage {
        let pooled_buffers: usize = self.buffers.values().map(|q| q.len()).sum();
        let pooled_memory: usize = self
            .buffers
            .iter()
            .map(|(size, queue)| size * queue.len() * std::mem::size_of::<F>())
            .sum();

        MemoryUsage {
            total_allocated: self.total_allocated,
            pooled_memory,
            pooled_buffers,
            max_total_memory: self.max_total_memory,
        }
    }

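    /// Drop all pooled buffers and reset the accounting to zero.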
    pub fn clear(&mut self) {
        self.buffers.clear();
        self.total_allocated = 0;
    }
}

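/// An owned buffer handed out by [`MemoryPool`], remembering which size
/// class (if any) it was drawn from.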
pub struct PooledBuffer<F: IntegrateFloat> {
    buffer: Option<Vec<F>>,
    /// Size class this buffer belongs to, if it is poolable.
    #[allow(dead_code)]
    pool_size: Option<usize>,
}

impl<F: IntegrateFloat> PooledBuffer<F> {
    fn new(buffer: Vec<F>, pool_size: Option<usize>) -> Self {
        Self {
            buffer: Some(buffer),
            pool_size,
        }
    }

    pub fn as_slice(&self) -> &[F] {
        self.buffer.as_ref().unwrap()
    }

    pub fn as_mut_slice(&mut self) -> &mut [F] {
        self.buffer.as_mut().unwrap()
    }

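    /// Consume the handle and take ownership of the underlying vector, e.g.
    /// to return it via [`MemoryPool::deallocate`].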
    pub fn into_vec(mut self) -> Vec<F> {
        self.buffer.take().unwrap()
    }

    pub fn len(&self) -> usize {
        self.buffer.as_ref().unwrap().len()
    }

    pub fn is_empty(&self) -> bool {
        self.buffer.as_ref().unwrap().is_empty()
    }
}

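/// A snapshot of [`MemoryPool`] accounting, as returned by
/// [`MemoryPool::memory_usage`].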
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    /// Bytes of live allocations tracked by the pool.
    pub total_allocated: usize,
    /// Bytes currently held in pooled (free) buffers.
    pub pooled_memory: usize,
    /// Number of pooled (free) buffers.
    pub pooled_buffers: usize,
    /// The pool's soft memory cap, in bytes.
    pub max_total_memory: usize,
}

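/// A dense matrix whose element order is chosen for cache behavior rather
/// than fixed row-major storage.
///
/// A minimal sketch of the blocked layout (marked `ignore` because it
/// assumes a crate context in which `IntegrateFloat` is implemented for
/// `f64`):
///
/// ```ignore
/// let layout = MatrixLayout::Blocked { block_size: 16 };
/// let mut a = CacheFriendlyMatrix::<f64>::new(100, 100, layout);
/// a.set(3, 7, 1.5);
/// assert_eq!(a.get(3, 7), 1.5);
/// ```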
#[derive(Debug, Clone)]
pub struct CacheFriendlyMatrix<F: IntegrateFloat> {
    data: Vec<F>,
    rows: usize,
    cols: usize,
    layout: MatrixLayout,
}

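/// Storage order for [`CacheFriendlyMatrix`].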
#[derive(Debug, Clone, Copy)]
pub enum MatrixLayout {
    /// Rows stored contiguously (C order).
    RowMajor,
    /// Columns stored contiguously (Fortran order).
    ColumnMajor,
    /// Square `block_size` x `block_size` tiles stored contiguously, with
    /// the tiles themselves ordered row-major.
    Blocked { block_size: usize },
}

impl<F: IntegrateFloat> CacheFriendlyMatrix<F> {
    pub fn new(rows: usize, cols: usize, layout: MatrixLayout) -> Self {
        // Blocked layouts pad each dimension up to a whole number of blocks,
        // so the backing storage must cover the padded extent, not just
        // `rows * cols`.
        let len = match layout {
            MatrixLayout::Blocked { block_size } => {
                rows.div_ceil(block_size) * cols.div_ceil(block_size) * block_size * block_size
            }
            _ => rows * cols,
        };
        Self {
            data: vec![F::zero(); len],
            rows,
            cols,
            layout,
        }
    }

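    /// Copy an `ndarray` matrix into a new row-major `CacheFriendlyMatrix`.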
    pub fn from_array(array: &Array2<F>) -> Self {
        let (rows, cols) = array.dim();
        let mut matrix = Self::new(rows, cols, MatrixLayout::RowMajor);

        for i in 0..rows {
            for j in 0..cols {
                matrix.set(i, j, array[[i, j]]);
            }
        }

        matrix
    }

    pub fn get(&self, row: usize, col: usize) -> F {
        let index = self.compute_index(row, col);
        self.data[index]
    }

    pub fn set(&mut self, row: usize, col: usize, value: F) {
        let index = self.compute_index(row, col);
        self.data[index] = value;
    }

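    /// Map `(row, col)` to a flat index under the active layout. In the
    /// blocked layout, tiles are ordered row-major and each tile is itself
    /// row-major; e.g. with `block_size = 2` in a 4x4 matrix, element
    /// (2, 1) sits in tile (1, 0) at in-tile offset (0, 1), giving flat
    /// index 2 * 4 + 1 = 9.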
    fn compute_index(&self, row: usize, col: usize) -> usize {
        match self.layout {
            MatrixLayout::RowMajor => row * self.cols + col,
            MatrixLayout::ColumnMajor => col * self.rows + row,
            MatrixLayout::Blocked { block_size } => {
                let block_row = row / block_size;
                let block_col = col / block_size;
                let in_block_row = row % block_size;
                let in_block_col = col % block_size;

                let blocks_per_row = self.cols.div_ceil(block_size);
                let block_index = block_row * blocks_per_row + block_col;
                let in_block_index = in_block_row * block_size + in_block_col;

                block_index * block_size * block_size + in_block_index
            }
        }
    }

    pub fn dim(&self) -> (usize, usize) {
        (self.rows, self.cols)
    }

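    /// Compute `y = A * x`, traversing elements in the order that matches
    /// the storage layout: row-wise dot products for row-major, column-wise
    /// AXPY updates for column-major, and tile-by-tile accumulation for the
    /// blocked layout.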
    pub fn matvec(&self, x: ArrayView1<F>) -> Array1<F> {
        let mut y = Array1::zeros(self.rows);

        match self.layout {
            MatrixLayout::RowMajor => {
                for i in 0..self.rows {
                    let mut sum = F::zero();
                    for j in 0..self.cols {
                        sum += self.get(i, j) * x[j];
                    }
                    y[i] = sum;
                }
            }
            MatrixLayout::ColumnMajor => {
                for j in 0..self.cols {
                    let x_j = x[j];
                    for i in 0..self.rows {
                        y[i] += self.get(i, j) * x_j;
                    }
                }
            }
            MatrixLayout::Blocked { block_size } => {
                self.blocked_matvec(x.view(), y.view_mut(), block_size);
            }
        }

        y
    }

    fn blocked_matvec(&self, x: ArrayView1<F>, mut y: ArrayViewMut1<F>, block_size: usize) {
        let rows_blocks = self.rows.div_ceil(block_size);
        let cols_blocks = self.cols.div_ceil(block_size);

        for i_block in 0..rows_blocks {
            for j_block in 0..cols_blocks {
                let i_start = i_block * block_size;
                let i_end = (i_start + block_size).min(self.rows);
                let j_start = j_block * block_size;
                let j_end = (j_start + block_size).min(self.cols);

                for i in i_start..i_end {
                    let mut sum = F::zero();
                    for j in j_start..j_end {
                        sum += self.get(i, j) * x[j];
                    }
                    y[i] += sum;
                }
            }
        }
    }
}

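/// Loop-tiling block sizes for each cache level, derived from a single L1
/// block size (L2 = 4 x L1, L3 = 16 x L1).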
#[derive(Debug, Clone)]
pub struct BlockingStrategy {
    /// Tile size targeting the L1 cache.
    pub l1_block_size: usize,
    /// Tile size targeting the L2 cache.
    pub l2_block_size: usize,
    /// Tile size targeting the L3 cache.
    pub l3_block_size: usize,
}

impl BlockingStrategy {
    pub fn new(l1_block_size: usize) -> Self {
        Self {
            l1_block_size,
            l2_block_size: l1_block_size * 4,
            l3_block_size: l1_block_size * 16,
        }
    }

    /// Pick the tile size for `cache_level`, clamped so it never exceeds
    /// the matrix dimension.
    pub fn optimal_block_size(&self, matrix_size: usize, cache_level: CacheLevel) -> usize {
        let block_size = match cache_level {
            CacheLevel::L1 => self.l1_block_size,
            CacheLevel::L2 => self.l2_block_size,
            CacheLevel::L3 => self.l3_block_size,
        };

        block_size.min(matrix_size)
    }
}

/// Cache hierarchy level used to select a tile size.
#[derive(Debug, Clone, Copy)]
pub enum CacheLevel {
    L1,
    L2,
    L3,
}

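/// Blocked (tiled) variants of common kernels, arranged so the working set
/// of each inner loop stays cache-resident.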
pub struct CacheAwareAlgorithms;

impl CacheAwareAlgorithms {
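    /// Blocked transpose: copy `input` into `output` (which must be
    /// `cols x rows`) one `block_size` x `block_size` tile at a time, so
    /// both the reads and the writes stay within a tile.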
    pub fn transpose<F: IntegrateFloat>(
        input: ArrayView2<F>,
        mut output: ArrayViewMut2<F>,
        block_size: usize,
    ) {
        let (rows, cols) = input.dim();
        debug_assert_eq!(output.dim(), (cols, rows));

        for i_block in (0..rows).step_by(block_size) {
            for j_block in (0..cols).step_by(block_size) {
                let i_end = (i_block + block_size).min(rows);
                let j_end = (j_block + block_size).min(cols);

                for i in i_block..i_end {
                    for j in j_block..j_end {
                        output[[j, i]] = input[[i, j]];
                    }
                }
            }
        }
    }

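    /// Element-wise `c = a + b`, processed in chunks of `block_size` so a
    /// chunk of all three vectors fits in cache together.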
    pub fn vector_add_blocked<F: IntegrateFloat>(
        a: ArrayView1<F>,
        b: ArrayView1<F>,
        mut c: ArrayViewMut1<F>,
        block_size: usize,
    ) {
        let n = a.len();

        for start in (0..n).step_by(block_size) {
            let end = (start + block_size).min(n);

            for i in start..end {
                c[i] = a[i] + b[i];
            }
        }
    }

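    /// Sum `data` block by block, keeping one partial sum per block before
    /// combining them; besides the cache benefit, this can also grow
    /// floating-point error more slowly than a single running sum.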
    pub fn reduction_blocked<F: IntegrateFloat>(data: ArrayView1<F>, block_size: usize) -> F {
        let n = data.len();
        let mut partial_sums = Vec::new();

        for start in (0..n).step_by(block_size) {
            let end = (start + block_size).min(n);
            let mut sum = F::zero();

            for i in start..end {
                sum += data[i];
            }

            partial_sums.push(sum);
        }

        partial_sums.into_iter().fold(F::zero(), |acc, x| acc + x)
    }
}

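/// Conversions between array-of-structs (AoS) and struct-of-arrays (SoA)
/// layouts, plus a simple spatial reordering pass.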
pub struct DataLayoutOptimizer;

impl DataLayoutOptimizer {
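    /// Split `(x, y, z)` tuples into three parallel coordinate vectors so
    /// loops over a single coordinate touch contiguous memory.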
    pub fn aos_to_soa<F: IntegrateFloat>(data: &[(F, F, F)]) -> (Vec<F>, Vec<F>, Vec<F>) {
        let mut x_values = Vec::with_capacity(data.len());
        let mut y_values = Vec::with_capacity(data.len());
        let mut z_values = Vec::with_capacity(data.len());

        for &(x, y, z) in data {
            x_values.push(x);
            y_values.push(y);
            z_values.push(z);
        }

        (x_values, y_values, z_values)
    }

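    /// Inverse of [`aos_to_soa`](Self::aos_to_soa): zip three equal-length
    /// coordinate vectors back into `(x, y, z)` tuples.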
    pub fn soa_to_aos<F: IntegrateFloat>(
        x_values: &[F],
        y_values: &[F],
        z_values: &[F],
    ) -> Vec<(F, F, F)> {
        assert_eq!(x_values.len(), y_values.len());
        assert_eq!(y_values.len(), z_values.len());

        x_values
            .iter()
            .zip(y_values.iter())
            .zip(z_values.iter())
            .map(|((&x, &y), &z)| (x, y, z))
            .collect()
    }

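    /// Sort 2-D points by their second coordinate: a cheap one-axis
    /// approximation of spatial ordering that makes nearby points adjacent
    /// in memory.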
    pub fn spatial_reorder<F: IntegrateFloat>(data: &mut [(F, F)]) {
        data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
    }
}

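/// Thin wrappers over hardware prefetch hints; all are no-ops on targets
/// other than x86-64.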
pub struct MemoryPrefetch;

impl MemoryPrefetch {
    /// Hint that the cache line at `ptr` will soon be read (T0: prefetch
    /// into all cache levels).
    #[inline]
    pub fn prefetch_read<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
            _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            let _ = ptr;
        }
    }

    /// Hint that the cache line at `ptr` will soon be written. The read
    /// hint (T0) is used here too, since the write-intent PREFETCHW
    /// instruction is not available on all x86-64 parts.
    #[inline]
    pub fn prefetch_write<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
            _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            let _ = ptr;
        }
    }

    /// Hint that `ptr` is part of a streaming access (NTA: non-temporal,
    /// minimizing cache pollution).
    #[inline]
    pub fn prefetch_stream<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_NTA};
            _mm_prefetch::<_MM_HINT_NTA>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            let _ = ptr;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;

    #[test]
    fn test_memory_pool() {
        let mut pool = MemoryPool::<f64>::new(1024 * 1024);
        let buffer = pool.allocate(100);
        assert_eq!(buffer.len(), 100);

        let usage = pool.memory_usage();
        assert!(usage.total_allocated > 0);

        let vec_buffer = buffer.into_vec();
        pool.deallocate(vec_buffer);

        // The pool should serve the follow-up request from its free list.
        let buffer2 = pool.allocate(100);
        assert_eq!(buffer2.len(), 100);
    }

    #[test]
    fn test_cache_friendly_matrix() {
        let mut matrix = CacheFriendlyMatrix::new(3, 3, MatrixLayout::RowMajor);

        matrix.set(0, 0, 1.0);
        matrix.set(1, 1, 2.0);
        matrix.set(2, 2, 3.0);

        assert_abs_diff_eq!(matrix.get(0, 0), 1.0);
        assert_abs_diff_eq!(matrix.get(1, 1), 2.0);
        assert_abs_diff_eq!(matrix.get(2, 2), 3.0);

        let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
        let y = matrix.matvec(x.view());

        assert_abs_diff_eq!(y[0], 1.0);
        assert_abs_diff_eq!(y[1], 2.0);
        assert_abs_diff_eq!(y[2], 3.0);
    }

    #[test]
    fn test_blocking_strategy() {
        let strategy = BlockingStrategy::new(64);

        assert_eq!(strategy.l1_block_size, 64);
        assert_eq!(strategy.l2_block_size, 256);
        assert_eq!(strategy.l3_block_size, 1024);

        // Small matrices clamp the block size to the matrix dimension.
        let block_size = strategy.optimal_block_size(32, CacheLevel::L1);
        assert_eq!(block_size, 32);

        // Large matrices get the full L1 block size.
        let block_size = strategy.optimal_block_size(128, CacheLevel::L1);
        assert_eq!(block_size, 64);
    }

    #[test]
    fn test_data_layout_optimizer() {
        let aos_data = vec![(1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)];
        let (x, y, z) = DataLayoutOptimizer::aos_to_soa(&aos_data);

        assert_eq!(x, vec![1.0, 4.0, 7.0]);
        assert_eq!(y, vec![2.0, 5.0, 8.0]);
        assert_eq!(z, vec![3.0, 6.0, 9.0]);

        let reconstructed = DataLayoutOptimizer::soa_to_aos(&x, &y, &z);
        assert_eq!(reconstructed, aos_data);
    }

    #[test]
    fn test_cache_aware_algorithms() {
        let data = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);

        // 1 + 2 + ... + 8 = 36, regardless of how the blocks are split.
        let sum = CacheAwareAlgorithms::reduction_blocked(data.view(), 3);
        assert_abs_diff_eq!(sum, 36.0);

        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);
        let mut c = Array1::zeros(4);

        CacheAwareAlgorithms::vector_add_blocked(a.view(), b.view(), c.view_mut(), 2);

        assert_abs_diff_eq!(c[0], 6.0);
        assert_abs_diff_eq!(c[1], 8.0);
        assert_abs_diff_eq!(c[2], 10.0);
        assert_abs_diff_eq!(c[3], 12.0);
    }
}