scirs2_integrate/memory.rs

//! Memory access pattern optimization utilities
//!
//! This module provides utilities for optimizing memory access patterns in
//! numerical computations. It includes cache-friendly algorithms, data layout
//! optimizations, and memory pooling strategies to improve performance.
//!
//! # Memory Layout Optimization
//!
//! Different memory layouts can significantly impact performance:
//! - Row-major vs column-major storage for matrices
//! - Structure of Arrays (SoA) vs Array of Structures (AoS)
//! - Cache-friendly blocking and tiling
//!
//! # Examples
//!
//! ```
//! use scirs2_integrate::memory::{MemoryPool, CacheFriendlyMatrix, BlockingStrategy, MatrixLayout};
//!
//! // Use a memory pool for frequent allocations
//! let mut pool = MemoryPool::<f64>::new(1024);
//! let buffer = pool.allocate(100);
//!
//! // Cache-friendly matrix operations
//! let matrix = CacheFriendlyMatrix::<f64>::new(100, 100, MatrixLayout::RowMajor);
//! let blocking = BlockingStrategy::new(64); // 64x64 blocks
//! ```

use crate::common::IntegrateFloat;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
use std::collections::VecDeque;

/// Memory pool for efficient allocation and reuse of buffers
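///
/// A minimal usage sketch (the byte budget and buffer length below are
/// illustrative, not recommendations):
///
/// ```
/// use scirs2_integrate::memory::MemoryPool;
///
/// let mut pool = MemoryPool::<f64>::new(1 << 20); // ~1 MiB budget
/// let buffer = pool.allocate(256);
/// assert_eq!(buffer.len(), 256);
/// // Hand the underlying Vec back so the pool can recycle it.
/// pool.deallocate(buffer.into_vec());
/// ```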
#[derive(Debug)]
pub struct MemoryPool<F: IntegrateFloat> {
    /// Available buffers, keyed by size in elements
    buffers: std::collections::HashMap<usize, VecDeque<Vec<F>>>,
    /// Maximum buffer size (in elements) eligible for pooling
    max_buffer_size: usize,
    /// Total memory currently allocated, in bytes
    total_allocated: usize,
    /// Maximum total memory allowed, in bytes
    max_total_memory: usize,
}

impl<F: IntegrateFloat> MemoryPool<F> {
    /// Create a new memory pool with a total budget of `max_total_memory` bytes
    pub fn new(max_total_memory: usize) -> Self {
        Self {
            buffers: std::collections::HashMap::new(),
            // Cap any single pooled buffer at a quarter of the total budget,
            // converted from bytes to an element count.
            max_buffer_size: max_total_memory / (4 * std::mem::size_of::<F>()),
            total_allocated: 0,
            max_total_memory,
        }
    }

    /// Allocate a buffer of the specified size (in elements)
    pub fn allocate(&mut self, size: usize) -> PooledBuffer<F> {
        if size <= self.max_buffer_size {
            // Try to reuse an existing buffer
            if let Some(queue) = self.buffers.get_mut(&size) {
                if let Some(mut buffer) = queue.pop_front() {
                    buffer.clear();
                    buffer.resize(size, F::zero());
                    return PooledBuffer::new(buffer, Some(size));
                }
            }
        }

        // Allocate a new buffer
        if self.total_allocated + size * std::mem::size_of::<F>() <= self.max_total_memory {
            let buffer = vec![F::zero(); size];
            self.total_allocated += size * std::mem::size_of::<F>();

            if size <= self.max_buffer_size {
                PooledBuffer::new(buffer, Some(size))
            } else {
                PooledBuffer::new(buffer, None)
            }
        } else {
            // Fall back to a regular, untracked allocation if the pool is full
            PooledBuffer::new(vec![F::zero(); size], None)
        }
    }

    /// Return a buffer to the pool
    pub fn deallocate(&mut self, buffer: Vec<F>) {
        let size = buffer.len();
        // Pool the buffer if it is small enough and we are not already
        // tracking too many distinct size classes; pooled memory remains
        // counted in `total_allocated` because it is still held.
        if size <= self.max_buffer_size && self.buffers.len() < 100 {
            self.buffers.entry(size).or_default().push_back(buffer);
        } else {
            self.total_allocated = self
                .total_allocated
                .saturating_sub(size * std::mem::size_of::<F>());
        }
    }

    /// Get memory usage statistics
    pub fn memory_usage(&self) -> MemoryUsage {
        let pooled_buffers: usize = self.buffers.values().map(|q| q.len()).sum();
        let pooled_memory: usize = self
            .buffers
            .iter()
            .map(|(size, queue)| size * queue.len() * std::mem::size_of::<F>())
            .sum();

        MemoryUsage {
            total_allocated: self.total_allocated,
            pooled_memory,
            pooled_buffers,
            max_total_memory: self.max_total_memory,
        }
    }

    /// Clear all pooled buffers
    pub fn clear(&mut self) {
        // Release only the memory held by idle pooled buffers; buffers still
        // checked out by callers remain counted in `total_allocated`.
        let pooled_memory: usize = self
            .buffers
            .iter()
            .map(|(size, queue)| size * queue.len() * std::mem::size_of::<F>())
            .sum();
        self.buffers.clear();
        self.total_allocated = self.total_allocated.saturating_sub(pooled_memory);
    }
}

/// Owning wrapper for pooled buffers; the storage is handed back to the pool
/// explicitly via `into_vec` plus `MemoryPool::deallocate`
pub struct PooledBuffer<F: IntegrateFloat> {
    buffer: Option<Vec<F>>,
    #[allow(dead_code)]
    pool_size: Option<usize>,
}

impl<F: IntegrateFloat> PooledBuffer<F> {
    fn new(buffer: Vec<F>, pool_size: Option<usize>) -> Self {
        Self {
            buffer: Some(buffer),
            pool_size,
        }
    }

    /// Get reference to the buffer
    pub fn as_slice(&self) -> &[F] {
        self.buffer.as_ref().unwrap()
    }

    /// Get mutable reference to the buffer
    pub fn as_mut_slice(&mut self) -> &mut [F] {
        self.buffer.as_mut().unwrap()
    }

    /// Convert to owned Vec (consumes the buffer)
    pub fn into_vec(mut self) -> Vec<F> {
        self.buffer.take().unwrap()
    }

    /// Get length of buffer
    pub fn len(&self) -> usize {
        self.buffer.as_ref().unwrap().len()
    }

    /// Check if buffer is empty
    pub fn is_empty(&self) -> bool {
        self.buffer.as_ref().unwrap().is_empty()
    }
}

/// Memory usage statistics
#[derive(Debug, Clone)]
pub struct MemoryUsage {
    /// Total bytes currently allocated through the pool
    pub total_allocated: usize,
    /// Bytes held by idle pooled buffers
    pub pooled_memory: usize,
    /// Number of idle pooled buffers
    pub pooled_buffers: usize,
    /// Configured byte budget
    pub max_total_memory: usize,
}

/// Cache-friendly matrix with optimized memory layout
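///
/// A small sketch contrasting layouts (dimensions and block size are
/// illustrative):
///
/// ```
/// use scirs2_integrate::memory::{CacheFriendlyMatrix, MatrixLayout};
///
/// // Row-major favors row-wise traversal (e.g., matvec dot products).
/// let mut a = CacheFriendlyMatrix::<f64>::new(4, 4, MatrixLayout::RowMajor);
/// a.set(0, 0, 1.0);
///
/// // Blocked storage keeps 2x2 tiles contiguous for tiled kernels.
/// let mut b = CacheFriendlyMatrix::<f64>::new(4, 4, MatrixLayout::Blocked { block_size: 2 });
/// b.set(3, 3, 2.0);
/// assert_eq!(b.get(3, 3), 2.0);
/// ```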
#[derive(Debug, Clone)]
pub struct CacheFriendlyMatrix<F: IntegrateFloat> {
    data: Vec<F>,
    rows: usize,
    cols: usize,
    layout: MatrixLayout,
}

#[derive(Debug, Clone, Copy)]
pub enum MatrixLayout {
    RowMajor,
    ColumnMajor,
    Blocked { block_size: usize },
}

impl<F: IntegrateFloat> CacheFriendlyMatrix<F> {
    /// Create new matrix with specified layout
    pub fn new(rows: usize, cols: usize, layout: MatrixLayout) -> Self {
        // Blocked storage pads every block to block_size^2 elements, so the
        // backing vector must cover the padded block grid, not just rows * cols.
        let len = match layout {
            MatrixLayout::Blocked { block_size } => {
                rows.div_ceil(block_size) * cols.div_ceil(block_size) * block_size * block_size
            }
            _ => rows * cols,
        };
        Self {
            data: vec![F::zero(); len],
            rows,
            cols,
            layout,
        }
    }

    /// Create from an existing Array2 (stored row-major)
    pub fn from_array(array: &Array2<F>) -> Self {
        let (rows, cols) = array.dim();
        let mut matrix = Self::new(rows, cols, MatrixLayout::RowMajor);

        for i in 0..rows {
            for j in 0..cols {
                matrix.set(i, j, array[[i, j]]);
            }
        }

        matrix
    }

    /// Get element at (row, col)
    pub fn get(&self, row: usize, col: usize) -> F {
        let index = self.compute_index(row, col);
        self.data[index]
    }

    /// Set element at (row, col)
    pub fn set(&mut self, row: usize, col: usize, value: F) {
        let index = self.compute_index(row, col);
        self.data[index] = value;
    }

    /// Compute linear index based on layout
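    ///
    /// Worked example (illustrative): with `block_size = 2` on a 3x3 matrix,
    /// element (2, 1) falls in block (1, 0). With `blocks_per_row = 2` that is
    /// block index 2, offset (0, 1) inside the block, so the linear index is
    /// 2 * 4 + 1 = 9 -- which is why blocked storage must be padded beyond
    /// rows * cols (9 elements would not be enough here).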
    fn compute_index(&self, row: usize, col: usize) -> usize {
        match self.layout {
            MatrixLayout::RowMajor => row * self.cols + col,
            MatrixLayout::ColumnMajor => col * self.rows + row,
            MatrixLayout::Blocked { block_size } => {
                let block_row = row / block_size;
                let block_col = col / block_size;
                let in_block_row = row % block_size;
                let in_block_col = col % block_size;

                let blocks_per_row = self.cols.div_ceil(block_size);
                let block_index = block_row * blocks_per_row + block_col;
                let in_block_index = in_block_row * block_size + in_block_col;

                block_index * block_size * block_size + in_block_index
            }
        }
    }

    /// Get matrix dimensions
    pub fn dim(&self) -> (usize, usize) {
        (self.rows, self.cols)
    }

    /// Cache-friendly matrix-vector multiplication
    pub fn matvec(&self, x: ArrayView1<F>) -> Array1<F> {
        let mut y = Array1::zeros(self.rows);

        match self.layout {
            MatrixLayout::RowMajor => {
                for i in 0..self.rows {
                    let mut sum = F::zero();
                    for j in 0..self.cols {
                        sum += self.get(i, j) * x[j];
                    }
                    y[i] = sum;
                }
            }
            MatrixLayout::ColumnMajor => {
                for j in 0..self.cols {
                    let x_j = x[j];
                    for i in 0..self.rows {
                        y[i] += self.get(i, j) * x_j;
                    }
                }
            }
            MatrixLayout::Blocked { block_size } => {
                self.blocked_matvec(x.view(), y.view_mut(), block_size);
            }
        }

        y
    }

    /// Blocked matrix-vector multiplication for better cache performance
    fn blocked_matvec(&self, x: ArrayView1<F>, mut y: ArrayViewMut1<F>, block_size: usize) {
        let rows_blocks = self.rows.div_ceil(block_size);
        let cols_blocks = self.cols.div_ceil(block_size);

        for i_block in 0..rows_blocks {
            for j_block in 0..cols_blocks {
                let i_start = i_block * block_size;
                let i_end = (i_start + block_size).min(self.rows);
                let j_start = j_block * block_size;
                let j_end = (j_start + block_size).min(self.cols);

                for i in i_start..i_end {
                    let mut sum = F::zero();
                    for j in j_start..j_end {
                        sum += self.get(i, j) * x[j];
                    }
                    y[i] += sum;
                }
            }
        }
    }
}

/// Blocking strategy for cache-friendly algorithms
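///
/// A usage sketch (the 64-element L1 block is only an example size):
///
/// ```
/// use scirs2_integrate::memory::{BlockingStrategy, CacheLevel};
///
/// let strategy = BlockingStrategy::new(64);
/// // Block sizes never exceed the matrix dimension itself.
/// assert_eq!(strategy.optimal_block_size(32, CacheLevel::L1), 32);
/// assert_eq!(strategy.optimal_block_size(512, CacheLevel::L2), 256);
/// ```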
#[derive(Debug, Clone)]
pub struct BlockingStrategy {
    /// Block size for L1 cache
    pub l1_block_size: usize,
    /// Block size for L2 cache
    pub l2_block_size: usize,
    /// Block size for L3 cache
    pub l3_block_size: usize,
}

impl BlockingStrategy {
    /// Create blocking strategy with the specified L1 block size; the L2 and
    /// L3 block sizes are derived with simple 4x and 16x heuristics
    pub fn new(l1_block_size: usize) -> Self {
        Self {
            l1_block_size,
            l2_block_size: l1_block_size * 4,
            l3_block_size: l1_block_size * 16,
        }
    }

    /// Get optimal block size for given matrix dimension and cache level
    pub fn optimal_block_size(&self, matrix_size: usize, cache_level: CacheLevel) -> usize {
        let block_size = match cache_level {
            CacheLevel::L1 => self.l1_block_size,
            CacheLevel::L2 => self.l2_block_size,
            CacheLevel::L3 => self.l3_block_size,
        };

        // Adjust for small matrices
        block_size.min(matrix_size)
    }
}

#[derive(Debug, Clone, Copy)]
pub enum CacheLevel {
    L1,
    L2,
    L3,
}

/// Cache-aware algorithms for common operations
pub struct CacheAwareAlgorithms;

impl CacheAwareAlgorithms {
    /// Cache-friendly matrix transpose
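    ///
    /// A small sketch (the 2-element block is illustrative; real block sizes
    /// would be tuned to cache capacity):
    ///
    /// ```
    /// use scirs2_core::ndarray::Array2;
    /// use scirs2_integrate::memory::CacheAwareAlgorithms;
    ///
    /// let input = Array2::from_shape_vec((2, 3), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
    /// let mut output = Array2::<f64>::zeros((3, 2));
    /// CacheAwareAlgorithms::transpose(input.view(), output.view_mut(), 2);
    /// assert_eq!(output[[2, 0]], 3.0);
    /// ```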
    pub fn transpose<F: IntegrateFloat>(
        input: ArrayView2<F>,
        mut output: ArrayViewMut2<F>,
        block_size: usize,
    ) {
        let (rows, cols) = input.dim();

        for i_block in (0..rows).step_by(block_size) {
            for j_block in (0..cols).step_by(block_size) {
                let i_end = (i_block + block_size).min(rows);
                let j_end = (j_block + block_size).min(cols);

                for i in i_block..i_end {
                    for j in j_block..j_end {
                        output[[j, i]] = input[[i, j]];
                    }
                }
            }
        }
    }

    /// Cache-friendly vector operations with temporal locality
    pub fn vector_add_blocked<F: IntegrateFloat>(
        a: ArrayView1<F>,
        b: ArrayView1<F>,
        mut c: ArrayViewMut1<F>,
        block_size: usize,
    ) {
        let n = a.len();

        for start in (0..n).step_by(block_size) {
            let end = (start + block_size).min(n);

            for i in start..end {
                c[i] = a[i] + b[i];
            }
        }
    }

    /// Memory-efficient reduction with minimal cache misses
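    ///
    /// Note that blockwise partial sums change the floating-point rounding
    /// order slightly compared to a single left-to-right sum; for well-scaled
    /// data the difference is negligible. A minimal sketch:
    ///
    /// ```
    /// use scirs2_core::ndarray::Array1;
    /// use scirs2_integrate::memory::CacheAwareAlgorithms;
    ///
    /// let data = Array1::from_vec(vec![1.0f64; 10]);
    /// let sum = CacheAwareAlgorithms::reduction_blocked(data.view(), 4);
    /// assert_eq!(sum, 10.0);
    /// ```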
    pub fn reduction_blocked<F: IntegrateFloat>(data: ArrayView1<F>, block_size: usize) -> F {
        let n = data.len();
        let mut partial_sums = Vec::new();

        // Compute partial sums for each block
        for start in (0..n).step_by(block_size) {
            let end = (start + block_size).min(n);
            let mut sum = F::zero();

            for i in start..end {
                sum += data[i];
            }

            partial_sums.push(sum);
        }

        // Final reduction of the partial sums
        partial_sums.into_iter().fold(F::zero(), |acc, x| acc + x)
    }
}

/// Data layout optimization utilities
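///
/// SoA groups each field contiguously so vectorized kernels can stream one
/// field at a time, while AoS keeps all fields of a point adjacent. A minimal
/// sketch:
///
/// ```
/// use scirs2_integrate::memory::DataLayoutOptimizer;
///
/// let particles = vec![(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)];
/// let (xs, ys, zs) = DataLayoutOptimizer::aos_to_soa(&particles);
/// // Each coordinate is now contiguous, e.g. for SIMD-friendly sweeps.
/// assert_eq!(xs, vec![1.0, 4.0]);
/// let back = DataLayoutOptimizer::soa_to_aos(&xs, &ys, &zs);
/// assert_eq!(back, particles);
/// ```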
pub struct DataLayoutOptimizer;

impl DataLayoutOptimizer {
    /// Convert Array of Structures (AoS) to Structure of Arrays (SoA)
    pub fn aos_to_soa<F: IntegrateFloat>(
        data: &[(F, F, F)], // Input: array of (x, y, z) tuples
    ) -> (Vec<F>, Vec<F>, Vec<F>) {
        let mut x_values = Vec::with_capacity(data.len());
        let mut y_values = Vec::with_capacity(data.len());
        let mut z_values = Vec::with_capacity(data.len());

        for &(x, y, z) in data {
            x_values.push(x);
            y_values.push(y);
            z_values.push(z);
        }

        (x_values, y_values, z_values)
    }

    /// Convert Structure of Arrays (SoA) to Array of Structures (AoS)
    pub fn soa_to_aos<F: IntegrateFloat>(
        x_values: &[F],
        y_values: &[F],
        z_values: &[F],
    ) -> Vec<(F, F, F)> {
        assert_eq!(x_values.len(), y_values.len());
        assert_eq!(y_values.len(), z_values.len());

        x_values
            .iter()
            .zip(y_values.iter())
            .zip(z_values.iter())
            .map(|((&x, &y), &z)| (x, y, z))
            .collect()
    }

    /// Reorder (value, key) pairs by key to improve spatial locality
    pub fn spatial_reorder<F: IntegrateFloat>(data: &mut [(F, F)]) {
        // Sort by key so neighboring accesses touch neighboring memory
        data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
    }
}

/// Memory prefetching utilities for performance optimization
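///
/// A typical pattern is to prefetch a fixed distance ahead of the element
/// currently being processed (the distance of 8 below is an illustrative
/// guess; the right value depends on the loop body and the hardware):
///
/// ```
/// use scirs2_integrate::memory::MemoryPrefetch;
///
/// let data = vec![1.0f64; 1024];
/// let mut sum = 0.0;
/// for i in 0..data.len() {
///     if i + 8 < data.len() {
///         MemoryPrefetch::prefetch_read(&data[i + 8] as *const f64);
///     }
///     sum += data[i];
/// }
/// assert_eq!(sum, 1024.0);
/// ```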
pub struct MemoryPrefetch;

impl MemoryPrefetch {
    /// Software prefetch hint for reads (platform-specific)
    #[inline]
    pub fn prefetch_read<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
            _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            // No-op fallback for other architectures
            let _ = ptr;
        }
    }

    /// Prefetch for write access
    ///
    /// Uses the same T0 hint as reads: the dedicated write hint
    /// (`_MM_HINT_ET0`, lowering to `prefetchw`) requires the `prfchw`
    /// target feature, so T0 is the portable choice here.
    #[inline]
    pub fn prefetch_write<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_T0};
            _mm_prefetch::<_MM_HINT_T0>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            // No-op fallback for other architectures
            let _ = ptr;
        }
    }

    /// Streaming (non-temporal) access pattern: hint that the data will not
    /// be reused, so it should bypass the regular cache hierarchy
    #[inline]
    pub fn prefetch_stream<T>(ptr: *const T) {
        #[cfg(target_arch = "x86_64")]
        unsafe {
            use std::arch::x86_64::{_mm_prefetch, _MM_HINT_NTA};
            _mm_prefetch::<_MM_HINT_NTA>(ptr as *const i8);
        }

        #[cfg(not(target_arch = "x86_64"))]
        {
            // No-op fallback for other architectures
            let _ = ptr;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_abs_diff_eq;

    #[test]
    fn test_memory_pool() {
        let mut pool = MemoryPool::<f64>::new(1024 * 1024); // 1 MiB

        // Allocate a buffer
        let buffer = pool.allocate(100);
        assert_eq!(buffer.len(), 100);

        // Check memory usage
        let usage = pool.memory_usage();
        assert!(usage.total_allocated > 0);

        // Return the buffer to the pool
        let vec_buffer = buffer.into_vec();
        pool.deallocate(vec_buffer);

        // Allocate again (should reuse the pooled buffer)
        let buffer2 = pool.allocate(100);
        assert_eq!(buffer2.len(), 100);
    }

    #[test]
    fn test_cache_friendly_matrix() {
        let mut matrix = CacheFriendlyMatrix::new(3, 3, MatrixLayout::RowMajor);

        // Set some values
        matrix.set(0, 0, 1.0);
        matrix.set(1, 1, 2.0);
        matrix.set(2, 2, 3.0);

        // Check retrieval
        assert_abs_diff_eq!(matrix.get(0, 0), 1.0);
        assert_abs_diff_eq!(matrix.get(1, 1), 2.0);
        assert_abs_diff_eq!(matrix.get(2, 2), 3.0);

        // Test matrix-vector multiplication
        let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
        let y = matrix.matvec(x.view());

        assert_abs_diff_eq!(y[0], 1.0);
        assert_abs_diff_eq!(y[1], 2.0);
        assert_abs_diff_eq!(y[2], 3.0);
    }

    #[test]
    fn test_blocking_strategy() {
        let strategy = BlockingStrategy::new(64);

        assert_eq!(strategy.l1_block_size, 64);
        assert_eq!(strategy.l2_block_size, 256);
        assert_eq!(strategy.l3_block_size, 1024);

        // Test optimal block size selection
        let block_size = strategy.optimal_block_size(32, CacheLevel::L1);
        assert_eq!(block_size, 32); // Limited by the matrix size

        let block_size = strategy.optimal_block_size(128, CacheLevel::L1);
        assert_eq!(block_size, 64); // Uses the L1 block size
    }

    #[test]
    fn test_data_layout_optimizer() {
        // Test AoS to SoA conversion
        let aos_data = vec![(1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)];
        let (x, y, z) = DataLayoutOptimizer::aos_to_soa(&aos_data);

        assert_eq!(x, vec![1.0, 4.0, 7.0]);
        assert_eq!(y, vec![2.0, 5.0, 8.0]);
        assert_eq!(z, vec![3.0, 6.0, 9.0]);

        // Test SoA to AoS conversion
        let reconstructed = DataLayoutOptimizer::soa_to_aos(&x, &y, &z);
        assert_eq!(reconstructed, aos_data);
    }

    #[test]
    fn test_cache_aware_algorithms() {
        let data = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);

        // Test blocked reduction
        let sum = CacheAwareAlgorithms::reduction_blocked(data.view(), 3);
        assert_abs_diff_eq!(sum, 36.0); // 1 + 2 + ... + 8 = 36

        // Test blocked vector addition
        let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
        let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);
        let mut c = Array1::zeros(4);

        CacheAwareAlgorithms::vector_add_blocked(a.view(), b.view(), c.view_mut(), 2);

        assert_abs_diff_eq!(c[0], 6.0);
        assert_abs_diff_eq!(c[1], 8.0);
        assert_abs_diff_eq!(c[2], 10.0);
        assert_abs_diff_eq!(c[3], 12.0);
    }
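
    // A sketch exercising the padded Blocked layout: a 2-element block on a
    // 3x3 matrix deliberately creates partial blocks at the edges.
    #[test]
    fn test_blocked_layout_roundtrip() {
        let mut matrix = CacheFriendlyMatrix::new(3, 3, MatrixLayout::Blocked { block_size: 2 });

        for i in 0..3 {
            for j in 0..3 {
                matrix.set(i, j, (i * 3 + j) as f64);
            }
        }

        for i in 0..3 {
            for j in 0..3 {
                assert_abs_diff_eq!(matrix.get(i, j), (i * 3 + j) as f64);
            }
        }

        // matvec through the blocked path must match the row sums
        let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
        let y = matrix.matvec(x.view());
        assert_abs_diff_eq!(y[0], 3.0); // 0 + 1 + 2
        assert_abs_diff_eq!(y[1], 12.0); // 3 + 4 + 5
        assert_abs_diff_eq!(y[2], 21.0); // 6 + 7 + 8
    }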
}