use crate::common::IntegrateFloat;
use scirs2_core::ndarray::{Array1, Array2, ArrayView1, ArrayView2, ArrayViewMut1, ArrayViewMut2};
use std::collections::VecDeque;
#[derive(Debug)]
pub struct MemoryPool<F: IntegrateFloat> {
buffers: std::collections::HashMap<usize, VecDeque<Vec<F>>>,
max_buffer_size: usize,
total_allocated: usize,
max_total_memory: usize,
}
impl<F: IntegrateFloat> MemoryPool<F> {
pub fn new(_max_totalmemory: usize) -> Self {
Self {
buffers: std::collections::HashMap::new(),
max_buffer_size: _max_totalmemory / 4, total_allocated: 0,
max_total_memory: _max_totalmemory,
}
}
pub fn allocate(&mut self, size: usize) -> PooledBuffer<F> {
if size <= self.max_buffer_size {
if let Some(queue) = self.buffers.get_mut(&size) {
if let Some(mut buffer) = queue.pop_front() {
buffer.clear();
buffer.resize(size, F::zero());
return PooledBuffer::new(buffer, Some(size));
}
}
}
if self.total_allocated + size * std::mem::size_of::<F>() <= self.max_total_memory {
let buffer = vec![F::zero(); size];
self.total_allocated += size * std::mem::size_of::<F>();
if size <= self.max_buffer_size {
PooledBuffer::new(buffer, Some(size))
} else {
PooledBuffer::new(buffer, None)
}
} else {
PooledBuffer::new(vec![F::zero(); size], None)
}
}
pub fn deallocate(&mut self, buffer: Vec<F>) {
let size = buffer.len();
if size <= self.max_buffer_size && self.buffers.len() < 100 {
self.buffers.entry(size).or_default().push_back(buffer);
} else {
self.total_allocated = self
.total_allocated
.saturating_sub(size * std::mem::size_of::<F>());
}
}
pub fn memory_usage(&self) -> MemoryUsage {
let pooled_buffers: usize = self.buffers.values().map(|q| q.len()).sum();
let pooled_memory: usize = self
.buffers
.iter()
.map(|(size, queue)| size * queue.len() * std::mem::size_of::<F>())
.sum();
MemoryUsage {
total_allocated: self.total_allocated,
pooled_memory,
pooled_buffers,
max_total_memory: self.max_total_memory,
}
}
pub fn clear(&mut self) {
self.buffers.clear();
self.total_allocated = 0;
}
}
pub struct PooledBuffer<F: IntegrateFloat> {
buffer: Option<Vec<F>>,
#[allow(dead_code)]
pool_size: Option<usize>,
}
impl<F: IntegrateFloat> PooledBuffer<F> {
fn new(buffer: Vec<F>, poolsize: Option<usize>) -> Self {
Self {
buffer: Some(buffer),
pool_size: poolsize,
}
}
pub fn as_slice(&mut self) -> &[F] {
self.buffer.as_ref().expect("Operation failed")
}
pub fn as_mut_slice(&mut self) -> &mut [F] {
self.buffer.as_mut().expect("Operation failed")
}
pub fn into_vec(&mut self) -> Vec<F> {
self.buffer.take().expect("Operation failed")
}
pub fn len(&self) -> usize {
self.buffer.as_ref().expect("Operation failed").len()
}
pub fn is_empty(&self) -> bool {
self.buffer.as_ref().expect("Operation failed").is_empty()
}
}
#[derive(Debug, Clone)]
pub struct MemoryUsage {
pub total_allocated: usize,
pub pooled_memory: usize,
pub pooled_buffers: usize,
pub max_total_memory: usize,
}
#[derive(Debug, Clone)]
pub struct CacheFriendlyMatrix<F: IntegrateFloat> {
data: Vec<F>,
rows: usize,
cols: usize,
layout: MatrixLayout,
}
#[derive(Debug, Clone, Copy)]
pub enum MatrixLayout {
RowMajor,
ColumnMajor,
Blocked { block_size: usize },
}
impl<F: IntegrateFloat> CacheFriendlyMatrix<F> {
pub fn new(rows: usize, cols: usize, layout: MatrixLayout) -> Self {
let data = vec![F::zero(); rows * cols];
Self {
data,
rows,
cols,
layout,
}
}
pub fn from_array(array: &Array2<F>) -> Self {
let (rows, cols) = array.dim();
let mut matrix = Self::new(rows, cols, MatrixLayout::RowMajor);
for i in 0..rows {
for j in 0..cols {
matrix.set(i, j, array[[i, j]]);
}
}
matrix
}
pub fn get(&self, row: usize, col: usize) -> F {
let index = self.compute_index(row, col);
self.data[index]
}
pub fn set(&mut self, row: usize, col: usize, value: F) {
let index = self.compute_index(row, col);
self.data[index] = value;
}
fn compute_index(&self, row: usize, col: usize) -> usize {
match self.layout {
MatrixLayout::RowMajor => row * self.cols + col,
MatrixLayout::ColumnMajor => col * self.rows + row,
MatrixLayout::Blocked { block_size } => {
let block_row = row / block_size;
let block_col = col / block_size;
let in_block_row = row % block_size;
let in_block_col = col % block_size;
let blocks_per_row = self.cols.div_ceil(block_size);
let block_index = block_row * blocks_per_row + block_col;
let in_block_index = in_block_row * block_size + in_block_col;
block_index * block_size * block_size + in_block_index
}
}
}
pub fn dim(&self) -> (usize, usize) {
(self.rows, self.cols)
}
pub fn matvec(&self, x: ArrayView1<F>) -> Array1<F> {
let mut y = Array1::zeros(self.rows);
match self.layout {
MatrixLayout::RowMajor => {
for i in 0..self.rows {
let mut sum = F::zero();
for j in 0..self.cols {
sum += self.get(i, j) * x[j];
}
y[i] = sum;
}
}
MatrixLayout::ColumnMajor => {
for j in 0..self.cols {
let x_j = x[j];
for i in 0..self.rows {
y[i] += self.get(i, j) * x_j;
}
}
}
MatrixLayout::Blocked { block_size } => {
self.blocked_matvec(x.view(), y.view_mut(), block_size);
}
}
y
}
fn blocked_matvec(&self, x: ArrayView1<F>, mut y: ArrayViewMut1<F>, blocksize: usize) {
let rows_blocks = self.rows.div_ceil(blocksize);
let cols_blocks = self.cols.div_ceil(blocksize);
for i_block in 0..rows_blocks {
for j_block in 0..cols_blocks {
let i_start = i_block * blocksize;
let i_end = (i_start + blocksize).min(self.rows);
let j_start = j_block * blocksize;
let j_end = (j_start + blocksize).min(self.cols);
for i in i_start..i_end {
let mut sum = F::zero();
for j in j_start..j_end {
sum += self.get(i, j) * x[j];
}
y[i] += sum;
}
}
}
}
}
#[derive(Debug, Clone)]
pub struct BlockingStrategy {
pub l1_block_size: usize,
pub l2_block_size: usize,
pub l3_block_size: usize,
}
impl BlockingStrategy {
pub fn new(_l1_blocksize: usize) -> Self {
Self {
l1_block_size: _l1_blocksize,
l2_block_size: _l1_blocksize * 4,
l3_block_size: _l1_blocksize * 16,
}
}
pub fn optimal_block_size(&self, _matrix_size: usize, cachelevel: CacheLevel) -> usize {
let block_size = match cachelevel {
CacheLevel::L1 => self.l1_block_size,
CacheLevel::L2 => self.l2_block_size,
CacheLevel::L3 => self.l3_block_size,
};
block_size.min(_matrix_size)
}
}
#[derive(Debug, Clone, Copy)]
pub enum CacheLevel {
L1,
L2,
L3,
}
pub struct CacheAwareAlgorithms;
impl CacheAwareAlgorithms {
pub fn transpose<F: IntegrateFloat>(
input: ArrayView2<F>,
mut output: ArrayViewMut2<F>,
block_size: usize,
) {
let (rows, cols) = input.dim();
for i_block in (0..rows).step_by(block_size) {
for j_block in (0..cols).step_by(block_size) {
let i_end = (i_block + block_size).min(rows);
let j_end = (j_block + block_size).min(cols);
for i in i_block..i_end {
for j in j_block..j_end {
output[[j, i]] = input[[i, j]];
}
}
}
}
}
pub fn vector_add_blocked<F: IntegrateFloat>(
a: ArrayView1<F>,
b: ArrayView1<F>,
mut c: ArrayViewMut1<F>,
block_size: usize,
) {
let n = a.len();
for start in (0..n).step_by(block_size) {
let end = (start + block_size).min(n);
for i in start..end {
c[i] = a[i] + b[i];
}
}
}
pub fn reduction_blocked<F: IntegrateFloat>(data: ArrayView1<F>, blocksize: usize) -> F {
let n = data.len();
let mut partial_sums = Vec::new();
for start in (0..n).step_by(blocksize) {
let end = (start + blocksize).min(n);
let mut sum = F::zero();
for i in start..end {
sum += data[i];
}
partial_sums.push(sum);
}
partial_sums.into_iter().fold(F::zero(), |acc, x| acc + x)
}
}
pub struct DataLayoutOptimizer;
impl DataLayoutOptimizer {
pub fn aos_to_soa<F: IntegrateFloat>(
data: &[(F, F, F)], ) -> (Vec<F>, Vec<F>, Vec<F>) {
let mut x_values = Vec::with_capacity(data.len());
let mut y_values = Vec::with_capacity(data.len());
let mut z_values = Vec::with_capacity(data.len());
for &(x, y, z) in data {
x_values.push(x);
y_values.push(y);
z_values.push(z);
}
(x_values, y_values, z_values)
}
pub fn soa_to_aos<F: IntegrateFloat>(
x_values: &[F],
y_values: &[F],
z_values: &[F],
) -> Vec<(F, F, F)> {
assert_eq!(x_values.len(), y_values.len());
assert_eq!(y_values.len(), z_values.len());
x_values
.iter()
.zip(y_values.iter())
.zip(z_values.iter())
.map(|((&x, &y), &z)| (x, y, z))
.collect()
}
pub fn spatial_reorder<F: IntegrateFloat>(data: &mut [(F, F)], ) {
data.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
}
}
pub struct MemoryPrefetch;
impl MemoryPrefetch {
#[inline]
pub fn prefetch_read<T>(ptr: *const T) {
#[cfg(target_arch = "x86_64")]
unsafe {
std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_T0);
}
#[cfg(not(target_arch = "x86_64"))]
{
let _ = ptr;
}
}
#[inline]
pub fn prefetch_write<T>(ptr: *const T) {
#[cfg(target_arch = "x86_64")]
unsafe {
std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_T0);
}
#[cfg(not(target_arch = "x86_64"))]
{
let _ = ptr;
}
}
#[inline]
pub fn prefetch_stream<T>(ptr: *const T) {
#[cfg(target_arch = "x86_64")]
unsafe {
std::arch::x86_64::_mm_prefetch(ptr as *const i8, std::arch::x86_64::_MM_HINT_NTA);
}
#[cfg(not(target_arch = "x86_64"))]
{
let _ = ptr;
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_abs_diff_eq;
#[test]
fn test_memory_pool() {
let mut pool = MemoryPool::<f64>::new(1024 * 1024);
let mut buffer = pool.allocate(100);
assert_eq!(buffer.len(), 100);
let usage = pool.memory_usage();
assert!(usage.total_allocated > 0);
let vec_buffer = buffer.into_vec();
pool.deallocate(vec_buffer);
let buffer2 = pool.allocate(100);
assert_eq!(buffer2.len(), 100);
}
#[test]
fn test_cache_friendly_matrix() {
let mut matrix = CacheFriendlyMatrix::new(3, 3, MatrixLayout::RowMajor);
matrix.set(0, 0, 1.0);
matrix.set(1, 1, 2.0);
matrix.set(2, 2, 3.0);
assert_abs_diff_eq!(matrix.get(0, 0), 1.0);
assert_abs_diff_eq!(matrix.get(1, 1), 2.0);
assert_abs_diff_eq!(matrix.get(2, 2), 3.0);
let x = Array1::from_vec(vec![1.0, 1.0, 1.0]);
let y = matrix.matvec(x.view());
assert_abs_diff_eq!(y[0], 1.0);
assert_abs_diff_eq!(y[1], 2.0);
assert_abs_diff_eq!(y[2], 3.0);
}
#[test]
fn test_blocking_strategy() {
let strategy = BlockingStrategy::new(64);
assert_eq!(strategy.l1_block_size, 64);
assert_eq!(strategy.l2_block_size, 256);
assert_eq!(strategy.l3_block_size, 1024);
let block_size = strategy.optimal_block_size(32, CacheLevel::L1);
assert_eq!(block_size, 32);
let block_size = strategy.optimal_block_size(128, CacheLevel::L1);
assert_eq!(block_size, 64); }
#[test]
fn test_data_layout_optimizer() {
let aos_data = vec![(1.0, 2.0, 3.0), (4.0, 5.0, 6.0), (7.0, 8.0, 9.0)];
let (x, y, z) = DataLayoutOptimizer::aos_to_soa(&aos_data);
assert_eq!(x, vec![1.0, 4.0, 7.0]);
assert_eq!(y, vec![2.0, 5.0, 8.0]);
assert_eq!(z, vec![3.0, 6.0, 9.0]);
let reconstructed = DataLayoutOptimizer::soa_to_aos(&x, &y, &z);
assert_eq!(reconstructed, aos_data);
}
#[test]
fn test_cache_aware_algorithms() {
let data = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let sum = CacheAwareAlgorithms::reduction_blocked(data.view(), 3);
assert_abs_diff_eq!(sum, 36.0);
let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0]);
let b = Array1::from_vec(vec![5.0, 6.0, 7.0, 8.0]);
let mut c = Array1::zeros(4);
CacheAwareAlgorithms::vector_add_blocked(a.view(), b.view(), c.view_mut(), 2);
assert_abs_diff_eq!(c[0], 6.0);
assert_abs_diff_eq!(c[1], 8.0);
assert_abs_diff_eq!(c[2], 10.0);
assert_abs_diff_eq!(c[3], 12.0);
}
}