use ndarray::{Array2, ArrayView2};
const L1_CACHE_SIZE: usize = 32768;
#[inline]
fn calculate_block_size(element_size: usize) -> usize {
let optimal = ((L1_CACHE_SIZE / 2) / element_size) as f64;
let block_size = optimal.sqrt() as usize;
block_size.clamp(8, 64)
}
pub fn simd_transpose_blocked_f32(a: &ArrayView2<f32>) -> Array2<f32> {
let (rows, cols) = a.dim();
if rows * cols < 4096 {
return a.t().to_owned();
}
let mut result = Array2::zeros((cols, rows));
let block_size = calculate_block_size(std::mem::size_of::<f32>());
for i_block in (0..rows).step_by(block_size) {
for j_block in (0..cols).step_by(block_size) {
let i_end = (i_block + block_size).min(rows);
let j_end = (j_block + block_size).min(cols);
for i in i_block..i_end {
for j in j_block..j_end {
result[[j, i]] = a[[i, j]];
}
}
}
}
result
}
pub fn simd_transpose_blocked_f64(a: &ArrayView2<f64>) -> Array2<f64> {
let (rows, cols) = a.dim();
if rows * cols < 4096 {
return a.t().to_owned();
}
let mut result = Array2::zeros((cols, rows));
let block_size = calculate_block_size(std::mem::size_of::<f64>());
for i_block in (0..rows).step_by(block_size) {
for j_block in (0..cols).step_by(block_size) {
let i_end = (i_block + block_size).min(rows);
let j_end = (j_block + block_size).min(cols);
for i in i_block..i_end {
for j in j_block..j_end {
result[[j, i]] = a[[i, j]];
}
}
}
}
result
}
#[cfg(test)]
mod tests {
use super::*;
use ndarray::array;
#[test]
fn test_blocked_transpose_f32_small() {
let a = array![[1.0f32, 2.0, 3.0], [4.0, 5.0, 6.0]];
let result = simd_transpose_blocked_f32(&a.view());
let expected = array![[1.0f32, 4.0], [2.0, 5.0], [3.0, 6.0]];
assert_eq!(result, expected);
}
#[test]
fn test_blocked_transpose_f64_small() {
let a = array![[1.0f64, 2.0, 3.0], [4.0, 5.0, 6.0]];
let result = simd_transpose_blocked_f64(&a.view());
let expected = array![[1.0f64, 4.0], [2.0, 5.0], [3.0, 6.0]];
assert_eq!(result, expected);
}
#[test]
fn test_blocked_transpose_f32_large() {
let size = 128;
let a: Array2<f32> = Array2::from_shape_fn((size, size), |(i, j)| (i * size + j) as f32);
let result = simd_transpose_blocked_f32(&a.view());
for i in 0..size {
for j in 0..size {
assert_eq!(result[[j, i]], a[[i, j]]);
}
}
}
#[test]
fn test_blocked_transpose_f64_large() {
let size = 128;
let a: Array2<f64> = Array2::from_shape_fn((size, size), |(i, j)| (i * size + j) as f64);
let result = simd_transpose_blocked_f64(&a.view());
for i in 0..size {
for j in 0..size {
assert_eq!(result[[j, i]], a[[i, j]]);
}
}
}
#[test]
fn test_blocked_transpose_rectangular() {
let a: Array2<f32> = Array2::from_shape_fn((100, 200), |(i, j)| (i * 200 + j) as f32);
let result = simd_transpose_blocked_f32(&a.view());
assert_eq!(result.dim(), (200, 100));
for i in 0..100 {
for j in 0..200 {
assert_eq!(result[[j, i]], a[[i, j]]);
}
}
}
#[test]
fn test_block_size_calculation() {
let block_f32 = calculate_block_size(4);
assert!(block_f32 >= 8 && block_f32 <= 64);
let block_f64 = calculate_block_size(8);
assert!(block_f64 >= 8 && block_f64 <= 64);
}
}