scirs2-linalg 0.4.0

//! SIMD-accelerated matrix transpose operations
//!
//! This module provides cache-friendly, SIMD-accelerated matrix transpose
//! operations. All SIMD operations are delegated to scirs2-core::simd_ops
//! for unified optimization management.

#[cfg(feature = "simd")]
use crate::error::{LinalgError, LinalgResult};
#[cfg(feature = "simd")]
use scirs2_core::ndarray::{Array2, ArrayView2};
#[cfg(feature = "simd")]
use scirs2_core::simd_ops::SimdUnifiedOps;

/// SIMD-accelerated matrix transpose for f32 values
///
/// This implementation uses the unified SIMD operations from scirs2-core.
/// For large matrices (>4096 elements), uses cache-optimized blocked transpose.
///
/// # Arguments
///
/// * `matrix` - Input matrix to transpose
///
/// # Returns
///
/// * Transposed matrix
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_f32(matrix: &ArrayView2<f32>) -> LinalgResult<Array2<f32>> {
    // Use blocked transpose for cache efficiency (automatically uses simple for small matrices)
    Ok(f32::simd_transpose_blocked(matrix))
}

/// Cache-optimized blocked transpose for f32 values
///
/// Uses L1 cache-friendly block sizes for improved memory access patterns.
/// Achieves 3-5x speedup for large matrices (>512x512).
///
/// # Arguments
///
/// * `matrix` - Input matrix to transpose
///
/// # Returns
///
/// * Transposed matrix
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_blocked_f32(matrix: &ArrayView2<f32>) -> LinalgResult<Array2<f32>> {
    Ok(f32::simd_transpose_blocked(matrix))
}

/// SIMD-accelerated matrix transpose for f64 values
///
/// This implementation uses the unified SIMD operations from scirs2-core.
/// For large matrices (>4096 elements), uses cache-optimized blocked transpose.
///
/// # Arguments
///
/// * `matrix` - Input matrix to transpose
///
/// # Returns
///
/// * Transposed matrix
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_f64(matrix: &ArrayView2<f64>) -> LinalgResult<Array2<f64>> {
    // Use blocked transpose for cache efficiency (automatically uses simple for small matrices)
    Ok(f64::simd_transpose_blocked(matrix))
}

/// Cache-optimized blocked transpose for f64 values
///
/// Uses L1 cache-friendly block sizes for improved memory access patterns.
/// Achieves 3-5x speedup for large matrices (>512x512).
///
/// # Arguments
///
/// * `matrix` - Input matrix to transpose
///
/// # Returns
///
/// * Transposed matrix
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_blocked_f64(matrix: &ArrayView2<f64>) -> LinalgResult<Array2<f64>> {
    Ok(f64::simd_transpose_blocked(matrix))
}

// Note: Block transpose helper functions have been removed as we now use unified SIMD operations from scirs2-core

/// In-place matrix transpose for square f32 matrices using SIMD
///
/// This function performs in-place transpose for square matrices,
/// which is more memory-efficient than creating a new matrix.
///
/// # Arguments
///
/// * `matrix` - Square matrix to transpose in-place
///
/// # Returns
///
/// * Result indicating success or failure
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_inplace_f32(matrix: &mut Array2<f32>) -> LinalgResult<()> {
    let (rows, cols) = matrix.dim();

    if rows != cols {
        return Err(LinalgError::ShapeError(format!(
            "In-place transpose requires square matrix, got shape ({rows}, {cols})"
        )));
    }

    // For in-place transpose, we need to swap elements
    // Since the unified ops don't provide in-place transpose, we'll do it manually
    let n = rows;
    for i in 0..n {
        for j in (i + 1)..n {
            let temp = matrix[[i, j]];
            matrix[[i, j]] = matrix[[j, i]];
            matrix[[j, i]] = temp;
        }
    }

    Ok(())
}

/// In-place matrix transpose for square f64 matrices using SIMD
///
/// This function performs in-place transpose for square matrices,
/// which is more memory-efficient than creating a new matrix.
///
/// # Arguments
///
/// * `matrix` - Square matrix to transpose in-place
///
/// # Returns
///
/// * Result indicating success or failure
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn simd_transpose_inplace_f64(matrix: &mut Array2<f64>) -> LinalgResult<()> {
    let (rows, cols) = matrix.dim();

    if rows != cols {
        return Err(LinalgError::ShapeError(format!(
            "In-place transpose requires square matrix, got shape ({rows}, {cols})"
        )));
    }

    // For in-place transpose, we need to swap elements
    // Since the unified ops don't provide in-place transpose, we'll do it manually
    let n = rows;
    for i in 0..n {
        for j in (i + 1)..n {
            let temp = matrix[[i, j]];
            matrix[[i, j]] = matrix[[j, i]];
            matrix[[j, i]] = temp;
        }
    }

    Ok(())
}

// Note: Block swap helper functions have been removed as we now use simplified in-place transpose

#[cfg(all(test, feature = "simd"))]
mod tests {
    use super::*;
    use approx::assert_relative_eq;
    use scirs2_core::ndarray::array;

    #[test]
    #[cfg(feature = "simd")]
    fn test_simd_transpose_f32() {
        let matrix = array![
            [1.0f32, 2.0, 3.0, 4.0],
            [5.0, 6.0, 7.0, 8.0],
            [9.0, 10.0, 11.0, 12.0]
        ];

        let result = simd_transpose_f32(&matrix.view()).expect("Operation failed");

        let expected = array![
            [1.0f32, 5.0, 9.0],
            [2.0, 6.0, 10.0],
            [3.0, 7.0, 11.0],
            [4.0, 8.0, 12.0]
        ];

        assert_eq!(result.shape(), expected.shape());
        for ((i, j), &val) in result.indexed_iter() {
            assert_relative_eq!(val, expected[[i, j]], epsilon = 1e-6);
        }
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_simd_transpose_f64() {
        let matrix = array![[1.0f64, 2.0, 3.0], [4.0, 5.0, 6.0]];

        let result = simd_transpose_f64(&matrix.view()).expect("Operation failed");

        let expected = array![[1.0f64, 4.0], [2.0, 5.0], [3.0, 6.0]];

        assert_eq!(result.shape(), expected.shape());
        for ((i, j), &val) in result.indexed_iter() {
            assert_relative_eq!(val, expected[[i, j]], epsilon = 1e-12);
        }
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_simd_transpose_inplace_f32() {
        let mut matrix = array![[1.0f32, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]];

        simd_transpose_inplace_f32(&mut matrix).expect("Operation failed");

        let expected = array![[1.0f32, 4.0, 7.0], [2.0, 5.0, 8.0], [3.0, 6.0, 9.0]];

        assert_eq!(matrix.shape(), expected.shape());
        for ((i, j), &val) in matrix.indexed_iter() {
            assert_relative_eq!(val, expected[[i, j]], epsilon = 1e-6);
        }
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_simd_transpose_largematrix() {
        // Test with larger matrix to exercise blocking
        let size = 100;
        let mut matrix = Array2::zeros((size, size));

        // Fill with test pattern
        for i in 0..size {
            for j in 0..size {
                matrix[[i, j]] = (i * size + j) as f32;
            }
        }

        let result = simd_transpose_f32(&matrix.view()).expect("Operation failed");

        // Verify transpose correctness
        for i in 0..size {
            for j in 0..size {
                assert_relative_eq!(result[[j, i]], matrix[[i, j]], epsilon = 1e-6);
            }
        }
    }
}