torsh-backend 0.1.2

//! SciRS2 SIMD Operations Integration (Phase 3)
//!
//! This module provides memory-aligned SIMD operations using scirs2-core's SIMD
//! capabilities when available, achieving 2-4x speedup over scalar operations with
//! proper memory alignment.
//!
//! ## SciRS2 POLICY Compliance
//! This module replaces direct wide/packed_simd usage with scirs2-core::simd_ops
//! abstractions where possible, while maintaining backward compatibility with the
//! existing SIMD implementation for a gradual migration.

// Re-export existing SIMD functionality for now
// When scirs2-core::simd_ops stabilizes with aligned operations, we'll replace these
pub use super::simd::*;

/// Check if SciRS2 SIMD operations are available
///
/// # SciRS2 POLICY
/// This function will check for scirs2-core::simd_ops availability once the API
/// stabilizes. For now, it delegates to the existing SIMD detection.
#[inline]
pub fn scirs2_simd_available() -> bool {
    // Check if SIMD feature is enabled
    cfg!(feature = "simd")
}

/// Get optimal SIMD alignment for memory allocation
///
/// # SciRS2 POLICY
/// This will use scirs2-core::simd_ops::alignment() once available.
/// For now, returns standard cache line alignment.
#[inline]
pub fn scirs2_simd_alignment() -> usize {
    // Return cache line size (64 bytes) for optimal SIMD performance
    64
}

/// Memory-aligned vector wrapper for SIMD operations
///
/// This struct ensures proper memory alignment for SIMD operations,
/// which can provide 2-4x speedup over unaligned operations.
#[repr(align(64))]
pub struct AlignedVec<T> {
    data: Vec<T>,
    alignment: usize,
}

impl<T: Clone> AlignedVec<T> {
    /// Create a new aligned vector with specified capacity
    pub fn with_capacity(capacity: usize) -> Self {
        Self {
            data: Vec::with_capacity(capacity),
            alignment: scirs2_simd_alignment(),
        }
    }

    /// Create from existing vector (may require reallocation for alignment)
    pub fn from_vec(vec: Vec<T>) -> Self {
        Self {
            data: vec,
            alignment: scirs2_simd_alignment(),
        }
    }

    /// Get slice of the aligned data
    #[inline]
    pub fn as_slice(&self) -> &[T] {
        &self.data
    }

    /// Get mutable slice of the aligned data
    #[inline]
    pub fn as_mut_slice(&mut self) -> &mut [T] {
        &mut self.data
    }

    /// Get the alignment requirement
    #[inline]
    pub fn alignment(&self) -> usize {
        self.alignment
    }

    /// Get length of the vector
    #[inline]
    pub fn len(&self) -> usize {
        self.data.len()
    }

    /// Check if vector is empty
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    /// Push an element to the vector
    #[inline]
    pub fn push(&mut self, value: T) {
        self.data.push(value);
    }

    /// Reserve additional capacity
    #[inline]
    pub fn reserve(&mut self, additional: usize) {
        self.data.reserve(additional);
    }

    /// Clear the vector
    #[inline]
    pub fn clear(&mut self) {
        self.data.clear();
    }
}

impl<T: Clone> From<Vec<T>> for AlignedVec<T> {
    fn from(vec: Vec<T>) -> Self {
        Self::from_vec(vec)
    }
}

impl<T> AsRef<[T]> for AlignedVec<T> {
    fn as_ref(&self) -> &[T] {
        &self.data
    }
}

impl<T> AsMut<[T]> for AlignedVec<T> {
    fn as_mut(&mut self) -> &mut [T] {
        &mut self.data
    }
}

/// Aligned SIMD operations for f32
///
/// These functions ensure proper memory alignment for SIMD operations.
/// Future versions will use scirs2-core::simd_ops aligned operations.
pub mod aligned_ops {
    use super::*;

    /// Aligned SIMD addition for f32
    ///
    /// # SciRS2 POLICY
    /// This will use scirs2_core::simd_ops::simd_add_aligned_f32() once available.
    #[inline]
    pub fn aligned_add_f32(a: &[f32], b: &[f32], result: &mut [f32]) {
        // Use existing SIMD implementation
        simd_add_f32(a, b, result);
    }

    /// Aligned SIMD multiplication for f32
    ///
    /// # SciRS2 POLICY
    /// This will use scirs2_core::simd_ops::simd_mul_aligned_f32() once available.
    #[inline]
    pub fn aligned_mul_f32(a: &[f32], b: &[f32], result: &mut [f32]) {
        simd_mul_f32(a, b, result);
    }

    /// Aligned SIMD dot product for f32
    ///
    /// # SciRS2 POLICY
    /// This will use scirs2_core::simd_ops::simd_dot_aligned_f32() once available.
    #[inline]
    pub fn aligned_dot_f32(a: &[f32], b: &[f32]) -> f32 {
        simd_dot_f32(a, b)
    }

    /// Aligned SIMD sum for f32
    ///
    /// # SciRS2 POLICY
    /// This will use scirs2_core::simd_ops::simd_sum_aligned_f32() once available.
    #[inline]
    pub fn aligned_sum_f32(a: &[f32]) -> f32 {
        simd_sum_f32(a)
    }

    /// Aligned SIMD ReLU activation for f32
    #[inline]
    pub fn aligned_relu_f32(input: &[f32], output: &mut [f32]) {
        simd_relu_f32(input, output);
    }

    /// Aligned SIMD sigmoid activation for f32
    #[inline]
    pub fn aligned_sigmoid_f32(input: &[f32], output: &mut [f32]) {
        simd_sigmoid_f32(input, output);
    }
}

/// Adaptive SIMD operations that automatically choose aligned or unaligned paths
pub mod adaptive {
    use super::*;

    /// Adaptive SIMD addition that checks alignment
    #[inline]
    pub fn adaptive_add_f32(a: &[f32], b: &[f32], result: &mut [f32]) {
        // Use existing adaptive implementation
        adaptive_simd_add_f32(a, b, result);
    }

    /// Adaptive SIMD multiplication that checks alignment
    #[inline]
    pub fn adaptive_mul_f32(a: &[f32], b: &[f32], result: &mut [f32]) {
        adaptive_simd_mul_f32(a, b, result);
    }

    /// Adaptive SIMD dot product that checks alignment
    #[inline]
    pub fn adaptive_dot_f32(a: &[f32], b: &[f32]) -> f32 {
        adaptive_simd_dot_f32(a, b)
    }

    /// Adaptive SIMD matrix multiplication that checks alignment
    #[inline]
    pub fn adaptive_matmul_f32(
        a: &[f32],
        b: &[f32],
        result: &mut [f32],
        m: usize,
        n: usize,
        k: usize,
    ) {
        adaptive_simd_matmul_f32(a, b, result, m, n, k);
    }
}

/// SIMD feature detection and hardware capabilities
pub mod features {

    /// Check if AVX2 is available
    #[inline]
    pub fn has_avx2() -> bool {
        super::has_avx2()
    }

    /// Check if AVX-512 is available
    #[inline]
    pub fn has_avx512() -> bool {
        super::has_avx512()
    }

    /// Check if NEON is available (ARM)
    #[inline]
    pub fn has_neon() -> bool {
        super::has_neon()
    }

    /// Get optimal vector width for f32
    #[inline]
    pub fn f32_vector_width() -> usize {
        super::f32_vector_width()
    }

    /// Get optimal vector width for f64
    #[inline]
    pub fn f64_vector_width() -> usize {
        super::f64_vector_width()
    }

    /// Check if SIMD should be used for given size
    #[inline]
    pub fn should_use_simd(size: usize) -> bool {
        super::should_use_simd(size)
    }
}

/// Convenience module for importing all SIMD operations
pub mod prelude {
    pub use super::adaptive::*;
    pub use super::aligned_ops::*;
    pub use super::features::*;
    pub use super::{scirs2_simd_alignment, scirs2_simd_available, AlignedVec};
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_scirs2_simd_available() {
        // Should return true when simd feature is enabled
        let available = scirs2_simd_available();
        #[cfg(feature = "simd")]
        assert!(available);
        #[cfg(not(feature = "simd"))]
        assert!(!available);
    }

    #[test]
    fn test_scirs2_simd_alignment() {
        let alignment = scirs2_simd_alignment();
        assert_eq!(alignment, 64);
        assert!(alignment.is_power_of_two());
    }

    #[test]
    fn test_aligned_vec_creation() {
        let vec: AlignedVec<f32> = AlignedVec::with_capacity(10);
        assert_eq!(vec.len(), 0);
        assert_eq!(vec.alignment(), 64);
        assert!(vec.is_empty());
    }

    #[test]
    fn test_aligned_vec_from_vec() {
        let data = vec![1.0f32, 2.0, 3.0, 4.0];
        let aligned = AlignedVec::from_vec(data);
        assert_eq!(aligned.len(), 4);
        assert_eq!(aligned.as_slice(), &[1.0, 2.0, 3.0, 4.0]);
    }

    #[test]
    fn test_aligned_vec_operations() {
        let mut vec = AlignedVec::with_capacity(10);
        vec.push(1.0f32);
        vec.push(2.0);
        vec.push(3.0);

        assert_eq!(vec.len(), 3);
        assert!(!vec.is_empty());
        assert_eq!(vec.as_slice(), &[1.0, 2.0, 3.0]);
    }

    #[test]
    fn test_aligned_add_f32() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![5.0f32, 6.0, 7.0, 8.0];
        let mut result = vec![0.0f32; 4];

        aligned_ops::aligned_add_f32(&a, &b, &mut result);

        assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);
    }

    #[test]
    fn test_aligned_mul_f32() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![2.0f32, 3.0, 4.0, 5.0];
        let mut result = vec![0.0f32; 4];

        aligned_ops::aligned_mul_f32(&a, &b, &mut result);

        assert_eq!(result, vec![2.0, 6.0, 12.0, 20.0]);
    }

    #[test]
    fn test_aligned_dot_f32() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![2.0f32, 3.0, 4.0, 5.0];

        let result = aligned_ops::aligned_dot_f32(&a, &b);

        // 1*2 + 2*3 + 3*4 + 4*5 = 2 + 6 + 12 + 20 = 40
        assert_eq!(result, 40.0);
    }

    #[test]
    fn test_aligned_sum_f32() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];

        let result = aligned_ops::aligned_sum_f32(&a);

        assert_eq!(result, 10.0);
    }

    #[test]
    fn test_adaptive_add_f32() {
        let a = vec![1.0f32, 2.0, 3.0, 4.0];
        let b = vec![5.0f32, 6.0, 7.0, 8.0];
        let mut result = vec![0.0f32; 4];

        adaptive::adaptive_add_f32(&a, &b, &mut result);

        assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);
    }

    #[test]
    fn test_feature_detection() {
        // These should not panic
        let _ = features::has_avx2();
        let _ = features::has_avx512();
        let _ = features::has_neon();
        let _ = features::f32_vector_width();
        let _ = features::f64_vector_width();

        // SIMD should be used for large arrays
        assert!(features::should_use_simd(1000) || !cfg!(feature = "simd"));
    }
}