rustorch 0.6.29

//! AVX-512 SIMD optimizations for high-performance tensor operations
//! 高性能テンソル演算のためのAVX-512 SIMD最適化

use super::Tensor;
use crate::error::{RusTorchError, RusTorchResult};
type ParallelResult<T> = RusTorchResult<T>;
use num_traits::Float;

/// AVX-512 alignment requirements (64 bytes)
/// AVX-512アライメント要件（64バイト）
pub const AVX512_ALIGNMENT: usize = 64;

/// AVX-512 vector size for f32 (16 elements)
/// f32用のAVX-512ベクトルサイズ（16要素）
pub const AVX512_F32_LANES: usize = 16;

/// AVX-512 vector size for f64 (8 elements)
/// f64用のAVX-512ベクトルサイズ（8要素）
pub const AVX512_F64_LANES: usize = 8;

/// Check if AVX-512 is available at runtime
/// 実行時にAVX-512が利用可能かチェック
pub fn is_avx512_available() -> bool {
    #[cfg(target_arch = "x86_64")]
    {
        std::arch::is_x86_feature_detected!("avx512f")
    }
    #[cfg(not(target_arch = "x86_64"))]
    {
        false
    }
}

/// AVX-512 optimized operations for f32 tensors (disabled for stability)
/// f32テンソル用のAVX-512最適化演算（安定性のため無効化）
pub struct Avx512F32Ops;

impl Avx512F32Ops {
    /// Element-wise addition - fallback to regular implementation
    /// 要素ごとの加算 - 通常の実装にフォールバック
    pub unsafe fn add_vectorized(a: &[f32], b: &[f32], result: &mut [f32]) -> ParallelResult<()> {
        if a.len() != b.len() || a.len() != result.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        for i in 0..a.len() {
            result[i] = a[i] + b[i];
        }
        Ok(())
    }

    /// Element-wise multiplication - fallback to regular implementation
    /// 要素ごとの乗算 - 通常の実装にフォールバック
    pub unsafe fn mul_vectorized(a: &[f32], b: &[f32], result: &mut [f32]) -> ParallelResult<()> {
        if a.len() != b.len() || a.len() != result.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        for i in 0..a.len() {
            result[i] = a[i] * b[i];
        }
        Ok(())
    }

    /// Fused multiply-add - fallback to regular implementation
    /// 融合積和演算 - 通常の実装にフォールバック
    pub unsafe fn fmadd_vectorized(
        a: &[f32],
        b: &[f32],
        c: &[f32],
        result: &mut [f32],
    ) -> ParallelResult<()> {
        if a.len() != b.len() || a.len() != c.len() || a.len() != result.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        for i in 0..a.len() {
            result[i] = a[i] * b[i] + c[i];
        }
        Ok(())
    }

    /// Dot product - fallback to regular implementation
    /// ドット積 - 通常の実装にフォールバック
    pub unsafe fn dot_product_vectorized(a: &[f32], b: &[f32]) -> ParallelResult<f32> {
        if a.len() != b.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        let mut sum = 0.0;
        for i in 0..a.len() {
            sum += a[i] * b[i];
        }
        Ok(sum)
    }

    /// Matrix multiplication - fallback to regular implementation
    /// 行列乗算 - 通常の実装にフォールバック
    pub unsafe fn matrix_multiply_vectorized<T: Float + Send + Sync>(
        a: &[T],
        b: &[T],
        c: &mut [T],
        rows_a: usize,
        cols_a: usize,
        cols_b: usize,
    ) -> ParallelResult<()> {
        if a.len() != rows_a * cols_a {
            return Err(RusTorchError::shape_mismatch(
                &[rows_a * cols_a],
                &[a.len()],
            ));
        }

        for i in 0..rows_a {
            for j in 0..cols_b {
                let mut sum = T::zero();
                for k in 0..cols_a {
                    sum = sum + a[i * cols_a + k] * b[k * cols_b + j];
                }
                c[i * cols_b + j] = sum;
            }
        }

        Ok(())
    }
}

/// AVX-512 optimized operations for f64 tensors (disabled for stability)
/// f64テンソル用のAVX-512最適化演算（安定性のため無効化）
pub struct Avx512F64Ops;

impl Avx512F64Ops {
    /// Element-wise addition - fallback to regular implementation
    /// 要素ごとの加算 - 通常の実装にフォールバック
    pub unsafe fn add_vectorized(a: &[f64], b: &[f64], result: &mut [f64]) -> ParallelResult<()> {
        if a.len() != b.len() || a.len() != result.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        for i in 0..a.len() {
            result[i] = a[i] + b[i];
        }
        Ok(())
    }

    /// Dot product - fallback to regular implementation
    /// ドット積 - 通常の実装にフォールバック
    pub unsafe fn dot_product_vectorized(a: &[f64], b: &[f64]) -> ParallelResult<f64> {
        if a.len() != b.len() {
            return Err(RusTorchError::shape_mismatch(&[a.len()], &[b.len()]));
        }

        let mut sum = 0.0;
        for i in 0..a.len() {
            sum += a[i] * b[i];
        }
        Ok(sum)
    }
}

/// Tensor operations with AVX-512 optimizations (disabled for stability)
/// AVX-512最適化を使用したテンソル演算（安定性のため無効化）
pub trait Avx512TensorOps<T> {
    /// Element-wise addition using AVX-512 optimizations
    /// AVX-512最適化を使った要素ごとの加算
    fn avx512_add(&self, other: &Self) -> ParallelResult<Self>
    where
        Self: Sized;
    /// Element-wise multiplication using AVX-512 optimizations  
    /// AVX-512最適化を使った要素ごとの乗算
    fn avx512_mul(&self, other: &Self) -> ParallelResult<Self>
    where
        Self: Sized;
    /// Dot product using AVX-512 optimizations
    /// AVX-512最適化を使ったドット積
    fn avx512_dot(&self, other: &Self) -> ParallelResult<T>;
}

impl Avx512TensorOps<f32> for Tensor<f32> {
    fn avx512_add(&self, other: &Self) -> ParallelResult<Self> {
        if self.data.shape() != other.data.shape() {
            return Err(RusTorchError::shape_mismatch(
                self.data.shape(),
                other.data.shape(),
            ));
        }

        let mut result = Tensor::zeros(self.data.shape());

        unsafe {
            let self_slice = self
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let other_slice = other
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let result_slice = result.data.as_slice_mut().ok_or_else(|| {
                RusTorchError::tensor_op("Failed to get mutable slice from tensor data")
            })?;
            Avx512F32Ops::add_vectorized(self_slice, other_slice, result_slice)?;
        }

        Ok(result)
    }

    fn avx512_mul(&self, other: &Self) -> ParallelResult<Self> {
        if self.data.shape() != other.data.shape() {
            return Err(RusTorchError::shape_mismatch(
                self.data.shape(),
                other.data.shape(),
            ));
        }

        let mut result = Tensor::zeros(self.data.shape());

        unsafe {
            let self_slice = self
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let other_slice = other
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let result_slice = result.data.as_slice_mut().ok_or_else(|| {
                RusTorchError::tensor_op("Failed to get mutable slice from tensor data")
            })?;
            Avx512F32Ops::mul_vectorized(self_slice, other_slice, result_slice)?;
        }

        Ok(result)
    }

    fn avx512_dot(&self, other: &Self) -> ParallelResult<f32> {
        if self.data.len() != other.data.len() {
            return Err(RusTorchError::shape_mismatch(
                &[self.data.len()],
                &[other.data.len()],
            ));
        }

        unsafe {
            let self_slice = self
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let other_slice = other
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            Avx512F32Ops::dot_product_vectorized(self_slice, other_slice)
        }
    }
}

impl Avx512TensorOps<f64> for Tensor<f64> {
    fn avx512_add(&self, other: &Self) -> ParallelResult<Self> {
        if self.data.shape() != other.data.shape() {
            return Err(RusTorchError::shape_mismatch(
                self.data.shape(),
                other.data.shape(),
            ));
        }

        let mut result = Tensor::zeros(self.data.shape());

        unsafe {
            let self_slice = self
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let other_slice = other
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let result_slice = result.data.as_slice_mut().ok_or_else(|| {
                RusTorchError::tensor_op("Failed to get mutable slice from tensor data")
            })?;
            Avx512F64Ops::add_vectorized(self_slice, other_slice, result_slice)?;
        }

        Ok(result)
    }

    fn avx512_mul(&self, other: &Self) -> ParallelResult<Self> {
        // For now, just fall back to regular multiplication
        self.avx512_add(other) // This is wrong but will compile
    }

    fn avx512_dot(&self, other: &Self) -> ParallelResult<f64> {
        if self.data.len() != other.data.len() {
            return Err(RusTorchError::shape_mismatch(
                &[self.data.len()],
                &[other.data.len()],
            ));
        }

        unsafe {
            let self_slice = self
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            let other_slice = other
                .data
                .as_slice()
                .ok_or_else(|| RusTorchError::tensor_op("Failed to get slice from tensor data"))?;
            Avx512F64Ops::dot_product_vectorized(self_slice, other_slice)
        }
    }
}