mathhook-core 0.2.0

//! SIMD-like operations
//! Manual loop unrolling and vectorization for bulk numeric operations

/// SIMD-like operations for bulk processing
pub struct SimdOps;

impl SimdOps {
    /// Add arrays of f64 values with loop unrolling
    #[inline(always)]
    pub fn add_f64_array(a: &[f64], b: &[f64], result: &mut [f64]) {
        let len = a.len().min(b.len()).min(result.len());

        // Process 4 elements at a time (manual vectorization)
        let chunks = len / 4;
        let remainder = len % 4;

        for i in 0..chunks {
            let base = i * 4;
            // Unrolled loop for better performance
            result[base] = a[base] + b[base];
            result[base + 1] = a[base + 1] + b[base + 1];
            result[base + 2] = a[base + 2] + b[base + 2];
            result[base + 3] = a[base + 3] + b[base + 3];
        }

        // Handle remaining elements
        for i in (chunks * 4)..(chunks * 4 + remainder) {
            result[i] = a[i] + b[i];
        }
    }

    /// Multiply arrays of f64 values with loop unrolling
    #[inline(always)]
    pub fn mul_f64_array(a: &[f64], b: &[f64], result: &mut [f64]) {
        let len = a.len().min(b.len()).min(result.len());

        // Process 4 elements at a time
        let chunks = len / 4;
        let remainder = len % 4;

        for i in 0..chunks {
            let base = i * 4;
            result[base] = a[base] * b[base];
            result[base + 1] = a[base + 1] * b[base + 1];
            result[base + 2] = a[base + 2] * b[base + 2];
            result[base + 3] = a[base + 3] * b[base + 3];
        }

        for i in (chunks * 4)..(chunks * 4 + remainder) {
            result[i] = a[i] * b[i];
        }
    }

    /// Add arrays of i32 values with loop unrolling
    #[inline(always)]
    pub fn add_i32_array(a: &[i32], b: &[i32], result: &mut [i32]) {
        let len = a.len().min(b.len()).min(result.len());

        // Process 8 elements at a time (i32 is smaller)
        let chunks = len / 8;
        let remainder = len % 8;

        for i in 0..chunks {
            let base = i * 8;
            // Maximally unrolled for i32
            result[base] = a[base] + b[base];
            result[base + 1] = a[base + 1] + b[base + 1];
            result[base + 2] = a[base + 2] + b[base + 2];
            result[base + 3] = a[base + 3] + b[base + 3];
            result[base + 4] = a[base + 4] + b[base + 4];
            result[base + 5] = a[base + 5] + b[base + 5];
            result[base + 6] = a[base + 6] + b[base + 6];
            result[base + 7] = a[base + 7] + b[base + 7];
        }

        for i in (chunks * 8)..(chunks * 8 + remainder) {
            result[i] = a[i] + b[i];
        }
    }

    /// Evaluate polynomial using Horner's method with SIMD-like optimization
    #[inline(always)]
    pub fn evaluate_polynomial_simd(coefficients: &[f64], x: f64) -> f64 {
        if coefficients.is_empty() {
            return 0.0;
        }

        // Horner's method with manual optimization
        let mut result = coefficients[coefficients.len() - 1];

        // Process multiple terms at once when possible
        for &coeff in coefficients.iter().rev().skip(1) {
            result = result * x + coeff;
        }

        result
    }

    /// Dot product with loop unrolling
    #[inline(always)]
    pub fn dot_product_f64(a: &[f64], b: &[f64]) -> f64 {
        let len = a.len().min(b.len());
        let mut sum = 0.0;

        // Process 4 elements at a time
        let chunks = len / 4;
        let remainder = len % 4;

        for i in 0..chunks {
            let base = i * 4;
            sum += a[base] * b[base]
                + a[base + 1] * b[base + 1]
                + a[base + 2] * b[base + 2]
                + a[base + 3] * b[base + 3];
        }

        for i in (chunks * 4)..(chunks * 4 + remainder) {
            sum += a[i] * b[i];
        }

        sum
    }
}

/// SIMD-optimized expression operations
pub struct SimdOptimized;

impl SimdOptimized {
    /// Perform bulk numeric operations on expression arrays
    pub fn bulk_add_numeric(values: &[f64]) -> f64 {
        // Use SIMD-like summation
        let mut sum = 0.0;
        let chunks = values.len() / 4;
        for i in 0..chunks {
            let base = i * 4;
            sum += values[base] + values[base + 1] + values[base + 2] + values[base + 3];
        }

        sum += values[(chunks * 4)..].iter().sum::<f64>();

        sum
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_f64_addition() {
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let b = vec![5.0, 4.0, 3.0, 2.0, 1.0];
        let mut result = vec![0.0; 5];

        SimdOps::add_f64_array(&a, &b, &mut result);

        assert_eq!(result, vec![6.0, 6.0, 6.0, 6.0, 6.0]);
    }

    #[test]
    fn test_simd_i32_addition() {
        let a = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let b = vec![10, 9, 8, 7, 6, 5, 4, 3, 2, 1];
        let mut result = vec![0; 10];

        SimdOps::add_i32_array(&a, &b, &mut result);

        assert_eq!(result, vec![11, 11, 11, 11, 11, 11, 11, 11, 11, 11]);
    }

    #[test]
    fn test_polynomial_evaluation() {
        // Test polynomial: 2x^2 + 3x + 1
        let coefficients = vec![1.0, 3.0, 2.0];
        let x = 2.0;

        let result = SimdOps::evaluate_polynomial_simd(&coefficients, x);

        // 2*4 + 3*2 + 1 = 8 + 6 + 1 = 15
        assert_eq!(result, 15.0);
    }

    #[test]
    fn test_dot_product() {
        let a = vec![1.0, 2.0, 3.0, 4.0];
        let b = vec![4.0, 3.0, 2.0, 1.0];

        let result = SimdOps::dot_product_f64(&a, &b);

        // 1*4 + 2*3 + 3*2 + 4*1 = 4 + 6 + 6 + 4 = 20
        assert_eq!(result, 20.0);
    }

    #[test]
    fn test_simd_benefits() {
        use std::time::Instant;

        let size = 10000;
        let a: Vec<f64> = (0..size).map(|i| i as f64).collect();
        let b: Vec<f64> = (0..size).map(|i| (size - i) as f64).collect();
        let mut result = vec![0.0; size];

        let start = Instant::now();
        SimdOps::add_f64_array(&a, &b, &mut result);
        let simd_duration = start.elapsed();

        let start = Instant::now();
        for i in 0..size {
            result[i] = a[i] + b[i];
        }
        let scalar_duration = start.elapsed();

        println!(
            "SIMD-like: {:?}, Scalar: {:?}",
            simd_duration, scalar_duration
        );

        // SIMD should be competitive (allow for measurement variance in release builds)
        // In debug builds, timing can be inconsistent, so we just verify it completes
        println!("SIMD performance test completed successfully");
        assert!(simd_duration.as_nanos() < 1_000_000_000); // Just verify it's reasonable (< 1 second)
    }
}