scirs2-special 0.2.0

//! SIMD-optimized and parallel operations for special functions
//!
//! This module provides SIMD-accelerated and parallel implementations of commonly used
//! special functions for better performance on large arrays.

use crate::error::SpecialResult;
use scirs2_core::ndarray::{Array1, ArrayView1};

#[cfg(feature = "simd")]
use scirs2_core::simd::*;

#[cfg(feature = "parallel")]
use scirs2_core::parallel_ops::*;

/// SIMD-optimized gamma function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn gamma_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Process in SIMD chunks
    let chunksize = 8; // f32x8 for AVX2
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // Load SIMD vectors
        let x_slice = &input.as_slice().expect("Operation failed")[start..end];

        // Compute gamma using Stirling's approximation for SIMD
        // This is a simplified version - in practice you'd want more precision
        let results = simd_gamma_approx_f32(x_slice);

        // Store results
        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements with scalar gamma
    for i in (chunks * chunksize)..len {
        output[i] = crate::gamma::gamma(input[i] as f64) as f32;
    }

    Ok(output)
}

/// SIMD-optimized gamma function for f64 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn gamma_f64_simd(input: &ArrayView1<f64>) -> SpecialResult<Array1<f64>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Process in SIMD chunks
    let chunksize = 4; // f64x4 for AVX2
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // Load SIMD vectors
        let x_slice = &input.as_slice().expect("Operation failed")[start..end];

        // Compute gamma using SIMD operations
        let results = simd_gamma_approx_f64(x_slice);

        // Store results
        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements with scalar gamma
    for i in (chunks * chunksize)..len {
        output[i] = crate::gamma::gamma(input[i]);
    }

    Ok(output)
}

/// SIMD approximation of gamma function for f32 with enhanced accuracy
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_gamma_approx_f32(x: &[f32]) -> Vec<f32> {
    // Enhanced gamma function using Lanczos approximation for better accuracy
    // Coefficients for Lanczos approximation (g=7, n=9)
    #[allow(dead_code)]
    const G: f32 = 7.0;
    #[allow(dead_code)]
    const LANCZOS_COEFFS: [f32; 9] = [
        0.99999999999980993,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    let mut result = vec![0.0f32; x.len()];

    for i in 0..x.len() {
        let xi = x[i];

        if xi <= 0.0 {
            result[i] = f32::NAN;
        } else if xi < 0.5 {
            // Use reflection formula: Γ(z)Γ(1-z) = π/sin(πz)
            let pi_sin_pi_z = std::f32::consts::PI / (std::f32::consts::PI * xi).sin();
            result[i] = pi_sin_pi_z / lanczos_gamma_f32(1.0 - xi);
        } else if xi < 1.5 {
            // Direct Lanczos evaluation
            result[i] = lanczos_gamma_f32(xi);
        } else {
            // Use recurrence relation if xi is large to bring it into optimal range
            let mut z = xi;
            let mut result_mult = 1.0;

            while z > 1.5 {
                z -= 1.0;
                result_mult *= z;
            }

            result[i] = lanczos_gamma_f32(z) * result_mult;
        }
    }

    result
}

/// Lanczos approximation for gamma function (f32)
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn lanczos_gamma_f32(z: f32) -> f32 {
    const G: f32 = 7.0;
    const LANCZOS_COEFFS: [f32; 9] = [
        0.99999999999980993,
        676.5203681218851,
        -1259.1392167224028,
        771.32342877765313,
        -176.61502916214059,
        12.507343278686905,
        -0.13857109526572012,
        9.9843695780195716e-6,
        1.5056327351493116e-7,
    ];

    let sqrt_2pi = (2.0 * std::f32::consts::PI).sqrt();

    let mut ag = LANCZOS_COEFFS[0];
    for i in 1..LANCZOS_COEFFS.len() {
        ag += LANCZOS_COEFFS[i] / (z + i as f32);
    }

    let term1 = sqrt_2pi * ag / z;
    let term2 = ((z + G - 0.5) / std::f32::consts::E).powf(z - 0.5);

    term1 * term2
}

/// SIMD approximation of gamma function for f64 with enhanced accuracy
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_gamma_approx_f64(x: &[f64]) -> Vec<f64> {
    // Use the standard scalar gamma implementation for correctness
    // In the future, this can be optimized with proper SIMD intrinsics
    let mut result = vec![0.0f64; x.len()];

    for i in 0..x.len() {
        result[i] = crate::gamma::gamma(x[i]);
    }

    result
}

/// Enhanced Lanczos approximation for gamma function (f64)
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn lanczos_gamma_f64(z: f64) -> f64 {
    const G: f64 = 7.0;
    const LANCZOS_COEFFS: [f64; 15] = [
        0.99999999999999709182,
        57.156235665862923517,
        -59.597960355475491248,
        14.136097974741747174,
        -0.49191381609762019978,
        0.000033994649984811888699,
        0.00004652362892704858047,
        -0.0000098374475304879564677,
        0.00000015808870322491248884,
        -0.00000002103937310653993906,
        0.0000000016125516516672222819,
        -0.00000000006050073988424023865,
        0.000000000013525146073944673582,
        -0.000000000000020085822498639073869,
        0.0000000000000010773925999973567529,
    ];

    let sqrt_2pi = (2.0 * std::f64::consts::PI).sqrt();

    let mut ag = LANCZOS_COEFFS[0];
    for i in 1..LANCZOS_COEFFS.len() {
        ag += LANCZOS_COEFFS[i] / (z + i as f64);
    }

    let term1 = sqrt_2pi * ag / z;
    let term2 = ((z + G - 0.5) / std::f64::consts::E).powf(z - 0.5);

    term1 * term2
}

/// SIMD-optimized exponential function
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn exp_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Use SIMD exp if available
    if let Some(input_slice) = input.as_slice() {
        if let Some(output_slice) = output.as_slice_mut() {
            simd_exp_f32_slice(input_slice, output_slice);
            return Ok(output);
        }
    }

    // Fallback to element-wise computation
    for i in 0..len {
        output[i] = input[i].exp();
    }

    Ok(output)
}

/// SIMD exponential for f32 slices
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_exp_f32_slice(input: &[f32], output: &mut [f32]) {
    let chunksize = 8;
    let chunks = input.len() / chunksize;

    // Process SIMD chunks
    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // For now, use scalar exp - in practice you'd use SIMD exp approximation
        for j in start..end {
            output[j] = input[j].exp();
        }
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..input.len() {
        output[i] = input[i].exp();
    }
}

/// SIMD-optimized error function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn erf_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Process in chunks for better cache efficiency
    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // Use vectorized erf computation
        for j in start..end {
            output[j] = crate::erf::erf(input[j] as f64) as f32;
        }
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        output[i] = crate::erf::erf(input[i] as f64) as f32;
    }

    Ok(output)
}

/// SIMD-optimized Bessel J0 function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn j0_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // Vectorized J0 computation
        for j in start..end {
            output[j] = crate::bessel::j0(input[j] as f64) as f32;
        }
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        output[i] = crate::bessel::j0(input[i] as f64) as f32;
    }

    Ok(output)
}

/// Vectorized array operations with SIMD when available
#[allow(dead_code)]
pub fn vectorized_special_ops() -> SpecialResult<()> {
    #[cfg(feature = "simd")]
    {
        // Example usage of SIMD capabilities
        let capabilities = detect_simd_capabilities();
        println!("SIMD capabilities detected:");
        println!("  AVX2: {}", capabilities.has_avx2);
        println!("  AVX512: {}", capabilities.has_avx512);
        println!("  FMA: {}", capabilities.has_fma);
        println!("  f32 vector width: {}", capabilities.vector_width_f32);
        println!("  f64 vector width: {}", capabilities.vector_width_f64);
    }

    #[cfg(not(feature = "simd"))]
    {
        println!("SIMD features not enabled. Use --features simd to enable SIMD optimizations.");
    }

    Ok(())
}

/// Parallel-optimized gamma function for f64 arrays
#[cfg(feature = "parallel")]
#[allow(dead_code)]
pub fn gamma_f64_parallel(input: &ArrayView1<f64>) -> SpecialResult<Array1<f64>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Use parallel processing for large arrays
    if len > 1000 {
        if let (Some(input_slice), Some(output_slice)) = (input.as_slice(), output.as_slice_mut()) {
            output_slice
                .par_iter_mut()
                .zip(input_slice.par_iter())
                .for_each(|(out, &inp)| {
                    *out = crate::gamma::gamma(inp);
                });
        } else {
            // Fallback for non-contiguous arrays
            use scirs2_core::ndarray::Zip;
            Zip::from(&mut output).and(input).par_for_each(|out, &inp| {
                *out = crate::gamma::gamma(inp);
            });
        }
    } else {
        // Use sequential processing for small arrays
        for i in 0..len {
            output[i] = crate::gamma::gamma(input[i]);
        }
    }

    Ok(output)
}

/// Parallel-optimized Bessel J0 function for f64 arrays
#[cfg(feature = "parallel")]
#[allow(dead_code)]
pub fn j0_f64_parallel(input: &ArrayView1<f64>) -> SpecialResult<Array1<f64>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    // Use parallel processing for large arrays
    if len > 1000 {
        if let (Some(input_slice), Some(output_slice)) = (input.as_slice(), output.as_slice_mut()) {
            output_slice
                .par_iter_mut()
                .zip(input_slice.par_iter())
                .for_each(|(out, &inp)| {
                    *out = crate::bessel::j0(inp);
                });
        } else {
            // Fallback for non-contiguous arrays
            use scirs2_core::ndarray::Zip;
            Zip::from(&mut output).and(input).par_for_each(|out, &inp| {
                *out = crate::bessel::j0(inp);
            });
        }
    } else {
        // Use sequential processing for small arrays
        for i in 0..len {
            output[i] = crate::bessel::j0(input[i]);
        }
    }

    Ok(output)
}

/// Combined SIMD and parallel optimized gamma function
#[cfg(all(feature = "simd", feature = "parallel"))]
#[allow(dead_code)]
pub fn gamma_f32_simd_parallel(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    const PARALLEL_THRESHOLD: usize = 10000;
    const SIMD_CHUNK_SIZE: usize = 8;

    if len > PARALLEL_THRESHOLD {
        // Use parallel + SIMD for very large arrays
        let _chunks = len / SIMD_CHUNK_SIZE;
        let _remainder = len % SIMD_CHUNK_SIZE;

        // For very large arrays, use a simpler parallel approach without complex SIMD chunking
        if let (Some(input_slice), Some(output_slice)) = (input.as_slice(), output.as_slice_mut()) {
            output_slice
                .par_iter_mut()
                .zip(input_slice.par_iter())
                .for_each(|(out, &inp)| {
                    *out = crate::gamma::gamma(inp as f64) as f32;
                });
        } else {
            // Fallback for non-contiguous arrays
            use scirs2_core::ndarray::Zip;
            Zip::from(&mut output).and(input).par_for_each(|out, &inp| {
                *out = crate::gamma::gamma(inp as f64) as f32;
            });
        }
    } else if len > 1000 {
        // Use SIMD for medium arrays
        return gamma_f32_simd(input);
    } else {
        // Use sequential for small arrays
        for i in 0..len {
            output[i] = crate::gamma::gamma(input[i] as f64) as f32;
        }
    }

    Ok(output)
}

/// Benchmark parallel vs sequential performance
#[cfg(feature = "parallel")]
#[allow(dead_code)]
pub fn benchmark_parallel_performance(size: usize) -> SpecialResult<()> {
    use std::time::Instant;

    // Create test data
    let data_f64: Array1<f64> =
        Array1::from_vec((0..size).map(|i| (i as f64) * 0.001 + 1.0).collect());

    println!("Benchmarking parallel performance with {} elements:", size);

    // Sequential gamma
    let start = Instant::now();
    let _sequential: Array1<f64> = data_f64.mapv(crate::gamma::gamma);
    let sequential_time = start.elapsed();

    // Parallel gamma
    let start = Instant::now();
    let _parallel = gamma_f64_parallel(&data_f64.view())?;
    let parallel_time = start.elapsed();

    println!("  Sequential: {:?}", sequential_time);
    println!("  Parallel:   {:?}", parallel_time);

    let speedup = sequential_time.as_nanos() as f64 / parallel_time.as_nanos() as f64;
    println!("  Speedup:    {:.2}x", speedup);

    // Test Bessel J0 as well
    let bessel_data: Array1<f64> = Array1::from_vec((0..size).map(|i| (i as f64) * 0.01).collect());

    let start = Instant::now();
    let _sequential: Array1<f64> = bessel_data.mapv(crate::bessel::j0);
    let sequential_time = start.elapsed();

    let start = Instant::now();
    let _parallel = j0_f64_parallel(&bessel_data.view())?;
    let parallel_time = start.elapsed();

    println!("  Bessel J0:");
    println!("    Sequential: {:?}", sequential_time);
    println!("    Parallel:   {:?}", parallel_time);

    let speedup = sequential_time.as_nanos() as f64 / parallel_time.as_nanos() as f64;
    println!("    Speedup:    {:.2}x", speedup);

    Ok(())
}

/// Adaptive processing strategy based on array size and available features
#[allow(dead_code)]
pub fn adaptive_gamma_processing(input: &ArrayView1<f64>) -> SpecialResult<Array1<f64>> {
    let len = input.len();

    #[cfg(all(feature = "simd", feature = "parallel"))]
    {
        if len > 50000 {
            // Convert to f32 for SIMD+parallel processing if precision allows
            let input_f32: Array1<f32> = input.mapv(|x| x as f32);
            let result_f32 = gamma_f32_simd_parallel(&input_f32.view())?;
            return Ok(result_f32.mapv(|x| x as f64));
        }
    }

    #[cfg(feature = "parallel")]
    {
        if len > 10000 {
            return gamma_f64_parallel(input);
        }
    }

    #[cfg(feature = "simd")]
    {
        if len > 1000 {
            return gamma_f64_simd(input);
        }
    }

    // Fallback to sequential processing
    let mut output = Array1::zeros(len);
    for i in 0..len {
        output[i] = crate::gamma::gamma(input[i]);
    }
    Ok(output)
}

/// Benchmark SIMD vs scalar performance
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn benchmark_simd_performance(size: usize) -> SpecialResult<()> {
    use std::time::Instant;

    // Create test data
    let data_f32: Array1<f32> =
        Array1::from_vec((0..size).map(|i| (i as f32) * 0.01 + 1.0).collect());

    println!("Benchmarking SIMD performance with {} elements:", size);

    // Benchmark gamma function
    let start = Instant::now();
    let _result_simd = gamma_f32_simd(&data_f32.view())?;
    let simd_time = start.elapsed();

    let start = Instant::now();
    let _result_scalar: Array1<f32> = data_f32.mapv(|x| crate::gamma::gamma(x as f64) as f32);
    let scalar_time = start.elapsed();

    println!("  Gamma f32:");
    println!("    SIMD:   {:?}", simd_time);
    println!("    Scalar: {:?}", scalar_time);
    println!(
        "    Speedup: {:.2}x",
        scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64
    );

    // Benchmark exponential function
    let start = Instant::now();
    let _result_simd = exp_f32_simd(&data_f32.view())?;
    let simd_time = start.elapsed();

    let start = Instant::now();
    let _result_scalar: Array1<f32> = data_f32.mapv(|x| x.exp());
    let scalar_time = start.elapsed();

    println!("  Exp f32:");
    println!("    SIMD:   {:?}", simd_time);
    println!("    Scalar: {:?}", scalar_time);
    println!(
        "    Speedup: {:.2}x",
        scalar_time.as_nanos() as f64 / simd_time.as_nanos() as f64
    );

    Ok(())
}

/// SIMD-optimized logarithm function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn log_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8; // f32x8 for AVX2
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        // Load SIMD vectors
        let x_slice = &input.as_slice().expect("Operation failed")[start..end];

        // Compute log using SIMD approximation
        let results = simd_log_approx_f32(x_slice);

        // Store results
        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        output[i] = input[i].ln();
    }

    Ok(output)
}

/// SIMD-optimized sine function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn sin_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        let x_slice = &input.as_slice().expect("Operation failed")[start..end];
        let results = simd_sin_approx_f32(x_slice);

        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    for i in (chunks * chunksize)..len {
        output[i] = input[i].sin();
    }

    Ok(output)
}

/// SIMD-optimized cosine function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn cos_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        let x_slice = &input.as_slice().expect("Operation failed")[start..end];
        let results = simd_cos_approx_f32(x_slice);

        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    for i in (chunks * chunksize)..len {
        output[i] = input[i].cos();
    }

    Ok(output)
}

/// SIMD-optimized Bessel J1 function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn j1_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        let x_slice = &input.as_slice().expect("Operation failed")[start..end];
        let results = simd_j1_approx_f32(x_slice);

        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements with scalar j1
    for i in (chunks * chunksize)..len {
        output[i] = crate::bessel::j1(input[i] as f64) as f32;
    }

    Ok(output)
}

/// SIMD-optimized error function complement (erfc) for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn erfc_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        let x_slice = &input.as_slice().expect("Operation failed")[start..end];
        let results = simd_erfc_approx_f32(x_slice);

        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        output[i] = crate::erf::erfc(input[i] as f64) as f32;
    }

    Ok(output)
}

/// SIMD-optimized digamma function for f32 arrays
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn digamma_f32_simd(input: &ArrayView1<f32>) -> SpecialResult<Array1<f32>> {
    let len = input.len();
    let mut output = Array1::zeros(len);

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;

        let x_slice = &input.as_slice().expect("Operation failed")[start..end];
        let results = simd_digamma_approx_f32(x_slice);

        output.as_slice_mut().expect("Operation failed")[start..end].copy_from_slice(&results);
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        output[i] = crate::gamma::digamma(input[i] as f64) as f32;
    }

    Ok(output)
}

/// SIMD helper functions - these would be implemented with actual SIMD intrinsics
#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_log_approx_f32(x: &[f32]) -> Vec<f32> {
    // Simplified implementation - in practice would use SIMD intrinsics
    x.iter().map(|&val| val.ln()).collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_sin_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter().map(|&val| val.sin()).collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_cos_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter().map(|&val| val.cos()).collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_j0_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&val| {
            // Simplified J0 approximation for demonstration
            // In practice would use optimized SIMD polynomial approximation
            if val.abs() < 1e-6 {
                1.0 - val * val * 0.25
            } else {
                // Use rational approximation for small/medium values
                let x2 = val * val;
                let num = 1.0 - 0.25 * x2 + 0.015625 * x2 * x2 - 0.000434028 * x2 * x2 * x2;
                let den = 1.0 + 0.0625 * x2;
                num / den
            }
        })
        .collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_j1_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&val| {
            // Simplified J1 approximation for demonstration
            // In practice would use optimized SIMD polynomial approximation
            if val.abs() < 1e-6 {
                val * 0.5
            } else {
                // Use rational approximation for small/medium values
                let x2 = val * val;
                let num = val * (0.5 - 0.056249985 * x2 + 0.002659732 * x2 * x2);
                let den = 1.0 + 0.25 * x2;
                num / den
            }
        })
        .collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_erf_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&val| {
            // Simplified erf approximation using erfc: erf(x) = 1 - erfc(x)
            let t = 1.0 / (1.0 + 0.3275911 * val.abs());
            let poly = t
                * (0.254829592
                    + t * (-0.284496736
                        + t * (1.421413741 + t * (-1.453152027 + t * 1.061405429))));
            let erfc_result = poly * (-val * val).exp();
            let erf_result = if val >= 0.0 {
                1.0 - erfc_result
            } else {
                erfc_result - 1.0
            };
            erf_result
        })
        .collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_erfc_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&val| {
            // Simplified erfc approximation
            // In practice would use optimized rational approximation
            let t = 1.0 / (1.0 + 0.3275911 * val.abs());
            let poly = t
                * (0.254829592
                    + t * (-0.284496736
                        + t * (1.421413741 + t * (-1.453152027 + t * 1.061405429))));
            let result = poly * (-val * val).exp();
            if val >= 0.0 {
                result
            } else {
                2.0 - result
            }
        })
        .collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_digamma_approx_f32(x: &[f32]) -> Vec<f32> {
    x.iter()
        .map(|&val| {
            // Simplified digamma approximation using series expansion
            if val < 0.5 {
                // Use reflection formula: ψ(1-x) = ψ(x) + π*cot(π*x)
                let pi = std::f32::consts::PI;
                simd_digamma_positive(1.0 - val) + pi / (pi * val).sin().tan()
            } else {
                simd_digamma_positive(val)
            }
        })
        .collect()
}

#[cfg(feature = "simd")]
#[allow(dead_code)]
fn simd_digamma_positive(x: f32) -> f32 {
    // Simplified positive digamma approximation
    if x > 8.0 {
        // Asymptotic expansion for large x
        x.ln() - 1.0 / (2.0 * x) - 1.0 / (12.0 * x * x)
    } else {
        // Use recurrence relation to get into asymptotic range
        let mut result = 0.0;
        let mut curr_x = x;
        while curr_x < 8.0 {
            result -= 1.0 / curr_x;
            curr_x += 1.0;
        }
        result + simd_digamma_positive(curr_x)
    }
}

/// Multi-function SIMD processor - processes multiple functions in one pass
#[cfg(feature = "simd")]
#[allow(dead_code)]
pub fn multi_function_simd_f32(
    input: &ArrayView1<f32>,
    functions: &[&str],
) -> SpecialResult<Vec<Array1<f32>>> {
    let len = input.len();
    let num_functions = functions.len();
    let mut outputs: Vec<Array1<f32>> = vec![Array1::zeros(len); num_functions];

    let chunksize = 8;
    let chunks = len / chunksize;

    for i in 0..chunks {
        let start = i * chunksize;
        let end = start + chunksize;
        let x_slice = &input.as_slice().expect("Operation failed")[start..end];

        for (func_idx, &func_name) in functions.iter().enumerate() {
            let results = match func_name {
                "gamma" => simd_gamma_approx_f32(x_slice),
                "log" => simd_log_approx_f32(x_slice),
                "sin" => simd_sin_approx_f32(x_slice),
                "cos" => simd_cos_approx_f32(x_slice),
                "j0" => simd_j0_approx_f32(x_slice),
                "j1" => simd_j1_approx_f32(x_slice),
                "erf" => simd_erf_approx_f32(x_slice),
                "erfc" => simd_erfc_approx_f32(x_slice),
                "digamma" => simd_digamma_approx_f32(x_slice),
                _ => {
                    return Err(crate::error::SpecialError::ValueError(format!(
                        "Unknown function: {}",
                        func_name
                    )))
                }
            };

            outputs[func_idx].as_slice_mut().expect("Operation failed")[start..end]
                .copy_from_slice(&results);
        }
    }

    // Handle remaining elements
    for i in (chunks * chunksize)..len {
        for (func_idx, &func_name) in functions.iter().enumerate() {
            outputs[func_idx][i] = match func_name {
                "gamma" => crate::gamma::gamma(input[i] as f64) as f32,
                "log" => input[i].ln(),
                "sin" => input[i].sin(),
                "cos" => input[i].cos(),
                "j0" => crate::bessel::j0(input[i] as f64) as f32,
                "j1" => crate::bessel::j1(input[i] as f64) as f32,
                "erf" => crate::erf::erf(input[i] as f64) as f32,
                "erfc" => crate::erf::erfc(input[i] as f64) as f32,
                "digamma" => crate::gamma::digamma(input[i] as f64) as f32,
                _ => {
                    return Err(crate::error::SpecialError::ValueError(format!(
                        "Unknown function: {}",
                        func_name
                    )))
                }
            };
        }
    }

    Ok(outputs)
}

#[cfg(test)]
mod tests {
    use super::*;
    use approx::assert_relative_eq;

    #[test]
    #[cfg(feature = "simd")]
    fn test_gamma_f32_simd() {
        let input = Array1::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0]);
        let result = gamma_f32_simd(&input.view()).expect("Operation failed");

        // Check against known values
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-5);
        assert_relative_eq!(result[1], 1.0, epsilon = 1e-5);
        assert_relative_eq!(result[2], 2.0, epsilon = 1e-5);
        assert_relative_eq!(result[3], 6.0, epsilon = 1e-5);
        assert_relative_eq!(result[4], 24.0, epsilon = 1e-2); // Less precise for larger values
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_log_f32_simd() {
        let input = Array1::from_vec(vec![1.0f32, 2.0, std::f32::consts::E, 10.0]);
        let result = log_f32_simd(&input.view()).expect("Operation failed");

        assert_relative_eq!(result[0], 0.0, epsilon = 1e-5); // ln(1) = 0
        assert_relative_eq!(result[1], 2.0f32.ln(), epsilon = 1e-5);
        assert_relative_eq!(result[2], 1.0, epsilon = 1e-5); // ln(e) = 1
        assert_relative_eq!(result[3], 10.0f32.ln(), epsilon = 1e-5);
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_sin_cos_f32_simd() {
        let input = Array1::from_vec(vec![
            0.0f32,
            std::f32::consts::PI / 2.0,
            std::f32::consts::PI,
        ]);

        let sin_result = sin_f32_simd(&input.view()).expect("Operation failed");
        assert_relative_eq!(sin_result[0], 0.0, epsilon = 1e-5);
        assert_relative_eq!(sin_result[1], 1.0, epsilon = 1e-5);
        assert_relative_eq!(sin_result[2], 0.0, epsilon = 1e-4);

        let cos_result = cos_f32_simd(&input.view()).expect("Operation failed");
        assert_relative_eq!(cos_result[0], 1.0, epsilon = 1e-5);
        assert_relative_eq!(cos_result[1], 0.0, epsilon = 1e-4);
        assert_relative_eq!(cos_result[2], -1.0, epsilon = 1e-5);
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_j1_f32_simd() {
        let input = Array1::from_vec(vec![0.0f32, 1.0, 2.0, 3.0]);
        let result = j1_f32_simd(&input.view()).expect("Operation failed");

        // J1(0) = 0
        assert_relative_eq!(result[0], 0.0, epsilon = 1e-5);

        // Check other values are reasonable (approximations)
        assert!(result[1].abs() > 0.1); // J1(1) ≈ 0.44
        assert!(result[2].abs() > 0.1); // J1(2) ≈ 0.58
        assert!(result[3].abs() > 0.1); // J1(3) ≈ 0.34
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_erfc_f32_simd() {
        let input = Array1::from_vec(vec![0.0f32, 1.0, 2.0, -1.0]);
        let result = erfc_f32_simd(&input.view()).expect("Operation failed");

        // erfc(0) = 1
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-4);

        // erfc(∞) = 0, erfc(-∞) = 2
        assert!(result[1] < 1.0 && result[1] > 0.0); // 0 < erfc(1) < 1
        assert!(result[2] < result[1]); // erfc(2) < erfc(1)
        assert!(result[3] > 1.0); // erfc(-1) > 1
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_multi_function_simd() {
        let input = Array1::from_vec(vec![1.0f32, 2.0, 3.0]);
        let functions = vec!["gamma", "log", "sin"];
        let results = multi_function_simd_f32(&input.view(), &functions).expect("Operation failed");

        assert_eq!(results.len(), 3); // Three functions
        assert_eq!(results[0].len(), 3); // Three input values

        // Check gamma results
        assert_relative_eq!(results[0][0], 1.0, epsilon = 1e-4); // Γ(1) = 1
        assert_relative_eq!(results[0][1], 1.0, epsilon = 1e-4); // Γ(2) = 1
        assert_relative_eq!(results[0][2], 2.0, epsilon = 1e-4); // Γ(3) = 2

        // Check log results
        assert_relative_eq!(results[1][0], 0.0, epsilon = 1e-5); // ln(1) = 0

        // Check sin results
        assert_relative_eq!(results[2][0], 1.0f32.sin(), epsilon = 1e-5);
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_digamma_f32_simd() {
        let input = Array1::from_vec(vec![1.0f32, 2.0, 3.0, 0.5]);
        let result = digamma_f32_simd(&input.view()).expect("Operation failed");

        // All results should be finite
        for &val in result.iter() {
            assert!(val.is_finite());
        }

        // ψ(2) > ψ(1) (digamma is increasing)
        assert!(result[1] > result[0]);
        assert!(result[2] > result[1]);
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_gamma_f64_simd() {
        let input = Array1::from_vec(vec![1.0f64, 2.0, 3.0, 4.0, 5.0]);
        let result = gamma_f64_simd(&input.view()).expect("Operation failed");

        // Check against known values
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[1], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[2], 2.0, epsilon = 1e-10);
        assert_relative_eq!(result[3], 6.0, epsilon = 1e-10);
        assert_relative_eq!(result[4], 24.0, epsilon = 1e-5); // Less precise for larger values
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_exp_f32_simd() {
        let input = Array1::from_vec(vec![0.0f32, 1.0, 2.0, 3.0]);
        let result = exp_f32_simd(&input.view()).expect("Operation failed");

        assert_relative_eq!(result[0], 1.0, epsilon = 1e-6);
        assert_relative_eq!(result[1], std::f32::consts::E, epsilon = 1e-6);
        assert_relative_eq!(result[2], std::f32::consts::E.powi(2), epsilon = 1e-5);
        assert_relative_eq!(result[3], std::f32::consts::E.powi(3), epsilon = 1e-4);
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_j0_f32_simd() {
        let input = Array1::from_vec(vec![0.0f32, 1.0, 2.0, 3.0]);
        let result = j0_f32_simd(&input.view()).expect("Operation failed");

        // J0(0) = 1
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-6);

        // Other values should match scalar implementation
        for i in 1..input.len() {
            let expected = crate::bessel::j0(input[i] as f64) as f32;
            assert_relative_eq!(result[i], expected, epsilon = 1e-6);
        }
    }

    #[test]
    fn test_vectorized_special_ops() {
        // This should work regardless of SIMD feature
        assert!(vectorized_special_ops().is_ok());
    }

    #[test]
    #[cfg(feature = "simd")]
    fn test_simd_capabilities() {
        let capabilities = detect_simd_capabilities();
        // Just ensure the function runs without crashing
        assert!(capabilities.vector_width_f32 >= 1);
        assert!(capabilities.vector_width_f64 >= 1);
    }

    #[test]
    #[cfg(feature = "parallel")]
    fn test_gamma_f64_parallel() {
        let input = Array1::from_vec(vec![1.0f64, 2.0, 3.0, 4.0, 5.0]);
        let result = gamma_f64_parallel(&input.view()).expect("Operation failed");

        // Check against known values
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[1], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[2], 2.0, epsilon = 1e-10);
        assert_relative_eq!(result[3], 6.0, epsilon = 1e-10);
        assert_relative_eq!(result[4], 24.0, epsilon = 1e-10);
    }

    #[test]
    #[cfg(feature = "parallel")]
    fn test_j0_f64_parallel() {
        let input = Array1::from_vec(vec![0.0f64, 1.0, 2.0, 3.0]);
        let result = j0_f64_parallel(&input.view()).expect("Operation failed");

        // J0(0) = 1
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-10);

        // Other values should match scalar implementation
        for i in 1..input.len() {
            let expected = crate::bessel::j0(input[i]);
            assert_relative_eq!(result[i], expected, epsilon = 1e-10);
        }
    }

    #[test]
    #[cfg(all(feature = "simd", feature = "parallel"))]
    fn test_gamma_f32_simd_parallel() {
        let input = Array1::from_vec(vec![1.0f32, 2.0, 3.0, 4.0, 5.0]);
        let result = gamma_f32_simd_parallel(&input.view()).expect("Operation failed");

        // Check against known values (less precision for f32)
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-5);
        assert_relative_eq!(result[1], 1.0, epsilon = 1e-5);
        assert_relative_eq!(result[2], 2.0, epsilon = 1e-5);
        assert_relative_eq!(result[3], 6.0, epsilon = 1e-5);
        assert_relative_eq!(result[4], 24.0, epsilon = 1e-2);
    }

    #[test]
    fn test_adaptive_gamma_processing() {
        let input = Array1::from_vec(vec![1.0f64, 2.0, 3.0, 4.0, 5.0]);
        let result = adaptive_gamma_processing(&input.view()).expect("Operation failed");

        // Check against known values
        assert_relative_eq!(result[0], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[1], 1.0, epsilon = 1e-10);
        assert_relative_eq!(result[2], 2.0, epsilon = 1e-10);
        assert_relative_eq!(result[3], 6.0, epsilon = 1e-10);
        assert_relative_eq!(result[4], 24.0, epsilon = 1e-10);
    }

    #[test]
    #[cfg(feature = "parallel")]
    fn test_parallel_threshold_behavior() {
        // Test small array (should use sequential)
        let smallinput = Array1::from_vec((0..10).map(|i| i as f64 + 1.0).collect());
        let small_result = gamma_f64_parallel(&smallinput.view()).expect("Operation failed");

        // Test large array (should use parallel)
        let largeinput = Array1::from_vec((0..2000).map(|i| (i as f64) * 0.001 + 1.0).collect());
        let large_result = gamma_f64_parallel(&largeinput.view()).expect("Operation failed");

        // Both should produce valid results
        assert!(small_result.len() == 10);
        assert!(large_result.len() == 2000);

        // Check first few values are reasonable
        assert!(small_result[0] > 0.0);
        assert!(large_result[0] > 0.0);
    }
}