etensor-core 0.0.1

//! High-performance CPU Unary operation kernels.
//! 
//! ReLU is bandwidth-bound (simple comparison) → single-threaded, LLVM auto-vectorizes.
//! Sigmoid is compute-bound (exp() is expensive) → uses Rayon for large tensors.

use crate::tensor::Tensor;
use crate::buffer::Buffer;
use crate::device::Device;
use crate::errors::EtensorResult;
use rayon::prelude::*;

/// Element count at which compute-bound kernels switch from a serial loop to Rayon.
///
/// `sigmoid_forward` pays for an `exp()` per element (~20 cycles versus ~1 for an add),
/// so splitting the work across threads is worthwhile, but only once the tensor is
/// large enough to amortize Rayon's ~20µs scheduling overhead.
const RAYON_THRESHOLD_COMPUTE: usize = 128 * 1024; // 131_072 elements

/// Applies the Rectified Linear Unit (ReLU) activation element-wise: f(x) = max(0, x).
///
/// Per element the result is:
/// * `x` when `x > 0`,
/// * `+0.0` when `x <= 0` (including `-0.0`),
/// * NaN when `x` is NaN, so numerical failures upstream stay visible instead of
///   being silently replaced by zero.
///
/// Bandwidth-bound: one comparison per element, so the kernel stays single-threaded
/// and relies on auto-vectorization rather than Rayon.
///
/// The returned tensor reuses the input's shape and dtype, is placed on
/// `Device::Cpu`, and is created with the gradient flag cleared.
///
/// # Errors
///
/// Returns the error reported by `as_f32_slice` when the input buffer cannot be
/// viewed as `f32` data.
pub fn relu_forward(a: &Tensor) -> EtensorResult<Tensor> {
    let slice = a.data.as_f32_slice()?;

    // Deliberately not `x.max(0.0)`: `f32::max` returns the non-NaN operand, which
    // would map NaN inputs to 0.0. `x <= 0.0` is false for NaN, so NaN falls through
    // to the `else` arm unchanged while every non-positive value becomes +0.0.
    let out_vec: Vec<f32> = slice
        .iter()
        .map(|&x| if x <= 0.0 { 0.0 } else { x })
        .collect();

    Ok(Tensor::new(
        Buffer::from_f32_vec(out_vec),
        a.shape.clone(),
        Device::Cpu,
        a.dtype,
        false, // Gradients are exclusively managed by the Dispatcher.
    ))
}

/// Applies the logistic sigmoid element-wise: f(x) = 1 / (1 + exp(-x)).
///
/// Compute-bound: every element needs an `exp()` call, so inputs with at least
/// `RAYON_THRESHOLD_COMPUTE` elements are mapped with Rayon's parallel iterator;
/// smaller inputs use a plain serial iterator to avoid the scheduling overhead.
///
/// The returned tensor reuses the input's shape and dtype, is placed on
/// `Device::Cpu`, and is created with the gradient flag cleared.
///
/// # Errors
///
/// Returns the error reported by `as_f32_slice` when the input buffer cannot be
/// viewed as `f32` data.
pub fn sigmoid_forward(a: &Tensor) -> EtensorResult<Tensor> {
    /// Scalar logistic function shared by the serial and parallel paths.
    fn logistic(x: f32) -> f32 {
        1.0 / (1.0 + (-x).exp())
    }

    let input = a.data.as_f32_slice()?;
    let go_parallel = input.len() >= RAYON_THRESHOLD_COMPUTE;

    let activated: Vec<f32> = if go_parallel {
        input.par_iter().map(|&x| logistic(x)).collect()
    } else {
        input.iter().map(|&x| logistic(x)).collect()
    };

    let output = Tensor::new(
        Buffer::from_f32_vec(activated),
        a.shape.clone(),
        Device::Cpu,
        a.dtype,
        false,
    );
    Ok(output)
}

// =====================================================================
// UNIT TESTS
// =====================================================================
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dtypes::DType;
    use crate::shape::Shape;

    /// Wraps `data` in a 1-D `F32` CPU tensor whose single dimension is `data.len()`,
    /// with the gradient flag cleared.
    fn make_test_tensor(data: Vec<f32>) -> Tensor {
        let len = data.len();
        Tensor::new(
            Buffer::from_f32_vec(data),
            Shape::new(vec![len]),
            Device::Cpu,
            DType::F32,
            false,
        )
    }

    /// Negative values clamp to zero; zero and positive values pass through.
    #[test]
    fn test_cpu_relu() {
        // 2.5 instead of 3.14: the latter trips clippy's deny-by-default
        // `approx_constant` lint (it looks like a truncated PI).
        let a = make_test_tensor(vec![-5.0, 0.0, 2.5, -0.01, 42.0]);

        let c = relu_forward(&a).unwrap();
        let slice = c.data.as_f32_slice().unwrap();

        assert_eq!(slice, &[0.0, 0.0, 2.5, 0.0, 42.0]);
    }

    /// Serial path: midpoint is exactly 0.5 and both tails saturate.
    #[test]
    fn test_cpu_sigmoid() {
        let a = make_test_tensor(vec![0.0, 100.0, -100.0]);

        let c = sigmoid_forward(&a).unwrap();
        let slice = c.data.as_f32_slice().unwrap();

        // exp(0) == 1 exactly, so 1 / (1 + 1) is exactly representable.
        assert_eq!(slice[0], 0.5);
        assert!(slice[1] > 0.999);
        assert!(slice[2] < 0.001);
    }

    /// Parallel path: an input of exactly `RAYON_THRESHOLD_COMPUTE` elements takes the
    /// Rayon branch and must match the scalar formula element-for-element, in order.
    #[test]
    fn test_cpu_sigmoid_parallel_path() {
        let len = RAYON_THRESHOLD_COMPUTE;
        // Sweep repeatedly over [-10.0, 10.0] so every region of the curve is covered.
        let input: Vec<f32> = (0..len)
            .map(|i| (i % 2001) as f32 * 0.01 - 10.0)
            .collect();
        let a = make_test_tensor(input.clone());

        let c = sigmoid_forward(&a).unwrap();
        let out = c.data.as_f32_slice().unwrap();

        assert_eq!(out.len(), len);
        for (idx, (&x, &y)) in input.iter().zip(out.iter()).enumerate() {
            let expected = 1.0 / (1.0 + (-x).exp());
            assert!(
                (y - expected).abs() <= 1e-6,
                "element {idx}: sigmoid({x}) = {y}, expected {expected}"
            );
        }
    }
}