use crate::error::ErrorContext;
use crate::{Result, TensorError};
pub struct ReductionOps;
impl ReductionOps {
pub fn sum_f32_unchecked(input: &[f32]) -> f32 {
let len = input.len();
if len < 8 {
let mut sum = 0.0f32;
for i in 0..len {
unsafe {
sum += *input.get_unchecked(i);
}
}
return sum;
}
let chunks = len / 8;
let remainder = len % 8;
let mut acc1 = 0.0f32;
let mut acc2 = 0.0f32;
let mut acc3 = 0.0f32;
let mut acc4 = 0.0f32;
for chunk in 0..chunks {
let base = chunk * 8;
unsafe {
acc1 += *input.get_unchecked(base) + *input.get_unchecked(base + 1);
acc2 += *input.get_unchecked(base + 2) + *input.get_unchecked(base + 3);
acc3 += *input.get_unchecked(base + 4) + *input.get_unchecked(base + 5);
acc4 += *input.get_unchecked(base + 6) + *input.get_unchecked(base + 7);
}
}
let mut total_sum = acc1 + acc2 + acc3 + acc4;
let remainder_start = chunks * 8;
for i in 0..remainder {
unsafe {
total_sum += *input.get_unchecked(remainder_start + i);
}
}
total_sum
}
pub fn reduce_sum_f32_optimized(input: &[f32]) -> Result<f32> {
if input.is_empty() {
return Ok(0.0);
}
let len = input.len();
let unroll_size = 8;
let main_loops = len / unroll_size;
let mut sum1 = 0.0f32;
let mut sum2 = 0.0f32;
let mut sum3 = 0.0f32;
let mut sum4 = 0.0f32;
for i in 0..main_loops {
let base = i * unroll_size;
sum1 += input[base] + input[base + 1];
sum2 += input[base + 2] + input[base + 3];
sum3 += input[base + 4] + input[base + 5];
sum4 += input[base + 6] + input[base + 7];
}
let mut remaining_sum = 0.0f32;
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
remaining_sum += input[i];
}
Ok(sum1 + sum2 + sum3 + sum4 + remaining_sum)
}
pub fn max_f32_unchecked(input: &[f32]) -> f32 {
let len = input.len();
if len == 0 {
return f32::NEG_INFINITY;
}
if len < 8 {
let mut max_val = unsafe { *input.get_unchecked(0) };
for i in 1..len {
unsafe {
let val = *input.get_unchecked(i);
if val > max_val {
max_val = val;
}
}
}
return max_val;
}
let chunks = len / 8;
let remainder = len % 8;
let mut max1 = unsafe { *input.get_unchecked(0) };
let mut max2 = max1;
let mut max3 = max1;
let mut max4 = max1;
for chunk in 0..chunks {
let base = chunk * 8;
unsafe {
max1 = max1
.max(*input.get_unchecked(base))
.max(*input.get_unchecked(base + 1));
max2 = max2
.max(*input.get_unchecked(base + 2))
.max(*input.get_unchecked(base + 3));
max3 = max3
.max(*input.get_unchecked(base + 4))
.max(*input.get_unchecked(base + 5));
max4 = max4
.max(*input.get_unchecked(base + 6))
.max(*input.get_unchecked(base + 7));
}
}
let mut global_max = max1.max(max2).max(max3).max(max4);
let remainder_start = chunks * 8;
for i in 0..remainder {
unsafe {
global_max = global_max.max(*input.get_unchecked(remainder_start + i));
}
}
global_max
}
pub fn reduce_min_max_f32_optimized(input: &[f32]) -> Result<(f32, f32)> {
if input.is_empty() {
return Err(TensorError::InvalidOperation {
operation: "SIMD min_max".to_string(),
reason: "Cannot find min/max of empty array".to_string(),
context: Some(ErrorContext::new()),
});
}
let len = input.len();
let unroll_size = 8;
let main_loops = len / unroll_size;
let mut min_val = input[0];
let mut max_val = input[0];
for i in 0..main_loops {
let base = i * unroll_size;
min_val = min_val
.min(input[base])
.min(input[base + 1])
.min(input[base + 2])
.min(input[base + 3])
.min(input[base + 4])
.min(input[base + 5])
.min(input[base + 6])
.min(input[base + 7]);
max_val = max_val
.max(input[base])
.max(input[base + 1])
.max(input[base + 2])
.max(input[base + 3])
.max(input[base + 4])
.max(input[base + 5])
.max(input[base + 6])
.max(input[base + 7]);
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
min_val = min_val.min(input[i]);
max_val = max_val.max(input[i]);
}
Ok((min_val, max_val))
}
pub fn normalize_f32_optimized(input: &[f32], output: &mut [f32], eps: f32) -> Result<()> {
if input.len() != output.len() {
return Err(TensorError::ShapeMismatch {
operation: "SIMD normalize".to_string(),
expected: format!("arrays of length {}", input.len()),
got: format!("input: {}, output: {}", input.len(), output.len()),
context: Some(ErrorContext::new()),
});
}
let sum = Self::reduce_sum_f32_optimized(input)?;
let mean = sum / input.len() as f32;
let mut variance_sum = 0.0f32;
let len = input.len();
let unroll_size = 8;
let main_loops = len / unroll_size;
for i in 0..main_loops {
let base = i * unroll_size;
let d0 = input[base] - mean;
let d1 = input[base + 1] - mean;
let d2 = input[base + 2] - mean;
let d3 = input[base + 3] - mean;
let d4 = input[base + 4] - mean;
let d5 = input[base + 5] - mean;
let d6 = input[base + 6] - mean;
let d7 = input[base + 7] - mean;
variance_sum +=
d0 * d0 + d1 * d1 + d2 * d2 + d3 * d3 + d4 * d4 + d5 * d5 + d6 * d6 + d7 * d7;
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
let diff = input[i] - mean;
variance_sum += diff * diff;
}
let variance = variance_sum / input.len() as f32;
let inv_std = 1.0 / (variance + eps).sqrt();
for i in 0..main_loops {
let base = i * unroll_size;
output[base] = (input[base] - mean) * inv_std;
output[base + 1] = (input[base + 1] - mean) * inv_std;
output[base + 2] = (input[base + 2] - mean) * inv_std;
output[base + 3] = (input[base + 3] - mean) * inv_std;
output[base + 4] = (input[base + 4] - mean) * inv_std;
output[base + 5] = (input[base + 5] - mean) * inv_std;
output[base + 6] = (input[base + 6] - mean) * inv_std;
output[base + 7] = (input[base + 7] - mean) * inv_std;
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
output[i] = (input[i] - mean) * inv_std;
}
Ok(())
}
pub fn mean_variance_f32_optimized(input: &[f32]) -> Result<(f32, f32)> {
if input.is_empty() {
return Err(TensorError::InvalidOperation {
operation: "SIMD mean_variance".to_string(),
reason: "Cannot compute mean/variance of empty array".to_string(),
context: Some(ErrorContext::new()),
});
}
let mut mean = 0.0f32;
let mut m2 = 0.0f32;
for (i, &value) in input.iter().enumerate() {
let count = (i + 1) as f32;
let delta = value - mean;
mean += delta / count;
let delta2 = value - mean;
m2 += delta * delta2;
}
let variance = if input.len() > 1 {
m2 / (input.len() - 1) as f32
} else {
0.0
};
Ok((mean, variance))
}
pub fn std_f32_optimized(input: &[f32]) -> Result<f32> {
let (_, variance) = Self::mean_variance_f32_optimized(input)?;
Ok(variance.sqrt())
}
pub fn reduce_f32_optimized<F>(input: &[f32], initial: f32, op: F) -> Result<f32>
where
F: Fn(f32, f32) -> f32,
{
if input.is_empty() {
return Ok(initial);
}
let mut result = initial;
let len = input.len();
let unroll_size = 4;
let main_loops = len / unroll_size;
for i in 0..main_loops {
let base = i * unroll_size;
result = op(result, input[base]);
result = op(result, input[base + 1]);
result = op(result, input[base + 2]);
result = op(result, input[base + 3]);
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
result = op(result, input[i]);
}
Ok(result)
}
pub fn l2_norm_f32_optimized(input: &[f32]) -> Result<f32> {
if input.is_empty() {
return Ok(0.0);
}
let len = input.len();
let unroll_size = 8;
let main_loops = len / unroll_size;
let mut sum_squares = 0.0f32;
for i in 0..main_loops {
let base = i * unroll_size;
sum_squares += input[base] * input[base];
sum_squares += input[base + 1] * input[base + 1];
sum_squares += input[base + 2] * input[base + 2];
sum_squares += input[base + 3] * input[base + 3];
sum_squares += input[base + 4] * input[base + 4];
sum_squares += input[base + 5] * input[base + 5];
sum_squares += input[base + 6] * input[base + 6];
sum_squares += input[base + 7] * input[base + 7];
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
sum_squares += input[i] * input[i];
}
Ok(sum_squares.sqrt())
}
pub fn l1_norm_f32_optimized(input: &[f32]) -> Result<f32> {
if input.is_empty() {
return Ok(0.0);
}
let len = input.len();
let unroll_size = 8;
let main_loops = len / unroll_size;
let mut sum_abs = 0.0f32;
for i in 0..main_loops {
let base = i * unroll_size;
sum_abs += input[base].abs();
sum_abs += input[base + 1].abs();
sum_abs += input[base + 2].abs();
sum_abs += input[base + 3].abs();
sum_abs += input[base + 4].abs();
sum_abs += input[base + 5].abs();
sum_abs += input[base + 6].abs();
sum_abs += input[base + 7].abs();
}
#[allow(clippy::needless_range_loop)] for i in (main_loops * unroll_size)..len {
sum_abs += input[i].abs();
}
Ok(sum_abs)
}
pub fn argmax_f32_optimized(input: &[f32]) -> Result<usize> {
if input.is_empty() {
return Err(TensorError::InvalidOperation {
operation: "SIMD argmax".to_string(),
reason: "Cannot find argmax of empty array".to_string(),
context: Some(ErrorContext::new()),
});
}
let mut max_val = input[0];
let mut max_idx = 0;
for (i, &val) in input.iter().enumerate().skip(1) {
if val > max_val {
max_val = val;
max_idx = i;
}
}
Ok(max_idx)
}
pub fn argmin_f32_optimized(input: &[f32]) -> Result<usize> {
if input.is_empty() {
return Err(TensorError::InvalidOperation {
operation: "SIMD argmin".to_string(),
reason: "Cannot find argmin of empty array".to_string(),
context: Some(ErrorContext::new()),
});
}
let mut min_val = input[0];
let mut min_idx = 0;
for (i, &val) in input.iter().enumerate().skip(1) {
if val < min_val {
min_val = val;
min_idx = i;
}
}
Ok(min_idx)
}
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;
#[test]
fn test_sum_f32_unchecked() {
let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let expected = 15.0;
let result = ReductionOps::sum_f32_unchecked(&input);
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
#[test]
fn test_reduce_sum_f32_optimized() {
let input = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
let expected = 55.0;
let result = ReductionOps::reduce_sum_f32_optimized(&input)
.expect("test: reduce_sum_f32_optimized should succeed");
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
#[test]
fn test_max_f32_unchecked() {
let input = vec![1.0, 5.0, 3.0, 9.0, 2.0, 7.0];
let expected = 9.0;
let result = ReductionOps::max_f32_unchecked(&input);
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
#[test]
fn test_reduce_min_max_f32_optimized() {
let input = vec![3.0, 1.0, 7.0, 2.0, 9.0, 4.0];
let expected = (1.0, 9.0);
let result = ReductionOps::reduce_min_max_f32_optimized(&input)
.expect("test: reduce_min_max_f32_optimized should succeed");
assert_relative_eq!(result.0, expected.0, epsilon = 1e-6);
assert_relative_eq!(result.1, expected.1, epsilon = 1e-6);
}
#[test]
fn test_normalize_f32_optimized() {
let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let mut output = vec![0.0; 5];
let eps = 1e-8;
ReductionOps::normalize_f32_optimized(&input, &mut output, eps)
.expect("test: normalize_f32_optimized should succeed");
let sum: f32 = output.iter().sum();
let mean = sum / output.len() as f32;
assert_relative_eq!(mean, 0.0, epsilon = 1e-5);
let variance: f32 =
output.iter().map(|&x| (x - mean) * (x - mean)).sum::<f32>() / output.len() as f32;
let std = variance.sqrt();
assert_relative_eq!(std, 1.0, epsilon = 1e-5);
}
#[test]
fn test_mean_variance_f32_optimized() {
let input = vec![2.0, 4.0, 6.0, 8.0];
let expected_mean = 5.0;
let expected_variance = 20.0 / 3.0;
let (mean, variance) = ReductionOps::mean_variance_f32_optimized(&input)
.expect("test: mean_variance_f32_optimized should succeed");
assert_relative_eq!(mean, expected_mean, epsilon = 1e-6);
assert_relative_eq!(variance, expected_variance, epsilon = 1e-6);
}
#[test]
fn test_l2_norm_f32_optimized() {
let input = vec![3.0, 4.0]; let expected = 5.0;
let result = ReductionOps::l2_norm_f32_optimized(&input)
.expect("test: l2_norm_f32_optimized should succeed");
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
#[test]
fn test_l1_norm_f32_optimized() {
let input = vec![-2.0, 3.0, -4.0, 1.0];
let expected = 10.0;
let result = ReductionOps::l1_norm_f32_optimized(&input)
.expect("test: l1_norm_f32_optimized should succeed");
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
#[test]
fn test_argmax_f32_optimized() {
let input = vec![1.0, 5.0, 3.0, 9.0, 2.0];
let expected = 3;
let result = ReductionOps::argmax_f32_optimized(&input)
.expect("test: argmax_f32_optimized should succeed");
assert_eq!(result, expected);
}
#[test]
fn test_argmin_f32_optimized() {
let input = vec![5.0, 1.0, 3.0, 9.0, 2.0];
let expected = 1;
let result = ReductionOps::argmin_f32_optimized(&input)
.expect("test: argmin_f32_optimized should succeed");
assert_eq!(result, expected);
}
#[test]
fn test_empty_array_handling() {
let empty: Vec<f32> = vec![];
let sum = ReductionOps::reduce_sum_f32_optimized(&empty)
.expect("test: reduce_sum_f32_optimized should succeed");
assert_relative_eq!(sum, 0.0, epsilon = 1e-6);
let min_max = ReductionOps::reduce_min_max_f32_optimized(&empty);
assert!(min_max.is_err());
let argmax = ReductionOps::argmax_f32_optimized(&empty);
assert!(argmax.is_err());
let argmin = ReductionOps::argmin_f32_optimized(&empty);
assert!(argmin.is_err());
}
#[test]
fn test_reduce_f32_optimized_with_custom_op() {
let input = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let initial = 1.0;
let product_op = |acc: f32, x: f32| acc * x;
let expected = 120.0;
let result = ReductionOps::reduce_f32_optimized(&input, initial, product_op)
.expect("test: reduce_f32_optimized should succeed");
assert_relative_eq!(result, expected, epsilon = 1e-6);
}
}