#![allow(unsafe_code)]
use crate::error::{AlgorithmError, Result};
#[cfg(target_arch = "aarch64")]
mod neon_impl {
use std::arch::aarch64::*;
#[inline(always)]
unsafe fn hsum_f32(v: float32x4_t) -> f32 {
unsafe {
let pair = vpaddq_f32(v, v);
let sum = vpaddq_f32(pair, pair);
vgetq_lane_f32(sum, 0)
}
}
#[inline(always)]
unsafe fn hmin_f32(v: float32x4_t) -> f32 {
unsafe {
let pair = vpminq_f32(v, v);
let min = vpminq_f32(pair, pair);
vgetq_lane_f32(min, 0)
}
}
#[inline(always)]
unsafe fn hmax_f32(v: float32x4_t) -> f32 {
unsafe {
let pair = vpmaxq_f32(v, v);
let max = vpmaxq_f32(pair, pair);
vgetq_lane_f32(max, 0)
}
}
#[target_feature(enable = "neon")]
pub(crate) unsafe fn sum_f32(data: &[f32]) -> f32 {
unsafe {
let len = data.len();
let ptr = data.as_ptr();
let chunks = len / 16;
let mut acc0 = vdupq_n_f32(0.0);
let mut acc1 = vdupq_n_f32(0.0);
let mut acc2 = vdupq_n_f32(0.0);
let mut acc3 = vdupq_n_f32(0.0);
for i in 0..chunks {
let off = i * 16;
acc0 = vaddq_f32(acc0, vld1q_f32(ptr.add(off)));
acc1 = vaddq_f32(acc1, vld1q_f32(ptr.add(off + 4)));
acc2 = vaddq_f32(acc2, vld1q_f32(ptr.add(off + 8)));
acc3 = vaddq_f32(acc3, vld1q_f32(ptr.add(off + 12)));
}
let sum01 = vaddq_f32(acc0, acc1);
let sum23 = vaddq_f32(acc2, acc3);
let sum_all = vaddq_f32(sum01, sum23);
let mut total = hsum_f32(sum_all);
let rem = chunks * 16;
for i in rem..len {
total += *ptr.add(i);
}
total
}
}
#[target_feature(enable = "neon")]
pub(crate) unsafe fn min_f32(data: &[f32]) -> f32 {
unsafe {
let len = data.len();
let ptr = data.as_ptr();
let chunks = len / 16;
let mut min0 = vdupq_n_f32(f32::MAX);
let mut min1 = vdupq_n_f32(f32::MAX);
let mut min2 = vdupq_n_f32(f32::MAX);
let mut min3 = vdupq_n_f32(f32::MAX);
for i in 0..chunks {
let off = i * 16;
min0 = vminq_f32(min0, vld1q_f32(ptr.add(off)));
min1 = vminq_f32(min1, vld1q_f32(ptr.add(off + 4)));
min2 = vminq_f32(min2, vld1q_f32(ptr.add(off + 8)));
min3 = vminq_f32(min3, vld1q_f32(ptr.add(off + 12)));
}
let min01 = vminq_f32(min0, min1);
let min23 = vminq_f32(min2, min3);
let min_all = vminq_f32(min01, min23);
let mut min_val = hmin_f32(min_all);
let rem = chunks * 16;
for i in rem..len {
let v = *ptr.add(i);
if v < min_val {
min_val = v;
}
}
min_val
}
}
#[target_feature(enable = "neon")]
pub(crate) unsafe fn max_f32(data: &[f32]) -> f32 {
unsafe {
let len = data.len();
let ptr = data.as_ptr();
let chunks = len / 16;
let mut max0 = vdupq_n_f32(f32::MIN);
let mut max1 = vdupq_n_f32(f32::MIN);
let mut max2 = vdupq_n_f32(f32::MIN);
let mut max3 = vdupq_n_f32(f32::MIN);
for i in 0..chunks {
let off = i * 16;
max0 = vmaxq_f32(max0, vld1q_f32(ptr.add(off)));
max1 = vmaxq_f32(max1, vld1q_f32(ptr.add(off + 4)));
max2 = vmaxq_f32(max2, vld1q_f32(ptr.add(off + 8)));
max3 = vmaxq_f32(max3, vld1q_f32(ptr.add(off + 12)));
}
let max01 = vmaxq_f32(max0, max1);
let max23 = vmaxq_f32(max2, max3);
let max_all = vmaxq_f32(max01, max23);
let mut max_val = hmax_f32(max_all);
let rem = chunks * 16;
for i in rem..len {
let v = *ptr.add(i);
if v > max_val {
max_val = v;
}
}
max_val
}
}
#[target_feature(enable = "neon")]
pub(crate) unsafe fn minmax_f32(data: &[f32]) -> (f32, f32) {
unsafe {
let len = data.len();
let ptr = data.as_ptr();
let chunks = len / 8;
let mut vmin0 = vdupq_n_f32(f32::MAX);
let mut vmin1 = vdupq_n_f32(f32::MAX);
let mut vmax0 = vdupq_n_f32(f32::MIN);
let mut vmax1 = vdupq_n_f32(f32::MIN);
for i in 0..chunks {
let off = i * 8;
let a = vld1q_f32(ptr.add(off));
let b = vld1q_f32(ptr.add(off + 4));
vmin0 = vminq_f32(vmin0, a);
vmin1 = vminq_f32(vmin1, b);
vmax0 = vmaxq_f32(vmax0, a);
vmax1 = vmaxq_f32(vmax1, b);
}
let vmin_all = vminq_f32(vmin0, vmin1);
let vmax_all = vmaxq_f32(vmax0, vmax1);
let mut min_val = hmin_f32(vmin_all);
let mut max_val = hmax_f32(vmax_all);
let rem = chunks * 8;
for i in rem..len {
let v = *ptr.add(i);
if v < min_val {
min_val = v;
}
if v > max_val {
max_val = v;
}
}
(min_val, max_val)
}
}
#[target_feature(enable = "neon")]
pub(crate) unsafe fn variance_f32(data: &[f32], mean: f32) -> f32 {
unsafe {
let len = data.len();
let ptr = data.as_ptr();
let chunks = len / 8;
let vmean = vdupq_n_f32(mean);
let mut acc0 = vdupq_n_f32(0.0);
let mut acc1 = vdupq_n_f32(0.0);
for i in 0..chunks {
let off = i * 8;
let a = vsubq_f32(vld1q_f32(ptr.add(off)), vmean);
let b = vsubq_f32(vld1q_f32(ptr.add(off + 4)), vmean);
acc0 = vfmaq_f32(acc0, a, a);
acc1 = vfmaq_f32(acc1, b, b);
}
let sum_vec = vaddq_f32(acc0, acc1);
let mut sum_sq = hsum_f32(sum_vec);
let rem = chunks * 8;
for i in rem..len {
let diff = *ptr.add(i) - mean;
sum_sq += diff * diff;
}
sum_sq
}
}
}
mod scalar_impl {
pub(crate) fn sum_f32(data: &[f32]) -> f32 {
const LANES: usize = 8;
let chunks = data.len() / LANES;
let mut accumulators = [0.0_f32; LANES];
for i in 0..chunks {
let start = i * LANES;
for j in 0..LANES {
accumulators[j] += data[start + j];
}
}
let mut total: f32 = accumulators.iter().sum();
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
total += val;
}
total
}
pub(crate) fn min_f32(data: &[f32]) -> f32 {
const LANES: usize = 8;
let chunks = data.len() / LANES;
let mut mins = [f32::MAX; LANES];
if chunks > 0 {
for j in 0..LANES {
mins[j] = data[j];
}
}
for i in 1..chunks {
let start = i * LANES;
for j in 0..LANES {
mins[j] = mins[j].min(data[start + j]);
}
}
let mut min_val = mins.iter().copied().fold(f32::MAX, f32::min);
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
min_val = min_val.min(val);
}
min_val
}
pub(crate) fn max_f32(data: &[f32]) -> f32 {
const LANES: usize = 8;
let chunks = data.len() / LANES;
let mut maxs = [f32::MIN; LANES];
if chunks > 0 {
for j in 0..LANES {
maxs[j] = data[j];
}
}
for i in 1..chunks {
let start = i * LANES;
for j in 0..LANES {
maxs[j] = maxs[j].max(data[start + j]);
}
}
let mut max_val = maxs.iter().copied().fold(f32::MIN, f32::max);
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
max_val = max_val.max(val);
}
max_val
}
pub(crate) fn minmax_f32(data: &[f32]) -> (f32, f32) {
const LANES: usize = 8;
let chunks = data.len() / LANES;
let mut mins = [f32::MAX; LANES];
let mut maxs = [f32::MIN; LANES];
if chunks > 0 {
for j in 0..LANES {
mins[j] = data[j];
maxs[j] = data[j];
}
}
for i in 1..chunks {
let start = i * LANES;
for j in 0..LANES {
let val = data[start + j];
mins[j] = mins[j].min(val);
maxs[j] = maxs[j].max(val);
}
}
let mut min_val = mins.iter().copied().fold(f32::MAX, f32::min);
let mut max_val = maxs.iter().copied().fold(f32::MIN, f32::max);
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
min_val = min_val.min(val);
max_val = max_val.max(val);
}
(min_val, max_val)
}
pub(crate) fn variance_f32(data: &[f32], mean: f32) -> f32 {
const LANES: usize = 8;
let chunks = data.len() / LANES;
let mut accumulators = [0.0_f32; LANES];
for i in 0..chunks {
let start = i * LANES;
for j in 0..LANES {
let diff = data[start + j] - mean;
accumulators[j] += diff * diff;
}
}
let mut sum_squared_diff: f32 = accumulators.iter().sum();
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
let diff = val - mean;
sum_squared_diff += diff * diff;
}
sum_squared_diff
}
}
#[must_use]
pub fn sum_f32(data: &[f32]) -> f32 {
if data.is_empty() {
return 0.0;
}
#[cfg(target_arch = "aarch64")]
{
unsafe { neon_impl::sum_f32(data) }
}
#[cfg(not(target_arch = "aarch64"))]
{
scalar_impl::sum_f32(data)
}
}
#[must_use]
pub fn sum_f64(data: &[f64]) -> f64 {
const LANES: usize = 4;
let chunks = data.len() / LANES;
let mut accumulators = [0.0_f64; LANES];
for i in 0..chunks {
let start = i * LANES;
for j in 0..LANES {
accumulators[j] += data[start + j];
}
}
let mut total: f64 = accumulators.iter().sum();
let remainder_start = chunks * LANES;
for &val in &data[remainder_start..] {
total += val;
}
total
}
pub fn mean_f32(data: &[f32]) -> Result<f32> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute mean of empty slice".to_string(),
});
}
let sum = sum_f32(data);
Ok(sum / data.len() as f32)
}
pub fn mean_f64(data: &[f64]) -> Result<f64> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute mean of empty slice".to_string(),
});
}
let sum = sum_f64(data);
Ok(sum / data.len() as f64)
}
pub fn min_f32(data: &[f32]) -> Result<f32> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot find min of empty slice".to_string(),
});
}
#[cfg(target_arch = "aarch64")]
{
unsafe { Ok(neon_impl::min_f32(data)) }
}
#[cfg(not(target_arch = "aarch64"))]
{
Ok(scalar_impl::min_f32(data))
}
}
pub fn max_f32(data: &[f32]) -> Result<f32> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot find max of empty slice".to_string(),
});
}
#[cfg(target_arch = "aarch64")]
{
unsafe { Ok(neon_impl::max_f32(data)) }
}
#[cfg(not(target_arch = "aarch64"))]
{
Ok(scalar_impl::max_f32(data))
}
}
pub fn minmax_f32(data: &[f32]) -> Result<(f32, f32)> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot find minmax of empty slice".to_string(),
});
}
#[cfg(target_arch = "aarch64")]
{
unsafe { Ok(neon_impl::minmax_f32(data)) }
}
#[cfg(not(target_arch = "aarch64"))]
{
Ok(scalar_impl::minmax_f32(data))
}
}
pub fn variance_f32(data: &[f32]) -> Result<f32> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute variance of empty slice".to_string(),
});
}
let mean = mean_f32(data)?;
#[cfg(target_arch = "aarch64")]
let sum_sq = {
unsafe { neon_impl::variance_f32(data, mean) }
};
#[cfg(not(target_arch = "aarch64"))]
let sum_sq = scalar_impl::variance_f32(data, mean);
Ok(sum_sq / data.len() as f32)
}
pub fn std_dev_f32(data: &[f32]) -> Result<f32> {
let var = variance_f32(data)?;
Ok(var.sqrt())
}
pub fn histogram_f32(data: &[f32], num_bins: usize, min: f32, max: f32) -> Result<Vec<usize>> {
if num_bins == 0 {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Number of bins must be greater than 0".to_string(),
});
}
if min >= max {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Min must be less than max".to_string(),
});
}
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute histogram of empty slice".to_string(),
});
}
let mut bins = vec![0_usize; num_bins];
let range = max - min;
let inv_bin_width = num_bins as f32 / range;
for &val in data {
if val >= min && val < max {
let bin_idx = ((val - min) * inv_bin_width) as usize;
let bin_idx = bin_idx.min(num_bins - 1); bins[bin_idx] += 1;
}
}
Ok(bins)
}
pub fn histogram_auto_f32(data: &[f32], num_bins: usize) -> Result<Vec<usize>> {
let (min, max) = minmax_f32(data)?;
let max = max + (max - min) * 1e-6;
histogram_f32(data, num_bins, min, max)
}
pub fn argmin_f32(data: &[f32]) -> Result<usize> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot find argmin of empty slice".to_string(),
});
}
let mut min_val = data[0];
let mut min_idx = 0;
for (i, &val) in data.iter().enumerate().skip(1) {
if val < min_val {
min_val = val;
min_idx = i;
}
}
Ok(min_idx)
}
pub fn argmax_f32(data: &[f32]) -> Result<usize> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot find argmax of empty slice".to_string(),
});
}
let mut max_val = data[0];
let mut max_idx = 0;
for (i, &val) in data.iter().enumerate().skip(1) {
if val > max_val {
max_val = val;
max_idx = i;
}
}
Ok(max_idx)
}
pub fn welford_variance_f32(data: &[f32]) -> Result<(f32, f32, usize)> {
if data.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute variance of empty slice".to_string(),
});
}
let mut count = 0_usize;
let mut mean = 0.0_f32;
let mut m2 = 0.0_f32;
for &x in data {
count += 1;
let delta = x - mean;
mean += delta / count as f32;
let delta2 = x - mean;
m2 += delta * delta2;
}
let variance = if count > 1 { m2 / count as f32 } else { 0.0 };
Ok((mean, variance, count))
}
pub fn covariance_f32(a: &[f32], b: &[f32]) -> Result<f32> {
if a.is_empty() || b.is_empty() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: "Cannot compute covariance of empty slice".to_string(),
});
}
if a.len() != b.len() {
return Err(AlgorithmError::InvalidParameter {
parameter: "input",
message: format!("Slice length mismatch: a={}, b={}", a.len(), b.len()),
});
}
let mean_a = mean_f32(a)?;
let mean_b = mean_f32(b)?;
let n = a.len() as f32;
let mut sum = 0.0_f32;
for i in 0..a.len() {
sum += (a[i] - mean_a) * (b[i] - mean_b);
}
Ok(sum / n)
}
#[cfg(test)]
mod tests {
use super::*;
use approx::assert_relative_eq;
#[test]
fn test_sum_f32() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let sum = sum_f32(&data);
assert_relative_eq!(sum, 15.0);
}
#[test]
fn test_sum_f32_large() {
let data = vec![1.0; 1000];
let sum = sum_f32(&data);
assert_relative_eq!(sum, 1000.0);
}
#[test]
fn test_sum_f32_very_large() {
let data: Vec<f32> = (1..=10000).map(|i| i as f32).collect();
let sum = sum_f32(&data);
assert_relative_eq!(sum, 50_005_000.0, epsilon = 1.0);
}
#[test]
fn test_sum_empty() {
let data: Vec<f32> = vec![];
assert_relative_eq!(sum_f32(&data), 0.0);
}
#[test]
fn test_mean_f32() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let mean = mean_f32(&data).expect("mean_f32 failed");
assert_relative_eq!(mean, 3.0);
}
#[test]
fn test_mean_empty() {
let data: Vec<f32> = vec![];
assert!(mean_f32(&data).is_err());
}
#[test]
fn test_minmax_f32() {
let data = vec![3.0, 1.0, 4.0, 1.5, 9.0, 2.0, 6.0];
let (min, max) = minmax_f32(&data).expect("minmax_f32 failed");
assert_relative_eq!(min, 1.0);
assert_relative_eq!(max, 9.0);
}
#[test]
fn test_minmax_single() {
let data = vec![42.0];
let (min, max) = minmax_f32(&data).expect("minmax_f32 failed");
assert_relative_eq!(min, 42.0);
assert_relative_eq!(max, 42.0);
}
#[test]
fn test_minmax_large() {
let data: Vec<f32> = (0..10000).map(|i| i as f32).collect();
let (min, max) = minmax_f32(&data).expect("minmax_f32 failed");
assert_relative_eq!(min, 0.0);
assert_relative_eq!(max, 9999.0);
}
#[test]
fn test_min_max_separate() {
let data = vec![3.0, 1.0, 4.0, 1.5, 9.0, 2.0, 6.0];
let min = min_f32(&data).expect("min_f32 failed");
let max = max_f32(&data).expect("max_f32 failed");
assert_relative_eq!(min, 1.0);
assert_relative_eq!(max, 9.0);
}
#[test]
fn test_variance_std_dev() {
let data = vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
let variance = variance_f32(&data).expect("variance_f32 failed");
let std_dev = std_dev_f32(&data).expect("std_dev_f32 failed");
assert_relative_eq!(variance, 4.0, epsilon = 1e-4);
assert_relative_eq!(std_dev, 2.0, epsilon = 1e-4);
}
#[test]
fn test_welford_variance() {
let data = vec![2.0, 4.0, 4.0, 4.0, 5.0, 5.0, 7.0, 9.0];
let (mean, variance, count) = welford_variance_f32(&data).expect("welford failed");
assert_eq!(count, 8);
assert_relative_eq!(mean, 5.0, epsilon = 1e-4);
assert_relative_eq!(variance, 4.0, epsilon = 1e-4);
}
#[test]
fn test_histogram() {
let data = vec![0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5];
let bins = histogram_f32(&data, 5, 0.0, 10.0).expect("histogram_f32 failed");
assert_eq!(bins, vec![2, 2, 2, 2, 2]);
}
#[test]
fn test_histogram_auto() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0];
let bins = histogram_auto_f32(&data, 5).expect("histogram_auto_f32 failed");
assert_eq!(bins.len(), 5);
assert_eq!(bins.iter().sum::<usize>(), 10);
}
#[test]
fn test_argmin_argmax() {
let data = vec![3.0, 1.0, 4.0, 1.5, 9.0, 2.0, 6.0];
let min_idx = argmin_f32(&data).expect("argmin_f32 failed");
let max_idx = argmax_f32(&data).expect("argmax_f32 failed");
assert_eq!(min_idx, 1); assert_eq!(max_idx, 4); }
#[test]
fn test_large_dataset() {
let data: Vec<f32> = (0..10000).map(|i| i as f32).collect();
let sum = sum_f32(&data);
assert_relative_eq!(sum, 49_995_000.0, epsilon = 1.0);
let mean = mean_f32(&data).expect("mean_f32 failed");
assert_relative_eq!(mean, 4999.5, epsilon = 0.5);
let (min, max) = minmax_f32(&data).expect("minmax_f32 failed");
assert_relative_eq!(min, 0.0);
assert_relative_eq!(max, 9999.0);
}
#[test]
fn test_histogram_edge_cases() {
let data = vec![0.0, 5.0, 10.0];
let bins = histogram_f32(&data, 2, 0.0, 10.0).expect("histogram_f32 failed");
assert_eq!(bins[0], 1);
assert_eq!(bins[1], 1);
}
#[test]
fn test_sum_f64() {
let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
let sum = sum_f64(&data);
assert_relative_eq!(sum, 15.0);
}
#[test]
fn test_mean_f64() {
let data = vec![1.0_f64, 2.0, 3.0, 4.0, 5.0];
let mean = mean_f64(&data).expect("mean_f64 failed");
assert_relative_eq!(mean, 3.0);
}
#[test]
fn test_covariance() {
let a = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let b = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let cov = covariance_f32(&a, &b).expect("covariance_f32 failed");
assert_relative_eq!(cov, 4.0, epsilon = 1e-4);
}
}