/// Variance of `data` with `ddof` delta degrees of freedom (0 for the
/// population estimator, 1 for the sample estimator). Dispatches at runtime
/// to an AVX2 or SSE2 kernel on x86_64, otherwise falls back to scalar code.
/// Returns `f64::NAN` when `data.len() <= ddof`.
pub fn simd_variance_f64(data: &[f64], ddof: usize) -> f64 {
if data.len() <= ddof {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_variance_f64_avx2(data, ddof) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_variance_f64_sse2(data, ddof) };
}
}
scalar_variance_f64(data, ddof)
}
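/// Standard deviation of `data` with `ddof` delta degrees of freedom; the
/// square root of [`simd_variance_f64`].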
pub fn simd_std_f64(data: &[f64], ddof: usize) -> f64 {
simd_variance_f64(data, ddof).sqrt()
}
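/// Dot product of `a` and `b`, truncated to the shorter length; 0.0 for
/// empty input.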
pub fn simd_dot_product_f64(a: &[f64], b: &[f64]) -> f64 {
let len = a.len().min(b.len());
if len == 0 {
return 0.0;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_dot_product_f64_avx2(&a[..len], &b[..len]) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_dot_product_f64_sse2(&a[..len], &b[..len]) };
}
}
a[..len]
.iter()
.zip(b[..len].iter())
.map(|(x, y)| x * y)
.sum()
}
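/// Covariance of `x` and `y` (truncated to the shorter length) with `ddof`
/// delta degrees of freedom; `f64::NAN` when there are no more than `ddof` pairs.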
pub fn simd_covariance_f64(x: &[f64], y: &[f64], ddof: usize) -> f64 {
let len = x.len().min(y.len());
if len <= ddof {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_covariance_f64_avx2(&x[..len], &y[..len], ddof) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_covariance_f64_sse2(&x[..len], &y[..len], ddof) };
}
}
scalar_covariance_f64(&x[..len], &y[..len], ddof)
}
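/// Pearson correlation of `x` and `y`, truncated to the shorter length;
/// `f64::NAN` for fewer than two pairs or zero variance in either input.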
pub fn simd_correlation_f64(x: &[f64], y: &[f64]) -> f64 {
let len = x.len().min(y.len());
if len < 2 {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_correlation_f64_avx2(&x[..len], &y[..len]) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_correlation_f64_sse2(&x[..len], &y[..len]) };
}
}
scalar_correlation_f64(&x[..len], &y[..len])
}
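/// Population skewness (third standardized moment) of `data`; `f64::NAN`
/// for fewer than three elements or zero variance.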
pub fn simd_skewness_f64(data: &[f64]) -> f64 {
if data.len() < 3 {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_skewness_f64_avx2(data) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_skewness_f64_sse2(data) };
}
}
scalar_skewness_f64(data)
}
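/// Excess kurtosis (fourth standardized moment minus 3) of `data`;
/// `f64::NAN` for fewer than four elements or zero variance.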
pub fn simd_kurtosis_f64(data: &[f64]) -> f64 {
if data.len() < 4 {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_kurtosis_f64_avx2(data) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_kurtosis_f64_sse2(data) };
}
}
scalar_kurtosis_f64(data)
}
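/// Sum of squared deviations of `data` from a caller-supplied `mean`.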
pub fn simd_sum_of_squares_f64(data: &[f64], mean: f64) -> f64 {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_sum_of_squares_f64_avx2(data, mean) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_sum_of_squares_f64_sse2(data, mean) };
}
}
data.iter().map(|&x| (x - mean).powi(2)).sum()
}
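/// Weighted mean of `data` under `weights`, truncated to the shorter length;
/// `f64::NAN` for empty input or a zero weight sum.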
pub fn simd_weighted_mean_f64(data: &[f64], weights: &[f64]) -> f64 {
let len = data.len().min(weights.len());
if len == 0 {
return f64::NAN;
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_weighted_mean_f64_avx2(&data[..len], &weights[..len]) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_weighted_mean_f64_sse2(&data[..len], &weights[..len]) };
}
}
let weighted_sum: f64 = data[..len]
.iter()
.zip(weights[..len].iter())
.map(|(d, w)| d * w)
.sum();
let weight_sum: f64 = weights[..len].iter().sum();
if weight_sum == 0.0 {
f64::NAN
} else {
weighted_sum / weight_sum
}
}
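/// Euclidean (L2) norm of `data`: the square root of its dot product with itself.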
pub fn simd_l2_norm_f64(data: &[f64]) -> f64 {
simd_dot_product_f64(data, data).sqrt()
}
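/// Cosine similarity of `a` and `b`; `f64::NAN` when either vector has zero norm.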
pub fn simd_cosine_similarity_f64(a: &[f64], b: &[f64]) -> f64 {
let dot = simd_dot_product_f64(a, b);
let norm_a = simd_l2_norm_f64(a);
let norm_b = simd_l2_norm_f64(b);
if norm_a == 0.0 || norm_b == 0.0 {
return f64::NAN;
}
dot / (norm_a * norm_b)
}
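// Scalar fallbacks, used on non-x86_64 targets and when neither AVX2 nor
// SSE2 is detected at runtime. Each mirrors the two-pass (mean, then
// centered moments) formulation of its SIMD counterpart so that all
// dispatch paths agree.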
fn scalar_variance_f64(data: &[f64], ddof: usize) -> f64 {
let n = data.len();
if n <= ddof {
return f64::NAN;
}
let mean: f64 = data.iter().sum::<f64>() / n as f64;
let sum_sq: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum();
sum_sq / (n - ddof) as f64
}
fn scalar_covariance_f64(x: &[f64], y: &[f64], ddof: usize) -> f64 {
let n = x.len();
if n <= ddof {
return f64::NAN;
}
let mean_x: f64 = x.iter().sum::<f64>() / n as f64;
let mean_y: f64 = y.iter().sum::<f64>() / n as f64;
let cov: f64 = x
.iter()
.zip(y.iter())
.map(|(&xi, &yi)| (xi - mean_x) * (yi - mean_y))
.sum();
cov / (n - ddof) as f64
}
fn scalar_correlation_f64(x: &[f64], y: &[f64]) -> f64 {
let n = x.len();
if n < 2 {
return f64::NAN;
}
let mean_x: f64 = x.iter().sum::<f64>() / n as f64;
let mean_y: f64 = y.iter().sum::<f64>() / n as f64;
let mut sum_xy = 0.0;
let mut sum_x2 = 0.0;
let mut sum_y2 = 0.0;
for (&xi, &yi) in x.iter().zip(y.iter()) {
let dx = xi - mean_x;
let dy = yi - mean_y;
sum_xy += dx * dy;
sum_x2 += dx * dx;
sum_y2 += dy * dy;
}
if sum_x2 == 0.0 || sum_y2 == 0.0 {
return f64::NAN;
}
sum_xy / (sum_x2.sqrt() * sum_y2.sqrt())
}
fn scalar_skewness_f64(data: &[f64]) -> f64 {
let n = data.len();
if n < 3 {
return f64::NAN;
}
let mean: f64 = data.iter().sum::<f64>() / n as f64;
let m2: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n as f64;
let m3: f64 = data.iter().map(|&x| (x - mean).powi(3)).sum::<f64>() / n as f64;
if m2 == 0.0 {
return f64::NAN;
}
m3 / m2.powf(1.5)
}
fn scalar_kurtosis_f64(data: &[f64]) -> f64 {
let n = data.len();
if n < 4 {
return f64::NAN;
}
let mean: f64 = data.iter().sum::<f64>() / n as f64;
let m2: f64 = data.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n as f64;
let m4: f64 = data.iter().map(|&x| (x - mean).powi(4)).sum::<f64>() / n as f64;
if m2 == 0.0 {
return f64::NAN;
}
(m4 / m2.powi(2)) - 3.0
}
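// AVX2 kernels: four f64 lanes per iteration, two-pass (mean, then centered
// moments), with a scalar loop for the tail. The dispatchers above check
// `is_x86_feature_detected!("avx2")` before calling these, and trim the
// two-input kernels' slices to equal length.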
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_variance_f64_avx2(data: &[f64], ddof: usize) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n <= ddof {
return f64::NAN;
}
let mut sum_vec = _mm256_setzero_pd();
let chunks = data.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
let vec = _mm256_loadu_pd(chunk.as_ptr());
sum_vec = _mm256_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 4];
_mm256_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1] + sum_arr[2] + sum_arr[3];
for &val in remainder {
sum += val;
}
let mean = sum / n as f64;
let mean_vec = _mm256_set1_pd(mean);
let mut sq_sum_vec = _mm256_setzero_pd();
let chunks = data.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
let vec = _mm256_loadu_pd(chunk.as_ptr());
let diff = _mm256_sub_pd(vec, mean_vec);
let sq = _mm256_mul_pd(diff, diff);
sq_sum_vec = _mm256_add_pd(sq_sum_vec, sq);
}
let mut sq_sum_arr = [0.0; 4];
_mm256_storeu_pd(sq_sum_arr.as_mut_ptr(), sq_sum_vec);
let mut sq_sum = sq_sum_arr[0] + sq_sum_arr[1] + sq_sum_arr[2] + sq_sum_arr[3];
for &val in remainder {
sq_sum += (val - mean).powi(2);
}
sq_sum / (n - ddof) as f64
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_dot_product_f64_avx2(a: &[f64], b: &[f64]) -> f64 {
use std::arch::x86_64::*;
let mut sum_vec = _mm256_setzero_pd();
let chunks_a = a.chunks_exact(4);
let chunks_b = b.chunks_exact(4);
let remainder_a = chunks_a.remainder();
let remainder_b = chunks_b.remainder();
for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
let vec_a = _mm256_loadu_pd(chunk_a.as_ptr());
let vec_b = _mm256_loadu_pd(chunk_b.as_ptr());
let prod = _mm256_mul_pd(vec_a, vec_b);
sum_vec = _mm256_add_pd(sum_vec, prod);
}
let mut sum_arr = [0.0; 4];
_mm256_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1] + sum_arr[2] + sum_arr[3];
for (&va, &vb) in remainder_a.iter().zip(remainder_b.iter()) {
sum += va * vb;
}
sum
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_covariance_f64_avx2(x: &[f64], y: &[f64], ddof: usize) -> f64 {
use std::arch::x86_64::*;
let n = x.len();
if n <= ddof {
return f64::NAN;
}
let mut sum_x_vec = _mm256_setzero_pd();
let mut sum_y_vec = _mm256_setzero_pd();
let chunks_x = x.chunks_exact(4);
let chunks_y = y.chunks_exact(4);
let rem_x = chunks_x.remainder();
let rem_y = chunks_y.remainder();
for (cx, cy) in chunks_x.zip(chunks_y) {
let vx = _mm256_loadu_pd(cx.as_ptr());
let vy = _mm256_loadu_pd(cy.as_ptr());
sum_x_vec = _mm256_add_pd(sum_x_vec, vx);
sum_y_vec = _mm256_add_pd(sum_y_vec, vy);
}
let mut sum_x_arr = [0.0; 4];
let mut sum_y_arr = [0.0; 4];
_mm256_storeu_pd(sum_x_arr.as_mut_ptr(), sum_x_vec);
_mm256_storeu_pd(sum_y_arr.as_mut_ptr(), sum_y_vec);
let mut sum_x = sum_x_arr.iter().sum::<f64>();
let mut sum_y = sum_y_arr.iter().sum::<f64>();
for (&vx, &vy) in rem_x.iter().zip(rem_y.iter()) {
sum_x += vx;
sum_y += vy;
}
let mean_x = sum_x / n as f64;
let mean_y = sum_y / n as f64;
let mean_x_vec = _mm256_set1_pd(mean_x);
let mean_y_vec = _mm256_set1_pd(mean_y);
let mut cov_vec = _mm256_setzero_pd();
for (cx, cy) in x.chunks_exact(4).zip(y.chunks_exact(4)) {
let vx = _mm256_loadu_pd(cx.as_ptr());
let vy = _mm256_loadu_pd(cy.as_ptr());
let dx = _mm256_sub_pd(vx, mean_x_vec);
let dy = _mm256_sub_pd(vy, mean_y_vec);
let prod = _mm256_mul_pd(dx, dy);
cov_vec = _mm256_add_pd(cov_vec, prod);
}
let mut cov_arr = [0.0; 4];
_mm256_storeu_pd(cov_arr.as_mut_ptr(), cov_vec);
let mut cov = cov_arr.iter().sum::<f64>();
for (&vx, &vy) in rem_x.iter().zip(rem_y.iter()) {
cov += (vx - mean_x) * (vy - mean_y);
}
cov / (n - ddof) as f64
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_correlation_f64_avx2(x: &[f64], y: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = x.len();
if n < 2 {
return f64::NAN;
}
let mut sum_x_vec = _mm256_setzero_pd();
let mut sum_y_vec = _mm256_setzero_pd();
for (cx, cy) in x.chunks_exact(4).zip(y.chunks_exact(4)) {
let vx = _mm256_loadu_pd(cx.as_ptr());
let vy = _mm256_loadu_pd(cy.as_ptr());
sum_x_vec = _mm256_add_pd(sum_x_vec, vx);
sum_y_vec = _mm256_add_pd(sum_y_vec, vy);
}
let mut sum_x_arr = [0.0; 4];
let mut sum_y_arr = [0.0; 4];
_mm256_storeu_pd(sum_x_arr.as_mut_ptr(), sum_x_vec);
_mm256_storeu_pd(sum_y_arr.as_mut_ptr(), sum_y_vec);
let mut sum_x = sum_x_arr.iter().sum::<f64>();
let mut sum_y = sum_y_arr.iter().sum::<f64>();
let rem_start = (n / 4) * 4;
for i in rem_start..n {
sum_x += x[i];
sum_y += y[i];
}
let mean_x = sum_x / n as f64;
let mean_y = sum_y / n as f64;
let mean_x_vec = _mm256_set1_pd(mean_x);
let mean_y_vec = _mm256_set1_pd(mean_y);
let mut sum_xy_vec = _mm256_setzero_pd();
let mut sum_x2_vec = _mm256_setzero_pd();
let mut sum_y2_vec = _mm256_setzero_pd();
for (cx, cy) in x.chunks_exact(4).zip(y.chunks_exact(4)) {
let vx = _mm256_loadu_pd(cx.as_ptr());
let vy = _mm256_loadu_pd(cy.as_ptr());
let dx = _mm256_sub_pd(vx, mean_x_vec);
let dy = _mm256_sub_pd(vy, mean_y_vec);
sum_xy_vec = _mm256_add_pd(sum_xy_vec, _mm256_mul_pd(dx, dy));
sum_x2_vec = _mm256_add_pd(sum_x2_vec, _mm256_mul_pd(dx, dx));
sum_y2_vec = _mm256_add_pd(sum_y2_vec, _mm256_mul_pd(dy, dy));
}
let mut sum_xy_arr = [0.0; 4];
let mut sum_x2_arr = [0.0; 4];
let mut sum_y2_arr = [0.0; 4];
_mm256_storeu_pd(sum_xy_arr.as_mut_ptr(), sum_xy_vec);
_mm256_storeu_pd(sum_x2_arr.as_mut_ptr(), sum_x2_vec);
_mm256_storeu_pd(sum_y2_arr.as_mut_ptr(), sum_y2_vec);
let mut sum_xy = sum_xy_arr.iter().sum::<f64>();
let mut sum_x2 = sum_x2_arr.iter().sum::<f64>();
let mut sum_y2 = sum_y2_arr.iter().sum::<f64>();
for i in rem_start..n {
let dx = x[i] - mean_x;
let dy = y[i] - mean_y;
sum_xy += dx * dy;
sum_x2 += dx * dx;
sum_y2 += dy * dy;
}
if sum_x2 == 0.0 || sum_y2 == 0.0 {
return f64::NAN;
}
sum_xy / (sum_x2.sqrt() * sum_y2.sqrt())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_skewness_f64_avx2(data: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n < 3 {
return f64::NAN;
}
let mut sum_vec = _mm256_setzero_pd();
for chunk in data.chunks_exact(4) {
let vec = _mm256_loadu_pd(chunk.as_ptr());
sum_vec = _mm256_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 4];
_mm256_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr.iter().sum::<f64>();
let rem_start = (n / 4) * 4;
for i in rem_start..n {
sum += data[i];
}
let mean = sum / n as f64;
let mean_vec = _mm256_set1_pd(mean);
let mut m2_vec = _mm256_setzero_pd();
let mut m3_vec = _mm256_setzero_pd();
for chunk in data.chunks_exact(4) {
let vec = _mm256_loadu_pd(chunk.as_ptr());
let diff = _mm256_sub_pd(vec, mean_vec);
let diff2 = _mm256_mul_pd(diff, diff);
let diff3 = _mm256_mul_pd(diff2, diff);
m2_vec = _mm256_add_pd(m2_vec, diff2);
m3_vec = _mm256_add_pd(m3_vec, diff3);
}
let mut m2_arr = [0.0; 4];
let mut m3_arr = [0.0; 4];
_mm256_storeu_pd(m2_arr.as_mut_ptr(), m2_vec);
_mm256_storeu_pd(m3_arr.as_mut_ptr(), m3_vec);
let mut m2 = m2_arr.iter().sum::<f64>();
let mut m3 = m3_arr.iter().sum::<f64>();
for i in rem_start..n {
let diff = data[i] - mean;
m2 += diff * diff;
m3 += diff * diff * diff;
}
m2 /= n as f64;
m3 /= n as f64;
if m2 == 0.0 {
return f64::NAN;
}
m3 / m2.powf(1.5)
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_kurtosis_f64_avx2(data: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n < 4 {
return f64::NAN;
}
let mut sum_vec = _mm256_setzero_pd();
for chunk in data.chunks_exact(4) {
let vec = _mm256_loadu_pd(chunk.as_ptr());
sum_vec = _mm256_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 4];
_mm256_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr.iter().sum::<f64>();
let rem_start = (n / 4) * 4;
for i in rem_start..n {
sum += data[i];
}
let mean = sum / n as f64;
let mean_vec = _mm256_set1_pd(mean);
let mut m2_vec = _mm256_setzero_pd();
let mut m4_vec = _mm256_setzero_pd();
for chunk in data.chunks_exact(4) {
let vec = _mm256_loadu_pd(chunk.as_ptr());
let diff = _mm256_sub_pd(vec, mean_vec);
let diff2 = _mm256_mul_pd(diff, diff);
let diff4 = _mm256_mul_pd(diff2, diff2);
m2_vec = _mm256_add_pd(m2_vec, diff2);
m4_vec = _mm256_add_pd(m4_vec, diff4);
}
let mut m2_arr = [0.0; 4];
let mut m4_arr = [0.0; 4];
_mm256_storeu_pd(m2_arr.as_mut_ptr(), m2_vec);
_mm256_storeu_pd(m4_arr.as_mut_ptr(), m4_vec);
let mut m2 = m2_arr.iter().sum::<f64>();
let mut m4 = m4_arr.iter().sum::<f64>();
for i in rem_start..n {
let diff = data[i] - mean;
let diff2 = diff * diff;
m2 += diff2;
m4 += diff2 * diff2;
}
m2 /= n as f64;
m4 /= n as f64;
if m2 == 0.0 {
return f64::NAN;
}
(m4 / m2.powi(2)) - 3.0
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_sum_of_squares_f64_avx2(data: &[f64], mean: f64) -> f64 {
use std::arch::x86_64::*;
let mean_vec = _mm256_set1_pd(mean);
let mut sq_sum_vec = _mm256_setzero_pd();
for chunk in data.chunks_exact(4) {
let vec = _mm256_loadu_pd(chunk.as_ptr());
let diff = _mm256_sub_pd(vec, mean_vec);
let sq = _mm256_mul_pd(diff, diff);
sq_sum_vec = _mm256_add_pd(sq_sum_vec, sq);
}
let mut sq_sum_arr = [0.0; 4];
_mm256_storeu_pd(sq_sum_arr.as_mut_ptr(), sq_sum_vec);
let mut sq_sum = sq_sum_arr.iter().sum::<f64>();
let rem_start = (data.len() / 4) * 4;
for i in rem_start..data.len() {
sq_sum += (data[i] - mean).powi(2);
}
sq_sum
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_weighted_mean_f64_avx2(data: &[f64], weights: &[f64]) -> f64 {
use std::arch::x86_64::*;
let len = data.len();
let mut weighted_sum_vec = _mm256_setzero_pd();
let mut weight_sum_vec = _mm256_setzero_pd();
for (cd, cw) in data.chunks_exact(4).zip(weights.chunks_exact(4)) {
let vd = _mm256_loadu_pd(cd.as_ptr());
let vw = _mm256_loadu_pd(cw.as_ptr());
let prod = _mm256_mul_pd(vd, vw);
weighted_sum_vec = _mm256_add_pd(weighted_sum_vec, prod);
weight_sum_vec = _mm256_add_pd(weight_sum_vec, vw);
}
let mut ws_arr = [0.0; 4];
let mut w_arr = [0.0; 4];
_mm256_storeu_pd(ws_arr.as_mut_ptr(), weighted_sum_vec);
_mm256_storeu_pd(w_arr.as_mut_ptr(), weight_sum_vec);
let mut weighted_sum = ws_arr.iter().sum::<f64>();
let mut weight_sum = w_arr.iter().sum::<f64>();
let rem_start = (len / 4) * 4;
for i in rem_start..len {
weighted_sum += data[i] * weights[i];
weight_sum += weights[i];
}
if weight_sum == 0.0 {
f64::NAN
} else {
weighted_sum / weight_sum
}
}
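// SSE2 kernels: same structure as the AVX2 versions with two f64 lanes per
// iteration. SSE2 is part of the x86_64 baseline, so no `#[target_feature]`
// attribute is needed here.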
#[cfg(target_arch = "x86_64")]
unsafe fn simd_variance_f64_sse2(data: &[f64], ddof: usize) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n <= ddof {
return f64::NAN;
}
let mut sum_vec = _mm_setzero_pd();
let chunks = data.chunks_exact(2);
let remainder = chunks.remainder();
for chunk in chunks {
let vec = _mm_loadu_pd(chunk.as_ptr());
sum_vec = _mm_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 2];
_mm_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1];
for &val in remainder {
sum += val;
}
let mean = sum / n as f64;
let mean_vec = _mm_set1_pd(mean);
let mut sq_sum_vec = _mm_setzero_pd();
let chunks = data.chunks_exact(2);
let remainder = chunks.remainder();
for chunk in chunks {
let vec = _mm_loadu_pd(chunk.as_ptr());
let diff = _mm_sub_pd(vec, mean_vec);
let sq = _mm_mul_pd(diff, diff);
sq_sum_vec = _mm_add_pd(sq_sum_vec, sq);
}
let mut sq_sum_arr = [0.0; 2];
_mm_storeu_pd(sq_sum_arr.as_mut_ptr(), sq_sum_vec);
let mut sq_sum = sq_sum_arr[0] + sq_sum_arr[1];
for &val in remainder {
sq_sum += (val - mean).powi(2);
}
sq_sum / (n - ddof) as f64
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_dot_product_f64_sse2(a: &[f64], b: &[f64]) -> f64 {
use std::arch::x86_64::*;
let len = a.len();
let mut sum_vec = _mm_setzero_pd();
for (chunk_a, chunk_b) in a.chunks_exact(2).zip(b.chunks_exact(2)) {
let vec_a = _mm_loadu_pd(chunk_a.as_ptr());
let vec_b = _mm_loadu_pd(chunk_b.as_ptr());
let prod = _mm_mul_pd(vec_a, vec_b);
sum_vec = _mm_add_pd(sum_vec, prod);
}
let mut sum_arr = [0.0; 2];
_mm_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1];
let rem_start = (len / 2) * 2;
for i in rem_start..len {
sum += a[i] * b[i];
}
sum
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_covariance_f64_sse2(x: &[f64], y: &[f64], ddof: usize) -> f64 {
use std::arch::x86_64::*;
let n = x.len();
if n <= ddof {
return f64::NAN;
}
let mut sum_x_vec = _mm_setzero_pd();
let mut sum_y_vec = _mm_setzero_pd();
for (cx, cy) in x.chunks_exact(2).zip(y.chunks_exact(2)) {
let vx = _mm_loadu_pd(cx.as_ptr());
let vy = _mm_loadu_pd(cy.as_ptr());
sum_x_vec = _mm_add_pd(sum_x_vec, vx);
sum_y_vec = _mm_add_pd(sum_y_vec, vy);
}
let mut sum_x_arr = [0.0; 2];
let mut sum_y_arr = [0.0; 2];
_mm_storeu_pd(sum_x_arr.as_mut_ptr(), sum_x_vec);
_mm_storeu_pd(sum_y_arr.as_mut_ptr(), sum_y_vec);
let mut sum_x = sum_x_arr[0] + sum_x_arr[1];
let mut sum_y = sum_y_arr[0] + sum_y_arr[1];
let rem_start = (n / 2) * 2;
for i in rem_start..n {
sum_x += x[i];
sum_y += y[i];
}
let mean_x = sum_x / n as f64;
let mean_y = sum_y / n as f64;
let mean_x_vec = _mm_set1_pd(mean_x);
let mean_y_vec = _mm_set1_pd(mean_y);
let mut cov_vec = _mm_setzero_pd();
for (cx, cy) in x.chunks_exact(2).zip(y.chunks_exact(2)) {
let vx = _mm_loadu_pd(cx.as_ptr());
let vy = _mm_loadu_pd(cy.as_ptr());
let dx = _mm_sub_pd(vx, mean_x_vec);
let dy = _mm_sub_pd(vy, mean_y_vec);
let prod = _mm_mul_pd(dx, dy);
cov_vec = _mm_add_pd(cov_vec, prod);
}
let mut cov_arr = [0.0; 2];
_mm_storeu_pd(cov_arr.as_mut_ptr(), cov_vec);
let mut cov = cov_arr[0] + cov_arr[1];
for i in rem_start..n {
cov += (x[i] - mean_x) * (y[i] - mean_y);
}
cov / (n - ddof) as f64
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_correlation_f64_sse2(x: &[f64], y: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = x.len();
if n < 2 {
return f64::NAN;
}
let mut sum_x_vec = _mm_setzero_pd();
let mut sum_y_vec = _mm_setzero_pd();
for (cx, cy) in x.chunks_exact(2).zip(y.chunks_exact(2)) {
let vx = _mm_loadu_pd(cx.as_ptr());
let vy = _mm_loadu_pd(cy.as_ptr());
sum_x_vec = _mm_add_pd(sum_x_vec, vx);
sum_y_vec = _mm_add_pd(sum_y_vec, vy);
}
let mut sum_x_arr = [0.0; 2];
let mut sum_y_arr = [0.0; 2];
_mm_storeu_pd(sum_x_arr.as_mut_ptr(), sum_x_vec);
_mm_storeu_pd(sum_y_arr.as_mut_ptr(), sum_y_vec);
let mut sum_x = sum_x_arr[0] + sum_x_arr[1];
let mut sum_y = sum_y_arr[0] + sum_y_arr[1];
let rem_start = (n / 2) * 2;
for i in rem_start..n {
sum_x += x[i];
sum_y += y[i];
}
let mean_x = sum_x / n as f64;
let mean_y = sum_y / n as f64;
let mean_x_vec = _mm_set1_pd(mean_x);
let mean_y_vec = _mm_set1_pd(mean_y);
let mut sum_xy_vec = _mm_setzero_pd();
let mut sum_x2_vec = _mm_setzero_pd();
let mut sum_y2_vec = _mm_setzero_pd();
for (cx, cy) in x.chunks_exact(2).zip(y.chunks_exact(2)) {
let vx = _mm_loadu_pd(cx.as_ptr());
let vy = _mm_loadu_pd(cy.as_ptr());
let dx = _mm_sub_pd(vx, mean_x_vec);
let dy = _mm_sub_pd(vy, mean_y_vec);
sum_xy_vec = _mm_add_pd(sum_xy_vec, _mm_mul_pd(dx, dy));
sum_x2_vec = _mm_add_pd(sum_x2_vec, _mm_mul_pd(dx, dx));
sum_y2_vec = _mm_add_pd(sum_y2_vec, _mm_mul_pd(dy, dy));
}
let mut sum_xy_arr = [0.0; 2];
let mut sum_x2_arr = [0.0; 2];
let mut sum_y2_arr = [0.0; 2];
_mm_storeu_pd(sum_xy_arr.as_mut_ptr(), sum_xy_vec);
_mm_storeu_pd(sum_x2_arr.as_mut_ptr(), sum_x2_vec);
_mm_storeu_pd(sum_y2_arr.as_mut_ptr(), sum_y2_vec);
let mut sum_xy = sum_xy_arr[0] + sum_xy_arr[1];
let mut sum_x2 = sum_x2_arr[0] + sum_x2_arr[1];
let mut sum_y2 = sum_y2_arr[0] + sum_y2_arr[1];
for i in rem_start..n {
let dx = x[i] - mean_x;
let dy = y[i] - mean_y;
sum_xy += dx * dy;
sum_x2 += dx * dx;
sum_y2 += dy * dy;
}
if sum_x2 == 0.0 || sum_y2 == 0.0 {
return f64::NAN;
}
sum_xy / (sum_x2.sqrt() * sum_y2.sqrt())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_skewness_f64_sse2(data: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n < 3 {
return f64::NAN;
}
let mut sum_vec = _mm_setzero_pd();
for chunk in data.chunks_exact(2) {
let vec = _mm_loadu_pd(chunk.as_ptr());
sum_vec = _mm_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 2];
_mm_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1];
let rem_start = (n / 2) * 2;
for i in rem_start..n {
sum += data[i];
}
let mean = sum / n as f64;
let mean_vec = _mm_set1_pd(mean);
let mut m2_vec = _mm_setzero_pd();
let mut m3_vec = _mm_setzero_pd();
for chunk in data.chunks_exact(2) {
let vec = _mm_loadu_pd(chunk.as_ptr());
let diff = _mm_sub_pd(vec, mean_vec);
let diff2 = _mm_mul_pd(diff, diff);
let diff3 = _mm_mul_pd(diff2, diff);
m2_vec = _mm_add_pd(m2_vec, diff2);
m3_vec = _mm_add_pd(m3_vec, diff3);
}
let mut m2_arr = [0.0; 2];
let mut m3_arr = [0.0; 2];
_mm_storeu_pd(m2_arr.as_mut_ptr(), m2_vec);
_mm_storeu_pd(m3_arr.as_mut_ptr(), m3_vec);
let mut m2 = m2_arr[0] + m2_arr[1];
let mut m3 = m3_arr[0] + m3_arr[1];
for i in rem_start..n {
let diff = data[i] - mean;
m2 += diff * diff;
m3 += diff * diff * diff;
}
m2 /= n as f64;
m3 /= n as f64;
if m2 == 0.0 {
return f64::NAN;
}
m3 / m2.powf(1.5)
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_kurtosis_f64_sse2(data: &[f64]) -> f64 {
use std::arch::x86_64::*;
let n = data.len();
if n < 4 {
return f64::NAN;
}
let mut sum_vec = _mm_setzero_pd();
for chunk in data.chunks_exact(2) {
let vec = _mm_loadu_pd(chunk.as_ptr());
sum_vec = _mm_add_pd(sum_vec, vec);
}
let mut sum_arr = [0.0; 2];
_mm_storeu_pd(sum_arr.as_mut_ptr(), sum_vec);
let mut sum = sum_arr[0] + sum_arr[1];
let rem_start = (n / 2) * 2;
for i in rem_start..n {
sum += data[i];
}
let mean = sum / n as f64;
let mean_vec = _mm_set1_pd(mean);
let mut m2_vec = _mm_setzero_pd();
let mut m4_vec = _mm_setzero_pd();
for chunk in data.chunks_exact(2) {
let vec = _mm_loadu_pd(chunk.as_ptr());
let diff = _mm_sub_pd(vec, mean_vec);
let diff2 = _mm_mul_pd(diff, diff);
let diff4 = _mm_mul_pd(diff2, diff2);
m2_vec = _mm_add_pd(m2_vec, diff2);
m4_vec = _mm_add_pd(m4_vec, diff4);
}
let mut m2_arr = [0.0; 2];
let mut m4_arr = [0.0; 2];
_mm_storeu_pd(m2_arr.as_mut_ptr(), m2_vec);
_mm_storeu_pd(m4_arr.as_mut_ptr(), m4_vec);
let mut m2 = m2_arr[0] + m2_arr[1];
let mut m4 = m4_arr[0] + m4_arr[1];
for i in rem_start..n {
let diff = data[i] - mean;
let diff2 = diff * diff;
m2 += diff2;
m4 += diff2 * diff2;
}
m2 /= n as f64;
m4 /= n as f64;
if m2 == 0.0 {
return f64::NAN;
}
(m4 / m2.powi(2)) - 3.0
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_sum_of_squares_f64_sse2(data: &[f64], mean: f64) -> f64 {
use std::arch::x86_64::*;
let mean_vec = _mm_set1_pd(mean);
let mut sq_sum_vec = _mm_setzero_pd();
for chunk in data.chunks_exact(2) {
let vec = _mm_loadu_pd(chunk.as_ptr());
let diff = _mm_sub_pd(vec, mean_vec);
let sq = _mm_mul_pd(diff, diff);
sq_sum_vec = _mm_add_pd(sq_sum_vec, sq);
}
let mut sq_sum_arr = [0.0; 2];
_mm_storeu_pd(sq_sum_arr.as_mut_ptr(), sq_sum_vec);
let mut sq_sum = sq_sum_arr[0] + sq_sum_arr[1];
let rem_start = (data.len() / 2) * 2;
for i in rem_start..data.len() {
sq_sum += (data[i] - mean).powi(2);
}
sq_sum
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_weighted_mean_f64_sse2(data: &[f64], weights: &[f64]) -> f64 {
use std::arch::x86_64::*;
let len = data.len();
let mut weighted_sum_vec = _mm_setzero_pd();
let mut weight_sum_vec = _mm_setzero_pd();
for (cd, cw) in data.chunks_exact(2).zip(weights.chunks_exact(2)) {
let vd = _mm_loadu_pd(cd.as_ptr());
let vw = _mm_loadu_pd(cw.as_ptr());
let prod = _mm_mul_pd(vd, vw);
weighted_sum_vec = _mm_add_pd(weighted_sum_vec, prod);
weight_sum_vec = _mm_add_pd(weight_sum_vec, vw);
}
let mut ws_arr = [0.0; 2];
let mut w_arr = [0.0; 2];
_mm_storeu_pd(ws_arr.as_mut_ptr(), weighted_sum_vec);
_mm_storeu_pd(w_arr.as_mut_ptr(), weight_sum_vec);
let mut weighted_sum = ws_arr[0] + ws_arr[1];
let mut weight_sum = w_arr[0] + w_arr[1];
let rem_start = (len / 2) * 2;
for i in rem_start..len {
weighted_sum += data[i] * weights[i];
weight_sum += weights[i];
}
if weight_sum == 0.0 {
f64::NAN
} else {
weighted_sum / weight_sum
}
}
#[cfg(test)]
mod tests {
use super::*;
const EPSILON: f64 = 1e-10;
fn approx_eq(a: f64, b: f64) -> bool {
if a.is_nan() && b.is_nan() {
return true;
}
(a - b).abs() < EPSILON
}
#[test]
fn test_simd_variance() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let var = simd_variance_f64(&data, 0);
let expected = 5.25;
assert!(
approx_eq(var, expected),
"var={}, expected={}",
var,
expected
);
let var_sample = simd_variance_f64(&data, 1);
let expected_sample = 6.0;
assert!(
approx_eq(var_sample, expected_sample),
"var_sample={}, expected={}",
var_sample,
expected_sample
);
}
#[test]
fn test_simd_std() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let std = simd_std_f64(&data, 0);
let expected = 5.25_f64.sqrt();
assert!(
approx_eq(std, expected),
"std={}, expected={}",
std,
expected
);
}
#[test]
fn test_simd_dot_product() {
let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let b = vec![2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0];
let dot = simd_dot_product_f64(&a, &b);
let expected = 72.0;
assert!(
approx_eq(dot, expected),
"dot={}, expected={}",
dot,
expected
);
}
#[test]
fn test_simd_covariance() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let cov = simd_covariance_f64(&x, &y, 0);
let expected = 4.0;
assert!(
approx_eq(cov, expected),
"cov={}, expected={}",
cov,
expected
);
}
#[test]
fn test_simd_correlation() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![2.0, 4.0, 6.0, 8.0, 10.0];
let corr = simd_correlation_f64(&x, &y);
let expected = 1.0;
assert!(
approx_eq(corr, expected),
"corr={}, expected={}",
corr,
expected
);
}
#[test]
fn test_simd_correlation_negative() {
let x = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let y = vec![10.0, 8.0, 6.0, 4.0, 2.0];
let corr = simd_correlation_f64(&x, &y);
let expected = -1.0;
assert!(
approx_eq(corr, expected),
"corr={}, expected={}",
corr,
expected
);
}
#[test]
fn test_simd_skewness() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
let skew = simd_skewness_f64(&data);
assert!(skew.abs() < 0.01, "skew={}, expected ~0", skew);
}
#[test]
fn test_simd_kurtosis() {
let data: Vec<f64> = (1..=20).map(|x| x as f64).collect();
let kurt = simd_kurtosis_f64(&data);
assert!(kurt < 0.0, "kurt={}, expected negative", kurt);
}
#[test]
fn test_simd_weighted_mean() {
let data = vec![1.0, 2.0, 3.0, 4.0];
let weights = vec![1.0, 1.0, 1.0, 1.0];
let mean = simd_weighted_mean_f64(&data, &weights);
let expected = 2.5;
assert!(
approx_eq(mean, expected),
"mean={}, expected={}",
mean,
expected
);
let weights2 = vec![1.0, 2.0, 3.0, 4.0];
let mean2 = simd_weighted_mean_f64(&data, &weights2);
let expected2 = 3.0;
assert!(
approx_eq(mean2, expected2),
"mean2={}, expected={}",
mean2,
expected2
);
}
#[test]
fn test_simd_l2_norm() {
let data = vec![3.0, 4.0];
let norm = simd_l2_norm_f64(&data);
let expected = 5.0;
assert!(
approx_eq(norm, expected),
"norm={}, expected={}",
norm,
expected
);
}
#[test]
fn test_simd_cosine_similarity() {
let a = vec![1.0, 0.0];
let b = vec![1.0, 0.0];
let sim = simd_cosine_similarity_f64(&a, &b);
let expected = 1.0;
assert!(
approx_eq(sim, expected),
"sim={}, expected={}",
sim,
expected
);
let c = vec![1.0, 0.0];
let d = vec![0.0, 1.0];
let sim2 = simd_cosine_similarity_f64(&c, &d);
let expected2 = 0.0;
assert!(
approx_eq(sim2, expected2),
"sim2={}, expected={}",
sim2,
expected2
);
}
#[test]
fn test_simd_sum_of_squares() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let mean = 3.0;
let ss = simd_sum_of_squares_f64(&data, mean);
let expected = 10.0;
assert!(approx_eq(ss, expected), "ss={}, expected={}", ss, expected);
}
#[test]
fn test_edge_cases() {
assert!(simd_variance_f64(&[], 0).is_nan());
assert!(simd_correlation_f64(&[], &[]).is_nan());
assert!(simd_variance_f64(&[1.0], 1).is_nan());
let constant = vec![5.0, 5.0, 5.0, 5.0];
let var = simd_variance_f64(&constant, 0);
assert!(
approx_eq(var, 0.0),
"var of constant should be 0, got {}",
var
);
}
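// Added coverage for the scalar tail: lengths 5 and 7 leave remainders for
// both the 4-lane AVX2 and 2-lane SSE2 kernels.
#[test]
fn test_remainder_paths() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
let var = simd_variance_f64(&data, 0);
let expected = 4.0; // mean 4.0, squared deviations sum to 28.0, 28 / 7
assert!(approx_eq(var, expected), "var={}, expected={}", var, expected);
let a = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let b = vec![1.0, 1.0, 1.0, 1.0, 1.0];
let dot = simd_dot_product_f64(&a, &b);
assert!(approx_eq(dot, 15.0), "dot={}, expected=15", dot);
}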
#[test]
fn test_large_data() {
let data: Vec<f64> = (0..1000).map(|x| x as f64).collect();
let var = simd_variance_f64(&data, 0);
let expected = (1000.0 * 1000.0 - 1.0) / 12.0; // discrete uniform 0..=999: variance (n^2 - 1) / 12 with n = 1000
assert!(
(var - expected).abs() / expected < 1e-9,
"var={}, expected={}",
var,
expected
);
}
}