#[cfg(feature = "no-std")]
use alloc::vec;
#[cfg(feature = "no-std")]
use alloc::vec::Vec;
#[cfg(not(feature = "no-std"))]
use std::vec::Vec;
#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
use std::arch::is_aarch64_feature_detected;
pub fn dot_product(a: &[f32], b: &[f32]) -> f32 {
assert_eq!(a.len(), b.len(), "Vectors must have the same length");
if a.is_empty() {
return 0.0;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("avx512f") {
return unsafe { dot_product_avx512(a, b) };
} else if crate::simd_feature_detected!("avx2") {
return unsafe { dot_product_avx2(a, b) };
} else if crate::simd_feature_detected!("sse2") {
return unsafe { dot_product_sse2(a, b) };
}
}
#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
{
if is_aarch64_feature_detected!("neon") {
return unsafe { dot_product_neon(a, b) };
}
}
dot_product_scalar(a, b)
}
pub fn norm_l2(x: &[f32]) -> f32 {
dot_product(x, x).sqrt()
}
pub fn norm_l1(x: &[f32]) -> f32 {
if x.is_empty() {
return 0.0;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("avx2") {
return unsafe { norm_l1_avx2(x) };
} else if crate::simd_feature_detected!("sse2") {
return unsafe { norm_l1_sse2(x) };
}
}
#[cfg(all(target_arch = "aarch64", not(feature = "no-std")))]
{
if is_aarch64_feature_detected!("neon") {
return unsafe { norm_l1_neon(x) };
}
}
norm_l1_scalar(x)
}
pub fn norm_inf(x: &[f32]) -> f32 {
if x.is_empty() {
return 0.0;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("avx2") {
return unsafe { norm_inf_avx2(x) };
} else if crate::simd_feature_detected!("sse2") {
return unsafe { norm_inf_sse2(x) };
}
}
norm_inf_scalar(x)
}
pub fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
assert_eq!(a.len(), b.len(), "Vectors must have the same length");
if a.is_empty() {
return 0.0;
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("avx2") {
return unsafe { euclidean_distance_avx2(a, b) };
} else if crate::simd_feature_detected!("sse2") {
return unsafe { euclidean_distance_sse2(a, b) };
}
}
euclidean_distance_scalar(a, b)
}
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
assert_eq!(a.len(), b.len(), "Vectors must have the same length");
if a.is_empty() {
return 1.0; }
let dot_ab = dot_product(a, b);
let norm_a = norm_l2(a);
let norm_b = norm_l2(b);
if norm_a == 0.0 || norm_b == 0.0 {
return 0.0; }
dot_ab / (norm_a * norm_b)
}
pub fn cross_product(a: &[f32], b: &[f32]) -> Result<Vec<f32>, &'static str> {
if a.len() != 3 || b.len() != 3 {
return Err("Cross product requires exactly 3-dimensional vectors");
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("sse2") {
return Ok(unsafe { cross_product_sse2(a, b) });
}
}
Ok(cross_product_scalar(a, b))
}
pub fn outer_product(a: &[f32], b: &[f32]) -> Vec<Vec<f32>> {
let m = a.len();
let n = b.len();
if m == 0 || n == 0 {
return vec![];
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
{
if crate::simd_feature_detected!("avx2") {
return unsafe { outer_product_avx2(a, b) };
} else if crate::simd_feature_detected!("sse2") {
return unsafe { outer_product_sse2(a, b) };
}
}
outer_product_scalar(a, b)
}
fn dot_product_scalar(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
fn norm_l1_scalar(x: &[f32]) -> f32 {
x.iter().map(|&v| v.abs()).sum()
}
fn norm_inf_scalar(x: &[f32]) -> f32 {
x.iter().map(|&v| v.abs()).fold(0.0f32, |a, b| a.max(b))
}
fn euclidean_distance_scalar(a: &[f32], b: &[f32]) -> f32 {
a.iter()
.zip(b.iter())
.map(|(x, y)| {
let diff = x - y;
diff * diff
})
.sum::<f32>()
.sqrt()
}
fn cross_product_scalar(a: &[f32], b: &[f32]) -> Vec<f32> {
vec![
a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0], ]
}
fn outer_product_scalar(a: &[f32], b: &[f32]) -> Vec<Vec<f32>> {
let m = a.len();
let n = b.len();
let mut result = vec![vec![0.0; n]; m];
for i in 0..m {
for (j, &b_val) in b.iter().enumerate().take(n) {
result[i][j] = a[i] * b_val;
}
}
result
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn dot_product_sse2(a: &[f32], b: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let mut sum = _mm_setzero_ps();
let mut i = 0;
while i + 4 <= a.len() {
let a_vec = _mm_loadu_ps(a.as_ptr().add(i));
let b_vec = _mm_loadu_ps(b.as_ptr().add(i));
let prod = _mm_mul_ps(a_vec, b_vec);
sum = _mm_add_ps(sum, prod);
i += 4;
}
let mut result = [0.0f32; 4];
_mm_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result[0] + result[1] + result[2] + result[3];
while i < a.len() {
scalar_sum += a[i] * b[i];
i += 1;
}
scalar_sum
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn norm_l1_sse2(x: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let abs_mask = _mm_set1_ps(f32::from_bits(0x7FFFFFFF));
let mut sum = _mm_setzero_ps();
let mut i = 0;
while i + 4 <= x.len() {
let x_vec = _mm_loadu_ps(x.as_ptr().add(i));
let abs_vec = _mm_and_ps(x_vec, abs_mask);
sum = _mm_add_ps(sum, abs_vec);
i += 4;
}
let mut result = [0.0f32; 4];
_mm_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result[0] + result[1] + result[2] + result[3];
while i < x.len() {
scalar_sum += x[i].abs();
i += 1;
}
scalar_sum
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn norm_inf_sse2(x: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let abs_mask = _mm_set1_ps(f32::from_bits(0x7FFFFFFF));
let mut max_vec = _mm_setzero_ps();
let mut i = 0;
while i + 4 <= x.len() {
let x_vec = _mm_loadu_ps(x.as_ptr().add(i));
let abs_vec = _mm_and_ps(x_vec, abs_mask);
max_vec = _mm_max_ps(max_vec, abs_vec);
i += 4;
}
let mut result = [0.0f32; 4];
_mm_storeu_ps(result.as_mut_ptr(), max_vec);
let mut max_val = result[0].max(result[1]).max(result[2]).max(result[3]);
while i < x.len() {
max_val = max_val.max(x[i].abs());
i += 1;
}
max_val
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn euclidean_distance_sse2(a: &[f32], b: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let mut sum = _mm_setzero_ps();
let mut i = 0;
while i + 4 <= a.len() {
let a_vec = _mm_loadu_ps(a.as_ptr().add(i));
let b_vec = _mm_loadu_ps(b.as_ptr().add(i));
let diff = _mm_sub_ps(a_vec, b_vec);
let squared = _mm_mul_ps(diff, diff);
sum = _mm_add_ps(sum, squared);
i += 4;
}
let mut result = [0.0f32; 4];
_mm_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result[0] + result[1] + result[2] + result[3];
while i < a.len() {
let diff = a[i] - b[i];
scalar_sum += diff * diff;
i += 1;
}
scalar_sum.sqrt()
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn cross_product_sse2(a: &[f32], b: &[f32]) -> Vec<f32> {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let a_vec = _mm_set_ps(0.0, a[2], a[1], a[0]);
let b_vec = _mm_set_ps(0.0, b[2], b[1], b[0]);
let a_yzx = _mm_shuffle_ps(a_vec, a_vec, 0b00_01_10_01);
let b_zxy = _mm_shuffle_ps(b_vec, b_vec, 0b00_10_00_10);
let a_zxy = _mm_shuffle_ps(a_vec, a_vec, 0b00_10_00_10);
let b_yzx = _mm_shuffle_ps(b_vec, b_vec, 0b00_01_10_01);
let prod1 = _mm_mul_ps(a_yzx, b_zxy);
let prod2 = _mm_mul_ps(a_zxy, b_yzx);
let result_vec = _mm_sub_ps(prod1, prod2);
let mut output = [0.0f32; 4];
_mm_storeu_ps(output.as_mut_ptr(), result_vec);
vec![output[0], output[1], output[2]]
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "sse2")]
unsafe fn outer_product_sse2(a: &[f32], b: &[f32]) -> Vec<Vec<f32>> {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let m = a.len();
let n = b.len();
let mut result = vec![vec![0.0; n]; m];
for i in 0..m {
let a_broadcast = _mm_set1_ps(a[i]);
let mut j = 0;
while j + 4 <= n {
let b_vec = _mm_loadu_ps(b.as_ptr().add(j));
let prod = _mm_mul_ps(a_broadcast, b_vec);
_mm_storeu_ps(result[i].as_mut_ptr().add(j), prod);
j += 4;
}
while j < n {
result[i][j] = a[i] * b[j];
j += 1;
}
}
result
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn dot_product_avx2(a: &[f32], b: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let mut sum = _mm256_setzero_ps();
let mut i = 0;
while i + 8 <= a.len() {
let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
let prod = _mm256_mul_ps(a_vec, b_vec);
sum = _mm256_add_ps(sum, prod);
i += 8;
}
let mut result = [0.0f32; 8];
_mm256_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result.iter().sum::<f32>();
while i < a.len() {
scalar_sum += a[i] * b[i];
i += 1;
}
scalar_sum
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn norm_l1_avx2(x: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let abs_mask = _mm256_set1_ps(f32::from_bits(0x7FFFFFFF));
let mut sum = _mm256_setzero_ps();
let mut i = 0;
while i + 8 <= x.len() {
let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
let abs_vec = _mm256_and_ps(x_vec, abs_mask);
sum = _mm256_add_ps(sum, abs_vec);
i += 8;
}
let mut result = [0.0f32; 8];
_mm256_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result.iter().sum::<f32>();
while i < x.len() {
scalar_sum += x[i].abs();
i += 1;
}
scalar_sum
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn norm_inf_avx2(x: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let abs_mask = _mm256_set1_ps(f32::from_bits(0x7FFFFFFF));
let mut max_vec = _mm256_setzero_ps();
let mut i = 0;
while i + 8 <= x.len() {
let x_vec = _mm256_loadu_ps(x.as_ptr().add(i));
let abs_vec = _mm256_and_ps(x_vec, abs_mask);
max_vec = _mm256_max_ps(max_vec, abs_vec);
i += 8;
}
let mut result = [0.0f32; 8];
_mm256_storeu_ps(result.as_mut_ptr(), max_vec);
let mut max_val = result.iter().fold(0.0f32, |a, &b| a.max(b));
while i < x.len() {
max_val = max_val.max(x[i].abs());
i += 1;
}
max_val
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn euclidean_distance_avx2(a: &[f32], b: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let mut sum = _mm256_setzero_ps();
let mut i = 0;
while i + 8 <= a.len() {
let a_vec = _mm256_loadu_ps(a.as_ptr().add(i));
let b_vec = _mm256_loadu_ps(b.as_ptr().add(i));
let diff = _mm256_sub_ps(a_vec, b_vec);
let squared = _mm256_mul_ps(diff, diff);
sum = _mm256_add_ps(sum, squared);
i += 8;
}
let mut result = [0.0f32; 8];
_mm256_storeu_ps(result.as_mut_ptr(), sum);
let mut scalar_sum = result.iter().sum::<f32>();
while i < a.len() {
let diff = a[i] - b[i];
scalar_sum += diff * diff;
i += 1;
}
scalar_sum.sqrt()
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx2")]
unsafe fn outer_product_avx2(a: &[f32], b: &[f32]) -> Vec<Vec<f32>> {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let m = a.len();
let n = b.len();
let mut result = vec![vec![0.0; n]; m];
for i in 0..m {
let a_broadcast = _mm256_set1_ps(a[i]);
let mut j = 0;
while j + 8 <= n {
let b_vec = _mm256_loadu_ps(b.as_ptr().add(j));
let prod = _mm256_mul_ps(a_broadcast, b_vec);
_mm256_storeu_ps(result[i].as_mut_ptr().add(j), prod);
j += 8;
}
while j < n {
result[i][j] = a[i] * b[j];
j += 1;
}
}
result
}
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[target_feature(enable = "avx512f")]
unsafe fn dot_product_avx512(a: &[f32], b: &[f32]) -> f32 {
#[cfg(feature = "no-std")]
use core::arch::x86_64::*;
#[cfg(not(feature = "no-std"))]
use core::arch::x86_64::*;
let mut sum = _mm512_setzero_ps();
let mut i = 0;
while i + 16 <= a.len() {
let a_vec = _mm512_loadu_ps(a.as_ptr().add(i));
let b_vec = _mm512_loadu_ps(b.as_ptr().add(i));
sum = _mm512_fmadd_ps(a_vec, b_vec, sum); i += 16;
}
let scalar_sum = _mm512_reduce_add_ps(sum);
let mut remaining_sum = 0.0f32;
while i < a.len() {
remaining_sum += a[i] * b[i];
i += 1;
}
scalar_sum + remaining_sum
}
#[cfg(target_arch = "aarch64")]
#[allow(dead_code)] #[target_feature(enable = "neon")]
unsafe fn dot_product_neon(a: &[f32], b: &[f32]) -> f32 {
use core::arch::aarch64::*;
let mut sum = vdupq_n_f32(0.0);
let mut i = 0;
while i + 4 <= a.len() {
let a_vec = vld1q_f32(a.as_ptr().add(i));
let b_vec = vld1q_f32(b.as_ptr().add(i));
sum = vfmaq_f32(sum, a_vec, b_vec); i += 4;
}
let sum_pair = vpadd_f32(vget_low_f32(sum), vget_high_f32(sum));
let final_sum = vpadd_f32(sum_pair, sum_pair);
let mut scalar_sum = vget_lane_f32(final_sum, 0);
while i < a.len() {
scalar_sum += a[i] * b[i];
i += 1;
}
scalar_sum
}
#[cfg(target_arch = "aarch64")]
#[allow(dead_code)] #[target_feature(enable = "neon")]
unsafe fn norm_l1_neon(x: &[f32]) -> f32 {
use core::arch::aarch64::*;
let mut sum = vdupq_n_f32(0.0);
let mut i = 0;
while i + 4 <= x.len() {
let x_vec = vld1q_f32(x.as_ptr().add(i));
let abs_vec = vabsq_f32(x_vec);
sum = vaddq_f32(sum, abs_vec);
i += 4;
}
let sum_pair = vpadd_f32(vget_low_f32(sum), vget_high_f32(sum));
let final_sum = vpadd_f32(sum_pair, sum_pair);
let mut scalar_sum = vget_lane_f32(final_sum, 0);
while i < x.len() {
scalar_sum += x[i].abs();
i += 1;
}
scalar_sum
}
#[allow(non_snake_case)]
#[cfg(all(test, not(feature = "no-std")))]
mod tests {
use super::*;
#[test]
fn test_dot_product() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let result = dot_product(&a, &b);
assert_eq!(result, 70.0);
let empty_a: Vec<f32> = vec![];
let empty_b: Vec<f32> = vec![];
assert_eq!(dot_product(&empty_a, &empty_b), 0.0);
let single_a = vec![3.0];
let single_b = vec![4.0];
assert_eq!(dot_product(&single_a, &single_b), 12.0);
}
#[test]
fn test_norms() {
let x = vec![3.0, 4.0];
let norm2 = norm_l2(&x);
assert_eq!(norm2, 5.0);
let norm1 = norm_l1(&x);
assert_eq!(norm1, 7.0);
let norm_inf_val = norm_inf(&x);
assert_eq!(norm_inf_val, 4.0);
let y = vec![-3.0, 4.0, -5.0];
assert_eq!(norm_l1(&y), 12.0); assert_eq!(norm_inf(&y), 5.0);
let empty: Vec<f32> = vec![];
assert_eq!(norm_l2(&empty), 0.0);
assert_eq!(norm_l1(&empty), 0.0);
assert_eq!(norm_inf(&empty), 0.0);
}
#[test]
fn test_euclidean_distance() {
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0, 6.0];
let result = euclidean_distance(&a, &b);
assert!((result - 5.196).abs() < 0.01);
let identical = euclidean_distance(&a, &a);
assert_eq!(identical, 0.0);
let empty_a: Vec<f32> = vec![];
let empty_b: Vec<f32> = vec![];
assert_eq!(euclidean_distance(&empty_a, &empty_b), 0.0);
}
#[test]
fn test_cosine_similarity() {
let a = vec![1.0, 0.0, 0.0];
let b = vec![0.0, 1.0, 0.0];
let result = cosine_similarity(&a, &b);
assert!((result - 0.0).abs() < f32::EPSILON);
let identical = cosine_similarity(&a, &a);
assert!((identical - 1.0).abs() < f32::EPSILON);
let opposite = vec![-1.0, 0.0, 0.0];
let opposite_sim = cosine_similarity(&a, &opposite);
assert!((opposite_sim - (-1.0)).abs() < f32::EPSILON);
let zero = vec![0.0, 0.0, 0.0];
let zero_sim = cosine_similarity(&a, &zero);
assert_eq!(zero_sim, 0.0);
let empty_a: Vec<f32> = vec![];
let empty_b: Vec<f32> = vec![];
assert_eq!(cosine_similarity(&empty_a, &empty_b), 1.0);
}
#[test]
fn test_cross_product() {
let i = vec![1.0, 0.0, 0.0];
let j = vec![0.0, 1.0, 0.0];
let result = cross_product(&i, &j).expect("operation should succeed");
assert_eq!(result, vec![0.0, 0.0, 1.0]);
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0, 6.0];
let cross = cross_product(&a, &b).expect("operation should succeed");
assert_eq!(cross, vec![-3.0, 6.0, -3.0]);
let wrong_dim = vec![1.0, 2.0];
assert!(cross_product(&wrong_dim, &j).is_err());
}
#[test]
fn test_outer_product() {
let a = vec![1.0, 2.0];
let b = vec![3.0, 4.0, 5.0];
let result = outer_product(&a, &b);
assert_eq!(result.len(), 2);
assert_eq!(result[0].len(), 3);
assert_eq!(result[0], vec![3.0, 4.0, 5.0]);
assert_eq!(result[1], vec![6.0, 8.0, 10.0]);
let empty_a: Vec<f32> = vec![];
let empty_result = outer_product(&empty_a, &b);
assert!(empty_result.is_empty());
let empty_b: Vec<f32> = vec![];
let empty_result2 = outer_product(&a, &empty_b);
assert!(empty_result2.is_empty());
}
#[test]
#[should_panic(expected = "Vectors must have the same length")]
fn test_dot_product_dimension_mismatch() {
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0];
dot_product(&a, &b);
}
#[test]
#[should_panic(expected = "Vectors must have the same length")]
fn test_euclidean_distance_dimension_mismatch() {
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0];
euclidean_distance(&a, &b);
}
#[test]
#[should_panic(expected = "Vectors must have the same length")]
fn test_cosine_similarity_dimension_mismatch() {
let a = vec![1.0, 2.0, 3.0];
let b = vec![4.0, 5.0];
cosine_similarity(&a, &b);
}
}