use crate::core::error::{Error, Result};
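
// Element-wise numeric kernels with runtime CPU-feature dispatch: each public
// function tries AVX2 first, then SSE2, and otherwise falls back to a portable
// scalar loop, so results are available on every architecture.

/// Adds `left` and `right` element-wise into `result`.
///
/// Returns an error if the three slices differ in length.
///
/// ```ignore
/// // Sketch of intended usage (import path assumed; adjust to this crate's layout).
/// let mut out = vec![0.0; 4];
/// simd_add_f64(&[1.0, 2.0, 3.0, 4.0], &[10.0, 20.0, 30.0, 40.0], &mut out)?;
/// assert_eq!(out, vec![11.0, 22.0, 33.0, 44.0]);
/// ```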
pub fn simd_add_f64(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_add_f64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_add_f64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = left[i] + right[i];
}
Ok(())
}
pub fn simd_subtract_f64(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_subtract_f64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_subtract_f64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = left[i] - right[i];
}
Ok(())
}
pub fn simd_multiply_f64(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_multiply_f64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_multiply_f64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = left[i] * right[i];
}
Ok(())
}
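/// Divides `left` by `right` element-wise into `result`.
///
/// Division by zero yields `f64::NAN` rather than infinity.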
pub fn simd_divide_f64(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_divide_f64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_divide_f64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = if right[i] != 0.0 {
left[i] / right[i]
} else {
f64::NAN
};
}
Ok(())
}
pub fn simd_add_scalar_f64(data: &[f64], scalar: f64, result: &mut [f64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_add_scalar_f64_avx2(data, scalar, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_add_scalar_f64_sse2(data, scalar, result) };
}
}
for i in 0..data.len() {
result[i] = data[i] + scalar;
}
Ok(())
}
pub fn simd_multiply_scalar_f64(data: &[f64], scalar: f64, result: &mut [f64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_multiply_scalar_f64_avx2(data, scalar, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_multiply_scalar_f64_sse2(data, scalar, result) };
}
}
for i in 0..data.len() {
result[i] = data[i] * scalar;
}
Ok(())
}
pub fn simd_abs_f64(data: &[f64], result: &mut [f64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_abs_f64_avx2(data, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_abs_f64_sse2(data, result) };
}
}
for i in 0..data.len() {
result[i] = data[i].abs();
}
Ok(())
}
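/// Computes the element-wise square root of `data` into `result`.
///
/// Negative inputs yield `f64::NAN`.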
pub fn simd_sqrt_f64(data: &[f64], result: &mut [f64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_sqrt_f64_avx2(data, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_sqrt_f64_sse2(data, result) };
}
}
for i in 0..data.len() {
        // f64::sqrt already returns NaN for negative inputs.
        result[i] = data[i].sqrt();
}
Ok(())
}
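/// Compares `left` and `right` element-wise, writing one `bool` per element.
///
/// Comparisons use exact IEEE semantics (`Equal`/`NotEqual` behave like
/// `==`/`!=`), so NaN compares unequal to everything, including itself.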
pub fn simd_compare_f64(
left: &[f64],
right: &[f64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for comparison operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_compare_f64_avx2(left, right, op, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_compare_f64_sse2(left, right, op, result) };
}
}
for i in 0..left.len() {
result[i] = match op {
            // Exact IEEE comparison, matching the SIMD paths; NaN compares unequal.
            ComparisonOp::Equal => left[i] == right[i],
            ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
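/// Comparison predicates accepted by the `simd_compare_*` functions.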
#[derive(Debug, Clone, Copy)]
pub enum ComparisonOp {
Equal,
NotEqual,
LessThan,
LessThanEqual,
GreaterThan,
GreaterThanEqual,
}
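/// Adds `left` and `right` element-wise into `result`.
///
/// The scalar paths saturate on overflow; note that the SIMD paths use the
/// CPU's wrapping 64-bit addition, so values near `i64::MAX`/`i64::MIN` can
/// differ between the two (see the kernel comments below).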
pub fn simd_add_i64(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_add_i64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_add_i64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = left[i].saturating_add(right[i]);
}
Ok(())
}
pub fn simd_subtract_i64(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_subtract_i64_avx2(left, right, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_subtract_i64_sse2(left, right, result) };
}
}
for i in 0..left.len() {
result[i] = left[i].saturating_sub(right[i]);
}
Ok(())
}
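/// Multiplies `left` and `right` element-wise into `result`, saturating on
/// overflow. This is scalar-only: x86 gains a 64-bit SIMD multiply only with
/// AVX-512, which this module does not target.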
pub fn simd_multiply_i64(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for element-wise operations".to_string(),
));
}
for i in 0..left.len() {
result[i] = left[i].saturating_mul(right[i]);
}
Ok(())
}
pub fn simd_add_scalar_i64(data: &[i64], scalar: i64, result: &mut [i64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_add_scalar_i64_avx2(data, scalar, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_add_scalar_i64_sse2(data, scalar, result) };
}
}
for i in 0..data.len() {
result[i] = data[i].saturating_add(scalar);
}
Ok(())
}
pub fn simd_abs_i64(data: &[i64], result: &mut [i64]) -> Result<()> {
if data.len() != result.len() {
return Err(Error::InvalidOperation(
"Input and output vectors must have same length".to_string(),
));
}
for i in 0..data.len() {
        // saturating_abs avoids the overflow panic on i64::MIN and matches the
        // saturating arithmetic used elsewhere in this module.
        result[i] = data[i].saturating_abs();
}
Ok(())
}
pub fn simd_compare_i64(
left: &[i64],
right: &[i64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
if left.len() != right.len() || left.len() != result.len() {
return Err(Error::InvalidOperation(
"Vector lengths must match for comparison operations".to_string(),
));
}
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
return unsafe { simd_compare_i64_avx2(left, right, op, result) };
} else if is_x86_feature_detected!("sse2") {
return unsafe { simd_compare_i64_sse2(left, right, op, result) };
}
}
for i in 0..left.len() {
result[i] = match op {
ComparisonOp::Equal => left[i] == right[i],
ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
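// x86_64 SIMD kernels. Safety: callers must confirm the corresponding CPU
// feature with `is_x86_feature_detected!` before calling; slice lengths are
// validated by the public wrappers above.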
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_add_f64_avx2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm256_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm256_add_pd(left_vec, right_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = left[i] + right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_subtract_f64_avx2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm256_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm256_sub_pd(left_vec, right_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = left[i] - right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_multiply_f64_avx2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm256_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm256_mul_pd(left_vec, right_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = left[i] * right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_divide_f64_avx2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm256_loadu_pd(right.as_ptr().add(offset));
        // Hardware division by zero yields +/-infinity; blend those lanes to
        // NaN so the SIMD path matches the scalar tail below.
        let quotient = _mm256_div_pd(left_vec, right_vec);
        let zero_mask = _mm256_cmp_pd::<_CMP_EQ_OQ>(right_vec, _mm256_setzero_pd());
        let result_vec = _mm256_blendv_pd(quotient, _mm256_set1_pd(f64::NAN), zero_mask);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = if right[i] != 0.0 {
left[i] / right[i]
} else {
f64::NAN
};
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_add_scalar_f64_avx2(data: &[f64], scalar: f64, result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm256_set1_pd(scalar);
let chunks = data.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let data_vec = _mm256_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm256_add_pd(data_vec, scalar_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..data.len() {
result[i] = data[i] + scalar;
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_multiply_scalar_f64_avx2(
data: &[f64],
scalar: f64,
result: &mut [f64],
) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm256_set1_pd(scalar);
let chunks = data.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let data_vec = _mm256_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm256_mul_pd(data_vec, scalar_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..data.len() {
result[i] = data[i] * scalar;
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_abs_f64_avx2(data: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let sign_mask = _mm256_set1_pd(-0.0);
let chunks = data.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let data_vec = _mm256_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm256_andnot_pd(sign_mask, data_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..data.len() {
result[i] = data[i].abs();
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_sqrt_f64_avx2(data: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = data.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let data_vec = _mm256_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm256_sqrt_pd(data_vec);
_mm256_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 4)..data.len() {
        // f64::sqrt already returns NaN for negative inputs, matching the
        // behavior of _mm256_sqrt_pd above.
        result[i] = data[i].sqrt();
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_compare_f64_avx2(
left: &[f64],
right: &[f64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm256_loadu_pd(right.as_ptr().add(offset));
        let cmp_result = match op {
            // Predicates are const generics on _mm256_cmp_pd; NEQ uses the
            // unordered form so NaN lanes compare "not equal", like Rust's `!=`.
            ComparisonOp::Equal => _mm256_cmp_pd::<_CMP_EQ_OQ>(left_vec, right_vec),
            ComparisonOp::NotEqual => _mm256_cmp_pd::<_CMP_NEQ_UQ>(left_vec, right_vec),
            ComparisonOp::LessThan => _mm256_cmp_pd::<_CMP_LT_OQ>(left_vec, right_vec),
            ComparisonOp::LessThanEqual => _mm256_cmp_pd::<_CMP_LE_OQ>(left_vec, right_vec),
            ComparisonOp::GreaterThan => _mm256_cmp_pd::<_CMP_GT_OQ>(left_vec, right_vec),
            ComparisonOp::GreaterThanEqual => _mm256_cmp_pd::<_CMP_GE_OQ>(left_vec, right_vec),
        };
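        // Each mask lane is all-ones (true) or all-zeros (false), so any
        // nonzero bit pattern means the predicate held for that lane.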
let mut temp_result = [0.0; 4];
_mm256_storeu_pd(temp_result.as_mut_ptr(), cmp_result);
for j in 0..4 {
result[offset + j] = temp_result[j].to_bits() != 0;
}
}
for i in (chunks * 4)..left.len() {
result[i] = match op {
            // Exact IEEE comparison, matching the SIMD lanes above.
            ComparisonOp::Equal => left[i] == right[i],
            ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_add_f64_sse2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm_add_pd(left_vec, right_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = left[i] + right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_subtract_f64_sse2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm_sub_pd(left_vec, right_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = left[i] - right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_multiply_f64_sse2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm_loadu_pd(right.as_ptr().add(offset));
let result_vec = _mm_mul_pd(left_vec, right_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = left[i] * right[i];
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_divide_f64_sse2(left: &[f64], right: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm_loadu_pd(right.as_ptr().add(offset));
        // Blend divide-by-zero lanes to NaN so the SIMD path matches the
        // scalar tail; SSE2 has no blend instruction, so emulate one with
        // and/andnot/or.
        let quotient = _mm_div_pd(left_vec, right_vec);
        let zero_mask = _mm_cmpeq_pd(right_vec, _mm_setzero_pd());
        let nan_vec = _mm_set1_pd(f64::NAN);
        let result_vec = _mm_or_pd(_mm_and_pd(zero_mask, nan_vec), _mm_andnot_pd(zero_mask, quotient));
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = if right[i] != 0.0 {
left[i] / right[i]
} else {
f64::NAN
};
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_add_scalar_f64_sse2(data: &[f64], scalar: f64, result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm_set1_pd(scalar);
let chunks = data.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let data_vec = _mm_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm_add_pd(data_vec, scalar_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..data.len() {
result[i] = data[i] + scalar;
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_multiply_scalar_f64_sse2(
data: &[f64],
scalar: f64,
result: &mut [f64],
) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm_set1_pd(scalar);
let chunks = data.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let data_vec = _mm_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm_mul_pd(data_vec, scalar_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..data.len() {
result[i] = data[i] * scalar;
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_abs_f64_sse2(data: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let sign_mask = _mm_set1_pd(-0.0);
let chunks = data.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let data_vec = _mm_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm_andnot_pd(sign_mask, data_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..data.len() {
result[i] = data[i].abs();
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_sqrt_f64_sse2(data: &[f64], result: &mut [f64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = data.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let data_vec = _mm_loadu_pd(data.as_ptr().add(offset));
let result_vec = _mm_sqrt_pd(data_vec);
_mm_storeu_pd(result.as_mut_ptr().add(offset), result_vec);
}
for i in (chunks * 2)..data.len() {
        // f64::sqrt already returns NaN for negative inputs, matching the
        // behavior of _mm_sqrt_pd above.
        result[i] = data[i].sqrt();
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_compare_f64_sse2(
left: &[f64],
right: &[f64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_pd(left.as_ptr().add(offset));
let right_vec = _mm_loadu_pd(right.as_ptr().add(offset));
let cmp_result = match op {
ComparisonOp::Equal => _mm_cmpeq_pd(left_vec, right_vec),
ComparisonOp::NotEqual => _mm_cmpneq_pd(left_vec, right_vec),
ComparisonOp::LessThan => _mm_cmplt_pd(left_vec, right_vec),
ComparisonOp::LessThanEqual => _mm_cmple_pd(left_vec, right_vec),
ComparisonOp::GreaterThan => _mm_cmpgt_pd(left_vec, right_vec),
ComparisonOp::GreaterThanEqual => _mm_cmpge_pd(left_vec, right_vec),
};
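        // As in the AVX2 path, each mask lane is all-ones or all-zeros.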
let mut temp_result = [0.0; 2];
_mm_storeu_pd(temp_result.as_mut_ptr(), cmp_result);
for j in 0..2 {
result[offset + j] = temp_result[j].to_bits() != 0;
}
}
for i in (chunks * 2)..left.len() {
result[i] = match op {
            // Exact IEEE comparison, matching the SIMD lanes above.
            ComparisonOp::Equal => left[i] == right[i],
            ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
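// 64-bit integer kernels. x86 has no saturating 64-bit SIMD arithmetic, so the
// vector adds/subtracts below wrap on overflow; only the scalar tails saturate.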
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_add_i64_avx2(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_si256(left.as_ptr().add(offset) as *const __m256i);
let right_vec = _mm256_loadu_si256(right.as_ptr().add(offset) as *const __m256i);
let result_vec = _mm256_add_epi64(left_vec, right_vec);
_mm256_storeu_si256(result.as_mut_ptr().add(offset) as *mut __m256i, result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = left[i].saturating_add(right[i]);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_subtract_i64_avx2(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_si256(left.as_ptr().add(offset) as *const __m256i);
let right_vec = _mm256_loadu_si256(right.as_ptr().add(offset) as *const __m256i);
let result_vec = _mm256_sub_epi64(left_vec, right_vec);
_mm256_storeu_si256(result.as_mut_ptr().add(offset) as *mut __m256i, result_vec);
}
for i in (chunks * 4)..left.len() {
result[i] = left[i].saturating_sub(right[i]);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_add_scalar_i64_avx2(data: &[i64], scalar: i64, result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm256_set1_epi64x(scalar);
let chunks = data.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let data_vec = _mm256_loadu_si256(data.as_ptr().add(offset) as *const __m256i);
let result_vec = _mm256_add_epi64(data_vec, scalar_vec);
_mm256_storeu_si256(result.as_mut_ptr().add(offset) as *mut __m256i, result_vec);
}
for i in (chunks * 4)..data.len() {
result[i] = data[i].saturating_add(scalar);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_compare_i64_avx2(
left: &[i64],
right: &[i64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 4;
for i in 0..chunks {
let offset = i * 4;
let left_vec = _mm256_loadu_si256(left.as_ptr().add(offset) as *const __m256i);
let right_vec = _mm256_loadu_si256(right.as_ptr().add(offset) as *const __m256i);
        let cmp_result = match op {
            ComparisonOp::Equal => _mm256_cmpeq_epi64(left_vec, right_vec),
            ComparisonOp::GreaterThan => _mm256_cmpgt_epi64(left_vec, right_vec),
            // AVX2 provides only 64-bit equality and signed greater-than;
            // handle the remaining predicates per lane in scalar code.
            _ => {
                for j in 0..4 {
                    let l_val = *left.get_unchecked(offset + j);
                    let r_val = *right.get_unchecked(offset + j);
                    result[offset + j] = match op {
                        ComparisonOp::NotEqual => l_val != r_val,
                        ComparisonOp::LessThan => l_val < r_val,
                        ComparisonOp::LessThanEqual => l_val <= r_val,
                        ComparisonOp::GreaterThanEqual => l_val >= r_val,
                        _ => unreachable!(),
                    };
                }
                continue;
            }
        };
        // Each mask lane is all-ones (true) or all-zeros (false).
        let mut temp_result = [0i64; 4];
        _mm256_storeu_si256(temp_result.as_mut_ptr() as *mut __m256i, cmp_result);
        for j in 0..4 {
            result[offset + j] = temp_result[j] != 0;
        }
}
for i in (chunks * 4)..left.len() {
result[i] = match op {
ComparisonOp::Equal => left[i] == right[i],
ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_add_i64_sse2(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_si128(left.as_ptr().add(offset) as *const __m128i);
let right_vec = _mm_loadu_si128(right.as_ptr().add(offset) as *const __m128i);
let result_vec = _mm_add_epi64(left_vec, right_vec);
_mm_storeu_si128(result.as_mut_ptr().add(offset) as *mut __m128i, result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = left[i].saturating_add(right[i]);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_subtract_i64_sse2(left: &[i64], right: &[i64], result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let chunks = left.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let left_vec = _mm_loadu_si128(left.as_ptr().add(offset) as *const __m128i);
let right_vec = _mm_loadu_si128(right.as_ptr().add(offset) as *const __m128i);
let result_vec = _mm_sub_epi64(left_vec, right_vec);
_mm_storeu_si128(result.as_mut_ptr().add(offset) as *mut __m128i, result_vec);
}
for i in (chunks * 2)..left.len() {
result[i] = left[i].saturating_sub(right[i]);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_add_scalar_i64_sse2(data: &[i64], scalar: i64, result: &mut [i64]) -> Result<()> {
use std::arch::x86_64::*;
let scalar_vec = _mm_set1_epi64x(scalar);
let chunks = data.len() / 2;
for i in 0..chunks {
let offset = i * 2;
let data_vec = _mm_loadu_si128(data.as_ptr().add(offset) as *const __m128i);
let result_vec = _mm_add_epi64(data_vec, scalar_vec);
_mm_storeu_si128(result.as_mut_ptr().add(offset) as *mut __m128i, result_vec);
}
for i in (chunks * 2)..data.len() {
result[i] = data[i].saturating_add(scalar);
}
Ok(())
}
#[cfg(target_arch = "x86_64")]
unsafe fn simd_compare_i64_sse2(
left: &[i64],
right: &[i64],
op: ComparisonOp,
result: &mut [bool],
) -> Result<()> {
    // SSE2 lacks 64-bit compare instructions (_mm_cmpeq_epi64 is SSE4.1 and
    // _mm_cmpgt_epi64 is SSE4.2), so this path is a plain scalar loop kept for
    // call-site symmetry with the AVX2 kernel.
for i in 0..left.len() {
result[i] = match op {
ComparisonOp::Equal => left[i] == right[i],
ComparisonOp::NotEqual => left[i] != right[i],
ComparisonOp::LessThan => left[i] < right[i],
ComparisonOp::LessThanEqual => left[i] <= right[i],
ComparisonOp::GreaterThan => left[i] > right[i],
ComparisonOp::GreaterThanEqual => left[i] >= right[i],
};
}
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simd_add_f64() {
let left = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
let right = vec![1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
let mut result = vec![0.0; 8];
simd_add_f64(&left, &right, &mut result).expect("operation should succeed");
let expected = vec![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0];
for i in 0..8 {
assert!((result[i] - expected[i]).abs() < 1e-10);
}
}
#[test]
fn test_simd_multiply_scalar_f64() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let mut result = vec![0.0; 5];
simd_multiply_scalar_f64(&data, 2.0, &mut result).expect("operation should succeed");
let expected = vec![2.0, 4.0, 6.0, 8.0, 10.0];
for i in 0..5 {
assert!((result[i] - expected[i]).abs() < 1e-10);
}
}
#[test]
fn test_simd_abs_f64() {
let data = vec![-1.0, 2.0, -3.0, 4.0, -5.0];
let mut result = vec![0.0; 5];
simd_abs_f64(&data, &mut result).expect("operation should succeed");
let expected = vec![1.0, 2.0, 3.0, 4.0, 5.0];
for i in 0..5 {
assert!((result[i] - expected[i]).abs() < 1e-10);
}
}
#[test]
fn test_simd_compare_f64() {
let left = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let right = vec![1.0, 1.0, 4.0, 4.0, 6.0];
let mut result = vec![false; 5];
simd_compare_f64(&left, &right, ComparisonOp::GreaterThan, &mut result)
.expect("operation should succeed");
let expected = vec![false, true, false, false, false];
assert_eq!(result, expected);
}
#[test]
fn test_simd_add_i64() {
let left = vec![1i64, 2, 3, 4, 5, 6, 7, 8];
let right = vec![1i64, 1, 1, 1, 1, 1, 1, 1];
let mut result = vec![0i64; 8];
simd_add_i64(&left, &right, &mut result).expect("operation should succeed");
let expected = vec![2i64, 3, 4, 5, 6, 7, 8, 9];
assert_eq!(result, expected);
}
#[test]
fn test_simd_compare_i64() {
let left = vec![1i64, 2, 3, 4, 5];
let right = vec![1i64, 1, 4, 4, 6];
let mut result = vec![false; 5];
simd_compare_i64(&left, &right, ComparisonOp::Equal, &mut result)
.expect("operation should succeed");
let expected = vec![true, false, false, true, false];
assert_eq!(result, expected);
}
#[test]
fn test_vector_length_mismatch() {
let left = vec![1.0, 2.0];
let right = vec![1.0];
let mut result = vec![0.0; 2];
        let res = simd_add_f64(&left, &right, &mut result);
        assert!(res.is_err());
}
}