use crate::types::Value;
/// Sums a slice of `i64`, dispatching at compile time to the best SIMD
/// kernel available for the target (AVX2 on x86_64, NEON on aarch64),
/// and to a scalar loop otherwise.
pub fn simd_sum_i64(values: &[i64]) -> i64 {
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return unsafe { simd_sum_i64_avx2(values) };

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return unsafe { simd_sum_i64_neon(values) };

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        simd_sum_i64_fallback(values)
    }
}
/// Sums a slice of `f64`, dispatching at compile time to the best SIMD
/// kernel available for the target, falling back to a scalar loop.
///
/// NOTE(review): the SIMD paths reorder floating-point additions, so
/// results may differ from a strictly sequential sum by rounding error.
pub fn simd_sum_f64(values: &[f64]) -> f64 {
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return unsafe { simd_sum_f64_avx2(values) };

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return unsafe { simd_sum_f64_neon(values) };

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        simd_sum_f64_fallback(values)
    }
}
/// Returns the minimum element of `values`, or `None` for an empty
/// slice. Dispatches at compile time to the best SIMD kernel for the
/// target architecture.
pub fn simd_min_i64(values: &[i64]) -> Option<i64> {
    // Screen out the empty case so the kernels never see it.
    if values.is_empty() {
        return None;
    }
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return Some(unsafe { simd_min_i64_avx2(values) });

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return Some(unsafe { simd_min_i64_neon(values) });

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        Some(simd_min_i64_fallback(values))
    }
}
/// Returns the maximum element of `values`, or `None` for an empty
/// slice. Dispatches at compile time to the best SIMD kernel for the
/// target architecture.
pub fn simd_max_i64(values: &[i64]) -> Option<i64> {
    // Screen out the empty case so the kernels never see it.
    if values.is_empty() {
        return None;
    }
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return Some(unsafe { simd_max_i64_avx2(values) });

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return Some(unsafe { simd_max_i64_neon(values) });

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        Some(simd_max_i64_fallback(values))
    }
}
/// Collects the indices (ascending) of every element equal to `target`,
/// dispatching at compile time to a SIMD kernel when one is available
/// for the target architecture, and to a scalar scan otherwise.
pub fn simd_filter_eq_i64(values: &[i64], target: i64) -> Vec<usize> {
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return unsafe { simd_filter_eq_i64_avx2(values, target) };

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return unsafe { simd_filter_eq_i64_neon(values, target) };

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        simd_filter_eq_i64_fallback(values, target)
    }
}
/// Collects the indices (ascending) of every element in the inclusive
/// range `[min, max]`, dispatching at compile time to a SIMD kernel
/// when one is available and to a scalar scan otherwise.
pub fn simd_filter_range_i64(values: &[i64], min: i64, max: i64) -> Vec<usize> {
    // SAFETY: each intrinsic path is only compiled when its required
    // target feature is statically enabled, so the call is sound.
    #[cfg(all(target_arch = "x86_64", target_feature = "avx2"))]
    return unsafe { simd_filter_range_i64_avx2(values, min, max) };

    #[cfg(all(
        target_arch = "aarch64",
        target_feature = "neon",
        not(all(target_arch = "x86_64", target_feature = "avx2"))
    ))]
    return unsafe { simd_filter_range_i64_neon(values, min, max) };

    #[cfg(not(any(
        all(target_arch = "x86_64", target_feature = "avx2"),
        all(target_arch = "aarch64", target_feature = "neon")
    )))]
    {
        simd_filter_range_i64_fallback(values, min, max)
    }
}
/// Scalar sum, used when no SIMD path is compiled in.
fn simd_sum_i64_fallback(values: &[i64]) -> i64 {
    values.iter().fold(0, |acc, &v| acc + v)
}
/// Scalar sum for `f64`, used when no SIMD path is compiled in.
fn simd_sum_f64_fallback(values: &[f64]) -> f64 {
    values.iter().fold(0.0, |acc, &v| acc + v)
}
/// Scalar minimum; yields `i64::MAX` for an empty slice (the public
/// dispatcher screens out the empty case before surfacing a result).
fn simd_min_i64_fallback(values: &[i64]) -> i64 {
    values.iter().copied().fold(i64::MAX, i64::min)
}
/// Scalar maximum; yields `i64::MIN` for an empty slice (the public
/// dispatcher screens out the empty case before surfacing a result).
fn simd_max_i64_fallback(values: &[i64]) -> i64 {
    values.iter().copied().fold(i64::MIN, i64::max)
}
/// Scalar scan collecting the positions of elements equal to `target`.
fn simd_filter_eq_i64_fallback(values: &[i64], target: i64) -> Vec<usize> {
    values
        .iter()
        .enumerate()
        .filter(|&(_, &v)| v == target)
        .map(|(i, _)| i)
        .collect()
}
/// Scalar scan collecting the positions of elements inside the
/// inclusive range `[min, max]` (empty result when `min > max`).
fn simd_filter_range_i64_fallback(values: &[i64], min: i64, max: i64) -> Vec<usize> {
    values
        .iter()
        .enumerate()
        .filter(|&(_, &v)| (min..=max).contains(&v))
        .map(|(i, _)| i)
        .collect()
}
/// Sums `values` using 256-bit AVX2 additions, four `i64` lanes at a
/// time, then adds the 0..=3 leftover tail elements with scalar code.
///
/// Fixes: drops the unused `remainder` local (dead-code warning) and
/// the redundant inner `#[cfg]` already implied by the outer gate; the
/// scalar tail now iterates a subslice instead of indexing.
///
/// NOTE(review): `_mm256_add_epi64` wraps on overflow while the scalar
/// tail `+` panics in debug builds — TODO confirm intended semantics.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 (guaranteed here by
/// the dispatcher's compile-time `target_feature = "avx2"` gate).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_sum_i64_avx2(values: &[i64]) -> i64 {
    use std::arch::x86_64::*;
    let chunks = values.len() / 4;
    let mut sum = _mm256_setzero_si256();
    for i in 0..chunks {
        // Unaligned load: slice data carries no 32-byte alignment guarantee.
        let data = _mm256_loadu_si256(values.as_ptr().add(i * 4) as *const __m256i);
        sum = _mm256_add_epi64(sum, data);
    }
    // Horizontal reduction: spill the four lanes and sum them.
    let mut lanes = [0i64; 4];
    _mm256_storeu_si256(lanes.as_mut_ptr() as *mut __m256i, sum);
    let mut total = lanes.iter().sum::<i64>();
    // Scalar tail for elements that don't fill a full vector.
    for &v in &values[chunks * 4..] {
        total += v;
    }
    total
}
/// Sums `values` using 256-bit AVX2 additions, four `f64` lanes at a
/// time, then adds the 0..=3 leftover tail elements with scalar code.
///
/// Fixes: drops the unused `remainder` local (dead-code warning) and
/// the redundant inner `#[cfg]` already implied by the outer gate; the
/// scalar tail now iterates a subslice instead of indexing.
///
/// NOTE(review): vectorization reorders floating-point additions, so
/// results may differ from a sequential sum by rounding error.
///
/// # Safety
/// The caller must ensure the CPU supports AVX2 (guaranteed here by
/// the dispatcher's compile-time `target_feature = "avx2"` gate).
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_sum_f64_avx2(values: &[f64]) -> f64 {
    use std::arch::x86_64::*;
    let chunks = values.len() / 4;
    let mut sum = _mm256_setzero_pd();
    for i in 0..chunks {
        // Unaligned load: slice data carries no 32-byte alignment guarantee.
        let data = _mm256_loadu_pd(values.as_ptr().add(i * 4));
        sum = _mm256_add_pd(sum, data);
    }
    // Horizontal reduction: spill the four lanes and sum them.
    let mut lanes = [0.0f64; 4];
    _mm256_storeu_pd(lanes.as_mut_ptr(), sum);
    let mut total = lanes.iter().sum::<f64>();
    // Scalar tail for elements that don't fill a full vector.
    for &v in &values[chunks * 4..] {
        total += v;
    }
    total
}
/// Minimum of `values` using AVX2, four `i64` lanes per iteration plus
/// a scalar tail. Returns `i64::MAX` for an empty slice (the public
/// dispatcher screens out the empty case before calling this).
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_min_i64_avx2(values: &[i64]) -> i64 {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    let len = values.len();
    if len == 0 {
        return i64::MAX;
    }
    // Seed every lane with the first element so no artificial sentinel
    // ever wins a comparison.
    let mut min_vec = _mm256_set1_epi64x(values[0]);
    let chunks = len / 4;
    for i in 0..chunks {
        let idx = i * 4;
        // Unaligned load: slices carry no 32-byte alignment guarantee.
        let data = _mm256_loadu_si256(values.as_ptr().add(idx) as *const __m256i);
        // Lane = all-ones where the current minimum exceeds the
        // candidate, i.e. where `data` holds the new minimum.
        let cmp = _mm256_cmpgt_epi64(min_vec, data);
        // blendv selects `data` bytes where the mask's high bit is set;
        // the 64-bit compare yields all-ones/all-zero lanes, so this is
        // effectively a per-lane select.
        min_vec = _mm256_blendv_epi8(min_vec, data, cmp);
    }
    // Horizontal reduction of the four per-lane minima.
    let mut result = [0i64; 4];
    _mm256_storeu_si256(result.as_mut_ptr() as *mut __m256i, min_vec);
    let mut min_val = result.iter().min().copied().unwrap_or(i64::MAX);
    // Scalar tail for the 0..=3 leftover elements.
    for i in (chunks * 4)..len {
        min_val = min_val.min(values[i]);
    }
    min_val
}
/// Maximum of `values` using AVX2, four `i64` lanes per iteration plus
/// a scalar tail. Returns `i64::MIN` for an empty slice (the public
/// dispatcher screens out the empty case before calling this).
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_max_i64_avx2(values: &[i64]) -> i64 {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    let len = values.len();
    if len == 0 {
        return i64::MIN;
    }
    // Seed every lane with the first element so no artificial sentinel
    // ever wins a comparison.
    let mut max_vec = _mm256_set1_epi64x(values[0]);
    let chunks = len / 4;
    for i in 0..chunks {
        let idx = i * 4;
        // Unaligned load: slices carry no 32-byte alignment guarantee.
        let data = _mm256_loadu_si256(values.as_ptr().add(idx) as *const __m256i);
        // Lane = all-ones where the candidate exceeds the current
        // maximum, i.e. where `data` holds the new maximum.
        let cmp = _mm256_cmpgt_epi64(data, max_vec);
        // Per-lane select (mask lanes are all-ones/all-zero, so the
        // byte-granular blendv behaves like a 64-bit select).
        max_vec = _mm256_blendv_epi8(max_vec, data, cmp);
    }
    // Horizontal reduction of the four per-lane maxima.
    let mut result = [0i64; 4];
    _mm256_storeu_si256(result.as_mut_ptr() as *mut __m256i, max_vec);
    let mut max_val = result.iter().max().copied().unwrap_or(i64::MIN);
    // Scalar tail for the 0..=3 leftover elements.
    for i in (chunks * 4)..len {
        max_val = max_val.max(values[i]);
    }
    max_val
}
/// AVX2 scan for elements equal to `target`; returns their indices in
/// ascending order. Processes four lanes per iteration, then a scalar
/// tail.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_filter_eq_i64_avx2(values: &[i64], target: i64) -> Vec<usize> {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    let len = values.len();
    // Capacity len/4 looks like a match-density heuristic — TODO confirm.
    let mut results = Vec::with_capacity(len / 4);
    let target_vec = _mm256_set1_epi64x(target);
    let chunks = len / 4;
    for i in 0..chunks {
        let idx = i * 4;
        // Unaligned load: slices carry no 32-byte alignment guarantee.
        let data = _mm256_loadu_si256(values.as_ptr().add(idx) as *const __m256i);
        // Lane = all-ones where data == target.
        let cmp = _mm256_cmpeq_epi64(data, target_vec);
        // movemask_pd packs the sign bit of each 64-bit lane into the
        // low four bits — one bit per lane.
        let mask = _mm256_movemask_pd(_mm256_castsi256_pd(cmp));
        if mask & 0x1 != 0 { results.push(idx); }
        if mask & 0x2 != 0 { results.push(idx + 1); }
        if mask & 0x4 != 0 { results.push(idx + 2); }
        if mask & 0x8 != 0 { results.push(idx + 3); }
    }
    // Scalar tail for the 0..=3 leftover elements.
    for i in (chunks * 4)..len {
        if values[i] == target {
            results.push(i);
        }
    }
    results
}
/// AVX2 scan for elements in the inclusive range `[min, max]`; returns
/// their indices in ascending order. AVX2 has no 64-bit `>=`/`<=`
/// compare, so each bound is built as `(> OR ==)` of two intrinsics.
///
/// # Safety
/// Caller must ensure the CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn simd_filter_range_i64_avx2(values: &[i64], min: i64, max: i64) -> Vec<usize> {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "x86_64")]
    use std::arch::x86_64::*;
    let len = values.len();
    // Capacity len/2 looks like a match-density heuristic — TODO confirm.
    let mut results = Vec::with_capacity(len / 2);
    let min_vec = _mm256_set1_epi64x(min);
    let max_vec = _mm256_set1_epi64x(max);
    let chunks = len / 4;
    for i in 0..chunks {
        let idx = i * 4;
        // Unaligned load: slices carry no 32-byte alignment guarantee.
        let data = _mm256_loadu_si256(values.as_ptr().add(idx) as *const __m256i);
        // data >= min, composed as (data > min) OR (data == min).
        let cmp_min = _mm256_or_si256(
            _mm256_cmpgt_epi64(data, min_vec),
            _mm256_cmpeq_epi64(data, min_vec)
        );
        // data <= max, composed as (max > data) OR (data == max).
        let cmp_max = _mm256_or_si256(
            _mm256_cmpgt_epi64(max_vec, data),
            _mm256_cmpeq_epi64(data, max_vec)
        );
        // Both bounds must hold.
        let cmp = _mm256_and_si256(cmp_min, cmp_max);
        // One mask bit per 64-bit lane (sign bit via the pd cast).
        let mask = _mm256_movemask_pd(_mm256_castsi256_pd(cmp));
        if mask & 0x1 != 0 { results.push(idx); }
        if mask & 0x2 != 0 { results.push(idx + 1); }
        if mask & 0x4 != 0 { results.push(idx + 2); }
        if mask & 0x8 != 0 { results.push(idx + 3); }
    }
    // Scalar tail for the 0..=3 leftover elements.
    for i in (chunks * 4)..len {
        if values[i] >= min && values[i] <= max {
            results.push(i);
        }
    }
    results
}
/// Sums `values` using 128-bit NEON additions, two `i64` lanes at a
/// time, then adds the (at most one) tail element with scalar code.
///
/// Fixes: drops the unused `remainder` local (dead-code warning) and
/// the redundant inner `#[cfg]` already implied by the outer gate; the
/// scalar tail now iterates a subslice instead of indexing.
///
/// NOTE(review): `vaddq_s64` wraps on overflow while the scalar tail
/// `+` panics in debug builds — TODO confirm intended semantics.
///
/// # Safety
/// Caller must ensure NEON is available (statically guaranteed by the
/// dispatcher's `target_feature = "neon"` gate).
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_sum_i64_neon(values: &[i64]) -> i64 {
    use std::arch::aarch64::*;
    let chunks = values.len() / 2;
    let mut sum = vdupq_n_s64(0);
    for i in 0..chunks {
        let data = vld1q_s64(values.as_ptr().add(i * 2));
        sum = vaddq_s64(sum, data);
    }
    // Horizontal reduction of the two lanes.
    let mut total = vgetq_lane_s64(sum, 0) + vgetq_lane_s64(sum, 1);
    // Scalar tail (at most one element).
    for &v in &values[chunks * 2..] {
        total += v;
    }
    total
}
/// Sums `values` using 128-bit NEON additions, two `f64` lanes at a
/// time, then adds the (at most one) tail element with scalar code.
///
/// # Safety
/// Caller must ensure NEON is available.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_sum_f64_neon(values: &[f64]) -> f64 {
    use std::arch::aarch64::*;
    let pairs = values.len() / 2;
    let mut acc = vdupq_n_f64(0.0);
    for p in 0..pairs {
        let lane_data = vld1q_f64(values.as_ptr().add(p * 2));
        acc = vaddq_f64(acc, lane_data);
    }
    // Reduce the two lanes, then fold in any leftover tail element.
    let lanes = [vgetq_lane_f64(acc, 0), vgetq_lane_f64(acc, 1)];
    let mut total = lanes.iter().sum::<f64>();
    for &v in values.iter().skip(pairs * 2) {
        total += v;
    }
    total
}
/// NEON "implementation" of 64-bit minimum — currently just delegates
/// to the scalar fallback. NOTE(review): NEON's vmin family only
/// covers 8/16/32-bit lanes (no 64-bit integer min), which is
/// presumably why no vectorized path exists here — confirm.
///
/// # Safety
/// No unsafe operations are performed; the `unsafe fn` signature only
/// mirrors the other arch-specific kernels.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_min_i64_neon(values: &[i64]) -> i64 {
    simd_min_i64_fallback(values)
}
/// NEON "implementation" of 64-bit maximum — currently just delegates
/// to the scalar fallback. NOTE(review): NEON's vmax family only
/// covers 8/16/32-bit lanes (no 64-bit integer max), which is
/// presumably why no vectorized path exists here — confirm.
///
/// # Safety
/// No unsafe operations are performed; the `unsafe fn` signature only
/// mirrors the other arch-specific kernels.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_max_i64_neon(values: &[i64]) -> i64 {
    simd_max_i64_fallback(values)
}
/// NEON scan for elements equal to `target`; returns their indices in
/// ascending order. Processes two lanes per iteration, then a scalar
/// tail of at most one element.
///
/// # Safety
/// Caller must ensure NEON is available.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_filter_eq_i64_neon(values: &[i64], target: i64) -> Vec<usize> {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "aarch64")]
    use std::arch::aarch64::*;
    let len = values.len();
    // Capacity len/4 looks like a match-density heuristic — TODO confirm.
    let mut results = Vec::with_capacity(len / 4);
    let target_vec = vdupq_n_s64(target);
    let chunks = len / 2;
    for i in 0..chunks {
        let idx = i * 2;
        let data = vld1q_s64(values.as_ptr().add(idx));
        // vceqq_s64 yields all-ones (u64) per equal lane, zero otherwise.
        let cmp = vceqq_s64(data, target_vec);
        let result_arr = [vgetq_lane_u64(cmp, 0), vgetq_lane_u64(cmp, 1)];
        if result_arr[0] != 0 {
            results.push(idx);
        }
        if result_arr[1] != 0 {
            results.push(idx + 1);
        }
    }
    // Scalar tail (at most one element).
    for i in (chunks * 2)..len {
        if values[i] == target {
            results.push(i);
        }
    }
    results
}
/// NEON scan for elements in the inclusive range `[min, max]`; returns
/// their indices in ascending order. Processes two lanes per
/// iteration, then a scalar tail of at most one element.
///
/// # Safety
/// Caller must ensure NEON is available.
#[cfg(target_arch = "aarch64")]
#[target_feature(enable = "neon")]
unsafe fn simd_filter_range_i64_neon(values: &[i64], min: i64, max: i64) -> Vec<usize> {
    // Redundant under the outer #[cfg], but harmless.
    #[cfg(target_arch = "aarch64")]
    use std::arch::aarch64::*;
    let len = values.len();
    // Capacity len/2 looks like a match-density heuristic — TODO confirm.
    let mut results = Vec::with_capacity(len / 2);
    let min_vec = vdupq_n_s64(min);
    let max_vec = vdupq_n_s64(max);
    let chunks = len / 2;
    for i in 0..chunks {
        let idx = i * 2;
        let data = vld1q_s64(values.as_ptr().add(idx));
        // Per-lane all-ones masks for data >= min and data <= max.
        let cmp_min = vcgeq_s64(data, min_vec);
        let cmp_max = vcleq_s64(data, max_vec);
        // Both bounds must hold.
        let cmp = vandq_u64(cmp_min, cmp_max);
        if vgetq_lane_u64(cmp, 0) != 0 {
            results.push(idx);
        }
        if vgetq_lane_u64(cmp, 1) != 0 {
            results.push(idx + 1);
        }
    }
    // Scalar tail (at most one element).
    for i in (chunks * 2)..len {
        if values[i] >= min && values[i] <= max {
            results.push(i);
        }
    }
    results
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_simd_sum_i64() {
        let values = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
        let sum = simd_sum_i64(&values);
        assert_eq!(sum, 55);
    }

    // Added: empty input and lengths that exercise the scalar tail
    // (non-multiples of the vector width).
    #[test]
    fn test_simd_sum_i64_edge_cases() {
        assert_eq!(simd_sum_i64(&[]), 0);
        assert_eq!(simd_sum_i64(&[7]), 7);
        assert_eq!(simd_sum_i64(&[1, 2, 3]), 6);
        assert_eq!(simd_sum_i64(&[-1, 1, -2, 2, -3]), -3);
    }

    #[test]
    fn test_simd_sum_f64() {
        let values = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let sum = simd_sum_f64(&values);
        assert!((sum - 15.0).abs() < 1e-10);
    }

    // Added: empty and tail-length f64 inputs.
    #[test]
    fn test_simd_sum_f64_edge_cases() {
        assert_eq!(simd_sum_f64(&[]), 0.0);
        assert!((simd_sum_f64(&[0.5]) - 0.5).abs() < 1e-10);
    }

    #[test]
    fn test_simd_min_max() {
        let values = vec![5, 2, 9, 1, 7, 3];
        assert_eq!(simd_min_i64(&values), Some(1));
        assert_eq!(simd_max_i64(&values), Some(9));
    }

    // Added: empty slices must yield None; single element round-trips.
    #[test]
    fn test_simd_min_max_edge_cases() {
        assert_eq!(simd_min_i64(&[]), None);
        assert_eq!(simd_max_i64(&[]), None);
        assert_eq!(simd_min_i64(&[42]), Some(42));
        assert_eq!(simd_max_i64(&[42]), Some(42));
        assert_eq!(simd_min_i64(&[-5, 0, 5]), Some(-5));
        assert_eq!(simd_max_i64(&[-5, 0, 5]), Some(5));
    }

    #[test]
    fn test_simd_filter_eq() {
        let values = vec![1, 2, 3, 2, 4, 2, 5];
        let indices = simd_filter_eq_i64(&values, 2);
        assert_eq!(indices, vec![1, 3, 5]);
    }

    // Added: no matches and empty input produce empty index lists.
    #[test]
    fn test_simd_filter_eq_edge_cases() {
        assert_eq!(simd_filter_eq_i64(&[], 1), Vec::<usize>::new());
        assert_eq!(simd_filter_eq_i64(&[1, 3, 5], 2), Vec::<usize>::new());
    }

    #[test]
    fn test_simd_filter_range() {
        let values = vec![1, 5, 10, 15, 20, 25, 30];
        let indices = simd_filter_range_i64(&values, 10, 20);
        assert_eq!(indices, vec![2, 3, 4]);
    }

    // Added: inclusive bounds and an inverted (empty) range.
    #[test]
    fn test_simd_filter_range_edge_cases() {
        assert_eq!(simd_filter_range_i64(&[], 0, 10), Vec::<usize>::new());
        assert_eq!(simd_filter_range_i64(&[3, 4, 5], 3, 5), vec![0, 1, 2]);
        assert_eq!(simd_filter_range_i64(&[1, 2, 3], 5, 3), Vec::<usize>::new());
    }

    #[test]
    fn test_large_dataset() {
        let values: Vec<i64> = (0..100_000).collect();
        let sum = simd_sum_i64(&values);
        assert_eq!(sum, 4_999_950_000);
    }
}