#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
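/// Runtime-dispatched SIMD helpers for `f64` slices.
///
/// Each public method detects CPU features at runtime (AVX2 on x86_64, NEON on
/// aarch64 where implemented) and falls back to a portable scalar path otherwise.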
pub struct SimdOps;
impl SimdOps {
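/// Sums all elements of `data`, using the AVX2 path on x86_64 when available.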
#[inline]
pub fn sum_f64(data: &[f64]) -> f64 {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
unsafe { Self::sum_f64_avx2(data) }
} else {
Self::sum_f64_scalar(data)
}
}
#[cfg(not(target_arch = "x86_64"))]
{
Self::sum_f64_scalar(data)
}
}
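/// AVX2 path: accumulates four lanes at a time with unaligned loads, then adds
/// the horizontal sum of the lanes to the scalar sum of the `len % 4` tail.
/// Lane-wise accumulation may round differently from the purely scalar path.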
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn sum_f64_avx2(data: &[f64]) -> f64 {
let mut sum = _mm256_setzero_pd();
let chunks = data.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
let values = _mm256_loadu_pd(chunk.as_ptr());
sum = _mm256_add_pd(sum, values);
}
let mut result = [0.0; 4];
_mm256_storeu_pd(result.as_mut_ptr(), sum);
let simd_sum = result.iter().sum::<f64>();
simd_sum + remainder.iter().sum::<f64>()
}
#[inline]
fn sum_f64_scalar(data: &[f64]) -> f64 {
data.iter().sum()
}
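/// Arithmetic mean of `data`; returns 0.0 for an empty slice.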
#[inline]
pub fn mean_f64(data: &[f64]) -> f64 {
if data.is_empty() {
return 0.0;
}
Self::sum_f64(data) / data.len() as f64
}
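/// Dot product of `a` and `b`; panics if the slices differ in length.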
#[inline]
pub fn dot_product_f64(a: &[f64], b: &[f64]) -> f64 {
assert_eq!(a.len(), b.len(), "Arrays must have same length");
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
unsafe { Self::dot_product_f64_avx2(a, b) }
} else {
Self::dot_product_f64_scalar(a, b)
}
}
#[cfg(not(target_arch = "x86_64"))]
{
Self::dot_product_f64_scalar(a, b)
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn dot_product_f64_avx2(a: &[f64], b: &[f64]) -> f64 {
let mut sum = _mm256_setzero_pd();
let chunks_a = a.chunks_exact(4);
let chunks_b = b.chunks_exact(4);
let remainder_a = chunks_a.remainder();
let remainder_b = chunks_b.remainder();
for (chunk_a, chunk_b) in chunks_a.zip(chunks_b) {
let va = _mm256_loadu_pd(chunk_a.as_ptr());
let vb = _mm256_loadu_pd(chunk_b.as_ptr());
let prod = _mm256_mul_pd(va, vb);
sum = _mm256_add_pd(sum, prod);
}
let mut result = [0.0; 4];
_mm256_storeu_pd(result.as_mut_ptr(), sum);
let simd_sum = result.iter().sum::<f64>();
let remainder_sum: f64 = remainder_a
.iter()
.zip(remainder_b.iter())
.map(|(x, y)| x * y)
.sum();
simd_sum + remainder_sum
}
#[inline]
fn dot_product_f64_scalar(a: &[f64], b: &[f64]) -> f64 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
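/// Population variance (divides by `n`, not `n - 1`); returns 0.0 for fewer
/// than two samples. Squared deviations are collected into a temporary buffer
/// so the SIMD summation path can be reused.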
pub fn variance_f64(data: &[f64]) -> f64 {
if data.len() < 2 {
return 0.0;
}
let mean = Self::mean_f64(data);
let squared_diffs: Vec<f64> = data.iter().map(|&x| (x - mean).powi(2)).collect();
Self::sum_f64(&squared_diffs) / data.len() as f64
}
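/// Element-wise addition `result[i] = a[i] + b[i]`; panics if the three slices
/// differ in length.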
#[inline]
pub fn add_f64(a: &[f64], b: &[f64], result: &mut [f64]) {
assert_eq!(a.len(), b.len(), "Arrays must have same length");
assert_eq!(a.len(), result.len(), "Result array must have same length");
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") {
unsafe { Self::add_f64_avx2(a, b, result) }
} else {
Self::add_f64_scalar(a, b, result)
}
}
#[cfg(not(target_arch = "x86_64"))]
{
Self::add_f64_scalar(a, b, result)
}
}
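/// AVX2 path: adds four-element chunks of `a` and `b` into `result` in
/// lockstep, then finishes the tail with scalar additions.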
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn add_f64_avx2(a: &[f64], b: &[f64], result: &mut [f64]) {
let chunks_a = a.chunks_exact(4);
let chunks_b = b.chunks_exact(4);
let remainder_a = chunks_a.remainder();
let remainder_b = chunks_b.remainder();
let remainder_len = remainder_a.len();
let mut chunks_r = result.chunks_exact_mut(4);
for ((chunk_a, chunk_b), chunk_r) in chunks_a.zip(chunks_b).zip(&mut chunks_r) {
let va = _mm256_loadu_pd(chunk_a.as_ptr());
let vb = _mm256_loadu_pd(chunk_b.as_ptr());
let sum = _mm256_add_pd(va, vb);
_mm256_storeu_pd(chunk_r.as_mut_ptr(), sum);
}
let remainder_r = chunks_r.into_remainder();
for i in 0..remainder_len {
remainder_r[i] = remainder_a[i] + remainder_b[i];
}
}
#[inline]
fn add_f64_scalar(a: &[f64], b: &[f64], result: &mut [f64]) {
for i in 0..a.len() {
result[i] = a[i] + b[i];
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_sum() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let sum = SimdOps::sum_f64(&data);
assert_eq!(sum, 15.0);
}
#[test]
fn test_mean() {
let data = vec![1.0, 2.0, 3.0, 4.0];
let mean = SimdOps::mean_f64(&data);
assert_eq!(mean, 2.5);
}
#[test]
fn test_dot_product() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![2.0, 3.0, 4.0, 5.0];
let dot = SimdOps::dot_product_f64(&a, &b);
assert_eq!(dot, 40.0);
}
#[test]
fn test_variance() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
let variance = SimdOps::variance_f64(&data);
assert!((variance - 2.0).abs() < 1e-10);
}
#[test]
fn test_add() {
let a = vec![1.0, 2.0, 3.0, 4.0];
let b = vec![5.0, 6.0, 7.0, 8.0];
let mut result = vec![0.0; 4];
SimdOps::add_f64(&a, &b, &mut result);
assert_eq!(result, vec![6.0, 8.0, 10.0, 12.0]);
}
#[test]
fn test_simd_with_non_aligned_length() {
let data = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
let sum = SimdOps::sum_f64(&data);
assert_eq!(sum, 28.0);
}
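    // Added checks: exercise the remainder handling of the SIMD paths with
    // lengths that are not a multiple of the vector width. Values are illustrative.
    #[test]
    fn test_dot_product_with_non_aligned_length() {
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let b = vec![1.0, 2.0, 3.0, 4.0, 5.0];
        let dot = SimdOps::dot_product_f64(&a, &b);
        assert_eq!(dot, 55.0);
    }
    #[test]
    fn test_add_with_non_aligned_length() {
        let a = vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0];
        let b = vec![10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0];
        let mut result = vec![0.0; 7];
        SimdOps::add_f64(&a, &b, &mut result);
        assert_eq!(result, vec![11.0, 22.0, 33.0, 44.0, 55.0, 66.0, 77.0]);
    }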
}
impl SimdOps {
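/// Natural-visibility test: returns `true` when every sample in `intermediate`
/// (the values starting at index `start_idx + 1`) lies strictly below the
/// straight line joining `(start_idx, yi)` and `(end_idx, yj)`.
/// Assumes `end_idx > start_idx`.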
#[inline]
pub fn is_visible_natural_simd(yi: f64, yj: f64, intermediate: &[f64], start_idx: usize, end_idx: usize) -> bool {
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma") && intermediate.len() >= 4 {
unsafe { Self::is_visible_natural_avx2(yi, yj, intermediate, start_idx, end_idx) }
} else {
Self::is_visible_natural_scalar(yi, yj, intermediate, start_idx, end_idx, start_idx + 1)
}
}
#[cfg(target_arch = "aarch64")]
{
if std::arch::is_aarch64_feature_detected!("neon") && intermediate.len() >= 2 {
unsafe { Self::is_visible_natural_neon(yi, yj, intermediate, start_idx, end_idx) }
} else {
Self::is_visible_natural_scalar(yi, yj, intermediate, start_idx, end_idx, start_idx + 1)
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
Self::is_visible_natural_scalar(yi, yj, intermediate, start_idx, end_idx, start_idx + 1)
}
}
#[cfg(target_arch = "x86_64")]
// `_mm256_fmadd_pd` below requires FMA in addition to AVX2.
#[target_feature(enable = "avx2", enable = "fma")]
unsafe fn is_visible_natural_avx2(
yi: f64,
yj: f64,
intermediate: &[f64],
start_idx: usize,
end_idx: usize,
) -> bool {
let yi_vec = _mm256_set1_pd(yi);
let slope = (yj - yi) / ((end_idx - start_idx) as f64);
let slope_vec = _mm256_set1_pd(slope);
let chunks = intermediate.chunks_exact(4);
let remainder = chunks.remainder();
for (idx, chunk) in chunks.enumerate() {
let k_base = start_idx + 1 + idx * 4;
let yk = _mm256_loadu_pd(chunk.as_ptr());
let offsets = _mm256_set_pd(
    (k_base + 3 - start_idx) as f64,
    (k_base + 2 - start_idx) as f64,
    (k_base + 1 - start_idx) as f64,
    (k_base - start_idx) as f64,
);
let line_height = _mm256_fmadd_pd(slope_vec, offsets, yi_vec);
let cmp = _mm256_cmp_pd::<_CMP_GE_OQ>(yk, line_height);
let mask = _mm256_movemask_pd(cmp);
if mask != 0 {
    return false;
}
}
let offset = intermediate.len() - remainder.len();
let remainder_start_idx = start_idx + 1 + offset;
Self::is_visible_natural_scalar(yi, yj, remainder, start_idx, end_idx, remainder_start_idx)
}
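/// Scalar visibility check. `actual_start_idx` is the absolute index of
/// `intermediate[0]`; the slope is still computed from the original endpoint
/// indices, which lets the AVX2/NEON paths delegate their remainders here.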
fn is_visible_natural_scalar(
yi: f64,
yj: f64,
intermediate: &[f64],
original_start_idx: usize,
original_end_idx: usize,
actual_start_idx: usize,
) -> bool {
let slope = (yj - yi) / ((original_end_idx - original_start_idx) as f64);
for (offset, &yk) in intermediate.iter().enumerate() {
let k = actual_start_idx + offset;
let line_height = yi + slope * ((k - original_start_idx) as f64);
if yk >= line_height {
return false;
}
}
true
}
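/// Horizontal-visibility test: returns `true` when every sample in
/// `intermediate` is strictly below `min(yi, yj)`.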
pub fn is_visible_horizontal_simd(yi: f64, yj: f64, intermediate: &[f64]) -> bool {
let min_h = yi.min(yj);
#[cfg(target_arch = "x86_64")]
{
if is_x86_feature_detected!("avx2") && intermediate.len() >= 4 {
unsafe { Self::is_visible_horizontal_avx2(min_h, intermediate) }
} else {
intermediate.iter().all(|&yk| yk < min_h)
}
}
#[cfg(target_arch = "aarch64")]
{
if std::arch::is_aarch64_feature_detected!("neon") && intermediate.len() >= 2 {
unsafe { Self::is_visible_horizontal_neon(min_h, intermediate) }
} else {
intermediate.iter().all(|&yk| yk < min_h)
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
intermediate.iter().all(|&yk| yk < min_h)
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn is_visible_horizontal_avx2(min_h: f64, intermediate: &[f64]) -> bool {
let min_vec = _mm256_set1_pd(min_h);
let chunks = intermediate.chunks_exact(4);
let remainder = chunks.remainder();
for chunk in chunks {
let values = _mm256_loadu_pd(chunk.as_ptr());
let cmp = _mm256_cmp_pd::<_CMP_GE_OQ>(values, min_vec);
let mask = _mm256_movemask_pd(cmp);
if mask != 0 {
    return false;
}
}
remainder.iter().all(|&yk| yk < min_h)
}
}
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
#[cfg(target_arch = "aarch64")]
impl SimdOps {
#[target_feature(enable = "neon")]
unsafe fn is_visible_natural_neon(
yi: f64,
yj: f64,
intermediate: &[f64],
start_idx: usize,
end_idx: usize,
) -> bool {
let slope = (yj - yi) / ((end_idx - start_idx) as f64);
let yi_vec = vdupq_n_f64(yi);
let slope_vec = vdupq_n_f64(slope);
let chunks = intermediate.chunks_exact(2);
let remainder = chunks.remainder();
for (idx, chunk) in chunks.enumerate() {
let k_base = start_idx + 1 + idx * 2;
let yk = vld1q_f64(chunk.as_ptr());
let offset0 = (k_base - start_idx) as f64;
let offset1 = (k_base + 1 - start_idx) as f64;
let offsets = vsetq_lane_f64::<1>(offset1, vdupq_n_f64(offset0));
let slope_times_offset = vmulq_f64(slope_vec, offsets);
let line_height = vaddq_f64(yi_vec, slope_times_offset);
let cmp = vcgeq_f64(yk, line_height);
let mask0 = vgetq_lane_u64::<0>(cmp);
let mask1 = vgetq_lane_u64::<1>(cmp);
if mask0 != 0 || mask1 != 0 {
    return false;
}
}
let offset = intermediate.len() - remainder.len();
let remainder_start_idx = start_idx + 1 + offset;
Self::is_visible_natural_scalar(yi, yj, remainder, start_idx, end_idx, remainder_start_idx)
}
#[target_feature(enable = "neon")]
unsafe fn is_visible_horizontal_neon(min_h: f64, intermediate: &[f64]) -> bool {
let min_vec = vdupq_n_f64(min_h);
let chunks = intermediate.chunks_exact(2);
let remainder = chunks.remainder();
for chunk in chunks {
let values = vld1q_f64(chunk.as_ptr());
let cmp = vcgeq_f64(values, min_vec);
// A set bit in either comparison lane means some intermediate sample
// reaches or exceeds min_h, so the pair is not visible.
let mask0 = vgetq_lane_u64::<0>(cmp);
let mask1 = vgetq_lane_u64::<1>(cmp);
if mask0 != 0 || mask1 != 0 {
    return false;
}
}
remainder.iter().all(|&yk| yk < min_h)
}
}
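// Added test sketches for the visibility helpers. Expected results follow
// directly from the scalar definitions above; the longer inputs also exercise
// the SIMD chunk-plus-remainder paths on CPUs that support them.
#[cfg(test)]
mod visibility_tests {
    use super::*;
    #[test]
    fn test_natural_visibility() {
        // The line from (0, 0.0) to (8, 8.0) has slope 1, so its height at
        // index k is k; every intermediate sample sits strictly below it.
        let clear = vec![0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5];
        assert!(SimdOps::is_visible_natural_simd(0.0, 8.0, &clear, 0, 8));
        // Raising the sample at index 2 to the line height (2.0) blocks visibility.
        let blocked = vec![0.5, 2.0, 2.5, 3.5, 4.5, 5.5, 6.5];
        assert!(!SimdOps::is_visible_natural_simd(0.0, 8.0, &blocked, 0, 8));
    }
    #[test]
    fn test_horizontal_visibility() {
        // min(yi, yj) = 2.0 and every intermediate sample is below it.
        let clear = vec![1.0, 1.5, 0.5, 1.9, 1.0];
        assert!(SimdOps::is_visible_horizontal_simd(3.0, 2.0, &clear));
        // A sample equal to min(yi, yj) blocks visibility.
        let blocked = vec![1.0, 1.5, 2.0, 1.9, 1.0];
        assert!(!SimdOps::is_visible_horizontal_simd(3.0, 2.0, &blocked));
    }
}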