use ::ndarray::{Array1, ArrayView1};
/// Element-wise addition of two `f32` arrays.
///
/// Dispatches at runtime to the widest available SIMD path (AVX-512F, AVX2,
/// SSE on x86_64; NEON on aarch64) and falls back to a scalar loop otherwise.
/// Panics if the inputs have different lengths or are not contiguous.
#[inline(always)]
#[allow(clippy::uninit_vec)]
pub fn simd_add_f32_ultra_optimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
let len = a.len();
assert_eq!(len, b.len(), "Arrays must have same length");
let mut result = Vec::with_capacity(len);
// SAFETY: the capacity is exactly `len`, and the selected kernel below writes
// every element before the vector is read or returned.
unsafe {
result.set_len(len);
}
#[cfg(target_arch = "x86_64")]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
if is_x86_feature_detected!("avx512f") {
avx512_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else if is_x86_feature_detected!("avx2") {
avx2_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else if is_x86_feature_detected!("sse") {
sse_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else {
scalar_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
}
#[cfg(target_arch = "aarch64")]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
if std::arch::is_aarch64_feature_detected!("neon") {
neon_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else {
scalar_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
scalar_add_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
Array1::from_vec(result)
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn avx512_add_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
const PREFETCH_DISTANCE: usize = 512;
let a_aligned = (a as usize) % 64 == 0;
let b_aligned = (b as usize) % 64 == 0;
let result_aligned = (result as usize) % 64 == 0;
if a_aligned && b_aligned && result_aligned {
while i + 64 <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm512_load_ps(a.add(i));
let b1 = _mm512_load_ps(b.add(i));
let a2 = _mm512_load_ps(a.add(i + 16));
let b2 = _mm512_load_ps(b.add(i + 16));
let a3 = _mm512_load_ps(a.add(i + 32));
let b3 = _mm512_load_ps(b.add(i + 32));
let a4 = _mm512_load_ps(a.add(i + 48));
let b4 = _mm512_load_ps(b.add(i + 48));
let r1 = _mm512_add_ps(a1, b1);
let r2 = _mm512_add_ps(a2, b2);
let r3 = _mm512_add_ps(a3, b3);
let r4 = _mm512_add_ps(a4, b4);
_mm512_store_ps(result.add(i), r1);
_mm512_store_ps(result.add(i + 16), r2);
_mm512_store_ps(result.add(i + 32), r3);
_mm512_store_ps(result.add(i + 48), r4);
i += 64;
}
}
while i + 16 <= len {
let a_vec = _mm512_loadu_ps(a.add(i));
let b_vec = _mm512_loadu_ps(b.add(i));
let result_vec = _mm512_add_ps(a_vec, b_vec);
_mm512_storeu_ps(result.add(i), result_vec);
i += 16;
}
while i < len {
*result.add(i) = *a.add(i) + *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx2_add_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
const PREFETCH_DISTANCE: usize = 256;
let a_aligned = (a as usize) % 32 == 0;
let b_aligned = (b as usize) % 32 == 0;
let result_aligned = (result as usize) % 32 == 0;
if a_aligned && b_aligned && result_aligned && len >= 64 {
while i + 64 <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm256_load_ps(a.add(i));
let b1 = _mm256_load_ps(b.add(i));
let a2 = _mm256_load_ps(a.add(i + 8));
let b2 = _mm256_load_ps(b.add(i + 8));
let a3 = _mm256_load_ps(a.add(i + 16));
let b3 = _mm256_load_ps(b.add(i + 16));
let a4 = _mm256_load_ps(a.add(i + 24));
let b4 = _mm256_load_ps(b.add(i + 24));
let a5 = _mm256_load_ps(a.add(i + 32));
let b5 = _mm256_load_ps(b.add(i + 32));
let a6 = _mm256_load_ps(a.add(i + 40));
let b6 = _mm256_load_ps(b.add(i + 40));
let a7 = _mm256_load_ps(a.add(i + 48));
let b7 = _mm256_load_ps(b.add(i + 48));
let a8 = _mm256_load_ps(a.add(i + 56));
let b8 = _mm256_load_ps(b.add(i + 56));
let r1 = _mm256_add_ps(a1, b1);
let r2 = _mm256_add_ps(a2, b2);
let r3 = _mm256_add_ps(a3, b3);
let r4 = _mm256_add_ps(a4, b4);
let r5 = _mm256_add_ps(a5, b5);
let r6 = _mm256_add_ps(a6, b6);
let r7 = _mm256_add_ps(a7, b7);
let r8 = _mm256_add_ps(a8, b8);
_mm256_store_ps(result.add(i), r1);
_mm256_store_ps(result.add(i + 8), r2);
_mm256_store_ps(result.add(i + 16), r3);
_mm256_store_ps(result.add(i + 24), r4);
_mm256_store_ps(result.add(i + 32), r5);
_mm256_store_ps(result.add(i + 40), r6);
_mm256_store_ps(result.add(i + 48), r7);
_mm256_store_ps(result.add(i + 56), r8);
i += 64;
}
}
while i + 8 <= len {
let a_vec = _mm256_loadu_ps(a.add(i));
let b_vec = _mm256_loadu_ps(b.add(i));
let result_vec = _mm256_add_ps(a_vec, b_vec);
_mm256_storeu_ps(result.add(i), result_vec);
i += 8;
}
while i < len {
*result.add(i) = *a.add(i) + *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "sse")]
unsafe fn sse_add_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
while i + 16 <= len {
let a1 = _mm_loadu_ps(a.add(i));
let b1 = _mm_loadu_ps(b.add(i));
let a2 = _mm_loadu_ps(a.add(i + 4));
let b2 = _mm_loadu_ps(b.add(i + 4));
let a3 = _mm_loadu_ps(a.add(i + 8));
let b3 = _mm_loadu_ps(b.add(i + 8));
let a4 = _mm_loadu_ps(a.add(i + 12));
let b4 = _mm_loadu_ps(b.add(i + 12));
let r1 = _mm_add_ps(a1, b1);
let r2 = _mm_add_ps(a2, b2);
let r3 = _mm_add_ps(a3, b3);
let r4 = _mm_add_ps(a4, b4);
_mm_storeu_ps(result.add(i), r1);
_mm_storeu_ps(result.add(i + 4), r2);
_mm_storeu_ps(result.add(i + 8), r3);
_mm_storeu_ps(result.add(i + 12), r4);
i += 16;
}
while i + 4 <= len {
let a_vec = _mm_loadu_ps(a.add(i));
let b_vec = _mm_loadu_ps(b.add(i));
let result_vec = _mm_add_ps(a_vec, b_vec);
_mm_storeu_ps(result.add(i), result_vec);
i += 4;
}
while i < len {
*result.add(i) = *a.add(i) + *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_add_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::aarch64::*;
let mut i = 0;
while i + 16 <= len {
let a1 = vld1q_f32(a.add(i));
let b1 = vld1q_f32(b.add(i));
let a2 = vld1q_f32(a.add(i + 4));
let b2 = vld1q_f32(b.add(i + 4));
let a3 = vld1q_f32(a.add(i + 8));
let b3 = vld1q_f32(b.add(i + 8));
let a4 = vld1q_f32(a.add(i + 12));
let b4 = vld1q_f32(b.add(i + 12));
let r1 = vaddq_f32(a1, b1);
let r2 = vaddq_f32(a2, b2);
let r3 = vaddq_f32(a3, b3);
let r4 = vaddq_f32(a4, b4);
vst1q_f32(result.add(i), r1);
vst1q_f32(result.add(i + 4), r2);
vst1q_f32(result.add(i + 8), r3);
vst1q_f32(result.add(i + 12), r4);
i += 16;
}
while i + 4 <= len {
let a_vec = vld1q_f32(a.add(i));
let b_vec = vld1q_f32(b.add(i));
let result_vec = vaddq_f32(a_vec, b_vec);
vst1q_f32(result.add(i), result_vec);
i += 4;
}
while i < len {
*result.add(i) = *a.add(i) + *b.add(i);
i += 1;
}
}
#[inline(always)]
unsafe fn scalar_add_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
for i in 0..len {
*result.add(i) = *a.add(i) + *b.add(i);
}
}
#[cfg(test)]
mod tests {
use super::*;
use ::ndarray::Array1;
#[test]
fn test_ultra_optimized_add() {
let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let b = Array1::from_vec(vec![8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]);
let result = simd_add_f32_ultra_optimized(&a.view(), &b.view());
for i in 0..8 {
assert_eq!(result[i], 9.0);
}
}
#[test]
fn test_large_array() {
let size = 10000;
let a = Array1::from_elem(size, 2.0f32);
let b = Array1::from_elem(size, 3.0f32);
let result = simd_add_f32_ultra_optimized(&a.view(), &b.view());
for i in 0..size {
assert_eq!(result[i], 5.0);
}
}
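// Additional test (sketch): a length that is not a multiple of any SIMD width,
// so the unrolled, single-vector, and scalar-tail paths must all run; the
// result is cross-checked against ndarray's own element-wise addition.
#[test]
fn test_add_non_multiple_length() {
    let size = 37usize;
    let a = Array1::from_vec((0..size).map(|i| i as f32).collect());
    let b = Array1::from_vec((0..size).map(|i| (size - i) as f32).collect());
    let expected = &a + &b;
    let result = simd_add_f32_ultra_optimized(&a.view(), &b.view());
    for i in 0..size {
        assert_eq!(result[i], expected[i]);
    }
}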
}
/// Element-wise multiplication of two `f32` arrays.
///
/// Runtime dispatch mirrors `simd_add_f32_ultra_optimized`: AVX-512F, AVX2,
/// SSE, NEON, or a scalar fallback. Panics if the inputs have different
/// lengths or are not contiguous.
#[inline(always)]
#[allow(clippy::uninit_vec)]
pub fn simd_mul_f32_ultra_optimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> Array1<f32> {
let len = a.len();
assert_eq!(len, b.len(), "Arrays must have same length");
let mut result = Vec::with_capacity(len);
// SAFETY: the capacity is exactly `len`, and the selected kernel below writes
// every element before the vector is read or returned.
unsafe {
result.set_len(len);
}
#[cfg(target_arch = "x86_64")]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
if is_x86_feature_detected!("avx512f") {
avx512_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else if is_x86_feature_detected!("avx2") {
avx2_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else if is_x86_feature_detected!("sse") {
sse_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else {
scalar_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
}
#[cfg(target_arch = "aarch64")]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
if std::arch::is_aarch64_feature_detected!("neon") {
neon_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
} else {
scalar_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
let result_ptr = result.as_mut_ptr();
scalar_mul_f32_inner(a_ptr, b_ptr, result_ptr, len);
}
}
Array1::from_vec(result)
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn avx512_mul_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
const PREFETCH_DISTANCE: usize = 512;
let a_aligned = (a as usize) % 64 == 0;
let b_aligned = (b as usize) % 64 == 0;
let result_aligned = (result as usize) % 64 == 0;
if a_aligned && b_aligned && result_aligned {
while i + 64 <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm512_load_ps(a.add(i));
let b1 = _mm512_load_ps(b.add(i));
let a2 = _mm512_load_ps(a.add(i + 16));
let b2 = _mm512_load_ps(b.add(i + 16));
let a3 = _mm512_load_ps(a.add(i + 32));
let b3 = _mm512_load_ps(b.add(i + 32));
let a4 = _mm512_load_ps(a.add(i + 48));
let b4 = _mm512_load_ps(b.add(i + 48));
let r1 = _mm512_mul_ps(a1, b1);
let r2 = _mm512_mul_ps(a2, b2);
let r3 = _mm512_mul_ps(a3, b3);
let r4 = _mm512_mul_ps(a4, b4);
_mm512_store_ps(result.add(i), r1);
_mm512_store_ps(result.add(i + 16), r2);
_mm512_store_ps(result.add(i + 32), r3);
_mm512_store_ps(result.add(i + 48), r4);
i += 64;
}
}
while i + 16 <= len {
let a_vec = _mm512_loadu_ps(a.add(i));
let b_vec = _mm512_loadu_ps(b.add(i));
let result_vec = _mm512_mul_ps(a_vec, b_vec);
_mm512_storeu_ps(result.add(i), result_vec);
i += 16;
}
while i < len {
*result.add(i) = *a.add(i) * *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx2_mul_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
const PREFETCH_DISTANCE: usize = 256;
let a_aligned = (a as usize) % 32 == 0;
let b_aligned = (b as usize) % 32 == 0;
let result_aligned = (result as usize) % 32 == 0;
if a_aligned && b_aligned && result_aligned && len >= 64 {
while i + 64 <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm256_load_ps(a.add(i));
let b1 = _mm256_load_ps(b.add(i));
let a2 = _mm256_load_ps(a.add(i + 8));
let b2 = _mm256_load_ps(b.add(i + 8));
let a3 = _mm256_load_ps(a.add(i + 16));
let b3 = _mm256_load_ps(b.add(i + 16));
let a4 = _mm256_load_ps(a.add(i + 24));
let b4 = _mm256_load_ps(b.add(i + 24));
let a5 = _mm256_load_ps(a.add(i + 32));
let b5 = _mm256_load_ps(b.add(i + 32));
let a6 = _mm256_load_ps(a.add(i + 40));
let b6 = _mm256_load_ps(b.add(i + 40));
let a7 = _mm256_load_ps(a.add(i + 48));
let b7 = _mm256_load_ps(b.add(i + 48));
let a8 = _mm256_load_ps(a.add(i + 56));
let b8 = _mm256_load_ps(b.add(i + 56));
let r1 = _mm256_mul_ps(a1, b1);
let r2 = _mm256_mul_ps(a2, b2);
let r3 = _mm256_mul_ps(a3, b3);
let r4 = _mm256_mul_ps(a4, b4);
let r5 = _mm256_mul_ps(a5, b5);
let r6 = _mm256_mul_ps(a6, b6);
let r7 = _mm256_mul_ps(a7, b7);
let r8 = _mm256_mul_ps(a8, b8);
_mm256_store_ps(result.add(i), r1);
_mm256_store_ps(result.add(i + 8), r2);
_mm256_store_ps(result.add(i + 16), r3);
_mm256_store_ps(result.add(i + 24), r4);
_mm256_store_ps(result.add(i + 32), r5);
_mm256_store_ps(result.add(i + 40), r6);
_mm256_store_ps(result.add(i + 48), r7);
_mm256_store_ps(result.add(i + 56), r8);
i += 64;
}
}
while i + 8 <= len {
let a_vec = _mm256_loadu_ps(a.add(i));
let b_vec = _mm256_loadu_ps(b.add(i));
let result_vec = _mm256_mul_ps(a_vec, b_vec);
_mm256_storeu_ps(result.add(i), result_vec);
i += 8;
}
while i < len {
*result.add(i) = *a.add(i) * *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "sse")]
unsafe fn sse_mul_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::x86_64::*;
let mut i = 0;
while i + 16 <= len {
let a1 = _mm_loadu_ps(a.add(i));
let b1 = _mm_loadu_ps(b.add(i));
let a2 = _mm_loadu_ps(a.add(i + 4));
let b2 = _mm_loadu_ps(b.add(i + 4));
let a3 = _mm_loadu_ps(a.add(i + 8));
let b3 = _mm_loadu_ps(b.add(i + 8));
let a4 = _mm_loadu_ps(a.add(i + 12));
let b4 = _mm_loadu_ps(b.add(i + 12));
let r1 = _mm_mul_ps(a1, b1);
let r2 = _mm_mul_ps(a2, b2);
let r3 = _mm_mul_ps(a3, b3);
let r4 = _mm_mul_ps(a4, b4);
_mm_storeu_ps(result.add(i), r1);
_mm_storeu_ps(result.add(i + 4), r2);
_mm_storeu_ps(result.add(i + 8), r3);
_mm_storeu_ps(result.add(i + 12), r4);
i += 16;
}
while i + 4 <= len {
let a_vec = _mm_loadu_ps(a.add(i));
let b_vec = _mm_loadu_ps(b.add(i));
let result_vec = _mm_mul_ps(a_vec, b_vec);
_mm_storeu_ps(result.add(i), result_vec);
i += 4;
}
while i < len {
*result.add(i) = *a.add(i) * *b.add(i);
i += 1;
}
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_mul_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
use std::arch::aarch64::*;
let mut i = 0;
while i + 16 <= len {
let a1 = vld1q_f32(a.add(i));
let b1 = vld1q_f32(b.add(i));
let a2 = vld1q_f32(a.add(i + 4));
let b2 = vld1q_f32(b.add(i + 4));
let a3 = vld1q_f32(a.add(i + 8));
let b3 = vld1q_f32(b.add(i + 8));
let a4 = vld1q_f32(a.add(i + 12));
let b4 = vld1q_f32(b.add(i + 12));
let r1 = vmulq_f32(a1, b1);
let r2 = vmulq_f32(a2, b2);
let r3 = vmulq_f32(a3, b3);
let r4 = vmulq_f32(a4, b4);
vst1q_f32(result.add(i), r1);
vst1q_f32(result.add(i + 4), r2);
vst1q_f32(result.add(i + 8), r3);
vst1q_f32(result.add(i + 12), r4);
i += 16;
}
while i + 4 <= len {
let a_vec = vld1q_f32(a.add(i));
let b_vec = vld1q_f32(b.add(i));
let result_vec = vmulq_f32(a_vec, b_vec);
vst1q_f32(result.add(i), result_vec);
i += 4;
}
while i < len {
*result.add(i) = *a.add(i) * *b.add(i);
i += 1;
}
}
#[inline(always)]
unsafe fn scalar_mul_f32_inner(a: *const f32, b: *const f32, result: *mut f32, len: usize) {
for i in 0..len {
*result.add(i) = *a.add(i) * *b.add(i);
}
}
#[cfg(test)]
mod mul_tests {
use super::*;
use ::ndarray::Array1;
#[test]
fn test_ultra_optimized_mul() {
let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let b = Array1::from_vec(vec![2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0]);
let result = simd_mul_f32_ultra_optimized(&a.view(), &b.view());
assert_eq!(result[0], 2.0);
assert_eq!(result[1], 6.0);
assert_eq!(result[2], 12.0);
assert_eq!(result[7], 72.0);
}
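// Additional test (sketch): a large, odd-length input so the aligned unrolled
// loop, the unaligned vector loop, and the scalar tail are all exercised.
#[test]
fn test_mul_large_odd_length() {
    let size = 1003;
    let a = Array1::from_elem(size, 1.5f32);
    let b = Array1::from_elem(size, 2.0f32);
    let result = simd_mul_f32_ultra_optimized(&a.view(), &b.view());
    for i in 0..size {
        assert_eq!(result[i], 3.0);
    }
}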
}
/// Dot product of two `f32` arrays with runtime SIMD dispatch.
///
/// Uses FMA-based kernels where available (AVX-512F, AVX2+FMA, NEON) and a
/// scalar fallback otherwise. Panics if the inputs have different lengths or
/// are not contiguous.
#[inline(always)]
pub fn simd_dot_f32_ultra_optimized(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
let len = a.len();
assert_eq!(len, b.len(), "Arrays must have same length");
#[cfg(target_arch = "x86_64")]
{
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
if is_x86_feature_detected!("avx512f") {
return avx512_dot_f32_inner(a_ptr, b_ptr, len);
} else if is_x86_feature_detected!("avx2") {
return avx2_dot_f32_inner(a_ptr, b_ptr, len);
} else if is_x86_feature_detected!("sse2") {
return sse_dot_f32_inner(a_ptr, b_ptr, len);
} else {
return scalar_dot_f32(a, b);
}
}
}
#[cfg(target_arch = "aarch64")]
unsafe {
let a_ptr = a.as_slice().expect("input `a` must be contiguous").as_ptr();
let b_ptr = b.as_slice().expect("input `b` must be contiguous").as_ptr();
return neon_dot_f32_inner(a_ptr, b_ptr, len);
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
scalar_dot_f32(a, b)
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn avx512_dot_f32_inner(a: *const f32, b: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const PREFETCH_DISTANCE: usize = 512;
const VECTOR_SIZE: usize = 16;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm512_setzero_ps();
let mut acc2 = _mm512_setzero_ps();
let mut acc3 = _mm512_setzero_ps();
let mut acc4 = _mm512_setzero_ps();
let a_aligned = (a as usize) % 64 == 0;
let b_aligned = (b as usize) % 64 == 0;
if a_aligned && b_aligned && len >= CHUNK_SIZE {
while i + CHUNK_SIZE <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm512_load_ps(a.add(i));
let a2 = _mm512_load_ps(a.add(i + 16));
let a3 = _mm512_load_ps(a.add(i + 32));
let a4 = _mm512_load_ps(a.add(i + 48));
let b1 = _mm512_load_ps(b.add(i));
let b2 = _mm512_load_ps(b.add(i + 16));
let b3 = _mm512_load_ps(b.add(i + 32));
let b4 = _mm512_load_ps(b.add(i + 48));
acc1 = _mm512_fmadd_ps(a1, b1, acc1);
acc2 = _mm512_fmadd_ps(a2, b2, acc2);
acc3 = _mm512_fmadd_ps(a3, b3, acc3);
acc4 = _mm512_fmadd_ps(a4, b4, acc4);
i += CHUNK_SIZE;
}
}
while i + VECTOR_SIZE <= len {
let a_vec = _mm512_loadu_ps(a.add(i));
let b_vec = _mm512_loadu_ps(b.add(i));
acc1 = _mm512_fmadd_ps(a_vec, b_vec, acc1);
i += VECTOR_SIZE;
}
let combined1 = _mm512_add_ps(acc1, acc2);
let combined2 = _mm512_add_ps(acc3, acc4);
let final_acc = _mm512_add_ps(combined1, combined2);
let mut result = _mm512_reduce_add_ps(final_acc);
while i < len {
result += *a.add(i) * *b.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
#[target_feature(enable = "fma")]
unsafe fn avx2_dot_f32_inner(a: *const f32, b: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const PREFETCH_DISTANCE: usize = 256;
const VECTOR_SIZE: usize = 8;
const UNROLL_FACTOR: usize = 8;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm256_setzero_ps();
let mut acc2 = _mm256_setzero_ps();
let mut acc3 = _mm256_setzero_ps();
let mut acc4 = _mm256_setzero_ps();
let mut acc5 = _mm256_setzero_ps();
let mut acc6 = _mm256_setzero_ps();
let mut acc7 = _mm256_setzero_ps();
let mut acc8 = _mm256_setzero_ps();
let a_aligned = (a as usize) % 32 == 0;
let b_aligned = (b as usize) % 32 == 0;
if a_aligned && b_aligned && len >= CHUNK_SIZE {
while i + CHUNK_SIZE <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(a.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
_mm_prefetch(b.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let a1 = _mm256_load_ps(a.add(i));
let a2 = _mm256_load_ps(a.add(i + 8));
let a3 = _mm256_load_ps(a.add(i + 16));
let a4 = _mm256_load_ps(a.add(i + 24));
let a5 = _mm256_load_ps(a.add(i + 32));
let a6 = _mm256_load_ps(a.add(i + 40));
let a7 = _mm256_load_ps(a.add(i + 48));
let a8 = _mm256_load_ps(a.add(i + 56));
let b1 = _mm256_load_ps(b.add(i));
let b2 = _mm256_load_ps(b.add(i + 8));
let b3 = _mm256_load_ps(b.add(i + 16));
let b4 = _mm256_load_ps(b.add(i + 24));
let b5 = _mm256_load_ps(b.add(i + 32));
let b6 = _mm256_load_ps(b.add(i + 40));
let b7 = _mm256_load_ps(b.add(i + 48));
let b8 = _mm256_load_ps(b.add(i + 56));
acc1 = _mm256_fmadd_ps(a1, b1, acc1);
acc2 = _mm256_fmadd_ps(a2, b2, acc2);
acc3 = _mm256_fmadd_ps(a3, b3, acc3);
acc4 = _mm256_fmadd_ps(a4, b4, acc4);
acc5 = _mm256_fmadd_ps(a5, b5, acc5);
acc6 = _mm256_fmadd_ps(a6, b6, acc6);
acc7 = _mm256_fmadd_ps(a7, b7, acc7);
acc8 = _mm256_fmadd_ps(a8, b8, acc8);
i += CHUNK_SIZE;
}
}
while i + VECTOR_SIZE <= len {
let a_vec = _mm256_loadu_ps(a.add(i));
let b_vec = _mm256_loadu_ps(b.add(i));
acc1 = _mm256_fmadd_ps(a_vec, b_vec, acc1);
i += VECTOR_SIZE;
}
let combined1 = _mm256_add_ps(acc1, acc2);
let combined2 = _mm256_add_ps(acc3, acc4);
let combined3 = _mm256_add_ps(acc5, acc6);
let combined4 = _mm256_add_ps(acc7, acc8);
let combined12 = _mm256_add_ps(combined1, combined2);
let combined34 = _mm256_add_ps(combined3, combined4);
let final_acc = _mm256_add_ps(combined12, combined34);
let high = _mm256_extractf128_ps(final_acc, 1);
let low = _mm256_castps256_ps128(final_acc);
let sum128 = _mm_add_ps(low, high);
let shuf = _mm_shuffle_ps(sum128, sum128, 0b1110);
let sum_partial = _mm_add_ps(sum128, shuf);
let shuf2 = _mm_shuffle_ps(sum_partial, sum_partial, 0b0001);
let final_result = _mm_add_ps(sum_partial, shuf2);
let mut result = _mm_cvtss_f32(final_result);
while i < len {
result += *a.add(i) * *b.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn sse_dot_f32_inner(a: *const f32, b: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const VECTOR_SIZE: usize = 4;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm_setzero_ps();
let mut acc2 = _mm_setzero_ps();
let mut acc3 = _mm_setzero_ps();
let mut acc4 = _mm_setzero_ps();
while i + CHUNK_SIZE <= len {
let a1 = _mm_loadu_ps(a.add(i));
let a2 = _mm_loadu_ps(a.add(i + 4));
let a3 = _mm_loadu_ps(a.add(i + 8));
let a4 = _mm_loadu_ps(a.add(i + 12));
let b1 = _mm_loadu_ps(b.add(i));
let b2 = _mm_loadu_ps(b.add(i + 4));
let b3 = _mm_loadu_ps(b.add(i + 8));
let b4 = _mm_loadu_ps(b.add(i + 12));
let prod1 = _mm_mul_ps(a1, b1);
let prod2 = _mm_mul_ps(a2, b2);
let prod3 = _mm_mul_ps(a3, b3);
let prod4 = _mm_mul_ps(a4, b4);
acc1 = _mm_add_ps(acc1, prod1);
acc2 = _mm_add_ps(acc2, prod2);
acc3 = _mm_add_ps(acc3, prod3);
acc4 = _mm_add_ps(acc4, prod4);
i += CHUNK_SIZE;
}
while i + VECTOR_SIZE <= len {
let a_vec = _mm_loadu_ps(a.add(i));
let b_vec = _mm_loadu_ps(b.add(i));
let prod = _mm_mul_ps(a_vec, b_vec);
acc1 = _mm_add_ps(acc1, prod);
i += VECTOR_SIZE;
}
let combined1 = _mm_add_ps(acc1, acc2);
let combined2 = _mm_add_ps(acc3, acc4);
let final_acc = _mm_add_ps(combined1, combined2);
let shuf = _mm_shuffle_ps(final_acc, final_acc, 0b1110);
let sum_partial = _mm_add_ps(final_acc, shuf);
let shuf2 = _mm_shuffle_ps(sum_partial, sum_partial, 0b0001);
let final_result = _mm_add_ps(sum_partial, shuf2);
let mut result = _mm_cvtss_f32(final_result);
while i < len {
result += *a.add(i) * *b.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_dot_f32_inner(a: *const f32, b: *const f32, len: usize) -> f32 {
use std::arch::aarch64::*;
const VECTOR_SIZE: usize = 4;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = vdupq_n_f32(0.0);
let mut acc2 = vdupq_n_f32(0.0);
let mut acc3 = vdupq_n_f32(0.0);
let mut acc4 = vdupq_n_f32(0.0);
while i + CHUNK_SIZE <= len {
let a1 = vld1q_f32(a.add(i));
let a2 = vld1q_f32(a.add(i + 4));
let a3 = vld1q_f32(a.add(i + 8));
let a4 = vld1q_f32(a.add(i + 12));
let b1 = vld1q_f32(b.add(i));
let b2 = vld1q_f32(b.add(i + 4));
let b3 = vld1q_f32(b.add(i + 8));
let b4 = vld1q_f32(b.add(i + 12));
acc1 = vfmaq_f32(acc1, a1, b1);
acc2 = vfmaq_f32(acc2, a2, b2);
acc3 = vfmaq_f32(acc3, a3, b3);
acc4 = vfmaq_f32(acc4, a4, b4);
i += CHUNK_SIZE;
}
while i + VECTOR_SIZE <= len {
let a_vec = vld1q_f32(a.add(i));
let b_vec = vld1q_f32(b.add(i));
acc1 = vfmaq_f32(acc1, a_vec, b_vec);
i += VECTOR_SIZE;
}
let combined1 = vaddq_f32(acc1, acc2);
let combined2 = vaddq_f32(acc3, acc4);
let final_acc = vaddq_f32(combined1, combined2);
let mut result = vaddvq_f32(final_acc);
while i < len {
result += *a.add(i) * *b.add(i);
i += 1;
}
result
}
#[inline(always)]
fn scalar_dot_f32(a: &ArrayView1<f32>, b: &ArrayView1<f32>) -> f32 {
let a_slice = a.as_slice().expect("input `a` must be contiguous");
let b_slice = b.as_slice().expect("input `b` must be contiguous");
a_slice.iter().zip(b_slice.iter()).map(|(x, y)| x * y).sum()
}
#[cfg(test)]
mod dot_tests {
use super::*;
use ndarray::Array1;
#[test]
fn test_dot_product_ultra_optimized() {
let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let b = Array1::from_vec(vec![8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0]);
let result = simd_dot_f32_ultra_optimized(&a.view(), &b.view());
assert_eq!(result, 120.0);
}
#[test]
fn test_dot_product_large_array() {
let size = 10000;
let a = Array1::from_elem(size, 2.0f32);
let b = Array1::from_elem(size, 3.0f32);
let result = simd_dot_f32_ultra_optimized(&a.view(), &b.view());
assert!((result - 60000.0).abs() < 0.001);
}
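// Additional test (sketch): an odd length cross-checked against the scalar
// reference implementation; the products are small integers, so the f32 sums
// are exact and the SIMD and scalar results should agree closely.
#[test]
fn test_dot_matches_scalar_reference() {
    let size = 1003usize;
    let a = Array1::from_vec((0..size).map(|i| (i % 7) as f32).collect());
    let b = Array1::from_vec((0..size).map(|i| (i % 5) as f32).collect());
    let expected = scalar_dot_f32(&a.view(), &b.view());
    let result = simd_dot_f32_ultra_optimized(&a.view(), &b.view());
    assert!((result - expected).abs() < 1e-3);
}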
}
/// Sum of all elements of an `f32` array with runtime SIMD dispatch
/// (AVX-512F, AVX2, SSE2, NEON, or scalar). Returns 0.0 for an empty input;
/// panics if the input is not contiguous.
#[inline(always)]
pub fn simd_sum_f32_ultra_optimized(input: &ArrayView1<f32>) -> f32 {
let len = input.len();
if len == 0 {
return 0.0;
}
#[cfg(target_arch = "x86_64")]
{
unsafe {
let ptr = input.as_slice().expect("input must be contiguous").as_ptr();
if is_x86_feature_detected!("avx512f") {
return avx512_sum_f32_inner(ptr, len);
} else if is_x86_feature_detected!("avx2") {
return avx2_sum_f32_inner(ptr, len);
} else if is_x86_feature_detected!("sse2") {
return sse_sum_f32_inner(ptr, len);
} else {
return scalar_sum_f32(input);
}
}
}
#[cfg(target_arch = "aarch64")]
unsafe {
let ptr = input.as_slice().expect("input must be contiguous").as_ptr();
return neon_sum_f32_inner(ptr, len);
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
scalar_sum_f32(input)
}
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx512f")]
unsafe fn avx512_sum_f32_inner(ptr: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const PREFETCH_DISTANCE: usize = 512;
const VECTOR_SIZE: usize = 16;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm512_setzero_ps();
let mut acc2 = _mm512_setzero_ps();
let mut acc3 = _mm512_setzero_ps();
let mut acc4 = _mm512_setzero_ps();
let aligned = (ptr as usize) % 64 == 0;
if aligned && len >= CHUNK_SIZE {
while i + CHUNK_SIZE <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(ptr.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let v1 = _mm512_load_ps(ptr.add(i));
let v2 = _mm512_load_ps(ptr.add(i + 16));
let v3 = _mm512_load_ps(ptr.add(i + 32));
let v4 = _mm512_load_ps(ptr.add(i + 48));
acc1 = _mm512_add_ps(acc1, v1);
acc2 = _mm512_add_ps(acc2, v2);
acc3 = _mm512_add_ps(acc3, v3);
acc4 = _mm512_add_ps(acc4, v4);
i += CHUNK_SIZE;
}
}
while i + VECTOR_SIZE <= len {
let v = _mm512_loadu_ps(ptr.add(i));
acc1 = _mm512_add_ps(acc1, v);
i += VECTOR_SIZE;
}
let combined1 = _mm512_add_ps(acc1, acc2);
let combined2 = _mm512_add_ps(acc3, acc4);
let final_acc = _mm512_add_ps(combined1, combined2);
let mut result = _mm512_reduce_add_ps(final_acc);
while i < len {
result += *ptr.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "avx2")]
unsafe fn avx2_sum_f32_inner(ptr: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const PREFETCH_DISTANCE: usize = 256;
const VECTOR_SIZE: usize = 8;
const UNROLL_FACTOR: usize = 8;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm256_setzero_ps();
let mut acc2 = _mm256_setzero_ps();
let mut acc3 = _mm256_setzero_ps();
let mut acc4 = _mm256_setzero_ps();
let mut acc5 = _mm256_setzero_ps();
let mut acc6 = _mm256_setzero_ps();
let mut acc7 = _mm256_setzero_ps();
let mut acc8 = _mm256_setzero_ps();
let aligned = (ptr as usize) % 32 == 0;
if aligned && len >= CHUNK_SIZE {
while i + CHUNK_SIZE <= len {
if i + PREFETCH_DISTANCE < len {
_mm_prefetch(ptr.add(i + PREFETCH_DISTANCE) as *const i8, _MM_HINT_T0);
}
let v1 = _mm256_load_ps(ptr.add(i));
let v2 = _mm256_load_ps(ptr.add(i + 8));
let v3 = _mm256_load_ps(ptr.add(i + 16));
let v4 = _mm256_load_ps(ptr.add(i + 24));
let v5 = _mm256_load_ps(ptr.add(i + 32));
let v6 = _mm256_load_ps(ptr.add(i + 40));
let v7 = _mm256_load_ps(ptr.add(i + 48));
let v8 = _mm256_load_ps(ptr.add(i + 56));
acc1 = _mm256_add_ps(acc1, v1);
acc2 = _mm256_add_ps(acc2, v2);
acc3 = _mm256_add_ps(acc3, v3);
acc4 = _mm256_add_ps(acc4, v4);
acc5 = _mm256_add_ps(acc5, v5);
acc6 = _mm256_add_ps(acc6, v6);
acc7 = _mm256_add_ps(acc7, v7);
acc8 = _mm256_add_ps(acc8, v8);
i += CHUNK_SIZE;
}
}
while i + VECTOR_SIZE <= len {
let v = _mm256_loadu_ps(ptr.add(i));
acc1 = _mm256_add_ps(acc1, v);
i += VECTOR_SIZE;
}
let combined1 = _mm256_add_ps(acc1, acc2);
let combined2 = _mm256_add_ps(acc3, acc4);
let combined3 = _mm256_add_ps(acc5, acc6);
let combined4 = _mm256_add_ps(acc7, acc8);
let combined12 = _mm256_add_ps(combined1, combined2);
let combined34 = _mm256_add_ps(combined3, combined4);
let final_acc = _mm256_add_ps(combined12, combined34);
let high = _mm256_extractf128_ps(final_acc, 1);
let low = _mm256_castps256_ps128(final_acc);
let sum128 = _mm_add_ps(low, high);
let shuf = _mm_shuffle_ps(sum128, sum128, 0b1110);
let sum_partial = _mm_add_ps(sum128, shuf);
let shuf2 = _mm_shuffle_ps(sum_partial, sum_partial, 0b0001);
let final_result = _mm_add_ps(sum_partial, shuf2);
let mut result = _mm_cvtss_f32(final_result);
while i < len {
result += *ptr.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "x86_64")]
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn sse_sum_f32_inner(ptr: *const f32, len: usize) -> f32 {
use std::arch::x86_64::*;
const VECTOR_SIZE: usize = 4;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = _mm_setzero_ps();
let mut acc2 = _mm_setzero_ps();
let mut acc3 = _mm_setzero_ps();
let mut acc4 = _mm_setzero_ps();
while i + CHUNK_SIZE <= len {
let v1 = _mm_loadu_ps(ptr.add(i));
let v2 = _mm_loadu_ps(ptr.add(i + 4));
let v3 = _mm_loadu_ps(ptr.add(i + 8));
let v4 = _mm_loadu_ps(ptr.add(i + 12));
acc1 = _mm_add_ps(acc1, v1);
acc2 = _mm_add_ps(acc2, v2);
acc3 = _mm_add_ps(acc3, v3);
acc4 = _mm_add_ps(acc4, v4);
i += CHUNK_SIZE;
}
while i + VECTOR_SIZE <= len {
let v = _mm_loadu_ps(ptr.add(i));
acc1 = _mm_add_ps(acc1, v);
i += VECTOR_SIZE;
}
let combined1 = _mm_add_ps(acc1, acc2);
let combined2 = _mm_add_ps(acc3, acc4);
let final_acc = _mm_add_ps(combined1, combined2);
let shuf = _mm_shuffle_ps(final_acc, final_acc, 0b1110);
let sum_partial = _mm_add_ps(final_acc, shuf);
let shuf2 = _mm_shuffle_ps(sum_partial, sum_partial, 0b0001);
let final_result = _mm_add_ps(sum_partial, shuf2);
let mut result = _mm_cvtss_f32(final_result);
while i < len {
result += *ptr.add(i);
i += 1;
}
result
}
#[cfg(target_arch = "aarch64")]
#[inline(always)]
unsafe fn neon_sum_f32_inner(ptr: *const f32, len: usize) -> f32 {
use std::arch::aarch64::*;
const VECTOR_SIZE: usize = 4;
const UNROLL_FACTOR: usize = 4;
const CHUNK_SIZE: usize = VECTOR_SIZE * UNROLL_FACTOR;
let mut i = 0;
let mut acc1 = vdupq_n_f32(0.0);
let mut acc2 = vdupq_n_f32(0.0);
let mut acc3 = vdupq_n_f32(0.0);
let mut acc4 = vdupq_n_f32(0.0);
while i + CHUNK_SIZE <= len {
let v1 = vld1q_f32(ptr.add(i));
let v2 = vld1q_f32(ptr.add(i + 4));
let v3 = vld1q_f32(ptr.add(i + 8));
let v4 = vld1q_f32(ptr.add(i + 12));
acc1 = vaddq_f32(acc1, v1);
acc2 = vaddq_f32(acc2, v2);
acc3 = vaddq_f32(acc3, v3);
acc4 = vaddq_f32(acc4, v4);
i += CHUNK_SIZE;
}
while i + VECTOR_SIZE <= len {
let v = vld1q_f32(ptr.add(i));
acc1 = vaddq_f32(acc1, v);
i += VECTOR_SIZE;
}
let combined1 = vaddq_f32(acc1, acc2);
let combined2 = vaddq_f32(acc3, acc4);
let final_acc = vaddq_f32(combined1, combined2);
let mut result = vaddvq_f32(final_acc);
while i < len {
result += *ptr.add(i);
i += 1;
}
result
}
#[inline(always)]
fn scalar_sum_f32(input: &ArrayView1<f32>) -> f32 {
let slice = input.as_slice().expect("input must be contiguous");
slice.iter().sum()
}
#[cfg(test)]
mod sum_tests {
use super::*;
use ndarray::Array1;
#[test]
fn test_sum_ultra_optimized() {
let a = Array1::from_vec(vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]);
let result = simd_sum_f32_ultra_optimized(&a.view());
assert_eq!(result, 36.0);
}
#[test]
fn test_sum_large_array() {
let size = 10000;
let a = Array1::from_elem(size, 2.5f32);
let result = simd_sum_f32_ultra_optimized(&a.view());
assert!((result - 25000.0).abs() < 0.001);
}
#[test]
fn test_sum_empty() {
let a = Array1::<f32>::from_vec(vec![]);
let result = simd_sum_f32_ultra_optimized(&a.view());
assert_eq!(result, 0.0);
}
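// Additional test (sketch): an odd length cross-checked against the scalar
// reference, exercising the remainder handling after the unrolled loops.
#[test]
fn test_sum_matches_scalar_reference() {
    let size = 777usize;
    let a = Array1::from_vec((0..size).map(|i| (i % 10) as f32).collect());
    let expected = scalar_sum_f32(&a.view());
    let result = simd_sum_f32_ultra_optimized(&a.view());
    assert!((result - expected).abs() < 1e-3);
}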
}