use super::{EnhancedSimdOps, AVX2_F32_LANES, AVX2_F64_LANES, PREFETCH_DISTANCE};
use crate::array::Array;
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
impl EnhancedSimdOps {
/// Computes the element-wise sine of an `f32` array.
///
/// Despite the `_simd` suffix, this body does not call the AVX2 kernel: every
/// element is evaluated with the scalar `sin` method through `Array::map`.
///
/// Returns the array produced by `Array::map`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_sin_f32_simd(input: &Array<f32>) -> Array<f32> {
// Scalar per-element evaluation; no `unsafe` and no CPU feature requirement on this path.
input.map(|x| x.sin())
}
/// Writes `sin(input[i])` into `output[i]`, processing `AVX2_F32_LANES` values per step.
///
/// Per vector:
/// 1. reduce the argument to `r = x - round(x / 2π) * 2π`, so `r` lies in `[-π, π]`;
/// 2. split off the sign of `r` and fold `|r| > π/2` onto `π - |r|`
///    (sine is symmetric about `π/2`);
/// 3. evaluate the odd Taylor polynomial through `x^11` in Horner form;
/// 4. re-apply the sign of `r`.
///
/// Elements after the last full vector are computed with scalar `f32::sin`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_sin_f32(input: &[f32], output: &mut [f32]) {
    // 2π and π as an f32 head plus a small correction term, so the range
    // reduction does not inherit the full rounding error of the f32 constant.
    const TWO_PI_HI: f32 = 2.0 * std::f32::consts::PI;
    const TWO_PI_LO: f32 = -1.7484556e-7;
    const PI_LO: f32 = -8.742278e-8;
    // Taylor coefficients of sin for x^11 down to x^3 (highest power first).
    const COEFFS: [f32; 5] = [
        -1.0 / 39_916_800.0,
        1.0 / 362_880.0,
        -1.0 / 5_040.0,
        1.0 / 120.0,
        -1.0 / 6.0,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F32_LANES - 1);

    let two_pi_hi = _mm256_set1_ps(TWO_PI_HI);
    let two_pi_lo = _mm256_set1_ps(TWO_PI_LO);
    let pi = _mm256_set1_ps(std::f32::consts::PI);
    let pi_lo = _mm256_set1_ps(PI_LO);
    let half_pi = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
    let sign_bit = _mm256_set1_ps(-0.0);

    for i in (0..simd_len).step_by(AVX2_F32_LANES) {
        let x = _mm256_loadu_ps(input.as_ptr().add(i));
        let k = _mm256_round_ps(_mm256_div_ps(x, two_pi_hi), _MM_FROUND_TO_NEAREST_INT);
        // fnmadd(a, b, c) = c - a*b, i.e. r = x - k*2π (fmsub would yield k*2π - x).
        let r = _mm256_fnmadd_ps(k, two_pi_lo, _mm256_fnmadd_ps(k, two_pi_hi, x));

        let sign = _mm256_and_ps(r, sign_bit);
        let abs_r = _mm256_andnot_ps(sign_bit, r);
        let needs_fold = _mm256_cmp_ps(abs_r, half_pi, _CMP_GT_OQ);
        let folded = _mm256_add_ps(_mm256_sub_ps(pi, abs_r), pi_lo);
        // Both blend inputs are non-negative magnitudes; the sign is applied exactly once below.
        let t = _mm256_blendv_ps(abs_r, folded, needs_fold);

        let t2 = _mm256_mul_ps(t, t);
        let mut p = _mm256_set1_ps(COEFFS[0]);
        for &c in COEFFS[1..].iter() {
            p = _mm256_fmadd_ps(p, t2, _mm256_set1_ps(c));
        }
        // sin(t) = t + t^3 * p(t^2)
        let sin_t = _mm256_fmadd_ps(_mm256_mul_ps(t, t2), p, t);

        _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_xor_ps(sin_t, sign));
    }

    for i in simd_len..len {
        output[i] = input[i].sin();
    }
}
/// Computes the element-wise sine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `sin` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_sin_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.sin());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_sin_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `sin(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Per vector:
/// 1. reduce the argument to `r = x - round(x / 2π) * 2π` using a two-part 2π;
/// 2. split off the sign of `r` and fold `|r| > π/2` onto `π - |r|`;
/// 3. evaluate the odd Taylor polynomial through `x^21` in Horner form
///    (the truncation error on `[0, π/2]` is below double-precision rounding);
/// 4. re-apply the sign of `r`.
///
/// Elements after the last full vector are computed with scalar `f64::sin`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_sin_f64(input: &[f64], output: &mut [f64]) {
    // 2π and π as an f64 head plus the remainder that does not fit in the head.
    const TWO_PI_HI: f64 = 2.0 * std::f64::consts::PI;
    const TWO_PI_LO: f64 = 2.4492935982947064e-16;
    const PI_LO: f64 = 1.2246467991473532e-16;
    // Taylor coefficients of sin for x^21 down to x^3 (highest power first).
    const COEFFS: [f64; 10] = [
        1.0 / 51_090_942_171_709_440_000.0,
        -1.0 / 121_645_100_408_832_000.0,
        1.0 / 355_687_428_096_000.0,
        -1.0 / 1_307_674_368_000.0,
        1.0 / 6_227_020_800.0,
        -1.0 / 39_916_800.0,
        1.0 / 362_880.0,
        -1.0 / 5_040.0,
        1.0 / 120.0,
        -1.0 / 6.0,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let two_pi_hi = _mm256_set1_pd(TWO_PI_HI);
    let two_pi_lo = _mm256_set1_pd(TWO_PI_LO);
    let pi = _mm256_set1_pd(std::f64::consts::PI);
    let pi_lo = _mm256_set1_pd(PI_LO);
    let half_pi = _mm256_set1_pd(std::f64::consts::FRAC_PI_2);
    let sign_bit = _mm256_set1_pd(-0.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let k = _mm256_round_pd(_mm256_div_pd(x, two_pi_hi), _MM_FROUND_TO_NEAREST_INT);
        // r = x - k*2π, with the product fused so it is not rounded before the subtraction.
        let r = _mm256_fnmadd_pd(k, two_pi_lo, _mm256_fnmadd_pd(k, two_pi_hi, x));

        let sign = _mm256_and_pd(r, sign_bit);
        let abs_r = _mm256_andnot_pd(sign_bit, r);
        let needs_fold = _mm256_cmp_pd(abs_r, half_pi, _CMP_GT_OQ);
        let folded = _mm256_add_pd(_mm256_sub_pd(pi, abs_r), pi_lo);
        let t = _mm256_blendv_pd(abs_r, folded, needs_fold);

        let t2 = _mm256_mul_pd(t, t);
        let mut p = _mm256_set1_pd(COEFFS[0]);
        for &c in COEFFS[1..].iter() {
            p = _mm256_fmadd_pd(p, t2, _mm256_set1_pd(c));
        }
        // sin(t) = t + t^3 * p(t^2)
        let sin_t = _mm256_fmadd_pd(_mm256_mul_pd(t, t2), p, t);

        _mm256_storeu_pd(output.as_mut_ptr().add(i), _mm256_xor_pd(sin_t, sign));
    }

    for i in simd_len..len {
        output[i] = input[i].sin();
    }
}
/// Computes the element-wise cosine of an `f32` array.
///
/// Each element is evaluated with the scalar `cos` method through
/// `Array::map`; this body does not call the AVX2 kernel.
///
/// Returns the array produced by `Array::map`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_cos_f32(input: &Array<f32>) -> Array<f32> {
// Scalar per-element evaluation; no `unsafe` and no CPU feature requirement on this path.
input.map(|x| x.cos())
}
/// Writes `cos(input[i])` into `output[i]`, processing `AVX2_F32_LANES` values per step.
///
/// Per vector:
/// 1. reduce the argument to `r = x - round(x / 2π) * 2π`, so `r` lies in `[-π, π]`;
/// 2. take `|r|` (cosine is even) and fold `|r| > π/2` onto `π - |r|`,
///    remembering that `cos(π - t) = -cos(t)`;
/// 3. evaluate the even Taylor polynomial through `x^12` in Horner form;
/// 4. negate the lanes that were folded.
///
/// Elements after the last full vector are computed with scalar `f32::cos`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_cos_f32(input: &[f32], output: &mut [f32]) {
    // 2π and π as an f32 head plus a small correction term.
    const TWO_PI_HI: f32 = 2.0 * std::f32::consts::PI;
    const TWO_PI_LO: f32 = -1.7484556e-7;
    const PI_LO: f32 = -8.742278e-8;
    // Taylor coefficients of cos for x^12 down to x^2 (highest power first).
    const COEFFS: [f32; 6] = [
        1.0 / 479_001_600.0,
        -1.0 / 3_628_800.0,
        1.0 / 40_320.0,
        -1.0 / 720.0,
        1.0 / 24.0,
        -0.5,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F32_LANES - 1);

    let two_pi_hi = _mm256_set1_ps(TWO_PI_HI);
    let two_pi_lo = _mm256_set1_ps(TWO_PI_LO);
    let pi = _mm256_set1_ps(std::f32::consts::PI);
    let pi_lo = _mm256_set1_ps(PI_LO);
    let half_pi = _mm256_set1_ps(std::f32::consts::FRAC_PI_2);
    let one = _mm256_set1_ps(1.0);
    let sign_bit = _mm256_set1_ps(-0.0);

    for i in (0..simd_len).step_by(AVX2_F32_LANES) {
        let x = _mm256_loadu_ps(input.as_ptr().add(i));
        let k = _mm256_round_ps(_mm256_div_ps(x, two_pi_hi), _MM_FROUND_TO_NEAREST_INT);
        // fnmadd(a, b, c) = c - a*b, i.e. r = x - k*2π.
        let r = _mm256_fnmadd_ps(k, two_pi_lo, _mm256_fnmadd_ps(k, two_pi_hi, x));

        let abs_r = _mm256_andnot_ps(sign_bit, r);
        let needs_fold = _mm256_cmp_ps(abs_r, half_pi, _CMP_GT_OQ);
        let folded = _mm256_add_ps(_mm256_sub_ps(pi, abs_r), pi_lo);
        // Keep the polynomial argument inside [0, π/2], where the series converges quickly.
        let t = _mm256_blendv_ps(abs_r, folded, needs_fold);

        let t2 = _mm256_mul_ps(t, t);
        let mut p = _mm256_set1_ps(COEFFS[0]);
        for &c in COEFFS[1..].iter() {
            p = _mm256_fmadd_ps(p, t2, _mm256_set1_ps(c));
        }
        // cos(t) = 1 + t^2 * p(t^2)
        let cos_t = _mm256_fmadd_ps(t2, p, one);

        // Flip the sign only in lanes that were folded.
        let flip = _mm256_and_ps(needs_fold, sign_bit);
        _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_xor_ps(cos_t, flip));
    }

    for i in simd_len..len {
        output[i] = input[i].cos();
    }
}
/// Computes the element-wise cosine of an `f64` array.
///
/// Each element is evaluated with the scalar `cos` method through
/// `Array::map`; this body does not call the AVX2 kernel.
///
/// Returns the array produced by `Array::map`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_cos_f64(input: &Array<f64>) -> Array<f64> {
// Scalar per-element evaluation; no `unsafe` and no CPU feature requirement on this path.
input.map(|x| x.cos())
}
/// Writes `cos(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Per vector:
/// 1. reduce the argument to `r = x - round(x / 2π) * 2π` using a two-part 2π;
/// 2. take `|r|` (cosine is even) and fold `|r| > π/2` onto `π - |r|`,
///    remembering that `cos(π - t) = -cos(t)`;
/// 3. evaluate the even Taylor polynomial through `x^20` in Horner form;
/// 4. negate the lanes that were folded.
///
/// Elements after the last full vector are computed with scalar `f64::cos`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_cos_f64(input: &[f64], output: &mut [f64]) {
    // 2π and π as an f64 head plus the remainder that does not fit in the head.
    const TWO_PI_HI: f64 = 2.0 * std::f64::consts::PI;
    const TWO_PI_LO: f64 = 2.4492935982947064e-16;
    const PI_LO: f64 = 1.2246467991473532e-16;
    // Taylor coefficients of cos for x^20 down to x^2 (highest power first).
    const COEFFS: [f64; 10] = [
        1.0 / 2_432_902_008_176_640_000.0,
        -1.0 / 6_402_373_705_728_000.0,
        1.0 / 20_922_789_888_000.0,
        -1.0 / 87_178_291_200.0,
        1.0 / 479_001_600.0,
        -1.0 / 3_628_800.0,
        1.0 / 40_320.0,
        -1.0 / 720.0,
        1.0 / 24.0,
        -0.5,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let two_pi_hi = _mm256_set1_pd(TWO_PI_HI);
    let two_pi_lo = _mm256_set1_pd(TWO_PI_LO);
    let pi = _mm256_set1_pd(std::f64::consts::PI);
    let pi_lo = _mm256_set1_pd(PI_LO);
    let half_pi = _mm256_set1_pd(std::f64::consts::FRAC_PI_2);
    let one = _mm256_set1_pd(1.0);
    let sign_bit = _mm256_set1_pd(-0.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let k = _mm256_round_pd(_mm256_div_pd(x, two_pi_hi), _MM_FROUND_TO_NEAREST_INT);
        let r = _mm256_fnmadd_pd(k, two_pi_lo, _mm256_fnmadd_pd(k, two_pi_hi, x));

        let abs_r = _mm256_andnot_pd(sign_bit, r);
        let needs_fold = _mm256_cmp_pd(abs_r, half_pi, _CMP_GT_OQ);
        let folded = _mm256_add_pd(_mm256_sub_pd(pi, abs_r), pi_lo);
        // Keep the polynomial argument inside [0, π/2], where the series converges quickly.
        let t = _mm256_blendv_pd(abs_r, folded, needs_fold);

        let t2 = _mm256_mul_pd(t, t);
        let mut p = _mm256_set1_pd(COEFFS[0]);
        for &c in COEFFS[1..].iter() {
            p = _mm256_fmadd_pd(p, t2, _mm256_set1_pd(c));
        }
        // cos(t) = 1 + t^2 * p(t^2)
        let cos_t = _mm256_fmadd_pd(t2, p, one);

        // Flip the sign only in lanes that were folded.
        let flip = _mm256_and_pd(needs_fold, sign_bit);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), _mm256_xor_pd(cos_t, flip));
    }

    for i in simd_len..len {
        output[i] = input[i].cos();
    }
}
/// Computes the element-wise tangent of an `f32` array.
///
/// Each element is evaluated with the scalar `tan` method through
/// `Array::map`; this body does not call the AVX2 kernel.
///
/// Returns the array produced by `Array::map`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_tan_f32(input: &Array<f32>) -> Array<f32> {
// Scalar per-element evaluation; no `unsafe` and no CPU feature requirement on this path.
input.map(|x| x.tan())
}
/// Writes `tan(input[i])` into `output[i]`, processing `AVX2_F32_LANES` values per step.
///
/// Per vector:
/// 1. reduce by the period of tangent: `r = x - round(x / π) * π`,
///    so `r` lies in `[-π/2, π/2]`;
/// 2. evaluate `sin(r)` (odd polynomial through `x^11`) and `cos(r)`
///    (even polynomial through `x^12`) in Horner form;
/// 3. return `sin(r) / cos(r)`. The sign comes from the odd sine polynomial,
///    so no separate sign handling is needed.
///
/// Elements after the last full vector are computed with scalar `f32::tan`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_tan_f32(input: &[f32], output: &mut [f32]) {
    // Correction to the f32 value of π, applied as a second reduction step.
    const PI_LO: f32 = -8.742278e-8;
    // Taylor coefficients of sin for x^11 down to x^3 (highest power first).
    const SIN_COEFFS: [f32; 5] = [
        -1.0 / 39_916_800.0,
        1.0 / 362_880.0,
        -1.0 / 5_040.0,
        1.0 / 120.0,
        -1.0 / 6.0,
    ];
    // Taylor coefficients of cos for x^12 down to x^2 (highest power first).
    const COS_COEFFS: [f32; 6] = [
        1.0 / 479_001_600.0,
        -1.0 / 3_628_800.0,
        1.0 / 40_320.0,
        -1.0 / 720.0,
        1.0 / 24.0,
        -0.5,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F32_LANES - 1);

    let pi_hi = _mm256_set1_ps(std::f32::consts::PI);
    let pi_lo = _mm256_set1_ps(PI_LO);
    let one = _mm256_set1_ps(1.0);

    for i in (0..simd_len).step_by(AVX2_F32_LANES) {
        let x = _mm256_loadu_ps(input.as_ptr().add(i));
        let k = _mm256_round_ps(_mm256_div_ps(x, pi_hi), _MM_FROUND_TO_NEAREST_INT);
        // fnmadd(a, b, c) = c - a*b, i.e. r = x - k*π (fmsub would yield k*π - x).
        let r = _mm256_fnmadd_ps(k, pi_lo, _mm256_fnmadd_ps(k, pi_hi, x));
        let r2 = _mm256_mul_ps(r, r);

        let mut ps = _mm256_set1_ps(SIN_COEFFS[0]);
        for &c in SIN_COEFFS[1..].iter() {
            ps = _mm256_fmadd_ps(ps, r2, _mm256_set1_ps(c));
        }
        // sin(r) = r + r^3 * ps(r^2)
        let sin_r = _mm256_fmadd_ps(_mm256_mul_ps(r, r2), ps, r);

        let mut pc = _mm256_set1_ps(COS_COEFFS[0]);
        for &c in COS_COEFFS[1..].iter() {
            pc = _mm256_fmadd_ps(pc, r2, _mm256_set1_ps(c));
        }
        // cos(r) = 1 + r^2 * pc(r^2)
        let cos_r = _mm256_fmadd_ps(r2, pc, one);

        _mm256_storeu_ps(output.as_mut_ptr().add(i), _mm256_div_ps(sin_r, cos_r));
    }

    for i in simd_len..len {
        output[i] = input[i].tan();
    }
}
/// Computes the element-wise tangent of an `f64` array.
///
/// Each element is evaluated with the scalar `tan` method through
/// `Array::map`; this body does not call the AVX2 kernel.
///
/// Returns the array produced by `Array::map`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_tan_f64(input: &Array<f64>) -> Array<f64> {
// Scalar per-element evaluation; no `unsafe` and no CPU feature requirement on this path.
input.map(|x| x.tan())
}
/// Writes `tan(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Per vector:
/// 1. reduce by the period of tangent: `r = x - round(x / π) * π` using a
///    two-part π, so `r` lies in `[-π/2, π/2]`;
/// 2. evaluate `sin(r)` (odd polynomial through `x^21`) and `cos(r)`
///    (even polynomial through `x^20`) in Horner form;
/// 3. return `sin(r) / cos(r)`. The sign comes from the odd sine polynomial.
///
/// Elements after the last full vector are computed with scalar `f64::tan`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_tan_f64(input: &[f64], output: &mut [f64]) {
    // Remainder of π that does not fit in the f64 constant.
    const PI_LO: f64 = 1.2246467991473532e-16;
    // Taylor coefficients of sin for x^21 down to x^3 (highest power first).
    const SIN_COEFFS: [f64; 10] = [
        1.0 / 51_090_942_171_709_440_000.0,
        -1.0 / 121_645_100_408_832_000.0,
        1.0 / 355_687_428_096_000.0,
        -1.0 / 1_307_674_368_000.0,
        1.0 / 6_227_020_800.0,
        -1.0 / 39_916_800.0,
        1.0 / 362_880.0,
        -1.0 / 5_040.0,
        1.0 / 120.0,
        -1.0 / 6.0,
    ];
    // Taylor coefficients of cos for x^20 down to x^2 (highest power first).
    const COS_COEFFS: [f64; 10] = [
        1.0 / 2_432_902_008_176_640_000.0,
        -1.0 / 6_402_373_705_728_000.0,
        1.0 / 20_922_789_888_000.0,
        -1.0 / 87_178_291_200.0,
        1.0 / 479_001_600.0,
        -1.0 / 3_628_800.0,
        1.0 / 40_320.0,
        -1.0 / 720.0,
        1.0 / 24.0,
        -0.5,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let pi_hi = _mm256_set1_pd(std::f64::consts::PI);
    let pi_lo = _mm256_set1_pd(PI_LO);
    let one = _mm256_set1_pd(1.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let k = _mm256_round_pd(_mm256_div_pd(x, pi_hi), _MM_FROUND_TO_NEAREST_INT);
        // r = x - k*π, with the product fused so it is not rounded before the subtraction.
        let r = _mm256_fnmadd_pd(k, pi_lo, _mm256_fnmadd_pd(k, pi_hi, x));
        let r2 = _mm256_mul_pd(r, r);

        let mut ps = _mm256_set1_pd(SIN_COEFFS[0]);
        for &c in SIN_COEFFS[1..].iter() {
            ps = _mm256_fmadd_pd(ps, r2, _mm256_set1_pd(c));
        }
        // sin(r) = r + r^3 * ps(r^2)
        let sin_r = _mm256_fmadd_pd(_mm256_mul_pd(r, r2), ps, r);

        let mut pc = _mm256_set1_pd(COS_COEFFS[0]);
        for &c in COS_COEFFS[1..].iter() {
            pc = _mm256_fmadd_pd(pc, r2, _mm256_set1_pd(c));
        }
        // cos(r) = 1 + r^2 * pc(r^2)
        let cos_r = _mm256_fmadd_pd(r2, pc, one);

        _mm256_storeu_pd(output.as_mut_ptr().add(i), _mm256_div_pd(sin_r, cos_r));
    }

    for i in simd_len..len {
        output[i] = input[i].tan();
    }
}
/// Computes the element-wise hyperbolic sine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `sinh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_sinh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.sinh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_sinh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `sinh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Two evaluations are blended per lane:
/// - `|x| < 1`: the Taylor series through `x^17`, because `(e^x - e^-x) / 2`
///   subtracts two nearly equal numbers there and loses most of its digits;
/// - otherwise: `(e^x - e^-x) / 2` using `Self::simd_exp_pd`.
///
/// NaN lanes fail the `|x| < 1` comparison and therefore take the exponential
/// path. Elements after the last full vector are computed with scalar `f64::sinh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_sinh_f64(input: &[f64], output: &mut [f64]) {
    // Taylor coefficients of sinh for x^17 down to x^3 (highest power first).
    const SERIES: [f64; 8] = [
        1.0 / 355_687_428_096_000.0,
        1.0 / 1_307_674_368_000.0,
        1.0 / 6_227_020_800.0,
        1.0 / 39_916_800.0,
        1.0 / 362_880.0,
        1.0 / 5_040.0,
        1.0 / 120.0,
        1.0 / 6.0,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let half = _mm256_set1_pd(0.5);
    let one = _mm256_set1_pd(1.0);
    let sign_bit = _mm256_set1_pd(-0.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));

        // Exponential form, accurate once |x| is not small.
        let exp_pos = Self::simd_exp_pd(x);
        let exp_neg = Self::simd_exp_pd(_mm256_xor_pd(x, sign_bit));
        let via_exp = _mm256_mul_pd(_mm256_sub_pd(exp_pos, exp_neg), half);

        // Series form: sinh(x) = x + x^3 * p(x^2).
        let x2 = _mm256_mul_pd(x, x);
        let mut p = _mm256_set1_pd(SERIES[0]);
        for &c in SERIES[1..].iter() {
            p = _mm256_fmadd_pd(p, x2, _mm256_set1_pd(c));
        }
        let via_series = _mm256_fmadd_pd(_mm256_mul_pd(x, x2), p, x);

        let is_small = _mm256_cmp_pd(_mm256_andnot_pd(sign_bit, x), one, _CMP_LT_OQ);
        let result = _mm256_blendv_pd(via_exp, via_series, is_small);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].sinh();
    }
}
/// Computes the element-wise hyperbolic cosine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `cosh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_cosh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.cosh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_cosh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `cosh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Each vector is evaluated as `(e^x + e^-x) / 2`, with both exponentials
/// obtained from `Self::simd_exp_pd`. Elements after the last full vector are
/// computed with scalar `f64::cosh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_cosh_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let half = _mm256_set1_pd(0.5);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let exp_pos = Self::simd_exp_pd(x);
        let exp_neg = Self::simd_exp_pd(_mm256_sub_pd(_mm256_setzero_pd(), x));
        let result = _mm256_mul_pd(_mm256_add_pd(exp_pos, exp_neg), half);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].cosh();
    }
}
/// Computes the element-wise hyperbolic tangent of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `tanh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_tanh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.tanh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_tanh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `tanh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Each vector is evaluated as `1 - 2 / (e^{2x} + 1)`, with the exponential
/// obtained from `Self::simd_exp_pd`. This form saturates correctly: when
/// `e^{2x}` overflows to infinity the quotient is `0` and the result is `1`,
/// and when it underflows to `0` the result is `-1`. The algebraically equal
/// `(e^{2x} - 1) / (e^{2x} + 1)` would produce `inf / inf = NaN` for large `x`.
///
/// NOTE(review): for very small `|x|` the subtraction from 1 still loses
/// relative precision compared with scalar `tanh`.
///
/// Elements after the last full vector are computed with scalar `f64::tanh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_tanh_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);
    let two = _mm256_set1_pd(2.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let exp_2x = Self::simd_exp_pd(_mm256_mul_pd(x, two));
        // 2 / (e^{2x} + 1) is finite for every non-NaN input, so no inf/inf can occur.
        let quotient = _mm256_div_pd(two, _mm256_add_pd(exp_2x, one));
        let result = _mm256_sub_pd(one, quotient);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].tanh();
    }
}
/// Computes the element-wise arcsine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `asin` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_asin_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.asin());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_asin_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `asin(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Uses the identity `asin(x) = atan2(x, sqrt(1 - x²))`, with the arctangent
/// obtained from `Self::simd_atan2_pd`. For `|x| > 1` the square root of a
/// negative number is NaN, which propagates to the result.
///
/// Elements after the last full vector are computed with scalar `f64::asin`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_asin_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        // fnmadd(x, x, 1) = 1 - x*x with a single rounding; rounding x*x first
        // loses digits when |x| is close to 1.
        let one_minus_x_sq = _mm256_fnmadd_pd(x, x, one);
        let root = _mm256_sqrt_pd(one_minus_x_sq);
        let result = Self::simd_atan2_pd(x, root);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].asin();
    }
}
/// Computes the element-wise arccosine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `acos` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_acos_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.acos());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_acos_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `acos(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Uses the identity `acos(x) = atan2(sqrt(1 - x²), x)`, with the arctangent
/// obtained from `Self::simd_atan2_pd`. For `|x| > 1` the square root of a
/// negative number is NaN, which propagates to the result.
///
/// Elements after the last full vector are computed with scalar `f64::acos`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_acos_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        // fnmadd(x, x, 1) = 1 - x*x with a single rounding; rounding x*x first
        // loses digits when |x| is close to 1.
        let one_minus_x_sq = _mm256_fnmadd_pd(x, x, one);
        let root = _mm256_sqrt_pd(one_minus_x_sq);
        let result = Self::simd_atan2_pd(root, x);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].acos();
    }
}
/// Computes the element-wise arctangent of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `atan` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_atan_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.atan());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_atan_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `atan(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Each vector is handed to `Self::simd_atan_pd`; elements after the last
/// full vector are computed with scalar `f64::atan`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_atan_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        _mm256_storeu_pd(output.as_mut_ptr().add(i), Self::simd_atan_pd(x));
    }

    for i in simd_len..len {
        output[i] = input[i].atan();
    }
}
/// Computes the element-wise inverse hyperbolic sine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `asinh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_asinh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.asinh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_asinh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `asinh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Uses `asinh(x) = sign(x) * ln(|x| + sqrt(x² + 1))`, with the logarithm
/// obtained from `Self::simd_log_pd`. Working on `|x|` matters: for large
/// negative `x` the direct form `x + sqrt(x² + 1)` cancels to `0` and the
/// logarithm returns `-inf`.
///
/// NOTE(review): for very small `|x|` the logarithm of a value close to 1
/// still loses relative precision compared with scalar `asinh`.
///
/// Elements after the last full vector are computed with scalar `f64::asinh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_asinh_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);
    let sign_bit = _mm256_set1_pd(-0.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        let sign = _mm256_and_pd(x, sign_bit);
        let abs_x = _mm256_andnot_pd(sign_bit, x);

        // x*x + 1 fused into one rounding.
        let root = _mm256_sqrt_pd(_mm256_fmadd_pd(x, x, one));
        // The logarithm argument is >= 1, so the magnitude is non-negative and
        // the sign can be restored by OR-ing the sign bit back in.
        let magnitude = Self::simd_log_pd(_mm256_add_pd(abs_x, root));
        _mm256_storeu_pd(output.as_mut_ptr().add(i), _mm256_or_pd(magnitude, sign));
    }

    for i in simd_len..len {
        output[i] = input[i].asinh();
    }
}
/// Computes the element-wise inverse hyperbolic cosine of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `acosh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_acosh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.acosh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_acosh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `acosh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Uses `acosh(x) = ln(x + sqrt(x² - 1))`, with the logarithm obtained from
/// `Self::simd_log_pd`. For `|x| < 1` the square root of a negative number is
/// NaN, which propagates to the result.
///
/// Elements after the last full vector are computed with scalar `f64::acosh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_acosh_f64(input: &[f64], output: &mut [f64]) {
    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));
        // fmsub(x, x, 1) = x*x - 1 with a single rounding; rounding x*x first
        // loses digits when x is close to 1.
        let x_sq_minus_1 = _mm256_fmsub_pd(x, x, one);
        let root = _mm256_sqrt_pd(x_sq_minus_1);
        let result = Self::simd_log_pd(_mm256_add_pd(x, root));
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].acosh();
    }
}
/// Computes the element-wise inverse hyperbolic tangent of an `f64` array.
///
/// Dispatches to the AVX2/FMA kernel when the running CPU reports both
/// features; otherwise each element is evaluated with scalar `atanh` through
/// `Array::map`.
///
/// The result is reshaped to `input.shape()`.
#[cfg(target_arch = "x86_64")]
pub fn vectorized_atanh_f64(input: &Array<f64>) -> Array<f64> {
    // Calling a `#[target_feature]` function on a CPU without those features is
    // undefined behaviour, so detect them at runtime first.
    if !(is_x86_feature_detected!("avx2") && is_x86_feature_detected!("fma")) {
        return input.map(|x| x.atanh());
    }
    let data = input.to_vec();
    let mut result = vec![0.0f64; data.len()];
    // SAFETY: AVX2 and FMA were detected above, and `result` has exactly
    // `data.len()` elements for the kernel to write.
    unsafe {
        Self::avx2_atanh_f64(&data, &mut result);
    }
    Array::from_vec(result).reshape(&input.shape())
}
/// Writes `atanh(input[i])` into `output[i]`, processing `AVX2_F64_LANES` values per step.
///
/// Two evaluations are blended per lane:
/// - `|x| < 0.1`: the series `x + x³/3 + … + x¹⁷/17`, because the ratio
///   `(1 + x) / (1 - x)` is then so close to 1 that its logarithm loses most
///   of its digits (and the sign of `-0.0`);
/// - otherwise: `0.5 * ln((1 + x) / (1 - x))` using `Self::simd_log_pd`.
///
/// NaN lanes fail the `|x| < 0.1` comparison and take the logarithm path.
/// Elements after the last full vector are computed with scalar `f64::atanh`.
///
/// # Panics
/// Panics if `output` is shorter than `input`.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn avx2_atanh_f64(input: &[f64], output: &mut [f64]) {
    // Series coefficients of atanh for x^17 down to x^3 (highest power first).
    const SERIES: [f64; 8] = [
        1.0 / 17.0,
        1.0 / 15.0,
        1.0 / 13.0,
        1.0 / 11.0,
        1.0 / 9.0,
        1.0 / 7.0,
        1.0 / 5.0,
        1.0 / 3.0,
    ];

    let len = input.len();
    // The vector stores below go through raw pointers, so check the bound once up front.
    assert!(output.len() >= len, "output slice is shorter than input");
    let simd_len = len & !(AVX2_F64_LANES - 1);

    let one = _mm256_set1_pd(1.0);
    let half = _mm256_set1_pd(0.5);
    let series_limit = _mm256_set1_pd(0.1);
    let sign_bit = _mm256_set1_pd(-0.0);

    for i in (0..simd_len).step_by(AVX2_F64_LANES) {
        // Only prefetch addresses that are still inside `input`.
        if i + PREFETCH_DISTANCE / 2 < len {
            _mm_prefetch(
                input.as_ptr().add(i + PREFETCH_DISTANCE / 2) as *const i8,
                _MM_HINT_T0,
            );
        }

        let x = _mm256_loadu_pd(input.as_ptr().add(i));

        // Logarithm form.
        let ratio = _mm256_div_pd(_mm256_add_pd(one, x), _mm256_sub_pd(one, x));
        let via_log = _mm256_mul_pd(half, Self::simd_log_pd(ratio));

        // Series form: atanh(x) = x + x^3 * p(x^2).
        let x2 = _mm256_mul_pd(x, x);
        let mut p = _mm256_set1_pd(SERIES[0]);
        for &c in SERIES[1..].iter() {
            p = _mm256_fmadd_pd(p, x2, _mm256_set1_pd(c));
        }
        let via_series = _mm256_fmadd_pd(_mm256_mul_pd(x, x2), p, x);

        let is_small = _mm256_cmp_pd(_mm256_andnot_pd(sign_bit, x), series_limit, _CMP_LT_OQ);
        let result = _mm256_blendv_pd(via_log, via_series, is_small);
        _mm256_storeu_pd(output.as_mut_ptr().add(i), result);
    }

    for i in simd_len..len {
        output[i] = input[i].atanh();
    }
}
/// Applies `f64::exp` to each of the four lanes of `x`.
///
/// The vector is spilled to a stack array, every lane is evaluated with the
/// scalar function, and the array is loaded back into a vector.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn simd_exp_pd(x: __m256d) -> __m256d {
    let mut lanes = [0.0f64; 4];
    _mm256_storeu_pd(lanes.as_mut_ptr(), x);
    for lane in lanes.iter_mut() {
        *lane = lane.exp();
    }
    _mm256_loadu_pd(lanes.as_ptr())
}
/// Applies `f64::ln` to each of the four lanes of `x`.
///
/// The vector is spilled to a stack array, every lane is evaluated with the
/// scalar function, and the array is loaded back into a vector.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn simd_log_pd(x: __m256d) -> __m256d {
    let mut lanes = [0.0f64; 4];
    _mm256_storeu_pd(lanes.as_mut_ptr(), x);
    for lane in lanes.iter_mut() {
        *lane = lane.ln();
    }
    _mm256_loadu_pd(lanes.as_ptr())
}
/// Applies `f64::atan` to each of the four lanes of `x`.
///
/// The vector is spilled to a stack array, every lane is evaluated with the
/// scalar function, and the array is loaded back into a vector.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn simd_atan_pd(x: __m256d) -> __m256d {
    let mut lanes = [0.0f64; 4];
    _mm256_storeu_pd(lanes.as_mut_ptr(), x);
    for lane in lanes.iter_mut() {
        *lane = lane.atan();
    }
    _mm256_loadu_pd(lanes.as_ptr())
}
/// Computes `y[i].atan2(x[i])` for each of the four lanes.
///
/// Both vectors are spilled to stack arrays, every lane pair is evaluated
/// with scalar `f64::atan2`, and the results are loaded back into a vector.
///
/// # Safety
/// The caller must ensure the running CPU supports AVX2 and FMA.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn simd_atan2_pd(y: __m256d, x: __m256d) -> __m256d {
    let mut ordinates = [0.0f64; 4];
    let mut abscissas = [0.0f64; 4];
    _mm256_storeu_pd(ordinates.as_mut_ptr(), y);
    _mm256_storeu_pd(abscissas.as_mut_ptr(), x);
    // Results overwrite the `y` lanes in place, pairing lane i of `y` with lane i of `x`.
    for (num, den) in ordinates.iter_mut().zip(abscissas.iter()) {
        *num = num.atan2(*den);
    }
    _mm256_loadu_pd(ordinates.as_ptr())
}
}