#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
/// Returns the largest of the eight `f32` lanes of `v`.
///
/// The vector is folded in three steps: the upper 128-bit half against the
/// lower half, then odd lanes against even lanes, then the upper 64-bit pair
/// against the lower pair. The answer ends up in lane 0 and is extracted.
///
/// NaN lanes are handled however `_mm_max_ps` / `_mm_max_ss` handle them;
/// callers that need a specific NaN policy should filter NaNs beforehand.
///
/// Only compiled on `x86_64`, matching the gated `std::arch::x86_64` import.
///
/// # Safety
///
/// The CPU executing this function must support the `avx2` and `fma` target
/// features enabled below (e.g. verified with `is_x86_feature_detected!`).
// NOTE(review): the body looks like it only needs AVX/SSE3; the wider feature
// set is kept so the contract seen by existing callers does not change.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
#[inline]
pub unsafe fn hmax_f32(v: __m256) -> f32 {
    // 8 lanes -> 4 lanes: lane i holds max(v[i], v[i + 4]).
    let high = _mm256_extractf128_ps(v, 1);
    let low = _mm256_castps256_ps128(v);
    let max128 = _mm_max_ps(low, high);

    // 4 lanes -> 2 useful lanes: duplicate the odd lanes over the even ones so
    // lane 0 = max(m0, m1) and lane 2 = max(m2, m3).
    let shuf = _mm_movehdup_ps(max128);
    let max64 = _mm_max_ps(max128, shuf);

    // 2 lanes -> 1: bring lane 2 down to lane 0 and reduce the scalar lane.
    let shuf2 = _mm_movehl_ps(max64, max64);
    let max32 = _mm_max_ss(max64, shuf2);

    _mm_cvtss_f32(max32)
}
/// Returns the largest of the four `f64` lanes of `v`.
///
/// The vector is folded in two steps: the upper 128-bit half against the
/// lower half, then the remaining high lane against the low lane. The answer
/// ends up in lane 0 and is extracted.
///
/// NaN lanes are handled however `_mm_max_pd` / `_mm_max_sd` handle them;
/// callers that need a specific NaN policy should filter NaNs beforehand.
///
/// Only compiled on `x86_64`, matching the gated `std::arch::x86_64` import.
///
/// # Safety
///
/// The CPU executing this function must support the `avx2` and `fma` target
/// features enabled below (e.g. verified with `is_x86_feature_detected!`).
// NOTE(review): the body looks like it only needs AVX/SSE2; the wider feature
// set is kept so the contract seen by existing callers does not change.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
#[inline]
pub unsafe fn hmax_f64(v: __m256d) -> f64 {
    // 4 lanes -> 2 lanes: lane i holds max(v[i], v[i + 2]).
    let high = _mm256_extractf128_pd(v, 1);
    let low = _mm256_castpd256_pd128(v);
    let max128 = _mm_max_pd(low, high);

    // 2 lanes -> 1: broadcast the high lane and reduce the scalar lane.
    let shuf = _mm_unpackhi_pd(max128, max128);
    let max64 = _mm_max_sd(max128, shuf);

    _mm_cvtsd_f64(max64)
}
/// Returns the sum of the eight `f32` lanes of `v`.
///
/// The additions are performed pairwise in a fixed order:
/// `((v0 + v4) + (v1 + v5)) + ((v2 + v6) + (v3 + v7))`.
/// Because floating-point addition is not associative, the result may differ
/// in the last bits from a left-to-right scalar sum of the same values.
///
/// Only compiled on `x86_64`, matching the gated `std::arch::x86_64` import.
///
/// # Safety
///
/// The CPU executing this function must support the `avx2` and `fma` target
/// features enabled below (e.g. verified with `is_x86_feature_detected!`).
// NOTE(review): the body looks like it only needs AVX/SSE3; the wider feature
// set is kept so the contract seen by existing callers does not change.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
#[inline]
pub unsafe fn hsum_f32(v: __m256) -> f32 {
    // 8 lanes -> 4 lanes: lane i holds v[i] + v[i + 4].
    let high = _mm256_extractf128_ps(v, 1);
    let low = _mm256_castps256_ps128(v);
    let sum128 = _mm_add_ps(low, high);

    // 4 lanes -> 2 useful lanes: duplicate the odd lanes over the even ones so
    // lane 0 = s0 + s1 and lane 2 = s2 + s3.
    let shuf = _mm_movehdup_ps(sum128);
    let sum64 = _mm_add_ps(sum128, shuf);

    // 2 lanes -> 1: bring lane 2 down to lane 0 and add the scalar lane.
    let shuf2 = _mm_movehl_ps(sum64, sum64);
    let sum32 = _mm_add_ss(sum64, shuf2);

    _mm_cvtss_f32(sum32)
}
/// Returns the sum of the four `f64` lanes of `v`.
///
/// The additions are performed pairwise in a fixed order:
/// `(v0 + v2) + (v1 + v3)`.
/// Because floating-point addition is not associative, the result may differ
/// in the last bits from a left-to-right scalar sum of the same values.
///
/// Only compiled on `x86_64`, matching the gated `std::arch::x86_64` import.
///
/// # Safety
///
/// The CPU executing this function must support the `avx2` and `fma` target
/// features enabled below (e.g. verified with `is_x86_feature_detected!`).
// NOTE(review): the body looks like it only needs AVX/SSE2; the wider feature
// set is kept so the contract seen by existing callers does not change.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2", enable = "fma")]
#[inline]
pub unsafe fn hsum_f64(v: __m256d) -> f64 {
    // 4 lanes -> 2 lanes: lane i holds v[i] + v[i + 2].
    let high = _mm256_extractf128_pd(v, 1);
    let low = _mm256_castpd256_pd128(v);
    let sum128 = _mm_add_pd(low, high);

    // 2 lanes -> 1: broadcast the high lane and add the scalar lane.
    let shuf = _mm_unpackhi_pd(sum128, sum128);
    let sum64 = _mm_add_sd(sum128, shuf);

    _mm_cvtsd_f64(sum64)
}