#![allow(clippy::unreadable_literal, clippy::inline_always)]
#![allow(clippy::excessive_precision)]
const PI2_HI: f64 = 1.57079625129699707031e+0; const PI2_MI: f64 = 7.54978941586159635335e-8; const PI2_LO: f64 = 5.39030285815811905290e-15; const TWO_OVER_PI: f64 = 2.0 / std::f64::consts::PI;
#[inline(always)]
#[must_use]
pub fn sin_fast_f64(x: f64) -> f64 {
let kf = (x * TWO_OVER_PI).round();
let y = kf.mul_add(-PI2_LO, kf.mul_add(-PI2_MI, kf.mul_add(-PI2_HI, x)));
let y2 = y * y;
let sin_y = sin_poly_cephes(y, y2);
let cos_y = cos_poly_cephes(y2);
let q4 = kf - 4.0 * (kf * 0.25).floor();
let swap = kf - 2.0 * (kf * 0.5).floor();
let neg = (q4 * 0.5).floor();
let base = if swap != 0.0 { cos_y } else { sin_y };
(1.0 - 2.0 * neg) * base
}
#[inline(always)]
#[must_use]
pub fn cos_fast_f64(x: f64) -> f64 {
let kf = (x * TWO_OVER_PI).round();
let y = kf.mul_add(-PI2_LO, kf.mul_add(-PI2_MI, kf.mul_add(-PI2_HI, x)));
let y2 = y * y;
let sin_y = sin_poly_cephes(y, y2);
let cos_y = cos_poly_cephes(y2);
let q4 = kf - 4.0 * (kf * 0.25).floor();
let swap = kf - 2.0 * (kf * 0.5).floor();
let neg_sin = (q4 * 0.5).floor();
let neg = neg_sin + swap - 2.0 * neg_sin * swap;
let base = if swap != 0.0 { sin_y } else { cos_y };
(1.0 - 2.0 * neg) * base
}
#[inline(always)]
fn sin_poly_cephes(y: f64, y2: f64) -> f64 {
let p = 1.58962301576546568060e-10_f64;
let p = p.mul_add(y2, -2.50507477628578072866e-8);
let p = p.mul_add(y2, 2.75573136213857245213e-6);
let p = p.mul_add(y2, -1.98412698295895385996e-4);
let p = p.mul_add(y2, 8.33333333332211858878e-3);
let p = p.mul_add(y2, -1.66666666666666307295e-1);
y.mul_add(p * y2, y)
}
#[inline(always)]
fn cos_poly_cephes(y2: f64) -> f64 {
let p = -1.13585365213876817300e-11_f64;
let p = p.mul_add(y2, 2.08757008419747316778e-9);
let p = p.mul_add(y2, -2.75573141792967388112e-7);
let p = p.mul_add(y2, 2.48015872888517045348e-5);
let p = p.mul_add(y2, -1.38888888888730564116e-3);
let p = p.mul_add(y2, 4.16666666666665929218e-2);
let half_y2 = y2 * 0.5;
let one_minus = 1.0 - half_y2;
let y4 = y2 * y2;
y4.mul_add(p, one_minus)
}
#[inline(always)]
#[must_use]
pub fn sin_fast_f32(x: f32) -> f32 {
sin_fast_f64(f64::from(x)) as f32
}
#[inline(always)]
#[must_use]
pub fn cos_fast_f32(x: f32) -> f32 {
cos_fast_f64(f64::from(x)) as f32
}
macro_rules! simd_batch {
($name:ident, $avx:ident, $ty:ty, $kernel:path, $libm:path) => {
#[doc = concat!("Batch fast ", stringify!($kernel), " over slices — runtime-multiversioned (AVX2+FMA when present, libm fallback otherwise).")]
#[inline(never)]
pub fn $name(input: &[$ty], output: &mut [$ty]) {
debug_assert_eq!(input.len(), output.len());
#[cfg(target_arch = "x86_64")]
{
if std::is_x86_feature_detected!("avx2") && std::is_x86_feature_detected!("fma") {
unsafe { $avx(input, output) };
} else {
for (o, &x) in output.iter_mut().zip(input) {
*o = $libm(x);
}
}
}
#[cfg(not(target_arch = "x86_64"))]
for (o, &x) in output.iter_mut().zip(input) {
*o = $kernel(x);
}
}
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2,fma")]
unsafe fn $avx(input: &[$ty], output: &mut [$ty]) {
for (o, &x) in output.iter_mut().zip(input) {
*o = $kernel(x);
}
}
};
}
simd_batch!(
sin_fast_batch_f64,
sin_fast_batch_f64_avx2,
f64,
sin_fast_f64,
f64::sin
);
simd_batch!(
cos_fast_batch_f64,
cos_fast_batch_f64_avx2,
f64,
cos_fast_f64,
f64::cos
);
simd_batch!(
sin_fast_batch_f32,
sin_fast_batch_f32_avx2,
f32,
sin_fast_f32,
f32::sin
);
simd_batch!(
cos_fast_batch_f32,
cos_fast_batch_f32_avx2,
f32,
cos_fast_f32,
f32::cos
);
#[cfg(test)]
mod tests {
use super::*;
use std::f64::consts::{FRAC_PI_2, FRAC_PI_4, PI, TAU};
#[test]
fn sin_fast_zero() {
assert_eq!(sin_fast_f64(0.0), 0.0);
}
#[test]
fn sin_fast_common_angles() {
assert!((sin_fast_f64(FRAC_PI_2) - 1.0).abs() < 1e-14);
assert!((sin_fast_f64(PI)).abs() < 1e-14);
assert!((sin_fast_f64(3.0 * FRAC_PI_2) + 1.0).abs() < 1e-14);
assert!((sin_fast_f64(TAU)).abs() < 1e-13);
}
#[test]
fn sin_fast_within_1_ulp_of_libm() {
let mut max_ulp = 0.0_f64;
for i in -10_000..=10_000 {
let x = f64::from(i) * 0.001; let fast = sin_fast_f64(x);
let libm = x.sin();
if libm == 0.0 {
continue;
}
let ulp = (fast - libm).abs() / (libm.abs() * f64::EPSILON);
if ulp > max_ulp {
max_ulp = ulp;
}
assert!(
ulp <= 4.0,
"sin_fast_f64({x}): fast={fast} libm={libm} ulp={ulp}"
);
}
assert!(max_ulp < 4.0, "max ulp {max_ulp} too high");
}
#[test]
fn sin_fast_nan_inf() {
assert!(sin_fast_f64(f64::NAN).is_nan());
assert!(sin_fast_f64(f64::INFINITY).is_nan());
assert!(sin_fast_f64(f64::NEG_INFINITY).is_nan());
}
#[test]
fn cos_fast_zero_is_one() {
assert_eq!(cos_fast_f64(0.0), 1.0);
}
#[test]
fn cos_fast_common_angles() {
assert!((cos_fast_f64(0.0) - 1.0).abs() < 1e-15);
assert!((cos_fast_f64(FRAC_PI_2)).abs() < 1e-14);
assert!((cos_fast_f64(PI) + 1.0).abs() < 1e-14);
assert!((cos_fast_f64(TAU) - 1.0).abs() < 1e-13);
}
#[test]
fn cos_fast_within_bound_of_libm() {
let mut max_ulp = 0.0_f64;
for i in -10_000..=10_000 {
let x = f64::from(i) * 0.001;
let fast = cos_fast_f64(x);
let libm = x.cos();
if libm == 0.0 {
continue;
}
let ulp = (fast - libm).abs() / (libm.abs() * f64::EPSILON);
if ulp > max_ulp {
max_ulp = ulp;
}
assert!(ulp <= 4.0, "cos_fast_f64({x}): ulp={ulp}");
}
}
#[test]
fn cos_fast_nan_inf() {
assert!(cos_fast_f64(f64::NAN).is_nan());
assert!(cos_fast_f64(f64::INFINITY).is_nan());
assert!(cos_fast_f64(f64::NEG_INFINITY).is_nan());
}
#[test]
fn pythagorean_identity() {
for i in -1_000..=1_000 {
let x = f64::from(i) * 0.01;
let s = sin_fast_f64(x);
let c = cos_fast_f64(x);
let sum = s * s + c * c;
assert!((sum - 1.0).abs() < 1e-13, "sin² + cos² at x={x}: {sum}");
}
}
#[test]
fn sin_cos_fast_f32_basic() {
assert!((sin_fast_f32(0.0_f32)).abs() < 1e-6);
assert!((cos_fast_f32(0.0_f32) - 1.0).abs() < 1e-6);
assert!((sin_fast_f32(std::f32::consts::FRAC_PI_2) - 1.0).abs() < 1e-6);
assert!((cos_fast_f32(std::f32::consts::PI) + 1.0).abs() < 1e-6);
assert!(sin_fast_f32(f32::NAN).is_nan());
}
#[test]
fn sin_fast_batch_matches_scalar_f64() {
let input: Vec<f64> = (-100..=100).map(|i| f64::from(i) * 0.1).collect();
let mut output = vec![0.0_f64; input.len()];
sin_fast_batch_f64(&input, &mut output);
for (i, &x) in input.iter().enumerate() {
assert_eq!(output[i].to_bits(), sin_fast_f64(x).to_bits());
}
}
#[test]
fn cos_fast_batch_matches_scalar_f64() {
let input: Vec<f64> = (-100..=100).map(|i| f64::from(i) * 0.1).collect();
let mut output = vec![0.0_f64; input.len()];
cos_fast_batch_f64(&input, &mut output);
for (i, &x) in input.iter().enumerate() {
assert_eq!(output[i].to_bits(), cos_fast_f64(x).to_bits());
}
}
#[test]
fn sin_cos_fast_batch_matches_scalar_f32() {
let input: Vec<f32> = (-100..=100).map(|i| i as f32 * 0.1).collect();
let mut sout = vec![0.0_f32; input.len()];
let mut cout = vec![0.0_f32; input.len()];
sin_fast_batch_f32(&input, &mut sout);
cos_fast_batch_f32(&input, &mut cout);
for (i, &x) in input.iter().enumerate() {
assert_eq!(sout[i].to_bits(), sin_fast_f32(x).to_bits());
assert_eq!(cout[i].to_bits(), cos_fast_f32(x).to_bits());
}
}
#[test]
fn pi_over_4_boundary() {
let sqrt2_over_2 = std::f64::consts::FRAC_1_SQRT_2;
assert!((sin_fast_f64(FRAC_PI_4) - sqrt2_over_2).abs() < 1e-15);
assert!((cos_fast_f64(FRAC_PI_4) - sqrt2_over_2).abs() < 1e-15);
}
}