simdeez 3.0.1

SIMD library to abstract over different instruction sets and widths
Documentation
use criterion::Criterion;
use simdeez::math::{SimdMathF32BinaryMisc, SimdMathF64BinaryMisc};
use simdeez::scalar::Scalar;
use simdeez::{prelude::*, simd_unsafe_generate_all};

use crate::shared::{
    self, BenchTargets, BenchTargetsF64, BinaryBenchTargets, BinaryBenchTargetsF64, INPUT_LEN,
};

/// Sums the base-10 logarithm of every `f32` in `input`, using the standard
/// library's `f32::log10` (no simdeez code involved).
#[inline(never)]
fn scalar_log10_sum(input: &[f32]) -> f32 {
    input.iter().map(|value| value.log10()).sum::<f32>()
}

/// Sums `f32::atan2` over the element pairs of `a` and `b`.
///
/// Elements of `a` are the receivers (the y argument of `atan2`) and elements
/// of `b` the x argument. Pairing stops at the shorter slice.
#[inline(never)]
fn scalar_atan2_sum(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .copied()
        .zip(b.iter().copied())
        .map(|(y, x)| f32::atan2(y, x))
        .sum()
}

/// Sums the floating-point remainder `a[i] % b[i]` over the element pairs of
/// `a` and `b`. Pairing stops at the shorter slice.
#[inline(never)]
fn scalar_fmod_sum(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b)
        .map(|(dividend, divisor)| *dividend % *divisor)
        .sum::<f32>()
}

/// Sums `f32::hypot` over the element pairs of `a` and `b`. Pairing stops at
/// the shorter slice.
#[inline(never)]
fn scalar_hypot_sum(a: &[f32], b: &[f32]) -> f32 {
    a.iter()
        .zip(b)
        .map(|(lhs, rhs)| f32::hypot(*lhs, *rhs))
        .sum()
}

/// Sums the base-10 logarithm of every `f64` in `input`, using the standard
/// library's `f64::log10` (no simdeez code involved).
#[inline(never)]
fn scalar_log10_sum_f64(input: &[f64]) -> f64 {
    input.iter().map(|value| value.log10()).sum::<f64>()
}

/// Sums `f64::atan2` over the element pairs of `a` and `b`.
///
/// Elements of `a` are the receivers (the y argument of `atan2`) and elements
/// of `b` the x argument. Pairing stops at the shorter slice.
#[inline(never)]
fn scalar_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    a.iter()
        .copied()
        .zip(b.iter().copied())
        .map(|(y, x)| f64::atan2(y, x))
        .sum()
}

/// Sums the floating-point remainder `a[i] % b[i]` over the element pairs of
/// `a` and `b`. Pairing stops at the shorter slice.
#[inline(never)]
fn scalar_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    a.iter()
        .zip(b)
        .map(|(dividend, divisor)| *dividend % *divisor)
        .sum::<f64>()
}

/// Sums `f64::hypot` over the element pairs of `a` and `b`. Pairing stops at
/// the shorter slice.
#[inline(never)]
fn scalar_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    a.iter()
        .zip(b)
        .map(|(lhs, rhs)| f64::hypot(*lhs, *rhs))
        .sum()
}

// simdeez `log10_u35` on `f32` input, driven by `shared::simdeez_unary_sum_impl`.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_log10_sum(input: &[f32]) -> f32 {
        shared::simdeez_unary_sum_impl::<S>(
            input,
            |lanes| lanes.log10_u35(),
        )
    }
);

// simdeez `atan2_u35` on paired `f32` inputs, driven by `shared::simdeez_binary_sum_impl`.
// Values from `a` are the receivers of `atan2_u35`, values from `b` its argument.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_atan2_sum(a: &[f32], b: &[f32]) -> f32 {
        shared::simdeez_binary_sum_impl::<S>(
            a,
            b,
            |y, x| y.atan2_u35(x),
        )
    }
);

// simdeez `hypot_u35` on paired `f32` inputs, driven by `shared::simdeez_binary_sum_impl`.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_hypot_sum(a: &[f32], b: &[f32]) -> f32 {
        shared::simdeez_binary_sum_impl::<S>(
            a,
            b,
            |lhs, rhs| lhs.hypot_u35(rhs),
        )
    }
);

// simdeez `fmod` on paired `f32` inputs, driven by `shared::simdeez_binary_sum_impl`.
// Values from `a` are the receivers of `fmod`, values from `b` its argument.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_fmod_sum(a: &[f32], b: &[f32]) -> f32 {
        shared::simdeez_binary_sum_impl::<S>(
            a,
            b,
            |dividend, divisor| dividend.fmod(divisor),
        )
    }
);

// simdeez `log10_u35` on `f64` input, driven by `shared::simdeez_unary_sum_impl_f64`.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_log10_sum_f64(input: &[f64]) -> f64 {
        shared::simdeez_unary_sum_impl_f64::<S>(
            input,
            |lanes| lanes.log10_u35(),
        )
    }
);

// simdeez `atan2_u35` on paired `f64` inputs, driven by `shared::simdeez_binary_sum_impl_f64`.
// Values from `a` are the receivers of `atan2_u35`, values from `b` its argument.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 {
        shared::simdeez_binary_sum_impl_f64::<S>(
            a,
            b,
            |y, x| y.atan2_u35(x),
        )
    }
);

// simdeez `hypot_u35` on paired `f64` inputs, driven by `shared::simdeez_binary_sum_impl_f64`.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 {
        shared::simdeez_binary_sum_impl_f64::<S>(
            a,
            b,
            |lhs, rhs| lhs.hypot_u35(rhs),
        )
    }
);

// simdeez `fmod` on paired `f64` inputs, driven by `shared::simdeez_binary_sum_impl_f64`.
// Values from `a` are the receivers of `fmod`, values from `b` its argument.
// `S` is not declared in this file; presumably `simd_unsafe_generate_all!` binds it and
// also emits the `_sse2`/`_sse41`/`_avx2`/`_avx512` variants that `register` refers to
// (confirm against the macro documentation).
simd_unsafe_generate_all!(
    fn simdeez_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 {
        shared::simdeez_binary_sum_impl_f64::<S>(
            a,
            b,
            |dividend, divisor| dividend.fmod(divisor),
        )
    }
);

/// Applies simdeez `log10_u35` to `input` with the vector type pinned to the
/// `Scalar` backend's `Vf32`, delegating to `shared::force_scalar_sum`.
#[inline(never)]
fn forced_scalar_log10_sum(input: &[f32]) -> f32 {
    type ScalarF32 = <Scalar as Simd>::Vf32;

    shared::force_scalar_sum(input, |lane: ScalarF32| lane.log10_u35())
}

/// Applies simdeez `atan2_u35` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf32`, delegating to
/// `shared::force_scalar_binary_sum`.
///
/// Values from `a` are the receivers of `atan2_u35`, values from `b` its argument.
#[inline(never)]
fn forced_scalar_atan2_sum(a: &[f32], b: &[f32]) -> f32 {
    type ScalarF32 = <Scalar as Simd>::Vf32;

    shared::force_scalar_binary_sum(a, b, |y: ScalarF32, x| y.atan2_u35(x))
}

/// Applies simdeez `hypot_u35` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf32`, delegating to
/// `shared::force_scalar_binary_sum`.
#[inline(never)]
fn forced_scalar_hypot_sum(a: &[f32], b: &[f32]) -> f32 {
    type ScalarF32 = <Scalar as Simd>::Vf32;

    shared::force_scalar_binary_sum(a, b, |lhs: ScalarF32, rhs| lhs.hypot_u35(rhs))
}

/// Applies simdeez `fmod` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf32`, delegating to
/// `shared::force_scalar_binary_sum`.
///
/// Values from `a` are the receivers of `fmod`, values from `b` its argument.
#[inline(never)]
fn forced_scalar_fmod_sum(a: &[f32], b: &[f32]) -> f32 {
    type ScalarF32 = <Scalar as Simd>::Vf32;

    shared::force_scalar_binary_sum(a, b, |dividend: ScalarF32, divisor| dividend.fmod(divisor))
}

/// Applies simdeez `log10_u35` to `input` with the vector type pinned to the
/// `Scalar` backend's `Vf64`, delegating to `shared::force_scalar_sum_f64`.
#[inline(never)]
fn forced_scalar_log10_sum_f64(input: &[f64]) -> f64 {
    type ScalarF64 = <Scalar as Simd>::Vf64;

    shared::force_scalar_sum_f64(input, |lane: ScalarF64| lane.log10_u35())
}

/// Applies simdeez `atan2_u35` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf64`, delegating to
/// `shared::force_scalar_binary_sum_f64`.
///
/// Values from `a` are the receivers of `atan2_u35`, values from `b` its argument.
#[inline(never)]
fn forced_scalar_atan2_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    type ScalarF64 = <Scalar as Simd>::Vf64;

    shared::force_scalar_binary_sum_f64(a, b, |y: ScalarF64, x| y.atan2_u35(x))
}

/// Applies simdeez `hypot_u35` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf64`, delegating to
/// `shared::force_scalar_binary_sum_f64`.
#[inline(never)]
fn forced_scalar_hypot_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    type ScalarF64 = <Scalar as Simd>::Vf64;

    shared::force_scalar_binary_sum_f64(a, b, |lhs: ScalarF64, rhs| lhs.hypot_u35(rhs))
}

/// Applies simdeez `fmod` to the pairs of `a` and `b` with the vector type
/// pinned to the `Scalar` backend's `Vf64`, delegating to
/// `shared::force_scalar_binary_sum_f64`.
///
/// Values from `a` are the receivers of `fmod`, values from `b` its argument.
#[inline(never)]
fn forced_scalar_fmod_sum_f64(a: &[f64], b: &[f64]) -> f64 {
    type ScalarF64 = <Scalar as Simd>::Vf64;

    shared::force_scalar_binary_sum_f64(a, b, |dividend: ScalarF64, divisor| {
        dividend.fmod(divisor)
    })
}

pub fn register(c: &mut Criterion) {
    let log10_inputs = shared::make_positive_inputs(INPUT_LEN, 0xDEADB004, 1.0e-20, 1.0e20);
    let (atan2_y, atan2_x) =
        shared::make_binary_inputs(INPUT_LEN, 0xDEADB005, -100.0..100.0, -100.0..100.0);
    let (hypot_x, hypot_y) =
        shared::make_binary_inputs(INPUT_LEN, 0xDEADB006, -1.0e20..1.0e20, -1.0e20..1.0e20);
    let (fmod_x, mut fmod_y) =
        shared::make_binary_inputs(INPUT_LEN, 0xDEADB007, -1000.0..1000.0, -1000.0..1000.0);
    for y in &mut fmod_y {
        if *y == 0.0 {
            *y = 1.0;
        }
    }
    let log10_inputs_f64 =
        shared::make_positive_inputs_f64(INPUT_LEN, 0xDEADB104, 1.0e-200, 1.0e200);
    let (atan2_y_f64, atan2_x_f64) =
        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB105, -100.0..100.0, -100.0..100.0);
    let (hypot_x_f64, hypot_y_f64) =
        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB106, -1.0e200..1.0e200, -1.0e200..1.0e200);
    let (fmod_x_f64, mut fmod_y_f64) =
        shared::make_binary_inputs_f64(INPUT_LEN, 0xDEADB107, -1000.0..1000.0, -1000.0..1000.0);
    for y in &mut fmod_y_f64 {
        if *y == 0.0 {
            *y = 1.0;
        }
    }

    shared::bench_unary_variants(
        c,
        "simd_math_baseline/f32/log10_u35",
        &log10_inputs,
        BenchTargets {
            scalar_native: scalar_log10_sum,
            simdeez_runtime: simdeez_log10_sum,
            simdeez_scalar: forced_scalar_log10_sum,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_log10_sum_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_log10_sum_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_log10_sum_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_log10_sum_avx512,
        },
    );

    shared::bench_binary_variants(
        c,
        "simd_math_baseline/f32/atan2_u35",
        &atan2_y,
        &atan2_x,
        BinaryBenchTargets {
            scalar_native: scalar_atan2_sum,
            simdeez_runtime: simdeez_atan2_sum,
            simdeez_scalar: forced_scalar_atan2_sum,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_atan2_sum_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_atan2_sum_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_atan2_sum_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_atan2_sum_avx512,
        },
    );
    shared::bench_binary_variants(
        c,
        "simd_math_baseline/f32/hypot_u35",
        &hypot_x,
        &hypot_y,
        BinaryBenchTargets {
            scalar_native: scalar_hypot_sum,
            simdeez_runtime: simdeez_hypot_sum,
            simdeez_scalar: forced_scalar_hypot_sum,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_hypot_sum_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_hypot_sum_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_hypot_sum_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_hypot_sum_avx512,
        },
    );
    shared::bench_binary_variants(
        c,
        "simd_math_baseline/f32/fmod",
        &fmod_x,
        &fmod_y,
        BinaryBenchTargets {
            scalar_native: scalar_fmod_sum,
            simdeez_runtime: simdeez_fmod_sum,
            simdeez_scalar: forced_scalar_fmod_sum,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_fmod_sum_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_fmod_sum_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_fmod_sum_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_fmod_sum_avx512,
        },
    );

    shared::bench_unary_variants_f64(
        c,
        "simd_math_baseline/f64/log10_u35",
        &log10_inputs_f64,
        BenchTargetsF64 {
            scalar_native: scalar_log10_sum_f64,
            simdeez_runtime: simdeez_log10_sum_f64,
            simdeez_scalar: forced_scalar_log10_sum_f64,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_log10_sum_f64_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_log10_sum_f64_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_log10_sum_f64_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_log10_sum_f64_avx512,
        },
    );
    shared::bench_binary_variants_f64(
        c,
        "simd_math_baseline/f64/atan2_u35",
        &atan2_y_f64,
        &atan2_x_f64,
        BinaryBenchTargetsF64 {
            scalar_native: scalar_atan2_sum_f64,
            simdeez_runtime: simdeez_atan2_sum_f64,
            simdeez_scalar: forced_scalar_atan2_sum_f64,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_atan2_sum_f64_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_atan2_sum_f64_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_atan2_sum_f64_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_atan2_sum_f64_avx512,
        },
    );
    shared::bench_binary_variants_f64(
        c,
        "simd_math_baseline/f64/hypot_u35",
        &hypot_x_f64,
        &hypot_y_f64,
        BinaryBenchTargetsF64 {
            scalar_native: scalar_hypot_sum_f64,
            simdeez_runtime: simdeez_hypot_sum_f64,
            simdeez_scalar: forced_scalar_hypot_sum_f64,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_hypot_sum_f64_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_hypot_sum_f64_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_hypot_sum_f64_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_hypot_sum_f64_avx512,
        },
    );
    shared::bench_binary_variants_f64(
        c,
        "simd_math_baseline/f64/fmod",
        &fmod_x_f64,
        &fmod_y_f64,
        BinaryBenchTargetsF64 {
            scalar_native: scalar_fmod_sum_f64,
            simdeez_runtime: simdeez_fmod_sum_f64,
            simdeez_scalar: forced_scalar_fmod_sum_f64,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse2: simdeez_fmod_sum_f64_sse2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_sse41: simdeez_fmod_sum_f64_sse41,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx2: simdeez_fmod_sum_f64_avx2,
            #[cfg(any(target_arch = "x86_64", target_arch = "x86"))]
            simdeez_avx512: simdeez_fmod_sum_f64_avx512,
        },
    );
}