// Element-wise GELU kernel over `f32` slices, declared through `ew_impl_wrap!`.
//
// The non-linear step is delegated to `arm64simd_tanh_f32_4n::run`, which is
// presumably an in-place tanh kernel (confirm against its definition). Under
// that assumption each element `x` becomes
//     0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
ew_impl_wrap!(
    f32,
    arm64simd_gelu_f32_4n,
    4,
    4,
    (),
    #[inline(never)]
    fn run(buf: &mut [f32], _: ()) {
        const BLOCK_LEN: usize = 256;
        const SQRT_2_OVER_PI: f32 = 0.7978845608028654;
        const CUBIC_COEF: f32 = 0.044715;

        // Stack buffer that keeps the original inputs of the current block,
        // since the block itself is overwritten before the final multiply.
        let mut saved_inputs = [0f32; BLOCK_LEN];

        // Walk the buffer in blocks of at most BLOCK_LEN elements; only the
        // last block can be shorter.
        for block in buf.chunks_mut(BLOCK_LEN) {
            let saved = &mut saved_inputs[..block.len()];
            saved.copy_from_slice(block);

            // Overwrite each element with the argument handed to the tanh kernel.
            for slot in block.iter_mut() {
                let x = *slot;
                *slot = SQRT_2_OVER_PI * (x + CUBIC_COEF * x * x * x);
            }

            super::arm64simd_tanh_f32_4n::run(block, ());

            // Combine the kernel output with the saved input.
            for (slot, &x) in block.iter_mut().zip(saved.iter()) {
                *slot = 0.5 * x * (1.0 + *slot);
            }
        }
    }
);
// Test module for the `arm64simd_gelu_f32_4n` kernel; compiled only for test builds.
#[cfg(test)]
pub mod test_arm64simd_gelu_f32_4n {
// Pulls the parent module's items (including the kernel under test) into scope.
use super::*;
// Instantiates the tests generated by `gelu_frame_tests!` for the f32 kernel.
// NOTE(review): the macro is defined elsewhere; the meaning of the leading
// `true` argument is not visible here — confirm against the macro definition.
gelu_frame_tests!(true, f32, arm64simd_gelu_f32_4n);
}