pub fn gelu(output: &mut [f32], input: &[f32])
GELU activation (tanh approximation): gelu(x) = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))