#version 450
// Unary GELU (tanh approximation, matching hanzo-ml's CPU Gelu):
// 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))
// Workgroup shape: 64 invocations along x (y and z are left at their defaults).
layout(local_size_x = 64) in;
// Input elements, read-only; main() reads inp[i] for each in-range index i.
layout(set = 0, binding = 0) readonly buffer In { float inp[]; };
// Output elements, write-only; main() writes o[i] for each in-range index i.
layout(set = 0, binding = 1) writeonly buffer Out { float o[]; };
// Element count; main() skips any invocation whose global x index is >= n.
layout(push_constant) uniform Pc { uint n; };
// sqrt(2/pi), the scale factor inside the tanh of the GELU approximation.
const float SQRT_TWO_OVER_PI = 0.7978845608028654;
// Entry point: one invocation handles one element.
// Writes o[i] = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))) with
// x = inp[i], for every global x index i < n. Out-of-range invocations do
// nothing.
void main() {
    // Float32 tanh is already indistinguishable from +/-1 for |arg| > ~9.01,
    // so clamping the argument to +/-10 leaves in-range results unchanged.
    const float TANH_SATURATION = 10.0;

    uint i = gl_GlobalInvocationID.x;
    if (i >= n) {
        // Tail invocations of the last workgroup: nothing to compute.
        return;
    }

    float x = inp[i];
    float inner = SQRT_TWO_OVER_PI * (x + 0.044715 * x * x * x);

    // Implementations may evaluate tanh as sinh/cosh built on exp(); for
    // |inner| beyond roughly 88 (|x| >~ 13) exp() overflows and the quotient
    // becomes inf/inf = NaN. Clamping keeps the argument inside the range
    // where tanh is well-behaved. A NaN input still yields NaN through the
    // multiplication by x below.
    float t = tanh(clamp(inner, -TANH_SATURATION, TANH_SATURATION));

    o[i] = 0.5 * x * (1.0 + t);
}