// rullama 0.1.0 - Docs.rs

// GeGLU split: y[i] = gelu(gate[i]) * up[i], elementwise.
//
// Mirrors `geglu_split` in `src/reference/ops.rs` exactly: erf-based GELU using
// Abramowitz & Stegun 7.1.26, so f32 outputs are bit-identical to the CPU oracle
// modulo accumulation-order differences (none here — pure elementwise).

// Uniform parameter block for this kernel.
//
// `n` is the element count: `main` returns early for any invocation whose
// index is >= `n`. The three `_pad*` fields are not read anywhere in the code
// shown here; they bring the struct to four u32 fields (16 bytes) — presumably
// to match the host-side struct / uniform-buffer sizing (confirm against the
// code that fills this buffer).
struct Params {
    n: u32,
    _pad0: u32,
    _pad1: u32,
    _pad2: u32,
}

// Bind group 0 layout:
//   binding 0 — `params`: uniform `Params` block (supplies the element count `n`)
//   binding 1 — `gate`:   read-only f32 input; each element is passed through GELU
//   binding 2 — `up`:     read-only f32 input; multiplied elementwise with gelu(gate)
//   binding 3 — `y`:      read_write f32 buffer; `main` stores the result here
@group(0) @binding(0) var<uniform>             params: Params;
@group(0) @binding(1) var<storage, read>       gate: array<f32>;
@group(0) @binding(2) var<storage, read>       up:   array<f32>;
@group(0) @binding(3) var<storage, read_write> y:    array<f32>;

// sqrt(1/2) = 1/sqrt(2), rounded to f32; `gelu_exact` scales its argument by
// this before calling `erf_approx`.
const SQRT_HALF: f32 = 0.70710677;
// Coefficients a1..a5 and p of the Abramowitz & Stegun 7.1.26 rational
// approximation to erf, consumed by `erf_approx`:
//   erf(x) ~= 1 - (a1*t + a2*t^2 + a3*t^3 + a4*t^4 + a5*t^5) * exp(-x^2),
//   t = 1 / (1 + p*x),  x >= 0.
// Do not round or re-derive these: the file header states the shader must
// match the CPU reference implementation.
const A1: f32 =  0.254829592;
const A2: f32 = -0.284496736;
const A3: f32 =  1.421413741;
const A4: f32 = -1.453152027;
const A5: f32 =  1.061405429;
const P:  f32 =  0.3275911;

// Approximates erf(xx) with the Abramowitz & Stegun 7.1.26 formula.
//
// The formula is defined for non-negative arguments, so it is evaluated on the
// magnitude of `xx` and the sign is re-applied at the end (erf is odd).
// The polynomial is evaluated in Horner form, highest coefficient first; the
// order of floating-point operations is kept fixed on purpose.
fn erf_approx(xx: f32) -> f32 {
    let is_negative = xx < 0.0;
    let magnitude: f32 = select(xx, -xx, is_negative);
    let sign_factor: f32 = select(1.0, -1.0, is_negative);

    let t = 1.0 / (1.0 + P * magnitude);

    // Horner steps: ((((A5*t + A4)*t + A3)*t + A2)*t + A1)
    var poly: f32 = A5 * t + A4;
    poly = poly * t + A3;
    poly = poly * t + A2;
    poly = poly * t + A1;

    let tail = poly * t * exp(-magnitude * magnitude);
    let positive_result = 1.0 - tail;
    return sign_factor * positive_result;
}

// erf-based GELU: gelu(g) = 0.5 * g * (1 + erf(g / sqrt(2))),
// with erf supplied by `erf_approx`.
fn gelu_exact(g: f32) -> f32 {
    let scaled = g * SQRT_HALF;
    let one_plus_erf = 1.0 + erf_approx(scaled);
    return 0.5 * g * one_plus_erf;
}

// Entry point: one invocation per element, 64 invocations per workgroup.
// Invocation `gid.x` writes y[gid.x] = gelu(gate[gid.x]) * up[gid.x];
// invocations at or beyond `params.n` do nothing.
@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x;
    if (idx < params.n) {
        let activated = gelu_exact(gate[idx]);
        y[idx] = activated * up[idx];
    }
}