rullama 0.2.0

Browser-resident Gemma 4 inference: pure Rust → WebAssembly + WebGPU. Loads Ollama's on-disk GGUF blobs and runs the forward pass on the local GPU via hand-written WGSL.
Documentation
// GLU split: y[t, d] = x[t, d] * sigmoid(x[t, d_inner + d]).
//
// `x` is [T, 2*D] flat (channel-LAST). The first D channels are the data half,
// the second D are the gate half. Used in Conformer LightConv after conv_pw1.
//
// Output `y` is [T, D] (half the channel count). Element-major. Caller dispatches
// `T * D / 64` workgroups.

struct Params {
    seq:    u32,
    inner:  u32,    // D — half of x's channel count
    _p0:    u32,
    _p1:    u32,
}

@group(0) @binding(0) var<uniform>             params: Params;
@group(0) @binding(1) var<storage, read>       x:      array<f32>;
@group(0) @binding(2) var<storage, read_write> y:      array<f32>;

@compute @workgroup_size(64)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let total: u32 = params.seq * params.inner;
    let idx: u32 = gid.x;
    if (idx >= total) { return; }
    let t: u32 = idx / params.inner;
    let d: u32 = idx - t * params.inner;
    let two_inner: u32 = params.inner * 2u;
    let data: f32 = x[t * two_inner + d];
    let g:    f32 = x[t * two_inner + params.inner + d];
    let sig:  f32 = 1.0 / (1.0 + exp(-g));
    y[idx] = data * sig;
}