// any-gpu 0.1.0 - Docs.rs

// Unlicense — cochranblock.org
// Contributors: GotEmCoach, KOVA, Claude Opus 4.6
//
// Element-wise ops: add, mul, sub, scale, relu, sigmoid, swish, tanh,
// plus backward (gradient) kernels for relu, sigmoid, swish, and tanh.

use crate::device::{GpuBuffer, GpuDevice};
use anyhow::{ensure, Result};

/// WGSL compute kernel for element-wise addition: `out[i] = a[i] + b[i]`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices, followed
/// by three padding words), 1 and 2 = read-only inputs `a` and `b`, 3 = read-write
/// `out`. Workgroup size is 256; invocations whose index is `>= n` return without
/// writing. The `gid.y * 65535u * 256u` term extends the linear index across the Y
/// dispatch dimension — presumably paired with a dispatch helper that caps X at
/// 65535 workgroups (TODO confirm against the dispatch code).
const SHADER_ADD: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read> b: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = a[idx] + b[idx];
}
";

/// WGSL compute kernel for element-wise subtraction: `out[i] = a[i] - b[i]`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 and 2 =
/// read-only inputs `a` and `b`, 3 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_SUB: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read> b: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = a[idx] - b[idx];
}
";

/// WGSL compute kernel for element-wise multiplication: `out[i] = a[i] * b[i]`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 and 2 =
/// read-only inputs `a` and `b`, 3 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_MUL: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read> b: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = a[idx] * b[idx];
}
";

/// WGSL compute kernel for ReLU: `out[i] = max(a[i], 0.0)`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only input `a`, 2 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_RELU: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = max(a[idx], 0.0);
}
";

/// WGSL compute kernel for the logistic sigmoid: `out[i] = 1 / (1 + exp(-a[i]))`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only input `a`, 2 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_SIGMOID: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = 1.0 / (1.0 + exp(-a[idx]));
}
";

/// WGSL compute kernel for swish: `out[i] = x / (1 + exp(-x))` with `x = a[i]`,
/// i.e. `x * sigmoid(x)`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only input `a`, 2 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_SWISH: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    let x = a[idx];
    out[idx] = x / (1.0 + exp(-x));
}
";

/// WGSL compute kernel for hyperbolic tangent: `out[i] = tanh(a[i])`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only input `a`, 2 = read-write `out`. Workgroup size is 256 and
/// invocations with index `>= n` exit early. The index folds in `gid.y` with a
/// stride of `65535 * 256` — presumably matching a dispatch that caps X at 65535
/// workgroups (TODO confirm against the dispatch code).
const SHADER_TANH: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = tanh(a[idx]);
}
";

/// Host-side uniform block for the scale kernel: a `u32` count, an `f32` factor,
/// and two padding words, 16 bytes in total. Field order and sizes match the WGSL
/// `Params { n, scale, _p0, _p1 }` declared in `SHADER_SCALE`.
///
/// `#[repr(C)]` fixes the field order; the `bytemuck` derives mark the struct as
/// plain data (presumably so the dispatch code can view it as bytes — confirm in
/// `dispatch_shader`).
#[repr(C)]
#[derive(Copy, Clone, bytemuck::Pod, bytemuck::Zeroable)]
struct ScaleParams {
    /// Element count; the shader skips indices `>= n`.
    n: u32,
    /// Multiplier applied to every element.
    scale: f32,
    /// Padding words mirroring `_p0` / `_p1` in the WGSL struct.
    _pad: [u32; 2],
}

/// WGSL compute kernel for scalar scaling: `out[i] = a[i] * params.scale`.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices, `scale`
/// is the multiplier, then two padding words), 1 = read-only input `a`, 2 =
/// read-write `out`. Workgroup size is 256 and invocations with index `>= n` exit
/// early. The index folds in `gid.y` with a stride of `65535 * 256` — presumably
/// matching a dispatch that caps X at 65535 workgroups (TODO confirm against the
/// dispatch code).
const SHADER_SCALE: &str = "
struct Params { n: u32, scale: f32, _p0: u32, _p1: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> a: array<f32>;
@group(0) @binding(2) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = a[idx] * params.scale;
}
";

// --- Backward shaders ---

/// WGSL compute kernel for the ReLU gradient: `out[i]` is `grad_out[i]` where
/// `input[i] > 0.0`, and `0.0` otherwise (including at exactly zero).
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only `grad_out`, 2 = read-only `input`, 3 = read-write `out`. Workgroup
/// size is 256 and invocations with index `>= n` exit early. The index folds in
/// `gid.y` with a stride of `65535 * 256` — presumably matching a dispatch that
/// caps X at 65535 workgroups (TODO confirm against the dispatch code).
const SHADER_RELU_BACKWARD: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> grad_out: array<f32>;
@group(0) @binding(2) var<storage, read> input: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    out[idx] = select(0.0, grad_out[idx], input[idx] > 0.0);
}
";

/// WGSL compute kernel for the sigmoid gradient:
/// `out[i] = grad_out[i] * s * (1 - s)` with `s = sig_out[i]`.
///
/// The second input is read as-is and is named `sig_out`, i.e. it is expected to
/// hold forward sigmoid outputs rather than the pre-activation values.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only `grad_out`, 2 = read-only `sig_out`, 3 = read-write `out`. Workgroup
/// size is 256 and invocations with index `>= n` exit early. The index folds in
/// `gid.y` with a stride of `65535 * 256` — presumably matching a dispatch that
/// caps X at 65535 workgroups (TODO confirm against the dispatch code).
const SHADER_SIGMOID_BACKWARD: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> grad_out: array<f32>;
@group(0) @binding(2) var<storage, read> sig_out: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    let s = sig_out[idx];
    out[idx] = grad_out[idx] * s * (1.0 - s);
}
";

/// WGSL compute kernel for the swish gradient:
/// `out[i] = grad_out[i] * (s + x * s * (1 - s))` with `x = input[i]` and
/// `s = 1 / (1 + exp(-x))` recomputed in the shader from the forward input.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only `grad_out`, 2 = read-only `input`, 3 = read-write `out`. Workgroup
/// size is 256 and invocations with index `>= n` exit early. The index folds in
/// `gid.y` with a stride of `65535 * 256` — presumably matching a dispatch that
/// caps X at 65535 workgroups (TODO confirm against the dispatch code).
const SHADER_SWISH_BACKWARD: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> grad_out: array<f32>;
@group(0) @binding(2) var<storage, read> input: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    let x = input[idx];
    let s = 1.0 / (1.0 + exp(-x));
    out[idx] = grad_out[idx] * (s + x * s * (1.0 - s));
}
";

/// WGSL compute kernel for the tanh gradient:
/// `out[i] = grad_out[i] * (1 - t * t)` with `t = tanh_out[i]`.
///
/// The second input is read as-is and is named `tanh_out`, i.e. it is expected to
/// hold forward tanh outputs rather than the pre-activation values.
///
/// Group 0 bindings: 0 = `Params` uniform (`n` bounds the valid indices), 1 =
/// read-only `grad_out`, 2 = read-only `tanh_out`, 3 = read-write `out`. Workgroup
/// size is 256 and invocations with index `>= n` exit early. The index folds in
/// `gid.y` with a stride of `65535 * 256` — presumably matching a dispatch that
/// caps X at 65535 workgroups (TODO confirm against the dispatch code).
const SHADER_TANH_BACKWARD: &str = "
struct Params { n: u32, _p0: u32, _p1: u32, _p2: u32, }
@group(0) @binding(0) var<uniform> params: Params;
@group(0) @binding(1) var<storage, read> grad_out: array<f32>;
@group(0) @binding(2) var<storage, read> tanh_out: array<f32>;
@group(0) @binding(3) var<storage, read_write> out: array<f32>;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
    let idx = gid.x + gid.y * 65535u * 256u;
    if idx >= params.n { return; }
    let t = tanh_out[idx];
    out[idx] = grad_out[idx] * (1.0 - t * t);
}
";

/// Element-wise forward and backward operations.
///
/// Every method returns a new buffer. Two-input methods validate that both
/// operands have the same length before handing off to `binary_op`.
impl GpuDevice {
    /// Element-wise sum: `out[i] = a[i] + b[i]`.
    ///
    /// # Errors
    /// Fails if `a` and `b` differ in length; also propagates any error from `binary_op`.
    pub fn add(&self, a: &GpuBuffer, b: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(a.len == b.len, "add: length mismatch ({} vs {})", a.len, b.len);
        self.binary_op(SHADER_ADD, a, b)
    }

    /// Element-wise difference: `out[i] = a[i] - b[i]`.
    ///
    /// # Errors
    /// Fails if `a` and `b` differ in length; also propagates any error from `binary_op`.
    pub fn sub(&self, a: &GpuBuffer, b: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(a.len == b.len, "sub: length mismatch ({} vs {})", a.len, b.len);
        self.binary_op(SHADER_SUB, a, b)
    }

    /// Element-wise product: `out[i] = a[i] * b[i]`.
    ///
    /// # Errors
    /// Fails if `a` and `b` differ in length; also propagates any error from `binary_op`.
    pub fn mul(&self, a: &GpuBuffer, b: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(a.len == b.len, "mul: length mismatch ({} vs {})", a.len, b.len);
        self.binary_op(SHADER_MUL, a, b)
    }

    /// ReLU: `out[i] = max(a[i], 0)`. Propagates any error from `unary_op`.
    pub fn relu(&self, a: &GpuBuffer) -> Result<GpuBuffer> {
        self.unary_op(SHADER_RELU, a)
    }

    /// Logistic sigmoid: `out[i] = 1 / (1 + exp(-a[i]))`. Propagates any error from `unary_op`.
    pub fn sigmoid(&self, a: &GpuBuffer) -> Result<GpuBuffer> {
        self.unary_op(SHADER_SIGMOID, a)
    }

    /// Swish: `out[i] = a[i] * sigmoid(a[i])`. Propagates any error from `unary_op`.
    pub fn swish(&self, a: &GpuBuffer) -> Result<GpuBuffer> {
        self.unary_op(SHADER_SWISH, a)
    }

    /// Hyperbolic tangent: `out[i] = tanh(a[i])`. Propagates any error from `unary_op`.
    pub fn tanh_act(&self, a: &GpuBuffer) -> Result<GpuBuffer> {
        self.unary_op(SHADER_TANH, a)
    }

    /// Multiplies every element by the scalar `s`: `out[i] = a[i] * s`.
    ///
    /// # Errors
    /// Fails if the length of `a` cannot be represented as `u32`, the width of the
    /// element count passed to the shader.
    pub fn scale(&self, a: &GpuBuffer, s: f32) -> Result<GpuBuffer> {
        // The shader's element count is a u32; reject anything larger up front so
        // the narrowing cast below can never silently truncate.
        ensure!(u32::try_from(a.len).is_ok(), "scale: length {} does not fit in u32", a.len);
        let n = a.len as u32;
        let out = self.alloc(a.len);
        let params = ScaleParams { n, scale: s, _pad: [0; 2] };
        self.dispatch_shader(SHADER_SCALE, None, &params, &[a], &out, super::dispatch_1d(n));
        Ok(out)
    }

    // --- Backward shaders for autograd ---

    /// ReLU backward: grad_a = grad_out * (input > 0)
    ///
    /// `input` is the value that was fed to the forward ReLU.
    ///
    /// # Errors
    /// Fails if `grad_out` and `input` differ in length; also propagates any error
    /// from `binary_op`.
    pub fn relu_backward(&self, grad_out: &GpuBuffer, input: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(
            grad_out.len == input.len,
            "relu_backward: length mismatch ({} vs {})",
            grad_out.len,
            input.len
        );
        self.binary_op(SHADER_RELU_BACKWARD, grad_out, input)
    }

    /// Sigmoid backward: grad_a = grad_out * output * (1 - output)
    ///
    /// `output` is the result of the forward sigmoid, not its input.
    ///
    /// # Errors
    /// Fails if `grad_out` and `output` differ in length; also propagates any error
    /// from `binary_op`.
    pub fn sigmoid_backward(&self, grad_out: &GpuBuffer, output: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(
            grad_out.len == output.len,
            "sigmoid_backward: length mismatch ({} vs {})",
            grad_out.len,
            output.len
        );
        self.binary_op(SHADER_SIGMOID_BACKWARD, grad_out, output)
    }

    /// Swish backward: grad_a = grad_out * (sig(x) + x * sig(x) * (1 - sig(x)))
    ///
    /// `input` is the value that was fed to the forward swish; the shader
    /// recomputes the sigmoid from it.
    ///
    /// # Errors
    /// Fails if `grad_out` and `input` differ in length; also propagates any error
    /// from `binary_op`.
    pub fn swish_backward(&self, grad_out: &GpuBuffer, input: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(
            grad_out.len == input.len,
            "swish_backward: length mismatch ({} vs {})",
            grad_out.len,
            input.len
        );
        self.binary_op(SHADER_SWISH_BACKWARD, grad_out, input)
    }

    /// Tanh backward: grad_a = grad_out * (1 - output^2)
    ///
    /// `output` is the result of the forward tanh, not its input.
    ///
    /// # Errors
    /// Fails if `grad_out` and `output` differ in length; also propagates any error
    /// from `binary_op`.
    pub fn tanh_backward(&self, grad_out: &GpuBuffer, output: &GpuBuffer) -> Result<GpuBuffer> {
        ensure!(
            grad_out.len == output.len,
            "tanh_backward: length mismatch ({} vs {})",
            grad_out.len,
            output.len
        );
        self.binary_op(SHADER_TANH_BACKWARD, grad_out, output)
    }
}

/// GPU tests for the element-wise ops: fixed-value checks, CPU cross-validation,
/// length-mismatch error paths, and direct checks of the backward kernels.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ops::assert_approx;

    /// Shared device handle used by every test in this module.
    fn dev() -> &'static GpuDevice { &crate::ops::TEST_DEV }

    // CPU references for cross-validation
    fn cpu_sigmoid(x: f32) -> f32 { 1.0 / (1.0 + (-x).exp()) }
    fn cpu_swish(x: f32) -> f32 { x * cpu_sigmoid(x) }

    #[test]
    fn test_add() {
        let a = dev().upload(&[1.0, 2.0, 3.0, 4.0]);
        let b = dev().upload(&[10.0, 20.0, 30.0, 40.0]);
        let result = dev().read(&dev().add(&a, &b).unwrap()).unwrap();
        assert_eq!(result, vec![11.0, 22.0, 33.0, 44.0]);
    }

    #[test]
    fn test_add_odd_size() {
        // 13 elements — not aligned to workgroup size 256
        let a_data: Vec<f32> = (0..13).map(|i| i as f32).collect();
        let b_data: Vec<f32> = (0..13).map(|i| i as f32 * 10.0).collect();
        let expected: Vec<f32> = a_data.iter().zip(&b_data).map(|(a, b)| a + b).collect();
        let result = dev().read(&dev().add(&dev().upload(&a_data), &dev().upload(&b_data)).unwrap()).unwrap();
        assert_eq!(result, expected);
    }

    #[test]
    fn test_add_single_element() {
        let result = dev().read(&dev().add(&dev().upload(&[42.0]), &dev().upload(&[-42.0])).unwrap()).unwrap();
        assert_eq!(result, vec![0.0]);
    }

    #[test]
    fn test_sub() {
        let a = dev().upload(&[10.0, 20.0, 30.0]);
        let b = dev().upload(&[1.0, 2.0, 3.0]);
        let result = dev().read(&dev().sub(&a, &b).unwrap()).unwrap();
        assert_eq!(result, vec![9.0, 18.0, 27.0]);
    }

    #[test]
    fn test_mul() {
        let a = dev().upload(&[1.0, 2.0, 3.0, 4.0]);
        let b = dev().upload(&[10.0, 20.0, 30.0, 40.0]);
        let result = dev().read(&dev().mul(&a, &b).unwrap()).unwrap();
        assert_eq!(result, vec![10.0, 40.0, 90.0, 160.0]);
    }

    #[test]
    fn test_mul_zeros() {
        let a = dev().upload(&[1.0, 2.0, 3.0]);
        let b = dev().upload(&[0.0, 0.0, 0.0]);
        let result = dev().read(&dev().mul(&a, &b).unwrap()).unwrap();
        assert_eq!(result, vec![0.0, 0.0, 0.0]);
    }

    #[test]
    fn test_relu() {
        let a = dev().upload(&[-2.0, -1.0, 0.0, 1.0, 2.0]);
        let result = dev().read(&dev().relu(&a).unwrap()).unwrap();
        assert_eq!(result, vec![0.0, 0.0, 0.0, 1.0, 2.0]);
    }

    #[test]
    fn test_relu_all_negative() {
        let result = dev().read(&dev().relu(&dev().upload(&[-100.0, -0.001, -1e-10])).unwrap()).unwrap();
        assert_eq!(result, vec![0.0, 0.0, 0.0]);
    }

    #[test]
    fn test_sigmoid_vs_cpu() {
        let data: Vec<f32> = vec![-50.0, -10.0, -1.0, 0.0, 1.0, 10.0, 50.0];
        let expected: Vec<f32> = data.iter().map(|&x| cpu_sigmoid(x)).collect();
        let result = dev().read(&dev().sigmoid(&dev().upload(&data)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-4);
    }

    #[test]
    fn test_swish_vs_cpu() {
        let data: Vec<f32> = vec![-5.0, -2.0, -1.0, 0.0, 1.0, 2.0, 5.0];
        let expected: Vec<f32> = data.iter().map(|&x| cpu_swish(x)).collect();
        let result = dev().read(&dev().swish(&dev().upload(&data)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-4);
    }

    #[test]
    fn test_tanh_vs_cpu() {
        let data: Vec<f32> = vec![-10.0, -1.0, 0.0, 1.0, 10.0];
        let expected: Vec<f32> = data.iter().map(|&x| x.tanh()).collect();
        let result = dev().read(&dev().tanh_act(&dev().upload(&data)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-4);
    }

    #[test]
    fn test_scale() {
        let result = dev().read(&dev().scale(&dev().upload(&[1.0, 2.0, 3.0, 4.0]), 0.5).unwrap()).unwrap();
        assert_eq!(result, vec![0.5, 1.0, 1.5, 2.0]);
    }

    #[test]
    fn test_scale_zero() {
        let result = dev().read(&dev().scale(&dev().upload(&[99.0, -99.0]), 0.0).unwrap()).unwrap();
        assert_eq!(result, vec![0.0, 0.0]);
    }

    #[test]
    fn test_scale_negative() {
        let result = dev().read(&dev().scale(&dev().upload(&[1.0, -2.0, 3.0]), -2.0).unwrap()).unwrap();
        assert_eq!(result, vec![-2.0, 4.0, -6.0]);
    }

    // --- Error path tests ---

    #[test]
    fn test_add_length_mismatch() {
        let a = dev().upload(&[1.0, 2.0]);
        let b = dev().upload(&[1.0, 2.0, 3.0]);
        assert!(dev().add(&a, &b).is_err());
    }

    #[test]
    fn test_sub_length_mismatch() {
        let a = dev().upload(&[1.0]);
        let b = dev().upload(&[1.0, 2.0]);
        assert!(dev().sub(&a, &b).is_err());
    }

    #[test]
    fn test_mul_length_mismatch() {
        let a = dev().upload(&[1.0, 2.0, 3.0]);
        let b = dev().upload(&[1.0]);
        assert!(dev().mul(&a, &b).is_err());
    }

    #[test]
    fn test_backward_length_mismatch() {
        // Every backward op validates that both operands have the same length.
        let grad = dev().upload(&[1.0, 2.0]);
        let other = dev().upload(&[1.0, 2.0, 3.0]);
        assert!(dev().relu_backward(&grad, &other).is_err());
        assert!(dev().sigmoid_backward(&grad, &other).is_err());
        assert!(dev().swish_backward(&grad, &other).is_err());
        assert!(dev().tanh_backward(&grad, &other).is_err());
    }

    // --- CPU cross-validation for add/sub/mul ---

    #[test]
    fn test_add_vs_cpu() {
        let a: Vec<f32> = (0..100).map(|i| (i as f32) * 0.3 - 15.0).collect();
        let b: Vec<f32> = (0..100).map(|i| (i as f32) * -0.2 + 10.0).collect();
        let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x + y).collect();
        let result = dev().read(&dev().add(&dev().upload(&a), &dev().upload(&b)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-5);
    }

    #[test]
    fn test_sub_vs_cpu() {
        let a: Vec<f32> = (0..100).map(|i| (i as f32) * 0.7).collect();
        let b: Vec<f32> = (0..100).map(|i| (i as f32) * 0.3).collect();
        let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x - y).collect();
        let result = dev().read(&dev().sub(&dev().upload(&a), &dev().upload(&b)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-5);
    }

    #[test]
    fn test_mul_vs_cpu() {
        let a: Vec<f32> = (0..100).map(|i| (i as f32) * 0.1 - 5.0).collect();
        let b: Vec<f32> = (0..100).map(|i| (i as f32) * 0.05 + 0.5).collect();
        let expected: Vec<f32> = a.iter().zip(&b).map(|(x, y)| x * y).collect();
        let result = dev().read(&dev().mul(&dev().upload(&a), &dev().upload(&b)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-4);
    }

    // --- Backward shader direct tests ---

    #[test]
    fn test_relu_backward_vs_cpu() {
        let grad = dev().upload(&[1.0, 2.0, 3.0, 4.0, 5.0]);
        let input = dev().upload(&[-1.0, 0.5, 0.0, -0.1, 2.0]);
        let result = dev().read(&dev().relu_backward(&grad, &input).unwrap()).unwrap();
        // relu_backward: grad * (input > 0); an input of exactly 0 yields 0
        assert_approx(&result, &[0.0, 2.0, 0.0, 0.0, 5.0], 1e-5);
    }

    #[test]
    fn test_sigmoid_backward_vs_cpu() {
        let sig_out = vec![0.5, 0.7311, 0.2689]; // sigmoid outputs
        let grad = vec![1.0, 1.0, 1.0];
        let expected: Vec<f32> = sig_out.iter().zip(&grad).map(|(s, g)| g * s * (1.0 - s)).collect();
        let result = dev().read(&dev().sigmoid_backward(&dev().upload(&grad), &dev().upload(&sig_out)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-3);
    }

    #[test]
    fn test_swish_backward_vs_cpu() {
        let input: Vec<f32> = vec![0.0, 1.0, -1.0, 2.0];
        let grad: Vec<f32> = vec![1.0, 1.0, 1.0, 1.0];
        // With a unit upstream gradient the expected value is d/dx swish(x),
        // built on the same CPU sigmoid reference as the forward tests.
        let expected: Vec<f32> = input.iter().map(|&x| {
            let s = cpu_sigmoid(x);
            s + x * s * (1.0 - s)
        }).collect();
        let result = dev().read(&dev().swish_backward(&dev().upload(&grad), &dev().upload(&input)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-3);
    }

    #[test]
    fn test_tanh_backward_vs_cpu() {
        let tanh_out = vec![0.0, 0.7616, -0.7616, 0.9951]; // tanh outputs
        let grad = vec![1.0, 1.0, 1.0, 1.0];
        let expected: Vec<f32> = tanh_out.iter().map(|&t| 1.0 - t * t).collect();
        let result = dev().read(&dev().tanh_backward(&dev().upload(&grad), &dev().upload(&tanh_out)).unwrap()).unwrap();
        assert_approx(&result, &expected, 1e-3);
    }
}