#![cfg(feature = "cuda")]
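//! Conformance tests for the `ferrotorch_gpu` CUDA backend (C8.2).
//!
//! Each test loads a reference fixture from `tests/conformance/fixtures/gpu_kernels.json`,
//! uploads the inputs, launches the kernel under test, and compares the result against the
//! fixture's expected output with an op-appropriate tolerance. Tests named `*_reachable`
//! only verify that the kernel launches and returns a correctly shaped buffer.
//!
//! These tests need an NVIDIA GPU and only build with the `cuda` feature, e.g.
//! `cargo test --features cuda` (the exact invocation depends on the workspace layout).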
use std::path::PathBuf;
use std::sync::Once;
use serde_json::Value;
use ferrotorch_gpu::device::GpuDevice;
use ferrotorch_gpu::error::GpuResult;
use ferrotorch_gpu::transfer::{alloc_zeros_f32, cpu_to_gpu, gpu_to_cpu};
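// Initialise the CUDA backend exactly once per test binary; the harness runs
// tests in parallel, so a `Once` guard keeps the global init from racing.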
fn ensure_cuda() {
static INIT: Once = Once::new();
INIT.call_once(|| {
ferrotorch_gpu::init_cuda_backend().expect("CUDA backend init for C8.2");
});
}
fn device() -> GpuDevice {
GpuDevice::new(0).expect("GpuDevice::new(0)")
}
fn fixture_path() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("tests/conformance/fixtures/gpu_kernels.json")
}
fn load_fixtures() -> Value {
let path = fixture_path();
let raw =
std::fs::read_to_string(&path).unwrap_or_else(|e| panic!("failed to read {path:?}: {e}"));
serde_json::from_str(&raw).expect("gpu_kernels.json parse")
}
fn pick<'a>(all: &'a Value, module: &str, op: &str) -> &'a Value {
let fixtures = all["fixtures"].as_array().expect("fixtures array");
for f in fixtures {
if f["module"].as_str() == Some(module) && f["op"].as_str() == Some(op) {
return f;
}
}
panic!("fixture not found: module={module} op={op}");
}
#[allow(dead_code, reason = "future flash-attention fixture expansion")]
fn pick_all<'a>(all: &'a Value, module: &str, op_prefix: &str) -> Vec<&'a Value> {
let fixtures = all["fixtures"].as_array().expect("fixtures array");
fixtures
.iter()
.filter(|f| {
f["module"].as_str() == Some(module)
&& f["op"]
.as_str()
.map(|s| s.starts_with(op_prefix))
.unwrap_or(false)
})
.collect()
}
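// JSON has no literal for non-finite floats, so the fixtures encode them as the
// strings "NaN", "Infinity", and "-Infinity"; the converters below decode them.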
fn as_f32_vec(v: &Value) -> Vec<f32> {
v.as_array()
.expect("expected array")
.iter()
.map(|x| match x {
Value::String(s) if s == "NaN" => f32::NAN,
Value::String(s) if s == "Infinity" => f32::INFINITY,
Value::String(s) if s == "-Infinity" => f32::NEG_INFINITY,
_ => x.as_f64().expect("f64") as f32,
})
.collect()
}
fn as_f64_vec(v: &Value) -> Vec<f64> {
v.as_array()
.expect("expected array")
.iter()
.map(|x| match x {
Value::String(s) if s == "NaN" => f64::NAN,
Value::String(s) if s == "Infinity" => f64::INFINITY,
Value::String(s) if s == "-Infinity" => f64::NEG_INFINITY,
_ => x.as_f64().expect("f64"),
})
.collect()
}
fn as_u32_vec(v: &Value) -> Vec<u32> {
v.as_array()
.expect("expected array")
.iter()
.map(|x| x.as_u64().expect("u64") as u32)
.collect()
}
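// Tolerance tiers: exact element-wise ops get the tightest bound, transcendental
// and normalisation ops allow for intrinsic rounding differences, and matmul /
// attention / convolution allow for accumulation-order differences.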
const TOL_ELEMENTWISE: f32 = 1e-6;
const TOL_TRANSCENDENTAL: f32 = 1e-5;
const TOL_REDUCTION: f32 = 1e-5;
const TOL_NORMALISATION: f32 = 1e-5;
const TOL_MATMUL_F32: f32 = 1e-4;
const TOL_FLASH_F32: f32 = 1e-4;
const TOL_FLASH_F64: f64 = 1e-10;
const TOL_SPMM_F32: f32 = 1e-4;
const TOL_SPMM_F64: f64 = 1e-10;
const TOL_CONV_F32: f32 = 1e-4;
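// Mixed absolute/relative comparison: the bound is `tol * max(|expected|, 1)`,
// which acts as an absolute tolerance for |expected| <= 1 and a relative one
// above that. NaN matches NaN, and same-signed infinities match exactly.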
fn assert_close_f32(actual: &[f32], expected: &[f32], tol: f32, label: &str) {
assert_eq!(
actual.len(),
expected.len(),
"{label}: length mismatch (got={}, want={})",
actual.len(),
expected.len()
);
for (i, (&a, &e)) in actual.iter().zip(expected.iter()).enumerate() {
if a.is_nan() && e.is_nan() {
continue;
}
if a.is_infinite() && e.is_infinite() && a.signum() == e.signum() {
continue;
}
let diff = (a - e).abs();
let scale = e.abs().max(1.0);
assert!(
diff <= tol * scale,
"{label}[{i}]: diff={diff:.3e} exceeds tol={tol:.3e} (actual={a}, expected={e})"
);
}
}
fn assert_close_f64(actual: &[f64], expected: &[f64], tol: f64, label: &str) {
assert_eq!(
actual.len(),
expected.len(),
"{label}: length mismatch (got={}, want={})",
actual.len(),
expected.len()
);
for (i, (&a, &e)) in actual.iter().zip(expected.iter()).enumerate() {
if a.is_nan() && e.is_nan() {
continue;
}
if a.is_infinite() && e.is_infinite() && a.signum() == e.signum() {
continue;
}
let diff = (a - e).abs();
let scale = e.abs().max(1.0);
assert!(
diff <= tol * scale,
"{label}[{i}]: diff={diff:.3e} exceeds tol={tol:.3e} (actual={a}, expected={e})"
);
}
}
#[allow(dead_code)]
fn cascade_skip(test_name: &str, issue: &str) {
eprintln!(
"CONFORMANCE CASCADE-SKIP [{test_name}]: {issue} \
— filed as cascade bug, re-enabled when fixed."
);
}
#[test]
fn kernel_strided_copy_max_dims_value() {
assert_eq!(
ferrotorch_gpu::kernels::STRIDED_COPY_MAX_DIMS,
8,
"STRIDED_COPY_MAX_DIMS must equal 8"
);
}
#[test]
fn kernel_add_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "add");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_add(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_add f32");
}
#[test]
fn kernel_sub_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "sub");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_sub(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_sub f32");
}
#[test]
fn kernel_mul_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "mul");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_mul(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_mul f32");
}
#[test]
fn kernel_div_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "div");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_div(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_div f32");
}
#[test]
fn kernel_add_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "add_f64");
let a_h = as_f64_vec(&f["input_a"]);
let b_h = as_f64_vec(&f["input_b"]);
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_add_f64(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, 1e-12, "gpu_add_f64");
}
#[test]
fn kernel_sub_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "sub_f64");
let a_h = as_f64_vec(&f["input_a"]);
let b_h = as_f64_vec(&f["input_b"]);
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_sub_f64(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, 1e-12, "gpu_sub_f64");
}
#[test]
fn kernel_mul_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "mul_f64");
let a_h = as_f64_vec(&f["input_a"]);
let b_h = as_f64_vec(&f["input_b"]);
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_mul_f64(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, 1e-12, "gpu_mul_f64");
}
#[test]
fn kernel_div_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "div_f64");
let a_h = as_f64_vec(&f["input_a"]);
let b_h = as_f64_vec(&f["input_b"]);
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_div_f64(&a, &b, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, 1e-12, "gpu_div_f64");
}
#[test]
fn kernel_neg_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "neg");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_neg(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_neg f32");
}
#[test]
fn kernel_relu_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "relu");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_relu(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_relu f32");
}
#[test]
fn kernel_unary_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
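    // Table-driven cases: each entry pairs a fixture op name with the kernel
    // launcher (as a plain fn pointer) and the tolerance tier appropriate to it.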
struct Case {
op_name: &'static str,
launcher: fn(
&ferrotorch_gpu::buffer::CudaBuffer<f32>,
&GpuDevice,
) -> GpuResult<ferrotorch_gpu::buffer::CudaBuffer<f32>>,
tol: f32,
}
let cases: &[Case] = &[
Case {
op_name: "abs",
launcher: ferrotorch_gpu::kernels::gpu_abs,
tol: TOL_ELEMENTWISE,
},
Case {
op_name: "exp",
launcher: ferrotorch_gpu::kernels::gpu_exp,
tol: TOL_TRANSCENDENTAL,
},
Case {
op_name: "sigmoid",
launcher: ferrotorch_gpu::kernels::gpu_sigmoid,
tol: TOL_TRANSCENDENTAL,
},
Case {
op_name: "tanh",
launcher: ferrotorch_gpu::kernels::gpu_tanh,
tol: TOL_TRANSCENDENTAL,
},
Case {
op_name: "silu",
launcher: ferrotorch_gpu::kernels::gpu_silu,
tol: TOL_TRANSCENDENTAL,
},
Case {
op_name: "mish",
launcher: ferrotorch_gpu::kernels::gpu_mish,
tol: TOL_TRANSCENDENTAL,
},
];
for c in cases {
let f = pick(&all, "kernels", c.op_name);
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = (c.launcher)(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, c.tol, c.op_name);
}
}
#[test]
fn kernel_log_f32_positive_matches_reference() {
ensure_cuda();
let dev = device();
let a_h: Vec<f32> = (1..=32).map(|i| i as f32 * 0.1).collect();
let expected: Vec<f32> = a_h.iter().map(|&x| x.ln()).collect();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_log(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_log positive");
}
#[test]
fn kernel_sqrt_f32_positive_matches_reference() {
ensure_cuda();
let dev = device();
let a_h: Vec<f32> = (0..32).map(|i| i as f32 * 0.25).collect();
let expected: Vec<f32> = a_h.iter().map(|&x| x.sqrt()).collect();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_sqrt(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_sqrt positive");
}
#[test]
fn kernel_gelu_variants_f32() {
ensure_cuda();
let all = load_fixtures();
{
let f = pick(&all, "kernels", "gelu_tanh");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_gelu_tanh(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_gelu_tanh");
}
{
let f = pick(&all, "kernels", "gelu");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_gelu_erf(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_gelu_erf");
}
}
#[test]
fn kernel_elu_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "elu");
let a_h = as_f32_vec(&f["input_a"]);
let alpha = f["alpha"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_elu(&a, alpha, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_elu");
}
#[test]
fn kernel_clamp_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "clamp");
let a_h = as_f32_vec(&f["input_a"]);
let min = f["min"].as_f64().unwrap() as f32;
let max = f["max"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_clamp(&a, min, max, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_clamp");
}
#[test]
fn kernel_scale_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "scale");
let a_h = as_f32_vec(&f["input_a"]);
let scalar = f["scalar"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_scale(&a, scalar, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_scale");
}
#[test]
fn kernel_pow_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "pow");
let a_h = as_f32_vec(&f["input_a"]);
let exp = f["exponent"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_pow(&a, exp, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_pow");
}
#[test]
fn kernel_broadcast_add_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "broadcast_add");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let rows = f["shape_a"][0].as_u64().unwrap() as usize;
let cols = f["shape_a"][1].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let a_shape = vec![rows, cols];
let b_shape = vec![cols];
let out_shape = vec![rows, cols];
let out =
ferrotorch_gpu::kernels::gpu_broadcast_add(&a, &b, &a_shape, &b_shape, &out_shape, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_broadcast_add");
}
#[test]
fn kernel_broadcast_sub_reachable() {
    ensure_cuda();
    let all = load_fixtures();
    // No dedicated broadcast_sub fixture exists; reuse the broadcast_add inputs
    // and verify only that the kernel launches with the expected output shape.
    let f = pick(&all, "kernels", "broadcast_add");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let rows = f["shape_a"][0].as_u64().unwrap() as usize;
let cols = f["shape_a"][1].as_u64().unwrap() as usize;
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let a_shape = vec![rows, cols];
let b_shape = vec![cols];
let out_shape = vec![rows, cols];
let out =
ferrotorch_gpu::kernels::gpu_broadcast_sub(&a, &b, &a_shape, &b_shape, &out_shape, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), rows * cols, "broadcast_sub shape");
}
#[test]
fn kernel_broadcast_mul_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "broadcast_mul");
let a_h = as_f32_vec(&f["input_a"]);
let b_h = as_f32_vec(&f["input_b"]);
let rows = f["shape_a"][0].as_u64().unwrap() as usize;
let cols = f["shape_a"][1].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let a_shape = vec![rows, cols];
let b_shape = vec![cols];
let out_shape = vec![rows, cols];
let out =
ferrotorch_gpu::kernels::gpu_broadcast_mul(&a, &b, &a_shape, &b_shape, &out_shape, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_broadcast_mul");
}
#[test]
fn kernel_transpose_2d_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "transpose_2d");
let a_h = as_f32_vec(&f["input_a"]);
let rows = f["shape_in"][0].as_u64().unwrap() as usize;
let cols = f["shape_in"][1].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_transpose_2d(&a, rows, cols, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "gpu_transpose_2d");
}
#[test]
fn kernel_permute_0213_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "permute_0213");
let a_h = as_f32_vec(&f["input_a"]);
let b = f["shape_in"][0].as_u64().unwrap() as usize;
let h = f["shape_in"][1].as_u64().unwrap() as usize;
let s = f["shape_in"][2].as_u64().unwrap() as usize;
let d = f["shape_in"][3].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_permute_0213(&a, b, h, s, d, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "gpu_permute_0213");
}
#[test]
fn kernel_reduce_sum_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "reduce_sum");
let a_h = as_f32_vec(&f["input_a"]);
let expected = f["expected"][0].as_f64().unwrap() as f32;
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_reduce_sum(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), 1, "reduce_sum output len");
let diff = (actual[0] - expected).abs();
let scale = expected.abs().max(1.0);
assert!(
diff <= TOL_REDUCTION * scale,
"gpu_reduce_sum: {diff:.3e} > tol (actual={}, expected={expected})",
actual[0]
);
}
#[test]
fn kernel_reduce_prod_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "reduce_prod");
let a_h = as_f32_vec(&f["input_a"]);
let expected = f["expected"][0].as_f64().unwrap() as f32;
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_reduce_prod(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), 1, "reduce_prod output len");
let diff = (actual[0] - expected).abs();
let scale = expected.abs().max(1.0);
assert!(
diff <= TOL_REDUCTION * scale,
"gpu_reduce_prod: {diff:.3e} > tol (actual={}, expected={expected})",
actual[0]
);
}
#[test]
fn kernel_reduce_min_max_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "reduce_min");
let a_h = as_f32_vec(&f["input_a"]);
let expected_min = f["expected"][0].as_f64().unwrap() as f32;
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_reduce_min(&a, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), 1);
assert!(
(actual[0] - expected_min).abs() < 1e-6,
"gpu_reduce_min: {} != {expected_min}",
actual[0]
);
let f2 = pick(&all, "kernels", "reduce_max");
let a2_h = as_f32_vec(&f2["input_a"]);
let expected_max = f2["expected"][0].as_f64().unwrap() as f32;
let a2 = cpu_to_gpu(&a2_h, &dev).unwrap();
let out2 = ferrotorch_gpu::kernels::gpu_reduce_max(&a2, &dev).unwrap();
let actual2 = gpu_to_cpu(&out2, &dev).unwrap();
assert_eq!(actual2.len(), 1);
assert!(
(actual2[0] - expected_max).abs() < 1e-6,
"gpu_reduce_max: {} != {expected_max}",
actual2[0]
);
}
#[test]
fn kernel_sum_axis_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "sum_axis");
let a_h = as_f32_vec(&f["input_a"]);
let rows = f["shape_in"][0].as_u64().unwrap() as usize;
let cols = f["shape_in"][1].as_u64().unwrap() as usize;
let axis = f["axis"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
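    // gpu_sum_axis takes a row-major (outer, axis_size, inner) decomposition:
    // reducing axis 1 of [rows, cols] collapses cols within each row, while
    // reducing axis 0 collapses rows within each column.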
let (outer, axis_size, inner) = if axis == 1 {
(rows, cols, 1)
} else {
(1, rows, cols)
};
let out = ferrotorch_gpu::kernels::gpu_sum_axis(&a, outer, axis_size, inner, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_REDUCTION, "gpu_sum_axis");
}
#[test]
fn kernel_cumsum_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "cumsum");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_cumsum(&a, 1, a_h.len(), 1, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_REDUCTION, "gpu_cumsum");
}
#[test]
fn kernel_cumprod_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "cumprod");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_cumprod(&a, 1, a_h.len(), 1, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_REDUCTION, "gpu_cumprod");
}
#[test]
fn kernel_cummax_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "cummax");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let (out_vals, _out_idx) =
ferrotorch_gpu::kernels::gpu_cummax(&a, 1, a_h.len(), 1, &dev).unwrap();
let actual = gpu_to_cpu(&out_vals, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_REDUCTION, "gpu_cummax");
}
#[test]
fn kernel_cummin_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "cummin");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let (out_vals, _out_idx) =
ferrotorch_gpu::kernels::gpu_cummin(&a, 1, a_h.len(), 1, &dev).unwrap();
let actual = gpu_to_cpu(&out_vals, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_REDUCTION, "gpu_cummin");
}
#[test]
fn kernel_logcumsumexp_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "logcumsumexp");
let a_h = as_f32_vec(&f["input_a"]);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_logcumsumexp(&a, 1, a_h.len(), 1, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_TRANSCENDENTAL, "gpu_logcumsumexp");
}
#[test]
fn kernel_softmax_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "softmax");
let a_h = as_f32_vec(&f["input_a"]);
let rows = f["shape"][0].as_u64().unwrap() as usize;
let cols = f["shape"][1].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_softmax(&a, rows, cols, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_NORMALISATION, "gpu_softmax");
}
#[test]
fn kernel_log_softmax_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "log_softmax");
let a_h = as_f32_vec(&f["input_a"]);
let rows = f["shape"][0].as_u64().unwrap() as usize;
let cols = f["shape"][1].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
    // gpu_log_softmax takes only the row width; rows is implied by the buffer length.
    let _ = rows;
    let out = ferrotorch_gpu::kernels::gpu_log_softmax(&a, cols, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_NORMALISATION, "gpu_log_softmax");
}
#[test]
fn kernel_layernorm_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "layernorm");
let input_h = as_f32_vec(&f["input"]);
let weight_h = as_f32_vec(&f["weight"]);
let bias_h = as_f32_vec(&f["bias"]);
let rows = f["rows"].as_u64().unwrap() as usize;
let cols = f["cols"].as_u64().unwrap() as usize;
let eps = f["eps"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&input_h, &dev).unwrap();
let w = cpu_to_gpu(&weight_h, &dev).unwrap();
let b = cpu_to_gpu(&bias_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_layernorm(&inp, &w, &b, rows, cols, eps, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_NORMALISATION, "gpu_layernorm");
}
#[test]
fn kernel_rmsnorm_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "rmsnorm");
let input_h = as_f32_vec(&f["input"]);
let weight_h = as_f32_vec(&f["weight"]);
let rows = f["rows"].as_u64().unwrap() as usize;
let cols = f["cols"].as_u64().unwrap() as usize;
let eps = f["eps"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&input_h, &dev).unwrap();
let w = cpu_to_gpu(&weight_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_rmsnorm(&inp, &w, rows, cols, eps, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_NORMALISATION, "gpu_rmsnorm");
}
#[test]
fn kernel_batchnorm_forward_shape_and_stats() {
ensure_cuda();
let dev = device();
let n = 4usize;
let c = 2usize;
let h = 2usize;
let w = 2usize;
    let spatial = h * w;
    let total_per_ch = n * spatial;
let input_data: Vec<f32> = (0..n * c * h * w).map(|i| (i as f32 * 0.3) - 2.0).collect();
let weight = vec![1.0f32; c];
let bias = vec![0.0f32; c];
let running_mean = vec![0.0f32; c];
let running_var = vec![1.0f32; c];
let inp = cpu_to_gpu(&input_data, &dev).unwrap();
let wt = cpu_to_gpu(&weight, &dev).unwrap();
let bi = cpu_to_gpu(&bias, &dev).unwrap();
let mut rm = cpu_to_gpu(&running_mean, &dev).unwrap();
let mut rv = cpu_to_gpu(&running_var, &dev).unwrap();
let (out, _save_mean, _save_invstd) = ferrotorch_gpu::kernels::gpu_batchnorm_forward(
&inp, &wt, &bi, &mut rm, &mut rv, c, spatial, 1e-5, 0.1, true, &dev,
)
.expect("gpu_batchnorm_forward must succeed after fix #892");
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), n * c * h * w, "output length");
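    // With weight = 1 and bias = 0 in training mode, the normalised output should
    // have per-channel mean ~0 and (biased) std ~1 over the N*H*W batch; verify
    // this directly from the gathered channel values.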
for ch in 0..c {
let mut ch_vals: Vec<f32> = Vec::with_capacity(total_per_ch);
        for ni in 0..n {
            for hi in 0..h {
                for wi in 0..w {
                    let flat = ni * c * h * w + ch * h * w + hi * w + wi;
ch_vals.push(actual[flat]);
}
}
}
let mean: f32 = ch_vals.iter().sum::<f32>() / ch_vals.len() as f32;
let var: f32 =
ch_vals.iter().map(|x| (x - mean).powi(2)).sum::<f32>() / ch_vals.len() as f32;
let std = var.sqrt();
assert!(
mean.abs() < 1e-4,
"channel {ch} mean {mean} not near 0 after batch norm"
);
assert!(
(std - 1.0).abs() < 1e-3,
"channel {ch} std {std} not near 1 after batch norm"
);
}
}
#[test]
fn kernel_embed_lookup_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "embed_lookup");
let weight_h = as_f32_vec(&f["weight"]);
let token_ids: Vec<u32> = f["token_ids"]
.as_array()
.unwrap()
.iter()
.map(|x| x.as_u64().unwrap() as u32)
.collect();
let _vocab_size = f["vocab_size"].as_u64().unwrap() as usize;
let d = f["d"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let w = cpu_to_gpu(&weight_h, &dev).unwrap();
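    // The lookup kernel consumes indices from an f32 buffer, so the u32 token
    // ids are converted before upload.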
let ids_f32: Vec<f32> = token_ids.iter().map(|&x| x as f32).collect();
let ids = cpu_to_gpu(&ids_f32, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_embed_lookup_batch(&ids, &w, token_ids.len(), d, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(
&actual,
&expected,
TOL_ELEMENTWISE,
"gpu_embed_lookup_batch",
);
}
#[test]
fn kernel_index_select_1d_matches_reference() {
ensure_cuda();
let dev = device();
let n = 32usize;
let input_h: Vec<f32> = (0..n).map(|i| i as f32 * 1.5).collect();
let idx_f32: Vec<f32> = vec![0.0, 5.0, 10.0, 15.0, 20.0];
let expected: Vec<f32> = idx_f32.iter().map(|&i| input_h[i as usize]).collect();
let src = cpu_to_gpu(&input_h, &dev).unwrap();
let idx_buf = cpu_to_gpu(&idx_f32, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_index_select_1d(&src, &idx_buf, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_index_select_1d");
}
#[test]
fn kernel_scatter_add_1d_reachable() {
ensure_cuda();
let dev = device();
let n_out = 8usize;
let grad: Vec<f32> = vec![1.0, 2.0, 3.0, 4.0];
let ids_f32: Vec<f32> = vec![0.0, 2.0, 4.0, 6.0];
let g = cpu_to_gpu(&grad, &dev).unwrap();
let i = cpu_to_gpu(&ids_f32, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_scatter_add_1d(&g, &i, n_out, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), n_out, "scatter_add_1d output len");
}
#[test]
fn kernel_scatter_add_rows_reachable() {
ensure_cuda();
let dev = device();
let vocab_size = 16usize;
let d = 8usize;
let n_tokens = 4usize;
let grad: Vec<f32> = (0..n_tokens * d).map(|i| i as f32 * 0.01).collect();
let ids_f32: Vec<f32> = vec![0.0, 3.0, 7.0, 12.0];
let g = cpu_to_gpu(&grad, &dev).unwrap();
let i = cpu_to_gpu(&ids_f32, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_scatter_add_rows(&g, &i, vocab_size, d, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), vocab_size * d);
}
#[test]
fn kernel_slice_write_read_roundtrip() {
ensure_cuda();
let rows = 4usize;
let d = 8usize;
let max_len = 8usize;
let src: Vec<f32> = (0..rows * d).map(|i| i as f32).collect();
    // Write the rows at position 0 of the [rows, max_len, d] buffer, then read a
    // length-1 slice back per row.
let dev = device();
let src_buf = cpu_to_gpu(&src, &dev).unwrap();
let mut dst = alloc_zeros_f32(rows * max_len * d, &dev).unwrap();
ferrotorch_gpu::kernels::gpu_slice_write(&src_buf, &mut dst, rows, d, max_len, 0, &dev)
.unwrap();
let out = ferrotorch_gpu::kernels::gpu_slice_read(&dst, rows, d, 1, max_len, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &src, 0.0, "slice_write + slice_read roundtrip");
}
#[test]
fn kernel_strided_split_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "strided_split");
let inp_h = as_f32_vec(&f["input"]);
let rows = f["rows"].as_u64().unwrap() as usize;
let cols = f["cols"].as_u64().unwrap() as usize;
let expected_left = as_f32_vec(&f["expected_left"]);
let expected_right = as_f32_vec(&f["expected_right"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
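    // Treat the input as interleaved [rows, 2 * cols] and split out the left and
    // right column halves via two strided views.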
let left =
ferrotorch_gpu::kernels::gpu_strided_split(&inp, cols * 2, 0, cols, 1, rows * cols, &dev)
.unwrap();
let right = ferrotorch_gpu::kernels::gpu_strided_split(
&inp,
cols * 2,
cols,
cols,
1,
rows * cols,
&dev,
)
.unwrap();
let left_actual = gpu_to_cpu(&left, &dev).unwrap();
let right_actual = gpu_to_cpu(&right, &dev).unwrap();
assert_close_f32(&left_actual, &expected_left, 0.0, "strided_split left");
assert_close_f32(&right_actual, &expected_right, 0.0, "strided_split right");
}
#[test]
fn kernel_strided_cat_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "strided_cat");
let left_h = as_f32_vec(&f["input_left"]);
let right_h = as_f32_vec(&f["input_right"]);
let rows = f["rows"].as_u64().unwrap() as usize;
let cols = f["cols"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let left = cpu_to_gpu(&left_h, &dev).unwrap();
let right = cpu_to_gpu(&right_h, &dev).unwrap();
let total_cols = cols * 2;
let n_chunk = rows * cols;
let mut out = alloc_zeros_f32(rows * total_cols, &dev).unwrap();
ferrotorch_gpu::kernels::gpu_strided_cat(
&left, &mut out, total_cols, 0, cols, 1, n_chunk, &dev,
)
.unwrap();
ferrotorch_gpu::kernels::gpu_strided_cat(
&right, &mut out, total_cols, cols, cols, 1, n_chunk, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "strided_cat");
}
#[test]
fn kernel_strided_copy_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "strided_copy");
let inp_h = as_f32_vec(&f["input"]);
let out_shape: Vec<usize> = f["out_shape"]
.as_array()
.unwrap()
.iter()
.map(|x| x.as_u64().unwrap() as usize)
.collect();
let in_strides: Vec<isize> = f["in_strides"]
.as_array()
.unwrap()
.iter()
.map(|x| x.as_i64().unwrap() as isize)
.collect();
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
    let out =
        ferrotorch_gpu::kernels::gpu_strided_copy(&inp, &out_shape, &in_strides, 0, &dev)
            .unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "gpu_strided_copy");
}
#[test]
fn kernel_fill_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "fill_f32");
let n = f["n"].as_u64().unwrap() as usize;
let scalar = f["scalar"].as_f64().unwrap() as f32;
let dev = device();
let out = ferrotorch_gpu::kernels::gpu_fill_f32(n, scalar, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), n);
for &v in &actual {
assert!(
(v - scalar).abs() < 1e-7,
"fill_f32: expected {scalar} got {v}"
);
}
}
#[test]
fn kernel_dropout_shape_and_sparsity() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "dropout");
let inp_h = as_f32_vec(&f["input"]);
let p = f["p"].as_f64().unwrap() as f32;
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let n = inp_h.len();
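    // Inverted dropout: assuming the kernel drops elements whose per-element RNG
    // draw falls below `threshold`, a drop probability of p needs
    // threshold = p * u32::MAX, and survivors are rescaled by 1 / (1 - p).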
let threshold = (p as f64 * u32::MAX as f64) as u32;
let scale = 1.0f32 / (1.0 - p);
let out = ferrotorch_gpu::kernels::gpu_dropout(&inp, threshold, scale, 0, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), n, "dropout output len");
let n_zeros = actual.iter().filter(|&&x| x == 0.0).count();
let expected_zeros = (n as f32 * p) as usize;
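    // The zero count is Binomial(n, p) with std <= sqrt(n) / 2, so 3 * sqrt(n)
    // is a generous ~6-sigma margin; the +4 keeps the test stable at tiny n.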
let margin = ((n as f32).sqrt() * 3.0) as usize + 4;
assert!(
n_zeros.abs_diff(expected_zeros) <= margin,
"dropout zero count {n_zeros} not near expected {expected_zeros} (margin ±{margin})"
);
}
#[test]
fn kernel_has_inf_nan_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "has_inf_nan");
let clean_h = as_f32_vec(&f["input_clean"]);
let inf_h = as_f32_vec(&f["input_inf"]);
let nan_h = as_f32_vec(&f["input_nan"]);
let dev = device();
let clean = cpu_to_gpu(&clean_h, &dev).unwrap();
let inf_buf = cpu_to_gpu(&inf_h, &dev).unwrap();
let nan_buf = cpu_to_gpu(&nan_h, &dev).unwrap();
assert!(
!ferrotorch_gpu::kernels::gpu_has_inf_nan(&clean, &dev).unwrap(),
"clean input should not have inf/nan"
);
assert!(
ferrotorch_gpu::kernels::gpu_has_inf_nan(&inf_buf, &dev).unwrap(),
"inf input should detect inf"
);
assert!(
ferrotorch_gpu::kernels::gpu_has_inf_nan(&nan_buf, &dev).unwrap(),
"nan input should detect nan"
);
}
#[test]
fn kernel_inplace_ops_reachable() {
ensure_cuda();
let dev = device();
let a_h = vec![1.0f32, 2.0, 3.0, 4.0];
let b_h = vec![0.5f32, 1.5, 2.5, 3.5];
let scalar = 2.0f32;
{
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let mut out = alloc_zeros_f32(a_h.len(), &dev).unwrap();
ferrotorch_gpu::kernels::gpu_add_into(&a, &b, &mut out, &dev).unwrap();
let result = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(result.len(), 4);
for (i, (&r, (&av, &bv))) in result.iter().zip(a_h.iter().zip(b_h.iter())).enumerate() {
assert!(
(r - (av + bv)).abs() < 1e-6,
"add_into[{i}]: {r} != {}",
av + bv
);
}
}
{
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let mut out = alloc_zeros_f32(a_h.len(), &dev).unwrap();
ferrotorch_gpu::kernels::gpu_mul_into(&a, &b, &mut out, &dev).unwrap();
let result = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(result.len(), 4);
}
{
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let mut out = alloc_zeros_f32(a_h.len(), &dev).unwrap();
ferrotorch_gpu::kernels::gpu_scale_into(&a, scalar, &mut out, &dev).unwrap();
let result = gpu_to_cpu(&out, &dev).unwrap();
for (i, (&r, &inp)) in result.iter().zip(a_h.iter()).enumerate() {
assert!(
(r - inp * scalar).abs() < 1e-6,
"scale_into[{i}]: got {r}, expected {}",
inp * scalar
);
}
}
}
#[test]
fn kernel_fused_adam_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "fused_adam");
let param_h = as_f32_vec(&f["param"]);
let grad_h = as_f32_vec(&f["grad"]);
let ea_h = as_f32_vec(&f["exp_avg"]);
let eas_h = as_f32_vec(&f["exp_avg_sq"]);
let expected_param = as_f32_vec(&f["expected_param"]);
let expected_ea = as_f32_vec(&f["expected_exp_avg"]);
let expected_eas = as_f32_vec(&f["expected_exp_avg_sq"]);
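    // All hyperparameters come from the fixture; bc1/bc2 are presumably the
    // precomputed Adam bias-correction factors (1 - beta^t), supplied directly
    // rather than derived from a step count.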
let beta1 = f["beta1"].as_f64().unwrap() as f32;
let beta2 = f["beta2"].as_f64().unwrap() as f32;
let lr = f["lr"].as_f64().unwrap() as f32;
let eps = f["eps"].as_f64().unwrap() as f32;
let bc1 = f["bc1"].as_f64().unwrap() as f32;
let bc2 = f["bc2"].as_f64().unwrap() as f32;
let wd = f["weight_decay"].as_f64().unwrap() as f32;
let dev = device();
let mut param = cpu_to_gpu(¶m_h, &dev).unwrap();
let grad = cpu_to_gpu(&grad_h, &dev).unwrap();
let mut ea = cpu_to_gpu(&ea_h, &dev).unwrap();
let mut eas = cpu_to_gpu(&eas_h, &dev).unwrap();
ferrotorch_gpu::kernels::gpu_fused_adam(
&mut param, &grad, &mut ea, &mut eas, beta1, beta2, lr, eps, bc1, bc2, wd, &dev,
)
.unwrap();
let actual_param = gpu_to_cpu(¶m, &dev).unwrap();
let actual_ea = gpu_to_cpu(&ea, &dev).unwrap();
let actual_eas = gpu_to_cpu(&eas, &dev).unwrap();
assert_close_f32(&actual_param, &expected_param, 1e-5, "adam param");
assert_close_f32(&actual_ea, &expected_ea, TOL_ELEMENTWISE, "adam exp_avg");
assert_close_f32(
&actual_eas,
&expected_eas,
TOL_ELEMENTWISE,
"adam exp_avg_sq",
);
}
#[test]
fn kernel_fused_gru_forward_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "fused_gru_forward");
let ig_h = as_f32_vec(&f["input_gates"]);
let hg_h = as_f32_vec(&f["hidden_gates"]);
let bih_h = as_f32_vec(&f["bias_ih"]);
let bhh_h = as_f32_vec(&f["bias_hh"]);
let hx_h = as_f32_vec(&f["hx"]);
let expected_hy = as_f32_vec(&f["expected_hy"]);
let hsz = f["hsz"].as_u64().unwrap() as usize;
let dev = device();
let ig = cpu_to_gpu(&ig_h, &dev).unwrap();
let hg = cpu_to_gpu(&hg_h, &dev).unwrap();
let bih = cpu_to_gpu(&bih_h, &dev).unwrap();
let bhh = cpu_to_gpu(&bhh_h, &dev).unwrap();
let hx = cpu_to_gpu(&hx_h, &dev).unwrap();
let (hy, _workspace) =
ferrotorch_gpu::kernels::gpu_fused_gru_forward(&ig, &hg, &bih, &bhh, &hx, hsz, &dev)
.unwrap();
let actual_hy = gpu_to_cpu(&hy, &dev).unwrap();
assert_close_f32(
&actual_hy,
&expected_hy,
TOL_TRANSCENDENTAL,
"gru_forward hy",
);
}
#[test]
fn kernel_maxpool2d_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "maxpool2d");
let inp_h = as_f32_vec(&f["input"]);
let inp_shape: Vec<usize> = f["input_shape"]
.as_array()
.unwrap()
.iter()
.map(|x| x.as_u64().unwrap() as usize)
.collect();
let kh = f["kernel_h"].as_u64().unwrap() as usize;
let kw = f["kernel_w"].as_u64().unwrap() as usize;
let sh = f["stride_h"].as_u64().unwrap() as usize;
let sw = f["stride_w"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let [b, c, h, w] = [inp_shape[0], inp_shape[1], inp_shape[2], inp_shape[3]];
let (out, _out_shape) =
ferrotorch_gpu::kernels::gpu_maxpool2d(&inp, b, c, h, w, kh, kw, sh, sw, 0, 0, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_maxpool2d");
}
#[test]
fn kernel_avgpool2d_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "kernels", "avgpool2d");
let inp_h = as_f32_vec(&f["input"]);
let inp_shape: Vec<usize> = f["input_shape"]
.as_array()
.unwrap()
.iter()
.map(|x| x.as_u64().unwrap() as usize)
.collect();
let kh = f["kernel_h"].as_u64().unwrap() as usize;
let kw = f["kernel_w"].as_u64().unwrap() as usize;
let sh = f["stride_h"].as_u64().unwrap() as usize;
let sw = f["stride_w"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let [b, c, h, w] = [inp_shape[0], inp_shape[1], inp_shape[2], inp_shape[3]];
let (out, _out_shape) =
ferrotorch_gpu::kernels::gpu_avgpool2d(&inp, b, c, h, w, kh, kw, sh, sw, 0, 0, &dev)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_ELEMENTWISE, "gpu_avgpool2d");
}
#[test]
fn kernel_small_matmul_matches_reference() {
ensure_cuda();
let dev = device();
let m = 4usize;
let k = 4usize;
let n = 4usize;
let a_h: Vec<f32> = (0..m * k).map(|i| (i as f32 + 1.0) * 0.1).collect();
let b_h: Vec<f32> = (0..k * n).map(|i| (i as f32 + 1.0) * 0.1).collect();
let mut expected = vec![0.0f32; m * n];
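    // Naive O(m * k * n) host-side reference matmul.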
for i in 0..m {
for j in 0..n {
for kk in 0..k {
expected[i * n + j] += a_h[i * k + kk] * b_h[kk * n + j];
}
}
}
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_small_matmul(&a, &b, m, k, n, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_MATMUL_F32, "gpu_small_matmul");
}
#[test]
fn kernel_small_bmm_reachable() {
ensure_cuda();
let dev = device();
let batch = 2usize;
let m = 4usize;
let k = 4usize;
let n = 4usize;
let a_h: Vec<f32> = (0..batch * m * k).map(|i| (i as f32 + 1.0) * 0.1).collect();
let b_h: Vec<f32> = (0..batch * k * n).map(|i| (i as f32 + 1.0) * 0.1).collect();
let a = cpu_to_gpu(&a_h, &dev).unwrap();
let b = cpu_to_gpu(&b_h, &dev).unwrap();
let out = ferrotorch_gpu::kernels::gpu_small_bmm(&a, &b, batch, m, k, n, &dev).unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), batch * m * n, "small_bmm output len");
}
#[test]
fn kernel_precompile_decode_kernels_succeeds() {
ensure_cuda();
let dev = device();
ferrotorch_gpu::kernels::precompile_decode_kernels(&dev).expect("precompile_decode_kernels");
}
#[test]
fn flash_attention_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = {
let fixtures = all["fixtures"].as_array().unwrap();
fixtures
.iter()
.find(|f| {
f["module"].as_str() == Some("flash_attention")
&& f["dtype"].as_str() == Some("float32")
&& f["causal"].as_bool() == Some(false)
})
.expect("flash_attention f32 non-causal fixture")
};
let q_h = as_f32_vec(&f["query"]);
let k_h = as_f32_vec(&f["key"]);
let v_h = as_f32_vec(&f["value"]);
let n_q = f["n_q"].as_u64().unwrap() as usize;
let n_k = f["n_k"].as_u64().unwrap() as usize;
let d = f["d"].as_u64().unwrap() as usize;
let d_v = f["d_v"].as_u64().unwrap() as usize;
let batch_heads = f["batch_heads"].as_u64().unwrap() as usize;
let scale = f["scale"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let q = cpu_to_gpu(&q_h, &dev).unwrap();
let k = cpu_to_gpu(&k_h, &dev).unwrap();
let v = cpu_to_gpu(&v_h, &dev).unwrap();
let out = ferrotorch_gpu::flash_attention::gpu_flash_attention_f32(
&q,
&k,
&v,
n_q,
n_k,
d,
d_v,
batch_heads,
scale,
false,
&dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_FLASH_F32, "flash_attention_f32");
}
#[test]
fn flash_attention_f32_causal_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = {
let fixtures = all["fixtures"].as_array().unwrap();
fixtures
.iter()
.find(|f| {
f["module"].as_str() == Some("flash_attention")
&& f["dtype"].as_str() == Some("float32")
&& f["causal"].as_bool() == Some(true)
})
.expect("flash_attention f32 causal fixture")
};
let q_h = as_f32_vec(&f["query"]);
let k_h = as_f32_vec(&f["key"]);
let v_h = as_f32_vec(&f["value"]);
let n_q = f["n_q"].as_u64().unwrap() as usize;
let n_k = f["n_k"].as_u64().unwrap() as usize;
let d = f["d"].as_u64().unwrap() as usize;
let d_v = f["d_v"].as_u64().unwrap() as usize;
let batch_heads = f["batch_heads"].as_u64().unwrap() as usize;
let scale = f["scale"].as_f64().unwrap() as f32;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let q = cpu_to_gpu(&q_h, &dev).unwrap();
let k = cpu_to_gpu(&k_h, &dev).unwrap();
let v = cpu_to_gpu(&v_h, &dev).unwrap();
let out = ferrotorch_gpu::flash_attention::gpu_flash_attention_f32(
&q,
&k,
&v,
n_q,
n_k,
d,
d_v,
batch_heads,
scale,
true,
&dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(
&actual,
&expected,
TOL_FLASH_F32,
"flash_attention_f32_causal",
);
}
#[test]
fn flash_attention_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = {
let fixtures = all["fixtures"].as_array().unwrap();
fixtures
.iter()
.find(|f| {
f["module"].as_str() == Some("flash_attention")
&& f["dtype"].as_str() == Some("float64")
&& f["causal"].as_bool() == Some(false)
})
.expect("flash_attention f64 non-causal fixture")
};
let q_h = as_f64_vec(&f["query"]);
let k_h = as_f64_vec(&f["key"]);
let v_h = as_f64_vec(&f["value"]);
let n_q = f["n_q"].as_u64().unwrap() as usize;
let n_k = f["n_k"].as_u64().unwrap() as usize;
let d = f["d"].as_u64().unwrap() as usize;
let d_v = f["d_v"].as_u64().unwrap() as usize;
let batch_heads = f["batch_heads"].as_u64().unwrap() as usize;
let scale = f["scale"].as_f64().unwrap();
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let q = cpu_to_gpu(&q_h, &dev).unwrap();
let k = cpu_to_gpu(&k_h, &dev).unwrap();
let v = cpu_to_gpu(&v_h, &dev).unwrap();
let out = ferrotorch_gpu::flash_attention::gpu_flash_attention_f64(
&q,
&k,
&v,
n_q,
n_k,
d,
d_v,
batch_heads,
scale,
false,
&dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, TOL_FLASH_F64, "flash_attention_f64");
}
#[test]
fn sparse_cusparse_handle_create_drop() {
ensure_cuda();
let _h = ferrotorch_gpu::sparse::CusparseHandle::new().expect("CusparseHandle::new");
}
#[test]
fn sparse_spmm_csr_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "spmm_csr_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals: Vec<f32> = as_f32_vec(&f["values"]);
let dense_h: Vec<f32> = as_f32_vec(&f["dense"]);
let m = f["m"].as_u64().unwrap() as usize;
let k = f["k"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let dense = cpu_to_gpu(&dense_h, &dev).unwrap();
let out = ferrotorch_gpu::sparse::gpu_spmm_csr_f32(
&handle, &crow, &col, &vals, &dense, m, k, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_SPMM_F32, "spmm_csr_f32");
}
#[test]
fn sparse_spmm_csr_f64_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "spmm_csr_f64");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals_f = &f["values"];
let vals: Vec<f64> = vals_f
.as_array()
.unwrap()
.iter()
.map(|x| x.as_f64().unwrap())
.collect();
let dense_h: Vec<f64> = as_f64_vec(&f["dense"]);
let m = f["m"].as_u64().unwrap() as usize;
let k = f["k"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected = as_f64_vec(&f["expected"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let dense = cpu_to_gpu(&dense_h, &dev).unwrap();
let out = ferrotorch_gpu::sparse::gpu_spmm_csr_f64(
&handle, &crow, &col, &vals, &dense, m, k, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f64(&actual, &expected, TOL_SPMM_F64, "spmm_csr_f64");
}
#[test]
fn sparse_to_dense_csr_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "sparse_to_dense_csr_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals: Vec<f32> = as_f32_vec(&f["values"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let out = ferrotorch_gpu::sparse::gpu_sparse_to_dense_csr_f32(
&handle, &crow, &col, &vals, m, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "sparse_to_dense_csr_f32");
}
#[test]
fn sparse_to_dense_csr_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "sparse_to_dense_csr_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals_f32: Vec<f32> = as_f32_vec(&f["values"]);
let vals: Vec<f64> = vals_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let out = ferrotorch_gpu::sparse::gpu_sparse_to_dense_csr_f64(
&handle, &crow, &col, &vals, m, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), m * n);
}
#[test]
fn sparse_dense_to_sparse_csr_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "dense_to_sparse_csr_f32");
let dense_h: Vec<f32> = as_f32_vec(&f["dense"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected_crow: Vec<u32> = as_u32_vec(&f["expected_crow"]);
let expected_col: Vec<u32> = as_u32_vec(&f["expected_col"]);
let expected_vals: Vec<f32> = as_f32_vec(&f["expected_vals"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let dense = cpu_to_gpu(&dense_h, &dev).unwrap();
let (crow, col, vals) =
ferrotorch_gpu::sparse::gpu_dense_to_sparse_csr_f32(&handle, &dense, m, n, &dev).unwrap();
assert_eq!(crow, expected_crow, "dense_to_csr crow_indices");
assert_eq!(col, expected_col, "dense_to_csr col_indices");
assert_close_f32(&vals, &expected_vals, 0.0, "dense_to_csr values");
}
#[test]
fn sparse_dense_to_sparse_csr_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "dense_to_sparse_csr_f32");
let dense_f32: Vec<f32> = as_f32_vec(&f["dense"]);
let dense_h: Vec<f64> = dense_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let dense = cpu_to_gpu(&dense_h, &dev).unwrap();
    let (crow, _col, _vals) =
        ferrotorch_gpu::sparse::gpu_dense_to_sparse_csr_f64(&handle, &dense, m, n, &dev).unwrap();
    assert_eq!(crow.len(), m + 1, "crow len = m+1");
}
#[test]
fn sparse_csc_to_dense_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csc_to_dense_f32");
let col_ptr: Vec<u32> = as_u32_vec(&f["col_ptr"]);
let row_idx: Vec<u32> = as_u32_vec(&f["row_idx"]);
let csc_vals: Vec<f32> = as_f32_vec(&f["csc_vals"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let out = ferrotorch_gpu::sparse::gpu_csc_to_dense_f32(
&handle, &col_ptr, &row_idx, &csc_vals, m, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, 0.0, "csc_to_dense_f32");
}
#[test]
fn sparse_csc_to_dense_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csc_to_dense_f32");
let col_ptr: Vec<u32> = as_u32_vec(&f["col_ptr"]);
let row_idx: Vec<u32> = as_u32_vec(&f["row_idx"]);
let csc_f32: Vec<f32> = as_f32_vec(&f["csc_vals"]);
let csc_vals: Vec<f64> = csc_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let out = ferrotorch_gpu::sparse::gpu_csc_to_dense_f64(
&handle, &col_ptr, &row_idx, &csc_vals, m, n, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_eq!(actual.len(), m * n);
}
#[test]
fn sparse_csr_to_csc_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csr_to_csc_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals: Vec<f32> = as_f32_vec(&f["values"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected_col_ptr: Vec<u32> = as_u32_vec(&f["expected_col_ptr"]);
let expected_row_idx: Vec<u32> = as_u32_vec(&f["expected_row_idx"]);
let expected_csc_vals: Vec<f32> = as_f32_vec(&f["expected_csc_vals"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (col_ptr, row_idx, csc_vals) =
ferrotorch_gpu::sparse::gpu_csr_to_csc_f32(&handle, &crow, &col, &vals, m, n, &dev)
.unwrap();
assert_eq!(col_ptr, expected_col_ptr, "csr_to_csc col_ptr");
assert_eq!(row_idx, expected_row_idx, "csr_to_csc row_idx");
assert_close_f32(&csc_vals, &expected_csc_vals, 0.0, "csr_to_csc values");
}
#[test]
fn sparse_csr_to_csc_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csr_to_csc_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals_f32: Vec<f32> = as_f32_vec(&f["values"]);
let vals: Vec<f64> = vals_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (col_ptr, _row_idx, _csc_vals) =
ferrotorch_gpu::sparse::gpu_csr_to_csc_f64(&handle, &crow, &col, &vals, m, n, &dev)
.unwrap();
assert_eq!(col_ptr.len(), n + 1, "csc col_ptr len = n+1");
}
#[test]
fn sparse_coo_to_csr_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "coo_to_csr_f32");
let row_idx: Vec<u32> = as_u32_vec(&f["row_indices"]);
let col_idx: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals: Vec<f32> = as_f32_vec(&f["values"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected_crow: Vec<u32> = as_u32_vec(&f["expected_crow"]);
let expected_col: Vec<u32> = as_u32_vec(&f["expected_col"]);
let expected_vals: Vec<f32> = as_f32_vec(&f["expected_vals"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (crow, col, csr_vals) =
ferrotorch_gpu::sparse::gpu_coo_to_csr_f32(&handle, &row_idx, &col_idx, &vals, m, n, &dev)
.unwrap();
assert_eq!(crow, expected_crow, "coo_to_csr crow");
assert_eq!(col, expected_col, "coo_to_csr col");
assert_close_f32(&csr_vals, &expected_vals, 0.0, "coo_to_csr vals");
}
#[test]
fn sparse_coo_to_csr_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "coo_to_csr_f32");
let row_idx: Vec<u32> = as_u32_vec(&f["row_indices"]);
let col_idx: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals_f32: Vec<f32> = as_f32_vec(&f["values"]);
let vals: Vec<f64> = vals_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (crow, _col, _vals) =
ferrotorch_gpu::sparse::gpu_coo_to_csr_f64(&handle, &row_idx, &col_idx, &vals, m, n, &dev)
.unwrap();
assert_eq!(crow.len(), m + 1, "coo_to_csr_f64 crow len = m+1");
}
#[test]
fn sparse_csr_to_coo_f32_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csr_to_coo_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals: Vec<f32> = as_f32_vec(&f["values"]);
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let expected_row: Vec<u32> = as_u32_vec(&f["expected_row"]);
let expected_col: Vec<u32> = as_u32_vec(&f["expected_col"]);
let expected_vals: Vec<f32> = as_f32_vec(&f["expected_vals"]);
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (row_out, col_out, vals_out) =
ferrotorch_gpu::sparse::gpu_csr_to_coo_f32(&handle, &crow, &col, &vals, m, n, &dev)
.unwrap();
assert_eq!(row_out, expected_row, "csr_to_coo row");
assert_eq!(col_out, expected_col, "csr_to_coo col");
assert_close_f32(&vals_out, &expected_vals, 0.0, "csr_to_coo vals");
}
#[test]
fn sparse_csr_to_coo_f64_reachable() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "sparse", "csr_to_coo_f32");
let crow: Vec<u32> = as_u32_vec(&f["crow_indices"]);
let col: Vec<u32> = as_u32_vec(&f["col_indices"]);
let vals_f32: Vec<f32> = as_f32_vec(&f["values"]);
let vals: Vec<f64> = vals_f32.iter().map(|&x| x as f64).collect();
let m = f["m"].as_u64().unwrap() as usize;
let n = f["n"].as_u64().unwrap() as usize;
let dev = device();
let handle = ferrotorch_gpu::sparse::CusparseHandle::new().unwrap();
let (row_out, _col_out, _vals_out) =
ferrotorch_gpu::sparse::gpu_csr_to_coo_f64(&handle, &crow, &col, &vals, m, n, &dev)
.unwrap();
assert_eq!(row_out.len(), vals.len(), "csr_to_coo_f64 row.len == nnz");
}
#[test]
fn conv_conv2d_f32_no_bias_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "conv", "conv2d_no_bias");
let inp_h = as_f32_vec(&f["input"]);
let wt_h = as_f32_vec(&f["weight"]);
let inp_shape: [usize; 4] = {
let s = f["input_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let wt_shape: [usize; 4] = {
let s = f["weight_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let stride = (
f["stride"][0].as_u64().unwrap() as usize,
f["stride"][1].as_u64().unwrap() as usize,
);
let padding = (
f["padding"][0].as_u64().unwrap() as usize,
f["padding"][1].as_u64().unwrap() as usize,
);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let wt = cpu_to_gpu(&wt_h, &dev).unwrap();
let (out, _out_shape) = ferrotorch_gpu::conv::gpu_conv2d_f32(
&inp, &wt, None, inp_shape, wt_shape, stride, padding, (1, 1), 1, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_CONV_F32, "conv2d_no_bias");
}
#[test]
fn conv_conv2d_f32_with_bias_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "conv", "conv2d_with_bias");
let inp_h = as_f32_vec(&f["input"]);
let wt_h = as_f32_vec(&f["weight"]);
let bias_h = as_f32_vec(&f["bias"]);
let inp_shape: [usize; 4] = {
let s = f["input_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let wt_shape: [usize; 4] = {
let s = f["weight_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let stride = (
f["stride"][0].as_u64().unwrap() as usize,
f["stride"][1].as_u64().unwrap() as usize,
);
let padding = (
f["padding"][0].as_u64().unwrap() as usize,
f["padding"][1].as_u64().unwrap() as usize,
);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let wt = cpu_to_gpu(&wt_h, &dev).unwrap();
let bias = cpu_to_gpu(&bias_h, &dev).unwrap();
let (out, _out_shape) = ferrotorch_gpu::conv::gpu_conv2d_f32(
&inp,
&wt,
Some(&bias),
inp_shape,
wt_shape,
stride,
padding,
(1, 1),
1,
&dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_CONV_F32, "conv2d_with_bias");
}
#[test]
fn conv_conv2d_f32_padded_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "conv", "conv2d_padded");
let inp_h = as_f32_vec(&f["input"]);
let wt_h = as_f32_vec(&f["weight"]);
let inp_shape: [usize; 4] = {
let s = f["input_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let wt_shape: [usize; 4] = {
let s = f["weight_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let stride = (
f["stride"][0].as_u64().unwrap() as usize,
f["stride"][1].as_u64().unwrap() as usize,
);
let padding = (
f["padding"][0].as_u64().unwrap() as usize,
f["padding"][1].as_u64().unwrap() as usize,
);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let wt = cpu_to_gpu(&wt_h, &dev).unwrap();
let (out, _out_shape) = ferrotorch_gpu::conv::gpu_conv2d_f32(
&inp, &wt, None, inp_shape, wt_shape, stride, padding, (1, 1), 1, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_CONV_F32, "conv2d_padded");
}
#[test]
fn conv_conv2d_f32_multichannel_matches_reference() {
ensure_cuda();
let all = load_fixtures();
let f = pick(&all, "conv", "conv2d_multichannel");
let inp_h = as_f32_vec(&f["input"]);
let wt_h = as_f32_vec(&f["weight"]);
let inp_shape: [usize; 4] = {
let s = f["input_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let wt_shape: [usize; 4] = {
let s = f["weight_shape"].as_array().unwrap();
[
s[0].as_u64().unwrap() as usize,
s[1].as_u64().unwrap() as usize,
s[2].as_u64().unwrap() as usize,
s[3].as_u64().unwrap() as usize,
]
};
let stride = (
f["stride"][0].as_u64().unwrap() as usize,
f["stride"][1].as_u64().unwrap() as usize,
);
let padding = (
f["padding"][0].as_u64().unwrap() as usize,
f["padding"][1].as_u64().unwrap() as usize,
);
let expected = as_f32_vec(&f["expected"]);
let dev = device();
let inp = cpu_to_gpu(&inp_h, &dev).unwrap();
let wt = cpu_to_gpu(&wt_h, &dev).unwrap();
let (out, _out_shape) = ferrotorch_gpu::conv::gpu_conv2d_f32(
&inp, &wt, None, inp_shape, wt_shape, stride, padding, (1, 1), 1, &dev,
)
.unwrap();
let actual = gpu_to_cpu(&out, &dev).unwrap();
assert_close_f32(&actual, &expected, TOL_CONV_F32, "conv2d_multichannel");
}