use hanzo_kernel::cubecl::cpu::{CpuDevice, CpuRuntime};
use hanzo_kernel::prelude::*;
/// Runs four `hanzo_kernel` ops (RMS norm, layer norm, RoPE, and a Q8
/// matrix-vector product) through the CPU runtime client and prints each result.
///
/// # Panics
///
/// Panics if any value returned by the four ops is not finite.
fn main() {
    let client = CpuRuntime::client(&CpuDevice::default());

    // Literal inputs reused by several of the calls below.
    let input = [1.0, 2.0, 3.0, 4.0];
    let ones = [1.0; 4];

    let normed =
        hanzo_kernel::norm::rms_norm_run::<CpuRuntime>(&client, &input, &ones, 1, 4, 1e-6);
    println!("rms_norm = {normed:?}");

    let zeros = [0.0; 4];
    let ln = hanzo_kernel::norm::layer_norm_run::<CpuRuntime>(
        &client, &input, &ones, &zeros, 1, 4, 1e-6,
    );
    println!("layer_norm = {ln:?}");

    // Presumably the two short slices are cos/sin tables — TODO confirm against `rope_run`.
    // NOTE(review): the label below says "neox" while the trailing flag is `false`;
    // verify that this flag value really selects the variant named in the label.
    let table_a = [1.0, 1.0];
    let table_b = [0.0, 0.0];
    let rope = hanzo_kernel::rope::rope_run::<CpuRuntime>(
        &client, &input, &table_a, &table_b, 1, 4, false,
    );
    println!("rope (neox) = {rope:?}");

    // Argument roles (scales / quantized values / vector) are assumed from the
    // op name — TODO confirm against `matvec_q8_run`.
    let mv = hanzo_kernel::quant::matvec_q8_run::<CpuRuntime>(
        &client,
        &[0.1; 4],
        &[1; 16],
        &[1.0; 4],
        4,
        4,
    );
    println!("matvec_q8 = {mv:?}");

    // Sanity check over every output element from all four ops.
    assert!(normed.iter().chain(&ln).chain(&rope).chain(&mv).all(|v| v.is_finite()));
    println!("\nall ops ran on the CPU runtime — the same source lowers to every GPU backend.");
}