//! PTX pixel FKR test suite for trueno-gpu kernels: static checks on the
//! structure of emitted PTX, plus runtime validation against the scalar
//! reference implementations defined below.
#![cfg(feature = "cuda")]

use trueno_gpu::kernels::{
    Activation, AttentionKernel, BiasActivationKernel, GemmKernel, Kernel, LayerNormKernel,
    SoftmaxKernel,
};

#[cfg(feature = "gpu-pixels")]
use jugar_probar::gpu_pixels::{validate_ptx, PtxBugClass};

/// Absolute tolerance used when comparing GPU output against the scalar references.
const PTX_TOLERANCE: f32 = 1e-5;

mod ptx_analysis;
mod ptx_runtime;
/// Numerically stable scalar softmax reference: subtracts the maximum before
/// exponentiating so large inputs cannot overflow.
fn scalar_softmax(x: &[f32]) -> Vec<f32> {
let max_val = x.iter().cloned().fold(f32::NEG_INFINITY, f32::max);
let exp_vals: Vec<f32> = x.iter().map(|xi| (xi - max_val).exp()).collect();
let sum: f32 = exp_vals.iter().sum();
exp_vals.iter().map(|e| e / sum).collect()
}
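
// CPU-only sanity check for the scalar reference itself (added guard, no GPU
// needed): softmax output must be a probability distribution and preserve the
// ordering of its inputs.
#[test]
fn scalar_softmax_reference_sanity() {
    let x = [1.0f32, 2.0, 3.0, 4.0];
    let y = scalar_softmax(&x);
    let sum: f32 = y.iter().sum();
    assert!((sum - 1.0).abs() < PTX_TOLERANCE, "softmax must sum to 1, got {sum}");
    assert!(y.iter().all(|&p| (0.0..=1.0).contains(&p)));
    // Strictly increasing input => strictly increasing probabilities.
    assert!(y.windows(2).all(|w| w[0] < w[1]));
}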
/// Scalar bias-add + activation reference. The bias is broadcast cyclically
/// (index modulo `bias.len()`); GELU uses the sigmoid approximation
/// `x * sigmoid(1.702 * x)` rather than the exact erf form.
fn scalar_bias_activation(x: &[f32], bias: &[f32], activation: Activation) -> Vec<f32> {
x.iter()
.enumerate()
.map(|(i, &val)| {
let biased = val + bias[i % bias.len()];
match activation {
Activation::None => biased,
Activation::ReLU => biased.max(0.0),
Activation::GELU => {
let scaled = 1.702 * biased;
let sigmoid = 1.0 / (1.0 + (-scaled).exp());
biased * sigmoid
}
}
})
.collect()
}
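
// CPU-only sanity check for the reference above: Activation::None must reduce
// to a broadcast bias add, and ReLU must clamp the biased negatives to zero.
#[test]
fn scalar_bias_activation_reference_sanity() {
    let x = [-2.0f32, -1.0, 0.5, 2.0];
    let bias = [0.25f32];
    let none = scalar_bias_activation(&x, &bias, Activation::None);
    let relu = scalar_bias_activation(&x, &bias, Activation::ReLU);
    for i in 0..x.len() {
        assert!((none[i] - (x[i] + 0.25)).abs() < PTX_TOLERANCE);
        assert_eq!(relu[i], none[i].max(0.0));
    }
}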
/// Scalar LayerNorm reference over a single vector: normalize to zero mean and
/// unit variance (with `eps` for numerical stability), then scale by `gamma`
/// and shift by `beta`.
fn scalar_layernorm(x: &[f32], gamma: &[f32], beta: &[f32], eps: f32) -> Vec<f32> {
let n = x.len() as f32;
let mean: f32 = x.iter().sum::<f32>() / n;
let variance: f32 = x.iter().map(|xi| (xi - mean).powi(2)).sum::<f32>() / n;
let std = (variance + eps).sqrt();
x.iter()
.zip(gamma.iter())
.zip(beta.iter())
.map(|((xi, gi), bi)| ((xi - mean) / std) * gi + bi)
.collect()
}
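
// CPU-only sanity check for the reference above: with gamma = 1 and beta = 0,
// the normalized output must have approximately zero mean.
#[test]
fn scalar_layernorm_reference_sanity() {
    let x = [1.0f32, 2.0, 3.0, 4.0];
    let gamma = vec![1.0f32; x.len()];
    let beta = vec![0.0f32; x.len()];
    let y = scalar_layernorm(&x, &gamma, &beta, 1e-5);
    let mean: f32 = y.iter().sum::<f32>() / y.len() as f32;
    assert!(mean.abs() < PTX_TOLERANCE, "normalized mean should be ~0, got {mean}");
}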
/// Naive scalar GEMM reference: `C = A * B` with row-major `m x k` A,
/// `k x n` B, and `m x n` C.
fn scalar_gemm(a: &[f32], b: &[f32], m: usize, n: usize, k: usize) -> Vec<f32> {
let mut c = vec![0.0f32; m * n];
for i in 0..m {
for j in 0..n {
let mut sum = 0.0f32;
for l in 0..k {
sum += a[i * k + l] * b[l * n + j];
}
c[i * n + j] = sum;
}
}
c
}
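
// CPU-only sanity check for the reference above: multiplying by the identity
// matrix must return the input unchanged (exactly, since each dot product is
// a single 1.0 * x plus zeros), exercising the row-major indexing.
#[test]
fn scalar_gemm_reference_sanity() {
    let m: usize = 3;
    let a: Vec<f32> = (0..m * m).map(|i| i as f32).collect();
    let mut identity = vec![0.0f32; m * m];
    for i in 0..m {
        identity[i * m + i] = 1.0;
    }
    let c = scalar_gemm(&a, &identity, m, m, m);
    assert_eq!(a, c);
}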
/// Minimal xorshift64 PRNG for deterministic test data (not cryptographic).
/// Note: a seed of 0 is a fixed point of xorshift and would yield all zeros;
/// the tests below always seed with nonzero values.
struct SimpleRng {
    state: u64,
}

impl SimpleRng {
    fn new(seed: u64) -> Self {
        Self { state: seed }
    }

    /// Advances the xorshift state and maps it into roughly [-1.0, 1.0].
    fn next_f32(&mut self) -> f32 {
        self.state ^= self.state << 13;
        self.state ^= self.state >> 7;
        self.state ^= self.state << 17;
        (self.state as f32 / u64::MAX as f32) * 2.0 - 1.0
    }

    /// Generates `n` pseudo-random f32 values.
    fn gen_vec(&mut self, n: usize) -> Vec<f32> {
        (0..n).map(|_| self.next_f32()).collect()
    }
}
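
// CPU-only sanity check for the generator above: a fixed seed must reproduce
// the same sequence, and every value must stay within [-1.0, 1.0] (both ends
// reachable after f32 rounding of state / u64::MAX).
#[test]
fn simple_rng_sanity() {
    let a = SimpleRng::new(42).gen_vec(256);
    let b = SimpleRng::new(42).gen_vec(256);
    assert_eq!(a, b, "same seed must reproduce the same sequence");
    assert!(a.iter().all(|&v| (-1.0..=1.0).contains(&v)));
}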
#[test]
#[cfg(feature = "cuda")]
fn ptx_pixel_fkr_quantize_kernel() {
use trueno_gpu::kernels::QuantizeKernel;
    let test_cases = [
        (2560, 1, 2560),
        (1024, 1, 4096),
        (4096, 4096, 4096),
        (17, 1, 17),
        (256, 256, 256),
    ];
for (m, n, k) in test_cases {
let kernel = QuantizeKernel::new(m, n, k);
let ptx = kernel.emit_ptx();
assert!(
ptx.contains(".version"),
"QuantizeKernel[{m}x{n}x{k}] missing PTX version"
);
assert!(
ptx.contains(".target"),
"QuantizeKernel[{m}x{n}x{k}] missing PTX target"
);
assert!(
ptx.contains(".entry") || ptx.contains(".visible"),
"QuantizeKernel[{m}x{n}x{k}] missing entry point"
);
#[cfg(feature = "gpu-pixels")]
{
let result = validate_ptx(&ptx);
assert!(
result.is_valid(),
"QuantizeKernel[{m}x{n}x{k}] has PTX bugs: {:?}",
result.bugs
);
}
println!(
"ptx_pixel_fkr_quantize[{m}x{n}x{k}]: PASS ({} bytes)",
ptx.len()
);
}
}
#[test]
#[cfg(feature = "cuda")]
fn ptx_pixel_fkr_bias_activation_runtime() {
use trueno_gpu::driver::CudaContext;
    // Skip gracefully when no CUDA device is present.
    if CudaContext::new(0).is_err() {
        eprintln!("Skipping PTX BiasActivation runtime test: no CUDA device");
        return;
    }
let n: usize = 1024;
let bias_size: usize = 64;
let mut rng = SimpleRng::new(45678);
let x = rng.gen_vec(n);
let bias = rng.gen_vec(bias_size);
    for activation in [Activation::None, Activation::ReLU, Activation::GELU] {
        // CPU reference; this test validates PTX emission only, so the scalar
        // value is printed for manual inspection rather than compared on-device.
        let scalar_result = scalar_bias_activation(&x, &bias, activation);
let kernel =
BiasActivationKernel::new(n as u32, bias_size as u32).with_activation(activation);
let ptx = kernel.emit_ptx();
assert!(
ptx.contains(".entry"),
"BiasActivation PTX should have entry point"
);
println!(
"ptx_pixel_fkr_bias_activation_{:?}: PTX generated ({} bytes)",
activation,
ptx.len()
);
println!(" Scalar result[0]: {:.6}", scalar_result[0]);
}
}
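
// Hedged sketch of where these scalar references are headed: once device
// execution is wired up (see ptx_runtime), GPU output would be compared
// elementwise against the CPU reference using PTX_TOLERANCE. The helper below
// is illustrative only and not yet called; `assert_close` is a name chosen
// here, not part of the trueno_gpu API.
#[allow(dead_code)]
fn assert_close(gpu: &[f32], cpu: &[f32]) {
    assert_eq!(gpu.len(), cpu.len(), "length mismatch");
    for (i, (g, c)) in gpu.iter().zip(cpu.iter()).enumerate() {
        assert!(
            (g - c).abs() < PTX_TOLERANCE,
            "mismatch at index {i}: gpu={g}, cpu={c}"
        );
    }
}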
#[test]
fn ptx_pixel_fkr_summary() {
println!("");
println!("========================================");
println!(" PTX Pixel FKR Suite (trueno-gpu)");
println!("========================================");
println!("");
println!(" Static Analysis Tests:");
println!(" - gemm_tiled_no_bugs");
println!(" - gemm_tensor_core");
println!(" - attention");
println!(" - attention_causal");
println!(" - softmax_entry");
println!(" - layernorm_entry");
println!(" - bias_activation_entry");
println!(" - bias_activation_gelu_approx");
println!(" - bias_activation_relu_max");
println!("");
println!(" Runtime Validation Tests:");
println!(" - softmax_runtime");
println!(" - gemm_runtime");
println!(" - layernorm_runtime");
println!(" - bias_activation_runtime");
println!("");
println!(" Issue #67 Prevention:");
println!(" - quantize_kernel (multiple dimensions)");
println!("");
println!("========================================");
}