use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;
use crate::backend::hip_dense::{
hipcc_compile_executable, hipcc_compiler_fingerprint, hipcc_recheck_artifact,
};
use crate::backend::hip_gelu::{f16_to_f32, f32_to_f16};
use crate::backend::kernel_server;
use crate::backend::rocm::{RocmHipCapabilityReport, detect_local_rocm_hip};
use crate::{Error, Result};
/// Backend identifier for the fp16 GELU backward pilot; it is printed on the
/// `backend:` line of the Markdown report.
pub const ROCM_HIP_GELU_BWD_BACKEND: &str = "rocm_hip_gelu_bwd_pilot";
/// Lowering identifier for this pilot.
///
/// NOTE(review): not referenced in the visible part of this file; presumably
/// consumed by a lowering registry elsewhere — confirm.
pub const ROCM_HIP_GELU_BWD_LOWERING_ID: &str = "hip.gelu.fp16_f32.bwd";
/// Kernel-type tag handed to `kernel_server::run_persistent` together with the
/// compiled executable path.
const GELU_BWD_KERNEL_TYPE: &str = "hip-gelu-bwd";
/// HIP C++ source of the fp16 GELU backward pilot program.
///
/// The runner functions in this file write this text into the HIP cache
/// directory and compile it with hipcc. The text is also hashed to name the
/// cached artifacts, so any edit here (including comments) changes the cache key.
///
/// Program behaviour, as implemented below:
/// - One-shot mode reads `N`, then `N` grad_output values, then `N` input
///   values from stdin (each an unsigned 16-bit integer holding fp16 bits),
///   runs `gelu_bw_fp16_f32_kernel` with 256-thread blocks, and prints
///   `KEY=VALUE` lines (`DEVICE_NAME`, `GFX`, `N`, `GRID`, `BLOCK`,
///   `KERNEL_TIME_MS`, `RESULTS`).
/// - `--server` mode loops over framed requests: a raw 4-byte `uint32_t`
///   length followed by the one-shot text payload, answered with a raw 4-byte
///   length followed by the one-shot text output. Zero header bytes means a
///   clean shutdown (exit 0); a 1-3 byte header is a truncated request
///   (exit 20). A failing request appends the captured stderr to the response
///   and ends the server with that request's return code.
///
/// NOTE(review): `check()` terminates with `std::exit(10)`; in server mode
/// `std::cerr` is redirected into a local buffer at that point, so the
/// `HIP_ERROR` text is not forwarded to the host — confirm this is acceptable.
pub const HIP_GELU_BWD_KERNEL: &str = r#"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>
// Analytical GELU backward kernel. For each element:
// inner = c * (x + a * x^3) where c = sqrt(2/pi), a = 0.044715
// t = tanh(inner)
// dydx = 0.5 * (1 + t) + 0.5 * x * (1 - t^2) * c * (1 + 3 * a * x^2)
// grad_input = grad_output * dydx
// Internal math is fp32; only the IO conversions touch fp16.
__global__ void gelu_bw_fp16_f32_kernel(
const __half* grad_output,
const __half* input,
__half* grad_input,
int n) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}
float go = __half2float(grad_output[idx]);
float x = __half2float(input[idx]);
float x2 = x * x;
float x3 = x2 * x;
const float c = 0.7978845608f; // sqrt(2/pi)
const float a = 0.044715f;
float inner = c * (x + a * x3);
float t = tanhf(inner);
float dt = 1.0f - t * t;
float dydx = 0.5f * (1.0f + t)
+ 0.5f * x * dt * c * (1.0f + 3.0f * a * x2);
grad_input[idx] = __float2half_rn(go * dydx);
}
static void check(hipError_t status, const char* label) {
if (status != hipSuccess) {
std::cerr << "HIP_ERROR " << label << "=" << hipGetErrorString(status) << "\n";
std::exit(10);
}
}
// Forward declaration of the existing main() body, extracted into
// a static helper so the server-mode loop can call it on each
// request. The default `main()` also routes through this helper so
// the one-shot and server code paths share the same compute logic.
static int run_one_shot_from_main_body();
// Persistent server-mode protocol (see hip_gemm_f16.rs for the full
// design rationale). The host writes a little-endian u32 payload_len
// followed by `payload_len` bytes of the existing text payload, then
// reads back a little-endian u32 response_len followed by
// `response_len` bytes of the existing text response.
static int run_server_mode() {
while (true) {
uint32_t payload_len = 0;
std::cin.read(reinterpret_cast<char*>(&payload_len), 4);
// read() sets failbit on any short read, so a `!std::cin` test cannot
// tell EOF from a truncated header; classify by byte count instead.
std::streamsize header_bytes = std::cin.gcount();
if (header_bytes == 0) {
return 0; // clean EOF
}
if (header_bytes != 4) {
std::cerr << "server_mode: short read on payload_len (got "
<< header_bytes << " bytes)\n";
return 20;
}
std::vector<char> payload(payload_len);
if (payload_len > 0) {
std::cin.read(payload.data(), payload_len);
if (static_cast<uint32_t>(std::cin.gcount()) != payload_len) {
std::cerr << "server_mode: short read on payload (got "
<< std::cin.gcount() << " of " << payload_len << ")\n";
return 21;
}
}
std::string payload_str(payload.begin(), payload.end());
std::istringstream fake_stdin(payload_str);
std::streambuf* old_buf = std::cin.rdbuf(fake_stdin.rdbuf());
std::ostringstream captured;
std::streambuf* old_cout = std::cout.rdbuf(captured.rdbuf());
std::ostringstream captured_err;
std::streambuf* old_cerr = std::cerr.rdbuf(captured_err.rdbuf());
int rc = run_one_shot_from_main_body();
std::cin.rdbuf(old_buf);
std::cout.rdbuf(old_cout);
std::cerr.rdbuf(old_cerr);
std::string response = captured.str();
if (rc != 0) {
std::string err_str = captured_err.str();
response += err_str;
}
uint32_t response_len = static_cast<uint32_t>(response.size());
std::cout.write(reinterpret_cast<const char*>(&response_len), 4);
if (response_len > 0) {
std::cout.write(response.data(), response_len);
}
std::cout.flush();
if (rc != 0) {
return rc;
}
}
}
int main(int argc, char** argv) {
if (argc > 1 && std::string(argv[1]) == "--server") {
return run_server_mode();
}
return run_one_shot_from_main_body();
}
static int run_one_shot_from_main_body() {
int n = 0;
if (!(std::cin >> n)) {
std::cerr << "usage: stdin payload is \"N\\n<grad_output_bits> <input_bits>\\n\"\n";
return 2;
}
if (n <= 0) {
std::cerr << "N must be positive\n";
return 3;
}
std::size_t count = static_cast<std::size_t>(n);
std::vector<uint16_t> go_bits(count);
std::vector<uint16_t> in_bits(count);
for (std::size_t i = 0; i < count; ++i) {
if (!(std::cin >> go_bits[i])) {
std::cerr << "failed to read grad_output element " << i << "\n";
return 4;
}
}
for (std::size_t i = 0; i < count; ++i) {
if (!(std::cin >> in_bits[i])) {
std::cerr << "failed to read input element " << i << "\n";
return 5;
}
}
int device = 0;
check(hipSetDevice(device), "hipSetDevice");
hipDeviceProp_t props;
check(hipGetDeviceProperties(&props, device), "hipGetDeviceProperties");
__half* d_go = nullptr;
__half* d_in = nullptr;
__half* d_out = nullptr;
std::size_t bytes = count * sizeof(__half);
check(hipMalloc(&d_go, bytes), "hipMalloc(grad_output)");
check(hipMalloc(&d_in, bytes), "hipMalloc(input)");
check(hipMalloc(&d_out, bytes), "hipMalloc(output)");
check(hipMemcpy(d_go, go_bits.data(), bytes, hipMemcpyHostToDevice), "hipMemcpy(grad_output)");
check(hipMemcpy(d_in, in_bits.data(), bytes, hipMemcpyHostToDevice), "hipMemcpy(input)");
int block = 256;
int grid = (n + block - 1) / block;
hipEvent_t start;
hipEvent_t stop;
check(hipEventCreate(&start), "hipEventCreate(start)");
check(hipEventCreate(&stop), "hipEventCreate(stop)");
check(hipEventRecord(start), "hipEventRecord(start)");
hipLaunchKernelGGL(gelu_bw_fp16_f32_kernel, dim3(grid), dim3(block), 0, 0, d_go, d_in, d_out, n);
check(hipGetLastError(), "hipLaunchKernelGGL");
check(hipEventRecord(stop), "hipEventRecord(stop)");
check(hipEventSynchronize(stop), "hipEventSynchronize");
float kernel_time_ms = 0.0f;
check(hipEventElapsedTime(&kernel_time_ms, start, stop), "hipEventElapsedTime");
check(hipEventDestroy(start), "hipEventDestroy(start)");
check(hipEventDestroy(stop), "hipEventDestroy(stop)");
std::vector<uint16_t> out_bits(count);
check(hipMemcpy(out_bits.data(), d_out, bytes, hipMemcpyDeviceToHost), "hipMemcpy(out)");
check(hipFree(d_go), "hipFree(grad_output)");
check(hipFree(d_in), "hipFree(input)");
check(hipFree(d_out), "hipFree(output)");
std::cout << "DEVICE_NAME=" << props.name << "\n";
std::cout << "GFX=" << props.gcnArchName << "\n";
std::cout << "N=" << n << "\n";
std::cout << "GRID=" << grid << "\n";
std::cout << "BLOCK=" << block << "\n";
std::cout << "KERNEL_TIME_MS=" << kernel_time_ms << "\n";
std::cout << "RESULTS=";
for (std::size_t i = 0; i < out_bits.size(); ++i) {
if (i != 0) {
std::cout << " ";
}
std::cout << out_bits[i];
}
std::cout << "\n";
return 0;
}
"#;
/// Result bundle produced by `run_rocm_hip_gelu_bwd_reported`: the GPU outputs,
/// the CPU oracle outputs, their comparison, and build/device provenance.
#[derive(Debug, Clone, PartialEq)]
pub struct RocmHipGeluBwdReport {
/// Number of elements requested by the caller.
pub n: usize,
/// Values parsed from the kernel's `RESULTS=` line (raw fp16 bit patterns).
pub outputs: Vec<u16>,
/// Values produced by `cpu_gelu_bwd_fp16` for the same inputs.
pub cpu_oracle_outputs: Vec<u16>,
/// Largest absolute difference between GPU and oracle outputs after
/// widening both to `f32`.
pub max_abs_error: f32,
/// `true` when `max_abs_error` is below `1e-2`.
pub within_tolerance: bool,
/// Value parsed from the kernel's `KERNEL_TIME_MS=` line.
pub kernel_time_ms: f32,
/// Fingerprint of the embedded HIP source; also used to name cache files.
pub kernel_source_fingerprint: String,
/// Value returned by `hipcc_compiler_fingerprint` for the hipcc binary.
pub compiler_fingerprint: String,
/// Value returned by `hipcc_compile_executable` for this build.
pub build_command: String,
/// Display form of the compiled executable's path inside the HIP cache.
pub executable_path: String,
/// Capability report returned by `detect_local_rocm_hip`.
pub device_evidence: RocmHipCapabilityReport,
/// Statements rendered under the `## Evidence` heading of the Markdown report.
pub evidence: Vec<String>,
/// Statements rendered under the `## Non-Claims` heading of the Markdown report.
pub non_claims: Vec<String>,
}
impl RocmHipGeluBwdReport {
    /// Renders the report as Markdown text.
    ///
    /// The layout is: a title, a blank line, one `key: value` line per summary
    /// field, a blank line, an `## Evidence` bullet list, a blank line, and a
    /// `## Non-Claims` bullet list. Lines are joined with `\n` and the result
    /// has no trailing newline.
    pub fn to_markdown(&self) -> String {
        // Non-capturing closure, so it is `Copy` and can be reused for both lists.
        let as_bullet = |item: &String| format!("- {item}");

        let preamble = vec![
            "# ROCm/HIP fp16 GELU Backward Pilot".to_string(),
            String::new(),
            format!("backend: {}", ROCM_HIP_GELU_BWD_BACKEND),
            format!("n: {}", self.n),
            format!("max_abs_error: {}", self.max_abs_error),
            format!("within_tolerance: {}", self.within_tolerance),
            format!("kernel_time_ms: {}", self.kernel_time_ms),
            format!(
                "kernel_source_fingerprint: {}",
                self.kernel_source_fingerprint
            ),
            format!("compiler_fingerprint: {}", self.compiler_fingerprint),
            String::new(),
            "## Evidence".to_string(),
        ];

        preamble
            .into_iter()
            .chain(self.evidence.iter().map(as_bullet))
            .chain(vec![String::new(), "## Non-Claims".to_string()])
            .chain(self.non_claims.iter().map(as_bullet))
            .collect::<Vec<String>>()
            .join("\n")
    }
}
/// Runs the fp16 GELU backward HIP pilot and returns the kernel's outputs.
///
/// `grad_output` and `input` hold raw fp16 bit patterns and must each contain
/// exactly `n` elements. The embedded HIP source is written into
/// `target/rocm-hip-cache`, compiled with `/opt/rocm/bin/hipcc` for `gfx1101`,
/// and fed a text payload of the form `"N\n<grad_output bits>\n<input bits>\n"`.
///
/// # Errors
///
/// Returns a backend error when:
/// - either slice length differs from `n`, or `n` is zero;
/// - ROCm/HIP is reported unavailable by `detect_local_rocm_hip`;
/// - the cache directory or source file cannot be written;
/// - compilation, the artifact recheck, or the kernel run fails;
/// - the kernel output has no parsable `RESULTS=` line, or that line does not
///   contain exactly `n` values.
pub fn run_rocm_hip_gelu_bwd(grad_output: &[u16], input: &[u16], n: usize) -> Result<Vec<u16>> {
    if grad_output.len() != n {
        return Err(Error::backend(format!(
            "fp16 GELU bwd grad_output length {} does not match n={}",
            grad_output.len(),
            n
        )));
    }
    if input.len() != n {
        return Err(Error::backend(format!(
            "fp16 GELU bwd input length {} does not match n={}",
            input.len(),
            n
        )));
    }
    if n == 0 {
        return Err(Error::backend("fp16 GELU bwd n must be positive"));
    }
    let device_evidence = detect_local_rocm_hip();
    if !device_evidence.available {
        return Err(Error::backend(
            "ROCm/HIP is unavailable; fp16 GELU bwd pilot remains inadmissible",
        ));
    }

    // Cache artifacts are keyed by the source fingerprint so a changed kernel
    // never reuses a stale executable.
    let source_fingerprint = hip_gelu_bwd_kernel_source_fingerprint();
    let cache_dir = PathBuf::from("target/rocm-hip-cache");
    fs::create_dir_all(&cache_dir)
        .map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
    let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
    let executable_path = cache_dir.join(format!("{source_fingerprint}-gelu-bwd-fp16"));
    fs::write(&source_path, HIP_GELU_BWD_KERNEL)
        .map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;
    let hipcc = "/opt/rocm/bin/hipcc";
    hipcc_compile_executable(hipcc, &source_path, &executable_path, Some("gfx1101"))?;

    // Payload: element count, then one space-separated line per operand.
    let mut payload = String::with_capacity((grad_output.len() + input.len()) * 8);
    payload.push_str(&n.to_string());
    payload.push('\n');
    for operand in [grad_output, input] {
        for (i, bits) in operand.iter().enumerate() {
            if i != 0 {
                payload.push(' ');
            }
            payload.push_str(&bits.to_string());
        }
        payload.push('\n');
    }

    let stdout = run_gelu_bwd_executable(&executable_path, &source_path, &payload)?;
    let outputs = parse_gelu_bwd_results(&stdout)?;
    // An empty or truncated RESULTS line parses successfully, so the element
    // count has to be verified here before the data is handed to the caller.
    if outputs.len() != n {
        return Err(Error::backend(format!(
            "HIP fp16 GELU bwd returned {} outputs, expected n={}",
            outputs.len(),
            n
        )));
    }
    Ok(outputs)
}
/// Runs the fp16 GELU backward HIP pilot and returns a full evidence report.
///
/// Performs the same validation, build, and kernel run as
/// `run_rocm_hip_gelu_bwd`, and additionally records the hipcc fingerprint and
/// build command, parses the kernel's `KERNEL_TIME_MS=` line, and compares
/// every GPU output against `cpu_gelu_bwd_fp16` with a `1e-2` absolute
/// tolerance (both sides widened to `f32`).
///
/// Comparison rules: elements that are equal, or that are NaN on both sides,
/// contribute no error. An element that is NaN on exactly one side forces
/// `max_abs_error` to infinity so the report cannot pass tolerance.
///
/// # Errors
///
/// Returns a backend error when:
/// - either slice length differs from `n`, or `n` is zero;
/// - ROCm/HIP is reported unavailable by `detect_local_rocm_hip`;
/// - the cache directory or source file cannot be written;
/// - fingerprinting, compilation, the artifact recheck, or the kernel run fails;
/// - the kernel output lacks a parsable `RESULTS=` or `KERNEL_TIME_MS=` line,
///   or `RESULTS=` does not contain exactly `n` values.
pub fn run_rocm_hip_gelu_bwd_reported(
    grad_output: &[u16],
    input: &[u16],
    n: usize,
) -> Result<RocmHipGeluBwdReport> {
    if grad_output.len() != n {
        return Err(Error::backend(format!(
            "fp16 GELU bwd grad_output length {} does not match n={}",
            grad_output.len(),
            n
        )));
    }
    if input.len() != n {
        return Err(Error::backend(format!(
            "fp16 GELU bwd input length {} does not match n={}",
            input.len(),
            n
        )));
    }
    if n == 0 {
        return Err(Error::backend("fp16 GELU bwd n must be positive"));
    }
    let device_evidence = detect_local_rocm_hip();
    if !device_evidence.available {
        return Err(Error::backend(
            "ROCm/HIP is unavailable; fp16 GELU bwd pilot remains inadmissible",
        ));
    }

    // Cache artifacts are keyed by the source fingerprint so a changed kernel
    // never reuses a stale executable.
    let source_fingerprint = hip_gelu_bwd_kernel_source_fingerprint();
    let cache_dir = PathBuf::from("target/rocm-hip-cache");
    fs::create_dir_all(&cache_dir)
        .map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
    let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
    let executable_path = cache_dir.join(format!("{source_fingerprint}-gelu-bwd-fp16"));
    fs::write(&source_path, HIP_GELU_BWD_KERNEL)
        .map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;
    let hipcc = "/opt/rocm/bin/hipcc";
    let compiler_fingerprint = hipcc_compiler_fingerprint(hipcc)?;
    let build_command =
        hipcc_compile_executable(hipcc, &source_path, &executable_path, Some("gfx1101"))?;

    // Payload: element count, then one space-separated line per operand.
    let mut payload = String::with_capacity((grad_output.len() + input.len()) * 8);
    payload.push_str(&n.to_string());
    payload.push('\n');
    for operand in [grad_output, input] {
        for (i, bits) in operand.iter().enumerate() {
            if i != 0 {
                payload.push(' ');
            }
            payload.push_str(&bits.to_string());
        }
        payload.push('\n');
    }

    let stdout = run_gelu_bwd_executable(&executable_path, &source_path, &payload)?;
    let outputs = parse_gelu_bwd_results(&stdout)?;
    // Without this check a short or empty RESULTS line would be zipped against
    // the oracle, compare fewer (or zero) elements, and report a passing run.
    if outputs.len() != n {
        return Err(Error::backend(format!(
            "HIP fp16 GELU bwd returned {} outputs, expected n={}",
            outputs.len(),
            n
        )));
    }
    let kernel_time_ms = parse_gelu_bwd_f32_line(&stdout, "KERNEL_TIME_MS=")
        .ok_or_else(|| Error::backend("HIP fp16 GELU bwd did not print KERNEL_TIME_MS marker"))?;

    let cpu_oracle_outputs = cpu_gelu_bwd_fp16(grad_output, input, n);
    let mut max_abs_error = 0.0f32;
    for (gpu_bits, cpu_bits) in outputs.iter().zip(cpu_oracle_outputs.iter()) {
        let gpu = f16_to_f32(*gpu_bits);
        let cpu = f16_to_f32(*cpu_bits);
        // Equal values (including same-signed infinities) and NaN-vs-NaN agree.
        if gpu == cpu || (gpu.is_nan() && cpu.is_nan()) {
            continue;
        }
        let err = (gpu - cpu).abs();
        if err.is_nan() {
            // Exactly one side is NaN: `err > max` would be false and hide the
            // mismatch, so record it as an unbounded error instead.
            max_abs_error = f32::INFINITY;
        } else if err > max_abs_error {
            max_abs_error = err;
        }
    }
    let within_tolerance = max_abs_error < 1e-2;

    Ok(RocmHipGeluBwdReport {
        n,
        outputs,
        cpu_oracle_outputs,
        max_abs_error,
        within_tolerance,
        kernel_time_ms,
        kernel_source_fingerprint: source_fingerprint,
        compiler_fingerprint,
        build_command,
        executable_path: executable_path.display().to_string(),
        device_evidence,
        evidence: vec![
            "compiled HIP kernel with /opt/rocm/bin/hipcc -O2 --offload-arch=gfx1101".to_string(),
            "shipped grad_output and input bits to the kernel via stdin (Stdio::piped)".to_string(),
            // The kernel launches (n + 255) / 256 blocks, i.e. ceil(n/256).
            "launched gelu_bw_fp16_f32_kernel with grid=ceil(n/256) block=(256)".to_string(),
            "captured kernel time with hipEventRecord/hipEventSynchronize".to_string(),
            "compared every output element against the fp64 CPU oracle within 1e-2".to_string(),
        ],
        non_claims: vec![
            "not production speedup evidence".to_string(),
            "not vectorized GELU backward (no half2 loads/stores, no shared memory)".to_string(),
            "not fused with the GELU forward pass".to_string(),
            "not machine-code verification".to_string(),
        ],
    })
}
/// Returns the fingerprint of the embedded HIP GELU backward source, labelled
/// `hip-gelu-bwd-source`. The value is used to name the cached source file and
/// executable.
pub fn hip_gelu_bwd_kernel_source_fingerprint() -> String {
    let label = "hip-gelu-bwd-source";
    let source_text = HIP_GELU_BWD_KERNEL;
    fingerprint(label, source_text)
}
/// Rechecks the compiled artifact and then runs one request through the
/// persistent kernel server.
///
/// `hipcc_recheck_artifact` is called first with the same hipcc path and
/// `gfx1101` target used for the build; any error it returns aborts the run.
/// The payload is then sent via `kernel_server::run_persistent` and its result
/// is returned unchanged.
///
/// NOTE(review): what the recheck validates or rebuilds is defined elsewhere —
/// confirm against `hip_dense`.
fn run_gelu_bwd_executable(
    executable_path: &std::path::Path,
    source_path: &std::path::Path,
    payload: &str,
) -> Result<String> {
    let hipcc = "/opt/rocm/bin/hipcc";
    let offload_arch = Some("gfx1101");
    hipcc_recheck_artifact(hipcc, source_path, executable_path, offload_arch)?;

    kernel_server::run_persistent(GELU_BWD_KERNEL_TYPE, executable_path, payload)
}
/// CPU reference for the tanh-approximation GELU backward pass.
///
/// For each of the first `n` element pairs, both fp16 bit patterns are widened
/// (fp16 -> f32 -> f64) and the gradient is computed in f64:
///
/// ```text
/// inner = c * (x + a * x^3)          c = sqrt(2/pi), a = 0.044715
/// t     = tanh(inner)
/// dydx  = 0.5 * (1 + t) + 0.5 * x * (1 - t^2) * c * (1 + 3 * a * x^2)
/// out   = grad_output * dydx
/// ```
///
/// The product is narrowed to f32 and then converted back to fp16 bits.
///
/// # Panics
///
/// Panics if either slice holds fewer than `n` elements. Debug builds also
/// assert that both lengths equal `n`.
pub fn cpu_gelu_bwd_fp16(grad_output: &[u16], input: &[u16], n: usize) -> Vec<u16> {
    debug_assert_eq!(grad_output.len(), n);
    debug_assert_eq!(input.len(), n);

    let c = (2.0_f64 / std::f64::consts::PI).sqrt();
    let a = 0.044715_f64;

    grad_output[..n]
        .iter()
        .zip(input[..n].iter())
        .map(|(&go_bits, &x_bits)| {
            let go = f64::from(f16_to_f32(go_bits));
            let x = f64::from(f16_to_f32(x_bits));
            let t = (c * (x + a * x * x * x)).tanh();
            let one_minus_t2 = 1.0 - t * t;
            let poly = 1.0 + 3.0 * a * x * x;
            let dydx = 0.5 * (1.0 + t) + 0.5 * x * one_minus_t2 * c * poly;
            f32_to_f16((go * dydx) as f32)
        })
        .collect()
}
fn parse_gelu_bwd_results(stdout: &str) -> Result<Vec<u16>> {
let line = stdout
.lines()
.find_map(|line| line.strip_prefix("RESULTS="))
.ok_or_else(|| Error::backend("HIP fp16 GELU bwd did not print RESULTS marker"))?;
if line.trim().is_empty() {
return Ok(Vec::new());
}
line.split_whitespace()
.map(|value| {
value.trim().parse::<u16>().map_err(|err| {
Error::backend(format!(
"invalid HIP fp16 GELU bwd output value {value:?}: {err}"
))
})
})
.collect()
}
/// Finds the first line of `stdout` that starts with `prefix` and parses the
/// trimmed remainder as an `f32`.
///
/// Returns `None` when no line has the prefix, or when the first matching
/// line does not parse; later matching lines are not tried.
fn parse_gelu_bwd_f32_line(stdout: &str, prefix: &str) -> Option<f32> {
    for line in stdout.lines() {
        if let Some(rest) = line.strip_prefix(prefix) {
            return rest.trim().parse::<f32>().ok();
        }
    }
    None
}
/// Builds a `"{label}-{hash}"` identifier, where `hash` is the 16-digit
/// lowercase hex of a `DefaultHasher` fed `label` and then `value`.
///
/// `DefaultHasher`'s algorithm is not guaranteed to stay the same across Rust
/// releases, so the result is only suitable as a local cache key.
fn fingerprint(label: &str, value: &str) -> String {
    let mut state = DefaultHasher::new();
    for part in [label, value] {
        part.hash(&mut state);
    }
    let digest = state.finish();
    format!("{label}-{:016x}", digest)
}