use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::process::Command;
use std::thread::sleep;
use std::time::Duration;
use crate::backend::BackendCapabilities;
use crate::backend::gpu::{
GpuExecutionContract, GpuKernelRegistryEntry, GpuSynchronizationModel, GpuTransferLifecycle,
};
use crate::backend::hardware::{DeviceKind, HardwareTarget, MemorySpace};
use crate::backend::rocm::{RocmHipCapabilityReport, detect_local_rocm_hip};
use crate::object::Representation;
use crate::op::{LoweringCapability, LoweringEvidenceKind, LoweringRule, OperatorRegistry};
use crate::{Error, Result};
pub const ROCM_HIP_DENSE_I32_BACKEND: &str = "rocm_hip_dense_i32_pilot";
pub const ROCM_HIP_DENSE_I32_LOWERING_ID: &str = "hip.add.dense_i32";
pub const HIP_DENSE_I32_ADD_KERNEL: &str = r#"
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
__global__ void add_i32_kernel(const int32_t* lhs, const int32_t* rhs, int32_t* out, std::size_t n) {
std::size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < n) {
out[idx] = lhs[idx] + rhs[idx];
}
}
static void check(hipError_t status, const char* label) {
if (status != hipSuccess) {
std::cerr << "HIP_ERROR " << label << "=" << hipGetErrorString(status) << "\n";
std::exit(10);
}
}
int main(int argc, char** argv) {
if (argc < 2) {
std::cerr << "usage: rocm_dense_i32_add N LHS... RHS...\n";
return 2;
}
std::size_t n = static_cast<std::size_t>(std::stoul(argv[1]));
if (argc != static_cast<int>(2 + 2 * n)) {
std::cerr << "argument count does not match N\n";
return 3;
}
int device = 0;
check(hipSetDevice(device), "hipSetDevice");
hipDeviceProp_t props;
check(hipGetDeviceProperties(&props, device), "hipGetDeviceProperties");
std::vector<int32_t> lhs(n);
std::vector<int32_t> rhs(n);
std::vector<int32_t> out(n);
for (std::size_t i = 0; i < n; ++i) {
lhs[i] = static_cast<int32_t>(std::stol(argv[2 + i]));
rhs[i] = static_cast<int32_t>(std::stol(argv[2 + n + i]));
}
int32_t* d_lhs = nullptr;
int32_t* d_rhs = nullptr;
int32_t* d_out = nullptr;
std::size_t bytes = n * sizeof(int32_t);
check(hipMalloc(&d_lhs, bytes), "hipMalloc(lhs)");
check(hipMalloc(&d_rhs, bytes), "hipMalloc(rhs)");
check(hipMalloc(&d_out, bytes), "hipMalloc(out)");
check(hipMemcpy(d_lhs, lhs.data(), bytes, hipMemcpyHostToDevice), "hipMemcpy(lhs)");
check(hipMemcpy(d_rhs, rhs.data(), bytes, hipMemcpyHostToDevice), "hipMemcpy(rhs)");
int block = 256;
int grid = static_cast<int>((n + block - 1) / block);
hipLaunchKernelGGL(add_i32_kernel, dim3(grid), dim3(block), 0, 0, d_lhs, d_rhs, d_out, n);
check(hipGetLastError(), "hipLaunchKernelGGL");
check(hipDeviceSynchronize(), "hipDeviceSynchronize");
check(hipMemcpy(out.data(), d_out, bytes, hipMemcpyDeviceToHost), "hipMemcpy(out)");
check(hipFree(d_lhs), "hipFree(lhs)");
check(hipFree(d_rhs), "hipFree(rhs)");
check(hipFree(d_out), "hipFree(out)");
std::cout << "DEVICE_NAME=" << props.name << "\n";
std::cout << "GFX=" << props.gcnArchName << "\n";
std::cout << "N=" << n << "\n";
std::cout << "GRID=" << grid << "\n";
std::cout << "BLOCK=" << block << "\n";
std::cout << "RESULTS=";
for (std::size_t i = 0; i < out.size(); ++i) {
if (i != 0) {
std::cout << ",";
}
std::cout << out[i];
}
std::cout << "\n";
return 0;
}
"#;
pub fn rocm_hip_dense_i32_execution_contract() -> GpuExecutionContract {
GpuExecutionContract {
backend: ROCM_HIP_DENSE_I32_BACKEND.to_string(),
target: HardwareTarget {
id: ROCM_HIP_DENSE_I32_BACKEND.to_string(),
kind: DeviceKind::Gpu,
memory_space: MemorySpace::Device,
},
scope: "feature-gated ROCm/HIP dense i32 add".to_string(),
real_device_execution: true,
lifecycle: GpuTransferLifecycle {
allocates_device_memory: true,
host_to_device_copy: true,
device_to_host_copy: true,
synchronization: GpuSynchronizationModel::StreamSynchronized {
stream: "default HIP stream with hipDeviceSynchronize".to_string(),
},
cpu_oracle_verification: true,
},
kernels: vec![GpuKernelRegistryEntry {
op_name: "add".to_string(),
kernel_symbol: "add_i32_kernel".to_string(),
scalar_type: "i32".to_string(),
supported_domain: "integer".to_string(),
supported_representation: Representation::dense_cpu().id().0,
source_fingerprint: hip_dense_i32_kernel_source_fingerprint(),
}],
evidence: vec![
"kernel allocates d_lhs, d_rhs, and d_out with hipMalloc".to_string(),
"kernel copies lhs/rhs host buffers to device with hipMemcpyHostToDevice".to_string(),
"kernel launches add_i32_kernel and synchronizes with hipDeviceSynchronize".to_string(),
"kernel copies output to host with hipMemcpyDeviceToHost and compares with CPU oracle"
.to_string(),
],
non_claims: vec![
"not generic GPU execution".to_string(),
"not portable ROCm support".to_string(),
"not production speedup evidence".to_string(),
"not machine-code verification".to_string(),
],
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RocmHipDenseI32AddReport {
pub backend: String,
pub op_name: String,
pub scalar_type: String,
pub len: usize,
pub outputs: Vec<i32>,
pub cpu_oracle_outputs: Vec<i32>,
pub cpu_oracle_matches: bool,
pub kernel_source_fingerprint: String,
pub compiler_fingerprint: String,
pub build_command: String,
pub executable_path: String,
pub launch_grid: u32,
pub launch_block: u32,
pub device_evidence: RocmHipCapabilityReport,
pub evidence: Vec<String>,
pub non_claims: Vec<String>,
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RocmHipDenseI32LoweringContract {
pub required_gfx: String,
pub device_capability_fingerprint: String,
pub kernel_source_fingerprint: String,
pub compiler_fingerprint: String,
}
impl RocmHipDenseI32LoweringContract {
pub fn from_report(
report: &RocmHipCapabilityReport,
compiler_fingerprint: impl Into<String>,
) -> Result<Self> {
let selected = report.selected_device.as_ref().ok_or_else(|| {
Error::backend("ROCm/HIP lowering admission requires a selected device")
})?;
Ok(Self {
required_gfx: selected.gfx.clone(),
device_capability_fingerprint: report.capability_fingerprint.clone(),
kernel_source_fingerprint: hip_dense_i32_kernel_source_fingerprint(),
compiler_fingerprint: compiler_fingerprint.into(),
})
}
pub fn lowering_rule(&self) -> LoweringRule {
LoweringRule::new(
ROCM_HIP_DENSE_I32_LOWERING_ID,
"add",
ROCM_HIP_DENSE_I32_BACKEND,
vec![Representation::dense_cpu().id().0],
)
.with_supported_domain("integer")
.with_capability(LoweringCapability::rocm_hip_dense_i32(
self.required_gfx.clone(),
self.device_capability_fingerprint.clone(),
self.kernel_source_fingerprint.clone(),
self.compiler_fingerprint.clone(),
))
.with_required_evidence(
LoweringEvidenceKind::ExactnessPreserved,
"HIP dense i32 add preserves integer elementwise addition only after CPU oracle comparison",
)
.with_required_evidence(
LoweringEvidenceKind::MetadataPreserved,
"HIP dense i32 add preserves dense tensor shape and host-visible output metadata",
)
.with_obligation(
"HIP add operands must be dense i32 tensors with identical shape",
"the kernel indexes lhs, rhs, and output buffers by the same flat element id",
)
.with_obligation(
"HIP output must be copied back and compared against CpuScalarBackend semantics",
"the CPU oracle remains the semantic authority for the feature-gated HIP pilot",
)
}
}
impl RocmHipDenseI32AddReport {
pub fn to_markdown(&self) -> String {
let mut lines = vec![
"# ROCm/HIP Dense i32 Add Pilot".to_string(),
String::new(),
format!("backend: {}", self.backend),
format!("op: {}", self.op_name),
format!("scalar_type: {}", self.scalar_type),
format!("len: {}", self.len),
format!("cpu_oracle_matches: {}", self.cpu_oracle_matches),
format!(
"kernel_source_fingerprint: {}",
self.kernel_source_fingerprint
),
format!("compiler_fingerprint: {}", self.compiler_fingerprint),
format!("launch_grid: {}", self.launch_grid),
format!("launch_block: {}", self.launch_block),
String::new(),
"## Evidence".to_string(),
];
for item in &self.evidence {
lines.push(format!("- {item}"));
}
lines.push(String::new());
lines.push("## Non-Claims".to_string());
for item in &self.non_claims {
lines.push(format!("- {item}"));
}
lines.join("\n")
}
}
pub fn run_rocm_hip_dense_i32_add(lhs: &[i32], rhs: &[i32]) -> Result<RocmHipDenseI32AddReport> {
if lhs.is_empty() {
return Err(Error::backend(
"HIP dense i32 add requires a non-empty input",
));
}
if lhs.len() != rhs.len() {
return Err(Error::backend(format!(
"HIP dense i32 add shape mismatch lhs={} rhs={}",
lhs.len(),
rhs.len()
)));
}
let cpu_oracle_outputs = lhs
.iter()
.zip(rhs)
.map(|(left, right)| left + right)
.collect::<Vec<_>>();
let device_evidence = detect_local_rocm_hip();
if !device_evidence.available {
return Err(Error::backend(
"ROCm/HIP is unavailable; dense i32 HIP pilot remains inadmissible",
));
}
let source_fingerprint = hip_dense_i32_kernel_source_fingerprint();
let cache_dir = PathBuf::from("target/rocm-hip-cache");
fs::create_dir_all(&cache_dir)
.map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
let executable_path = cache_dir.join(format!("{source_fingerprint}-dense-i32-add"));
fs::write(&source_path, HIP_DENSE_I32_ADD_KERNEL)
.map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;
let hipcc = "/opt/rocm/bin/hipcc";
let compiler_fingerprint = hipcc_compiler_fingerprint(hipcc)?;
let build_command = hipcc_compile_executable(hipcc, &source_path, &executable_path, None)?;
let mut args = vec![lhs.len().to_string()];
args.extend(lhs.iter().map(i32::to_string));
args.extend(rhs.iter().map(i32::to_string));
hipcc_recheck_artifact(hipcc, &source_path, &executable_path, None)?;
let run = Command::new(&executable_path)
.args(args)
.output()
.map_err(|err| Error::backend(format!("failed to run HIP dense i32 pilot: {err}")))?;
if !run.status.success() {
return Err(Error::backend(format!(
"HIP dense i32 pilot failed: {}{}",
String::from_utf8_lossy(&run.stderr),
String::from_utf8_lossy(&run.stdout)
)));
}
let stdout = String::from_utf8_lossy(&run.stdout);
let outputs = parse_results(&stdout)?;
let launch_grid = parse_u32_line(&stdout, "GRID=").unwrap_or(0);
let launch_block = parse_u32_line(&stdout, "BLOCK=").unwrap_or(0);
let cpu_oracle_matches = outputs == cpu_oracle_outputs;
if !cpu_oracle_matches {
return Err(Error::backend(format!(
"HIP dense i32 pilot failed CPU oracle comparison hip={outputs:?} cpu={cpu_oracle_outputs:?}"
)));
}
Ok(RocmHipDenseI32AddReport {
backend: ROCM_HIP_DENSE_I32_BACKEND.to_string(),
op_name: "add".to_string(),
scalar_type: "i32".to_string(),
len: lhs.len(),
outputs,
cpu_oracle_outputs,
cpu_oracle_matches,
kernel_source_fingerprint: source_fingerprint,
compiler_fingerprint,
build_command,
executable_path: executable_path.display().to_string(),
launch_grid,
launch_block,
device_evidence,
evidence: vec![
"compiled HIP kernel with /opt/rocm/bin/hipcc".to_string(),
"executed add_i32_kernel on selected ROCm device".to_string(),
"copied output back to host and compared every element with CPU oracle".to_string(),
],
non_claims: vec![
"not broad GPU support".to_string(),
"not p-adic or finite-site sheaf acceleration".to_string(),
"not production performance evidence".to_string(),
"not machine-code verification".to_string(),
],
})
}
pub fn hip_dense_i32_kernel_source_fingerprint() -> String {
fingerprint("hip-source", HIP_DENSE_I32_ADD_KERNEL)
}
pub fn hipcc_compiler_fingerprint(hipcc: &str) -> Result<String> {
let output = Command::new(hipcc)
.arg("--version")
.output()
.map_err(|err| Error::backend(format!("failed to invoke hipcc --version: {err}")))?;
if !output.status.success() {
return Err(Error::backend(format!(
"hipcc --version failed: {}{}",
String::from_utf8_lossy(&output.stderr),
String::from_utf8_lossy(&output.stdout)
)));
}
Ok(fingerprint(
"hipcc",
&String::from_utf8_lossy(&output.stdout),
))
}
pub const HIPCC_BUILD_MAX_ATTEMPTS: u32 = 3;
pub const HIPCC_BUILD_RETRY_BACKOFF_MS: u64 = 50;
pub fn hipcc_artifact_is_present(path: &Path) -> bool {
match fs::metadata(path) {
Ok(meta) => meta.is_file() && meta.len() > 0,
Err(_) => false,
}
}
pub fn hipcc_recheck_artifact(
hipcc: &str,
source_path: &Path,
executable_path: &Path,
offload_arch: Option<&str>,
) -> Result<()> {
if !hipcc_artifact_is_present(executable_path) {
hipcc_compile_executable(hipcc, source_path, executable_path, offload_arch)?;
}
Ok(())
}
pub fn hipcc_compile_executable(
hipcc: &str,
source_path: &Path,
executable_path: &Path,
offload_arch: Option<&str>,
) -> Result<String> {
if hipcc_artifact_is_present(executable_path) {
return Ok(format!(
"{} -O2 {} {} -o {} (cached)",
hipcc,
offload_arch
.map(|a| format!("--offload-arch={a}"))
.unwrap_or_default(),
source_path.display(),
executable_path.display(),
));
}
let mut last_err: Option<String> = None;
for attempt in 0..HIPCC_BUILD_MAX_ATTEMPTS {
let mut cmd = Command::new(hipcc);
cmd.arg("-O2");
if let Some(arch) = offload_arch {
cmd.arg(format!("--offload-arch={arch}"));
}
cmd.arg(source_path).arg("-o").arg(executable_path);
let build = match cmd.output() {
Ok(out) => out,
Err(err) => {
last_err = Some(format!("failed to invoke hipcc: {err}"));
sleep(Duration::from_millis(
HIPCC_BUILD_RETRY_BACKOFF_MS * (attempt as u64 + 1),
));
continue;
}
};
if !build.status.success() {
return Err(Error::backend(format!(
"hipcc failed: {}{}",
String::from_utf8_lossy(&build.stderr),
String::from_utf8_lossy(&build.stdout)
)));
}
match fs::metadata(executable_path) {
Ok(meta) if meta.is_file() && meta.len() > 0 => {
return Ok(format!(
"{} -O2 {} {} -o {}",
hipcc,
offload_arch
.map(|a| format!("--offload-arch={a}"))
.unwrap_or_default(),
source_path.display(),
executable_path.display(),
));
}
Ok(meta) => {
last_err = Some(format!(
"hipcc exited 0 but executable {} is empty (size={})",
executable_path.display(),
meta.len()
));
}
Err(err) => {
last_err = Some(format!(
"hipcc exited 0 but executable {} is missing: {err}",
executable_path.display()
));
}
}
sleep(Duration::from_millis(
HIPCC_BUILD_RETRY_BACKOFF_MS * (attempt as u64 + 1),
));
}
Err(Error::backend(format!(
"hipcc build did not produce a usable executable at {} after {} attempts: {}",
executable_path.display(),
HIPCC_BUILD_MAX_ATTEMPTS,
last_err.unwrap_or_else(|| "no diagnostic".to_string())
)))
}
pub fn rocm_hip_dense_i32_backend_capabilities(
contract: &RocmHipDenseI32LoweringContract,
) -> BackendCapabilities {
BackendCapabilities {
name: ROCM_HIP_DENSE_I32_BACKEND.to_string(),
exact: true,
deterministic: true,
supported_representations: vec![Representation::dense_cpu().id().0],
supported_domains: vec![
"integer".to_string(),
"rocm:hip".to_string(),
format!("gfx:{}", contract.required_gfx),
format!(
"device_capability:{}",
contract.device_capability_fingerprint
),
format!("hip_kernel_source:{}", contract.kernel_source_fingerprint),
format!("hip_compiler:{}", contract.compiler_fingerprint),
"cpu_oracle:required".to_string(),
],
semantic_degradations: vec![
"feature_gated:rocm-hip".to_string(),
"scalar:i32_only".to_string(),
"transfer_obligation:host_to_device_lhs".to_string(),
"transfer_obligation:host_to_device_rhs".to_string(),
"transfer_obligation:device_to_host_output".to_string(),
"unsupported:padic:fixed_precision".to_string(),
"unsupported:sheaf:finite_site".to_string(),
],
}
}
pub fn register_rocm_hip_dense_i32_lowering(
registry: &mut OperatorRegistry,
contract: &RocmHipDenseI32LoweringContract,
) -> Result<()> {
registry.register_lowering(contract.lowering_rule())
}
fn parse_results(stdout: &str) -> Result<Vec<i32>> {
let line = stdout
.lines()
.find_map(|line| line.strip_prefix("RESULTS="))
.ok_or_else(|| Error::backend("HIP dense i32 pilot did not print RESULTS"))?;
if line.trim().is_empty() {
return Ok(Vec::new());
}
line.split(',')
.map(|value| {
value
.trim()
.parse::<i32>()
.map_err(|err| Error::backend(format!("invalid HIP output value {value}: {err}")))
})
.collect()
}
fn parse_u32_line(stdout: &str, prefix: &str) -> Option<u32> {
stdout
.lines()
.find_map(|line| line.strip_prefix(prefix))
.and_then(|value| value.trim().parse::<u32>().ok())
}
fn fingerprint(label: &str, value: &str) -> String {
let mut hasher = DefaultHasher::new();
label.hash(&mut hasher);
value.hash(&mut hasher);
format!("{label}-{:016x}", hasher.finish())
}