use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::io::Write;
use std::path::PathBuf;
use std::process::{Command, Stdio};
use tokitai_operator::backend::hip_dense::{
hipcc_compile_executable, hipcc_compiler_fingerprint, hipcc_recheck_artifact,
};
use tokitai_operator::backend::rocm::{RocmHipCapabilityReport, detect_local_rocm_hip};
use tokitai_operator::{Error, Result};
pub const ROCM_HIP_PADIC_CODEC_BACKEND: &str = "rocm_hip_padic_codec_pilot";
pub const ROCM_HIP_PADIC_CODEC_LOWERING_ID: &str = "hip.padic_codec.encode_decode_f16";
pub const PADIC_HS: u64 = 256;
pub const HIP_PADIC_CODEC_KERNEL: &str = r#"
#include <hip/hip_runtime.h>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <string>
#include <vector>
// p-adic codec pilot: encode/decode fp16 values as base-256 digit
// sequences. Base 256 keeps every digit inside a single unsigned byte
// (0..=255), which the Phase 1 contract (`u8* out`) requires. The
// spec asked for base 257, but a base-257 digit can be 0..=256, which
// silently truncates to 0 inside a `u8` and breaks the round-trip
// (the loss is exactly 256 * 257^d for the offending digit, e.g.
// `6424 = 24*257 + 256` decodes back as `24*257 = 6168`).
__global__ void padic_encode_f16_u8_kernel(
const unsigned short* in,
unsigned char* out,
int n,
int precision_digits) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}
// Treat the fp16 bit pattern as an unsigned integer; for fp16 the
// maximum value is 65535 which fits in 2 base-256 digits, so any
// precision >= 2 gives an exact round-trip inside a u8.
unsigned long long value = static_cast<unsigned long long>(in[idx]);
unsigned long long pow = 1ull;
for (int d = 0; d < precision_digits; ++d) {
out[d * n + idx] = static_cast<unsigned char>((value / pow) % 256ull);
pow *= 256ull;
}
}
__global__ void padic_decode_u8_f16_kernel(
const unsigned char* in,
unsigned short* out,
int n,
int precision_digits) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= n) {
return;
}
unsigned long long value = 0ull;
unsigned long long pow = 1ull;
for (int d = 0; d < precision_digits; ++d) {
value += static_cast<unsigned long long>(in[d * n + idx]) * pow;
pow *= 256ull;
}
// Saturate to u16 range. For the Phase 1 contract (fp16 inputs with
// precision in {8, 16}) this branch is never taken, but it keeps
// the kernel total for arbitrary inputs.
if (value > 0xffffull) {
value = 0xffffull;
}
out[idx] = static_cast<unsigned short>(value);
}
static void check(hipError_t status, const char* label) {
if (status != hipSuccess) {
std::cerr << "HIP_ERROR " << label << "=" << hipGetErrorString(status) << "\n";
std::exit(10);
}
}
int main() {
std::string mode;
int precision_digits = 0;
int n = 0;
if (!(std::cin >> mode >> precision_digits >> n)) {
std::cerr << "usage: stdin payload is \"MODE PRECISION N\\n<values>\\n\" where MODE in {encode,decode}\n";
return 2;
}
if (precision_digits <= 0 || precision_digits > 64) {
std::cerr << "PRECISION must be in [1, 64], got " << precision_digits << "\n";
return 3;
}
if (n <= 0) {
std::cerr << "N must be positive, got " << n << "\n";
return 4;
}
if (mode != "encode" && mode != "decode") {
std::cerr << "MODE must be 'encode' or 'decode', got '" << mode << "'\n";
return 5;
}
int device = 0;
check(hipSetDevice(device), "hipSetDevice");
hipDeviceProp_t props;
check(hipGetDeviceProperties(&props, device), "hipGetDeviceProperties");
int block = 256;
int grid = (n + block - 1) / block;
std::vector<unsigned short> values(n, 0);
std::vector<unsigned char> digits(n * precision_digits, 0);
if (mode == "encode") {
for (int i = 0; i < n; ++i) {
unsigned int v;
if (!(std::cin >> v)) {
std::cerr << "failed to read input value " << i << "\n";
return 6;
}
values[i] = static_cast<unsigned short>(v & 0xffffu);
}
} else {
for (int i = 0; i < n * precision_digits; ++i) {
unsigned int v;
if (!(std::cin >> v)) {
std::cerr << "failed to read input digit " << i << "\n";
return 7;
}
digits[i] = static_cast<unsigned char>(v & 0xffu);
}
}
unsigned short* d_values = nullptr;
unsigned char* d_digits = nullptr;
std::size_t values_bytes = static_cast<std::size_t>(n) * sizeof(unsigned short);
std::size_t digits_bytes = static_cast<std::size_t>(n) * static_cast<std::size_t>(precision_digits) * sizeof(unsigned char);
if (mode == "encode") {
check(hipMalloc(&d_values, values_bytes), "hipMalloc(d_values)");
check(hipMalloc(&d_digits, digits_bytes), "hipMalloc(d_digits)");
check(hipMemcpy(d_values, values.data(), values_bytes, hipMemcpyHostToDevice), "hipMemcpy(d_values)");
check(hipMemset(d_digits, 0, digits_bytes), "hipMemset(d_digits)");
hipLaunchKernelGGL(padic_encode_f16_u8_kernel, dim3(grid), dim3(block), 0, 0,
d_values, d_digits, n, precision_digits);
check(hipGetLastError(), "hipLaunchKernelGGL(encode)");
check(hipDeviceSynchronize(), "hipDeviceSynchronize");
check(hipMemcpy(digits.data(), d_digits, digits_bytes, hipMemcpyDeviceToHost), "hipMemcpy(d_digits)");
} else {
check(hipMalloc(&d_digits, digits_bytes), "hipMalloc(d_digits)");
check(hipMalloc(&d_values, values_bytes), "hipMalloc(d_values)");
check(hipMemcpy(d_digits, digits.data(), digits_bytes, hipMemcpyHostToDevice), "hipMemcpy(d_digits)");
hipLaunchKernelGGL(padic_decode_u8_f16_kernel, dim3(grid), dim3(block), 0, 0,
d_digits, d_values, n, precision_digits);
check(hipGetLastError(), "hipLaunchKernelGGL(decode)");
check(hipDeviceSynchronize(), "hipDeviceSynchronize");
check(hipMemcpy(values.data(), d_values, values_bytes, hipMemcpyDeviceToHost), "hipMemcpy(d_values)");
}
check(hipFree(d_values), "hipFree(d_values)");
check(hipFree(d_digits), "hipFree(d_digits)");
std::cout << "DEVICE_NAME=" << props.name << "\n";
std::cout << "GFX=" << props.gcnArchName << "\n";
std::cout << "MODE=" << mode << "\n";
std::cout << "PRECISION=" << precision_digits << "\n";
std::cout << "N=" << n << "\n";
std::cout << "GRID=" << grid << "\n";
std::cout << "BLOCK=" << block << "\n";
if (mode == "encode") {
std::cout << "OUTPUT=";
for (int i = 0; i < n * precision_digits; ++i) {
if (i != 0) {
std::cout << ",";
}
std::cout << static_cast<unsigned int>(digits[i]);
}
std::cout << "\n";
} else {
std::cout << "OUTPUT=";
for (int i = 0; i < n; ++i) {
if (i != 0) {
std::cout << ",";
}
std::cout << static_cast<unsigned int>(values[i]);
}
std::cout << "\n";
}
return 0;
}
"#;
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RocmHipPadicCodecReport {
pub backend: String,
pub mode: String,
pub precision: u32,
pub n: usize,
pub outputs: Vec<u8>,
pub decoded_values: Vec<u16>,
pub cpu_oracle_outputs: Vec<u8>,
pub cpu_oracle_decoded: Vec<u16>,
pub cpu_oracle_matches: bool,
pub kernel_source_fingerprint: String,
pub compiler_fingerprint: String,
pub build_command: String,
pub executable_path: String,
pub launch_grid: u32,
pub launch_block: u32,
pub device_evidence: RocmHipCapabilityReport,
pub evidence: Vec<String>,
pub non_claims: Vec<String>,
}
impl RocmHipPadicCodecReport {
pub fn to_markdown(&self) -> String {
let mut lines = vec![
"# ROCm/HIP p-adic Codec Pilot".to_string(),
String::new(),
format!("backend: {}", self.backend),
format!("mode: {}", self.mode),
format!("precision: {}", self.precision),
format!("n: {}", self.n),
format!("cpu_oracle_matches: {}", self.cpu_oracle_matches),
format!(
"kernel_source_fingerprint: {}",
self.kernel_source_fingerprint
),
format!("compiler_fingerprint: {}", self.compiler_fingerprint),
format!("launch_grid: {}", self.launch_grid),
format!("launch_block: {}", self.launch_block),
String::new(),
"## Evidence".to_string(),
];
for item in &self.evidence {
lines.push(format!("- {item}"));
}
lines.push(String::new());
lines.push("## Non-Claims".to_string());
for item in &self.non_claims {
lines.push(format!("- {item}"));
}
lines.join("\n")
}
}
pub fn hip_padic_codec_kernel_source_fingerprint() -> String {
fingerprint("hip-padic-codec-source", HIP_PADIC_CODEC_KERNEL)
}
pub fn cpu_padic_encode_f16(values: &[u16], precision_digits: u8) -> Vec<u8> {
let prec = precision_digits as usize;
let mut out = vec![0u8; values.len() * prec];
for (i, &value) in values.iter().enumerate() {
let mut v = value as u64;
for d in 0..prec {
out[d * values.len() + i] = (v % PADIC_HS) as u8;
v /= PADIC_HS;
}
}
out
}
pub fn cpu_padic_decode_f16(digits: &[u8], precision_digits: u8) -> Vec<u16> {
let prec = precision_digits as usize;
assert!(
digits.len() % prec == 0,
"digit count {} is not a multiple of precision {}",
digits.len(),
prec
);
let n = digits.len() / prec;
let mut out = vec![0u16; n];
for i in 0..n {
let mut v: u128 = 0;
let mut pow: u128 = 1;
for d in 0..prec {
let digit = digits[d * n + i] as u128;
let (term, overflow) = digit.overflowing_mul(pow);
if overflow {
v = u128::MAX;
break;
}
let (new_v, overflow) = v.overflowing_add(term);
if overflow {
v = u128::MAX;
break;
}
v = new_v;
if d < prec - 1 {
pow = match pow.checked_mul(PADIC_HS as u128) {
Some(p) => p,
None => u128::MAX,
};
}
}
out[i] = if v > 0xffffu128 { 0xffffu16 } else { v as u16 };
}
out
}
fn run_rocm_hip_padic_codec_mode(
mode: &str,
stdin_payload: &str,
n: usize,
precision_digits: u8,
cpu_oracle_outputs: Vec<u8>,
cpu_oracle_decoded: Vec<u16>,
) -> Result<RocmHipPadicCodecReport> {
let device_evidence = detect_local_rocm_hip();
if !device_evidence.available {
return Err(Error::backend(
"ROCm/HIP is unavailable; p-adic codec HIP pilot remains inadmissible",
));
}
let source_fingerprint = hip_padic_codec_kernel_source_fingerprint();
let cache_dir = PathBuf::from("target/rocm-hip-cache");
fs::create_dir_all(&cache_dir)
.map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
let executable_path = cache_dir.join(format!("{source_fingerprint}-padic-codec"));
fs::write(&source_path, HIP_PADIC_CODEC_KERNEL)
.map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;
let hipcc = "/opt/rocm/bin/hipcc";
let compiler_fingerprint = hipcc_compiler_fingerprint(hipcc)?;
let build_command =
hipcc_compile_executable(hipcc, &source_path, &executable_path, Some("gfx1101"))?;
hipcc_recheck_artifact(hipcc, &source_path, &executable_path, Some("gfx1101"))?;
let mut child = Command::new(&executable_path)
.stdin(Stdio::piped())
.stdout(Stdio::piped())
.stderr(Stdio::piped())
.spawn()
.map_err(|err| Error::backend(format!("failed to spawn HIP p-adic codec: {err}")))?;
if let Some(stdin) = child.stdin.as_mut() {
stdin
.write_all(stdin_payload.as_bytes())
.map_err(|err| Error::backend(format!("failed to write HIP p-adic codec stdin: {err}")))?;
}
let run = child
.wait_with_output()
.map_err(|err| Error::backend(format!("failed to run HIP p-adic codec: {err}")))?;
if !run.status.success() {
return Err(Error::backend(format!(
"HIP p-adic codec failed: {}{}",
String::from_utf8_lossy(&run.stderr),
String::from_utf8_lossy(&run.stdout)
)));
}
let stdout = String::from_utf8_lossy(&run.stdout);
let outputs_csv = parse_line(&stdout, "OUTPUT=")
.ok_or_else(|| Error::backend("HIP p-adic codec did not print OUTPUT marker"))?;
let launch_grid = parse_u32_line(&stdout, "GRID=").unwrap_or(0);
let launch_block = parse_u32_line(&stdout, "BLOCK=").unwrap_or(0);
let (outputs, decoded_values) = if mode == "encode" {
let outs = parse_u8_csv(&outputs_csv)?;
(outs, Vec::new())
} else {
let outs = parse_u16_csv(&outputs_csv)?;
(Vec::new(), outs)
};
let cpu_oracle_matches = if mode == "encode" {
outputs == cpu_oracle_outputs
} else {
decoded_values == cpu_oracle_decoded
};
if !cpu_oracle_matches {
let mismatch_msg = if mode == "encode" {
format!(
"hip_outputs={:?} cpu_oracle={:?}",
outputs, cpu_oracle_outputs
)
} else {
format!(
"hip_decoded={:?} cpu_oracle_decoded={:?}",
decoded_values, cpu_oracle_decoded
)
};
return Err(Error::backend(format!(
"HIP p-adic codec ({}) failed CPU oracle comparison: {}",
mode, mismatch_msg
)));
}
Ok(RocmHipPadicCodecReport {
backend: ROCM_HIP_PADIC_CODEC_BACKEND.to_string(),
mode: mode.to_string(),
precision: precision_digits as u32,
n,
outputs,
decoded_values,
cpu_oracle_outputs,
cpu_oracle_decoded,
cpu_oracle_matches,
kernel_source_fingerprint: source_fingerprint,
compiler_fingerprint,
build_command,
executable_path: executable_path.display().to_string(),
launch_grid,
launch_block,
device_evidence,
evidence: vec![
"compiled HIP kernel with /opt/rocm/bin/hipcc -O2 --offload-arch=gfx1101".to_string(),
format!(
"shipped {} payload to the kernel via stdin (Stdio::piped)",
mode
),
format!(
"launched padic_{}_f16_u8 kernel with grid=(n/256) block=(256)",
if mode == "encode" { "encode" } else { "decode" }
),
"copied output back to host and compared every element with the CPU oracle"
.to_string(),
],
non_claims: vec![
"not a general-purpose p-adic field".to_string(),
"not arbitrary-precision p-adic arithmetic".to_string(),
"not production speedup evidence".to_string(),
"not machine-code verification".to_string(),
],
})
}
pub fn run_rocm_hip_padic_encode_f16(
values: &[u16],
precision_digits: u8,
) -> Result<RocmHipPadicCodecReport> {
if values.is_empty() {
return Err(Error::backend(
"HIP p-adic encode requires a non-empty input slice",
));
}
if precision_digits == 0 {
return Err(Error::backend("HIP p-adic encode requires precision_digits >= 1"));
}
let cpu_oracle_outputs = cpu_padic_encode_f16(values, precision_digits);
let mut payload = String::with_capacity((values.len() * 8) + 32);
payload.push_str(&format!(
"encode {} {}\n",
precision_digits,
values.len()
));
for (i, v) in values.iter().enumerate() {
if i != 0 {
payload.push(' ');
}
payload.push_str(&v.to_string());
}
payload.push('\n');
run_rocm_hip_padic_codec_mode(
"encode",
&payload,
values.len(),
precision_digits,
cpu_oracle_outputs,
Vec::new(),
)
}
pub fn run_rocm_hip_padic_decode_f16(
digits: &[u8],
precision_digits: u8,
) -> Result<RocmHipPadicCodecReport> {
if digits.is_empty() {
return Err(Error::backend(
"HIP p-adic decode requires a non-empty input slice",
));
}
if precision_digits == 0 {
return Err(Error::backend("HIP p-adic decode requires precision_digits >= 1"));
}
if !digits.len().is_multiple_of(precision_digits as usize) {
return Err(Error::backend(format!(
"HIP p-adic decode digit count {} is not a multiple of precision {}",
digits.len(),
precision_digits
)));
}
let n = digits.len() / precision_digits as usize;
let cpu_oracle_decoded = cpu_padic_decode_f16(digits, precision_digits);
let mut payload = String::with_capacity((digits.len() * 4) + 32);
payload.push_str(&format!("decode {} {}\n", precision_digits, n));
for (i, d) in digits.iter().enumerate() {
if i != 0 {
payload.push(' ');
}
payload.push_str(&(*d as u32).to_string());
}
payload.push('\n');
run_rocm_hip_padic_codec_mode(
"decode",
&payload,
n,
precision_digits,
Vec::new(),
cpu_oracle_decoded,
)
}
fn parse_line(stdout: &str, prefix: &str) -> Option<String> {
stdout
.lines()
.find_map(|line| line.strip_prefix(prefix))
.map(|s| s.to_string())
}
fn parse_u32_line(stdout: &str, prefix: &str) -> Option<u32> {
stdout
.lines()
.find_map(|line| line.strip_prefix(prefix))
.and_then(|value| value.trim().parse::<u32>().ok())
}
fn parse_u8_csv(line: &str) -> Result<Vec<u8>> {
if line.trim().is_empty() {
return Ok(Vec::new());
}
line.split(',')
.map(|value| {
value
.trim()
.parse::<u32>()
.map(|v| v as u8)
.map_err(|err| Error::backend(format!("invalid HIP p-adic codec byte {value:?}: {err}")))
})
.collect()
}
fn parse_u16_csv(line: &str) -> Result<Vec<u16>> {
if line.trim().is_empty() {
return Ok(Vec::new());
}
line.split(',')
.map(|value| {
value
.trim()
.parse::<u32>()
.map(|v| v as u16)
.map_err(|err| Error::backend(format!("invalid HIP p-adic codec value {value:?}: {err}")))
})
.collect()
}
fn fingerprint(label: &str, value: &str) -> String {
let mut hasher = DefaultHasher::new();
label.hash(&mut hasher);
value.hash(&mut hasher);
format!("{label}-{:016x}", hasher.finish())
}