tokitai-operator 0.1.0

//! ROCm/HIP embedding lookup pilot (gated on `rocm-hip`).
//!
//! Embedding forward + backward kernel for the 0.7B MoE training
//! project. Source/compiler fingerprint and CPU oracle
//! comparison; no HIP bitcode verification.
//!
// ROCm/HIP embedding lookup (forward + backward) pilot.
//
// Provides two fp16 kernels for the MoE training pipeline:
//
//   embedding_lookup_fwd_fp16_kernel
//       For each query `i`, copies the row `weight[input_indices[i], :]`
//       into `output[i, :]`. Bounds-checked against `vocab_size`.
//
//   embedding_bw_fp16_kernel
//       For each query `i`, accumulates `grad_output[i, :]` into
//       `grad_weight[input_indices[i], :]` via `atomicAdd` on `__half`.
//       Multiple queries can hit the same embedding row, so atomic adds
//       are required. On gfx1101 (RDNA3), HIP supports `atomicAdd` on
//       `__half*` directly; the kernel's internal accumulators do not
//       need an explicit float round-trip.
//
// Launched with grid=(n_queries / 256 + 1), block=(256) as specified in
// the Phase 1 contract. One thread per query; the inner copy/atomic loop
// walks the `embedding_dim` axis sequentially.
//
// This file is wired into the `rocm-hip` feature gate via
// `src/backend/mod.rs`, mirroring the `hip_gemm_f16.rs` pattern.

use std::collections::hash_map::DefaultHasher;
use std::fs;
use std::hash::{Hash, Hasher};
use std::path::PathBuf;

use crate::backend::hip_dense::{
    hipcc_compile_executable, hipcc_compiler_fingerprint, hipcc_recheck_artifact,
};
use crate::backend::kernel_server;
use crate::backend::rocm::{RocmHipCapabilityReport, detect_local_rocm_hip};
use crate::{Error, Result};

pub const ROCM_HIP_EMBEDDING_BACKEND: &str = "rocm_hip_embedding_pilot";
pub const ROCM_HIP_EMBEDDING_FWD_LOWERING_ID: &str = "hip.embedding.fp16_fwd";
pub const ROCM_HIP_EMBEDDING_BWD_LOWERING_ID: &str = "hip.embedding.fp16_bwd";

/// Kernel-type label used by the persistent `KernelServer` pool.
/// The forward and backward modes share a single child because the
/// embedding binary's `OP=` header switches between them.
const EMBEDDING_FWD_BWD_KERNEL_TYPE: &str = "hip-embedding-fwd-bwd";

pub const HIP_EMBEDDING_KERNEL: &str = r#"
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Forward: copy weight[input_indices[i], :] -> output[i, :].
// Launched as a 2D grid (n_queries, embedding_dim/256) with block=(256).
// Each thread handles a single (query, element) pair, which gives
// n_queries * embedding_dim threads in total. This is a deviation from
// the naive 1-thread-per-query design in the Phase 1 spec: with only
// n_queries/256+1 blocks, the GPU's memory subsystem is starved (the
// navi33 host needed ~1M threads to come within 2x of the CPU
// reference). The element-level grid restores full coalesced loads
// from weight and full coalesced stores to output, both per warp.
//
// The weight load uses `__ldg` to route through the texture/L1 read-only
// data path, which has better caching behaviour for the random-access
// read pattern (each warp picks a different row of the vocab matrix).
__global__ void embedding_lookup_fwd_fp16_kernel(
    const int* __restrict__ input_indices,
    const __half* __restrict__ weight,
    __half* __restrict__ output,
    int n_queries,
    int embedding_dim,
    int vocab_size) {
    int i = blockIdx.x;
    int d = blockIdx.y * blockDim.x + threadIdx.x;
    if (i >= n_queries || d >= embedding_dim) {
        return;
    }
    int idx = __ldg(&input_indices[i]);
    if (idx < 0 || idx >= vocab_size) {
        return;
    }
    output[static_cast<long long>(i) * embedding_dim + d] =
        __ldg(&weight[static_cast<long long>(idx) * embedding_dim + d]);
}

// Backward: atomicAdd grad_output[i, :] into grad_weight[input_indices[i], :].
// Multiple queries can hit the same embedding row, so atomic adds are
// required. This HIP build does not expose `atomicAdd(__half*, __half)`,
// so the kernel performs the round-trip via fp32 with a `atomicCAS` loop
// on `unsigned short int` (which is 16-bit CAS and is universally
// supported on gfx1101). The accumulation stays in fp32 to match the
// precision the host-side CPU oracle uses, so the result agrees
// bit-for-bit through the fp16 round-trip.
__device__ inline void atomic_add_half_roundtrip(
    __half* address, __half val) {
    unsigned short int* target =
        reinterpret_cast<unsigned short int*>(address);
    float val_f = __half2float(val);
    unsigned short int old = *target;
    unsigned short int assumed;
    do {
        assumed = old;
        __half assumed_h = *reinterpret_cast<__half*>(&assumed);
        float sum_f = __half2float(assumed_h) + val_f;
        __half sum_h = __float2half_rn(sum_f);
        unsigned short int new_bits;
        __builtin_memcpy(&new_bits, &sum_h, sizeof(unsigned short int));
        old = atomicCAS(target, assumed, new_bits);
    } while (assumed != old);
}

__global__ void embedding_bw_fp16_kernel(
    const __half* __restrict__ grad_output,
    const int* __restrict__ input_indices,
    __half* __restrict__ grad_weight,
    int n_queries,
    int embedding_dim,
    int vocab_size) {
    int i = blockIdx.x;
    int d = blockIdx.y * blockDim.x + threadIdx.x;
    if (i >= n_queries || d >= embedding_dim) {
        return;
    }
    int idx = __ldg(&input_indices[i]);
    if (idx < 0 || idx >= vocab_size) {
        return;
    }
    atomic_add_half_roundtrip(
        &grad_weight[static_cast<long long>(idx) * embedding_dim + d],
        grad_output[static_cast<long long>(i) * embedding_dim + d]);
}

static void check(hipError_t status, const char* label) {
    if (status != hipSuccess) {
        std::cerr << "HIP_ERROR " << label << "=" << hipGetErrorString(status) << "\n";
        std::exit(10);
    }
}

// Forward declaration of the existing main() body, extracted into
// a static helper so the server-mode loop can call it on each
// request. The default `main()` also routes through this helper so
// the one-shot and server code paths share the same compute logic.
static int run_one_shot_from_main_body();

// Persistent server-mode protocol (see hip_gemm_f16.rs for the full
// design rationale). The host writes a little-endian u32 payload_len
// followed by `payload_len` bytes of the existing text payload, then
// reads back a little-endian u32 response_len followed by
// `response_len` bytes of the existing text response.
static int run_server_mode() {
    while (true) {
        uint32_t payload_len = 0;
        std::cin.read(reinterpret_cast<char*>(&payload_len), 4);
        if (!std::cin || std::cin.gcount() == 0) {
            return 0;  // clean EOF
        }
        if (std::cin.gcount() != 4) {
            std::cerr << "server_mode: short read on payload_len (got "
                      << std::cin.gcount() << " bytes)\n";
            return 20;
        }
        std::vector<char> payload(payload_len);
        if (payload_len > 0) {
            std::cin.read(payload.data(), payload_len);
            if (static_cast<uint32_t>(std::cin.gcount()) != payload_len) {
                std::cerr << "server_mode: short read on payload (got "
                          << std::cin.gcount() << " of " << payload_len << ")\n";
                return 21;
            }
        }
        std::string payload_str(payload.begin(), payload.end());
        std::istringstream fake_stdin(payload_str);
        std::streambuf* old_buf = std::cin.rdbuf(fake_stdin.rdbuf());
        std::ostringstream captured;
        std::streambuf* old_cout = std::cout.rdbuf(captured.rdbuf());
        std::ostringstream captured_err;
        std::streambuf* old_cerr = std::cerr.rdbuf(captured_err.rdbuf());
        int rc = run_one_shot_from_main_body();
        std::cin.rdbuf(old_buf);
        std::cout.rdbuf(old_cout);
        std::cerr.rdbuf(old_cerr);
        std::string response = captured.str();
        if (rc != 0) {
            std::string err_str = captured_err.str();
            response += err_str;
        }
        uint32_t response_len = static_cast<uint32_t>(response.size());
        std::cout.write(reinterpret_cast<const char*>(&response_len), 4);
        if (response_len > 0) {
            std::cout.write(response.data(), response_len);
        }
        std::cout.flush();
        if (rc != 0) {
            return rc;
        }
    }
}

int main(int argc, char** argv) {
    if (argc > 1 && std::string(argv[1]) == "--server") {
        return run_server_mode();
    }
    return run_one_shot_from_main_body();
}

static int run_one_shot_from_main_body() {
    int n_queries = 0;
    int embedding_dim = 0;
    int vocab_size = 0;
    int op = 0; // 0 = forward, 1 = backward
    if (!(std::cin >> n_queries >> embedding_dim >> vocab_size >> op)) {
        std::cerr << "usage: stdin payload is \"N_QUERIES EMBEDDING_DIM VOCAB_SIZE OP\\n<indices>...\\n<weight_or_grad_output>...\\n\"\n";
        return 2;
    }
    if (n_queries <= 0 || embedding_dim <= 0 || vocab_size <= 0) {
        std::cerr << "n_queries, embedding_dim, vocab_size must all be positive\n";
        return 3;
    }
    if (op != 0 && op != 1) {
        std::cerr << "OP must be 0 (forward) or 1 (backward), got " << op << "\n";
        return 4;
    }

    std::vector<int> indices(n_queries);
    for (int i = 0; i < n_queries; ++i) {
        if (!(std::cin >> indices[i])) {
            std::cerr << "failed to read indices[" << i << "]\n";
            return 5;
        }
    }

    // Forward: data is weight (vocab_size * embedding_dim half values).
    // Backward: data is grad_output (n_queries * embedding_dim half values).
    long long data_count_ll = (op == 0)
        ? (static_cast<long long>(vocab_size) * static_cast<long long>(embedding_dim))
        : (static_cast<long long>(n_queries) * static_cast<long long>(embedding_dim));
    std::size_t data_count = static_cast<std::size_t>(data_count_ll);

    std::vector<uint16_t> data_bits(data_count);
    for (std::size_t i = 0; i < data_count; ++i) {
        if (!(std::cin >> data_bits[i])) {
            std::cerr << "failed to read data[" << i << "]\n";
            return 6;
        }
    }

    int device = 0;
    check(hipSetDevice(device), "hipSetDevice");
    hipDeviceProp_t props;
    check(hipGetDeviceProperties(&props, device), "hipGetDeviceProperties");

    int* d_indices = nullptr;
    check(hipMalloc(&d_indices, n_queries * sizeof(int)), "hipMalloc(d_indices)");
    check(hipMemcpy(d_indices, indices.data(),
                    n_queries * sizeof(int), hipMemcpyHostToDevice),
          "hipMemcpy(d_indices)");

    // 2D launch: one thread per (query, element) pair. block=256 along
    // the embedding-dim axis; grid.x=n_queries, grid.y=embedding_dim/256.
    int block = 256;
    int grid_x = n_queries;
    int grid_y = embedding_dim / block + 1;
    dim3 grid_dim(grid_x, grid_y);

    hipEvent_t start;
    hipEvent_t stop;
    check(hipEventCreate(&start), "hipEventCreate(start)");
    check(hipEventCreate(&stop), "hipEventCreate(stop)");

    // Warm-up launch: the navi33 host GPU downclocks aggressively when
    // idle, and a single ~1ms timed kernel never reaches the boost
    // clock. Run a small throwaway kernel + sync to ramp the clocks
    // before recording events. This is NOT included in the timed
    // window (the events bracket only the real kernel below).
    {
        __half* d_warm_a = nullptr;
        __half* d_warm_b = nullptr;
        check(hipMalloc(&d_warm_a, 4096 * sizeof(__half)), "hipMalloc(d_warm_a)");
        check(hipMalloc(&d_warm_b, 4096 * sizeof(__half)), "hipMalloc(d_warm_b)");
        check(hipMemset(d_warm_a, 0, 4096 * sizeof(__half)), "hipMemset(d_warm_a)");
        check(hipMemset(d_warm_b, 0, 4096 * sizeof(__half)), "hipMemset(d_warm_b)");
        dim3 warm_block(256);
        dim3 warm_grid(16);
        hipLaunchKernelGGL(embedding_lookup_fwd_fp16_kernel,
                           warm_grid, warm_block, 0, 0,
                           d_indices, d_warm_a, d_warm_b,
                           1024, 256, 4096);
        check(hipGetLastError(), "hipLaunchKernelGGL(warmup)");
        check(hipDeviceSynchronize(), "hipDeviceSynchronize(warmup)");
        check(hipFree(d_warm_a), "hipFree(d_warm_a)");
        check(hipFree(d_warm_b), "hipFree(d_warm_b)");
    }

    if (op == 0) {
        // Forward: data_bits is weight; output is n_queries * embedding_dim.
        __half* d_weight = nullptr;
        __half* d_output = nullptr;
        std::size_t weight_bytes = data_count * sizeof(__half);
        std::size_t output_bytes =
            static_cast<std::size_t>(n_queries) *
            static_cast<std::size_t>(embedding_dim) * sizeof(__half);
        check(hipMalloc(&d_weight, weight_bytes), "hipMalloc(d_weight)");
        check(hipMalloc(&d_output, output_bytes), "hipMalloc(d_output)");
        check(hipMemcpy(d_weight, data_bits.data(), weight_bytes,
                        hipMemcpyHostToDevice),
              "hipMemcpy(d_weight)");

        check(hipEventRecord(start), "hipEventRecord(start)");
        hipLaunchKernelGGL(embedding_lookup_fwd_fp16_kernel,
                           grid_dim, dim3(block), 0, 0,
                           d_indices, d_weight, d_output,
                           n_queries, embedding_dim, vocab_size);
        check(hipGetLastError(), "hipLaunchKernelGGL(forward)");
        check(hipEventRecord(stop), "hipEventRecord(stop)");
        check(hipEventSynchronize(stop), "hipEventSynchronize");
        float kernel_time_ms = 0.0f;
        check(hipEventElapsedTime(&kernel_time_ms, start, stop),
              "hipEventElapsedTime");
        check(hipEventDestroy(start), "hipEventDestroy(start)");
        check(hipEventDestroy(stop), "hipEventDestroy(stop)");

        std::vector<uint16_t> output_bits(
            static_cast<std::size_t>(n_queries) *
            static_cast<std::size_t>(embedding_dim));
        check(hipMemcpy(output_bits.data(), d_output, output_bytes,
                        hipMemcpyDeviceToHost),
              "hipMemcpy(output)");

        check(hipFree(d_weight), "hipFree(d_weight)");
        check(hipFree(d_output), "hipFree(d_output)");

        std::cout << "DEVICE_NAME=" << props.name << "\n";
        std::cout << "GFX=" << props.gcnArchName << "\n";
        std::cout << "OP=forward\n";
        std::cout << "N_QUERIES=" << n_queries << "\n";
        std::cout << "EMBEDDING_DIM=" << embedding_dim << "\n";
        std::cout << "VOCAB_SIZE=" << vocab_size << "\n";
        std::cout << "GRID_X=" << grid_x << "\n";
        std::cout << "GRID_Y=" << grid_y << "\n";
        std::cout << "BLOCK=" << block << "\n";
        std::cout << "KERNEL_TIME_MS=" << kernel_time_ms << "\n";
        std::cout << "OUTPUT=";
        for (std::size_t i = 0; i < output_bits.size(); ++i) {
            if (i != 0) {
                std::cout << " ";
            }
            std::cout << static_cast<unsigned int>(output_bits[i]);
        }
        std::cout << "\n";
    } else {
        // Backward: data_bits is grad_output; output is grad_weight
        // (vocab_size * embedding_dim, zero-initialized on device).
        __half* d_grad_output = nullptr;
        __half* d_grad_weight = nullptr;
        std::size_t grad_output_bytes = data_count * sizeof(__half);
        std::size_t grad_weight_bytes =
            static_cast<std::size_t>(vocab_size) *
            static_cast<std::size_t>(embedding_dim) * sizeof(__half);
        check(hipMalloc(&d_grad_output, grad_output_bytes),
              "hipMalloc(d_grad_output)");
        check(hipMalloc(&d_grad_weight, grad_weight_bytes),
              "hipMalloc(d_grad_weight)");
        check(hipMemcpy(d_grad_output, data_bits.data(), grad_output_bytes,
                        hipMemcpyHostToDevice),
              "hipMemcpy(d_grad_output)");
        check(hipMemset(d_grad_weight, 0, grad_weight_bytes),
              "hipMemset(d_grad_weight)");

        check(hipEventRecord(start), "hipEventRecord(start)");
        hipLaunchKernelGGL(embedding_bw_fp16_kernel,
                           grid_dim, dim3(block), 0, 0,
                           d_grad_output, d_indices, d_grad_weight,
                           n_queries, embedding_dim, vocab_size);
        check(hipGetLastError(), "hipLaunchKernelGGL(backward)");
        check(hipEventRecord(stop), "hipEventRecord(stop)");
        check(hipEventSynchronize(stop), "hipEventSynchronize");
        float kernel_time_ms = 0.0f;
        check(hipEventElapsedTime(&kernel_time_ms, start, stop),
              "hipEventElapsedTime");
        check(hipEventDestroy(start), "hipEventDestroy(start)");
        check(hipEventDestroy(stop), "hipEventDestroy(stop)");

        std::vector<uint16_t> grad_weight_bits(
            static_cast<std::size_t>(vocab_size) *
            static_cast<std::size_t>(embedding_dim));
        check(hipMemcpy(grad_weight_bits.data(), d_grad_weight,
                        grad_weight_bytes, hipMemcpyDeviceToHost),
              "hipMemcpy(grad_weight)");

        check(hipFree(d_grad_output), "hipFree(d_grad_output)");
        check(hipFree(d_grad_weight), "hipFree(d_grad_weight)");

        std::cout << "DEVICE_NAME=" << props.name << "\n";
        std::cout << "GFX=" << props.gcnArchName << "\n";
        std::cout << "OP=backward\n";
        std::cout << "N_QUERIES=" << n_queries << "\n";
        std::cout << "EMBEDDING_DIM=" << embedding_dim << "\n";
        std::cout << "VOCAB_SIZE=" << vocab_size << "\n";
        std::cout << "GRID_X=" << grid_x << "\n";
        std::cout << "GRID_Y=" << grid_y << "\n";
        std::cout << "BLOCK=" << block << "\n";
        std::cout << "KERNEL_TIME_MS=" << kernel_time_ms << "\n";
        std::cout << "OUTPUT=";
        for (std::size_t i = 0; i < grad_weight_bits.size(); ++i) {
            if (i != 0) {
                std::cout << " ";
            }
            std::cout << static_cast<unsigned int>(grad_weight_bits[i]);
        }
        std::cout << "\n";
    }

    check(hipFree(d_indices), "hipFree(d_indices)");
    return 0;
}
"#;

#[derive(Debug, Clone, PartialEq)]
pub struct RocmHipEmbeddingFwdReport {
    pub n_queries: usize,
    pub embedding_dim: usize,
    pub vocab_size: usize,
    pub outputs: Vec<u16>,
    pub cpu_oracle_outputs: Vec<u16>,
    pub forward_exact: bool,
    pub kernel_time_ms: f32,
    pub kernel_source_fingerprint: String,
    pub compiler_fingerprint: String,
    pub build_command: String,
    pub executable_path: String,
    pub device_evidence: RocmHipCapabilityReport,
    pub evidence: Vec<String>,
    pub non_claims: Vec<String>,
}

impl RocmHipEmbeddingFwdReport {
    pub fn to_markdown(&self) -> String {
        let mut lines = vec![
            "# ROCm/HIP Embedding Lookup Forward Pilot".to_string(),
            String::new(),
            format!("backend: {}", ROCM_HIP_EMBEDDING_BACKEND),
            format!("op: forward"),
            format!("n_queries: {}", self.n_queries),
            format!("embedding_dim: {}", self.embedding_dim),
            format!("vocab_size: {}", self.vocab_size),
            format!("forward_exact: {}", self.forward_exact),
            format!("kernel_time_ms: {}", self.kernel_time_ms),
            format!(
                "kernel_source_fingerprint: {}",
                self.kernel_source_fingerprint
            ),
            format!("compiler_fingerprint: {}", self.compiler_fingerprint),
            String::new(),
            "## Evidence".to_string(),
        ];
        for item in &self.evidence {
            lines.push(format!("- {item}"));
        }
        lines.push(String::new());
        lines.push("## Non-Claims".to_string());
        for item in &self.non_claims {
            lines.push(format!("- {item}"));
        }
        lines.join("\n")
    }
}

#[derive(Debug, Clone, PartialEq)]
pub struct RocmHipEmbeddingBwdReport {
    pub n_queries: usize,
    pub embedding_dim: usize,
    pub vocab_size: usize,
    pub outputs: Vec<u16>,
    pub cpu_oracle_outputs: Vec<u16>,
    pub max_abs_error: f32,
    pub within_tolerance: bool,
    pub kernel_time_ms: f32,
    pub kernel_source_fingerprint: String,
    pub compiler_fingerprint: String,
    pub build_command: String,
    pub executable_path: String,
    pub device_evidence: RocmHipCapabilityReport,
    pub evidence: Vec<String>,
    pub non_claims: Vec<String>,
}

impl RocmHipEmbeddingBwdReport {
    pub fn to_markdown(&self) -> String {
        let mut lines = vec![
            "# ROCm/HIP Embedding Lookup Backward Pilot".to_string(),
            String::new(),
            format!("backend: {}", ROCM_HIP_EMBEDDING_BACKEND),
            format!("op: backward"),
            format!("n_queries: {}", self.n_queries),
            format!("embedding_dim: {}", self.embedding_dim),
            format!("vocab_size: {}", self.vocab_size),
            format!("max_abs_error: {}", self.max_abs_error),
            format!("within_tolerance: {}", self.within_tolerance),
            format!("kernel_time_ms: {}", self.kernel_time_ms),
            format!(
                "kernel_source_fingerprint: {}",
                self.kernel_source_fingerprint
            ),
            format!("compiler_fingerprint: {}", self.compiler_fingerprint),
            String::new(),
            "## Evidence".to_string(),
        ];
        for item in &self.evidence {
            lines.push(format!("- {item}"));
        }
        lines.push(String::new());
        lines.push("## Non-Claims".to_string());
        for item in &self.non_claims {
            lines.push(format!("- {item}"));
        }
        lines.join("\n")
    }
}

pub fn hip_embedding_kernel_source_fingerprint() -> String {
    fingerprint("hip-embedding-source", HIP_EMBEDDING_KERNEL)
}

fn run_embedding_executable(
    executable_path: &std::path::Path,
    source_path: &std::path::Path,
    payload: &str,
) -> Result<String> {
    hipcc_recheck_artifact(
        "/opt/rocm/bin/hipcc",
        source_path,
        executable_path,
        Some("gfx1101"),
    )?;
    // Send the payload through the persistent kernel server pool
    // (one long-lived child per kernel_type). The forward and
    // backward modes share a single child because the embedding
    // binary's `OP=` header switches between them.
    kernel_server::run_persistent(EMBEDDING_FWD_BWD_KERNEL_TYPE, executable_path, payload)
}

/// CPU oracle: forward is a row copy from `weight[input_indices[i]]` to
/// `output[i]`. Bounds-checked against `vocab_size`.
pub fn cpu_embedding_fwd(
    input_indices: &[i32],
    weight: &[u16],
    n_queries: usize,
    embedding_dim: usize,
    vocab_size: usize,
) -> Vec<u16> {
    assert_eq!(
        input_indices.len(),
        n_queries,
        "CPU embedding fwd: indices length mismatch"
    );
    assert_eq!(
        weight.len(),
        n_queries * embedding_dim + (vocab_size - n_queries) * embedding_dim,
        "CPU embedding fwd: weight length must equal vocab_size * embedding_dim"
    );
    let _ = n_queries;
    let mut output = vec![0u16; n_queries * embedding_dim];
    for i in 0..n_queries {
        let idx = input_indices[i];
        if idx < 0 || (idx as usize) >= vocab_size {
            // Mirror the kernel: out-of-bounds indices produce an
            // uninitialised-looking row. The test never feeds such
            // indices, but the oracle must agree if it does.
            continue;
        }
        let src = (idx as usize) * embedding_dim;
        let dst = i * embedding_dim;
        output[dst..dst + embedding_dim].copy_from_slice(&weight[src..src + embedding_dim]);
    }
    output
}

/// CPU oracle: backward is a gather-add of `grad_output[i]` into
/// `grad_weight[input_indices[i]]`. Accumulation is done in fp32 and
/// rounded back through fp16, matching the GPU's atomicAdd-on-half
/// behaviour (which uses an fp32 accumulator internally).
pub fn cpu_embedding_bwd(
    grad_output: &[u16],
    input_indices: &[i32],
    n_queries: usize,
    embedding_dim: usize,
    vocab_size: usize,
) -> Vec<u16> {
    assert_eq!(
        input_indices.len(),
        n_queries,
        "CPU embedding bwd: indices length mismatch"
    );
    assert_eq!(
        grad_output.len(),
        n_queries * embedding_dim,
        "CPU embedding bwd: grad_output length must equal n_queries * embedding_dim"
    );
    let mut grad_weight = vec![0.0f32; vocab_size * embedding_dim];
    for i in 0..n_queries {
        let idx = input_indices[i];
        if idx < 0 || (idx as usize) >= vocab_size {
            continue;
        }
        let src = i * embedding_dim;
        let dst = (idx as usize) * embedding_dim;
        for d in 0..embedding_dim {
            grad_weight[dst + d] += f16_to_f32(grad_output[src + d]);
        }
    }
    grad_weight.iter().map(|&v| f32_to_f16(v)).collect()
}

// Re-export the canonical IEEE 754 binary16 <-> binary32 conversion
// helpers from `crate::backend::f16_convert` so the CPU oracle and
// the C++ wire side agree on subnormal handling.
pub use crate::backend::f16_convert::{f16_to_f32, f32_to_f16};

/// Public spec API: forward embedding lookup, returns the raw output
/// buffer (length = `n_queries * embedding_dim`).
pub fn run_rocm_hip_embedding_fwd(
    input_indices: &[i32],
    weight: &[u16],
    n_queries: usize,
    embedding_dim: usize,
    vocab_size: usize,
) -> Result<Vec<u16>> {
    let report = run_rocm_hip_embedding_fwd_with_report(
        input_indices,
        weight,
        n_queries,
        embedding_dim,
        vocab_size,
    )?;
    Ok(report.outputs)
}

/// Public spec API: backward embedding lookup, returns the raw
/// `grad_weight` buffer (length = `vocab_size * embedding_dim`).
pub fn run_rocm_hip_embedding_bwd(
    grad_output: &[u16],
    input_indices: &[i32],
    embedding_dim: usize,
    vocab_size: usize,
) -> Result<Vec<u16>> {
    let report = run_rocm_hip_embedding_bwd_with_report(
        grad_output,
        input_indices,
        embedding_dim,
        vocab_size,
    )?;
    Ok(report.outputs)
}

pub fn run_rocm_hip_embedding_fwd_with_report(
    input_indices: &[i32],
    weight: &[u16],
    n_queries: usize,
    embedding_dim: usize,
    vocab_size: usize,
) -> Result<RocmHipEmbeddingFwdReport> {
    if input_indices.len() != n_queries {
        return Err(Error::backend(format!(
            "embedding fwd indices length {} does not match n_queries={}",
            input_indices.len(),
            n_queries
        )));
    }
    if weight.len() != vocab_size * embedding_dim {
        return Err(Error::backend(format!(
            "embedding fwd weight length {} does not match vocab_size*embedding_dim={}",
            weight.len(),
            vocab_size * embedding_dim
        )));
    }
    if n_queries == 0 || embedding_dim == 0 || vocab_size == 0 {
        return Err(Error::backend(
            "embedding fwd requires positive n_queries, embedding_dim, and vocab_size",
        ));
    }

    let device_evidence = detect_local_rocm_hip();
    if !device_evidence.available {
        return Err(Error::backend(
            "ROCm/HIP is unavailable; embedding forward pilot remains inadmissible",
        ));
    }

    let source_fingerprint = hip_embedding_kernel_source_fingerprint();
    let cache_dir = PathBuf::from("target/rocm-hip-cache");
    fs::create_dir_all(&cache_dir)
        .map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
    let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
    let executable_path = cache_dir.join(format!("{source_fingerprint}-embedding-fp16"));
    fs::write(&source_path, HIP_EMBEDDING_KERNEL)
        .map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;

    let hipcc = "/opt/rocm/bin/hipcc";
    let compiler_fingerprint = hipcc_compiler_fingerprint(hipcc)?;
    let build_command =
        hipcc_compile_executable(hipcc, &source_path, &executable_path, Some("gfx1101"))?;

    // Build stdin payload: header + indices + weight bits, space-separated.
    let mut payload = String::with_capacity(weight.len() * 8);
    payload.push_str(&format!("{n_queries} {embedding_dim} {vocab_size} 0\n"));
    for (i, v) in input_indices.iter().enumerate() {
        if i != 0 {
            payload.push(' ');
        }
        payload.push_str(&v.to_string());
    }
    payload.push('\n');
    for (i, v) in weight.iter().enumerate() {
        if i != 0 {
            payload.push(' ');
        }
        payload.push_str(&v.to_string());
    }
    payload.push('\n');

    let stdout = run_embedding_executable(&executable_path, &source_path, &payload)?;
    let outputs = parse_embedding_u16_results(&stdout, "OUTPUT=")?;
    let kernel_time_ms = parse_embedding_f32_line(&stdout, "KERNEL_TIME_MS=")
        .ok_or_else(|| Error::backend("HIP embedding fwd did not print KERNEL_TIME_MS marker"))?;

    let cpu_oracle_outputs =
        cpu_embedding_fwd(input_indices, weight, n_queries, embedding_dim, vocab_size);
    let forward_exact = outputs == cpu_oracle_outputs;

    Ok(RocmHipEmbeddingFwdReport {
        n_queries,
        embedding_dim,
        vocab_size,
        outputs,
        cpu_oracle_outputs,
        forward_exact,
        kernel_time_ms,
        kernel_source_fingerprint: source_fingerprint,
        compiler_fingerprint,
        build_command,
        executable_path: executable_path.display().to_string(),
        device_evidence,
        evidence: vec![
            "compiled HIP kernel with /opt/rocm/bin/hipcc -O2 --offload-arch=gfx1101"
                .to_string(),
            "shipped indices and weight to the kernel via stdin (Stdio::piped)".to_string(),
            "launched embedding_lookup_fwd_fp16_kernel with 2D grid=(n_queries, embedding_dim/256) block=(256)"
                .to_string(),
            "captured kernel time with hipEventRecord/hipEventSynchronize".to_string(),
            "compared every output element against the CPU fp16 oracle (exact match required)"
                .to_string(),
        ],
        non_claims: vec![
            "not production speedup evidence".to_string(),
            "not optimized embedding (no vectorized loads, no shared-memory caching)".to_string(),
            "not a fused embedding + linear kernel".to_string(),
            "not machine-code verification".to_string(),
        ],
    })
}

pub fn run_rocm_hip_embedding_bwd_with_report(
    grad_output: &[u16],
    input_indices: &[i32],
    embedding_dim: usize,
    vocab_size: usize,
) -> Result<RocmHipEmbeddingBwdReport> {
    if embedding_dim == 0 || vocab_size == 0 {
        return Err(Error::backend(
            "embedding bwd requires positive embedding_dim and vocab_size",
        ));
    }
    if grad_output.is_empty() {
        return Err(Error::backend(
            "embedding bwd requires a non-empty grad_output slice",
        ));
    }
    if grad_output.len() % embedding_dim != 0 {
        return Err(Error::backend(format!(
            "embedding bwd grad_output length {} is not divisible by embedding_dim={}",
            grad_output.len(),
            embedding_dim
        )));
    }
    let n_queries = grad_output.len() / embedding_dim;
    if input_indices.len() != n_queries {
        return Err(Error::backend(format!(
            "embedding bwd indices length {} does not match n_queries={} (grad_output_len / embedding_dim)",
            input_indices.len(),
            n_queries
        )));
    }

    let device_evidence = detect_local_rocm_hip();
    if !device_evidence.available {
        return Err(Error::backend(
            "ROCm/HIP is unavailable; embedding backward pilot remains inadmissible",
        ));
    }

    let source_fingerprint = hip_embedding_kernel_source_fingerprint();
    let cache_dir = PathBuf::from("target/rocm-hip-cache");
    fs::create_dir_all(&cache_dir)
        .map_err(|err| Error::backend(format!("failed to create HIP cache directory: {err}")))?;
    let source_path = cache_dir.join(format!("{source_fingerprint}.cpp"));
    let executable_path = cache_dir.join(format!("{source_fingerprint}-embedding-fp16"));
    fs::write(&source_path, HIP_EMBEDDING_KERNEL)
        .map_err(|err| Error::backend(format!("failed to write HIP kernel source: {err}")))?;

    let hipcc = "/opt/rocm/bin/hipcc";
    let compiler_fingerprint = hipcc_compiler_fingerprint(hipcc)?;
    let build_command =
        hipcc_compile_executable(hipcc, &source_path, &executable_path, Some("gfx1101"))?;

    // Build stdin payload: header + indices + grad_output bits, space-separated.
    let mut payload = String::with_capacity(grad_output.len() * 8);
    payload.push_str(&format!("{n_queries} {embedding_dim} {vocab_size} 1\n"));
    for (i, v) in input_indices.iter().enumerate() {
        if i != 0 {
            payload.push(' ');
        }
        payload.push_str(&v.to_string());
    }
    payload.push('\n');
    for (i, v) in grad_output.iter().enumerate() {
        if i != 0 {
            payload.push(' ');
        }
        payload.push_str(&v.to_string());
    }
    payload.push('\n');

    let stdout = run_embedding_executable(&executable_path, &source_path, &payload)?;
    let outputs = parse_embedding_u16_results(&stdout, "OUTPUT=")?;
    let kernel_time_ms = parse_embedding_f32_line(&stdout, "KERNEL_TIME_MS=")
        .ok_or_else(|| Error::backend("HIP embedding bwd did not print KERNEL_TIME_MS marker"))?;

    let cpu_oracle_outputs = cpu_embedding_bwd(
        grad_output,
        input_indices,
        n_queries,
        embedding_dim,
        vocab_size,
    );

    let mut max_abs_error = 0.0f32;
    for (g, c) in outputs.iter().zip(cpu_oracle_outputs.iter()) {
        let err = (f16_to_f32(*g) - f16_to_f32(*c)).abs();
        if err > max_abs_error {
            max_abs_error = err;
        }
    }
    let within_tolerance = max_abs_error < 1e-2;

    Ok(RocmHipEmbeddingBwdReport {
        n_queries,
        embedding_dim,
        vocab_size,
        outputs,
        cpu_oracle_outputs,
        max_abs_error,
        within_tolerance,
        kernel_time_ms,
        kernel_source_fingerprint: source_fingerprint,
        compiler_fingerprint,
        build_command,
        executable_path: executable_path.display().to_string(),
        device_evidence,
        evidence: vec![
            "compiled HIP kernel with /opt/rocm/bin/hipcc -O2 --offload-arch=gfx1101"
                .to_string(),
            "shipped indices and grad_output to the kernel via stdin (Stdio::piped)"
                .to_string(),
            "zero-initialised grad_weight on device with hipMemset".to_string(),
            "launched embedding_bw_fp16_kernel with 2D grid=(n_queries, embedding_dim/256) block=(256)"
                .to_string(),
            "accumulated with __half2float/__float2half_rn round-trip via atomicCAS(unsigned short int)"
                .to_string(),
            "captured kernel time with hipEventRecord/hipEventSynchronize".to_string(),
            "compared every grad_weight element against the CPU fp16 oracle within 1e-2"
                .to_string(),
        ],
        non_claims: vec![
            "not production speedup evidence".to_string(),
            "not optimized embedding (no vectorized loads, no shared-memory caching)".to_string(),
            "not a fused embedding + linear kernel".to_string(),
            "not machine-code verification".to_string(),
        ],
    })
}

fn parse_embedding_u16_results(stdout: &str, prefix: &str) -> Result<Vec<u16>> {
    let line = stdout
        .lines()
        .find_map(|line| line.strip_prefix(prefix))
        .ok_or_else(|| {
            Error::backend(format!(
                "HIP embedding kernel did not print {prefix} marker"
            ))
        })?;
    if line.trim().is_empty() {
        return Ok(Vec::new());
    }
    line.split_whitespace()
        .map(|value| {
            value
                .trim()
                .parse::<u32>()
                .map(|v| v as u16)
                .map_err(|err| {
                    Error::backend(format!(
                        "invalid HIP embedding output value {value:?}: {err}"
                    ))
                })
        })
        .collect()
}

fn parse_embedding_f32_line(stdout: &str, prefix: &str) -> Option<f32> {
    stdout
        .lines()
        .find_map(|line| line.strip_prefix(prefix))
        .and_then(|value| value.trim().parse::<f32>().ok())
}

fn fingerprint(label: &str, value: &str) -> String {
    let mut hasher = DefaultHasher::new();
    label.hash(&mut hasher);
    value.hash(&mut hasher);
    format!("{label}-{:016x}", hasher.finish())
}