uhash-prover 0.4.1

use anyhow::Result;
use cudarc::driver::{
    CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use uhash_core::meets_difficulty;

const CHAINS: usize = 4;
const SCRATCHPAD_SIZE: usize = 512 * 1024;
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
const ROUNDS: usize = 12_288;
const DEFAULT_INFLIGHT_SLOTS: usize = 3;

#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct KernelParams {
    lanes: u32,
    rounds: u32,
    header_len: u32,
    _pad0: u32,
    start_nonce: u64,
}

#[derive(Clone, Debug, Serialize)]
pub struct CudaTelemetry {
    pub device_name: String,
    pub compute_capability: (u32, u32),
    pub total_memory_bytes: u64,
    pub max_threads_per_block: u32,
    pub multiprocessor_count: u32,
    pub warp_size: u32,
    pub inflight_slots: usize,
    pub tuned_chunk_lanes: usize,
    pub max_chunk_lanes: usize,
    pub runtime_profiled: bool,
}

#[derive(Debug, Serialize, Deserialize)]
struct CudaTuneCache {
    version: u32,
    device_name: String,
    compute_capability: (u32, u32),
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    block_size: u32,
}

pub struct CudaMiner {
    #[allow(dead_code)] // Kept alive to prevent CUDA context destruction
    ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    kernel: CudaFunction,
    // Per-slot device buffers
    d_params: Vec<CudaSlice<u8>>,
    d_hashes: Vec<CudaSlice<u8>>,
    d_scratchpads: Vec<CudaSlice<u8>>,
    d_header: Option<CudaSlice<u8>>,
    header_capacity: usize,
    // Tuning
    chunk_lanes: usize,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    block_size: u32,
    did_runtime_profile: bool,
    // Device info
    device_name: String,
    compute_capability: (u32, u32),
    total_memory: u64,
    max_threads_per_block: u32,
    multiprocessor_count: u32,
    warp_size: u32,
}

impl CudaMiner {
    pub fn new() -> Result<Self> {
        let ctx =
            CudaContext::new(0).map_err(|e| anyhow::anyhow!("CUDA device init failed: {}", e))?;

        // Query device properties first (needed for targeted PTX compilation)
        let device_name = ctx
            .name()
            .map_err(|e| anyhow::anyhow!("Failed to get device name: {}", e))?;

        let (cc_major, cc_minor) = ctx
            .compute_capability()
            .map_err(|e| anyhow::anyhow!("Failed to get compute capability: {}", e))?;

        // Compile CUDA C → PTX at runtime via NVRTC, targeting this GPU's architecture
        let arch_str: String = format!("sm_{}{}", cc_major, cc_minor);
        // Leak the string to get a &'static str (lives for program duration, compiled once)
        let arch_static: &'static str = Box::leak(arch_str.into_boxed_str());
        let compile_opts = cudarc::nvrtc::CompileOptions {
            arch: Some(arch_static),
            fmad: Some(true),
            ..Default::default()
        };
        let ptx = cudarc::nvrtc::compile_ptx_with_opts(CUDA_KERNEL_SOURCE, compile_opts)
            .map_err(|e| anyhow::anyhow!("NVRTC compilation failed: {}", e))?;

        let module = ctx
            .load_module(ptx)
            .map_err(|e| anyhow::anyhow!("Failed to load PTX module: {}", e))?;

        let kernel = module
            .load_function("uhash_kernel")
            .map_err(|e| anyhow::anyhow!("Failed to load uhash_kernel: {}", e))?;

        let total_memory = unsafe {
            cudarc::driver::result::device::total_mem(ctx.cu_device())
                .map_err(|e| anyhow::anyhow!("Failed to get total memory: {}", e))?
        } as u64;

        let max_threads_per_block =
            ctx.attribute(
                cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
            )
            .map_err(|e| anyhow::anyhow!("Failed to get max threads: {}", e))? as u32;

        let multiprocessor_count =
            ctx.attribute(
                cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
            )
            .map_err(|e| anyhow::anyhow!("Failed to get SM count: {}", e))? as u32;

        let warp_size =
            ctx.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE)
                .map_err(|e| anyhow::anyhow!("Failed to get warp size: {}", e))? as u32;

        let stream = ctx.default_stream();

        let mut miner = Self {
            ctx,
            stream,
            kernel,
            d_params: Vec::new(),
            d_hashes: Vec::new(),
            d_scratchpads: Vec::new(),
            d_header: None,
            header_capacity: 0,
            chunk_lanes: 0,
            tuned_chunk_lanes: 0,
            max_chunk_lanes: 0,
            inflight_slots: DEFAULT_INFLIGHT_SLOTS,
            block_size: 256,
            did_runtime_profile: false,
            device_name,
            compute_capability: (cc_major as u32, cc_minor as u32),
            total_memory,
            max_threads_per_block,
            multiprocessor_count,
            warp_size,
        };
        miner.try_load_tune_cache();
        Ok(miner)
    }

    pub fn recommended_lanes(&mut self, requested: usize) -> usize {
        if self.tuned_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if requested == 0 {
            self.tuned_chunk_lanes
                .saturating_mul(self.inflight_slots)
                .max(1)
        } else {
            requested.max(1)
        }
    }

    pub fn telemetry(&self) -> CudaTelemetry {
        CudaTelemetry {
            device_name: self.device_name.clone(),
            compute_capability: self.compute_capability,
            total_memory_bytes: self.total_memory,
            max_threads_per_block: self.max_threads_per_block,
            multiprocessor_count: self.multiprocessor_count,
            warp_size: self.warp_size,
            inflight_slots: self.inflight_slots,
            tuned_chunk_lanes: self.tuned_chunk_lanes,
            max_chunk_lanes: self.max_chunk_lanes,
            runtime_profiled: self.did_runtime_profile,
        }
    }

    pub fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        Ok(self
            .compute_batch_hashes(header_without_nonce, start_nonce, lanes)?
            .len())
    }

    fn tune_cache_path(&self) -> Option<PathBuf> {
        let base = std::env::var_os("XDG_CONFIG_HOME")
            .map(PathBuf::from)
            .or_else(|| std::env::var_os("HOME").map(|h| PathBuf::from(h).join(".config")))?;
        let (maj, min) = self.compute_capability;
        let safe_name: String = self
            .device_name
            .chars()
            .map(|c| if c.is_alphanumeric() { c } else { '_' })
            .collect();
        Some(
            base.join("uhash")
                .join(format!("cuda_tuning_{}_{}{}.json", safe_name, maj, min)),
        )
    }

    fn try_load_tune_cache(&mut self) {
        let Some(path) = self.tune_cache_path() else {
            return;
        };
        let Ok(raw) = fs::read(&path) else {
            return;
        };
        let Ok(cache) = serde_json::from_slice::<CudaTuneCache>(&raw) else {
            return;
        };
        if cache.version != 1 || cache.compute_capability != self.compute_capability {
            return;
        }
        let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
        let cap_max = heuristic_max.max(32);
        let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
        let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
        if chunk % 32 != 0 || max_chunk % 32 != 0 {
            return;
        }
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
        self.inflight_slots = cache.inflight_slots.clamp(2, 6);
        self.block_size = cache.block_size.clamp(64, 1024);
    }

    fn persist_tune_cache(&self) {
        let Some(path) = self.tune_cache_path() else {
            return;
        };
        let cache = CudaTuneCache {
            version: 1,
            device_name: self.device_name.clone(),
            compute_capability: self.compute_capability,
            tuned_chunk_lanes: self.tuned_chunk_lanes,
            max_chunk_lanes: self.max_chunk_lanes,
            inflight_slots: self.inflight_slots,
            block_size: self.block_size,
        };
        let Ok(raw) = serde_json::to_vec_pretty(&cache) else {
            return;
        };
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, raw);
    }

    fn autotune_chunk_limits(&self) -> (usize, usize) {
        let mem_budget = (self.total_memory as usize).saturating_mul(7) / 10;
        let per_lane = TOTAL_MEMORY + 32;
        let max_by_mem = mem_budget
            .saturating_div(per_lane.saturating_mul(self.inflight_slots))
            .max(1);
        let max_chunk = max_by_mem.clamp(32, 8192);
        let tuned = max_chunk.min(1024);
        let tuned = (tuned / 32).max(1) * 32;
        let max_chunk = (max_chunk / 32).max(1) * 32;
        (tuned, max_chunk)
    }

    fn round_to_multiple(v: usize, m: usize) -> usize {
        (v / m).max(1) * m
    }

    fn runtime_profile_block_size(
        &mut self,
        header_without_nonce: &[u8],
        chunk_lanes: usize,
    ) -> Result<()> {
        let candidates = [128u32, 256, 512];
        let current = self.block_size;
        let mut best_bs = current;
        let mut best_hps = 0.0f64;

        for &bs in &candidates {
            if bs > self.max_threads_per_block {
                continue;
            }
            self.block_size = bs;
            self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

            self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 0)?;

            let start = Instant::now();
            self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 1)?;
            let elapsed = start.elapsed().as_secs_f64();
            if elapsed <= 0.0 {
                continue;
            }
            let hps = chunk_lanes as f64 / elapsed;
            if hps > best_hps {
                best_hps = hps;
                best_bs = bs;
            }
        }

        self.block_size = best_bs;
        self.chunk_lanes = 0;
        Ok(())
    }

    fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
        if self.did_runtime_profile {
            return Ok(());
        }
        self.did_runtime_profile = true;

        let base = self.tuned_chunk_lanes.max(32);
        let max_chunk = self.max_chunk_lanes.max(base);
        let mut candidates = vec![
            base,
            Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
            Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
            max_chunk,
        ];
        candidates.sort_unstable();
        candidates.dedup();

        let mut best_lane = base;
        let mut best_hps = 0.0f64;
        for &lane in &candidates {
            if lane == 0 {
                continue;
            }
            self.ensure_resources(header_without_nonce.len(), lane)?;

            self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), 0)?;

            let start = Instant::now();
            self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), lane as u64)?;
            let elapsed = start.elapsed().as_secs_f64();
            if elapsed <= 0.0 {
                continue;
            }
            let hps = lane as f64 / elapsed;
            if hps > best_hps {
                best_hps = hps;
                best_lane = lane;
            }
        }

        self.tuned_chunk_lanes = best_lane.max(32);
        self.runtime_profile_block_size(header_without_nonce, self.tuned_chunk_lanes)?;
        self.chunk_lanes = 0;
        self.persist_tune_cache();
        Ok(())
    }

    fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
        if header_len > self.header_capacity {
            let new_capacity = header_len.next_power_of_two();
            self.d_header = Some(
                self.stream
                    .alloc_zeros::<u8>(new_capacity)
                    .map_err(|e| anyhow::anyhow!("Failed to alloc header buffer: {}", e))?,
            );
            self.header_capacity = new_capacity;
        }

        if self.chunk_lanes == chunk_lanes
            && self.d_params.len() == self.inflight_slots
            && self.d_hashes.len() == self.inflight_slots
            && self.d_scratchpads.len() == self.inflight_slots
        {
            return Ok(());
        }

        let hash_bytes = chunk_lanes.saturating_mul(32);
        let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY);
        if hash_bytes == 0 || scratch_bytes == 0 {
            anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
        }

        self.d_params.clear();
        self.d_hashes.clear();
        self.d_scratchpads.clear();

        for _ in 0..self.inflight_slots {
            self.d_params.push(
                self.stream
                    .alloc_zeros::<u8>(std::mem::size_of::<KernelParams>())
                    .map_err(|e| anyhow::anyhow!("Failed to alloc params: {}", e))?,
            );
            self.d_hashes.push(
                self.stream
                    .alloc_zeros::<u8>(hash_bytes)
                    .map_err(|e| anyhow::anyhow!("Failed to alloc hashes: {}", e))?,
            );
            self.d_scratchpads.push(
                self.stream
                    .alloc_zeros::<u8>(scratch_bytes)
                    .map_err(|e| anyhow::anyhow!("Failed to alloc scratchpads: {}", e))?,
            );
        }
        self.chunk_lanes = chunk_lanes;
        Ok(())
    }

    fn dispatch_chunk_sync(
        &mut self,
        slot: usize,
        chunk_lanes: usize,
        header_len: usize,
        start_nonce: u64,
    ) -> Result<Vec<u8>> {
        // Upload params as raw bytes
        let params = KernelParams {
            lanes: chunk_lanes as u32,
            rounds: ROUNDS as u32,
            header_len: header_len as u32,
            _pad0: 0,
            start_nonce,
        };
        let params_bytes: Vec<u8> = unsafe {
            std::slice::from_raw_parts(
                &params as *const KernelParams as *const u8,
                std::mem::size_of::<KernelParams>(),
            )
            .to_vec()
        };

        let d_params = self
            .d_params
            .get_mut(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        self.stream
            .memcpy_htod(&params_bytes, d_params)
            .map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?;

        // Launch kernel
        let grid_dim = ((chunk_lanes as u32).div_ceil(self.block_size), 1, 1);
        let block_dim = (self.block_size, 1, 1);
        let cfg = LaunchConfig {
            grid_dim,
            block_dim,
            shared_mem_bytes: 0,
        };

        // Get references for kernel args — header is read-only, others are read-write
        let d_header = self
            .d_header
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        let d_params = self
            .d_params
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        let d_scratch = self
            .d_scratchpads
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;

        unsafe {
            self.stream
                .launch_builder(&self.kernel)
                .arg(d_header)
                .arg(d_hashes)
                .arg(d_scratch)
                .arg(d_params)
                .launch(cfg)
                .map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?;
        }

        // Synchronize and read back hashes
        self.stream
            .synchronize()
            .map_err(|e| anyhow::anyhow!("CUDA synchronize failed: {}", e))?;

        let hash_bytes = chunk_lanes * 32;
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        let out: Vec<u8> = self
            .stream
            .memcpy_dtov(d_hashes)
            .map_err(|e| anyhow::anyhow!("Failed to copy hashes from device: {}", e))?;

        Ok(out[..hash_bytes].to_vec())
    }

    fn compute_batch_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<Vec<[u8; 32]>> {
        if lanes == 0 {
            return Ok(Vec::new());
        }
        if header_without_nonce.is_empty() {
            anyhow::bail!("header must not be empty");
        }

        if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if !self.did_runtime_profile {
            self.runtime_profile_chunk_lanes(header_without_nonce)?;
        }
        let chunk_lanes = self
            .tuned_chunk_lanes
            .min(self.max_chunk_lanes)
            .min(lanes)
            .max(1);
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

        // Upload header once
        let d_header = self
            .d_header
            .as_mut()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        self.stream
            .memcpy_htod(header_without_nonce, d_header)
            .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;

        let mut out_hashes = vec![[0u8; 32]; lanes];
        let mut lane_offset = 0usize;

        while lane_offset < lanes {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = (lane_offset / chunk_lanes) % self.inflight_slots;
            let this_start_nonce = start_nonce.saturating_add(lane_offset as u64);

            let raw = self.dispatch_chunk_sync(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;

            for i in 0..this_lanes {
                let src_offset = i * 32;
                out_hashes[lane_offset + i].copy_from_slice(&raw[src_offset..src_offset + 32]);
            }
            lane_offset += this_lanes;
        }

        Ok(out_hashes)
    }

    pub fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<Option<(u64, [u8; 32])>> {
        let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
        for (lane, hash) in hashes.into_iter().enumerate() {
            if meets_difficulty(&hash, difficulty) {
                return Ok(Some((start_nonce + lane as u64, hash)));
            }
        }
        Ok(None)
    }
}

impl crate::solver::Solver for CudaMiner {
    fn backend_name(&self) -> &'static str {
        "cuda"
    }

    fn recommended_lanes(&mut self, requested: usize) -> usize {
        self.recommended_lanes(requested)
    }

    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<crate::solver::ProofResult> {
        self.find_proof_batch(header_without_nonce, start_nonce, lanes, difficulty)
    }

    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        self.benchmark_hashes(header_without_nonce, start_nonce, lanes)
    }
}

// ── CUDA Kernel Source ──
// Direct translation of the Metal shader from metal_miner.rs.
// All crypto primitives are software implementations (no HW AES on GPU).

const CUDA_KERNEL_SOURCE: &str = r#"
extern "C" {

#define CHAINS 4u
#define SCRATCHPAD_SIZE 524288u
#define BLOCK_SIZE_BYTES 64u
#define BLOCKS_PER_SCRATCHPAD 8192u
#define ADDRESS_MASK 8191u
#define GOLDEN_RATIO 0x9E3779B97F4A7C15ull

#define B3_CHUNK_START 1u
#define B3_CHUNK_END   2u
#define B3_ROOT        8u

struct KernelParams {
    unsigned int lanes;
    unsigned int rounds;
    unsigned int header_len;
    unsigned int _pad0;
    unsigned long long start_nonce;
};

__constant__ unsigned char SBOX[256] = {
  0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
  0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
  0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
  0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
  0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
  0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
  0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
  0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
  0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
  0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
  0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
  0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
  0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
  0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
  0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
  0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
};

__constant__ unsigned int SHA256_K[64] = {
  0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
  0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
  0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
  0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
  0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
  0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
  0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};

__constant__ unsigned int SHA256_IV[8] = {
  0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
};

__constant__ unsigned int B3_IV[8] = {
  0x6A09E667,0xBB67AE85,0x3C6EF372,0xA54FF53A,0x510E527F,0x9B05688C,0x1F83D9AB,0x5BE0CD19
};

__constant__ unsigned short B3_S[7][16] = {
  {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
  {2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8},
  {3,4,10,12,13,2,7,14,6,5,9,0,11,15,8,1},
  {10,7,12,9,14,3,13,15,4,0,11,2,5,8,1,6},
  {12,13,9,11,15,10,14,8,7,2,5,3,0,1,6,4},
  {9,14,11,5,8,12,15,1,13,3,0,10,2,6,4,7},
  {11,15,5,0,1,9,8,6,14,10,2,12,3,4,7,13}
};

__device__ __forceinline__ unsigned char gf_mul2(unsigned char x) {
    unsigned char hi = x >> 7;
    return (unsigned char)((x << 1) ^ (hi * 0x1b));
}

__device__ __forceinline__ unsigned char gf_mul3(unsigned char x) {
    return (unsigned char)(gf_mul2(x) ^ x);
}

__device__ __forceinline__ unsigned int rotr(unsigned int x, unsigned int n) {
    return (x >> n) | (x << (32u - n));
}

__device__ __forceinline__ unsigned int rd_be32(const unsigned char* p) {
    return ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
           ((unsigned int)p[2] << 8) | (unsigned int)p[3];
}

__device__ __forceinline__ unsigned long long rd_le64(const unsigned char* p) {
    return (unsigned long long)p[0] |
           ((unsigned long long)p[1] << 8) |
           ((unsigned long long)p[2] << 16) |
           ((unsigned long long)p[3] << 24) |
           ((unsigned long long)p[4] << 32) |
           ((unsigned long long)p[5] << 40) |
           ((unsigned long long)p[6] << 48) |
           ((unsigned long long)p[7] << 56);
}

__device__ __forceinline__ void wr_be32(unsigned char* p, unsigned int v) {
    p[0] = (unsigned char)(v >> 24);
    p[1] = (unsigned char)(v >> 16);
    p[2] = (unsigned char)(v >> 8);
    p[3] = (unsigned char)(v);
}

__device__ __forceinline__ void wr_le32(unsigned char* p, unsigned int v) {
    p[0] = (unsigned char)(v);
    p[1] = (unsigned char)(v >> 8);
    p[2] = (unsigned char)(v >> 16);
    p[3] = (unsigned char)(v >> 24);
}

__device__ void aesenc_round(unsigned char state[16], const unsigned char rk[16]) {
    unsigned char s[16];
    for (unsigned int i = 0; i < 16; i++) s[i] = SBOX[state[i]];
    unsigned char t[16];
    for (unsigned int i = 0; i < 16; i++) t[i] = s[i];
    s[1]=t[5]; s[5]=t[9]; s[9]=t[13]; s[13]=t[1];
    s[2]=t[10]; s[6]=t[14]; s[10]=t[2]; s[14]=t[6];
    s[3]=t[15]; s[7]=t[3]; s[11]=t[7]; s[15]=t[11];
    unsigned char out[16];
    for (unsigned int c = 0; c < 4; c++) {
        unsigned int i = c * 4;
        unsigned char a0=s[i], a1=s[i+1], a2=s[i+2], a3=s[i+3];
        out[i]   = (unsigned char)(gf_mul2(a0) ^ gf_mul3(a1) ^ a2 ^ a3);
        out[i+1] = (unsigned char)(a0 ^ gf_mul2(a1) ^ gf_mul3(a2) ^ a3);
        out[i+2] = (unsigned char)(a0 ^ a1 ^ gf_mul2(a2) ^ gf_mul3(a3));
        out[i+3] = (unsigned char)(gf_mul3(a0) ^ a1 ^ a2 ^ gf_mul2(a3));
    }
    for (unsigned int i = 0; i < 16; i++) state[i] = (unsigned char)(out[i] ^ rk[i]);
}

__device__ void aes_expand_block(unsigned char state[16], const unsigned char key[16]) {
    aesenc_round(state, key);
    aesenc_round(state, key);
    aesenc_round(state, key);
    aesenc_round(state, key);
}

__device__ void sha256_compress(const unsigned char state_b[32], const unsigned char block[64], unsigned char out[32]) {
    unsigned int s[8];
    for (unsigned int i = 0; i < 8; i++) s[i] = rd_be32(&state_b[i*4]);
    unsigned int w[64];
    for (unsigned int i = 0; i < 16; i++) w[i] = rd_be32(&block[i*4]);
    for (unsigned int i = 16; i < 64; i++) {
        unsigned int s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
        unsigned int s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
        w[i] = w[i-16] + s0 + w[i-7] + s1;
    }
    unsigned int a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
    for (unsigned int i = 0; i < 64; i++) {
        unsigned int S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
        unsigned int ch = (e & f) ^ ((~e) & g);
        unsigned int t1 = h + S1 + ch + SHA256_K[i] + w[i];
        unsigned int S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
        unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
        unsigned int t2 = S0 + maj;
        h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
    }
    s[0]+=a; s[1]+=b; s[2]+=c; s[3]+=d; s[4]+=e; s[5]+=f; s[6]+=g; s[7]+=h;
    for (unsigned int i = 0; i < 8; i++) wr_be32(&out[i*4], s[i]);
}

__device__ void blake3_g(unsigned int v[16], unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int mx, unsigned int my) {
    v[a] = v[a] + v[b] + mx;
    v[d] = rotr(v[d] ^ v[a], 16);
    v[c] = v[c] + v[d];
    v[b] = rotr(v[b] ^ v[c], 12);
    v[a] = v[a] + v[b] + my;
    v[d] = rotr(v[d] ^ v[a], 8);
    v[c] = v[c] + v[d];
    v[b] = rotr(v[b] ^ v[c], 7);
}

__device__ void blake3_compress(const unsigned char state_b[32], const unsigned char block[64], unsigned char out[32]) {
    unsigned int h[8];
    unsigned int m[16];
    for (unsigned int i = 0; i < 8; i++)
        h[i] = (unsigned int)state_b[i*4] | ((unsigned int)state_b[i*4+1]<<8) |
               ((unsigned int)state_b[i*4+2]<<16) | ((unsigned int)state_b[i*4+3]<<24);
    for (unsigned int i = 0; i < 16; i++)
        m[i] = (unsigned int)block[i*4] | ((unsigned int)block[i*4+1]<<8) |
               ((unsigned int)block[i*4+2]<<16) | ((unsigned int)block[i*4+3]<<24);
    unsigned int v[16];
    for (unsigned int i = 0; i < 8; i++) { v[i] = h[i]; v[i+8] = B3_IV[i]; }
    for (unsigned int r = 0; r < 7; r++) {
        blake3_g(v,0,4,8,12,m[B3_S[r][0]],m[B3_S[r][1]]);
        blake3_g(v,1,5,9,13,m[B3_S[r][2]],m[B3_S[r][3]]);
        blake3_g(v,2,6,10,14,m[B3_S[r][4]],m[B3_S[r][5]]);
        blake3_g(v,3,7,11,15,m[B3_S[r][6]],m[B3_S[r][7]]);
        blake3_g(v,0,5,10,15,m[B3_S[r][8]],m[B3_S[r][9]]);
        blake3_g(v,1,6,11,12,m[B3_S[r][10]],m[B3_S[r][11]]);
        blake3_g(v,2,7,8,13,m[B3_S[r][12]],m[B3_S[r][13]]);
        blake3_g(v,3,4,9,14,m[B3_S[r][14]],m[B3_S[r][15]]);
    }
    for (unsigned int i = 0; i < 8; i++) {
        unsigned int x = v[i] ^ v[i+8];
        wr_le32(&out[i*4], x);
    }
}

__device__ void blake3_compress_hash(
    const unsigned int cv[8],
    const unsigned int block_words[16],
    unsigned int counter_low,
    unsigned int counter_high,
    unsigned int block_len,
    unsigned int flags,
    unsigned int out[16]
) {
    unsigned int v[16];
    for (unsigned int i = 0; i < 8; i++) v[i] = cv[i];
    v[8]  = B3_IV[0]; v[9]  = B3_IV[1]; v[10] = B3_IV[2]; v[11] = B3_IV[3];
    v[12] = counter_low; v[13] = counter_high; v[14] = block_len; v[15] = flags;
    for (unsigned int r = 0; r < 7; r++) {
        blake3_g(v,0,4,8,12,block_words[B3_S[r][0]],block_words[B3_S[r][1]]);
        blake3_g(v,1,5,9,13,block_words[B3_S[r][2]],block_words[B3_S[r][3]]);
        blake3_g(v,2,6,10,14,block_words[B3_S[r][4]],block_words[B3_S[r][5]]);
        blake3_g(v,3,7,11,15,block_words[B3_S[r][6]],block_words[B3_S[r][7]]);
        blake3_g(v,0,5,10,15,block_words[B3_S[r][8]],block_words[B3_S[r][9]]);
        blake3_g(v,1,6,11,12,block_words[B3_S[r][10]],block_words[B3_S[r][11]]);
        blake3_g(v,2,7,8,13,block_words[B3_S[r][12]],block_words[B3_S[r][13]]);
        blake3_g(v,3,4,9,14,block_words[B3_S[r][14]],block_words[B3_S[r][15]]);
    }
    for (unsigned int i = 0; i < 8; i++) {
        out[i]   = v[i] ^ v[i+8];
        out[i+8] = v[i+8] ^ cv[i];
    }
}

__device__ void blake3_compress_inplace(
    unsigned int cv[8],
    const unsigned int block_words[16],
    unsigned int counter_low, unsigned int counter_high,
    unsigned int block_len, unsigned int flags
) {
    unsigned int out_words[16];
    blake3_compress_hash(cv, block_words, counter_low, counter_high, block_len, flags, out_words);
    for (unsigned int i = 0; i < 8; i++) cv[i] = out_words[i];
}

__device__ unsigned char header_nonce_byte(const unsigned char* header, unsigned int header_len, unsigned long long nonce, unsigned int idx) {
    if (idx < header_len) return header[idx];
    unsigned int j = idx - header_len;
    if (j < 8) return (unsigned char)((nonce >> (j * 8)) & 0xffull);
    return 0;
}

__device__ void blake3_hash_header_nonce(
    const unsigned char* header, unsigned int header_len,
    unsigned long long nonce, unsigned char out[32]
) {
    unsigned int total_len = header_len + 8;
    unsigned int blocks = (total_len + 63) / 64;
    unsigned int cv[8];
    for (unsigned int i = 0; i < 8; i++) cv[i] = B3_IV[i];
    for (unsigned int b = 0; b < blocks; b++) {
        unsigned int off = b * 64;
        unsigned int blen = 64;
        if (total_len - off < 64) blen = total_len - off;
        unsigned int words[16];
        for (unsigned int i = 0; i < 16; i++) words[i] = 0;
        for (unsigned int i = 0; i < blen; i++) {
            unsigned char by = header_nonce_byte(header, header_len, nonce, off + i);
            words[i / 4] |= ((unsigned int)by) << ((i % 4) * 8);
        }
        unsigned int flags = 0;
        if (b == 0) flags |= B3_CHUNK_START;
        int last = (b + 1 == blocks) ? 1 : 0;
        if (last) flags |= B3_CHUNK_END;
        if (!last) {
            blake3_compress_inplace(cv, words, 0, 0, blen, flags);
        } else {
            unsigned int out_words[16];
            blake3_compress_hash(cv, words, 0, 0, blen, flags | B3_ROOT, out_words);
            for (unsigned int i = 0; i < 8; i++) wr_le32(&out[i*4], out_words[i]);
        }
    }
}

__device__ void sha256_digest_32(const unsigned char input[32], unsigned char out[32]) {
    unsigned int state[8];
    for (unsigned int i = 0; i < 8; i++) state[i] = SHA256_IV[i];
    unsigned char block[64];
    for (unsigned int i = 0; i < 32; i++) block[i] = input[i];
    block[32] = 0x80;
    for (unsigned int i = 33; i < 56; i++) block[i] = 0;
    block[56] = 0; block[57] = 0; block[58] = 0; block[59] = 0;
    block[60] = 0; block[61] = 0; block[62] = 1; block[63] = 0;
    unsigned int w[64];
    for (unsigned int i = 0; i < 16; i++) w[i] = rd_be32(&block[i*4]);
    for (unsigned int i = 16; i < 64; i++) {
        unsigned int s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
        unsigned int s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
        w[i] = w[i-16] + s0 + w[i-7] + s1;
    }
    unsigned int a=state[0],b=state[1],c=state[2],d=state[3],e=state[4],f=state[5],g=state[6],h=state[7];
    for (unsigned int i = 0; i < 64; i++) {
        unsigned int S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
        unsigned int ch = (e & f) ^ ((~e) & g);
        unsigned int t1 = h + S1 + ch + SHA256_K[i] + w[i];
        unsigned int S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
        unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
        unsigned int t2 = S0 + maj;
        h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
    }
    state[0]+=a; state[1]+=b; state[2]+=c; state[3]+=d;
    state[4]+=e; state[5]+=f; state[6]+=g; state[7]+=h;
    for (unsigned int i = 0; i < 8; i++) wr_be32(&out[i*4], state[i]);
}

__device__ void blake3_hash_32(const unsigned char input[32], unsigned char out[32]) {
    unsigned int cv[8];
    for (unsigned int i = 0; i < 8; i++) cv[i] = B3_IV[i];
    unsigned int block_words[16];
    for (unsigned int i = 0; i < 8; i++) {
        block_words[i] = (unsigned int)input[i*4] | ((unsigned int)input[i*4+1] << 8) |
                         ((unsigned int)input[i*4+2] << 16) | ((unsigned int)input[i*4+3] << 24);
    }
    for (unsigned int i = 8; i < 16; i++) block_words[i] = 0;
    unsigned int out_words[16];
    blake3_compress_hash(cv, block_words, 0, 0, 32, B3_CHUNK_START | B3_CHUNK_END | B3_ROOT, out_words);
    for (unsigned int i = 0; i < 8; i++) wr_le32(&out[i*4], out_words[i]);
}

__device__ unsigned int compute_addr(const unsigned char state[32], unsigned int round) {
    const unsigned long long C = 0x517cc1b727220a95ull;
    unsigned long long lo = rd_le64(&state[0]);
    unsigned long long hi = rd_le64(&state[8]);
    unsigned long long ru = (unsigned long long)round;
    unsigned long long mixed = lo ^ hi ^ ((ru << 13) | (ru >> (64 - 13))) ^ (ru * C);
    return (unsigned int)((mixed & ADDRESS_MASK) * BLOCK_SIZE_BYTES);
}

__device__ void uhash_round(
    unsigned char* chain_scratch, unsigned char state[32],
    unsigned int initial_primitive, unsigned int r
) {
    unsigned int addr = compute_addr(state, r);
    unsigned char block[64];
    for (unsigned int i = 0; i < 64; i++) block[i] = chain_scratch[addr + i];
    unsigned int primitive = (initial_primitive + r + 1) % 3;
    unsigned char new_state[32];
    if (primitive == 0) {
        unsigned char lo[16]; unsigned char hi[16];
        for (unsigned int i = 0; i < 16; i++) { lo[i] = state[i]; hi[i] = state[16+i]; }
        aesenc_round(lo, &block[0]);
        aesenc_round(lo, &block[16]);
        aesenc_round(lo, &block[32]);
        aesenc_round(lo, &block[48]);
        aesenc_round(hi, &block[32]);
        aesenc_round(hi, &block[48]);
        aesenc_round(hi, &block[0]);
        aesenc_round(hi, &block[16]);
        for (unsigned int i = 0; i < 16; i++) { new_state[i] = lo[i]; new_state[16+i] = hi[i]; }
    } else if (primitive == 1) {
        sha256_compress(state, block, new_state);
    } else {
        blake3_compress(state, block, new_state);
    }
    for (unsigned int i = 0; i < 32; i++) chain_scratch[addr+i] = new_state[i];
    for (unsigned int i = 0; i < 32; i++) state[i] = new_state[i];
}

__global__ void uhash_kernel(
    const unsigned char* header,
    unsigned char* hashes_out,
    unsigned char* scratchpads,
    const unsigned char* params_raw
) {
    unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
    const struct KernelParams* params = (const struct KernelParams*)params_raw;
    if (gid >= params->lanes) return;

    unsigned char* lane_scratch = scratchpads + (unsigned long long)gid * (unsigned long long)(CHAINS * SCRATCHPAD_SIZE);
    unsigned char states[CHAINS][32];
    unsigned long long nonce_gid = params->start_nonce + (unsigned long long)gid;

    for (unsigned int c = 0; c < CHAINS; c++) {
        unsigned long long modified_nonce = nonce_gid ^ ((unsigned long long)c * GOLDEN_RATIO);
        blake3_hash_header_nonce(header, params->header_len, modified_nonce, states[c]);
        unsigned char key[16]; unsigned char st[16];
        for (unsigned int i = 0; i < 16; i++) { key[i] = states[c][i]; st[i] = states[c][16+i]; }
        unsigned char* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
        for (unsigned int b = 0; b < BLOCKS_PER_SCRATCHPAD; b++) {
            aes_expand_block(st, key);
            unsigned int off = b * BLOCK_SIZE_BYTES;
            unsigned char st2[16];
            for (unsigned int i = 0; i < 16; i++) st2[i] = st[i];
            aes_expand_block(st2, key);
            for (unsigned int i = 0; i < 16; i++) {
                chain_scratch[off+i]    = st[i];
                chain_scratch[off+16+i] = st2[i];
                chain_scratch[off+32+i] = st[i];
                chain_scratch[off+48+i] = st2[i];
            }
        }
    }

    for (unsigned int c = 0; c < CHAINS; c++) {
        unsigned char* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
        unsigned int initial_primitive = (unsigned int)((nonce_gid + (unsigned long long)c) % 3ull);
        #pragma unroll 2
        for (unsigned int r = 0; r < params->rounds; r++) {
            uhash_round(chain_scratch, states[c], initial_primitive, r);
        }
    }

    unsigned char combined[32];
    for (unsigned int i = 0; i < 32; i++) combined[i] = 0;
    for (unsigned int c = 0; c < CHAINS; c++)
        for (unsigned int i = 0; i < 32; i++) combined[i] ^= states[c][i];
    unsigned char sha_out[32];
    sha256_digest_32(combined, sha_out);
    unsigned char final_out[32];
    blake3_hash_32(sha_out, final_out);
    unsigned char* out = hashes_out + gid * 32;
    for (unsigned int i = 0; i < 32; i++) out[i] = final_out[i];
}

} // extern "C"
"#;

#[cfg(test)]
mod tests {
    use super::CudaMiner;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    #[test]
    fn cuda_hash_matches_cpu_for_single_nonce() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());

        let nonce = 42u64;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);

        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);

        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "CUDA hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    #[test]
    fn cuda_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());

        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);

        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);

            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    #[test]
    #[ignore = "performance profile for local CUDA throughput validation"]
    fn cuda_release_throughput_profile() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1cudaprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());

        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;

        for lanes in lane_sets {
            let _ = miner
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);

            let start = Instant::now();
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = miner
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "cuda-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}