uhash-prover 0.4.1

UniversalHash solver backends (CPU/GPU) without chain transport
//! OpenCL GPU solver for AMD/Intel/Apple GPUs.
//!
//! Enable with `--features gpu-opencl`. Uses the `opencl3` crate for
//! OpenCL 1.2+ compatible GPU compute.
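//!
//! A minimal usage sketch (illustrative, not compiled; assumes `OpenClSolver`
//! and the `Solver` trait from `crate::solver` are already in scope, and uses
//! placeholder header and difficulty values):
//!
//! ```ignore
//! let mut solver = OpenClSolver::new()?;
//! let lanes = solver.recommended_lanes(0); // 0 lets the solver pick a batch size
//! let header = [0u8; 72];                  // placeholder: prev-hash + address + timestamp, no nonce bytes
//! let difficulty = 8u32;                   // placeholder difficulty value
//! if let Some((nonce, hash)) = solver.find_proof_batch(&header, 0, lanes, difficulty)? {
//!     println!("nonce {nonce} -> {}", hex::encode(hash));
//! }
//! ```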

use anyhow::Result;
use opencl3::command_queue::{
    CommandQueue, CL_BLOCKING, CL_NON_BLOCKING, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
    CL_QUEUE_PROFILING_ENABLE,
};
use opencl3::context::Context;
use opencl3::device::{get_all_devices, Device, CL_DEVICE_TYPE_GPU};
use opencl3::event::Event;
use opencl3::kernel::{ExecuteKernel, Kernel};
use opencl3::memory::{Buffer, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY};
use opencl3::program::Program;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::ptr;
use std::time::Instant;
use uhash_core::meets_difficulty;

use crate::solver::{ProofResult, Solver};

const CHAINS: usize = 4;
const SCRATCHPAD_SIZE: usize = 512 * 1024;
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
const ROUNDS: usize = 12_288;
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
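// Memory footprint note: each lane owns CHAINS * SCRATCHPAD_SIZE = 2 MiB of device
// scratchpad plus a 32-byte hash slot; `autotune_chunk_limits` budgets ~70% of the
// device's reported global memory across all in-flight slots using this per-lane figure.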

const OPENCL_KERNEL_SOURCE: &str = include_str!("../../kernels/uhash.cl");

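/// Kernel argument block uploaded to the device before each dispatch.
///
/// `#[repr(C)]` fixes the byte layout the OpenCL kernel reads: the three `u32`
/// counters plus the explicit `_pad0` fill the first 16 bytes so `start_nonce`
/// lands at an 8-byte-aligned offset (24 bytes total, no implicit padding).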
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct KernelParams {
    lanes: u32,
    rounds: u32,
    header_len: u32,
    _pad0: u32,
    start_nonce: u64,
}

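/// Bookkeeping for one in-flight asynchronous dispatch: which buffer slot it
/// occupies, where its lanes land in the caller's output, and the OpenCL event
/// of the final non-blocking readback to wait on before touching `h_hashes[slot]`.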
struct PendingDispatch {
    slot: usize,
    output_lane_offset: usize,
    lanes: usize,
    read_event: Event,
}

#[derive(Clone, Debug, Serialize)]
pub struct OpenClTelemetry {
    pub device_name: String,
    pub device_vendor: String,
    pub max_compute_units: u32,
    pub max_work_group_size: usize,
    pub global_mem_bytes: u64,
    pub inflight_slots: usize,
    pub tuned_chunk_lanes: usize,
    pub max_chunk_lanes: usize,
    pub runtime_profiled: bool,
}

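/// Autotune results persisted to
/// `$XDG_CONFIG_HOME/uhash/opencl_tuning_<device>.json` (or `~/.config/uhash/...`,
/// see `tune_cache_path`). Only `version == 2` caches are honored on load.
/// Illustrative payload (values are examples, not measurements):
///
/// ```json
/// {
///   "version": 2,
///   "device_name": "Example_GPU",
///   "tuned_chunk_lanes": 512,
///   "max_chunk_lanes": 2048,
///   "inflight_slots": 3,
///   "work_group_size": 256
/// }
/// ```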
#[derive(Debug, Serialize, Deserialize)]
struct OpenClTuneCache {
    version: u32,
    device_name: String,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    work_group_size: usize,
}

pub struct OpenClSolver {
    context: Context,
    queue: CommandQueue,
    kernel: Kernel,
    // Per-slot device buffers
    d_params: Vec<Buffer<u8>>,
    d_hashes: Vec<Buffer<u8>>,
    d_scratchpads: Vec<Buffer<u8>>,
    d_header: Option<Buffer<u8>>,
    header_capacity: usize,
    // Per-slot host readback buffers for non-blocking reads
    h_hashes: Vec<Vec<u8>>,
    // Tuning
    chunk_lanes: usize,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    work_group_size: usize,
    did_runtime_profile: bool,
    // Device info
    device_name: String,
    device_vendor: String,
    max_compute_units: u32,
    max_work_group_size: usize,
    global_mem_bytes: u64,
}

impl OpenClSolver {
    pub fn new() -> Result<Self> {
        let device_id = *get_all_devices(CL_DEVICE_TYPE_GPU)
            .map_err(|e| anyhow::anyhow!("OpenCL device enumeration failed: {}", e))?
            .first()
            .ok_or_else(|| anyhow::anyhow!("No OpenCL GPU device found"))?;

        let device = Device::new(device_id);

        let device_name = device
            .name()
            .map_err(|e| anyhow::anyhow!("Failed to get device name: {}", e))?;
        let device_vendor = device
            .vendor()
            .map_err(|e| anyhow::anyhow!("Failed to get device vendor: {}", e))?;
        let max_compute_units = device
            .max_compute_units()
            .map_err(|e| anyhow::anyhow!("Failed to get max compute units: {}", e))?;
        let max_work_group_size = device
            .max_work_group_size()
            .map_err(|e| anyhow::anyhow!("Failed to get max work group size: {}", e))?;
        let global_mem_bytes = device
            .global_mem_size()
            .map_err(|e| anyhow::anyhow!("Failed to get global memory: {}", e))?;

        let context = Context::from_device(&device)
            .map_err(|e| anyhow::anyhow!("OpenCL context creation failed: {}", e))?;

        // Try out-of-order queue first (allows overlapping dispatch + readback),
        // then in-order with profiling, then 1.x fallback for macOS (OpenCL 1.2).
        let queue = CommandQueue::create_default_with_properties(
            &context,
            CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
            0,
        )
        .or_else(|_| {
            CommandQueue::create_default_with_properties(&context, CL_QUEUE_PROFILING_ENABLE, 0)
        })
        .or_else(|_| CommandQueue::create_default(&context, CL_QUEUE_PROFILING_ENABLE))
        .map_err(|e| anyhow::anyhow!("OpenCL command queue creation failed: {}", e))?;

        let program = Program::create_and_build_from_source(&context, OPENCL_KERNEL_SOURCE, "")
            .map_err(|e| anyhow::anyhow!("OpenCL kernel compilation failed: {}", e))?;

        let kernel = Kernel::create(&program, "uhash_kernel")
            .map_err(|e| anyhow::anyhow!("Failed to create uhash_kernel: {}", e))?;

        let mut solver = Self {
            context,
            queue,
            kernel,
            d_params: Vec::new(),
            d_hashes: Vec::new(),
            d_scratchpads: Vec::new(),
            d_header: None,
            header_capacity: 0,
            h_hashes: Vec::new(),
            chunk_lanes: 0,
            tuned_chunk_lanes: 0,
            max_chunk_lanes: 0,
            inflight_slots: DEFAULT_INFLIGHT_SLOTS,
            work_group_size: 256,
            did_runtime_profile: false,
            device_name,
            device_vendor,
            max_compute_units,
            max_work_group_size,
            global_mem_bytes,
        };
        solver.try_load_tune_cache();
        Ok(solver)
    }

    pub fn telemetry(&self) -> OpenClTelemetry {
        OpenClTelemetry {
            device_name: self.device_name.clone(),
            device_vendor: self.device_vendor.clone(),
            max_compute_units: self.max_compute_units,
            max_work_group_size: self.max_work_group_size,
            global_mem_bytes: self.global_mem_bytes,
            inflight_slots: self.inflight_slots,
            tuned_chunk_lanes: self.tuned_chunk_lanes,
            max_chunk_lanes: self.max_chunk_lanes,
            runtime_profiled: self.did_runtime_profile,
        }
    }

    fn tune_cache_path(&self) -> Option<PathBuf> {
        let base = std::env::var_os("XDG_CONFIG_HOME")
            .map(PathBuf::from)
            .or_else(|| std::env::var_os("HOME").map(|h| PathBuf::from(h).join(".config")))?;
        let safe_name: String = self
            .device_name
            .chars()
            .map(|c| if c.is_alphanumeric() { c } else { '_' })
            .collect();
        Some(
            base.join("uhash")
                .join(format!("opencl_tuning_{}.json", safe_name)),
        )
    }

    fn try_load_tune_cache(&mut self) {
        let Some(path) = self.tune_cache_path() else {
            return;
        };
        let Ok(raw) = fs::read(&path) else {
            return;
        };
        let Ok(cache) = serde_json::from_slice::<OpenClTuneCache>(&raw) else {
            return;
        };
        if cache.version != 2 {
            return;
        }
        let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
        let cap_max = heuristic_max.max(32);
        let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
        let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
        if chunk % 32 != 0 || max_chunk % 32 != 0 {
            return;
        }
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
        self.inflight_slots = cache.inflight_slots.clamp(2, 6);
        self.work_group_size = cache.work_group_size.clamp(64, 1024);
    }

    fn persist_tune_cache(&self) {
        let Some(path) = self.tune_cache_path() else {
            return;
        };
        let cache = OpenClTuneCache {
            version: 2,
            device_name: self.device_name.clone(),
            tuned_chunk_lanes: self.tuned_chunk_lanes,
            max_chunk_lanes: self.max_chunk_lanes,
            inflight_slots: self.inflight_slots,
            work_group_size: self.work_group_size,
        };
        let Ok(raw) = serde_json::to_vec_pretty(&cache) else {
            return;
        };
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, raw);
    }

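    /// Heuristic chunk-size limits derived from device memory, used before any
    /// runtime profiling. Worked example: a device reporting 8 GiB yields a
    /// ~5.6 GiB budget (70%); with 3 in-flight slots and ~2 MiB per lane that
    /// allows ~955 lanes, rounded down to a multiple of 32 => (tuned, max) = (928, 928).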
    fn autotune_chunk_limits(&self) -> (usize, usize) {
        let mem_budget = (self.global_mem_bytes as usize).saturating_mul(7) / 10;
        let per_lane = TOTAL_MEMORY + 32;
        let max_by_mem = mem_budget
            .saturating_div(per_lane.saturating_mul(self.inflight_slots))
            .max(1);
        let max_chunk = max_by_mem.clamp(32, 8192);
        let tuned = max_chunk.min(1024);
        let tuned = (tuned / 32).max(1) * 32;
        let max_chunk = (max_chunk / 32).max(1) * 32;
        (tuned, max_chunk)
    }

    /// Rounds `v` down to a multiple of `m`, but never below one full multiple.
    fn round_to_multiple(v: usize, m: usize) -> usize {
        (v / m).max(1) * m
    }

    fn runtime_profile_work_group_size(
        &mut self,
        header_without_nonce: &[u8],
        chunk_lanes: usize,
    ) -> Result<()> {
        let candidates = [64usize, 128, 256];
        let current = self.work_group_size;
        let mut best_wgs = current;
        let mut best_hps = 0.0f64;

        for &wgs in &candidates {
            if wgs > self.max_work_group_size {
                continue;
            }
            self.work_group_size = wgs;
            self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

            // warmup
            self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 0)?;

            let start = Instant::now();
            self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 1)?;
            let elapsed = start.elapsed().as_secs_f64();
            if elapsed <= 0.0 {
                continue;
            }
            let hps = chunk_lanes as f64 / elapsed;
            if hps > best_hps {
                best_hps = hps;
                best_wgs = wgs;
            }
        }

        self.work_group_size = best_wgs;
        self.chunk_lanes = 0;
        Ok(())
    }

    fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
        if self.did_runtime_profile {
            return Ok(());
        }
        self.did_runtime_profile = true;

        let base = self.tuned_chunk_lanes.max(32);
        let max_chunk = self.max_chunk_lanes.max(base);
        let mut candidates = vec![
            base,
            Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
            Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
            max_chunk,
        ];
        candidates.sort_unstable();
        candidates.dedup();

        let mut best_lane = base;
        let mut best_hps = 0.0f64;
        for &lane in &candidates {
            if lane == 0 {
                continue;
            }
            self.ensure_resources(header_without_nonce.len(), lane)?;

            // warmup
            self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), 0)?;

            let start = Instant::now();
            self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), lane as u64)?;
            let elapsed = start.elapsed().as_secs_f64();
            if elapsed <= 0.0 {
                continue;
            }
            let hps = lane as f64 / elapsed;
            if hps > best_hps {
                best_hps = hps;
                best_lane = lane;
            }
        }

        self.tuned_chunk_lanes = best_lane.max(32);
        self.runtime_profile_work_group_size(header_without_nonce, self.tuned_chunk_lanes)?;
        self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
        self.chunk_lanes = 0;
        self.persist_tune_cache();
        Ok(())
    }

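    /// (Re)allocates the per-slot device buffers and host readback buffers when
    /// the header length, chunk size, or number of in-flight slots changes.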
    fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
        if header_len > self.header_capacity {
            let new_capacity = header_len.next_power_of_two();
            self.d_header = Some(unsafe {
                Buffer::<u8>::create(
                    &self.context,
                    CL_MEM_READ_ONLY,
                    new_capacity,
                    ptr::null_mut(),
                )
                .map_err(|e| anyhow::anyhow!("Failed to alloc header buffer: {}", e))?
            });
            self.header_capacity = new_capacity;
        }

        if self.chunk_lanes == chunk_lanes
            && self.d_params.len() == self.inflight_slots
            && self.d_hashes.len() == self.inflight_slots
            && self.d_scratchpads.len() == self.inflight_slots
            && self.h_hashes.len() == self.inflight_slots
        {
            return Ok(());
        }

        let hash_bytes = chunk_lanes.saturating_mul(32);
        let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY);
        if hash_bytes == 0 || scratch_bytes == 0 {
            anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
        }

        self.d_params.clear();
        self.d_hashes.clear();
        self.d_scratchpads.clear();
        self.h_hashes.clear();

        let params_size = std::mem::size_of::<KernelParams>();

        for _ in 0..self.inflight_slots {
            self.d_params.push(unsafe {
                Buffer::<u8>::create(
                    &self.context,
                    CL_MEM_READ_ONLY,
                    params_size,
                    ptr::null_mut(),
                )
                .map_err(|e| anyhow::anyhow!("Failed to alloc params: {}", e))?
            });
            self.d_hashes.push(unsafe {
                Buffer::<u8>::create(
                    &self.context,
                    CL_MEM_WRITE_ONLY,
                    hash_bytes,
                    ptr::null_mut(),
                )
                .map_err(|e| anyhow::anyhow!("Failed to alloc hashes: {}", e))?
            });
            self.d_scratchpads.push(unsafe {
                Buffer::<u8>::create(
                    &self.context,
                    CL_MEM_READ_WRITE,
                    scratch_bytes,
                    ptr::null_mut(),
                )
                .map_err(|e| anyhow::anyhow!("Failed to alloc scratchpads: {}", e))?
            });
            self.h_hashes.push(vec![0u8; hash_bytes]);
        }
        self.chunk_lanes = chunk_lanes;
        Ok(())
    }

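    /// Blocking variant used only by the runtime profilers: uploads params,
    /// launches one kernel, waits for it, and reads the hashes back synchronously.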
    fn dispatch_chunk_sync(
        &mut self,
        slot: usize,
        chunk_lanes: usize,
        header_len: usize,
        start_nonce: u64,
    ) -> Result<Vec<u8>> {
        let params = KernelParams {
            lanes: chunk_lanes as u32,
            rounds: ROUNDS as u32,
            header_len: header_len as u32,
            _pad0: 0,
            start_nonce,
        };
        let params_bytes: &[u8] = unsafe {
            std::slice::from_raw_parts(
                &params as *const KernelParams as *const u8,
                std::mem::size_of::<KernelParams>(),
            )
        };

        let d_params = self
            .d_params
            .get_mut(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        unsafe {
            self.queue
                .enqueue_write_buffer(d_params, CL_BLOCKING, 0, params_bytes, &[])
                .map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?;
        }

        // Get raw cl_mem handles for kernel arguments
        let d_header = self
            .d_header
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        let d_params = self
            .d_params
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        let d_scratch = self
            .d_scratchpads
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;

        // Round global size up to multiple of work group size
        let global_size = chunk_lanes.div_ceil(self.work_group_size) * self.work_group_size;

        let kernel_event = unsafe {
            ExecuteKernel::new(&self.kernel)
                .set_arg(d_header)
                .set_arg(d_hashes)
                .set_arg(d_scratch)
                .set_arg(d_params)
                .set_global_work_size(global_size)
                .set_local_work_size(self.work_group_size)
                .enqueue_nd_range(&self.queue)
                .map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?
        };

        kernel_event
            .wait()
            .map_err(|e| anyhow::anyhow!("Kernel wait failed: {}", e))?;

        // Read back hashes
        let hash_bytes = chunk_lanes * 32;
        let mut out = vec![0u8; hash_bytes];
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        unsafe {
            self.queue
                .enqueue_read_buffer(d_hashes, CL_BLOCKING, 0, &mut out, &[])
                .map_err(|e| anyhow::anyhow!("Failed to read hashes from device: {}", e))?;
        }

        Ok(out)
    }

    /// Dispatch a chunk asynchronously: enqueue param upload → kernel → hash readback
    /// without blocking. Returns the Event for the final read operation.
    /// The results will land in `self.h_hashes[slot]` when the event completes.
    fn dispatch_chunk_async(
        &mut self,
        slot: usize,
        chunk_lanes: usize,
        header_len: usize,
        start_nonce: u64,
    ) -> Result<Event> {
        let params = KernelParams {
            lanes: chunk_lanes as u32,
            rounds: ROUNDS as u32,
            header_len: header_len as u32,
            _pad0: 0,
            start_nonce,
        };
        let params_bytes: &[u8] = unsafe {
            std::slice::from_raw_parts(
                &params as *const KernelParams as *const u8,
                std::mem::size_of::<KernelParams>(),
            )
        };

        // Non-blocking param upload
        let d_params = self
            .d_params
            .get_mut(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        let write_event = unsafe {
            self.queue
                .enqueue_write_buffer(d_params, CL_NON_BLOCKING, 0, params_bytes, &[])
                .map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?
        };

        // Kernel dispatch — wait on param upload
        let d_header = self
            .d_header
            .as_ref()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        let d_params = self
            .d_params
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        let d_scratch = self
            .d_scratchpads
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;

        let global_size = chunk_lanes.div_ceil(self.work_group_size) * self.work_group_size;

        let kernel_event = unsafe {
            ExecuteKernel::new(&self.kernel)
                .set_arg(d_header)
                .set_arg(d_hashes)
                .set_arg(d_scratch)
                .set_arg(d_params)
                .set_global_work_size(global_size)
                .set_local_work_size(self.work_group_size)
                .set_wait_event(&write_event)
                .enqueue_nd_range(&self.queue)
                .map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?
        };

        // Non-blocking hash readback — wait on kernel completion
        let hash_bytes = chunk_lanes * 32;
        let d_hashes = self
            .d_hashes
            .get(slot)
            .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
        let h_buf = self
            .h_hashes
            .get_mut(slot)
            .ok_or_else(|| anyhow::anyhow!("host hash slot {} missing", slot))?;
        let read_event = unsafe {
            self.queue
                .enqueue_read_buffer(
                    d_hashes,
                    CL_NON_BLOCKING,
                    0,
                    &mut h_buf[..hash_bytes],
                    &[kernel_event.get()],
                )
                .map_err(|e| anyhow::anyhow!("Failed to enqueue hash readback: {}", e))?
        };

        Ok(read_event)
    }

    fn runtime_profile_inflight_slots(
        &mut self,
        header_without_nonce: &[u8],
        chunk_lanes: usize,
    ) -> Result<()> {
        let candidates = [2usize, 3, 4];
        let original = self.inflight_slots;
        let mut best_slots = original;
        let mut best_hps = 0.0f64;

        for &slots in &candidates {
            self.inflight_slots = slots;
            self.chunk_lanes = 0;
            self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

            let test_lanes = chunk_lanes
                .saturating_mul(slots)
                .saturating_mul(2)
                .max(chunk_lanes);
            let start = Instant::now();
            let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
            let elapsed = start.elapsed().as_secs_f64();
            if elapsed <= 0.0 || done == 0 {
                continue;
            }
            let hps = done as f64 / elapsed;
            if hps > best_hps {
                best_hps = hps;
                best_slots = slots;
            }
        }

        self.inflight_slots = best_slots;
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        Ok(())
    }

    /// Pipelined batch hash computation — runs multiple chunks through the async
    /// pipeline simultaneously, overlapping GPU compute with host readback.
    fn compute_batch_count(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        if lanes == 0 {
            return Ok(0);
        }

        let chunk_lanes = self
            .tuned_chunk_lanes
            .min(self.max_chunk_lanes)
            .min(lanes)
            .max(1);
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

        // Upload header once
        let d_header = self
            .d_header
            .as_mut()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        unsafe {
            self.queue
                .enqueue_write_buffer(d_header, CL_BLOCKING, 0, header_without_nonce, &[])
                .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;
        }

        let mut pending_by_slot: Vec<Option<PendingDispatch>> =
            (0..self.inflight_slots).map(|_| None).collect();
        let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
        let mut in_flight = 0usize;
        let mut lane_offset = 0usize;
        let mut total_done = 0usize;

        while lane_offset < lanes || in_flight > 0 {
            // Fill pipeline with available slots
            while lane_offset < lanes && !available_slots.is_empty() {
                let this_lanes = (lanes - lane_offset).min(chunk_lanes);
                let slot = available_slots.pop_front().expect("slot available");
                let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);

                let read_event = self.dispatch_chunk_async(
                    slot,
                    this_lanes,
                    header_without_nonce.len(),
                    this_start_nonce,
                )?;

                pending_by_slot[slot] = Some(PendingDispatch {
                    slot,
                    output_lane_offset: lane_offset,
                    lanes: this_lanes,
                    read_event,
                });
                in_flight += 1;
                lane_offset += this_lanes;
            }

            if in_flight == 0 {
                break;
            }

            // Wait for the lowest-index pending slot (completion order does not affect the count)
            let oldest_slot = pending_by_slot
                .iter()
                .enumerate()
                .find_map(|(i, p)| p.as_ref().map(|_| i))
                .ok_or_else(|| anyhow::anyhow!("no pending slot found but in_flight > 0"))?;

            let done = pending_by_slot[oldest_slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("pending slot vanished"))?;
            done.read_event
                .wait()
                .map_err(|e| anyhow::anyhow!("Read event wait failed: {}", e))?;

            total_done += done.lanes;
            available_slots.push_back(oldest_slot);
            in_flight = in_flight.saturating_sub(1);
        }

        Ok(total_done)
    }

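    /// Full pipelined batch: lazily autotunes and profiles on first use, uploads
    /// the header once, then streams chunks through `inflight_slots` asynchronous
    /// dispatches, copying each completed readback into the caller-ordered output.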
    fn compute_batch_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<Vec<[u8; 32]>> {
        if lanes == 0 {
            return Ok(Vec::new());
        }
        if header_without_nonce.is_empty() {
            anyhow::bail!("header must not be empty");
        }

        if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if !self.did_runtime_profile {
            self.runtime_profile_chunk_lanes(header_without_nonce)?;
        }
        let chunk_lanes = self
            .tuned_chunk_lanes
            .min(self.max_chunk_lanes)
            .min(lanes)
            .max(1);
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;

        // Upload header once
        let d_header = self
            .d_header
            .as_mut()
            .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
        unsafe {
            self.queue
                .enqueue_write_buffer(d_header, CL_BLOCKING, 0, header_without_nonce, &[])
                .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;
        }

        let mut out_hashes = vec![[0u8; 32]; lanes];
        let mut pending_by_slot: Vec<Option<PendingDispatch>> =
            (0..self.inflight_slots).map(|_| None).collect();
        let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
        let mut in_flight = 0usize;
        let mut lane_offset = 0usize;

        while lane_offset < lanes || in_flight > 0 {
            // Fill pipeline: dispatch to all available slots
            while lane_offset < lanes && !available_slots.is_empty() {
                let this_lanes = (lanes - lane_offset).min(chunk_lanes);
                let slot = available_slots.pop_front().expect("slot available");
                let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);

                let read_event = self.dispatch_chunk_async(
                    slot,
                    this_lanes,
                    header_without_nonce.len(),
                    this_start_nonce,
                )?;

                pending_by_slot[slot] = Some(PendingDispatch {
                    slot,
                    output_lane_offset: lane_offset,
                    lanes: this_lanes,
                    read_event,
                });
                in_flight += 1;
                lane_offset += this_lanes;
            }

            if in_flight == 0 {
                break;
            }

            // Wait for the lowest-index pending slot; each dispatch records its own
            // output offset, so completion order does not affect result placement.
            let oldest_slot = pending_by_slot
                .iter()
                .enumerate()
                .find_map(|(i, p)| p.as_ref().map(|_| i))
                .ok_or_else(|| anyhow::anyhow!("no pending slot found but in_flight > 0"))?;

            let done = pending_by_slot[oldest_slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("pending slot vanished"))?;
            done.read_event
                .wait()
                .map_err(|e| anyhow::anyhow!("Read event wait failed: {}", e))?;

            // Copy from host readback buffer to output
            let h_buf = &self.h_hashes[done.slot];
            for i in 0..done.lanes {
                let src_offset = i * 32;
                out_hashes[done.output_lane_offset + i]
                    .copy_from_slice(&h_buf[src_offset..src_offset + 32]);
            }

            available_slots.push_back(oldest_slot);
            in_flight = in_flight.saturating_sub(1);
        }

        Ok(out_hashes)
    }
}

impl Solver for OpenClSolver {
    fn backend_name(&self) -> &'static str {
        "opencl"
    }

    fn recommended_lanes(&mut self, requested: usize) -> usize {
        if self.tuned_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if requested == 0 {
            self.tuned_chunk_lanes
                .saturating_mul(self.inflight_slots)
                .max(1)
        } else {
            requested.max(1)
        }
    }

    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<ProofResult> {
        let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
        for (lane, hash) in hashes.into_iter().enumerate() {
            if meets_difficulty(&hash, difficulty) {
                return Ok(Some((start_nonce.wrapping_add(lane as u64), hash)));
            }
        }
        Ok(None)
    }

    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        Ok(self
            .compute_batch_hashes(header_without_nonce, start_nonce, lanes)?
            .len())
    }
}

#[cfg(test)]
mod tests {
    use super::OpenClSolver;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    #[test]
    fn opencl_hash_matches_cpu_for_single_nonce() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());

        let nonce = 42u64;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);

        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);

        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "OpenCL hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    #[test]
    fn opencl_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());

        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);

        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);

            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    #[test]
    #[ignore = "performance profile for local OpenCL throughput validation"]
    fn opencl_release_throughput_profile() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };

        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1oclprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());

        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;

        for lanes in lane_sets {
            let _ = solver
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);

            let start = Instant::now();
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = solver
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "opencl-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}