use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::sync::mpsc;
use std::time::Instant;
use uhash_core::meets_difficulty;
use crate::solver::{ProofResult, Solver};
// NOTE(review): presumed number of independent mixing chains per lane —
// confirm against kernels/uhash.wgsl.
const CHAINS: usize = 4;
// Scratchpad bytes per chain (512 KiB).
const SCRATCHPAD_SIZE: usize = 512 * 1024;
// Total scratchpad bytes required per lane (all chains combined).
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
// Kernel round count passed to the shader via `KernelParams::rounds`.
const ROUNDS: usize = 12_288;
// Default number of GPU dispatches kept in flight before tuning picks a value.
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
// Default compute workgroup size; profiling may replace it with 128 or 256.
const DEFAULT_WORKGROUP_SIZE: u32 = 64;
// WGSL compute shader source, embedded at build time.
const WGSL_SHADER_SOURCE: &str = include_str!("../../kernels/uhash.wgsl");
/// Parameters uploaded to the kernel's uniform buffer (binding 3).
///
/// `#[repr(C)]` plus the explicit `_pad*` fields pin the byte layout so it
/// stays bit-compatible with the shader-side struct
/// (NOTE(review): confirm field order against kernels/uhash.wgsl).
#[repr(C)]
#[derive(Clone, Copy, Debug, bytemuck::Pod, bytemuck::Zeroable)]
struct KernelParams {
    // Number of lanes (hashes) computed by this dispatch.
    lanes: u32,
    // Mixing round count (ROUNDS).
    rounds: u32,
    // Length in bytes of the header (without nonce).
    header_len: u32,
    _pad0: u32,
    // Starting nonce split into two u32 halves (WGSL has no native u64).
    start_nonce_lo: u32,
    start_nonce_hi: u32,
    _pad1: u32,
    _pad2: u32,
}
/// Read-only snapshot of device identity and tuning state, exposed for
/// reporting (see `WgpuSolver::telemetry`).
#[derive(Clone, Debug, Serialize)]
pub struct WgpuTelemetry {
    pub device_name: String,
    // Debug-formatted wgpu backend (e.g. "Vulkan").
    pub backend: String,
    // Debug-formatted wgpu device type (e.g. "DiscreteGpu").
    pub device_type: String,
    pub inflight_slots: usize,
    pub tuned_chunk_lanes: usize,
    pub max_chunk_lanes: usize,
    // True once the one-time runtime profiling pass has executed.
    pub runtime_profiled: bool,
}
/// On-disk JSON record of tuned parameters, stored per device
/// (see `tune_cache_path` / `persist_tune_cache` / `try_load_tune_cache`).
#[derive(Debug, Serialize, Deserialize)]
struct WgpuTuneCache {
    // Cache schema version; only version 2 is accepted on load.
    version: u32,
    device_name: String,
    backend: String,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    // Defaulted so cache files written before this field existed still parse.
    #[serde(default = "default_workgroup_size")]
    workgroup_size: u32,
}
/// serde default for `WgpuTuneCache::workgroup_size` (older cache files
/// lack the field).
const fn default_workgroup_size() -> u32 {
    DEFAULT_WORKGROUP_SIZE
}
/// Bookkeeping for a dispatch that was submitted but not yet read back.
struct PendingDispatch {
    // Index of the slot the dispatch runs in.
    slot: usize,
    // Where this chunk's hashes land in the caller's output vector.
    output_lane_offset: usize,
    // Number of lanes covered by this chunk.
    lanes: usize,
}
/// GPU buffers owned by one in-flight dispatch slot.
struct SlotBuffers {
    // Uniform buffer holding `KernelParams` (binding 3).
    params_buf: wgpu::Buffer,
    // Kernel output: 32 bytes per lane (binding 1).
    hashes_buf: wgpu::Buffer,
    // Scratchpad storage (binding 2); referenced only through the bind
    // group, kept here so it outlives the dispatches that use it.
    _scratch_buf: wgpu::Buffer,
    // CPU-mappable copy target for reading hashes back.
    staging_buf: wgpu::Buffer,
    // Cached bind group over header + the three buffers above.
    bind_group: Option<wgpu::BindGroup>,
}
/// wgpu-backed solver: owns the device/queue, the compiled compute
/// pipeline, and a ring of per-slot buffer sets used to keep several
/// dispatches in flight at once.
pub struct WgpuSolver {
    device: wgpu::Device,
    queue: wgpu::Queue,
    // Pipeline specialized for `workgroup_size`; recreated by
    // `rebuild_pipeline` when that changes.
    pipeline: wgpu::ComputePipeline,
    bind_group_layout: wgpu::BindGroupLayout,
    // Retained so the pipeline can be re-specialized without recompiling WGSL.
    shader: wgpu::ShaderModule,
    pipeline_layout: wgpu::PipelineLayout,
    // Lazily allocated storage buffer holding the header (without nonce).
    header_buf: Option<wgpu::Buffer>,
    // Capacity tracking for `header_buf`, in bytes of header it can hold.
    header_capacity: usize,
    // One buffer set per in-flight slot; rebuilt by `ensure_resources`.
    slots: Vec<SlotBuffers>,
    // Lanes-per-dispatch the current `slots` were sized for (0 = stale).
    chunk_lanes: usize,
    // Preferred lanes per dispatch (heuristic, cache, or profiling result).
    tuned_chunk_lanes: usize,
    // Memory-budget-derived upper bound on lanes per dispatch.
    max_chunk_lanes: usize,
    // Number of dispatch slots used for overlapping GPU work and readback.
    inflight_slots: usize,
    workgroup_size: u32,
    // Set once `runtime_profile_chunk_lanes` has run (it runs at most once).
    did_runtime_profile: bool,
    // Adapter identity strings, used for telemetry and cache keying.
    device_name: String,
    backend_name_str: String,
    device_type_str: String,
}
impl WgpuSolver {
/// Creates a solver on the highest-performance available adapter.
///
/// Requests raised buffer limits (1 GiB storage binding / buffer size)
/// because each lane needs `TOTAL_MEMORY` bytes of scratchpad in a single
/// storage buffer. After device and pipeline creation, any previously
/// persisted tuning values are loaded best-effort.
///
/// # Errors
/// Fails when no adapter is found or device creation is rejected.
pub fn new() -> Result<Self> {
    let instance = wgpu::Instance::default();
    // Compute-only workload: no surface, prefer the fast adapter.
    let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
        power_preference: wgpu::PowerPreference::HighPerformance,
        compatible_surface: None,
        force_fallback_adapter: false,
    }))
    .map_err(|e| anyhow::anyhow!("No WGPU adapter found: {}", e))?;
    let info = adapter.get_info();
    // Captured for telemetry and for keying the on-disk tuning cache.
    let device_name = info.name.clone();
    let backend_name_str = format!("{:?}", info.backend);
    let device_type_str = format!("{:?}", info.device_type);
    let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
        label: Some("uhash"),
        required_features: wgpu::Features::empty(),
        // Scratch buffers are chunk_lanes * TOTAL_MEMORY bytes and can
        // approach 1 GiB, so raise the default limits.
        required_limits: wgpu::Limits {
            max_storage_buffer_binding_size: 1 << 30,
            max_buffer_size: 1 << 30,
            ..wgpu::Limits::default()
        },
        memory_hints: wgpu::MemoryHints::Performance,
        experimental_features: Default::default(),
        trace: Default::default(),
    }))
    .map_err(|e| anyhow::anyhow!("Failed to create WGPU device: {}", e))?;
    let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: Some("uhash_shader"),
        source: wgpu::ShaderSource::Wgsl(WGSL_SHADER_SOURCE.into()),
    });
    // Bindings: 0 = header (read-only), 1 = hashes out, 2 = scratchpads,
    // 3 = uniform params. Must match the WGSL bindings.
    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("uhash_bind_group_layout"),
        entries: &[
            wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: true },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 1,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 2,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 3,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Uniform,
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
        ],
    });
    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: Some("uhash_pipeline_layout"),
        bind_group_layouts: &[&bind_group_layout],
        immediate_size: 0,
    });
    // Start with the default workgroup size; tuning may rebuild later.
    let pipeline = Self::create_pipeline_with_workgroup_size(
        &device,
        &shader,
        &pipeline_layout,
        DEFAULT_WORKGROUP_SIZE,
    );
    let mut solver = Self {
        device,
        queue,
        pipeline,
        bind_group_layout,
        shader,
        pipeline_layout,
        header_buf: None,
        header_capacity: 0,
        slots: Vec::new(),
        chunk_lanes: 0,
        tuned_chunk_lanes: 0,
        max_chunk_lanes: 0,
        inflight_slots: DEFAULT_INFLIGHT_SLOTS,
        workgroup_size: DEFAULT_WORKGROUP_SIZE,
        did_runtime_profile: false,
        device_name,
        backend_name_str,
        device_type_str,
    };
    // Best effort: missing/invalid cache leaves heuristic defaults in place.
    solver.try_load_tune_cache();
    Ok(solver)
}
/// Returns a snapshot of the solver's device identity and current
/// tuning state for reporting purposes.
pub fn telemetry(&self) -> WgpuTelemetry {
    // Clone the identity strings up front; the remaining fields are Copy.
    let device_name = self.device_name.clone();
    let backend = self.backend_name_str.clone();
    let device_type = self.device_type_str.clone();
    WgpuTelemetry {
        device_name,
        backend,
        device_type,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        runtime_profiled: self.did_runtime_profile,
    }
}
/// Builds the compute pipeline with `WORKGROUP_SIZE` supplied as a
/// pipeline-overridable constant, so one shader module can be specialized
/// for different workgroup sizes without recompiling the WGSL source.
fn create_pipeline_with_workgroup_size(
    device: &wgpu::Device,
    shader: &wgpu::ShaderModule,
    pipeline_layout: &wgpu::PipelineLayout,
    workgroup_size: u32,
) -> wgpu::ComputePipeline {
    // Override constants are passed as (name, f64) pairs.
    let constants: Vec<(&str, f64)> = vec![("WORKGROUP_SIZE", workgroup_size as f64)];
    device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: Some("uhash_pipeline"),
        layout: Some(pipeline_layout),
        module: shader,
        entry_point: Some("main"),
        compilation_options: wgpu::PipelineCompilationOptions {
            constants: &constants,
            ..Default::default()
        },
        cache: None,
    })
}
/// Recreates the pipeline after `self.workgroup_size` changed (cache load
/// or runtime profiling).
fn rebuild_pipeline(&mut self) {
    self.pipeline = Self::create_pipeline_with_workgroup_size(
        &self.device,
        &self.shader,
        &self.pipeline_layout,
        self.workgroup_size,
    );
}
/// Computes the per-device path of the tuning cache file, or `None` when
/// neither `XDG_CONFIG_HOME` nor `HOME` is set.
fn tune_cache_path(&self) -> Option<PathBuf> {
    // Resolve the config directory: $XDG_CONFIG_HOME, else $HOME/.config.
    let config_dir = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(xdg) => PathBuf::from(xdg),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    // Replace anything non-alphanumeric so the device name is filesystem-safe.
    let safe_name: String = self
        .device_name
        .chars()
        .map(|c| if c.is_alphanumeric() { c } else { '_' })
        .collect();
    let file_name = format!("wgpu_tuning_{}.json", safe_name);
    Some(config_dir.join("uhash").join(file_name))
}
/// Loads tuning parameters from the on-disk cache, if present and valid.
///
/// Silently does nothing when the file is missing, unreadable, malformed,
/// from another cache version, or recorded for a different device/backend.
/// Cached values are re-clamped against current heuristics so a stale
/// cache can never exceed the memory budget.
fn try_load_tune_cache(&mut self) {
    let Some(path) = self.tune_cache_path() else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<WgpuTuneCache>(&raw) else {
        return;
    };
    if cache.version != 2 {
        return;
    }
    // The file name only encodes a sanitized device name; distinct
    // devices/backends can collide after sanitization, so verify the
    // recorded identity before applying any cached values.
    if cache.device_name != self.device_name || cache.backend != self.backend_name_str {
        return;
    }
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must stay multiples of 32; reject corrupted values.
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    // Only accept workgroup sizes the profiler itself would produce.
    let wgs = cache.workgroup_size;
    if wgs == 64 || wgs == 128 || wgs == 256 {
        self.workgroup_size = wgs;
        self.rebuild_pipeline();
    }
}
/// Best-effort write of the current tuning parameters to disk.
/// All path, serialization, and I/O failures are deliberately ignored.
fn persist_tune_cache(&self) {
    let path = match self.tune_cache_path() {
        Some(p) => p,
        None => return,
    };
    let cache = WgpuTuneCache {
        version: 2,
        device_name: self.device_name.clone(),
        backend: self.backend_name_str.clone(),
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        workgroup_size: self.workgroup_size,
    };
    if let Ok(raw) = serde_json::to_vec_pretty(&cache) {
        // Create the parent directory if needed, then write the file.
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, raw);
    }
}
/// Derives heuristic `(tuned, max)` lanes-per-dispatch from a fixed
/// 2 GiB memory budget, without running any GPU work. Both values are
/// multiples of 32 and at least 32.
fn autotune_chunk_limits(&self) -> (usize, usize) {
    // Budget shared across all in-flight slots.
    let mem_budget: usize = 2 * 1024 * 1024 * 1024;
    // Per-lane cost: scratchpad plus one 32-byte hash output.
    let per_lane = TOTAL_MEMORY + 32;
    let all_slots_cost = per_lane.saturating_mul(self.inflight_slots);
    let max_by_mem = mem_budget.saturating_div(all_slots_cost).max(1);
    let raw_max = max_by_mem.clamp(32, 4096);
    // Round down to a multiple of 32 without ever going below 32.
    let align32 = |v: usize| (v / 32).max(1) * 32;
    (align32(raw_max.min(1024)), align32(raw_max))
}
/// Rounds `v` down to a multiple of `m`, but never below `m` itself
/// (the result is always at least one full multiple of `m`).
fn round_to_multiple(v: usize, m: usize) -> usize {
    let multiples = std::cmp::max(v / m, 1);
    multiples * m
}
/// One-time measurement pass: times real dispatches to pick the best
/// chunk size (lanes per dispatch), then profiles workgroup size and
/// in-flight slot count, and persists all results to the tuning cache.
///
/// Guarded by `did_runtime_profile`, so it runs at most once per process.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Set the flag up front so a failing profile is not retried forever.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: 1x/2x/3x the base size (capped) plus the hard maximum.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warm-up dispatch, then a single timed dispatch.
        self.dispatch_chunk_sync(0, lane, header_without_nonce, 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, lane, header_without_nonce, lane as u64)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        // Hashes per second for this candidate.
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    // Tune the remaining knobs using the chosen chunk size.
    self.runtime_profile_workgroup_size(header_without_nonce, self.tuned_chunk_lanes)?;
    self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
    // Invalidate slot buffers so normal operation rebuilds them.
    self.chunk_lanes = 0;
    self.persist_tune_cache();
    Ok(())
}
/// Times one dispatch at each candidate workgroup size (64/128/256),
/// rebuilding the pipeline for each, and keeps the fastest.
fn runtime_profile_workgroup_size(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [64u32, 128, 256];
    let mut best_wgs = self.workgroup_size;
    let mut best_hps = 0.0f64;
    for &wgs in &candidates {
        self.workgroup_size = wgs;
        self.rebuild_pipeline();
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Warm-up dispatch, then a single timed dispatch.
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce, 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce, 1)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_wgs = wgs;
        }
    }
    // Apply the winner and invalidate slot buffers for rebuild.
    self.workgroup_size = best_wgs;
    self.rebuild_pipeline();
    self.chunk_lanes = 0;
    Ok(())
}
/// Times a pipelined batch at each candidate slot count (2/3/4) and keeps
/// the fastest; falls back to the original count when no candidate
/// produces a usable measurement.
fn runtime_profile_inflight_slots(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [2usize, 3, 4];
    let original = self.inflight_slots;
    let mut best_slots = original;
    let mut best_hps = 0.0f64;
    for &slots in &candidates {
        self.inflight_slots = slots;
        // Force ensure_resources to rebuild buffers for the new slot count.
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Enough lanes to cycle every slot at least twice.
        let test_lanes = chunk_lanes
            .saturating_mul(slots)
            .saturating_mul(2)
            .max(chunk_lanes);
        let start = Instant::now();
        let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 || done == 0 {
            continue;
        }
        let hps = done as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_slots = slots;
        }
    }
    // Apply the winner and rebuild resources for it.
    self.inflight_slots = best_slots;
    self.chunk_lanes = 0;
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    Ok(())
}
/// Ensures the header buffer and per-slot GPU resources match the
/// requested header length and lanes-per-dispatch, reusing existing
/// resources when nothing relevant changed.
///
/// Bug fix: when the header outgrows capacity a new header buffer is
/// allocated, but the cached bind groups still reference the old buffer.
/// The previous early-return kept those stale bind groups, so dispatches
/// would read an outdated header. A header reallocation now forces the
/// slots (and their bind groups) to be rebuilt.
///
/// # Errors
/// Fails if `chunk_lanes` is zero or the header buffer is missing.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    // The shader addresses storage as u32 words; pad to 4 bytes.
    let header_buf_size = header_len.div_ceil(4) * 4;
    let mut header_recreated = false;
    if header_len > self.header_capacity {
        // Grow to a power of two so repeated small growth doesn't
        // reallocate every time.
        let new_capacity = header_buf_size.next_power_of_two().max(4);
        self.header_buf = Some(self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("header"),
            size: new_capacity as u64,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        }));
        // Track the actual allocated capacity (not just header_len) so
        // future headers that still fit do not force another reallocation.
        self.header_capacity = new_capacity;
        header_recreated = true;
    }
    // Fast path: slot buffers are sized correctly AND their bind groups
    // still reference the live header buffer.
    if !header_recreated
        && self.chunk_lanes == chunk_lanes
        && self.slots.len() == self.inflight_slots
    {
        return Ok(());
    }
    let hash_bytes = (chunk_lanes * 32) as u64;
    let scratch_bytes = (chunk_lanes * TOTAL_MEMORY) as u64;
    let params_size = std::mem::size_of::<KernelParams>() as u64;
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    self.slots.clear();
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    for _ in 0..self.inflight_slots {
        // Uniform params (binding 3).
        let params_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("params"),
            size: params_size,
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        // Kernel output, 32 bytes per lane (binding 1).
        let hashes_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("hashes"),
            size: hash_bytes,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
            mapped_at_creation: false,
        });
        // Scratchpads (binding 2); only ever touched by the shader.
        let _scratch_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("scratchpads"),
            size: scratch_bytes,
            usage: wgpu::BufferUsages::STORAGE,
            mapped_at_creation: false,
        });
        // CPU-mappable readback target for the hashes.
        let staging_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("staging"),
            size: hash_bytes,
            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let bind_group = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("uhash_bind_group"),
            layout: &self.bind_group_layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: header_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: hashes_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: _scratch_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 3,
                    resource: params_buf.as_entire_binding(),
                },
            ],
        });
        self.slots.push(SlotBuffers {
            params_buf,
            hashes_buf,
            _scratch_buf,
            staging_buf,
            bind_group: Some(bind_group),
        });
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Runs one dispatch of `chunk_lanes` hashes in `slot` and blocks until
/// results are read back; returns the raw hash bytes (32 per lane).
/// Used only by the profiling paths, which time full round trips.
fn dispatch_chunk_sync(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_without_nonce: &[u8],
    start_nonce: u64,
) -> Result<Vec<u8>> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_without_nonce.len() as u32,
        _pad0: 0,
        // Nonce split into two u32 halves (WGSL has no native u64).
        start_nonce_lo: start_nonce as u32,
        start_nonce_hi: (start_nonce >> 32) as u32,
        _pad1: 0,
        _pad2: 0,
    };
    let slot_bufs = &self.slots[slot];
    self.queue
        .write_buffer(&slot_bufs.params_buf, 0, bytemuck::bytes_of(&params));
    let bind_group = match &slot_bufs.bind_group {
        Some(bg) => bg,
        None => {
            anyhow::bail!("bind group for slot {} not cached", slot);
        }
    };
    let mut encoder = self
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("uhash_encoder"),
        });
    {
        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("uhash_pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.pipeline);
        pass.set_bind_group(0, bind_group, &[]);
        // One invocation per lane, rounded up to whole workgroups.
        let wgs = self.workgroup_size;
        let workgroups = (chunk_lanes as u32).div_ceil(wgs);
        pass.dispatch_workgroups(workgroups, 1, 1);
    }
    // Copy results into the mappable staging buffer in the same submission.
    let hash_bytes = (chunk_lanes * 32) as u64;
    encoder.copy_buffer_to_buffer(
        &slot_bufs.hashes_buf,
        0,
        &slot_bufs.staging_buf,
        0,
        hash_bytes,
    );
    self.queue.submit(std::iter::once(encoder.finish()));
    // Synchronous readback: request the mapping, then block on the device
    // (30 s cap) until the map callback fires.
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    let (sender, receiver) = mpsc::channel();
    slice.map_async(wgpu::MapMode::Read, move |result| {
        let _ = sender.send(result);
    });
    self.device
        .poll(wgpu::PollType::Wait {
            submission_index: None,
            timeout: Some(std::time::Duration::from_secs(30)),
        })
        .ok();
    receiver
        .recv()
        .map_err(|e| anyhow::anyhow!("Map async channel error: {}", e))?
        .map_err(|e| anyhow::anyhow!("Buffer map failed: {}", e))?;
    let data = slice.get_mapped_range();
    let out = data[..hash_bytes as usize].to_vec();
    // The mapped view must be dropped before unmapping.
    drop(data);
    staging.unmap();
    Ok(out)
}
/// Submits one dispatch of `chunk_lanes` hashes in `slot` without waiting
/// for completion; readback is initiated separately via `begin_readback`.
/// The header itself must already be uploaded by the caller.
fn dispatch_chunk_nonblocking(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
) -> Result<()> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len as u32,
        _pad0: 0,
        // Nonce split into two u32 halves (WGSL has no native u64).
        start_nonce_lo: start_nonce as u32,
        start_nonce_hi: (start_nonce >> 32) as u32,
        _pad1: 0,
        _pad2: 0,
    };
    let slot_bufs = &self.slots[slot];
    self.queue
        .write_buffer(&slot_bufs.params_buf, 0, bytemuck::bytes_of(&params));
    let bind_group = slot_bufs
        .bind_group
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("bind group for slot {} not cached", slot))?;
    let mut encoder = self
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("uhash_encoder"),
        });
    {
        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("uhash_pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.pipeline);
        pass.set_bind_group(0, bind_group, &[]);
        // One invocation per lane, rounded up to whole workgroups.
        let wgs = self.workgroup_size;
        let workgroups = (chunk_lanes as u32).div_ceil(wgs);
        pass.dispatch_workgroups(workgroups, 1, 1);
    }
    // Queue the copy into the staging buffer; mapping happens later.
    let hash_bytes = (chunk_lanes * 32) as u64;
    encoder.copy_buffer_to_buffer(
        &slot_bufs.hashes_buf,
        0,
        &slot_bufs.staging_buf,
        0,
        hash_bytes,
    );
    self.queue.submit(std::iter::once(encoder.finish()));
    Ok(())
}
/// Asynchronously maps `slot`'s staging buffer for reading; when the map
/// completes successfully, the slot index is sent on `tx`.
///
/// NOTE(review): a failed mapping sends nothing, so the batch loops in
/// `compute_batch_*` never see the slot complete and keep polling with it
/// counted as in flight — worth surfacing the error to the caller.
fn begin_readback(&self, slot: usize, tx: &mpsc::Sender<usize>) {
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    // Clone the sender and copy the slot index into the move callback.
    let tx = tx.clone();
    let s = slot;
    slice.map_async(wgpu::MapMode::Read, move |result| {
        if result.is_ok() {
            let _ = tx.send(s);
        }
    });
}
/// Copies `read_lanes` hashes from `slot`'s staging buffer into
/// `out_hashes` starting at `output_lane_offset`, then unmaps the buffer.
///
/// Must only be called after the slot's map completion was signalled via
/// `begin_readback` — the buffer has to be mapped for `get_mapped_range`.
fn collect_readback(
    &self,
    slot: usize,
    read_lanes: usize,
    output_lane_offset: usize,
    out_hashes: &mut [[u8; 32]],
) {
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    let data = slice.get_mapped_range();
    // Hashes are packed contiguously, 32 bytes per lane.
    for i in 0..read_lanes {
        let src_offset = i * 32;
        out_hashes[output_lane_offset + i].copy_from_slice(&data[src_offset..src_offset + 32]);
    }
    // The mapped view must be dropped before unmapping.
    drop(data);
    staging.unmap();
}
/// Computes `lanes` hashes purely for throughput measurement: staging
/// buffers are unmapped without being read, and only the number of lanes
/// completed is returned. Used by `runtime_profile_inflight_slots`.
///
/// Mirrors the pipelined dispatch loop of `compute_batch_hashes`.
fn compute_batch_count(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    if lanes == 0 {
        return Ok(0);
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // Zero-pad the header to a multiple of 4 bytes (u32-word storage).
    let mut padded_header = header_without_nonce.to_vec();
    while !padded_header.len().is_multiple_of(4) {
        padded_header.push(0);
    }
    self.queue.write_buffer(header_buf, 0, &padded_header);
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    // Map callbacks report finished slots through this channel.
    let (completion_tx, completion_rx) = mpsc::channel::<usize>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let mut total_done = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Fill every free slot with the next chunk of lanes.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            self.dispatch_chunk_nonblocking(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            self.begin_readback(slot, &completion_tx);
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Drive the device so map callbacks can fire (30 s cap per wait).
        self.device
            .poll(wgpu::PollType::Wait {
                submission_index: None,
                timeout: Some(std::time::Duration::from_secs(30)),
            })
            .ok();
        // Recycle every slot that completed; results are discarded.
        while let Ok(slot) = completion_rx.try_recv() {
            let done = pending_by_slot[slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
            // Unmap without reading — this path only counts throughput.
            self.slots[slot].staging_buf.unmap();
            total_done += done.lanes;
            available_slots.push_back(slot);
            in_flight = in_flight.saturating_sub(1);
        }
    }
    Ok(total_done)
}
/// Computes one 32-byte hash per lane for `lanes` consecutive nonces
/// starting at `start_nonce` (output index i corresponds to nonce
/// `start_nonce + i`), pipelining up to `inflight_slots` dispatches.
///
/// On first use this populates heuristic tuning limits and runs the
/// one-time runtime profiling pass.
///
/// # Errors
/// Fails on an empty header or if GPU resources cannot be allocated.
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Lazy initialization: heuristics first, then one-time profiling.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    if !self.did_runtime_profile {
        self.runtime_profile_chunk_lanes(header_without_nonce)?;
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // Zero-pad the header to a multiple of 4 bytes (u32-word storage).
    let mut padded_header = header_without_nonce.to_vec();
    while !padded_header.len().is_multiple_of(4) {
        padded_header.push(0);
    }
    self.queue.write_buffer(header_buf, 0, &padded_header);
    let mut out_hashes = vec![[0u8; 32]; lanes];
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    // Map callbacks report finished slots through this channel.
    let (completion_tx, completion_rx) = mpsc::channel::<usize>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Fill every free slot with the next chunk of lanes.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            self.dispatch_chunk_nonblocking(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            self.begin_readback(slot, &completion_tx);
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Drive the device so map callbacks can fire (30 s cap per wait).
        self.device
            .poll(wgpu::PollType::Wait {
                submission_index: None,
                timeout: Some(std::time::Duration::from_secs(30)),
            })
            .ok();
        // Copy out each completed chunk and recycle its slot.
        while let Ok(slot) = completion_rx.try_recv() {
            let done = pending_by_slot[slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
            self.collect_readback(
                done.slot,
                done.lanes,
                done.output_lane_offset,
                &mut out_hashes,
            );
            available_slots.push_back(slot);
            in_flight = in_flight.saturating_sub(1);
        }
    }
    Ok(out_hashes)
}
}
impl Solver for WgpuSolver {
    /// Stable backend identifier used by callers/telemetry.
    fn backend_name(&self) -> &'static str {
        "wgpu"
    }

    /// Suggests a lane count for a batch. `requested == 0` asks for the
    /// backend default: enough lanes to keep every in-flight slot busy
    /// with one tuned-size chunk. Any other request is honored (min 1).
    fn recommended_lanes(&mut self, requested: usize) -> usize {
        // Populate heuristic limits on first use so the default is useful.
        if self.tuned_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if requested == 0 {
            self.tuned_chunk_lanes
                .saturating_mul(self.inflight_slots)
                .max(1)
        } else {
            requested.max(1)
        }
    }

    /// Hashes `lanes` consecutive nonces starting at `start_nonce` and
    /// returns the first (nonce, hash) pair meeting `difficulty`, if any.
    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<ProofResult> {
        let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
        for (lane, hash) in hashes.into_iter().enumerate() {
            if meets_difficulty(&hash, difficulty) {
                // Wrapping add for consistency with the wrapping nonce
                // arithmetic used when dispatching (plain `+` would panic
                // in debug builds for start nonces near u64::MAX).
                return Ok(Some((start_nonce.wrapping_add(lane as u64), hash)));
            }
        }
        Ok(None)
    }

    /// Computes `lanes` hashes and returns how many were produced;
    /// used for throughput benchmarking.
    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        Ok(self
            .compute_batch_hashes(header_without_nonce, start_nonce, lanes)?
            .len())
    }
}
#[cfg(test)]
mod tests {
    use super::WgpuSolver;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    // GPU/CPU parity for a single nonce. Skips (does not fail) when no
    // WGPU adapter is available, e.g. on CI machines without a GPU.
    #[test]
    fn wgpu_hash_matches_cpu_for_single_nonce() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        // Synthetic header: 32-byte prev-hash-like prefix, an address
        // string, and a little-endian timestamp.
        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());
        let nonce = 42u64;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        // CPU reference: header || nonce (LE) through UniversalHash.
        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);
        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "WGPU hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    // GPU/CPU parity across a 64-lane batch of consecutive nonces.
    #[test]
    fn wgpu_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());
        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);
        // Verify every lane against the CPU reference implementation.
        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);
            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    // Manual throughput profile across several lane counts; ignored by
    // default so CI stays fast — run explicitly with `--ignored`.
    #[test]
    #[ignore = "performance profile for local WGPU throughput validation"]
    fn wgpu_release_throughput_profile() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1wgpuprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());
        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;
        for lanes in lane_sets {
            // Warm-up batch, excluded from the timed region.
            let _ = solver
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            let start = Instant::now();
            // Fewer timed batches at the largest lane count to bound runtime.
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = solver
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "wgpu-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}