#![cfg(all(feature = "metal-backend", target_os = "macos"))]
use anyhow::Result;
use block::ConcreteBlock;
use metal::{
Buffer, CommandBuffer, CommandBufferRef, CommandQueue, CompileOptions, ComputePipelineState,
Device, FunctionConstantValues, MTLCommandBufferStatus, MTLDataType, MTLGPUFamily,
MTLResourceOptions,
};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::sync::mpsc::{self, Receiver, Sender};
use std::time::Instant;
// Number of independent hash chains per lane (mirrors `CHAINS` in the MSL shader).
const CHAINS: usize = 4;
// Scratchpad bytes per chain: 512 KiB (mirrors `SCRATCHPAD_SIZE` in the shader).
const SCRATCHPAD_SIZE: usize = 512 * 1024;
// Total scratchpad bytes one lane needs (all chains combined) — 2 MiB.
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
// Mixing rounds the kernel executes per hash; passed to the GPU via KernelParams.
const ROUNDS: usize = 12_288;
// Default number of command buffers kept in flight (raised to 4 on unified memory).
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
// Size of the proof-found readback record: 8-byte LE nonce + 32-byte hash.
const FOUND_DATA_BYTES: usize = 8 + 32;
/// Per-dispatch parameters written into a shared Metal buffer and read by the
/// kernel. `#[repr(C)]` keeps the layout identical to the `KernelParams`
/// struct in the MSL source; `_pad0` keeps `start_nonce` 8-byte aligned so the
/// two layouts agree. Do not reorder fields without updating the shader.
#[repr(C)]
#[derive(Clone, Copy)]
struct KernelParams {
    lanes: u32,       // number of GPU threads (lanes) in this dispatch
    rounds: u32,      // mixing rounds per hash (ROUNDS)
    header_len: u32,  // bytes of header data in the header buffer
    difficulty: u32,  // proof-mode difficulty target; 0 when hashing only
    emit_hashes: u32, // 1 = kernel writes per-lane hashes to the output buffer
    _pad0: u32,       // explicit padding to align the following u64
    start_nonce: u64, // nonce of lane 0; lane i hashes start_nonce + i
}
/// Book-keeping for one in-flight GPU dispatch. Holding `_command_buffer`
/// keeps the Metal command buffer alive until its completion handler has
/// fired and this record is dropped.
struct PendingDispatch {
    _slot: usize,                   // index of the buffer slot this dispatch occupies
    output_lane_offset: usize,      // destination lane offset in the caller's output
    lanes: usize,                   // lane count of this chunk
    _command_buffer: CommandBuffer, // retained so the GPU work isn't dropped early
}
/// (slot index, final status) sent by the Metal completion handler back to
/// the submitting thread over an mpsc channel.
type CompletionEvent = (usize, MTLCommandBufferStatus);
/// Serializable snapshot of the detected device plus the miner's current
/// tuning state, as reported by [`MetalMiner::telemetry`].
#[derive(Clone, Debug, Serialize)]
pub struct MetalTelemetry {
    pub device_name: String,
    pub apple_family: u32, // highest supported MTLGPUFamily::AppleN, 0 if none
    pub registry_id: u64,  // IOKit registry id; keys the on-disk tune cache
    pub unified_memory: bool,
    pub recommended_max_working_set_size: u64,
    pub max_buffer_length: u64,
    pub inflight_slots: usize,    // command buffers kept in flight
    pub tuned_chunk_lanes: usize, // preferred lanes per dispatch
    pub max_chunk_lanes: usize,   // hard lane cap per dispatch
    pub tg_factor: usize,         // threadgroup width = exec_width * tg_factor
    pub tg_width: u64,            // profiled threadgroup width override (0 = none)
    pub vector_block_io: bool,    // shader function-constant: vectorized block IO
    pub unroll_rounds: bool,      // shader function-constant: unrolled round loop
    pub runtime_profiled: bool,   // true once a live profiling pass (or warm cache) ran
    pub cache_warm_start: bool,   // true when tuning came from the on-disk cache
}
/// On-disk JSON tuning cache, keyed by GPU registry id. The `serde(default)`
/// attributes let version-1 files (written before `tg_factor`/`tg_width`
/// existed) still deserialize; see `try_load_tune_cache` for validation.
#[derive(Debug, Serialize, Deserialize)]
struct MetalTuneCache {
    version: u32,     // accepted versions: 1 and 2
    registry_id: u64, // must match the current device or the cache is ignored
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    #[serde(default = "default_tg_factor")]
    tg_factor: usize,
    #[serde(default)]
    tg_width: u64, // 0 = no profiled width recorded
}
/// serde default for `MetalTuneCache::tg_factor`, used when loading a
/// version-1 cache file that predates the field.
const fn default_tg_factor() -> usize {
    4
}
/// GPU miner driving the embedded MSL kernel on a Metal device.
///
/// All `*_slots` vectors hold one buffer per in-flight slot so several
/// dispatches can overlap without aliasing each other's buffers.
pub struct MetalMiner {
    device: Device,
    queue: CommandQueue,
    pipeline_hash: ComputePipelineState,  // kernel specialized with proof_mode = false
    pipeline_proof: ComputePipelineState, // kernel specialized with proof_mode = true
    header_buf: Option<Buffer>, // shared staging buffer for the header bytes
    header_capacity: usize,     // allocated capacity of header_buf (power of two)
    params_slots: Vec<Buffer>,          // per-slot KernelParams (shared)
    hashes_private_slots: Vec<Buffer>,  // per-slot hash output, GPU-private
    hashes_readback_slots: Vec<Buffer>, // per-slot hash output, CPU-readable
    found_flag_slots: Vec<Buffer>,      // per-slot u32 "proof found" flag (shared)
    found_data_slots: Vec<Buffer>,      // per-slot nonce+hash record (shared)
    scratchpads_slots: Vec<Buffer>,     // per-slot lane scratchpads, GPU-private
    chunk_lanes: usize,       // lane count the current buffers were sized for
    max_chunk_lanes: usize,   // hard per-dispatch lane cap
    tuned_chunk_lanes: usize, // preferred per-dispatch lane count (0 = not tuned yet)
    inflight_slots: usize,
    tg_factor: usize, // threadgroup width multiplier over thread_execution_width
    tg_width: u64,    // profiled threadgroup width override (0 = derive from tg_factor)
    did_runtime_profile: bool,
    // Cached device facts (also exposed via telemetry()).
    device_name: String,
    apple_family: u32,
    registry_id: u64,
    unified_memory: bool,
    recommended_ws: u64,
    max_buffer_len: u64,
    vector_block_io: bool,
    unroll_rounds: bool,
    // Profiling log controls (UHASH_METAL_PROFILE / _EVERY env vars).
    profile_enabled: bool,
    profile_every: usize,
    profile_counter: u64,
    tune_cache_loaded: bool,
    force_retune: bool, // UHASH_METAL_FORCE_RETUNE: ignore the on-disk cache
}
impl MetalMiner {
/// Return the highest Apple GPU family generation the device supports
/// (9 down to 1), or 0 when none of the Apple families match.
fn detect_apple_family(device: &Device) -> u32 {
    // Probe newest-to-oldest; the first supported family wins.
    let probes = [
        (MTLGPUFamily::Apple9, 9u32),
        (MTLGPUFamily::Apple8, 8u32),
        (MTLGPUFamily::Apple7, 7u32),
        (MTLGPUFamily::Apple6, 6u32),
        (MTLGPUFamily::Apple5, 5u32),
        (MTLGPUFamily::Apple4, 4u32),
        (MTLGPUFamily::Apple3, 3u32),
        (MTLGPUFamily::Apple2, 2u32),
        (MTLGPUFamily::Apple1, 1u32),
    ];
    probes
        .into_iter()
        .find(|(family, _)| device.supports_family(*family))
        .map_or(0, |(_, generation)| generation)
}
/// Create a miner on the system-default Metal device.
///
/// Compiles the embedded MSL source into two pipeline variants (hash-only
/// and proof-search) selected via function constants, seeds tuning defaults
/// from the detected GPU generation, and finally tries to warm-start from
/// the on-disk tune cache.
///
/// # Errors
/// Fails when no Metal device exists, the shader does not compile, or a
/// compute pipeline cannot be created.
pub fn new() -> Result<Self> {
    let device =
        Device::system_default().ok_or_else(|| anyhow::anyhow!("No Metal device found"))?;
    let options = CompileOptions::new();
    options.set_fast_math_enabled(true);
    let apple_family = Self::detect_apple_family(&device);
    // Newer generations get the vectorized-IO and unrolled shader variants.
    let vector_block_io = apple_family >= 7;
    let unroll_rounds = apple_family >= 8;
    let library = device
        .new_library_with_source(METAL_SHADER_SOURCE, &options)
        .map_err(|e| anyhow::anyhow!("Failed to compile Metal shader: {}", e))?;
    // Specialize the single kernel via function constants:
    // index 0 = vector block IO, 1 = unroll rounds, 2 = proof mode.
    // The bool values are passed by raw pointer; Metal copies them immediately.
    let build_pipeline = |proof_mode: bool| -> Result<ComputePipelineState> {
        let constants = FunctionConstantValues::new();
        constants.set_constant_value_at_index(
            &vector_block_io as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            0,
        );
        constants.set_constant_value_at_index(
            &unroll_rounds as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            1,
        );
        constants.set_constant_value_at_index(
            &proof_mode as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            2,
        );
        let kernel = library
            .get_function("uhash_kernel", Some(constants))
            .map_err(|e| anyhow::anyhow!("Failed to get kernel function: {}", e))?;
        device
            .new_compute_pipeline_state_with_function(&kernel)
            .map_err(|e| anyhow::anyhow!("Failed to create compute pipeline: {}", e))
    };
    let pipeline_hash = build_pipeline(false)?;
    let pipeline_proof = build_pipeline(true)?;
    let queue = device.new_command_queue();
    let device_name = device.name().to_string();
    let registry_id = device.registry_id();
    let unified_memory = device.has_unified_memory();
    let recommended_ws = device.recommended_max_working_set_size();
    let max_buffer_len = device.max_buffer_length();
    // Unified-memory devices tolerate one extra in-flight command buffer.
    let inflight_slots = if unified_memory {
        4
    } else {
        DEFAULT_INFLIGHT_SLOTS
    };
    // Profiling log knobs, read once at construction.
    let profile_enabled = std::env::var_os("UHASH_METAL_PROFILE").is_some();
    let profile_every = std::env::var("UHASH_METAL_PROFILE_EVERY")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(1);
    let force_retune = std::env::var_os("UHASH_METAL_FORCE_RETUNE").is_some();
    let tg_factor = if apple_family >= 8 { 8 } else { 4 };
    let mut miner = Self {
        device,
        queue,
        pipeline_hash,
        pipeline_proof,
        header_buf: None,
        header_capacity: 0,
        params_slots: Vec::new(),
        hashes_private_slots: Vec::new(),
        hashes_readback_slots: Vec::new(),
        found_flag_slots: Vec::new(),
        found_data_slots: Vec::new(),
        scratchpads_slots: Vec::new(),
        chunk_lanes: 0,
        max_chunk_lanes: 0,
        tuned_chunk_lanes: 0,
        inflight_slots,
        tg_factor,
        tg_width: 0,
        did_runtime_profile: false,
        device_name,
        apple_family,
        registry_id,
        unified_memory,
        recommended_ws,
        max_buffer_len,
        vector_block_io,
        unroll_rounds,
        profile_enabled,
        profile_every,
        profile_counter: 0,
        tune_cache_loaded: false,
        force_retune,
    };
    // Warm-start tuning from disk if a valid cache exists for this GPU.
    miner.try_load_tune_cache();
    Ok(miner)
}
/// Suggest a total lane count for a mining batch.
///
/// A non-zero `requested` value is honored as-is (floored at 1). When the
/// caller passes 0, the suggestion is the tuned per-dispatch chunk size
/// multiplied by (a capped) number of in-flight slots, so the queue stays
/// saturated. Tuning limits are lazily seeded from the size heuristic on
/// first use.
pub fn recommended_lanes(&mut self, requested: usize) -> usize {
    if self.tuned_chunk_lanes == 0 {
        let (tuned, cap) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = tuned;
        self.max_chunk_lanes = cap;
    }
    if requested != 0 {
        return requested.max(1);
    }
    // Before a runtime profile, newer GPUs are allowed to start at the hard
    // cap; after profiling, trust the measured chunk size.
    let base = if self.did_runtime_profile {
        self.tuned_chunk_lanes
    } else if self.apple_family >= 8 {
        self.max_chunk_lanes.max(self.tuned_chunk_lanes)
    } else {
        self.tuned_chunk_lanes
    };
    let multiplier = if self.apple_family >= 8 {
        self.inflight_slots.min(4)
    } else {
        self.inflight_slots
    };
    base.saturating_mul(multiplier).max(1)
}
/// Snapshot the device facts and current tuning state for reporting.
pub fn telemetry(&self) -> MetalTelemetry {
    MetalTelemetry {
        device_name: self.device_name.clone(),
        apple_family: self.apple_family,
        registry_id: self.registry_id,
        unified_memory: self.unified_memory,
        recommended_max_working_set_size: self.recommended_ws,
        max_buffer_length: self.max_buffer_len,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        tg_factor: self.tg_factor,
        tg_width: self.tg_width,
        vector_block_io: self.vector_block_io,
        unroll_rounds: self.unroll_rounds,
        runtime_profiled: self.did_runtime_profile,
        cache_warm_start: self.tune_cache_loaded,
    }
}
/// Benchmark entry point: hash `lanes` nonces starting at `start_nonce`
/// without reading hashes back, returning the number of hashes completed.
/// Thin public wrapper over `compute_batch_count`.
pub fn benchmark_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    self.compute_batch_count(header_without_nonce, start_nonce, lanes)
}
/// Path of the per-GPU tuning cache file:
/// `$XDG_CONFIG_HOME/uhash/metal_tuning_<registry_id>.json`, falling back to
/// `$HOME/.config/...`. Returns `None` when neither variable is set.
fn tune_cache_path(registry_id: u64) -> Option<PathBuf> {
    let config_root = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(dir) => PathBuf::from(dir),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    let file_name = format!("metal_tuning_{registry_id:016x}.json");
    Some(config_root.join("uhash").join(file_name))
}
/// Tuning-cache path for the device this miner was created on.
pub fn tune_cache_path_for_current_device(&self) -> Option<PathBuf> {
    Self::tune_cache_path(self.registry_id)
}
/// Delete the on-disk tuning cache for this GPU.
///
/// Returns `Ok(true)` when a cache file was removed, `Ok(false)` when no
/// cache path could be resolved or no file existed. In either non-path-less
/// case the in-memory `tune_cache_loaded` flag is cleared so the next
/// profiling pass re-tunes from scratch.
///
/// # Errors
/// Only genuine I/O failures (permissions, etc.) are reported as errors.
pub fn clear_tune_cache_for_current_device(&mut self) -> Result<bool> {
    let Some(path) = self.tune_cache_path_for_current_device() else {
        return Ok(false);
    };
    // Attempt the removal directly and treat NotFound as "nothing to
    // delete"; a prior `path.exists()` check would be a TOCTOU race.
    match fs::remove_file(&path) {
        Ok(()) => {
            self.tune_cache_loaded = false;
            Ok(true)
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            self.tune_cache_loaded = false;
            Ok(false)
        }
        Err(e) => Err(anyhow::anyhow!(
            "failed to remove {}: {}",
            path.display(),
            e
        )),
    }
}
/// Discard all tuning state and immediately rerun the full runtime
/// profiling pass against `header_without_nonce`.
pub fn force_retune_now(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    self.did_runtime_profile = false;
    self.tune_cache_loaded = false;
    // Force ensure_resources() to reallocate for whatever the profile picks.
    self.chunk_lanes = 0;
    self.runtime_profile_chunk_lanes(header_without_nonce)
}
/// Decide whether this call site should emit a profiling log line: only
/// when UHASH_METAL_PROFILE is set, and then only every `profile_every`-th
/// invocation. Increments the internal counter as a side effect.
fn should_log_profile(&mut self) -> bool {
    if !self.profile_enabled {
        return false;
    }
    let count = self.profile_counter.saturating_add(1);
    self.profile_counter = count;
    // profile_every is guaranteed > 0 by construction (see new()).
    count.is_multiple_of(self.profile_every as u64)
}
/// Try to warm-start tuning from the on-disk cache for this GPU. Silently
/// returns on any missing/invalid input; on success sets `tune_cache_loaded`
/// so the runtime profiling pass is skipped.
fn try_load_tune_cache(&mut self) {
    if self.force_retune {
        return;
    }
    let Some(path) = Self::tune_cache_path(self.registry_id) else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<MetalTuneCache>(&raw) else {
        return;
    };
    // Accept only known cache versions written for this exact GPU.
    if !(cache.version == 1 || cache.version == 2) || cache.registry_id != self.registry_id {
        return;
    }
    // Clamp cached lane counts against today's heuristic cap so a stale
    // cache can never request more memory than the device currently allows.
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must be 32-aligned; reject corrupted values outright.
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    self.tg_factor = cache.tg_factor.clamp(1, 8);
    self.tg_width = cache.tg_width;
    self.tune_cache_loaded = true;
}
/// Write the current tuning state to the per-GPU cache file (version 2).
/// Best-effort: any failure — unresolvable path, serialization, directory
/// creation, or the write itself — is silently ignored.
fn persist_tune_cache(&self) {
    let path = match Self::tune_cache_path(self.registry_id) {
        Some(p) => p,
        None => return,
    };
    let snapshot = MetalTuneCache {
        version: 2,
        registry_id: self.registry_id,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        tg_factor: self.tg_factor,
        tg_width: self.tg_width,
    };
    if let Ok(serialized) = serde_json::to_vec_pretty(&snapshot) {
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, serialized);
    }
}
/// Derive (tuned, max) per-dispatch lane counts from device memory limits.
///
/// The cap is the smaller of: 70% of the recommended working set divided
/// across all in-flight scratchpad allocations, and the largest single
/// buffer the device allows — clamped to [32, 4096]. The tuned value is
/// additionally capped at 1024. Both results are rounded down to a multiple
/// of 32 lanes (minimum 32).
fn autotune_chunk_limits(&self) -> (usize, usize) {
    const LANE_ALIGN: usize = 32;
    let buffer_limit = self.device.max_buffer_length() as usize;
    let working_set = self.device.recommended_max_working_set_size() as usize;
    // Budget 70% of the recommended working set; 512 MiB when unreported.
    let budget = match working_set {
        0 => 512 * 1024 * 1024,
        ws => ws.saturating_mul(7) / 10,
    };
    let by_working_set = budget
        .saturating_div(TOTAL_MEMORY.saturating_mul(self.inflight_slots))
        .max(1);
    let by_buffer = buffer_limit.saturating_div(TOTAL_MEMORY).max(1);
    let raw_cap = by_working_set.min(by_buffer).clamp(32, 4096);
    let tuned = (raw_cap.min(1024) / LANE_ALIGN).max(1) * LANE_ALIGN;
    let cap = (raw_cap / LANE_ALIGN).max(1) * LANE_ALIGN;
    (tuned, cap)
}
/// Round `v` down to a multiple of `m`, but never below `m` itself:
/// (65, 32) -> 64, (10, 32) -> 32. Assumes `m > 0` (panics on 0).
fn round_to_multiple(v: usize, m: usize) -> usize {
    let whole_units = v / m;
    if whole_units == 0 { m } else { whole_units * m }
}
/// Pick the threadgroup width for a dispatch of `chunk_lanes` threads.
///
/// A profiled override (`self.tg_width > 0`) takes precedence; otherwise the
/// width is `thread_execution_width * tg_factor`. Either way the result is
/// clamped to the pipeline's max threadgroup size and to `chunk_lanes`, then
/// rounded down to a multiple of the execution width (minimum one execution
/// width). Very small dispatches get a single threadgroup covering all lanes.
fn compute_tg_width(&self, chunk_lanes: usize, tg_factor: usize) -> u64 {
    let exec_width = self.pipeline_hash.thread_execution_width().max(1);
    let max_threads = self
        .pipeline_hash
        .max_total_threads_per_threadgroup()
        .max(1);
    // Profiled-override path.
    if self.tg_width > 0 {
        let mut tg = self.tg_width.min(max_threads).min(chunk_lanes as u64);
        tg = (tg / exec_width).max(1) * exec_width;
        return tg.min(chunk_lanes as u64).max(exec_width);
    }
    // Tiny dispatch: one group covers every lane.
    if chunk_lanes as u64 <= exec_width {
        return chunk_lanes as u64;
    }
    let mut tg = exec_width.saturating_mul(tg_factor as u64);
    tg = tg.min(max_threads).min(chunk_lanes as u64);
    tg = (tg / exec_width).max(1) * exec_width;
    tg.min(chunk_lanes as u64).max(exec_width)
}
/// Benchmark candidate threadgroup factors and explicit widths at the given
/// chunk size, keeping whichever maximizes hashes/second.
///
/// Phase 1 sweeps `tg_factor` candidates (3 timed samples each, after one
/// warmup dispatch); phase 2 sweeps explicit `tg_width` shapes derived from
/// the pipeline's execution width. Failed or non-Completed dispatches simply
/// disqualify that candidate. Resets `chunk_lanes` to 0 at the end so
/// buffers are reallocated for the winning configuration.
fn runtime_profile_threadgroup_factor(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let factors: &[usize] = if self.apple_family >= 8 {
        &[2, 4, 8]
    } else {
        &[1, 2, 4, 8]
    };
    let current = self.tg_factor;
    let mut best_factor = current;
    let mut best_hps = 0.0f64;
    let samples_per_factor = 3usize;
    for &factor in factors {
        self.tg_factor = factor;
        // Clear any explicit width so compute_tg_width derives from factor.
        self.tg_width = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // One warmup dispatch so the timed samples are steady-state.
        let warmup = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let mut total_elapsed = 0.0f64;
        for sample in 0..samples_per_factor {
            // Vary the nonce per sample so each run hashes fresh inputs.
            let nonce = 1u64.saturating_add((sample as u64).saturating_mul(chunk_lanes as u64));
            let start = Instant::now();
            let cmd = self.dispatch_chunk(
                0,
                chunk_lanes,
                header_without_nonce.len(),
                nonce,
                0,
                false,
                false,
                false,
                None,
            )?;
            cmd.wait_until_completed();
            if cmd.status() != MTLCommandBufferStatus::Completed {
                // Disqualify the whole factor on any failed sample.
                total_elapsed = 0.0;
                break;
            }
            total_elapsed += start.elapsed().as_secs_f64();
        }
        if total_elapsed <= 0.0 {
            continue;
        }
        let hps = (chunk_lanes * samples_per_factor) as f64 / total_elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_factor = factor;
        }
    }
    self.tg_factor = best_factor;
    // Phase 2: try explicit threadgroup widths (multiples of the execution
    // width plus the pipeline maximum) on top of the winning factor.
    let exec = self.pipeline_hash.thread_execution_width().max(1);
    let max_threads = self
        .pipeline_hash
        .max_total_threads_per_threadgroup()
        .max(1);
    let mut shape_candidates = vec![exec, exec.saturating_mul(2), exec.saturating_mul(3)];
    shape_candidates.push(exec.saturating_mul(4));
    shape_candidates.push(exec.saturating_mul(6));
    shape_candidates.push(exec.saturating_mul(8));
    shape_candidates.push(max_threads);
    shape_candidates.sort_unstable();
    shape_candidates.dedup();
    let mut best_shape = self.compute_tg_width(chunk_lanes, self.tg_factor);
    let mut best_shape_hps = 0.0f64;
    for shape in shape_candidates {
        self.tg_width = shape;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        let warmup = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let start = Instant::now();
        let cmd = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            1,
            0,
            false,
            false,
            false,
            None,
        )?;
        cmd.wait_until_completed();
        if cmd.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_shape_hps {
            best_shape_hps = hps;
            best_shape = shape;
        }
    }
    self.tg_width = best_shape;
    // Force reallocation under the final configuration on next use.
    self.chunk_lanes = 0;
    Ok(())
}
/// Full runtime profiling pass: pick the best per-dispatch chunk size, then
/// tune the threadgroup shape and in-flight slot count for it. Runs at most
/// once per session (`did_runtime_profile` guards re-entry) and persists the
/// result to the tune cache unless profiling-log mode is on.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Set the flag up-front so nested compute_batch_count calls made by the
    // slot profiler don't recurse back into profiling.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: base, 2x, 3x (32-aligned, capped), and the hard cap.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warmup dispatch so the timed run measures steady-state throughput.
        let warmup = self.dispatch_chunk(
            0,
            lane,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let start = Instant::now();
        let cmd = self.dispatch_chunk(
            0,
            lane,
            header_without_nonce.len(),
            lane as u64,
            0,
            false,
            false,
            false,
            None,
        )?;
        cmd.wait_until_completed();
        if cmd.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    // Tune the remaining knobs at the winning chunk size.
    self.runtime_profile_threadgroup_factor(header_without_nonce, self.tuned_chunk_lanes)?;
    self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
    self.chunk_lanes = 0;
    if !self.profile_enabled {
        self.persist_tune_cache();
    }
    Ok(())
}
/// Benchmark candidate in-flight slot counts by running a full
/// `compute_batch_count` per candidate, keeping the fastest. Unified-memory
/// devices try deeper queues. Leaves resources allocated for the winner.
fn runtime_profile_inflight_slots(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates: &[usize] = if self.unified_memory {
        &[3, 4, 5, 6]
    } else {
        &[2, 3, 4]
    };
    let original = self.inflight_slots;
    let mut best_slots = original;
    let mut best_hps = 0.0f64;
    for &slots in candidates {
        self.inflight_slots = slots;
        // Force ensure_resources to rebuild the per-slot buffer vectors.
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Enough lanes to cycle every slot at least twice.
        let test_lanes = chunk_lanes
            .saturating_mul(slots)
            .saturating_mul(2)
            .max(chunk_lanes);
        let start = Instant::now();
        let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 || done == 0 {
            continue;
        }
        let hps = done as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_slots = slots;
        }
    }
    self.inflight_slots = best_slots;
    // Reallocate once more under the winning slot count.
    self.chunk_lanes = 0;
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    Ok(())
}
/// Make sure tuning has been settled before mining: a warm on-disk cache
/// counts as profiled (unless a forced retune was requested); otherwise run
/// the live profiling pass. Idempotent after the first successful call.
fn ensure_runtime_profile(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    let cache_suffices = self.tune_cache_loaded && !self.force_retune;
    match (self.did_runtime_profile, cache_suffices) {
        (true, _) => Ok(()),
        (false, true) => {
            // Warm cache substitutes for a live profiling pass.
            self.did_runtime_profile = true;
            Ok(())
        }
        (false, false) => self.runtime_profile_chunk_lanes(header_without_nonce),
    }
}
/// (Re)allocate the per-slot GPU buffers for `chunk_lanes` lanes and grow the
/// shared header staging buffer to hold `header_len` bytes. No-op when the
/// existing allocation already matches both the chunk size and the current
/// number of in-flight slots.
///
/// # Errors
/// Fails when `chunk_lanes` is 0 or any single buffer would exceed the
/// device's maximum buffer length.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    // Header buffer grows geometrically (power-of-two capacity) and is never
    // shrunk; shared storage so the CPU can memcpy the header in directly.
    if header_len > self.header_capacity {
        let new_capacity = header_len.next_power_of_two();
        self.header_buf = Some(
            self.device
                .new_buffer(new_capacity as u64, MTLResourceOptions::StorageModeShared),
        );
        self.header_capacity = new_capacity;
    }
    // Fast path: everything already sized for this configuration.
    if self.chunk_lanes == chunk_lanes
        && self.params_slots.len() == self.inflight_slots
        && self.hashes_private_slots.len() == self.inflight_slots
        && self.hashes_readback_slots.len() == self.inflight_slots
        && self.found_flag_slots.len() == self.inflight_slots
        && self.found_data_slots.len() == self.inflight_slots
        && self.scratchpads_slots.len() == self.inflight_slots
    {
        return Ok(());
    }
    let hash_bytes = chunk_lanes.saturating_mul(32); // one 32-byte hash per lane
    let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY); // 2 MiB per lane
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    let max_buffer = self.device.max_buffer_length() as usize;
    if hash_bytes > max_buffer || scratch_bytes > max_buffer {
        anyhow::bail!(
            "chunk too large: chunk_lanes={} hash_bytes={} scratch_bytes={} max_buffer={}",
            chunk_lanes,
            hash_bytes,
            scratch_bytes,
            max_buffer
        );
    }
    // Rebuild every slot vector from scratch (drops the old buffers).
    self.params_slots.clear();
    self.hashes_private_slots.clear();
    self.hashes_readback_slots.clear();
    self.found_flag_slots.clear();
    self.found_data_slots.clear();
    self.scratchpads_slots.clear();
    for _ in 0..self.inflight_slots {
        // CPU-written dispatch parameters.
        self.params_slots.push(self.device.new_buffer(
            std::mem::size_of::<KernelParams>() as u64,
            MTLResourceOptions::StorageModeShared,
        ));
        // GPU-only hash output (used when no CPU readback is needed).
        self.hashes_private_slots.push(
            self.device
                .new_buffer(hash_bytes as u64, MTLResourceOptions::StorageModePrivate),
        );
        // CPU-readable hash output for readback batches.
        self.hashes_readback_slots.push(
            self.device
                .new_buffer(hash_bytes as u64, MTLResourceOptions::StorageModeShared),
        );
        // u32 "proof found" flag, zeroed before each dispatch.
        self.found_flag_slots.push(
            self.device
                .new_buffer(4, MTLResourceOptions::StorageModeShared),
        );
        // nonce + hash record for a found proof.
        self.found_data_slots.push(self.device.new_buffer(
            FOUND_DATA_BYTES as u64,
            MTLResourceOptions::StorageModeShared,
        ));
        // Large working memory; GPU-private, never read by the CPU.
        self.scratchpads_slots.push(
            self.device
                .new_buffer(scratch_bytes as u64, MTLResourceOptions::StorageModePrivate),
        );
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Encode and commit one kernel dispatch on buffer slot `slot`.
///
/// Writes `KernelParams` and zeroes the found flag/data for the slot, binds
/// the six kernel buffers (header, hashes, scratchpad, params, found flag,
/// found data — indices 0..=5), dispatches `chunk_lanes` threads, and
/// commits. Does NOT wait: the caller either waits on the returned command
/// buffer or supplies `completion_tx` to be notified asynchronously.
///
/// * `proof_mode` — select the proof-search pipeline variant.
/// * `emit_hashes` — ask the kernel to write per-lane hashes.
/// * `copy_hashes_for_readback` — bind the CPU-readable output buffer
///   instead of the GPU-private one.
///
/// The caller must have called `ensure_resources` for this slot/chunk size
/// first and must ensure the slot has no other dispatch in flight.
#[allow(clippy::too_many_arguments)]
fn dispatch_chunk(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
    difficulty: u32,
    proof_mode: bool,
    emit_hashes: bool,
    copy_hashes_for_readback: bool,
    completion_tx: Option<&Sender<CompletionEvent>>,
) -> Result<CommandBuffer> {
    let log_dispatch = self.should_log_profile();
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    let params_buf = self
        .params_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    let hashes_private = self
        .hashes_private_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes private slot {} missing", slot))?;
    let hashes_readback = self
        .hashes_readback_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes readback slot {} missing", slot))?;
    // Output target: CPU-readable shared buffer only when readback is wanted.
    let hashes_out = if copy_hashes_for_readback {
        hashes_readback
    } else {
        hashes_private
    };
    let found_flag = self
        .found_flag_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("found flag slot {} missing", slot))?;
    let found_data = self
        .found_data_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("found data slot {} missing", slot))?;
    let scratchpad = self
        .scratchpads_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;
    let header_len_u32 = u32::try_from(header_len)
        .map_err(|_| anyhow::anyhow!("header too large for gpu kernel: {}", header_len))?;
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len_u32,
        difficulty,
        emit_hashes: if emit_hashes { 1 } else { 0 },
        _pad0: 0,
        start_nonce,
    };
    // SAFETY: params_buf is a shared-storage buffer sized for one
    // KernelParams (see ensure_resources); no dispatch is in flight on this
    // slot, so the GPU is not reading it concurrently.
    unsafe {
        std::ptr::copy_nonoverlapping(
            &params as *const KernelParams,
            params_buf.contents() as *mut KernelParams,
            1,
        );
    }
    // SAFETY: both buffers are shared-storage and sized 4 / FOUND_DATA_BYTES
    // respectively; reset them so stale results from a previous dispatch on
    // this slot cannot be misread as a new proof.
    unsafe {
        *(found_flag.contents() as *mut u32) = 0u32;
        std::ptr::write_bytes(found_data.contents() as *mut u8, 0, FOUND_DATA_BYTES);
    }
    let command_buffer = self.queue.new_command_buffer().to_owned();
    // Debug labels only when profiling, to keep the hot path cheap.
    if log_dispatch {
        command_buffer.set_label(&format!(
            "uhash slot={} lanes={} nonce={}",
            slot, chunk_lanes, start_nonce
        ));
        command_buffer.push_debug_group("uhash.dispatch");
    }
    let encoder = command_buffer.new_compute_command_encoder();
    let pipeline = if proof_mode {
        &self.pipeline_proof
    } else {
        &self.pipeline_hash
    };
    encoder.set_compute_pipeline_state(pipeline);
    // Buffer indices must match the kernel's buffer bindings.
    encoder.set_buffer(0, Some(header_buf), 0);
    encoder.set_buffer(1, Some(hashes_out), 0);
    encoder.set_buffer(2, Some(scratchpad), 0);
    encoder.set_buffer(3, Some(params_buf), 0);
    encoder.set_buffer(4, Some(found_flag), 0);
    encoder.set_buffer(5, Some(found_data), 0);
    let tg_width = self.compute_tg_width(chunk_lanes, self.tg_factor);
    let grid = metal::MTLSize {
        width: chunk_lanes as u64,
        height: 1,
        depth: 1,
    };
    let tg = metal::MTLSize {
        width: tg_width,
        height: 1,
        depth: 1,
    };
    // dispatch_threads handles non-uniform grids, so chunk_lanes need not be
    // a multiple of tg_width.
    encoder.dispatch_threads(grid, tg);
    encoder.end_encoding();
    if log_dispatch {
        command_buffer.pop_debug_group();
    }
    // Optional async completion: the handler runs on a Metal-owned thread
    // and reports (slot, status) back over the channel.
    if let Some(tx) = completion_tx {
        let tx = tx.clone();
        let block = ConcreteBlock::new(move |cb: &CommandBufferRef| {
            let _ = tx.send((slot, cb.status()));
        })
        .copy();
        command_buffer.add_completed_handler(&block);
    }
    command_buffer.commit();
    Ok(command_buffer)
}
/// Copy `pending.lanes` 32-byte hashes from the slot's shared readback
/// buffer into `out_hashes[pending.output_lane_offset ..]`.
///
/// Callers must only invoke this after the command buffer behind `pending`
/// has reported completion, otherwise the GPU may still be writing.
///
/// # Errors
/// Fails when the slot is missing, or when the destination range or source
/// buffer is too small for the copy — previously such an inconsistency
/// would have caused an out-of-bounds write inside the `unsafe` block.
#[allow(dead_code)]
fn collect_readback_without_wait(
    &self,
    pending: &PendingDispatch,
    out_hashes: &mut [[u8; 32]],
) -> Result<()> {
    let readback = self
        .hashes_readback_slots
        .get(pending._slot)
        .ok_or_else(|| anyhow::anyhow!("missing readback slot {}", pending._slot))?;
    let dst_offset = pending.output_lane_offset;
    let lane_count = pending.lanes;
    // Validate the destination range before touching raw pointers.
    let dst_end = dst_offset
        .checked_add(lane_count)
        .ok_or_else(|| anyhow::anyhow!("readback lane range overflows usize"))?;
    if dst_end > out_hashes.len() {
        anyhow::bail!(
            "readback range {}..{} exceeds output capacity {}",
            dst_offset,
            dst_end,
            out_hashes.len()
        );
    }
    let byte_len = lane_count
        .checked_mul(32)
        .ok_or_else(|| anyhow::anyhow!("readback byte length overflows usize"))?;
    if (readback.length() as usize) < byte_len {
        anyhow::bail!(
            "readback buffer too small: {} < {}",
            readback.length(),
            byte_len
        );
    }
    // SAFETY: the destination range lies within `out_hashes` and the source
    // buffer holds at least `byte_len` bytes (both checked above); the
    // regions cannot overlap since one is a Metal allocation and the other a
    // Rust slice.
    unsafe {
        std::ptr::copy_nonoverlapping(
            readback.contents() as *const u8,
            out_hashes.as_mut_ptr().add(dst_offset) as *mut u8,
            byte_len,
        );
    }
    Ok(())
}
/// Block until the next GPU completion handler reports (slot, status).
/// Errors only when the channel is closed, i.e. every sender (each pending
/// dispatch's handler) has been dropped.
fn await_completion_event(
    &self,
    rx: &Receiver<CompletionEvent>,
) -> Result<(usize, MTLCommandBufferStatus)> {
    match rx.recv() {
        Ok(event) => Ok(event),
        Err(e) => Err(anyhow::anyhow!("completion channel closed: {}", e)),
    }
}
/// Hash `lanes` consecutive nonces starting at `start_nonce` and return all
/// per-lane 32-byte hashes, pipelining chunks across the in-flight slots.
///
/// The batch is split into `chunk_lanes`-sized dispatches; each completed
/// chunk's hashes are copied from its readback buffer into the right offset
/// of the output vector.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
#[allow(dead_code)]
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut out_hashes = vec![[0u8; 32]; lanes];
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let start_submit = Instant::now();
    let mut completion_waits = 0usize;
    // Submit-then-reap loop: fill every free slot, then block on one
    // completion event, recycle that slot, repeat until all lanes are done.
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                0,
                false,
                true,
                true,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        completion_waits = completion_waits.saturating_add(1);
        let done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        // Safe to read: the command buffer for this slot has completed.
        self.collect_readback_without_wait(&done, &mut out_hashes)?;
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    if self.should_log_profile() {
        let elapsed = start_submit.elapsed().as_secs_f64();
        eprintln!(
            "[metal-prof] compute_batch_hashes lanes={} chunk={} inflight={} elapsed={:.4}s completion_waits={}",
            lanes, chunk_lanes, self.inflight_slots, elapsed, completion_waits
        );
    }
    Ok(out_hashes)
}
/// Hash `lanes` consecutive nonces without reading results back (throughput
/// benchmarking): same pipelined submit/reap loop as `compute_batch_hashes`
/// but with hash emission and readback disabled. Returns `lanes` on success.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
fn compute_batch_count(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    if lanes == 0 {
        return Ok(0);
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let start_submit = Instant::now();
    let mut completion_waits = 0usize;
    // Submit-then-reap loop: keep every slot busy, block on one completion,
    // recycle that slot, repeat until all lanes are covered.
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                0,
                false,
                false,
                false,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: 0,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        completion_waits = completion_waits.saturating_add(1);
        // Taking the pending record drops its command buffer reference.
        let _done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    if self.should_log_profile() {
        let elapsed = start_submit.elapsed().as_secs_f64();
        eprintln!(
            "[metal-prof] compute_batch_count lanes={} chunk={} inflight={} elapsed={:.4}s completion_waits={}",
            lanes, chunk_lanes, self.inflight_slots, elapsed, completion_waits
        );
    }
    Ok(lanes)
}
/// Search `lanes` consecutive nonces (starting at `start_nonce`) for a proof
/// meeting `difficulty`, using the proof-mode pipeline. Returns the first
/// `(nonce, hash)` reported by the GPU, or `None` when the whole range is
/// exhausted. All submitted chunks are drained even after a hit so no slot
/// is left with work in flight.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
pub fn find_proof_batch(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
    difficulty: u32,
) -> Result<Option<(u64, [u8; 32])>> {
    if lanes == 0 {
        return Ok(None);
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let mut found: Option<(u64, [u8; 32])> = None;
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                difficulty,
                true,
                false,
                false,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        let _done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        // First hit wins; later completions are only drained, not inspected.
        if found.is_none() {
            let found_flag = self
                .found_flag_slots
                .get(slot)
                .ok_or_else(|| anyhow::anyhow!("missing found flag slot {}", slot))?;
            let found_data = self
                .found_data_slots
                .get(slot)
                .ok_or_else(|| anyhow::anyhow!("missing found data slot {}", slot))?;
            // SAFETY: shared-storage buffers of 4 / FOUND_DATA_BYTES bytes,
            // and this slot's command buffer has completed, so the GPU is
            // done writing them.
            unsafe {
                let flag = *(found_flag.contents() as *const u32);
                if flag != 0 {
                    // Record layout: 8-byte LE nonce followed by 32-byte hash.
                    let mut nonce_bytes = [0u8; 8];
                    std::ptr::copy_nonoverlapping(
                        found_data.contents() as *const u8,
                        nonce_bytes.as_mut_ptr(),
                        8,
                    );
                    let mut hash = [0u8; 32];
                    std::ptr::copy_nonoverlapping(
                        (found_data.contents() as *const u8).add(8),
                        hash.as_mut_ptr(),
                        32,
                    );
                    let nonce = u64::from_le_bytes(nonce_bytes);
                    // Sanity-check the reported nonce against the batch range.
                    // NOTE(review): the upper bound admits start_nonce + lanes
                    // (one past the dispatched range, which ends at
                    // start_nonce + lanes - 1), and the `>= && <` pair
                    // misbehaves if the range wraps u64 — confirm intended.
                    if nonce >= start_nonce
                        && nonce < start_nonce.wrapping_add(lanes as u64).wrapping_add(1)
                    {
                        found = Some((nonce, hash));
                    }
                }
            }
        }
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    Ok(found)
}
}
/// Metal Shading Language source for the UHash kernel, compiled at runtime
/// with `FunctionConstantValues` (see pipeline construction in this file).
///
/// The shader-side constants mirror the Rust-side ones: `CHAINS` = 4 and
/// `SCRATCHPAD_SIZE` = 524288 match the `CHAINS`/`SCRATCHPAD_SIZE` consts at
/// the top of this file, and the shader `KernelParams` struct must stay
/// field-for-field and layout-compatible with the `#[repr(C)] KernelParams`
/// Rust struct (lanes/rounds/header_len/difficulty/emit_hashes/_pad0 as
/// 32-bit, then a 64-bit start_nonce).
///
/// Function constants (bound by index at pipeline creation):
/// - index 0, `FC_VECTOR_BLOCK_IO`: use `uchar4` vector loads/stores for
///   64-byte scratchpad blocks instead of byte-wise copies.
/// - index 1, `FC_UNROLL_ROUNDS`: manually 2x-unrolled round loop.
/// - index 2, `FC_PROOF_MODE`: difficulty checking + winner reporting via
///   `found_flag`/`found_data` instead of writing hashes to `hashes_out`.
///
/// Per lane the kernel: derives 4 chain states with BLAKE3 over
/// header||nonce (nonce perturbed per chain by `GOLDEN_RATIO`), fills each
/// 512 KiB scratchpad with AES-expanded blocks, runs `params.rounds` mixing
/// rounds that rotate between AES/SHA-256/BLAKE3 compression primitives,
/// XOR-combines the chain states and finalizes with SHA-256 then BLAKE3.
/// `found_data` layout matches `FOUND_DATA_BYTES` = 8 (LE nonce) + 32 (hash).
///
/// NOTE(review): in proof mode, threads with `gid >= params.lanes` return
/// before the `threadgroup_barrier` calls near the end of the kernel; if a
/// dispatch lets a threadgroup straddle the lane boundary that barrier is
/// non-uniform (undefined behavior per the MSL spec). Confirm the host side
/// always pads the grid so whole threadgroups are in-bounds.
///
/// Do not edit this string casually: it is consensus-critical — the tests
/// below assert bit-exact agreement with the CPU `UniversalHash`.
const METAL_SHADER_SOURCE: &str = r#"
#include <metal_stdlib>
using namespace metal;
constant uint CHAINS = 4;
constant uint SCRATCHPAD_SIZE = 524288;
constant uint BLOCK_SIZE = 64;
constant uint BLOCKS_PER_SCRATCHPAD = 8192;
constant uint ADDRESS_MASK = 8191;
constant ulong GOLDEN_RATIO = 0x9E3779B97F4A7C15ul;
constant bool FC_VECTOR_BLOCK_IO [[function_constant(0)]];
constant bool FC_UNROLL_ROUNDS [[function_constant(1)]];
constant bool FC_PROOF_MODE [[function_constant(2)]];
constant uint B3_CHUNK_START = 1;
constant uint B3_CHUNK_END = 2;
constant uint B3_ROOT = 8;
struct KernelParams {
uint lanes;
uint rounds;
uint header_len;
uint difficulty;
uint emit_hashes;
uint _pad0;
ulong start_nonce;
};
constant uchar SBOX[256] = {
0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
};
constant uint SHA256_K[64] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};
constant uint SHA256_IV[8] = {
0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
};
constant uint B3_IV[8] = {0x6A09E667,0xBB67AE85,0x3C6EF372,0xA54FF53A,0x510E527F,0x9B05688C,0x1F83D9AB,0x5BE0CD19};
constant ushort B3_S[7][16] = {
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
{2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8},
{3,4,10,12,13,2,7,14,6,5,9,0,11,15,8,1},
{10,7,12,9,14,3,13,15,4,0,11,2,5,8,1,6},
{12,13,9,11,15,10,14,8,7,2,5,3,0,1,6,4},
{9,14,11,5,8,12,15,1,13,3,0,10,2,6,4,7},
{11,15,5,0,1,9,8,6,14,10,2,12,3,4,7,13}
};
inline uchar gf_mul2(uchar x) { uchar hi = x >> 7; return (uchar)((x << 1) ^ (hi * 0x1b)); }
inline uchar gf_mul3(uchar x) { return (uchar)(gf_mul2(x) ^ x); }
inline void aesenc_round(thread uchar state[16], thread const uchar rk[16]) {
uchar s[16];
for (uint i = 0; i < 16; i++) s[i] = SBOX[state[i]];
uchar t[16];
for (uint i = 0; i < 16; i++) t[i] = s[i];
s[1]=t[5]; s[5]=t[9]; s[9]=t[13]; s[13]=t[1];
s[2]=t[10]; s[6]=t[14]; s[10]=t[2]; s[14]=t[6];
s[3]=t[15]; s[7]=t[3]; s[11]=t[7]; s[15]=t[11];
uchar out[16];
for (uint c = 0; c < 4; c++) {
uint i = c * 4;
uchar a0=s[i], a1=s[i+1], a2=s[i+2], a3=s[i+3];
out[i] = (uchar)(gf_mul2(a0) ^ gf_mul3(a1) ^ a2 ^ a3);
out[i+1] = (uchar)(a0 ^ gf_mul2(a1) ^ gf_mul3(a2) ^ a3);
out[i+2] = (uchar)(a0 ^ a1 ^ gf_mul2(a2) ^ gf_mul3(a3));
out[i+3] = (uchar)(gf_mul3(a0) ^ a1 ^ a2 ^ gf_mul2(a3));
}
for (uint i = 0; i < 16; i++) state[i] = (uchar)(out[i] ^ rk[i]);
}
inline void aes_expand_block(thread uchar state[16], thread const uchar key[16]) {
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
}
inline uchar4 make_u4(thread const uchar a[16], uint i) {
return uchar4(a[i], a[i+1], a[i+2], a[i+3]);
}
inline void write_scratch_block(device uchar* dst, thread const uchar st[16], thread const uchar st2[16]) {
if (FC_VECTOR_BLOCK_IO) {
device uchar4* d4 = (device uchar4*)dst;
d4[0] = make_u4(st, 0); d4[1] = make_u4(st, 4); d4[2] = make_u4(st, 8); d4[3] = make_u4(st, 12);
d4[4] = make_u4(st2, 0); d4[5] = make_u4(st2, 4); d4[6] = make_u4(st2, 8); d4[7] = make_u4(st2, 12);
d4[8] = make_u4(st, 0); d4[9] = make_u4(st, 4); d4[10] = make_u4(st, 8); d4[11] = make_u4(st, 12);
d4[12] = make_u4(st2, 0); d4[13] = make_u4(st2, 4); d4[14] = make_u4(st2, 8); d4[15] = make_u4(st2, 12);
} else {
for(uint i=0;i<16;i++) {
dst[i] = st[i];
dst[16+i] = st2[i];
dst[32+i] = st[i];
dst[48+i] = st2[i];
}
}
}
inline void load_scratch_block(const device uchar* src, thread uchar block[64]) {
if (FC_VECTOR_BLOCK_IO) {
device const uchar4* src4 = (device const uchar4*)src;
for (uint i=0; i<16; i++) {
uchar4 v = src4[i];
uint o = i * 4;
block[o] = v.x;
block[o+1] = v.y;
block[o+2] = v.z;
block[o+3] = v.w;
}
} else {
for(uint i=0;i<64;i++) block[i] = src[i];
}
}
inline uint rotr(uint x, uint n) { return (x >> n) | (x << (32 - n)); }
inline uint rd_be32(thread const uchar* p) {
return ((uint)p[0] << 24) | ((uint)p[1] << 16) | ((uint)p[2] << 8) | (uint)p[3];
}
inline ulong rd_le64(thread const uchar* p) {
return (ulong)p[0] | ((ulong)p[1] << 8) | ((ulong)p[2] << 16) | ((ulong)p[3] << 24) |
((ulong)p[4] << 32) | ((ulong)p[5] << 40) | ((ulong)p[6] << 48) | ((ulong)p[7] << 56);
}
inline void wr_be32(thread uchar* p, uint v) {
p[0] = (uchar)(v >> 24); p[1] = (uchar)(v >> 16); p[2] = (uchar)(v >> 8); p[3] = (uchar)(v);
}
inline void wr_le32(thread uchar* p, uint v) {
p[0] = (uchar)(v); p[1] = (uchar)(v >> 8); p[2] = (uchar)(v >> 16); p[3] = (uchar)(v >> 24);
}
inline void sha256_compress(thread const uchar state_b[32], thread const uchar block[64], thread uchar out[32]) {
uint s[8];
for (uint i=0;i<8;i++) s[i]=rd_be32(&state_b[i*4]);
uint w[64];
for (uint i=0;i<16;i++) w[i]=rd_be32(&block[i*4]);
for (uint i=16;i<64;i++) {
uint s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
uint s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
uint a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
for (uint i=0;i<64;i++) {
uint S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
uint ch = (e & f) ^ ((~e) & g);
uint t1 = h + S1 + ch + SHA256_K[i] + w[i];
uint S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
uint maj = (a & b) ^ (a & c) ^ (b & c);
uint t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
s[0]+=a; s[1]+=b; s[2]+=c; s[3]+=d; s[4]+=e; s[5]+=f; s[6]+=g; s[7]+=h;
for (uint i=0;i<8;i++) wr_be32(&out[i*4], s[i]);
}
inline void blake3_g(thread uint v[16], uint a, uint b, uint c, uint d, uint mx, uint my) {
v[a] = v[a] + v[b] + mx;
v[d] = rotr(v[d] ^ v[a], 16);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 12);
v[a] = v[a] + v[b] + my;
v[d] = rotr(v[d] ^ v[a], 8);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 7);
}
inline void blake3_compress(thread const uchar state_b[32], thread const uchar block[64], thread uchar out[32]) {
uint h[8];
uint m[16];
for (uint i=0;i<8;i++) h[i] = (uint)state_b[i*4] | ((uint)state_b[i*4+1]<<8) | ((uint)state_b[i*4+2]<<16) | ((uint)state_b[i*4+3]<<24);
for (uint i=0;i<16;i++) m[i] = (uint)block[i*4] | ((uint)block[i*4+1]<<8) | ((uint)block[i*4+2]<<16) | ((uint)block[i*4+3]<<24);
uint v[16];
for(uint i=0;i<8;i++){ v[i]=h[i]; v[i+8]=B3_IV[i]; }
for(uint r=0;r<7;r++){
blake3_g(v,0,4,8,12,m[B3_S[r][0]],m[B3_S[r][1]]);
blake3_g(v,1,5,9,13,m[B3_S[r][2]],m[B3_S[r][3]]);
blake3_g(v,2,6,10,14,m[B3_S[r][4]],m[B3_S[r][5]]);
blake3_g(v,3,7,11,15,m[B3_S[r][6]],m[B3_S[r][7]]);
blake3_g(v,0,5,10,15,m[B3_S[r][8]],m[B3_S[r][9]]);
blake3_g(v,1,6,11,12,m[B3_S[r][10]],m[B3_S[r][11]]);
blake3_g(v,2,7,8,13,m[B3_S[r][12]],m[B3_S[r][13]]);
blake3_g(v,3,4,9,14,m[B3_S[r][14]],m[B3_S[r][15]]);
}
for (uint i=0;i<8;i++) {
uint x = v[i] ^ v[i+8];
wr_le32(&out[i*4], x);
}
}
inline void blake3_compress_hash(
thread const uint cv[8],
thread const uint block_words[16],
uint counter_low,
uint counter_high,
uint block_len,
uint flags,
thread uint out[16]
) {
uint v[16];
for(uint i=0;i<8;i++){ v[i]=cv[i]; }
v[8] = B3_IV[0];
v[9] = B3_IV[1];
v[10] = B3_IV[2];
v[11] = B3_IV[3];
v[12] = counter_low;
v[13] = counter_high;
v[14] = block_len;
v[15] = flags;
for(uint r=0;r<7;r++){
blake3_g(v,0,4,8,12,block_words[B3_S[r][0]],block_words[B3_S[r][1]]);
blake3_g(v,1,5,9,13,block_words[B3_S[r][2]],block_words[B3_S[r][3]]);
blake3_g(v,2,6,10,14,block_words[B3_S[r][4]],block_words[B3_S[r][5]]);
blake3_g(v,3,7,11,15,block_words[B3_S[r][6]],block_words[B3_S[r][7]]);
blake3_g(v,0,5,10,15,block_words[B3_S[r][8]],block_words[B3_S[r][9]]);
blake3_g(v,1,6,11,12,block_words[B3_S[r][10]],block_words[B3_S[r][11]]);
blake3_g(v,2,7,8,13,block_words[B3_S[r][12]],block_words[B3_S[r][13]]);
blake3_g(v,3,4,9,14,block_words[B3_S[r][14]],block_words[B3_S[r][15]]);
}
for (uint i=0;i<8;i++) {
out[i] = v[i] ^ v[i+8];
out[i+8] = v[i+8] ^ cv[i];
}
}
inline void blake3_compress_inplace(
thread uint cv[8],
thread const uint block_words[16],
uint counter_low,
uint counter_high,
uint block_len,
uint flags
) {
uint out_words[16];
blake3_compress_hash(cv, block_words, counter_low, counter_high, block_len, flags, out_words);
for (uint i=0;i<8;i++) cv[i] = out_words[i];
}
inline uchar header_nonce_byte(const device uchar* header, uint header_len, ulong nonce, uint idx) {
if (idx < header_len) {
return header[idx];
}
uint j = idx - header_len;
if (j < 8) {
return (uchar)((nonce >> (j * 8)) & 0xfful);
}
return 0;
}
inline void blake3_hash_header_nonce(
const device uchar* header,
uint header_len,
ulong nonce,
thread uchar out[32]
) {
uint total_len = header_len + 8;
uint blocks = (total_len + 63) / 64;
uint cv[8];
for (uint i=0;i<8;i++) cv[i] = B3_IV[i];
for (uint b = 0; b < blocks; b++) {
uint off = b * 64;
uint block_len = min((uint)64, total_len - off);
uint words[16];
for (uint i=0;i<16;i++) words[i] = 0;
for (uint i=0;i<block_len;i++) {
uchar by = header_nonce_byte(header, header_len, nonce, off + i);
words[i / 4] |= ((uint)by) << ((i % 4) * 8);
}
uint flags = 0;
if (b == 0) flags |= B3_CHUNK_START;
bool last = (b + 1 == blocks);
if (last) flags |= B3_CHUNK_END;
if (!last) {
blake3_compress_inplace(cv, words, 0, 0, block_len, flags);
} else {
uint out_words[16];
blake3_compress_hash(cv, words, 0, 0, block_len, flags | B3_ROOT, out_words);
for (uint i=0;i<8;i++) wr_le32(&out[i*4], out_words[i]);
}
}
}
inline void sha256_digest_32(thread const uchar input[32], thread uchar out[32]) {
uint state[8];
for (uint i=0;i<8;i++) state[i] = SHA256_IV[i];
uchar block[64];
for (uint i=0;i<32;i++) block[i] = input[i];
block[32] = 0x80;
for (uint i=33;i<56;i++) block[i] = 0;
block[56] = 0;
block[57] = 0;
block[58] = 0;
block[59] = 0;
block[60] = 0;
block[61] = 0;
block[62] = 1;
block[63] = 0;
uint w[64];
for (uint i=0;i<16;i++) w[i]=rd_be32(&block[i*4]);
for (uint i=16;i<64;i++) {
uint s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
uint s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
uint a=state[0],b=state[1],c=state[2],d=state[3],e=state[4],f=state[5],g=state[6],h=state[7];
for (uint i=0;i<64;i++) {
uint S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
uint ch = (e & f) ^ ((~e) & g);
uint t1 = h + S1 + ch + SHA256_K[i] + w[i];
uint S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
uint maj = (a & b) ^ (a & c) ^ (b & c);
uint t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
state[0]+=a; state[1]+=b; state[2]+=c; state[3]+=d;
state[4]+=e; state[5]+=f; state[6]+=g; state[7]+=h;
for (uint i=0;i<8;i++) wr_be32(&out[i*4], state[i]);
}
inline void blake3_hash_32(thread const uchar input[32], thread uchar out[32]) {
uint cv[8];
for (uint i=0;i<8;i++) cv[i] = B3_IV[i];
uint block_words[16];
for (uint i=0;i<8;i++) {
block_words[i] =
(uint)input[i*4] |
((uint)input[i*4+1] << 8) |
((uint)input[i*4+2] << 16) |
((uint)input[i*4+3] << 24);
}
for (uint i=8;i<16;i++) block_words[i] = 0;
uint out_words[16];
blake3_compress_hash(cv, block_words, 0, 0, 32, B3_CHUNK_START | B3_CHUNK_END | B3_ROOT, out_words);
for (uint i=0;i<8;i++) wr_le32(&out[i*4], out_words[i]);
}
inline uint leading_zeros_u8(uchar b) {
uint n = 0;
for (uint bit = 0; bit < 8; bit++) {
if ((b & (uchar)(0x80 >> bit)) != 0) return n;
n++;
}
return 8;
}
inline bool meets_difficulty_bits(thread const uchar hash[32], uint difficulty) {
uint zero_bits = 0;
for (uint i=0; i<32; i++) {
uchar b = hash[i];
if (b == 0) {
zero_bits += 8;
} else {
zero_bits += leading_zeros_u8(b);
break;
}
if (zero_bits >= difficulty) return true;
}
return zero_bits >= difficulty;
}
inline uint compute_addr(thread const uchar state[32], uint round) {
const ulong C = 0x517cc1b727220a95ul;
ulong lo = rd_le64(&state[0]);
ulong hi = rd_le64(&state[8]);
ulong ru = (ulong)round;
ulong mixed = lo ^ hi ^ ((ru << 13) | (ru >> (64 - 13))) ^ (ru * C);
return (uint)((mixed & ADDRESS_MASK) * BLOCK_SIZE);
}
inline void uhash_round(
device uchar* chain_scratch,
thread uchar state[32],
uint initial_primitive,
uint r
) {
uint addr = compute_addr(state, r);
uchar block[64];
load_scratch_block(chain_scratch + addr, block);
uint primitive = (initial_primitive + r + 1) % 3;
uchar new_state[32];
if (primitive == 0) {
uchar lo[16]; uchar hi[16];
for(uint i=0;i<16;i++){ lo[i]=state[i]; hi[i]=state[16+i]; }
aesenc_round(lo, &block[0]);
aesenc_round(lo, &block[16]);
aesenc_round(lo, &block[32]);
aesenc_round(lo, &block[48]);
aesenc_round(hi, &block[32]);
aesenc_round(hi, &block[48]);
aesenc_round(hi, &block[0]);
aesenc_round(hi, &block[16]);
for(uint i=0;i<16;i++){ new_state[i]=lo[i]; new_state[16+i]=hi[i]; }
} else if (primitive == 1) {
sha256_compress(state, block, new_state);
} else {
blake3_compress(state, block, new_state);
}
for(uint i=0;i<32;i++) chain_scratch[addr+i] = new_state[i];
for(uint i=0;i<32;i++) state[i] = new_state[i];
}
kernel void uhash_kernel(
const device uchar* header [[buffer(0)]],
device uchar* hashes_out [[buffer(1)]],
device uchar* scratchpads [[buffer(2)]],
const device KernelParams& params [[buffer(3)]],
device atomic_uint* found_flag [[buffer(4)]],
device uchar* found_data [[buffer(5)]],
uint gid [[thread_position_in_grid]],
uint tid [[thread_index_in_threadgroup]]
) {
if (gid >= params.lanes) return;
device uchar* lane_scratch = scratchpads + (ulong)gid * (ulong)(CHAINS * SCRATCHPAD_SIZE);
uchar states[CHAINS][32];
ulong nonce_gid = params.start_nonce + (ulong)gid;
for (uint c = 0; c < CHAINS; c++) {
ulong modified_nonce = nonce_gid ^ ((ulong)c * GOLDEN_RATIO);
blake3_hash_header_nonce(header, params.header_len, modified_nonce, states[c]);
uchar key[16];
uchar st[16];
for(uint i=0;i<16;i++){ key[i]=states[c][i]; st[i]=states[c][16+i]; }
device uchar* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
for (uint b=0; b < BLOCKS_PER_SCRATCHPAD; b++) {
aes_expand_block(st, key);
uint off = b * BLOCK_SIZE;
uchar st2[16];
for(uint i=0;i<16;i++) st2[i]=st[i];
aes_expand_block(st2, key);
write_scratch_block(chain_scratch + off, st, st2);
}
}
for (uint c = 0; c < CHAINS; c++) {
device uchar* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
uint initial_primitive = (uint)((nonce_gid + (ulong)c) % 3ul);
if (FC_UNROLL_ROUNDS) {
uint r = 0;
for (; r + 1 < params.rounds; r += 2) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
uhash_round(chain_scratch, states[c], initial_primitive, r + 1);
}
if (r < params.rounds) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
}
} else {
for (uint r=0; r<params.rounds; r++) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
}
}
}
uchar combined[32];
for(uint i=0;i<32;i++) combined[i] = 0;
for(uint c=0;c<CHAINS;c++) for(uint i=0;i<32;i++) combined[i] ^= states[c][i];
uchar sha_out[32];
sha256_digest_32(combined, sha_out);
uchar final_out[32];
blake3_hash_32(sha_out, final_out);
if (!FC_PROOF_MODE && params.emit_hashes != 0u) {
device uchar* out = hashes_out + gid * 32;
for(uint i=0;i<32;i++) out[i] = final_out[i];
}
if (FC_PROOF_MODE && params.difficulty > 0u) {
threadgroup atomic_uint tg_found;
threadgroup uint tg_nonce_lo;
threadgroup uint tg_nonce_hi;
threadgroup uchar tg_hash[32];
if (tid == 0) {
atomic_store_explicit(&tg_found, 0u, memory_order_relaxed);
tg_nonce_lo = 0u;
tg_nonce_hi = 0u;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (meets_difficulty_bits(final_out, params.difficulty)) {
uint expected_local = 0u;
if (atomic_compare_exchange_weak_explicit(
&tg_found, &expected_local, 1u, memory_order_relaxed, memory_order_relaxed)) {
tg_nonce_lo = (uint)(nonce_gid & 0xfffffffful);
tg_nonce_hi = (uint)(nonce_gid >> 32);
for (uint i=0; i<32; i++) tg_hash[i] = final_out[i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tid == 0) {
uint local_found = atomic_load_explicit(&tg_found, memory_order_relaxed);
if (local_found != 0u) {
uint expected = 0u;
if (atomic_compare_exchange_weak_explicit(
found_flag, &expected, 1u, memory_order_relaxed, memory_order_relaxed)) {
ulong winner_nonce = ((ulong)tg_nonce_hi << 32) | (ulong)tg_nonce_lo;
for (uint i=0; i<8; i++) {
found_data[i] = (uchar)((winner_nonce >> (i * 8)) & 0xfful);
}
for (uint i=0; i<32; i++) {
found_data[8 + i] = tg_hash[i];
}
}
}
}
}
}
"#;
/// `Solver` trait adapter for `MetalMiner`.
///
/// Every method forwards to the inherent `MetalMiner` method of the same name
/// and signature. Rust method resolution prefers inherent methods over trait
/// methods, so `self.method(..)` below calls the inherent implementation and
/// does not recurse back into this trait impl.
impl crate::solver::Solver for MetalMiner {
// Stable identifier used to select/report this backend.
fn backend_name(&self) -> &'static str {
"metal"
}
// Delegates to the inherent `recommended_lanes` (device-tuned lane count).
fn recommended_lanes(&mut self, requested: usize) -> usize {
self.recommended_lanes(requested)
}
// Delegates to the inherent `find_proof_batch`.
fn find_proof_batch(
&mut self,
header_without_nonce: &[u8],
start_nonce: u64,
lanes: usize,
difficulty: u32,
) -> anyhow::Result<crate::solver::ProofResult> {
self.find_proof_batch(header_without_nonce, start_nonce, lanes, difficulty)
}
// Delegates to the inherent `benchmark_hashes`.
fn benchmark_hashes(
&mut self,
header_without_nonce: &[u8],
start_nonce: u64,
lanes: usize,
) -> anyhow::Result<usize> {
self.benchmark_hashes(header_without_nonce, start_nonce, lanes)
}
}
#[cfg(test)]
mod tests {
    use super::MetalMiner;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    /// Reference hash: `UniversalHash(header || nonce_le)` on the CPU.
    fn cpu_hash(header: &[u8], nonce: u64) -> [u8; 32] {
        let input: Vec<u8> = header
            .iter()
            .copied()
            .chain(nonce.to_le_bytes())
            .collect();
        UniversalHash::new().hash(&input)
    }

    #[test]
    fn metal_hash_matches_cpu_for_single_nonce() {
        let mut miner = MetalMiner::new().expect("metal init");
        // 32-byte prefix + address + LE timestamp, built as one iterator chain.
        let header: Vec<u8> = [0xABu8; 32]
            .iter()
            .copied()
            .chain(b"bostrom1testaddress".iter().copied())
            .chain(1_700_000_000u64.to_le_bytes())
            .collect();
        let nonce = 42u64;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        assert_eq!(gpu_hashes[0], cpu_hash(&header, nonce));
    }

    #[test]
    fn metal_hash_matches_cpu_for_header_size_vectors() {
        let mut miner = MetalMiner::new().expect("metal init");
        // Lengths straddling SHA-256/BLAKE3 block boundaries (55/56, 63/64/65, 127/128).
        for header_len in [1usize, 55, 56, 63, 64, 65, 127, 128] {
            let header: Vec<u8> = (0..header_len)
                .map(|i| ((i * 17 + header_len) & 0xFF) as u8)
                .collect();
            let start_nonce = 1000u64 + header_len as u64;
            let lanes = 4usize;
            let gpu = miner
                .compute_batch_hashes(&header, start_nonce, lanes)
                .expect("gpu batch");
            assert_eq!(gpu.len(), lanes);
            for (lane, got) in gpu.iter().enumerate() {
                let nonce = start_nonce.wrapping_add(lane as u64);
                let want = cpu_hash(&header, nonce);
                assert_eq!(
                    *got, want,
                    "header_len={} lane={}",
                    header_len, lane
                );
            }
        }
    }

    #[test]
    fn metal_hash_matches_cpu_for_wrapping_nonce_window() {
        let mut miner = MetalMiner::new().expect("metal init");
        let header = b"bostrom1wrapping-nonce-window".to_vec();
        // A window that wraps past u64::MAX must still match the CPU lane-by-lane.
        let start_nonce = u64::MAX - 2;
        let lanes = 5usize;
        let gpu = miner
            .compute_batch_hashes(&header, start_nonce, lanes)
            .expect("gpu wrapping batch");
        assert_eq!(gpu.len(), lanes);
        for (lane, got) in gpu.iter().enumerate() {
            let nonce = start_nonce.wrapping_add(lane as u64);
            let want = cpu_hash(&header, nonce);
            assert_eq!(*got, want, "lane={} nonce={}", lane, nonce);
        }
    }

    #[test]
    #[ignore = "performance/stress profile for local metal throughput validation"]
    fn metal_release_throughput_profile() {
        let mut miner = MetalMiner::new().expect("metal init");
        let header: Vec<u8> = [0xCDu8; 32]
            .iter()
            .copied()
            .chain(b"bostrom1metalprofile".iter().copied())
            .chain(1_700_000_001u64.to_le_bytes())
            .collect();
        let mut nonce = 0u64;
        for lanes in [64usize, 128, 256, 512, 1024] {
            // One untimed warm-up batch so pipeline/allocation costs stay out of timing.
            let _ = miner
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            // Fewer repetitions for the largest lane set to keep runtime bounded.
            let batches_per_set: u64 = if lanes >= 1024 { 1 } else { 3 };
            let start = Instant::now();
            let mut computed = 0u64;
            for _ in 0..batches_per_set {
                let _ = miner
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 { computed as f64 / elapsed } else { 0.0 };
            eprintln!(
                "metal-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}