use anyhow::Result;
use opencl3::command_queue::{
CommandQueue, CL_BLOCKING, CL_NON_BLOCKING, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
CL_QUEUE_PROFILING_ENABLE,
};
use opencl3::context::Context;
use opencl3::device::{get_all_devices, Device, CL_DEVICE_TYPE_GPU};
use opencl3::event::Event;
use opencl3::kernel::{ExecuteKernel, Kernel};
use opencl3::memory::{Buffer, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE, CL_MEM_WRITE_ONLY};
use opencl3::program::Program;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::ptr;
use std::time::Instant;
use uhash_core::meets_difficulty;
use crate::solver::{ProofResult, Solver};
// Number of independent mixing chains per lane; multiplies into TOTAL_MEMORY.
const CHAINS: usize = 4;
// Scratchpad bytes per chain.
const SCRATCHPAD_SIZE: usize = 512 * 1024;
// Total scratchpad bytes required per lane (one nonce) on the device.
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
// Mixing rounds passed to the kernel for every hash (see KernelParams.rounds).
const ROUNDS: usize = 12_288;
// Default number of overlapped dispatch slots used to hide transfer latency.
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
// OpenCL kernel source embedded at compile time.
const OPENCL_KERNEL_SOURCE: &str = include_str!("../../kernels/uhash.cl");
/// Per-dispatch parameters uploaded to the device.
///
/// `#[repr(C)]` so the byte layout matches the parameter struct the OpenCL
/// kernel reads; `_pad0` keeps `start_nonce` naturally 8-byte aligned.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct KernelParams {
    /// Number of lanes (nonces) this dispatch should compute.
    lanes: u32,
    /// Mixing rounds per hash; always set to `ROUNDS` by the dispatchers.
    rounds: u32,
    /// Byte length of the header (without the trailing nonce).
    header_len: u32,
    /// Explicit padding so the following u64 is aligned.
    _pad0: u32,
    /// First nonce of the dispatch; lane i presumably hashes
    /// `start_nonce + i` on the kernel side — confirm against uhash.cl.
    start_nonce: u64,
}
/// Bookkeeping for one in-flight asynchronous dispatch.
struct PendingDispatch {
    /// Buffer-slot index this dispatch occupies (indexes `h_hashes` etc.).
    slot: usize,
    /// Offset, in lanes, of this chunk's results within the batch output.
    output_lane_offset: usize,
    /// Number of lanes computed by this dispatch.
    lanes: usize,
    /// Event signalled once the readback into `h_hashes[slot]` completes.
    read_event: Event,
}
/// Device properties and current tuning state, serializable for reporting.
#[derive(Clone, Debug, Serialize)]
pub struct OpenClTelemetry {
    pub device_name: String,
    pub device_vendor: String,
    pub max_compute_units: u32,
    pub max_work_group_size: usize,
    /// Total device global memory in bytes.
    pub global_mem_bytes: u64,
    /// Number of overlapped dispatch slots currently configured.
    pub inflight_slots: usize,
    /// Lanes per dispatch chosen by heuristics/cache/profiling.
    pub tuned_chunk_lanes: usize,
    /// Upper bound on lanes per dispatch.
    pub max_chunk_lanes: usize,
    /// True once runtime profiling has run in this process.
    pub runtime_profiled: bool,
}
/// On-disk tuning cache (JSON, one file per device name).
///
/// `version` gates compatibility: loaders reject anything but the current
/// version (2), so schema changes require a version bump.
#[derive(Debug, Serialize, Deserialize)]
struct OpenClTuneCache {
    version: u32,
    /// Device the tuning was measured on (also embedded in the filename).
    device_name: String,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    work_group_size: usize,
}
/// OpenCL-backed uhash solver.
///
/// Keeps one set of device buffers per in-flight slot so several chunks can
/// be pipelined; `ensure_resources` (re)allocates them whenever the chunk
/// size or slot count changes.
pub struct OpenClSolver {
    context: Context,
    queue: CommandQueue,
    kernel: Kernel,
    // One device buffer per in-flight slot.
    d_params: Vec<Buffer<u8>>,
    d_hashes: Vec<Buffer<u8>>,
    d_scratchpads: Vec<Buffer<u8>>,
    // Device copy of the header (without nonce); grown on demand, never shrunk.
    d_header: Option<Buffer<u8>>,
    header_capacity: usize,
    // Host staging buffers for asynchronous hash readback, one per slot.
    h_hashes: Vec<Vec<u8>>,
    // Lane count the current buffers were sized for (0 = needs allocation).
    chunk_lanes: usize,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    work_group_size: usize,
    // True once runtime profiling has been performed for this process.
    did_runtime_profile: bool,
    // Cached device properties, surfaced via telemetry().
    device_name: String,
    device_vendor: String,
    max_compute_units: u32,
    max_work_group_size: usize,
    global_mem_bytes: u64,
}
impl OpenClSolver {
/// Creates a solver bound to the first OpenCL GPU device found.
///
/// Queries device properties for telemetry, compiles the embedded kernel,
/// creates a command queue (preferring out-of-order + profiling, with
/// fallbacks for drivers that reject those properties), and restores any
/// persisted tuning cache.
///
/// # Errors
/// Fails when no GPU is present, or when device queries, context/queue
/// creation, or kernel compilation fail.
pub fn new() -> Result<Self> {
    // Take the first enumerated GPU; fail cleanly if none is present.
    let device_id = *get_all_devices(CL_DEVICE_TYPE_GPU)
        .map_err(|e| anyhow::anyhow!("OpenCL device enumeration failed: {}", e))?
        .first()
        .ok_or_else(|| anyhow::anyhow!("No OpenCL GPU device found"))?;
    let device = Device::new(device_id);
    let device_name = device
        .name()
        .map_err(|e| anyhow::anyhow!("Failed to get device name: {}", e))?;
    let device_vendor = device
        .vendor()
        .map_err(|e| anyhow::anyhow!("Failed to get device vendor: {}", e))?;
    let max_compute_units = device
        .max_compute_units()
        .map_err(|e| anyhow::anyhow!("Failed to get max compute units: {}", e))?;
    let max_work_group_size = device
        .max_work_group_size()
        .map_err(|e| anyhow::anyhow!("Failed to get max work group size: {}", e))?;
    let global_mem_bytes = device
        .global_mem_size()
        .map_err(|e| anyhow::anyhow!("Failed to get global memory: {}", e))?;
    let context = Context::from_device(&device)
        .map_err(|e| anyhow::anyhow!("OpenCL context creation failed: {}", e))?;
    // Prefer an out-of-order profiling queue; some drivers reject these
    // properties, so fall back through progressively simpler creations.
    let queue = CommandQueue::create_default_with_properties(
        &context,
        CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
        0,
    )
    .or_else(|_| {
        CommandQueue::create_default_with_properties(&context, CL_QUEUE_PROFILING_ENABLE, 0)
    })
    .or_else(|_| CommandQueue::create_default(&context, CL_QUEUE_PROFILING_ENABLE))
    .map_err(|e| anyhow::anyhow!("OpenCL command queue creation failed: {}", e))?;
    let program = Program::create_and_build_from_source(&context, OPENCL_KERNEL_SOURCE, "")
        .map_err(|e| anyhow::anyhow!("OpenCL kernel compilation failed: {}", e))?;
    let kernel = Kernel::create(&program, "uhash_kernel")
        .map_err(|e| anyhow::anyhow!("Failed to create uhash_kernel: {}", e))?;
    // Buffers start empty; they are allocated lazily by ensure_resources.
    let mut solver = Self {
        context,
        queue,
        kernel,
        d_params: Vec::new(),
        d_hashes: Vec::new(),
        d_scratchpads: Vec::new(),
        d_header: None,
        header_capacity: 0,
        h_hashes: Vec::new(),
        chunk_lanes: 0,
        tuned_chunk_lanes: 0,
        max_chunk_lanes: 0,
        inflight_slots: DEFAULT_INFLIGHT_SLOTS,
        work_group_size: 256,
        did_runtime_profile: false,
        device_name,
        device_vendor,
        max_compute_units,
        max_work_group_size,
        global_mem_bytes,
    };
    // Restore cached tuning (if present and compatible) before first use.
    solver.try_load_tune_cache();
    Ok(solver)
}
/// Returns a snapshot of device properties and the current tuning state.
pub fn telemetry(&self) -> OpenClTelemetry {
    // The string fields need owned copies; everything else is Copy.
    let device_name = self.device_name.clone();
    let device_vendor = self.device_vendor.clone();
    OpenClTelemetry {
        device_name,
        device_vendor,
        max_compute_units: self.max_compute_units,
        max_work_group_size: self.max_work_group_size,
        global_mem_bytes: self.global_mem_bytes,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        runtime_profiled: self.did_runtime_profile,
    }
}
/// Computes the per-device tuning-cache path, or `None` when neither
/// `XDG_CONFIG_HOME` nor `HOME` is set.
fn tune_cache_path(&self) -> Option<PathBuf> {
    // Resolve the config root: $XDG_CONFIG_HOME first, else $HOME/.config.
    let config_root = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(dir) => PathBuf::from(dir),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    // Sanitize the device name so it is safe inside a filename.
    let mut safe_name = String::with_capacity(self.device_name.len());
    for c in self.device_name.chars() {
        safe_name.push(if c.is_alphanumeric() { c } else { '_' });
    }
    let file_name = format!("opencl_tuning_{}.json", safe_name);
    Some(config_root.join("uhash").join(file_name))
}
/// Restores tuning parameters from the on-disk cache, if present and valid.
///
/// Best-effort: silently returns on a missing file, parse failure, or
/// version mismatch, leaving heuristics / runtime profiling to decide.
fn try_load_tune_cache(&mut self) {
    let Some(path) = self.tune_cache_path() else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<OpenClTuneCache>(&raw) else {
        return;
    };
    // Only the current cache schema (version 2) is understood.
    if cache.version != 2 {
        return;
    }
    // Re-derive the heuristic upper bound so a stale cache cannot request
    // more lanes than the current device memory budget allows.
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must be multiples of 32; reject corrupted values outright.
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    self.work_group_size = cache.work_group_size.clamp(64, 1024);
}
/// Writes the current tuning configuration to the cache file.
///
/// Best-effort: serialization, directory-creation, and write errors are
/// all deliberately ignored — losing the cache only costs a re-profile.
fn persist_tune_cache(&self) {
    let path = match self.tune_cache_path() {
        Some(p) => p,
        None => return,
    };
    let cache = OpenClTuneCache {
        version: 2,
        device_name: self.device_name.clone(),
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        work_group_size: self.work_group_size,
    };
    if let Ok(raw) = serde_json::to_vec_pretty(&cache) {
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, raw);
    }
}
/// Derives `(tuned_chunk_lanes, max_chunk_lanes)` from device memory.
///
/// Budgets 70% of global memory across all in-flight slots, converts that
/// to a lane count, clamps it to [32, 8192], and rounds both results down
/// to multiples of 32.
fn autotune_chunk_limits(&self) -> (usize, usize) {
    // 70% of device global memory is the working budget.
    let budget = (self.global_mem_bytes as usize).saturating_mul(7) / 10;
    // Each lane needs a full scratchpad plus a 32-byte hash slot.
    let bytes_per_lane = TOTAL_MEMORY + 32;
    let lanes_by_mem = budget
        .saturating_div(bytes_per_lane.saturating_mul(self.inflight_slots))
        .max(1);
    let raw_max = lanes_by_mem.clamp(32, 8192);
    // Round down to multiples of 32; the tuned default also caps at 1024.
    let max_chunk = (raw_max / 32).max(1) * 32;
    let tuned = (raw_max.min(1024) / 32).max(1) * 32;
    (tuned, max_chunk)
}
/// Rounds `v` down to a positive multiple of `m` (never below `m` itself).
fn round_to_multiple(v: usize, m: usize) -> usize {
    let multiples = (v / m).max(1);
    multiples * m
}
/// Benchmarks a few work-group sizes and keeps the fastest.
///
/// Each candidate gets one warm-up dispatch and one timed dispatch.
/// `chunk_lanes` is reset to 0 afterwards so the next `ensure_resources`
/// call reallocates buffers for the winning configuration.
fn runtime_profile_work_group_size(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [64usize, 128, 256];
    let current = self.work_group_size;
    let mut best_wgs = current;
    let mut best_hps = 0.0f64;
    for &wgs in &candidates {
        // Skip sizes the device cannot support.
        if wgs > self.max_work_group_size {
            continue;
        }
        self.work_group_size = wgs;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Warm-up dispatch (untimed) to absorb first-run overheads.
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 1)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_wgs = wgs;
        }
    }
    self.work_group_size = best_wgs;
    // Force buffer reallocation on next use.
    self.chunk_lanes = 0;
    Ok(())
}
/// One-time runtime profiling: picks the fastest chunk size, then tunes
/// work-group size and in-flight slot count, and persists the result.
///
/// NOTE(review): this runs before the header is uploaded to the device
/// (see compute_batch_hashes ordering); that is fine for timing purposes
/// since throughput does not depend on header contents — confirm if the
/// kernel ever changes that.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Mark up-front so a failed profile is not retried on every batch.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: 1x, 2x, 3x the heuristic (rounded to multiples of 32 and
    // capped at max_chunk), plus max_chunk itself.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warm-up dispatch, then a timed one over a fresh nonce range.
        self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), lane as u64)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    // Tune the remaining knobs with the chosen chunk size.
    self.runtime_profile_work_group_size(header_without_nonce, self.tuned_chunk_lanes)?;
    self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
    // Force reallocation for the final configuration, then persist it.
    self.chunk_lanes = 0;
    self.persist_tune_cache();
    Ok(())
}
/// Allocates (or reallocates) device and host buffers sized for
/// `chunk_lanes` lanes across `self.inflight_slots` slots.
///
/// No-op when the existing allocation already matches. The header buffer
/// grows independently to the next power of two and is never shrunk.
///
/// # Errors
/// Fails on `chunk_lanes == 0` or any device allocation failure.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    if header_len > self.header_capacity {
        let new_capacity = header_len.next_power_of_two();
        // SAFETY: uninitialized device buffer created with a null host
        // pointer; it is written via enqueue_write_buffer before any
        // kernel reads it.
        self.d_header = Some(unsafe {
            Buffer::<u8>::create(
                &self.context,
                CL_MEM_READ_ONLY,
                new_capacity,
                ptr::null_mut(),
            )
            .map_err(|e| anyhow::anyhow!("Failed to alloc header buffer: {}", e))?
        });
        self.header_capacity = new_capacity;
    }
    // Fast path: the current allocation already fits this configuration.
    if self.chunk_lanes == chunk_lanes
        && self.d_params.len() == self.inflight_slots
        && self.d_hashes.len() == self.inflight_slots
        && self.d_scratchpads.len() == self.inflight_slots
        && self.h_hashes.len() == self.inflight_slots
    {
        return Ok(());
    }
    // 32 hash bytes per lane; one full scratchpad set per lane.
    let hash_bytes = chunk_lanes.saturating_mul(32);
    let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY);
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    // Drop the old buffers before allocating the new set.
    self.d_params.clear();
    self.d_hashes.clear();
    self.d_scratchpads.clear();
    self.h_hashes.clear();
    let params_size = std::mem::size_of::<KernelParams>();
    for _ in 0..self.inflight_slots {
        // SAFETY (all three creates): device buffers with null host
        // pointers; each is written before the kernel consumes it
        // (scratchpads are kernel-internal working memory).
        self.d_params.push(unsafe {
            Buffer::<u8>::create(
                &self.context,
                CL_MEM_READ_ONLY,
                params_size,
                ptr::null_mut(),
            )
            .map_err(|e| anyhow::anyhow!("Failed to alloc params: {}", e))?
        });
        self.d_hashes.push(unsafe {
            Buffer::<u8>::create(
                &self.context,
                CL_MEM_WRITE_ONLY,
                hash_bytes,
                ptr::null_mut(),
            )
            .map_err(|e| anyhow::anyhow!("Failed to alloc hashes: {}", e))?
        });
        self.d_scratchpads.push(unsafe {
            Buffer::<u8>::create(
                &self.context,
                CL_MEM_READ_WRITE,
                scratch_bytes,
                ptr::null_mut(),
            )
            .map_err(|e| anyhow::anyhow!("Failed to alloc scratchpads: {}", e))?
        });
        // Host staging buffer for this slot's hash readback.
        self.h_hashes.push(vec![0u8; hash_bytes]);
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Runs one chunk to completion on the device and returns the raw
/// 32-bytes-per-lane hash output.
///
/// This is the synchronous path used by runtime profiling; production
/// batches use `dispatch_chunk_async`. Requires `ensure_resources` to have
/// been called for at least `chunk_lanes` lanes, and the header buffer to
/// exist (its contents are irrelevant to profiling timings).
fn dispatch_chunk_sync(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
) -> Result<Vec<u8>> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len as u32,
        _pad0: 0,
        start_nonce,
    };
    // SAFETY: KernelParams is #[repr(C)] plain-old-data, so viewing it as
    // raw bytes is sound; the slice does not outlive `params`.
    // (Fixed: this previously read `¶ms`, a mis-encoded `&params`.)
    let params_bytes: &[u8] = unsafe {
        std::slice::from_raw_parts(
            &params as *const KernelParams as *const u8,
            std::mem::size_of::<KernelParams>(),
        )
    };
    let d_params = self
        .d_params
        .get_mut(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    // Blocking write: `params` lives on this stack frame, so the transfer
    // must complete before we move on.
    unsafe {
        self.queue
            .enqueue_write_buffer(d_params, CL_BLOCKING, 0, params_bytes, &[])
            .map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?;
    }
    let d_header = self
        .d_header
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // Re-borrow immutably for kernel argument binding.
    let d_params = self
        .d_params
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    let d_hashes = self
        .d_hashes
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
    let d_scratch = self
        .d_scratchpads
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;
    // Pad the global size up to whole work-groups; the kernel is expected
    // to bounds-check against params.lanes — confirm in uhash.cl.
    let global_size = chunk_lanes.div_ceil(self.work_group_size) * self.work_group_size;
    // SAFETY: argument order matches uhash_kernel's signature; all buffers
    // outlive the enqueued work because we wait on the event below.
    let kernel_event = unsafe {
        ExecuteKernel::new(&self.kernel)
            .set_arg(d_header)
            .set_arg(d_hashes)
            .set_arg(d_scratch)
            .set_arg(d_params)
            .set_global_work_size(global_size)
            .set_local_work_size(self.work_group_size)
            .enqueue_nd_range(&self.queue)
            .map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?
    };
    kernel_event
        .wait()
        .map_err(|e| anyhow::anyhow!("Kernel wait failed: {}", e))?;
    let hash_bytes = chunk_lanes * 32;
    let mut out = vec![0u8; hash_bytes];
    let d_hashes = self
        .d_hashes
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
    // SAFETY: blocking read into a host buffer of exactly hash_bytes.
    unsafe {
        self.queue
            .enqueue_read_buffer(d_hashes, CL_BLOCKING, 0, &mut out, &[])
            .map_err(|e| anyhow::anyhow!("Failed to read hashes from device: {}", e))?;
    }
    Ok(out)
}
fn dispatch_chunk_async(
&mut self,
slot: usize,
chunk_lanes: usize,
header_len: usize,
start_nonce: u64,
) -> Result<Event> {
let params = KernelParams {
lanes: chunk_lanes as u32,
rounds: ROUNDS as u32,
header_len: header_len as u32,
_pad0: 0,
start_nonce,
};
let params_bytes: &[u8] = unsafe {
std::slice::from_raw_parts(
¶ms as *const KernelParams as *const u8,
std::mem::size_of::<KernelParams>(),
)
};
let d_params = self
.d_params
.get_mut(slot)
.ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
let write_event = unsafe {
self.queue
.enqueue_write_buffer(d_params, CL_NON_BLOCKING, 0, params_bytes, &[])
.map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?
};
let d_header = self
.d_header
.as_ref()
.ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
let d_params = self
.d_params
.get(slot)
.ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
let d_hashes = self
.d_hashes
.get(slot)
.ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
let d_scratch = self
.d_scratchpads
.get(slot)
.ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;
let global_size = chunk_lanes.div_ceil(self.work_group_size) * self.work_group_size;
let kernel_event = unsafe {
ExecuteKernel::new(&self.kernel)
.set_arg(d_header)
.set_arg(d_hashes)
.set_arg(d_scratch)
.set_arg(d_params)
.set_global_work_size(global_size)
.set_local_work_size(self.work_group_size)
.set_wait_event(&write_event)
.enqueue_nd_range(&self.queue)
.map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?
};
let hash_bytes = chunk_lanes * 32;
let d_hashes = self
.d_hashes
.get(slot)
.ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
let h_buf = self
.h_hashes
.get_mut(slot)
.ok_or_else(|| anyhow::anyhow!("host hash slot {} missing", slot))?;
let read_event = unsafe {
self.queue
.enqueue_read_buffer(
d_hashes,
CL_NON_BLOCKING,
0,
&mut h_buf[..hash_bytes],
&[kernel_event.get()],
)
.map_err(|e| anyhow::anyhow!("Failed to enqueue hash readback: {}", e))?
};
Ok(read_event)
}
/// Benchmarks 2–4 in-flight slots and keeps the fastest configuration.
///
/// Each candidate is measured with a multi-chunk batch via
/// `compute_batch_count` so that pipeline overlap is actually exercised.
/// Buffers are reallocated for the winning slot count before returning.
fn runtime_profile_inflight_slots(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [2usize, 3, 4];
    let original = self.inflight_slots;
    let mut best_slots = original;
    let mut best_hps = 0.0f64;
    for &slots in &candidates {
        self.inflight_slots = slots;
        // Force reallocation for the new slot count.
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Enough lanes to cycle every slot at least twice.
        let test_lanes = chunk_lanes
            .saturating_mul(slots)
            .saturating_mul(2)
            .max(chunk_lanes);
        let start = Instant::now();
        let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 || done == 0 {
            continue;
        }
        let hps = done as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_slots = slots;
        }
    }
    // Settle on the winner and rebuild buffers for it.
    self.inflight_slots = best_slots;
    self.chunk_lanes = 0;
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    Ok(())
}
/// Pipelined hashing used for throughput measurement only: computes
/// `lanes` hashes starting at `start_nonce` and returns how many
/// completed, discarding the hash values.
///
/// Mirrors the scheduling of `compute_batch_hashes` so slot-count
/// profiling measures realistic pipeline behaviour.
fn compute_batch_count(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    if lanes == 0 {
        return Ok(0);
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let d_header = self
        .d_header
        .as_mut()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // The (nonce-free) header is shared by every dispatch in this batch.
    unsafe {
        self.queue
            .enqueue_write_buffer(d_header, CL_BLOCKING, 0, header_without_nonce, &[])
            .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;
    }
    // One pending entry per slot plus a queue of slots free to dispatch.
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let mut total_done = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Fill every free slot with a fresh chunk.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let read_event = self.dispatch_chunk_async(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                read_event,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Retire the lowest-indexed pending slot (stands in for "oldest").
        let oldest_slot = pending_by_slot
            .iter()
            .enumerate()
            .find_map(|(i, p)| p.as_ref().map(|_| i))
            .ok_or_else(|| anyhow::anyhow!("no pending slot found but in_flight > 0"))?;
        let done = pending_by_slot[oldest_slot]
            .take()
            .ok_or_else(|| anyhow::anyhow!("pending slot vanished"))?;
        // Wait for its readback; the hashes themselves are not inspected.
        done.read_event
            .wait()
            .map_err(|e| anyhow::anyhow!("Read event wait failed: {}", e))?;
        total_done += done.lanes;
        available_slots.push_back(oldest_slot);
        in_flight = in_flight.saturating_sub(1);
    }
    Ok(total_done)
}
/// Computes `lanes` hashes for nonces starting at `start_nonce` (wrapping)
/// and returns them in lane order.
///
/// Lazily seeds tuning from memory heuristics and runs one-time runtime
/// profiling on first use, then pipelines chunks across in-flight slots.
///
/// # Errors
/// Fails on an empty header or any device/queue failure.
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // First call: derive heuristic limits from device memory.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    // One-time runtime profiling (no-op on later calls).
    if !self.did_runtime_profile {
        self.runtime_profile_chunk_lanes(header_without_nonce)?;
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let d_header = self
        .d_header
        .as_mut()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // The (nonce-free) header is shared by every dispatch in this batch.
    unsafe {
        self.queue
            .enqueue_write_buffer(d_header, CL_BLOCKING, 0, header_without_nonce, &[])
            .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;
    }
    let mut out_hashes = vec![[0u8; 32]; lanes];
    // One pending entry per slot plus a queue of slots free to dispatch.
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Keep every free slot busy.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let read_event = self.dispatch_chunk_async(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                read_event,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Retire the lowest-indexed pending slot (stands in for "oldest").
        let oldest_slot = pending_by_slot
            .iter()
            .enumerate()
            .find_map(|(i, p)| p.as_ref().map(|_| i))
            .ok_or_else(|| anyhow::anyhow!("no pending slot found but in_flight > 0"))?;
        let done = pending_by_slot[oldest_slot]
            .take()
            .ok_or_else(|| anyhow::anyhow!("pending slot vanished"))?;
        // Wait until the readback into h_hashes[slot] has completed.
        done.read_event
            .wait()
            .map_err(|e| anyhow::anyhow!("Read event wait failed: {}", e))?;
        // Copy this chunk's 32-byte hashes into the caller-visible output.
        let h_buf = &self.h_hashes[done.slot];
        for i in 0..done.lanes {
            let src_offset = i * 32;
            out_hashes[done.output_lane_offset + i]
                .copy_from_slice(&h_buf[src_offset..src_offset + 32]);
        }
        available_slots.push_back(oldest_slot);
        in_flight = in_flight.saturating_sub(1);
    }
    Ok(out_hashes)
}
}
impl Solver for OpenClSolver {
    /// Backend identifier used in logs/telemetry.
    fn backend_name(&self) -> &'static str {
        "opencl"
    }

    /// Suggests a lane count for one batch: when `requested` is 0, enough
    /// lanes to keep every in-flight slot busy; otherwise `requested`
    /// clamped to at least 1.
    fn recommended_lanes(&mut self, requested: usize) -> usize {
        // Lazily seed heuristic limits on first use.
        if self.tuned_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if requested == 0 {
            self.tuned_chunk_lanes
                .saturating_mul(self.inflight_slots)
                .max(1)
        } else {
            requested.max(1)
        }
    }

    /// Hashes `lanes` nonces starting at `start_nonce` and returns the
    /// first (nonce, hash) pair meeting `difficulty`, or `None`.
    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<ProofResult> {
        let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
        for (lane, hash) in hashes.into_iter().enumerate() {
            if meets_difficulty(&hash, difficulty) {
                // wrapping_add keeps nonce arithmetic consistent with the
                // dispatch path (which also wraps) and avoids a debug-mode
                // overflow panic near u64::MAX.
                return Ok(Some((start_nonce.wrapping_add(lane as u64), hash)));
            }
        }
        Ok(None)
    }

    /// Computes `lanes` hashes and reports how many were produced
    /// (benchmarking entry point; hash values are discarded).
    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        Ok(self
            .compute_batch_hashes(header_without_nonce, start_nonce, lanes)?
            .len())
    }
}
#[cfg(test)]
mod tests {
    use super::OpenClSolver;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    /// GPU output for a single nonce must equal the CPU reference hash of
    /// `header || nonce_le`. Skips (with a message) when no OpenCL GPU is
    /// available.
    #[test]
    fn opencl_hash_matches_cpu_for_single_nonce() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };
        // Synthetic header: 32 fixed bytes, an address-like string, and a
        // little-endian timestamp.
        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());
        let nonce = 42u64;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        // CPU reference computed over header || nonce (little-endian).
        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);
        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "OpenCL hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    /// Every lane of a 64-nonce batch must match the CPU reference hash
    /// for its corresponding nonce.
    #[test]
    fn opencl_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());
        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);
        // Verify each lane against an independently computed CPU hash.
        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);
            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    /// Manual throughput profile across several lane counts; ignored by
    /// default because it is slow and hardware-dependent. Prints H/s per
    /// lane count to stderr.
    #[test]
    #[ignore = "performance profile for local OpenCL throughput validation"]
    fn opencl_release_throughput_profile() {
        let Ok(mut solver) = OpenClSolver::new() else {
            eprintln!("Skipping: no OpenCL GPU available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1oclprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());
        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;
        for lanes in lane_sets {
            // Untimed warm-up batch for this lane count.
            let _ = solver
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            let start = Instant::now();
            // Fewer timed repetitions at the largest size to bound runtime.
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = solver
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "opencl-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}