use anyhow::Result;
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::sync::mpsc;
use std::time::Instant;
use uhash_core::meets_difficulty;
use crate::solver::{ProofResult, Solver};
// NOTE(review): presumed number of independent mixing chains per lane —
// confirm against kernels/uhash.wgsl.
const CHAINS: usize = 4;
// Scratchpad bytes per chain (512 KiB).
const SCRATCHPAD_SIZE: usize = 512 * 1024;
// Total scratchpad bytes required per lane (all chains combined).
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
// Kernel round count passed to the shader via `KernelParams::rounds`.
const ROUNDS: usize = 12_288;
// Default number of GPU dispatches kept in flight before tuning picks a value.
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
// Default compute workgroup size; profiling may replace it with 128 or 256.
const DEFAULT_WORKGROUP_SIZE: u32 = 64;
// WGSL compute shader source, embedded at build time.
const WGSL_SHADER_SOURCE: &str = include_str!("../../kernels/uhash.wgsl");
/// Parameters uploaded to the kernel's uniform buffer (binding 3).
///
/// `#[repr(C)]` plus the explicit `_pad*` fields pin the byte layout so it
/// stays bit-compatible with the shader-side struct
/// (NOTE(review): confirm field order against kernels/uhash.wgsl).
#[repr(C)]
#[derive(Clone, Copy, Debug, bytemuck::Pod, bytemuck::Zeroable)]
struct KernelParams {
    // Number of lanes (hashes) computed by this dispatch.
    lanes: u32,
    // Mixing round count (ROUNDS).
    rounds: u32,
    // Length in bytes of the header (without nonce).
    header_len: u32,
    _pad0: u32,
    // Starting nonce split into two u32 halves (WGSL has no native u64).
    start_nonce_lo: u32,
    start_nonce_hi: u32,
    _pad1: u32,
    _pad2: u32,
}
/// Read-only snapshot of device identity and tuning state, exposed for
/// reporting (see `WgpuSolver::telemetry`).
#[derive(Clone, Debug, Serialize)]
pub struct WgpuTelemetry {
    pub device_name: String,
    // Debug-formatted wgpu backend (e.g. "Vulkan").
    pub backend: String,
    // Debug-formatted wgpu device type (e.g. "DiscreteGpu").
    pub device_type: String,
    pub inflight_slots: usize,
    pub tuned_chunk_lanes: usize,
    pub max_chunk_lanes: usize,
    // True once the one-time runtime profiling pass has executed.
    pub runtime_profiled: bool,
}
/// On-disk JSON record of tuned parameters, stored per device
/// (see `tune_cache_path` / `persist_tune_cache` / `try_load_tune_cache`).
#[derive(Debug, Serialize, Deserialize)]
struct WgpuTuneCache {
    // Cache schema version; only version 2 is accepted on load.
    version: u32,
    device_name: String,
    backend: String,
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    // Defaulted so cache files written before this field existed still parse.
    #[serde(default = "default_workgroup_size")]
    workgroup_size: u32,
}
/// serde default for `WgpuTuneCache::workgroup_size` (older cache files
/// lack the field).
const fn default_workgroup_size() -> u32 {
    DEFAULT_WORKGROUP_SIZE
}
/// Bookkeeping for a dispatch that was submitted but not yet read back.
struct PendingDispatch {
    // Index of the slot the dispatch runs in.
    slot: usize,
    // Where this chunk's hashes land in the caller's output vector.
    output_lane_offset: usize,
    // Number of lanes covered by this chunk.
    lanes: usize,
}
/// GPU buffers owned by one in-flight dispatch slot.
struct SlotBuffers {
    // Uniform buffer holding `KernelParams` (binding 3).
    params_buf: wgpu::Buffer,
    // Kernel output: 32 bytes per lane (binding 1).
    hashes_buf: wgpu::Buffer,
    // Scratchpad storage (binding 2); referenced only through the bind
    // group, kept here so it outlives the dispatches that use it.
    _scratch_buf: wgpu::Buffer,
    // CPU-mappable copy target for reading hashes back.
    staging_buf: wgpu::Buffer,
    // Cached bind group over header + the three buffers above.
    bind_group: Option<wgpu::BindGroup>,
}
/// wgpu-backed solver: owns the device/queue, the compiled compute
/// pipeline, and a ring of per-slot buffer sets used to keep several
/// dispatches in flight at once.
pub struct WgpuSolver {
    device: wgpu::Device,
    queue: wgpu::Queue,
    // Pipeline specialized for `workgroup_size`; recreated by
    // `rebuild_pipeline` when that changes.
    pipeline: wgpu::ComputePipeline,
    bind_group_layout: wgpu::BindGroupLayout,
    // Retained so the pipeline can be re-specialized without recompiling WGSL.
    shader: wgpu::ShaderModule,
    pipeline_layout: wgpu::PipelineLayout,
    // Lazily allocated storage buffer holding the header (without nonce).
    header_buf: Option<wgpu::Buffer>,
    // Capacity tracking for `header_buf`, in bytes of header it can hold.
    header_capacity: usize,
    // One buffer set per in-flight slot; rebuilt by `ensure_resources`.
    slots: Vec<SlotBuffers>,
    // Lanes-per-dispatch the current `slots` were sized for (0 = stale).
    chunk_lanes: usize,
    // Preferred lanes per dispatch (heuristic, cache, or profiling result).
    tuned_chunk_lanes: usize,
    // Memory-budget-derived upper bound on lanes per dispatch.
    max_chunk_lanes: usize,
    // Number of dispatch slots used for overlapping GPU work and readback.
    inflight_slots: usize,
    workgroup_size: u32,
    // Set once `runtime_profile_chunk_lanes` has run (it runs at most once).
    did_runtime_profile: bool,
    // Adapter identity strings, used for telemetry and cache keying.
    device_name: String,
    backend_name_str: String,
    device_type_str: String,
}
impl WgpuSolver {
/// Creates a solver on the highest-performance available adapter.
///
/// Requests raised buffer limits (1 GiB storage binding / buffer size)
/// because each lane needs `TOTAL_MEMORY` bytes of scratchpad in a single
/// storage buffer. After device and pipeline creation, any previously
/// persisted tuning values are loaded best-effort.
///
/// # Errors
/// Fails when no adapter is found or device creation is rejected.
pub fn new() -> Result<Self> {
    let instance = wgpu::Instance::default();
    // Compute-only workload: no surface, prefer the fast adapter.
    let adapter = pollster::block_on(instance.request_adapter(&wgpu::RequestAdapterOptions {
        power_preference: wgpu::PowerPreference::HighPerformance,
        compatible_surface: None,
        force_fallback_adapter: false,
    }))
    .map_err(|e| anyhow::anyhow!("No WGPU adapter found: {}", e))?;
    let info = adapter.get_info();
    // Captured for telemetry and for keying the on-disk tuning cache.
    let device_name = info.name.clone();
    let backend_name_str = format!("{:?}", info.backend);
    let device_type_str = format!("{:?}", info.device_type);
    let (device, queue) = pollster::block_on(adapter.request_device(&wgpu::DeviceDescriptor {
        label: Some("uhash"),
        required_features: wgpu::Features::empty(),
        // Scratch buffers are chunk_lanes * TOTAL_MEMORY bytes and can
        // approach 1 GiB, so raise the default limits.
        required_limits: wgpu::Limits {
            max_storage_buffer_binding_size: 1 << 30,
            max_buffer_size: 1 << 30,
            ..wgpu::Limits::default()
        },
        memory_hints: wgpu::MemoryHints::Performance,
        experimental_features: Default::default(),
        trace: Default::default(),
    }))
    .map_err(|e| anyhow::anyhow!("Failed to create WGPU device: {}", e))?;
    let shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
        label: Some("uhash_shader"),
        source: wgpu::ShaderSource::Wgsl(WGSL_SHADER_SOURCE.into()),
    });
    // Bindings: 0 = header (read-only), 1 = hashes out, 2 = scratchpads,
    // 3 = uniform params. Must match the WGSL bindings.
    let bind_group_layout = device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
        label: Some("uhash_bind_group_layout"),
        entries: &[
            wgpu::BindGroupLayoutEntry {
                binding: 0,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: true },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 1,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 2,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Storage { read_only: false },
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
            wgpu::BindGroupLayoutEntry {
                binding: 3,
                visibility: wgpu::ShaderStages::COMPUTE,
                ty: wgpu::BindingType::Buffer {
                    ty: wgpu::BufferBindingType::Uniform,
                    has_dynamic_offset: false,
                    min_binding_size: None,
                },
                count: None,
            },
        ],
    });
    let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
        label: Some("uhash_pipeline_layout"),
        bind_group_layouts: &[&bind_group_layout],
        immediate_size: 0,
    });
    // Start with the default workgroup size; tuning may rebuild later.
    let pipeline = Self::create_pipeline_with_workgroup_size(
        &device,
        &shader,
        &pipeline_layout,
        DEFAULT_WORKGROUP_SIZE,
    );
    let mut solver = Self {
        device,
        queue,
        pipeline,
        bind_group_layout,
        shader,
        pipeline_layout,
        header_buf: None,
        header_capacity: 0,
        slots: Vec::new(),
        chunk_lanes: 0,
        tuned_chunk_lanes: 0,
        max_chunk_lanes: 0,
        inflight_slots: DEFAULT_INFLIGHT_SLOTS,
        workgroup_size: DEFAULT_WORKGROUP_SIZE,
        did_runtime_profile: false,
        device_name,
        backend_name_str,
        device_type_str,
    };
    // Best effort: missing/invalid cache leaves heuristic defaults in place.
    solver.try_load_tune_cache();
    Ok(solver)
}
/// Returns a snapshot of the solver's device identity and current
/// tuning state for reporting purposes.
pub fn telemetry(&self) -> WgpuTelemetry {
    // Clone the identity strings up front; the remaining fields are Copy.
    let device_name = self.device_name.clone();
    let backend = self.backend_name_str.clone();
    let device_type = self.device_type_str.clone();
    WgpuTelemetry {
        device_name,
        backend,
        device_type,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        runtime_profiled: self.did_runtime_profile,
    }
}
/// Builds the compute pipeline with `WORKGROUP_SIZE` supplied as a
/// pipeline-overridable constant, so one shader module can be specialized
/// for different workgroup sizes without recompiling the WGSL source.
fn create_pipeline_with_workgroup_size(
    device: &wgpu::Device,
    shader: &wgpu::ShaderModule,
    pipeline_layout: &wgpu::PipelineLayout,
    workgroup_size: u32,
) -> wgpu::ComputePipeline {
    // Override constants are passed as (name, f64) pairs.
    let constants: Vec<(&str, f64)> = vec![("WORKGROUP_SIZE", workgroup_size as f64)];
    device.create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
        label: Some("uhash_pipeline"),
        layout: Some(pipeline_layout),
        module: shader,
        entry_point: Some("main"),
        compilation_options: wgpu::PipelineCompilationOptions {
            constants: &constants,
            ..Default::default()
        },
        cache: None,
    })
}
/// Recreates the pipeline after `self.workgroup_size` changed (cache load
/// or runtime profiling).
fn rebuild_pipeline(&mut self) {
    self.pipeline = Self::create_pipeline_with_workgroup_size(
        &self.device,
        &self.shader,
        &self.pipeline_layout,
        self.workgroup_size,
    );
}
/// Computes the per-device path of the tuning cache file, or `None` when
/// neither `XDG_CONFIG_HOME` nor `HOME` is set.
fn tune_cache_path(&self) -> Option<PathBuf> {
    // Resolve the config directory: $XDG_CONFIG_HOME, else $HOME/.config.
    let config_dir = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(xdg) => PathBuf::from(xdg),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    // Replace anything non-alphanumeric so the device name is filesystem-safe.
    let safe_name: String = self
        .device_name
        .chars()
        .map(|c| if c.is_alphanumeric() { c } else { '_' })
        .collect();
    let file_name = format!("wgpu_tuning_{}.json", safe_name);
    Some(config_dir.join("uhash").join(file_name))
}
/// Loads tuning parameters from the on-disk cache, if present and valid.
///
/// Silently does nothing when the file is missing, unreadable, malformed,
/// from another cache version, or recorded for a different device/backend.
/// Cached values are re-clamped against current heuristics so a stale
/// cache can never exceed the memory budget.
fn try_load_tune_cache(&mut self) {
    let Some(path) = self.tune_cache_path() else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<WgpuTuneCache>(&raw) else {
        return;
    };
    if cache.version != 2 {
        return;
    }
    // The file name only encodes a sanitized device name; distinct
    // devices/backends can collide after sanitization, so verify the
    // recorded identity before applying any cached values.
    if cache.device_name != self.device_name || cache.backend != self.backend_name_str {
        return;
    }
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must stay multiples of 32; reject corrupted values.
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    // Only accept workgroup sizes the profiler itself would produce.
    let wgs = cache.workgroup_size;
    if wgs == 64 || wgs == 128 || wgs == 256 {
        self.workgroup_size = wgs;
        self.rebuild_pipeline();
    }
}
/// Best-effort write of the current tuning parameters to disk.
/// All path, serialization, and I/O failures are deliberately ignored.
fn persist_tune_cache(&self) {
    let path = match self.tune_cache_path() {
        Some(p) => p,
        None => return,
    };
    let cache = WgpuTuneCache {
        version: 2,
        device_name: self.device_name.clone(),
        backend: self.backend_name_str.clone(),
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        workgroup_size: self.workgroup_size,
    };
    if let Ok(raw) = serde_json::to_vec_pretty(&cache) {
        // Create the parent directory if needed, then write the file.
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, raw);
    }
}
/// Derives heuristic `(tuned, max)` lanes-per-dispatch from a fixed
/// 2 GiB memory budget, without running any GPU work. Both values are
/// multiples of 32 and at least 32.
fn autotune_chunk_limits(&self) -> (usize, usize) {
    // Budget shared across all in-flight slots.
    let mem_budget: usize = 2 * 1024 * 1024 * 1024;
    // Per-lane cost: scratchpad plus one 32-byte hash output.
    let per_lane = TOTAL_MEMORY + 32;
    let all_slots_cost = per_lane.saturating_mul(self.inflight_slots);
    let max_by_mem = mem_budget.saturating_div(all_slots_cost).max(1);
    let raw_max = max_by_mem.clamp(32, 4096);
    // Round down to a multiple of 32 without ever going below 32.
    let align32 = |v: usize| (v / 32).max(1) * 32;
    (align32(raw_max.min(1024)), align32(raw_max))
}
/// Rounds `v` down to a multiple of `m`, but never below `m` itself
/// (the result is always at least one full multiple of `m`).
fn round_to_multiple(v: usize, m: usize) -> usize {
    let multiples = std::cmp::max(v / m, 1);
    multiples * m
}
/// One-time measurement pass: times real dispatches to pick the best
/// chunk size (lanes per dispatch), then profiles workgroup size and
/// in-flight slot count, and persists all results to the tuning cache.
///
/// Guarded by `did_runtime_profile`, so it runs at most once per process.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Set the flag up front so a failing profile is not retried forever.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: 1x/2x/3x the base size (capped) plus the hard maximum.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warm-up dispatch, then a single timed dispatch.
        self.dispatch_chunk_sync(0, lane, header_without_nonce, 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, lane, header_without_nonce, lane as u64)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        // Hashes per second for this candidate.
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    // Tune the remaining knobs using the chosen chunk size.
    self.runtime_profile_workgroup_size(header_without_nonce, self.tuned_chunk_lanes)?;
    self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
    // Invalidate slot buffers so normal operation rebuilds them.
    self.chunk_lanes = 0;
    self.persist_tune_cache();
    Ok(())
}
/// Times one dispatch at each candidate workgroup size (64/128/256),
/// rebuilding the pipeline for each, and keeps the fastest.
fn runtime_profile_workgroup_size(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [64u32, 128, 256];
    let mut best_wgs = self.workgroup_size;
    let mut best_hps = 0.0f64;
    for &wgs in &candidates {
        self.workgroup_size = wgs;
        self.rebuild_pipeline();
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Warm-up dispatch, then a single timed dispatch.
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce, 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce, 1)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_wgs = wgs;
        }
    }
    // Apply the winner and invalidate slot buffers for rebuild.
    self.workgroup_size = best_wgs;
    self.rebuild_pipeline();
    self.chunk_lanes = 0;
    Ok(())
}
/// Times a pipelined batch at each candidate slot count (2/3/4) and keeps
/// the fastest; falls back to the original count when no candidate
/// produces a usable measurement.
fn runtime_profile_inflight_slots(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [2usize, 3, 4];
    let original = self.inflight_slots;
    let mut best_slots = original;
    let mut best_hps = 0.0f64;
    for &slots in &candidates {
        self.inflight_slots = slots;
        // Force ensure_resources to rebuild buffers for the new slot count.
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Enough lanes to cycle every slot at least twice.
        let test_lanes = chunk_lanes
            .saturating_mul(slots)
            .saturating_mul(2)
            .max(chunk_lanes);
        let start = Instant::now();
        let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 || done == 0 {
            continue;
        }
        let hps = done as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_slots = slots;
        }
    }
    // Apply the winner and rebuild resources for it.
    self.inflight_slots = best_slots;
    self.chunk_lanes = 0;
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    Ok(())
}
/// Ensures the header buffer and per-slot GPU resources match the
/// requested header length and lanes-per-dispatch, reusing existing
/// resources when nothing relevant changed.
///
/// Bug fix: when the header outgrows capacity a new header buffer is
/// allocated, but the cached bind groups still reference the old buffer.
/// The previous early-return kept those stale bind groups, so dispatches
/// would read an outdated header. A header reallocation now forces the
/// slots (and their bind groups) to be rebuilt.
///
/// # Errors
/// Fails if `chunk_lanes` is zero or the header buffer is missing.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    // The shader addresses storage as u32 words; pad to 4 bytes.
    let header_buf_size = header_len.div_ceil(4) * 4;
    let mut header_recreated = false;
    if header_len > self.header_capacity {
        // Grow to a power of two so repeated small growth doesn't
        // reallocate every time.
        let new_capacity = header_buf_size.next_power_of_two().max(4);
        self.header_buf = Some(self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("header"),
            size: new_capacity as u64,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        }));
        // Track the actual allocated capacity (not just header_len) so
        // future headers that still fit do not force another reallocation.
        self.header_capacity = new_capacity;
        header_recreated = true;
    }
    // Fast path: slot buffers are sized correctly AND their bind groups
    // still reference the live header buffer.
    if !header_recreated
        && self.chunk_lanes == chunk_lanes
        && self.slots.len() == self.inflight_slots
    {
        return Ok(());
    }
    let hash_bytes = (chunk_lanes * 32) as u64;
    let scratch_bytes = (chunk_lanes * TOTAL_MEMORY) as u64;
    let params_size = std::mem::size_of::<KernelParams>() as u64;
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    self.slots.clear();
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    for _ in 0..self.inflight_slots {
        // Uniform params (binding 3).
        let params_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("params"),
            size: params_size,
            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        // Kernel output, 32 bytes per lane (binding 1).
        let hashes_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("hashes"),
            size: hash_bytes,
            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_SRC,
            mapped_at_creation: false,
        });
        // Scratchpads (binding 2); only ever touched by the shader.
        let _scratch_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("scratchpads"),
            size: scratch_bytes,
            usage: wgpu::BufferUsages::STORAGE,
            mapped_at_creation: false,
        });
        // CPU-mappable readback target for the hashes.
        let staging_buf = self.device.create_buffer(&wgpu::BufferDescriptor {
            label: Some("staging"),
            size: hash_bytes,
            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
            mapped_at_creation: false,
        });
        let bind_group = self.device.create_bind_group(&wgpu::BindGroupDescriptor {
            label: Some("uhash_bind_group"),
            layout: &self.bind_group_layout,
            entries: &[
                wgpu::BindGroupEntry {
                    binding: 0,
                    resource: header_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 1,
                    resource: hashes_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 2,
                    resource: _scratch_buf.as_entire_binding(),
                },
                wgpu::BindGroupEntry {
                    binding: 3,
                    resource: params_buf.as_entire_binding(),
                },
            ],
        });
        self.slots.push(SlotBuffers {
            params_buf,
            hashes_buf,
            _scratch_buf,
            staging_buf,
            bind_group: Some(bind_group),
        });
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Runs one dispatch of `chunk_lanes` hashes in `slot` and blocks until
/// results are read back; returns the raw hash bytes (32 per lane).
/// Used only by the profiling paths, which time full round trips.
fn dispatch_chunk_sync(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_without_nonce: &[u8],
    start_nonce: u64,
) -> Result<Vec<u8>> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_without_nonce.len() as u32,
        _pad0: 0,
        // Nonce split into two u32 halves (WGSL has no native u64).
        start_nonce_lo: start_nonce as u32,
        start_nonce_hi: (start_nonce >> 32) as u32,
        _pad1: 0,
        _pad2: 0,
    };
    let slot_bufs = &self.slots[slot];
    self.queue
        .write_buffer(&slot_bufs.params_buf, 0, bytemuck::bytes_of(&params));
    let bind_group = match &slot_bufs.bind_group {
        Some(bg) => bg,
        None => {
            anyhow::bail!("bind group for slot {} not cached", slot);
        }
    };
    let mut encoder = self
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("uhash_encoder"),
        });
    {
        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("uhash_pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.pipeline);
        pass.set_bind_group(0, bind_group, &[]);
        // One invocation per lane, rounded up to whole workgroups.
        let wgs = self.workgroup_size;
        let workgroups = (chunk_lanes as u32).div_ceil(wgs);
        pass.dispatch_workgroups(workgroups, 1, 1);
    }
    // Copy results into the mappable staging buffer in the same submission.
    let hash_bytes = (chunk_lanes * 32) as u64;
    encoder.copy_buffer_to_buffer(
        &slot_bufs.hashes_buf,
        0,
        &slot_bufs.staging_buf,
        0,
        hash_bytes,
    );
    self.queue.submit(std::iter::once(encoder.finish()));
    // Synchronous readback: request the mapping, then block on the device
    // (30 s cap) until the map callback fires.
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    let (sender, receiver) = mpsc::channel();
    slice.map_async(wgpu::MapMode::Read, move |result| {
        let _ = sender.send(result);
    });
    self.device
        .poll(wgpu::PollType::Wait {
            submission_index: None,
            timeout: Some(std::time::Duration::from_secs(30)),
        })
        .ok();
    receiver
        .recv()
        .map_err(|e| anyhow::anyhow!("Map async channel error: {}", e))?
        .map_err(|e| anyhow::anyhow!("Buffer map failed: {}", e))?;
    let data = slice.get_mapped_range();
    let out = data[..hash_bytes as usize].to_vec();
    // The mapped view must be dropped before unmapping.
    drop(data);
    staging.unmap();
    Ok(out)
}
/// Submits one dispatch of `chunk_lanes` hashes in `slot` without waiting
/// for completion; readback is initiated separately via `begin_readback`.
/// The header itself must already be uploaded by the caller.
fn dispatch_chunk_nonblocking(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
) -> Result<()> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len as u32,
        _pad0: 0,
        // Nonce split into two u32 halves (WGSL has no native u64).
        start_nonce_lo: start_nonce as u32,
        start_nonce_hi: (start_nonce >> 32) as u32,
        _pad1: 0,
        _pad2: 0,
    };
    let slot_bufs = &self.slots[slot];
    self.queue
        .write_buffer(&slot_bufs.params_buf, 0, bytemuck::bytes_of(&params));
    let bind_group = slot_bufs
        .bind_group
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("bind group for slot {} not cached", slot))?;
    let mut encoder = self
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("uhash_encoder"),
        });
    {
        let mut pass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("uhash_pass"),
            timestamp_writes: None,
        });
        pass.set_pipeline(&self.pipeline);
        pass.set_bind_group(0, bind_group, &[]);
        // One invocation per lane, rounded up to whole workgroups.
        let wgs = self.workgroup_size;
        let workgroups = (chunk_lanes as u32).div_ceil(wgs);
        pass.dispatch_workgroups(workgroups, 1, 1);
    }
    // Queue the copy into the staging buffer; mapping happens later.
    let hash_bytes = (chunk_lanes * 32) as u64;
    encoder.copy_buffer_to_buffer(
        &slot_bufs.hashes_buf,
        0,
        &slot_bufs.staging_buf,
        0,
        hash_bytes,
    );
    self.queue.submit(std::iter::once(encoder.finish()));
    Ok(())
}
/// Asynchronously maps `slot`'s staging buffer for reading; when the map
/// completes successfully, the slot index is sent on `tx`.
///
/// NOTE(review): a failed mapping sends nothing, so the batch loops in
/// `compute_batch_*` never see the slot complete and keep polling with it
/// counted as in flight — worth surfacing the error to the caller.
fn begin_readback(&self, slot: usize, tx: &mpsc::Sender<usize>) {
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    // Clone the sender and copy the slot index into the move callback.
    let tx = tx.clone();
    let s = slot;
    slice.map_async(wgpu::MapMode::Read, move |result| {
        if result.is_ok() {
            let _ = tx.send(s);
        }
    });
}
/// Copies `read_lanes` hashes from `slot`'s staging buffer into
/// `out_hashes` starting at `output_lane_offset`, then unmaps the buffer.
///
/// Must only be called after the slot's map completion was signalled via
/// `begin_readback` — the buffer has to be mapped for `get_mapped_range`.
fn collect_readback(
    &self,
    slot: usize,
    read_lanes: usize,
    output_lane_offset: usize,
    out_hashes: &mut [[u8; 32]],
) {
    let staging = &self.slots[slot].staging_buf;
    let slice = staging.slice(..);
    let data = slice.get_mapped_range();
    // Hashes are packed contiguously, 32 bytes per lane.
    for i in 0..read_lanes {
        let src_offset = i * 32;
        out_hashes[output_lane_offset + i].copy_from_slice(&data[src_offset..src_offset + 32]);
    }
    // The mapped view must be dropped before unmapping.
    drop(data);
    staging.unmap();
}
/// Computes `lanes` hashes purely for throughput measurement: staging
/// buffers are unmapped without being read, and only the number of lanes
/// completed is returned. Used by `runtime_profile_inflight_slots`.
///
/// Mirrors the pipelined dispatch loop of `compute_batch_hashes`.
fn compute_batch_count(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    if lanes == 0 {
        return Ok(0);
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // Zero-pad the header to a multiple of 4 bytes (u32-word storage).
    let mut padded_header = header_without_nonce.to_vec();
    while !padded_header.len().is_multiple_of(4) {
        padded_header.push(0);
    }
    self.queue.write_buffer(header_buf, 0, &padded_header);
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    // Map callbacks report finished slots through this channel.
    let (completion_tx, completion_rx) = mpsc::channel::<usize>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let mut total_done = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Fill every free slot with the next chunk of lanes.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            self.dispatch_chunk_nonblocking(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            self.begin_readback(slot, &completion_tx);
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Drive the device so map callbacks can fire (30 s cap per wait).
        self.device
            .poll(wgpu::PollType::Wait {
                submission_index: None,
                timeout: Some(std::time::Duration::from_secs(30)),
            })
            .ok();
        // Recycle every slot that completed; results are discarded.
        while let Ok(slot) = completion_rx.try_recv() {
            let done = pending_by_slot[slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
            // Unmap without reading — this path only counts throughput.
            self.slots[slot].staging_buf.unmap();
            total_done += done.lanes;
            available_slots.push_back(slot);
            in_flight = in_flight.saturating_sub(1);
        }
    }
    Ok(total_done)
}
/// Computes one 32-byte hash per lane for `lanes` consecutive nonces
/// starting at `start_nonce` (output index i corresponds to nonce
/// `start_nonce + i`), pipelining up to `inflight_slots` dispatches.
///
/// On first use this populates heuristic tuning limits and runs the
/// one-time runtime profiling pass.
///
/// # Errors
/// Fails on an empty header or if GPU resources cannot be allocated.
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Lazy initialization: heuristics first, then one-time profiling.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    if !self.did_runtime_profile {
        self.runtime_profile_chunk_lanes(header_without_nonce)?;
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // Zero-pad the header to a multiple of 4 bytes (u32-word storage).
    let mut padded_header = header_without_nonce.to_vec();
    while !padded_header.len().is_multiple_of(4) {
        padded_header.push(0);
    }
    self.queue.write_buffer(header_buf, 0, &padded_header);
    let mut out_hashes = vec![[0u8; 32]; lanes];
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    // Map callbacks report finished slots through this channel.
    let (completion_tx, completion_rx) = mpsc::channel::<usize>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    while lane_offset < lanes || in_flight > 0 {
        // Fill every free slot with the next chunk of lanes.
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            self.dispatch_chunk_nonblocking(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
            )?;
            self.begin_readback(slot, &completion_tx);
            pending_by_slot[slot] = Some(PendingDispatch {
                slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        // Drive the device so map callbacks can fire (30 s cap per wait).
        self.device
            .poll(wgpu::PollType::Wait {
                submission_index: None,
                timeout: Some(std::time::Duration::from_secs(30)),
            })
            .ok();
        // Copy out each completed chunk and recycle its slot.
        while let Ok(slot) = completion_rx.try_recv() {
            let done = pending_by_slot[slot]
                .take()
                .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
            self.collect_readback(
                done.slot,
                done.lanes,
                done.output_lane_offset,
                &mut out_hashes,
            );
            available_slots.push_back(slot);
            in_flight = in_flight.saturating_sub(1);
        }
    }
    Ok(out_hashes)
}
}
impl Solver for WgpuSolver {
    /// Stable backend identifier used by callers/telemetry.
    fn backend_name(&self) -> &'static str {
        "wgpu"
    }

    /// Suggests a lane count for a batch. `requested == 0` asks for the
    /// backend default: enough lanes to keep every in-flight slot busy
    /// with one tuned-size chunk. Any other request is honored (min 1).
    fn recommended_lanes(&mut self, requested: usize) -> usize {
        // Populate heuristic limits on first use so the default is useful.
        if self.tuned_chunk_lanes == 0 {
            let (chunk, max_chunk) = self.autotune_chunk_limits();
            self.tuned_chunk_lanes = chunk;
            self.max_chunk_lanes = max_chunk;
        }
        if requested == 0 {
            self.tuned_chunk_lanes
                .saturating_mul(self.inflight_slots)
                .max(1)
        } else {
            requested.max(1)
        }
    }

    /// Hashes `lanes` consecutive nonces starting at `start_nonce` and
    /// returns the first (nonce, hash) pair meeting `difficulty`, if any.
    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<ProofResult> {
        let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
        for (lane, hash) in hashes.into_iter().enumerate() {
            if meets_difficulty(&hash, difficulty) {
                // Wrapping add for consistency with the wrapping nonce
                // arithmetic used when dispatching (plain `+` would panic
                // in debug builds for start nonces near u64::MAX).
                return Ok(Some((start_nonce.wrapping_add(lane as u64), hash)));
            }
        }
        Ok(None)
    }

    /// Computes `lanes` hashes and returns how many were produced;
    /// used for throughput benchmarking.
    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        Ok(self
            .compute_batch_hashes(header_without_nonce, start_nonce, lanes)?
            .len())
    }
}
#[cfg(test)]
mod tests {
    use super::WgpuSolver;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    // GPU/CPU parity for a single nonce. Skips (does not fail) when no
    // WGPU adapter is available, e.g. on CI machines without a GPU.
    #[test]
    fn wgpu_hash_matches_cpu_for_single_nonce() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        // Synthetic header: 32-byte prev-hash-like prefix, an address
        // string, and a little-endian timestamp.
        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());
        let nonce = 42u64;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        // CPU reference: header || nonce (LE) through UniversalHash.
        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);
        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "WGPU hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    // GPU/CPU parity across a 64-lane batch of consecutive nonces.
    #[test]
    fn wgpu_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());
        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = solver
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);
        // Verify every lane against the CPU reference implementation.
        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);
            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    // Manual throughput profile across several lane counts; ignored by
    // default so CI stays fast — run explicitly with `--ignored`.
    #[test]
    #[ignore = "performance profile for local WGPU throughput validation"]
    fn wgpu_release_throughput_profile() {
        let Ok(mut solver) = WgpuSolver::new() else {
            eprintln!("Skipping: no WGPU adapter available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1wgpuprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());
        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;
        for lanes in lane_sets {
            // Warm-up batch, excluded from the timed region.
            let _ = solver
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            let start = Instant::now();
            // Fewer timed batches at the largest lane count to bound runtime.
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = solver
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "wgpu-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}