#![cfg(all(feature = "metal-backend", target_os = "macos"))]
use anyhow::Result;
use block::ConcreteBlock;
use metal::{
Buffer, CommandBuffer, CommandBufferRef, CommandQueue, CompileOptions, ComputePipelineState,
Device, FunctionConstantValues, MTLCommandBufferStatus, MTLDataType, MTLGPUFamily,
MTLResourceOptions,
};
use serde::{Deserialize, Serialize};
use std::collections::VecDeque;
use std::fs;
use std::path::PathBuf;
use std::sync::mpsc::{self, Receiver, Sender};
use std::time::Instant;
// Number of independent hash chains per lane (mirrors `CHAINS` in the MSL shader).
const CHAINS: usize = 4;
// Scratchpad bytes per chain: 512 KiB (mirrors `SCRATCHPAD_SIZE` in the shader).
const SCRATCHPAD_SIZE: usize = 512 * 1024;
// Total scratchpad bytes one lane needs (all chains combined) — 2 MiB.
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
// Mixing rounds the kernel executes per hash; passed to the GPU via KernelParams.
const ROUNDS: usize = 12_288;
// Default number of command buffers kept in flight (raised to 4 on unified memory).
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
// Size of the proof-found readback record: 8-byte LE nonce + 32-byte hash.
const FOUND_DATA_BYTES: usize = 8 + 32;
/// Per-dispatch parameters written into a shared Metal buffer and read by the
/// kernel. `#[repr(C)]` keeps the layout identical to the `KernelParams`
/// struct in the MSL source; `_pad0` keeps `start_nonce` 8-byte aligned so the
/// two layouts agree. Do not reorder fields without updating the shader.
#[repr(C)]
#[derive(Clone, Copy)]
struct KernelParams {
    lanes: u32,       // number of GPU threads (lanes) in this dispatch
    rounds: u32,      // mixing rounds per hash (ROUNDS)
    header_len: u32,  // bytes of header data in the header buffer
    difficulty: u32,  // proof-mode difficulty target; 0 when hashing only
    emit_hashes: u32, // 1 = kernel writes per-lane hashes to the output buffer
    _pad0: u32,       // explicit padding to align the following u64
    start_nonce: u64, // nonce of lane 0; lane i hashes start_nonce + i
}
/// Book-keeping for one in-flight GPU dispatch. Holding `_command_buffer`
/// keeps the Metal command buffer alive until its completion handler has
/// fired and this record is dropped.
struct PendingDispatch {
    _slot: usize,                   // index of the buffer slot this dispatch occupies
    output_lane_offset: usize,      // destination lane offset in the caller's output
    lanes: usize,                   // lane count of this chunk
    _command_buffer: CommandBuffer, // retained so the GPU work isn't dropped early
}
/// (slot index, final status) sent by the Metal completion handler back to
/// the submitting thread over an mpsc channel.
type CompletionEvent = (usize, MTLCommandBufferStatus);
/// Serializable snapshot of the detected device plus the miner's current
/// tuning state, as reported by [`MetalMiner::telemetry`].
#[derive(Clone, Debug, Serialize)]
pub struct MetalTelemetry {
    pub device_name: String,
    pub apple_family: u32, // highest supported MTLGPUFamily::AppleN, 0 if none
    pub registry_id: u64,  // IOKit registry id; keys the on-disk tune cache
    pub unified_memory: bool,
    pub recommended_max_working_set_size: u64,
    pub max_buffer_length: u64,
    pub inflight_slots: usize,    // command buffers kept in flight
    pub tuned_chunk_lanes: usize, // preferred lanes per dispatch
    pub max_chunk_lanes: usize,   // hard lane cap per dispatch
    pub tg_factor: usize,         // threadgroup width = exec_width * tg_factor
    pub tg_width: u64,            // profiled threadgroup width override (0 = none)
    pub vector_block_io: bool,    // shader function-constant: vectorized block IO
    pub unroll_rounds: bool,      // shader function-constant: unrolled round loop
    pub runtime_profiled: bool,   // true once a live profiling pass (or warm cache) ran
    pub cache_warm_start: bool,   // true when tuning came from the on-disk cache
}
/// On-disk JSON tuning cache, keyed by GPU registry id. The `serde(default)`
/// attributes let version-1 files (written before `tg_factor`/`tg_width`
/// existed) still deserialize; see `try_load_tune_cache` for validation.
#[derive(Debug, Serialize, Deserialize)]
struct MetalTuneCache {
    version: u32,     // accepted versions: 1 and 2
    registry_id: u64, // must match the current device or the cache is ignored
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    #[serde(default = "default_tg_factor")]
    tg_factor: usize,
    #[serde(default)]
    tg_width: u64, // 0 = no profiled width recorded
}
/// serde default for `MetalTuneCache::tg_factor`, used when loading a
/// version-1 cache file that predates the field.
const fn default_tg_factor() -> usize {
    4
}
/// GPU miner driving the embedded MSL kernel on a Metal device.
///
/// All `*_slots` vectors hold one buffer per in-flight slot so several
/// dispatches can overlap without aliasing each other's buffers.
pub struct MetalMiner {
    device: Device,
    queue: CommandQueue,
    pipeline_hash: ComputePipelineState,  // kernel specialized with proof_mode = false
    pipeline_proof: ComputePipelineState, // kernel specialized with proof_mode = true
    header_buf: Option<Buffer>, // shared staging buffer for the header bytes
    header_capacity: usize,     // allocated capacity of header_buf (power of two)
    params_slots: Vec<Buffer>,          // per-slot KernelParams (shared)
    hashes_private_slots: Vec<Buffer>,  // per-slot hash output, GPU-private
    hashes_readback_slots: Vec<Buffer>, // per-slot hash output, CPU-readable
    found_flag_slots: Vec<Buffer>,      // per-slot u32 "proof found" flag (shared)
    found_data_slots: Vec<Buffer>,      // per-slot nonce+hash record (shared)
    scratchpads_slots: Vec<Buffer>,     // per-slot lane scratchpads, GPU-private
    chunk_lanes: usize,       // lane count the current buffers were sized for
    max_chunk_lanes: usize,   // hard per-dispatch lane cap
    tuned_chunk_lanes: usize, // preferred per-dispatch lane count (0 = not tuned yet)
    inflight_slots: usize,
    tg_factor: usize, // threadgroup width multiplier over thread_execution_width
    tg_width: u64,    // profiled threadgroup width override (0 = derive from tg_factor)
    did_runtime_profile: bool,
    // Cached device facts (also exposed via telemetry()).
    device_name: String,
    apple_family: u32,
    registry_id: u64,
    unified_memory: bool,
    recommended_ws: u64,
    max_buffer_len: u64,
    vector_block_io: bool,
    unroll_rounds: bool,
    // Profiling log controls (UHASH_METAL_PROFILE / _EVERY env vars).
    profile_enabled: bool,
    profile_every: usize,
    profile_counter: u64,
    tune_cache_loaded: bool,
    force_retune: bool, // UHASH_METAL_FORCE_RETUNE: ignore the on-disk cache
}
impl MetalMiner {
/// Return the highest Apple GPU family generation the device supports
/// (9 down to 1), or 0 when none of the Apple families match.
fn detect_apple_family(device: &Device) -> u32 {
    // Probe newest-to-oldest; the first supported family wins.
    let probes = [
        (MTLGPUFamily::Apple9, 9u32),
        (MTLGPUFamily::Apple8, 8u32),
        (MTLGPUFamily::Apple7, 7u32),
        (MTLGPUFamily::Apple6, 6u32),
        (MTLGPUFamily::Apple5, 5u32),
        (MTLGPUFamily::Apple4, 4u32),
        (MTLGPUFamily::Apple3, 3u32),
        (MTLGPUFamily::Apple2, 2u32),
        (MTLGPUFamily::Apple1, 1u32),
    ];
    probes
        .into_iter()
        .find(|(family, _)| device.supports_family(*family))
        .map_or(0, |(_, generation)| generation)
}
/// Create a miner on the system-default Metal device.
///
/// Compiles the embedded MSL source into two pipeline variants (hash-only
/// and proof-search) selected via function constants, seeds tuning defaults
/// from the detected GPU generation, and finally tries to warm-start from
/// the on-disk tune cache.
///
/// # Errors
/// Fails when no Metal device exists, the shader does not compile, or a
/// compute pipeline cannot be created.
pub fn new() -> Result<Self> {
    let device =
        Device::system_default().ok_or_else(|| anyhow::anyhow!("No Metal device found"))?;
    let options = CompileOptions::new();
    options.set_fast_math_enabled(true);
    let apple_family = Self::detect_apple_family(&device);
    // Newer generations get the vectorized-IO and unrolled shader variants.
    let vector_block_io = apple_family >= 7;
    let unroll_rounds = apple_family >= 8;
    let library = device
        .new_library_with_source(METAL_SHADER_SOURCE, &options)
        .map_err(|e| anyhow::anyhow!("Failed to compile Metal shader: {}", e))?;
    // Specialize the single kernel via function constants:
    // index 0 = vector block IO, 1 = unroll rounds, 2 = proof mode.
    // The bool values are passed by raw pointer; Metal copies them immediately.
    let build_pipeline = |proof_mode: bool| -> Result<ComputePipelineState> {
        let constants = FunctionConstantValues::new();
        constants.set_constant_value_at_index(
            &vector_block_io as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            0,
        );
        constants.set_constant_value_at_index(
            &unroll_rounds as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            1,
        );
        constants.set_constant_value_at_index(
            &proof_mode as *const bool as *const std::ffi::c_void,
            MTLDataType::Bool,
            2,
        );
        let kernel = library
            .get_function("uhash_kernel", Some(constants))
            .map_err(|e| anyhow::anyhow!("Failed to get kernel function: {}", e))?;
        device
            .new_compute_pipeline_state_with_function(&kernel)
            .map_err(|e| anyhow::anyhow!("Failed to create compute pipeline: {}", e))
    };
    let pipeline_hash = build_pipeline(false)?;
    let pipeline_proof = build_pipeline(true)?;
    let queue = device.new_command_queue();
    let device_name = device.name().to_string();
    let registry_id = device.registry_id();
    let unified_memory = device.has_unified_memory();
    let recommended_ws = device.recommended_max_working_set_size();
    let max_buffer_len = device.max_buffer_length();
    // Unified-memory devices tolerate one extra in-flight command buffer.
    let inflight_slots = if unified_memory {
        4
    } else {
        DEFAULT_INFLIGHT_SLOTS
    };
    // Profiling log knobs, read once at construction.
    let profile_enabled = std::env::var_os("UHASH_METAL_PROFILE").is_some();
    let profile_every = std::env::var("UHASH_METAL_PROFILE_EVERY")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(1);
    let force_retune = std::env::var_os("UHASH_METAL_FORCE_RETUNE").is_some();
    let tg_factor = if apple_family >= 8 { 8 } else { 4 };
    let mut miner = Self {
        device,
        queue,
        pipeline_hash,
        pipeline_proof,
        header_buf: None,
        header_capacity: 0,
        params_slots: Vec::new(),
        hashes_private_slots: Vec::new(),
        hashes_readback_slots: Vec::new(),
        found_flag_slots: Vec::new(),
        found_data_slots: Vec::new(),
        scratchpads_slots: Vec::new(),
        chunk_lanes: 0,
        max_chunk_lanes: 0,
        tuned_chunk_lanes: 0,
        inflight_slots,
        tg_factor,
        tg_width: 0,
        did_runtime_profile: false,
        device_name,
        apple_family,
        registry_id,
        unified_memory,
        recommended_ws,
        max_buffer_len,
        vector_block_io,
        unroll_rounds,
        profile_enabled,
        profile_every,
        profile_counter: 0,
        tune_cache_loaded: false,
        force_retune,
    };
    // Warm-start tuning from disk if a valid cache exists for this GPU.
    miner.try_load_tune_cache();
    Ok(miner)
}
/// Suggest a total lane count for a mining batch.
///
/// A non-zero `requested` value is honored as-is (floored at 1). When the
/// caller passes 0, the suggestion is the tuned per-dispatch chunk size
/// multiplied by (a capped) number of in-flight slots, so the queue stays
/// saturated. Tuning limits are lazily seeded from the size heuristic on
/// first use.
pub fn recommended_lanes(&mut self, requested: usize) -> usize {
    if self.tuned_chunk_lanes == 0 {
        let (tuned, cap) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = tuned;
        self.max_chunk_lanes = cap;
    }
    if requested != 0 {
        return requested.max(1);
    }
    // Before a runtime profile, newer GPUs are allowed to start at the hard
    // cap; after profiling, trust the measured chunk size.
    let base = if self.did_runtime_profile {
        self.tuned_chunk_lanes
    } else if self.apple_family >= 8 {
        self.max_chunk_lanes.max(self.tuned_chunk_lanes)
    } else {
        self.tuned_chunk_lanes
    };
    let multiplier = if self.apple_family >= 8 {
        self.inflight_slots.min(4)
    } else {
        self.inflight_slots
    };
    base.saturating_mul(multiplier).max(1)
}
/// Snapshot the device facts and current tuning state for reporting.
pub fn telemetry(&self) -> MetalTelemetry {
    MetalTelemetry {
        device_name: self.device_name.clone(),
        apple_family: self.apple_family,
        registry_id: self.registry_id,
        unified_memory: self.unified_memory,
        recommended_max_working_set_size: self.recommended_ws,
        max_buffer_length: self.max_buffer_len,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        tg_factor: self.tg_factor,
        tg_width: self.tg_width,
        vector_block_io: self.vector_block_io,
        unroll_rounds: self.unroll_rounds,
        runtime_profiled: self.did_runtime_profile,
        cache_warm_start: self.tune_cache_loaded,
    }
}
/// Benchmark entry point: hash `lanes` nonces starting at `start_nonce`
/// without reading hashes back, returning the number of hashes completed.
/// Thin public wrapper over `compute_batch_count`.
pub fn benchmark_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    self.compute_batch_count(header_without_nonce, start_nonce, lanes)
}
/// Path of the per-GPU tuning cache file:
/// `$XDG_CONFIG_HOME/uhash/metal_tuning_<registry_id>.json`, falling back to
/// `$HOME/.config/...`. Returns `None` when neither variable is set.
fn tune_cache_path(registry_id: u64) -> Option<PathBuf> {
    let config_root = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(dir) => PathBuf::from(dir),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    let file_name = format!("metal_tuning_{registry_id:016x}.json");
    Some(config_root.join("uhash").join(file_name))
}
/// Tuning-cache path for the device this miner was created on.
pub fn tune_cache_path_for_current_device(&self) -> Option<PathBuf> {
    Self::tune_cache_path(self.registry_id)
}
/// Delete the on-disk tuning cache for this GPU.
///
/// Returns `Ok(true)` when a cache file was removed, `Ok(false)` when no
/// cache path could be resolved or no file existed. In either non-path-less
/// case the in-memory `tune_cache_loaded` flag is cleared so the next
/// profiling pass re-tunes from scratch.
///
/// # Errors
/// Only genuine I/O failures (permissions, etc.) are reported as errors.
pub fn clear_tune_cache_for_current_device(&mut self) -> Result<bool> {
    let Some(path) = self.tune_cache_path_for_current_device() else {
        return Ok(false);
    };
    // Attempt the removal directly and treat NotFound as "nothing to
    // delete"; a prior `path.exists()` check would be a TOCTOU race.
    match fs::remove_file(&path) {
        Ok(()) => {
            self.tune_cache_loaded = false;
            Ok(true)
        }
        Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
            self.tune_cache_loaded = false;
            Ok(false)
        }
        Err(e) => Err(anyhow::anyhow!(
            "failed to remove {}: {}",
            path.display(),
            e
        )),
    }
}
/// Discard all tuning state and immediately rerun the full runtime
/// profiling pass against `header_without_nonce`.
pub fn force_retune_now(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    self.did_runtime_profile = false;
    self.tune_cache_loaded = false;
    // Force ensure_resources() to reallocate for whatever the profile picks.
    self.chunk_lanes = 0;
    self.runtime_profile_chunk_lanes(header_without_nonce)
}
/// Decide whether this call site should emit a profiling log line: only
/// when UHASH_METAL_PROFILE is set, and then only every `profile_every`-th
/// invocation. Increments the internal counter as a side effect.
fn should_log_profile(&mut self) -> bool {
    if !self.profile_enabled {
        return false;
    }
    let count = self.profile_counter.saturating_add(1);
    self.profile_counter = count;
    // profile_every is guaranteed > 0 by construction (see new()).
    count.is_multiple_of(self.profile_every as u64)
}
/// Try to warm-start tuning from the on-disk cache for this GPU. Silently
/// returns on any missing/invalid input; on success sets `tune_cache_loaded`
/// so the runtime profiling pass is skipped.
fn try_load_tune_cache(&mut self) {
    if self.force_retune {
        return;
    }
    let Some(path) = Self::tune_cache_path(self.registry_id) else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<MetalTuneCache>(&raw) else {
        return;
    };
    // Accept only known cache versions written for this exact GPU.
    if !(cache.version == 1 || cache.version == 2) || cache.registry_id != self.registry_id {
        return;
    }
    // Clamp cached lane counts against today's heuristic cap so a stale
    // cache can never request more memory than the device currently allows.
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must be 32-aligned; reject corrupted values outright.
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    self.tg_factor = cache.tg_factor.clamp(1, 8);
    self.tg_width = cache.tg_width;
    self.tune_cache_loaded = true;
}
/// Write the current tuning state to the per-GPU cache file (version 2).
/// Best-effort: any failure — unresolvable path, serialization, directory
/// creation, or the write itself — is silently ignored.
fn persist_tune_cache(&self) {
    let path = match Self::tune_cache_path(self.registry_id) {
        Some(p) => p,
        None => return,
    };
    let snapshot = MetalTuneCache {
        version: 2,
        registry_id: self.registry_id,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        tg_factor: self.tg_factor,
        tg_width: self.tg_width,
    };
    if let Ok(serialized) = serde_json::to_vec_pretty(&snapshot) {
        if let Some(parent) = path.parent() {
            let _ = fs::create_dir_all(parent);
        }
        let _ = fs::write(path, serialized);
    }
}
/// Derive (tuned, max) per-dispatch lane counts from device memory limits.
///
/// The cap is the smaller of: 70% of the recommended working set divided
/// across all in-flight scratchpad allocations, and the largest single
/// buffer the device allows — clamped to [32, 4096]. The tuned value is
/// additionally capped at 1024. Both results are rounded down to a multiple
/// of 32 lanes (minimum 32).
fn autotune_chunk_limits(&self) -> (usize, usize) {
    const LANE_ALIGN: usize = 32;
    let buffer_limit = self.device.max_buffer_length() as usize;
    let working_set = self.device.recommended_max_working_set_size() as usize;
    // Budget 70% of the recommended working set; 512 MiB when unreported.
    let budget = match working_set {
        0 => 512 * 1024 * 1024,
        ws => ws.saturating_mul(7) / 10,
    };
    let by_working_set = budget
        .saturating_div(TOTAL_MEMORY.saturating_mul(self.inflight_slots))
        .max(1);
    let by_buffer = buffer_limit.saturating_div(TOTAL_MEMORY).max(1);
    let raw_cap = by_working_set.min(by_buffer).clamp(32, 4096);
    let tuned = (raw_cap.min(1024) / LANE_ALIGN).max(1) * LANE_ALIGN;
    let cap = (raw_cap / LANE_ALIGN).max(1) * LANE_ALIGN;
    (tuned, cap)
}
/// Round `v` down to a multiple of `m`, but never below `m` itself:
/// (65, 32) -> 64, (10, 32) -> 32. Assumes `m > 0` (panics on 0).
fn round_to_multiple(v: usize, m: usize) -> usize {
    let whole_units = v / m;
    if whole_units == 0 { m } else { whole_units * m }
}
/// Pick the threadgroup width for a dispatch of `chunk_lanes` threads.
///
/// A profiled override (`self.tg_width > 0`) takes precedence; otherwise the
/// width is `thread_execution_width * tg_factor`. Either way the result is
/// clamped to the pipeline's max threadgroup size and to `chunk_lanes`, then
/// rounded down to a multiple of the execution width (minimum one execution
/// width). Very small dispatches get a single threadgroup covering all lanes.
fn compute_tg_width(&self, chunk_lanes: usize, tg_factor: usize) -> u64 {
    let exec_width = self.pipeline_hash.thread_execution_width().max(1);
    let max_threads = self
        .pipeline_hash
        .max_total_threads_per_threadgroup()
        .max(1);
    // Profiled-override path.
    if self.tg_width > 0 {
        let mut tg = self.tg_width.min(max_threads).min(chunk_lanes as u64);
        tg = (tg / exec_width).max(1) * exec_width;
        return tg.min(chunk_lanes as u64).max(exec_width);
    }
    // Tiny dispatch: one group covers every lane.
    if chunk_lanes as u64 <= exec_width {
        return chunk_lanes as u64;
    }
    let mut tg = exec_width.saturating_mul(tg_factor as u64);
    tg = tg.min(max_threads).min(chunk_lanes as u64);
    tg = (tg / exec_width).max(1) * exec_width;
    tg.min(chunk_lanes as u64).max(exec_width)
}
/// Benchmark candidate threadgroup factors and explicit widths at the given
/// chunk size, keeping whichever maximizes hashes/second.
///
/// Phase 1 sweeps `tg_factor` candidates (3 timed samples each, after one
/// warmup dispatch); phase 2 sweeps explicit `tg_width` shapes derived from
/// the pipeline's execution width. Failed or non-Completed dispatches simply
/// disqualify that candidate. Resets `chunk_lanes` to 0 at the end so
/// buffers are reallocated for the winning configuration.
fn runtime_profile_threadgroup_factor(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let factors: &[usize] = if self.apple_family >= 8 {
        &[2, 4, 8]
    } else {
        &[1, 2, 4, 8]
    };
    let current = self.tg_factor;
    let mut best_factor = current;
    let mut best_hps = 0.0f64;
    let samples_per_factor = 3usize;
    for &factor in factors {
        self.tg_factor = factor;
        // Clear any explicit width so compute_tg_width derives from factor.
        self.tg_width = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // One warmup dispatch so the timed samples are steady-state.
        let warmup = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let mut total_elapsed = 0.0f64;
        for sample in 0..samples_per_factor {
            // Vary the nonce per sample so each run hashes fresh inputs.
            let nonce = 1u64.saturating_add((sample as u64).saturating_mul(chunk_lanes as u64));
            let start = Instant::now();
            let cmd = self.dispatch_chunk(
                0,
                chunk_lanes,
                header_without_nonce.len(),
                nonce,
                0,
                false,
                false,
                false,
                None,
            )?;
            cmd.wait_until_completed();
            if cmd.status() != MTLCommandBufferStatus::Completed {
                // Disqualify the whole factor on any failed sample.
                total_elapsed = 0.0;
                break;
            }
            total_elapsed += start.elapsed().as_secs_f64();
        }
        if total_elapsed <= 0.0 {
            continue;
        }
        let hps = (chunk_lanes * samples_per_factor) as f64 / total_elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_factor = factor;
        }
    }
    self.tg_factor = best_factor;
    // Phase 2: try explicit threadgroup widths (multiples of the execution
    // width plus the pipeline maximum) on top of the winning factor.
    let exec = self.pipeline_hash.thread_execution_width().max(1);
    let max_threads = self
        .pipeline_hash
        .max_total_threads_per_threadgroup()
        .max(1);
    let mut shape_candidates = vec![exec, exec.saturating_mul(2), exec.saturating_mul(3)];
    shape_candidates.push(exec.saturating_mul(4));
    shape_candidates.push(exec.saturating_mul(6));
    shape_candidates.push(exec.saturating_mul(8));
    shape_candidates.push(max_threads);
    shape_candidates.sort_unstable();
    shape_candidates.dedup();
    let mut best_shape = self.compute_tg_width(chunk_lanes, self.tg_factor);
    let mut best_shape_hps = 0.0f64;
    for shape in shape_candidates {
        self.tg_width = shape;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        let warmup = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let start = Instant::now();
        let cmd = self.dispatch_chunk(
            0,
            chunk_lanes,
            header_without_nonce.len(),
            1,
            0,
            false,
            false,
            false,
            None,
        )?;
        cmd.wait_until_completed();
        if cmd.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_shape_hps {
            best_shape_hps = hps;
            best_shape = shape;
        }
    }
    self.tg_width = best_shape;
    // Force reallocation under the final configuration on next use.
    self.chunk_lanes = 0;
    Ok(())
}
/// Full runtime profiling pass: pick the best per-dispatch chunk size, then
/// tune the threadgroup shape and in-flight slot count for it. Runs at most
/// once per session (`did_runtime_profile` guards re-entry) and persists the
/// result to the tune cache unless profiling-log mode is on.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Set the flag up-front so nested compute_batch_count calls made by the
    // slot profiler don't recurse back into profiling.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: base, 2x, 3x (32-aligned, capped), and the hard cap.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warmup dispatch so the timed run measures steady-state throughput.
        let warmup = self.dispatch_chunk(
            0,
            lane,
            header_without_nonce.len(),
            0,
            0,
            false,
            false,
            false,
            None,
        )?;
        warmup.wait_until_completed();
        if warmup.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let start = Instant::now();
        let cmd = self.dispatch_chunk(
            0,
            lane,
            header_without_nonce.len(),
            lane as u64,
            0,
            false,
            false,
            false,
            None,
        )?;
        cmd.wait_until_completed();
        if cmd.status() != MTLCommandBufferStatus::Completed {
            continue;
        }
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    // Tune the remaining knobs at the winning chunk size.
    self.runtime_profile_threadgroup_factor(header_without_nonce, self.tuned_chunk_lanes)?;
    self.runtime_profile_inflight_slots(header_without_nonce, self.tuned_chunk_lanes)?;
    self.chunk_lanes = 0;
    if !self.profile_enabled {
        self.persist_tune_cache();
    }
    Ok(())
}
/// Benchmark candidate in-flight slot counts by running a full
/// `compute_batch_count` per candidate, keeping the fastest. Unified-memory
/// devices try deeper queues. Leaves resources allocated for the winner.
fn runtime_profile_inflight_slots(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates: &[usize] = if self.unified_memory {
        &[3, 4, 5, 6]
    } else {
        &[2, 3, 4]
    };
    let original = self.inflight_slots;
    let mut best_slots = original;
    let mut best_hps = 0.0f64;
    for &slots in candidates {
        self.inflight_slots = slots;
        // Force ensure_resources to rebuild the per-slot buffer vectors.
        self.chunk_lanes = 0;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Enough lanes to cycle every slot at least twice.
        let test_lanes = chunk_lanes
            .saturating_mul(slots)
            .saturating_mul(2)
            .max(chunk_lanes);
        let start = Instant::now();
        let done = self.compute_batch_count(header_without_nonce, 0, test_lanes)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 || done == 0 {
            continue;
        }
        let hps = done as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_slots = slots;
        }
    }
    self.inflight_slots = best_slots;
    // Reallocate once more under the winning slot count.
    self.chunk_lanes = 0;
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    Ok(())
}
/// Make sure tuning has been settled before mining: a warm on-disk cache
/// counts as profiled (unless a forced retune was requested); otherwise run
/// the live profiling pass. Idempotent after the first successful call.
fn ensure_runtime_profile(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    let cache_suffices = self.tune_cache_loaded && !self.force_retune;
    match (self.did_runtime_profile, cache_suffices) {
        (true, _) => Ok(()),
        (false, true) => {
            // Warm cache substitutes for a live profiling pass.
            self.did_runtime_profile = true;
            Ok(())
        }
        (false, false) => self.runtime_profile_chunk_lanes(header_without_nonce),
    }
}
/// (Re)allocate the per-slot GPU buffers for `chunk_lanes` lanes and grow the
/// shared header staging buffer to hold `header_len` bytes. No-op when the
/// existing allocation already matches both the chunk size and the current
/// number of in-flight slots.
///
/// # Errors
/// Fails when `chunk_lanes` is 0 or any single buffer would exceed the
/// device's maximum buffer length.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    // Header buffer grows geometrically (power-of-two capacity) and is never
    // shrunk; shared storage so the CPU can memcpy the header in directly.
    if header_len > self.header_capacity {
        let new_capacity = header_len.next_power_of_two();
        self.header_buf = Some(
            self.device
                .new_buffer(new_capacity as u64, MTLResourceOptions::StorageModeShared),
        );
        self.header_capacity = new_capacity;
    }
    // Fast path: everything already sized for this configuration.
    if self.chunk_lanes == chunk_lanes
        && self.params_slots.len() == self.inflight_slots
        && self.hashes_private_slots.len() == self.inflight_slots
        && self.hashes_readback_slots.len() == self.inflight_slots
        && self.found_flag_slots.len() == self.inflight_slots
        && self.found_data_slots.len() == self.inflight_slots
        && self.scratchpads_slots.len() == self.inflight_slots
    {
        return Ok(());
    }
    let hash_bytes = chunk_lanes.saturating_mul(32); // one 32-byte hash per lane
    let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY); // 2 MiB per lane
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    let max_buffer = self.device.max_buffer_length() as usize;
    if hash_bytes > max_buffer || scratch_bytes > max_buffer {
        anyhow::bail!(
            "chunk too large: chunk_lanes={} hash_bytes={} scratch_bytes={} max_buffer={}",
            chunk_lanes,
            hash_bytes,
            scratch_bytes,
            max_buffer
        );
    }
    // Rebuild every slot vector from scratch (drops the old buffers).
    self.params_slots.clear();
    self.hashes_private_slots.clear();
    self.hashes_readback_slots.clear();
    self.found_flag_slots.clear();
    self.found_data_slots.clear();
    self.scratchpads_slots.clear();
    for _ in 0..self.inflight_slots {
        // CPU-written dispatch parameters.
        self.params_slots.push(self.device.new_buffer(
            std::mem::size_of::<KernelParams>() as u64,
            MTLResourceOptions::StorageModeShared,
        ));
        // GPU-only hash output (used when no CPU readback is needed).
        self.hashes_private_slots.push(
            self.device
                .new_buffer(hash_bytes as u64, MTLResourceOptions::StorageModePrivate),
        );
        // CPU-readable hash output for readback batches.
        self.hashes_readback_slots.push(
            self.device
                .new_buffer(hash_bytes as u64, MTLResourceOptions::StorageModeShared),
        );
        // u32 "proof found" flag, zeroed before each dispatch.
        self.found_flag_slots.push(
            self.device
                .new_buffer(4, MTLResourceOptions::StorageModeShared),
        );
        // nonce + hash record for a found proof.
        self.found_data_slots.push(self.device.new_buffer(
            FOUND_DATA_BYTES as u64,
            MTLResourceOptions::StorageModeShared,
        ));
        // Large working memory; GPU-private, never read by the CPU.
        self.scratchpads_slots.push(
            self.device
                .new_buffer(scratch_bytes as u64, MTLResourceOptions::StorageModePrivate),
        );
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Encode and commit one kernel dispatch on buffer slot `slot`.
///
/// Writes `KernelParams` and zeroes the found flag/data for the slot, binds
/// the six kernel buffers (header, hashes, scratchpad, params, found flag,
/// found data — indices 0..=5), dispatches `chunk_lanes` threads, and
/// commits. Does NOT wait: the caller either waits on the returned command
/// buffer or supplies `completion_tx` to be notified asynchronously.
///
/// * `proof_mode` — select the proof-search pipeline variant.
/// * `emit_hashes` — ask the kernel to write per-lane hashes.
/// * `copy_hashes_for_readback` — bind the CPU-readable output buffer
///   instead of the GPU-private one.
///
/// The caller must have called `ensure_resources` for this slot/chunk size
/// first and must ensure the slot has no other dispatch in flight.
#[allow(clippy::too_many_arguments)]
fn dispatch_chunk(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
    difficulty: u32,
    proof_mode: bool,
    emit_hashes: bool,
    copy_hashes_for_readback: bool,
    completion_tx: Option<&Sender<CompletionEvent>>,
) -> Result<CommandBuffer> {
    let log_dispatch = self.should_log_profile();
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    let params_buf = self
        .params_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    let hashes_private = self
        .hashes_private_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes private slot {} missing", slot))?;
    let hashes_readback = self
        .hashes_readback_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes readback slot {} missing", slot))?;
    // Output target: CPU-readable shared buffer only when readback is wanted.
    let hashes_out = if copy_hashes_for_readback {
        hashes_readback
    } else {
        hashes_private
    };
    let found_flag = self
        .found_flag_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("found flag slot {} missing", slot))?;
    let found_data = self
        .found_data_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("found data slot {} missing", slot))?;
    let scratchpad = self
        .scratchpads_slots
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;
    let header_len_u32 = u32::try_from(header_len)
        .map_err(|_| anyhow::anyhow!("header too large for gpu kernel: {}", header_len))?;
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len_u32,
        difficulty,
        emit_hashes: if emit_hashes { 1 } else { 0 },
        _pad0: 0,
        start_nonce,
    };
    // SAFETY: params_buf is a shared-storage buffer sized for one
    // KernelParams (see ensure_resources); no dispatch is in flight on this
    // slot, so the GPU is not reading it concurrently.
    unsafe {
        std::ptr::copy_nonoverlapping(
            &params as *const KernelParams,
            params_buf.contents() as *mut KernelParams,
            1,
        );
    }
    // SAFETY: both buffers are shared-storage and sized 4 / FOUND_DATA_BYTES
    // respectively; reset them so stale results from a previous dispatch on
    // this slot cannot be misread as a new proof.
    unsafe {
        *(found_flag.contents() as *mut u32) = 0u32;
        std::ptr::write_bytes(found_data.contents() as *mut u8, 0, FOUND_DATA_BYTES);
    }
    let command_buffer = self.queue.new_command_buffer().to_owned();
    // Debug labels only when profiling, to keep the hot path cheap.
    if log_dispatch {
        command_buffer.set_label(&format!(
            "uhash slot={} lanes={} nonce={}",
            slot, chunk_lanes, start_nonce
        ));
        command_buffer.push_debug_group("uhash.dispatch");
    }
    let encoder = command_buffer.new_compute_command_encoder();
    let pipeline = if proof_mode {
        &self.pipeline_proof
    } else {
        &self.pipeline_hash
    };
    encoder.set_compute_pipeline_state(pipeline);
    // Buffer indices must match the kernel's buffer bindings.
    encoder.set_buffer(0, Some(header_buf), 0);
    encoder.set_buffer(1, Some(hashes_out), 0);
    encoder.set_buffer(2, Some(scratchpad), 0);
    encoder.set_buffer(3, Some(params_buf), 0);
    encoder.set_buffer(4, Some(found_flag), 0);
    encoder.set_buffer(5, Some(found_data), 0);
    let tg_width = self.compute_tg_width(chunk_lanes, self.tg_factor);
    let grid = metal::MTLSize {
        width: chunk_lanes as u64,
        height: 1,
        depth: 1,
    };
    let tg = metal::MTLSize {
        width: tg_width,
        height: 1,
        depth: 1,
    };
    // dispatch_threads handles non-uniform grids, so chunk_lanes need not be
    // a multiple of tg_width.
    encoder.dispatch_threads(grid, tg);
    encoder.end_encoding();
    if log_dispatch {
        command_buffer.pop_debug_group();
    }
    // Optional async completion: the handler runs on a Metal-owned thread
    // and reports (slot, status) back over the channel.
    if let Some(tx) = completion_tx {
        let tx = tx.clone();
        let block = ConcreteBlock::new(move |cb: &CommandBufferRef| {
            let _ = tx.send((slot, cb.status()));
        })
        .copy();
        command_buffer.add_completed_handler(&block);
    }
    command_buffer.commit();
    Ok(command_buffer)
}
/// Copy `pending.lanes` 32-byte hashes from the slot's shared readback
/// buffer into `out_hashes[pending.output_lane_offset ..]`.
///
/// Callers must only invoke this after the command buffer behind `pending`
/// has reported completion, otherwise the GPU may still be writing.
///
/// # Errors
/// Fails when the slot is missing, or when the destination range or source
/// buffer is too small for the copy — previously such an inconsistency
/// would have caused an out-of-bounds write inside the `unsafe` block.
#[allow(dead_code)]
fn collect_readback_without_wait(
    &self,
    pending: &PendingDispatch,
    out_hashes: &mut [[u8; 32]],
) -> Result<()> {
    let readback = self
        .hashes_readback_slots
        .get(pending._slot)
        .ok_or_else(|| anyhow::anyhow!("missing readback slot {}", pending._slot))?;
    let dst_offset = pending.output_lane_offset;
    let lane_count = pending.lanes;
    // Validate the destination range before touching raw pointers.
    let dst_end = dst_offset
        .checked_add(lane_count)
        .ok_or_else(|| anyhow::anyhow!("readback lane range overflows usize"))?;
    if dst_end > out_hashes.len() {
        anyhow::bail!(
            "readback range {}..{} exceeds output capacity {}",
            dst_offset,
            dst_end,
            out_hashes.len()
        );
    }
    let byte_len = lane_count
        .checked_mul(32)
        .ok_or_else(|| anyhow::anyhow!("readback byte length overflows usize"))?;
    if (readback.length() as usize) < byte_len {
        anyhow::bail!(
            "readback buffer too small: {} < {}",
            readback.length(),
            byte_len
        );
    }
    // SAFETY: the destination range lies within `out_hashes` and the source
    // buffer holds at least `byte_len` bytes (both checked above); the
    // regions cannot overlap since one is a Metal allocation and the other a
    // Rust slice.
    unsafe {
        std::ptr::copy_nonoverlapping(
            readback.contents() as *const u8,
            out_hashes.as_mut_ptr().add(dst_offset) as *mut u8,
            byte_len,
        );
    }
    Ok(())
}
/// Block until the next GPU completion handler reports (slot, status).
/// Errors only when the channel is closed, i.e. every sender (each pending
/// dispatch's handler) has been dropped.
fn await_completion_event(
    &self,
    rx: &Receiver<CompletionEvent>,
) -> Result<(usize, MTLCommandBufferStatus)> {
    match rx.recv() {
        Ok(event) => Ok(event),
        Err(e) => Err(anyhow::anyhow!("completion channel closed: {}", e)),
    }
}
/// Hash `lanes` consecutive nonces starting at `start_nonce` and return all
/// per-lane 32-byte hashes, pipelining chunks across the in-flight slots.
///
/// The batch is split into `chunk_lanes`-sized dispatches; each completed
/// chunk's hashes are copied from its readback buffer into the right offset
/// of the output vector.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
#[allow(dead_code)]
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut out_hashes = vec![[0u8; 32]; lanes];
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let start_submit = Instant::now();
    let mut completion_waits = 0usize;
    // Submit-then-reap loop: fill every free slot, then block on one
    // completion event, recycle that slot, repeat until all lanes are done.
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                0,
                false,
                true,
                true,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        completion_waits = completion_waits.saturating_add(1);
        let done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        // Safe to read: the command buffer for this slot has completed.
        self.collect_readback_without_wait(&done, &mut out_hashes)?;
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    if self.should_log_profile() {
        let elapsed = start_submit.elapsed().as_secs_f64();
        eprintln!(
            "[metal-prof] compute_batch_hashes lanes={} chunk={} inflight={} elapsed={:.4}s completion_waits={}",
            lanes, chunk_lanes, self.inflight_slots, elapsed, completion_waits
        );
    }
    Ok(out_hashes)
}
/// Hash `lanes` consecutive nonces without reading results back (throughput
/// benchmarking): same pipelined submit/reap loop as `compute_batch_hashes`
/// but with hash emission and readback disabled. Returns `lanes` on success.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
fn compute_batch_count(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    if lanes == 0 {
        return Ok(0);
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let start_submit = Instant::now();
    let mut completion_waits = 0usize;
    // Submit-then-reap loop: keep every slot busy, block on one completion,
    // recycle that slot, repeat until all lanes are covered.
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                0,
                false,
                false,
                false,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: 0,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        completion_waits = completion_waits.saturating_add(1);
        // Taking the pending record drops its command buffer reference.
        let _done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    if self.should_log_profile() {
        let elapsed = start_submit.elapsed().as_secs_f64();
        eprintln!(
            "[metal-prof] compute_batch_count lanes={} chunk={} inflight={} elapsed={:.4}s completion_waits={}",
            lanes, chunk_lanes, self.inflight_slots, elapsed, completion_waits
        );
    }
    Ok(lanes)
}
/// Search `lanes` consecutive nonces (starting at `start_nonce`) for a proof
/// meeting `difficulty`, using the proof-mode pipeline. Returns the first
/// `(nonce, hash)` reported by the GPU, or `None` when the whole range is
/// exhausted. All submitted chunks are drained even after a hit so no slot
/// is left with work in flight.
///
/// # Errors
/// Fails on an empty header, resource allocation failure, or any command
/// buffer finishing with a non-Completed status.
pub fn find_proof_batch(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
    difficulty: u32,
) -> Result<Option<(u64, [u8; 32])>> {
    if lanes == 0 {
        return Ok(None);
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits if nothing (heuristic, cache, profile) has yet.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    self.ensure_runtime_profile(header_without_nonce)?;
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    let header_buf = self
        .header_buf
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    // SAFETY: ensure_resources guarantees header_buf capacity >= header len;
    // shared storage, and no dispatch is in flight yet.
    unsafe {
        std::ptr::copy_nonoverlapping(
            header_without_nonce.as_ptr(),
            header_buf.contents() as *mut u8,
            header_without_nonce.len(),
        );
    }
    let mut pending_by_slot: Vec<Option<PendingDispatch>> =
        (0..self.inflight_slots).map(|_| None).collect();
    let mut available_slots: VecDeque<usize> = (0..self.inflight_slots).collect();
    let (completion_tx, completion_rx) = mpsc::channel::<CompletionEvent>();
    let mut in_flight = 0usize;
    let mut lane_offset = 0usize;
    let mut found: Option<(u64, [u8; 32])> = None;
    while lane_offset < lanes || in_flight > 0 {
        while lane_offset < lanes && !available_slots.is_empty() {
            let this_lanes = (lanes - lane_offset).min(chunk_lanes);
            let slot = available_slots.pop_front().expect("slot available");
            let this_start_nonce = start_nonce.wrapping_add(lane_offset as u64);
            let command_buffer = self.dispatch_chunk(
                slot,
                this_lanes,
                header_without_nonce.len(),
                this_start_nonce,
                difficulty,
                true,
                false,
                false,
                Some(&completion_tx),
            )?;
            pending_by_slot[slot] = Some(PendingDispatch {
                _slot: slot,
                output_lane_offset: lane_offset,
                lanes: this_lanes,
                _command_buffer: command_buffer,
            });
            in_flight += 1;
            lane_offset += this_lanes;
        }
        if in_flight == 0 {
            break;
        }
        let (slot, status) = self.await_completion_event(&completion_rx)?;
        let _done = pending_by_slot
            .get_mut(slot)
            .and_then(Option::take)
            .ok_or_else(|| anyhow::anyhow!("completion for unknown slot {}", slot))?;
        if status != MTLCommandBufferStatus::Completed {
            anyhow::bail!(
                "metal command failed with status {:?} (slot={})",
                status,
                slot
            );
        }
        // First hit wins; later completions are only drained, not inspected.
        if found.is_none() {
            let found_flag = self
                .found_flag_slots
                .get(slot)
                .ok_or_else(|| anyhow::anyhow!("missing found flag slot {}", slot))?;
            let found_data = self
                .found_data_slots
                .get(slot)
                .ok_or_else(|| anyhow::anyhow!("missing found data slot {}", slot))?;
            // SAFETY: shared-storage buffers of 4 / FOUND_DATA_BYTES bytes,
            // and this slot's command buffer has completed, so the GPU is
            // done writing them.
            unsafe {
                let flag = *(found_flag.contents() as *const u32);
                if flag != 0 {
                    // Record layout: 8-byte LE nonce followed by 32-byte hash.
                    let mut nonce_bytes = [0u8; 8];
                    std::ptr::copy_nonoverlapping(
                        found_data.contents() as *const u8,
                        nonce_bytes.as_mut_ptr(),
                        8,
                    );
                    let mut hash = [0u8; 32];
                    std::ptr::copy_nonoverlapping(
                        (found_data.contents() as *const u8).add(8),
                        hash.as_mut_ptr(),
                        32,
                    );
                    let nonce = u64::from_le_bytes(nonce_bytes);
                    // Sanity-check the reported nonce against the batch range.
                    // NOTE(review): the upper bound admits start_nonce + lanes
                    // (one past the dispatched range, which ends at
                    // start_nonce + lanes - 1), and the `>= && <` pair
                    // misbehaves if the range wraps u64 — confirm intended.
                    if nonce >= start_nonce
                        && nonce < start_nonce.wrapping_add(lanes as u64).wrapping_add(1)
                    {
                        found = Some((nonce, hash));
                    }
                }
            }
        }
        available_slots.push_back(slot);
        in_flight = in_flight.saturating_sub(1);
    }
    Ok(found)
}
}
/// Metal Shading Language source for the UHash kernel, compiled at runtime
/// with `FunctionConstantValues` (see pipeline construction in this file).
///
/// The shader-side constants mirror the Rust-side ones: `CHAINS` = 4 and
/// `SCRATCHPAD_SIZE` = 524288 match the `CHAINS`/`SCRATCHPAD_SIZE` consts at
/// the top of this file, and the shader `KernelParams` struct must stay
/// field-for-field and layout-compatible with the `#[repr(C)] KernelParams`
/// Rust struct (lanes/rounds/header_len/difficulty/emit_hashes/_pad0 as
/// 32-bit, then a 64-bit start_nonce).
///
/// Function constants (bound by index at pipeline creation):
/// - index 0, `FC_VECTOR_BLOCK_IO`: use `uchar4` vector loads/stores for
///   64-byte scratchpad blocks instead of byte-wise copies.
/// - index 1, `FC_UNROLL_ROUNDS`: manually 2x-unrolled round loop.
/// - index 2, `FC_PROOF_MODE`: difficulty checking + winner reporting via
///   `found_flag`/`found_data` instead of writing hashes to `hashes_out`.
///
/// Per lane the kernel: derives 4 chain states with BLAKE3 over
/// header||nonce (nonce perturbed per chain by `GOLDEN_RATIO`), fills each
/// 512 KiB scratchpad with AES-expanded blocks, runs `params.rounds` mixing
/// rounds that rotate between AES/SHA-256/BLAKE3 compression primitives,
/// XOR-combines the chain states and finalizes with SHA-256 then BLAKE3.
/// `found_data` layout matches `FOUND_DATA_BYTES` = 8 (LE nonce) + 32 (hash).
///
/// NOTE(review): in proof mode, threads with `gid >= params.lanes` return
/// before the `threadgroup_barrier` calls near the end of the kernel; if a
/// dispatch lets a threadgroup straddle the lane boundary that barrier is
/// non-uniform (undefined behavior per the MSL spec). Confirm the host side
/// always pads the grid so whole threadgroups are in-bounds.
///
/// Do not edit this string casually: it is consensus-critical — the tests
/// below assert bit-exact agreement with the CPU `UniversalHash`.
const METAL_SHADER_SOURCE: &str = r#"
#include <metal_stdlib>
using namespace metal;
constant uint CHAINS = 4;
constant uint SCRATCHPAD_SIZE = 524288;
constant uint BLOCK_SIZE = 64;
constant uint BLOCKS_PER_SCRATCHPAD = 8192;
constant uint ADDRESS_MASK = 8191;
constant ulong GOLDEN_RATIO = 0x9E3779B97F4A7C15ul;
constant bool FC_VECTOR_BLOCK_IO [[function_constant(0)]];
constant bool FC_UNROLL_ROUNDS [[function_constant(1)]];
constant bool FC_PROOF_MODE [[function_constant(2)]];
constant uint B3_CHUNK_START = 1;
constant uint B3_CHUNK_END = 2;
constant uint B3_ROOT = 8;
struct KernelParams {
uint lanes;
uint rounds;
uint header_len;
uint difficulty;
uint emit_hashes;
uint _pad0;
ulong start_nonce;
};
constant uchar SBOX[256] = {
0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
};
constant uint SHA256_K[64] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};
constant uint SHA256_IV[8] = {
0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
};
constant uint B3_IV[8] = {0x6A09E667,0xBB67AE85,0x3C6EF372,0xA54FF53A,0x510E527F,0x9B05688C,0x1F83D9AB,0x5BE0CD19};
constant ushort B3_S[7][16] = {
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
{2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8},
{3,4,10,12,13,2,7,14,6,5,9,0,11,15,8,1},
{10,7,12,9,14,3,13,15,4,0,11,2,5,8,1,6},
{12,13,9,11,15,10,14,8,7,2,5,3,0,1,6,4},
{9,14,11,5,8,12,15,1,13,3,0,10,2,6,4,7},
{11,15,5,0,1,9,8,6,14,10,2,12,3,4,7,13}
};
inline uchar gf_mul2(uchar x) { uchar hi = x >> 7; return (uchar)((x << 1) ^ (hi * 0x1b)); }
inline uchar gf_mul3(uchar x) { return (uchar)(gf_mul2(x) ^ x); }
inline void aesenc_round(thread uchar state[16], thread const uchar rk[16]) {
uchar s[16];
for (uint i = 0; i < 16; i++) s[i] = SBOX[state[i]];
uchar t[16];
for (uint i = 0; i < 16; i++) t[i] = s[i];
s[1]=t[5]; s[5]=t[9]; s[9]=t[13]; s[13]=t[1];
s[2]=t[10]; s[6]=t[14]; s[10]=t[2]; s[14]=t[6];
s[3]=t[15]; s[7]=t[3]; s[11]=t[7]; s[15]=t[11];
uchar out[16];
for (uint c = 0; c < 4; c++) {
uint i = c * 4;
uchar a0=s[i], a1=s[i+1], a2=s[i+2], a3=s[i+3];
out[i] = (uchar)(gf_mul2(a0) ^ gf_mul3(a1) ^ a2 ^ a3);
out[i+1] = (uchar)(a0 ^ gf_mul2(a1) ^ gf_mul3(a2) ^ a3);
out[i+2] = (uchar)(a0 ^ a1 ^ gf_mul2(a2) ^ gf_mul3(a3));
out[i+3] = (uchar)(gf_mul3(a0) ^ a1 ^ a2 ^ gf_mul2(a3));
}
for (uint i = 0; i < 16; i++) state[i] = (uchar)(out[i] ^ rk[i]);
}
inline void aes_expand_block(thread uchar state[16], thread const uchar key[16]) {
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
}
inline uchar4 make_u4(thread const uchar a[16], uint i) {
return uchar4(a[i], a[i+1], a[i+2], a[i+3]);
}
inline void write_scratch_block(device uchar* dst, thread const uchar st[16], thread const uchar st2[16]) {
if (FC_VECTOR_BLOCK_IO) {
device uchar4* d4 = (device uchar4*)dst;
d4[0] = make_u4(st, 0); d4[1] = make_u4(st, 4); d4[2] = make_u4(st, 8); d4[3] = make_u4(st, 12);
d4[4] = make_u4(st2, 0); d4[5] = make_u4(st2, 4); d4[6] = make_u4(st2, 8); d4[7] = make_u4(st2, 12);
d4[8] = make_u4(st, 0); d4[9] = make_u4(st, 4); d4[10] = make_u4(st, 8); d4[11] = make_u4(st, 12);
d4[12] = make_u4(st2, 0); d4[13] = make_u4(st2, 4); d4[14] = make_u4(st2, 8); d4[15] = make_u4(st2, 12);
} else {
for(uint i=0;i<16;i++) {
dst[i] = st[i];
dst[16+i] = st2[i];
dst[32+i] = st[i];
dst[48+i] = st2[i];
}
}
}
inline void load_scratch_block(const device uchar* src, thread uchar block[64]) {
if (FC_VECTOR_BLOCK_IO) {
device const uchar4* src4 = (device const uchar4*)src;
for (uint i=0; i<16; i++) {
uchar4 v = src4[i];
uint o = i * 4;
block[o] = v.x;
block[o+1] = v.y;
block[o+2] = v.z;
block[o+3] = v.w;
}
} else {
for(uint i=0;i<64;i++) block[i] = src[i];
}
}
inline uint rotr(uint x, uint n) { return (x >> n) | (x << (32 - n)); }
inline uint rd_be32(thread const uchar* p) {
return ((uint)p[0] << 24) | ((uint)p[1] << 16) | ((uint)p[2] << 8) | (uint)p[3];
}
inline ulong rd_le64(thread const uchar* p) {
return (ulong)p[0] | ((ulong)p[1] << 8) | ((ulong)p[2] << 16) | ((ulong)p[3] << 24) |
((ulong)p[4] << 32) | ((ulong)p[5] << 40) | ((ulong)p[6] << 48) | ((ulong)p[7] << 56);
}
inline void wr_be32(thread uchar* p, uint v) {
p[0] = (uchar)(v >> 24); p[1] = (uchar)(v >> 16); p[2] = (uchar)(v >> 8); p[3] = (uchar)(v);
}
inline void wr_le32(thread uchar* p, uint v) {
p[0] = (uchar)(v); p[1] = (uchar)(v >> 8); p[2] = (uchar)(v >> 16); p[3] = (uchar)(v >> 24);
}
inline void sha256_compress(thread const uchar state_b[32], thread const uchar block[64], thread uchar out[32]) {
uint s[8];
for (uint i=0;i<8;i++) s[i]=rd_be32(&state_b[i*4]);
uint w[64];
for (uint i=0;i<16;i++) w[i]=rd_be32(&block[i*4]);
for (uint i=16;i<64;i++) {
uint s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
uint s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
uint a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
for (uint i=0;i<64;i++) {
uint S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
uint ch = (e & f) ^ ((~e) & g);
uint t1 = h + S1 + ch + SHA256_K[i] + w[i];
uint S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
uint maj = (a & b) ^ (a & c) ^ (b & c);
uint t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
s[0]+=a; s[1]+=b; s[2]+=c; s[3]+=d; s[4]+=e; s[5]+=f; s[6]+=g; s[7]+=h;
for (uint i=0;i<8;i++) wr_be32(&out[i*4], s[i]);
}
inline void blake3_g(thread uint v[16], uint a, uint b, uint c, uint d, uint mx, uint my) {
v[a] = v[a] + v[b] + mx;
v[d] = rotr(v[d] ^ v[a], 16);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 12);
v[a] = v[a] + v[b] + my;
v[d] = rotr(v[d] ^ v[a], 8);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 7);
}
inline void blake3_compress(thread const uchar state_b[32], thread const uchar block[64], thread uchar out[32]) {
uint h[8];
uint m[16];
for (uint i=0;i<8;i++) h[i] = (uint)state_b[i*4] | ((uint)state_b[i*4+1]<<8) | ((uint)state_b[i*4+2]<<16) | ((uint)state_b[i*4+3]<<24);
for (uint i=0;i<16;i++) m[i] = (uint)block[i*4] | ((uint)block[i*4+1]<<8) | ((uint)block[i*4+2]<<16) | ((uint)block[i*4+3]<<24);
uint v[16];
for(uint i=0;i<8;i++){ v[i]=h[i]; v[i+8]=B3_IV[i]; }
for(uint r=0;r<7;r++){
blake3_g(v,0,4,8,12,m[B3_S[r][0]],m[B3_S[r][1]]);
blake3_g(v,1,5,9,13,m[B3_S[r][2]],m[B3_S[r][3]]);
blake3_g(v,2,6,10,14,m[B3_S[r][4]],m[B3_S[r][5]]);
blake3_g(v,3,7,11,15,m[B3_S[r][6]],m[B3_S[r][7]]);
blake3_g(v,0,5,10,15,m[B3_S[r][8]],m[B3_S[r][9]]);
blake3_g(v,1,6,11,12,m[B3_S[r][10]],m[B3_S[r][11]]);
blake3_g(v,2,7,8,13,m[B3_S[r][12]],m[B3_S[r][13]]);
blake3_g(v,3,4,9,14,m[B3_S[r][14]],m[B3_S[r][15]]);
}
for (uint i=0;i<8;i++) {
uint x = v[i] ^ v[i+8];
wr_le32(&out[i*4], x);
}
}
inline void blake3_compress_hash(
thread const uint cv[8],
thread const uint block_words[16],
uint counter_low,
uint counter_high,
uint block_len,
uint flags,
thread uint out[16]
) {
uint v[16];
for(uint i=0;i<8;i++){ v[i]=cv[i]; }
v[8] = B3_IV[0];
v[9] = B3_IV[1];
v[10] = B3_IV[2];
v[11] = B3_IV[3];
v[12] = counter_low;
v[13] = counter_high;
v[14] = block_len;
v[15] = flags;
for(uint r=0;r<7;r++){
blake3_g(v,0,4,8,12,block_words[B3_S[r][0]],block_words[B3_S[r][1]]);
blake3_g(v,1,5,9,13,block_words[B3_S[r][2]],block_words[B3_S[r][3]]);
blake3_g(v,2,6,10,14,block_words[B3_S[r][4]],block_words[B3_S[r][5]]);
blake3_g(v,3,7,11,15,block_words[B3_S[r][6]],block_words[B3_S[r][7]]);
blake3_g(v,0,5,10,15,block_words[B3_S[r][8]],block_words[B3_S[r][9]]);
blake3_g(v,1,6,11,12,block_words[B3_S[r][10]],block_words[B3_S[r][11]]);
blake3_g(v,2,7,8,13,block_words[B3_S[r][12]],block_words[B3_S[r][13]]);
blake3_g(v,3,4,9,14,block_words[B3_S[r][14]],block_words[B3_S[r][15]]);
}
for (uint i=0;i<8;i++) {
out[i] = v[i] ^ v[i+8];
out[i+8] = v[i+8] ^ cv[i];
}
}
inline void blake3_compress_inplace(
thread uint cv[8],
thread const uint block_words[16],
uint counter_low,
uint counter_high,
uint block_len,
uint flags
) {
uint out_words[16];
blake3_compress_hash(cv, block_words, counter_low, counter_high, block_len, flags, out_words);
for (uint i=0;i<8;i++) cv[i] = out_words[i];
}
inline uchar header_nonce_byte(const device uchar* header, uint header_len, ulong nonce, uint idx) {
if (idx < header_len) {
return header[idx];
}
uint j = idx - header_len;
if (j < 8) {
return (uchar)((nonce >> (j * 8)) & 0xfful);
}
return 0;
}
inline void blake3_hash_header_nonce(
const device uchar* header,
uint header_len,
ulong nonce,
thread uchar out[32]
) {
uint total_len = header_len + 8;
uint blocks = (total_len + 63) / 64;
uint cv[8];
for (uint i=0;i<8;i++) cv[i] = B3_IV[i];
for (uint b = 0; b < blocks; b++) {
uint off = b * 64;
uint block_len = min((uint)64, total_len - off);
uint words[16];
for (uint i=0;i<16;i++) words[i] = 0;
for (uint i=0;i<block_len;i++) {
uchar by = header_nonce_byte(header, header_len, nonce, off + i);
words[i / 4] |= ((uint)by) << ((i % 4) * 8);
}
uint flags = 0;
if (b == 0) flags |= B3_CHUNK_START;
bool last = (b + 1 == blocks);
if (last) flags |= B3_CHUNK_END;
if (!last) {
blake3_compress_inplace(cv, words, 0, 0, block_len, flags);
} else {
uint out_words[16];
blake3_compress_hash(cv, words, 0, 0, block_len, flags | B3_ROOT, out_words);
for (uint i=0;i<8;i++) wr_le32(&out[i*4], out_words[i]);
}
}
}
inline void sha256_digest_32(thread const uchar input[32], thread uchar out[32]) {
uint state[8];
for (uint i=0;i<8;i++) state[i] = SHA256_IV[i];
uchar block[64];
for (uint i=0;i<32;i++) block[i] = input[i];
block[32] = 0x80;
for (uint i=33;i<56;i++) block[i] = 0;
block[56] = 0;
block[57] = 0;
block[58] = 0;
block[59] = 0;
block[60] = 0;
block[61] = 0;
block[62] = 1;
block[63] = 0;
uint w[64];
for (uint i=0;i<16;i++) w[i]=rd_be32(&block[i*4]);
for (uint i=16;i<64;i++) {
uint s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
uint s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
uint a=state[0],b=state[1],c=state[2],d=state[3],e=state[4],f=state[5],g=state[6],h=state[7];
for (uint i=0;i<64;i++) {
uint S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
uint ch = (e & f) ^ ((~e) & g);
uint t1 = h + S1 + ch + SHA256_K[i] + w[i];
uint S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
uint maj = (a & b) ^ (a & c) ^ (b & c);
uint t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
state[0]+=a; state[1]+=b; state[2]+=c; state[3]+=d;
state[4]+=e; state[5]+=f; state[6]+=g; state[7]+=h;
for (uint i=0;i<8;i++) wr_be32(&out[i*4], state[i]);
}
inline void blake3_hash_32(thread const uchar input[32], thread uchar out[32]) {
uint cv[8];
for (uint i=0;i<8;i++) cv[i] = B3_IV[i];
uint block_words[16];
for (uint i=0;i<8;i++) {
block_words[i] =
(uint)input[i*4] |
((uint)input[i*4+1] << 8) |
((uint)input[i*4+2] << 16) |
((uint)input[i*4+3] << 24);
}
for (uint i=8;i<16;i++) block_words[i] = 0;
uint out_words[16];
blake3_compress_hash(cv, block_words, 0, 0, 32, B3_CHUNK_START | B3_CHUNK_END | B3_ROOT, out_words);
for (uint i=0;i<8;i++) wr_le32(&out[i*4], out_words[i]);
}
inline uint leading_zeros_u8(uchar b) {
uint n = 0;
for (uint bit = 0; bit < 8; bit++) {
if ((b & (uchar)(0x80 >> bit)) != 0) return n;
n++;
}
return 8;
}
inline bool meets_difficulty_bits(thread const uchar hash[32], uint difficulty) {
uint zero_bits = 0;
for (uint i=0; i<32; i++) {
uchar b = hash[i];
if (b == 0) {
zero_bits += 8;
} else {
zero_bits += leading_zeros_u8(b);
break;
}
if (zero_bits >= difficulty) return true;
}
return zero_bits >= difficulty;
}
inline uint compute_addr(thread const uchar state[32], uint round) {
const ulong C = 0x517cc1b727220a95ul;
ulong lo = rd_le64(&state[0]);
ulong hi = rd_le64(&state[8]);
ulong ru = (ulong)round;
ulong mixed = lo ^ hi ^ ((ru << 13) | (ru >> (64 - 13))) ^ (ru * C);
return (uint)((mixed & ADDRESS_MASK) * BLOCK_SIZE);
}
inline void uhash_round(
device uchar* chain_scratch,
thread uchar state[32],
uint initial_primitive,
uint r
) {
uint addr = compute_addr(state, r);
uchar block[64];
load_scratch_block(chain_scratch + addr, block);
uint primitive = (initial_primitive + r + 1) % 3;
uchar new_state[32];
if (primitive == 0) {
uchar lo[16]; uchar hi[16];
for(uint i=0;i<16;i++){ lo[i]=state[i]; hi[i]=state[16+i]; }
aesenc_round(lo, &block[0]);
aesenc_round(lo, &block[16]);
aesenc_round(lo, &block[32]);
aesenc_round(lo, &block[48]);
aesenc_round(hi, &block[32]);
aesenc_round(hi, &block[48]);
aesenc_round(hi, &block[0]);
aesenc_round(hi, &block[16]);
for(uint i=0;i<16;i++){ new_state[i]=lo[i]; new_state[16+i]=hi[i]; }
} else if (primitive == 1) {
sha256_compress(state, block, new_state);
} else {
blake3_compress(state, block, new_state);
}
for(uint i=0;i<32;i++) chain_scratch[addr+i] = new_state[i];
for(uint i=0;i<32;i++) state[i] = new_state[i];
}
kernel void uhash_kernel(
const device uchar* header [[buffer(0)]],
device uchar* hashes_out [[buffer(1)]],
device uchar* scratchpads [[buffer(2)]],
const device KernelParams& params [[buffer(3)]],
device atomic_uint* found_flag [[buffer(4)]],
device uchar* found_data [[buffer(5)]],
uint gid [[thread_position_in_grid]],
uint tid [[thread_index_in_threadgroup]]
) {
if (gid >= params.lanes) return;
device uchar* lane_scratch = scratchpads + (ulong)gid * (ulong)(CHAINS * SCRATCHPAD_SIZE);
uchar states[CHAINS][32];
ulong nonce_gid = params.start_nonce + (ulong)gid;
for (uint c = 0; c < CHAINS; c++) {
ulong modified_nonce = nonce_gid ^ ((ulong)c * GOLDEN_RATIO);
blake3_hash_header_nonce(header, params.header_len, modified_nonce, states[c]);
uchar key[16];
uchar st[16];
for(uint i=0;i<16;i++){ key[i]=states[c][i]; st[i]=states[c][16+i]; }
device uchar* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
for (uint b=0; b < BLOCKS_PER_SCRATCHPAD; b++) {
aes_expand_block(st, key);
uint off = b * BLOCK_SIZE;
uchar st2[16];
for(uint i=0;i<16;i++) st2[i]=st[i];
aes_expand_block(st2, key);
write_scratch_block(chain_scratch + off, st, st2);
}
}
for (uint c = 0; c < CHAINS; c++) {
device uchar* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
uint initial_primitive = (uint)((nonce_gid + (ulong)c) % 3ul);
if (FC_UNROLL_ROUNDS) {
uint r = 0;
for (; r + 1 < params.rounds; r += 2) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
uhash_round(chain_scratch, states[c], initial_primitive, r + 1);
}
if (r < params.rounds) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
}
} else {
for (uint r=0; r<params.rounds; r++) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
}
}
}
uchar combined[32];
for(uint i=0;i<32;i++) combined[i] = 0;
for(uint c=0;c<CHAINS;c++) for(uint i=0;i<32;i++) combined[i] ^= states[c][i];
uchar sha_out[32];
sha256_digest_32(combined, sha_out);
uchar final_out[32];
blake3_hash_32(sha_out, final_out);
if (!FC_PROOF_MODE && params.emit_hashes != 0u) {
device uchar* out = hashes_out + gid * 32;
for(uint i=0;i<32;i++) out[i] = final_out[i];
}
if (FC_PROOF_MODE && params.difficulty > 0u) {
threadgroup atomic_uint tg_found;
threadgroup uint tg_nonce_lo;
threadgroup uint tg_nonce_hi;
threadgroup uchar tg_hash[32];
if (tid == 0) {
atomic_store_explicit(&tg_found, 0u, memory_order_relaxed);
tg_nonce_lo = 0u;
tg_nonce_hi = 0u;
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (meets_difficulty_bits(final_out, params.difficulty)) {
uint expected_local = 0u;
if (atomic_compare_exchange_weak_explicit(
&tg_found, &expected_local, 1u, memory_order_relaxed, memory_order_relaxed)) {
tg_nonce_lo = (uint)(nonce_gid & 0xfffffffful);
tg_nonce_hi = (uint)(nonce_gid >> 32);
for (uint i=0; i<32; i++) tg_hash[i] = final_out[i];
}
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tid == 0) {
uint local_found = atomic_load_explicit(&tg_found, memory_order_relaxed);
if (local_found != 0u) {
uint expected = 0u;
if (atomic_compare_exchange_weak_explicit(
found_flag, &expected, 1u, memory_order_relaxed, memory_order_relaxed)) {
ulong winner_nonce = ((ulong)tg_nonce_hi << 32) | (ulong)tg_nonce_lo;
for (uint i=0; i<8; i++) {
found_data[i] = (uchar)((winner_nonce >> (i * 8)) & 0xfful);
}
for (uint i=0; i<32; i++) {
found_data[8 + i] = tg_hash[i];
}
}
}
}
}
}
"#;
/// `Solver` trait adapter for `MetalMiner`.
///
/// Every method forwards to the inherent `MetalMiner` method of the same name
/// and signature. Rust method resolution prefers inherent methods over trait
/// methods, so `self.method(..)` below calls the inherent implementation and
/// does not recurse back into this trait impl.
impl crate::solver::Solver for MetalMiner {
// Stable identifier used to select/report this backend.
fn backend_name(&self) -> &'static str {
"metal"
}
// Delegates to the inherent `recommended_lanes` (device-tuned lane count).
fn recommended_lanes(&mut self, requested: usize) -> usize {
self.recommended_lanes(requested)
}
// Delegates to the inherent `find_proof_batch`.
fn find_proof_batch(
&mut self,
header_without_nonce: &[u8],
start_nonce: u64,
lanes: usize,
difficulty: u32,
) -> anyhow::Result<crate::solver::ProofResult> {
self.find_proof_batch(header_without_nonce, start_nonce, lanes, difficulty)
}
// Delegates to the inherent `benchmark_hashes`.
fn benchmark_hashes(
&mut self,
header_without_nonce: &[u8],
start_nonce: u64,
lanes: usize,
) -> anyhow::Result<usize> {
self.benchmark_hashes(header_without_nonce, start_nonce, lanes)
}
}
#[cfg(test)]
mod tests {
    use super::MetalMiner;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    /// Reference hash: `UniversalHash(header || nonce_le)` on the CPU.
    fn cpu_hash(header: &[u8], nonce: u64) -> [u8; 32] {
        let input: Vec<u8> = header
            .iter()
            .copied()
            .chain(nonce.to_le_bytes())
            .collect();
        UniversalHash::new().hash(&input)
    }

    #[test]
    fn metal_hash_matches_cpu_for_single_nonce() {
        let mut miner = MetalMiner::new().expect("metal init");
        // 32-byte prefix + address + LE timestamp, built as one iterator chain.
        let header: Vec<u8> = [0xABu8; 32]
            .iter()
            .copied()
            .chain(b"bostrom1testaddress".iter().copied())
            .chain(1_700_000_000u64.to_le_bytes())
            .collect();
        let nonce = 42u64;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        assert_eq!(gpu_hashes[0], cpu_hash(&header, nonce));
    }

    #[test]
    fn metal_hash_matches_cpu_for_header_size_vectors() {
        let mut miner = MetalMiner::new().expect("metal init");
        // Lengths straddling SHA-256/BLAKE3 block boundaries (55/56, 63/64/65, 127/128).
        for header_len in [1usize, 55, 56, 63, 64, 65, 127, 128] {
            let header: Vec<u8> = (0..header_len)
                .map(|i| ((i * 17 + header_len) & 0xFF) as u8)
                .collect();
            let start_nonce = 1000u64 + header_len as u64;
            let lanes = 4usize;
            let gpu = miner
                .compute_batch_hashes(&header, start_nonce, lanes)
                .expect("gpu batch");
            assert_eq!(gpu.len(), lanes);
            for (lane, got) in gpu.iter().enumerate() {
                let nonce = start_nonce.wrapping_add(lane as u64);
                let want = cpu_hash(&header, nonce);
                assert_eq!(
                    *got, want,
                    "header_len={} lane={}",
                    header_len, lane
                );
            }
        }
    }

    #[test]
    fn metal_hash_matches_cpu_for_wrapping_nonce_window() {
        let mut miner = MetalMiner::new().expect("metal init");
        let header = b"bostrom1wrapping-nonce-window".to_vec();
        // A window that wraps past u64::MAX must still match the CPU lane-by-lane.
        let start_nonce = u64::MAX - 2;
        let lanes = 5usize;
        let gpu = miner
            .compute_batch_hashes(&header, start_nonce, lanes)
            .expect("gpu wrapping batch");
        assert_eq!(gpu.len(), lanes);
        for (lane, got) in gpu.iter().enumerate() {
            let nonce = start_nonce.wrapping_add(lane as u64);
            let want = cpu_hash(&header, nonce);
            assert_eq!(*got, want, "lane={} nonce={}", lane, nonce);
        }
    }

    #[test]
    #[ignore = "performance/stress profile for local metal throughput validation"]
    fn metal_release_throughput_profile() {
        let mut miner = MetalMiner::new().expect("metal init");
        let header: Vec<u8> = [0xCDu8; 32]
            .iter()
            .copied()
            .chain(b"bostrom1metalprofile".iter().copied())
            .chain(1_700_000_001u64.to_le_bytes())
            .collect();
        let mut nonce = 0u64;
        for lanes in [64usize, 128, 256, 512, 1024] {
            // One untimed warm-up batch so pipeline/allocation costs stay out of timing.
            let _ = miner
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            // Fewer repetitions for the largest lane set to keep runtime bounded.
            let batches_per_set: u64 = if lanes >= 1024 { 1 } else { 3 };
            let start = Instant::now();
            let mut computed = 0u64;
            for _ in 0..batches_per_set {
                let _ = miner
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 { computed as f64 / elapsed } else { 0.0 };
            eprintln!(
                "metal-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}