use anyhow::Result;
use cudarc::driver::{
CudaContext, CudaFunction, CudaSlice, CudaStream, LaunchConfig, PushKernelArg,
};
use serde::{Deserialize, Serialize};
use std::fs;
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Instant;
use uhash_core::meets_difficulty;
/// Independent hash chains per lane (mirrors `#define CHAINS 4u` in the embedded CUDA kernel).
const CHAINS: usize = 4;
/// Scratchpad bytes per chain (512 KiB; mirrors `SCRATCHPAD_SIZE` in the kernel).
const SCRATCHPAD_SIZE: usize = 512 * 1024;
/// Total device scratchpad bytes required per lane (all chains combined).
const TOTAL_MEMORY: usize = CHAINS * SCRATCHPAD_SIZE;
/// Memory-hard mixing rounds executed per chain per hash.
const ROUNDS: usize = 12_288;
/// Default number of per-dispatch buffer slots allocated on the device.
const DEFAULT_INFLIGHT_SLOTS: usize = 3;
/// Launch parameters uploaded to device memory and read by `uhash_kernel`.
///
/// `#[repr(C)]` guarantees the field layout matches the `struct KernelParams`
/// declared inside `CUDA_KERNEL_SOURCE`; keep both definitions in sync.
#[repr(C)]
#[derive(Clone, Copy, Debug)]
struct KernelParams {
    // Number of lanes (nonces) this launch hashes.
    lanes: u32,
    // Mixing rounds per chain (set to ROUNDS).
    rounds: u32,
    // Length in bytes of the uploaded header (nonce is appended on device).
    header_len: u32,
    // Explicit padding so `start_nonce` stays 8-byte aligned in C layout.
    _pad0: u32,
    // First nonce of the batch; lane `gid` hashes `start_nonce + gid`.
    start_nonce: u64,
}
/// Snapshot of device properties and tuning state, serialized for reporting.
#[derive(Clone, Debug, Serialize)]
pub struct CudaTelemetry {
    // Human-readable device name as reported by the driver.
    pub device_name: String,
    // (major, minor) compute capability.
    pub compute_capability: (u32, u32),
    // Total device memory in bytes.
    pub total_memory_bytes: u64,
    pub max_threads_per_block: u32,
    pub multiprocessor_count: u32,
    pub warp_size: u32,
    // Buffer slots cycled over during batch dispatch.
    pub inflight_slots: usize,
    // Lanes per kernel launch chosen by tuning (0 = not tuned yet).
    pub tuned_chunk_lanes: usize,
    // Memory-derived upper bound on lanes per launch.
    pub max_chunk_lanes: usize,
    // True once the one-shot runtime profiling pass has run.
    pub runtime_profiled: bool,
}
/// On-disk JSON schema for persisted tuning results.
///
/// Field names are part of the cache file format — do not rename them.
#[derive(Debug, Serialize, Deserialize)]
struct CudaTuneCache {
    // Schema version; loaders reject anything other than 1.
    version: u32,
    device_name: String,
    compute_capability: (u32, u32),
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    block_size: u32,
}
/// CUDA-backed uhash miner: owns the device context, the NVRTC-compiled
/// kernel, and all per-slot device buffers.
pub struct CudaMiner {
    // Kept alive so the device context outlives the stream/kernel handles.
    #[allow(dead_code)] ctx: Arc<CudaContext>,
    stream: Arc<CudaStream>,
    kernel: CudaFunction,
    // Per-inflight-slot device buffers (one entry per slot).
    d_params: Vec<CudaSlice<u8>>,
    d_hashes: Vec<CudaSlice<u8>>,
    d_scratchpads: Vec<CudaSlice<u8>>,
    // Header bytes buffer; grown on demand, None until first allocation.
    d_header: Option<CudaSlice<u8>>,
    // Current capacity of `d_header` in bytes.
    header_capacity: usize,
    // Lane count the slot buffers are currently sized for (0 = needs rebuild).
    chunk_lanes: usize,
    // Tuning state: 0 means "not tuned yet".
    tuned_chunk_lanes: usize,
    max_chunk_lanes: usize,
    inflight_slots: usize,
    // Threads per block used for kernel launches.
    block_size: u32,
    // Guard so runtime profiling runs at most once per process.
    did_runtime_profile: bool,
    // Cached device attributes queried at construction.
    device_name: String,
    compute_capability: (u32, u32),
    total_memory: u64,
    max_threads_per_block: u32,
    multiprocessor_count: u32,
    warp_size: u32,
}
impl CudaMiner {
/// Create a miner bound to CUDA device 0.
///
/// Compiles the embedded kernel with NVRTC for the device's exact `sm_XY`
/// architecture, loads the `uhash_kernel` entry point, queries the device
/// attributes used by the tuner, and finally tries to restore persisted
/// tuning parameters from disk (best-effort).
///
/// # Errors
/// Fails if no CUDA device is present, or if NVRTC compilation, module
/// loading, or any device/attribute query fails.
pub fn new() -> Result<Self> {
    let ctx =
        CudaContext::new(0).map_err(|e| anyhow::anyhow!("CUDA device init failed: {}", e))?;
    let device_name = ctx
        .name()
        .map_err(|e| anyhow::anyhow!("Failed to get device name: {}", e))?;
    let (cc_major, cc_minor) = ctx
        .compute_capability()
        .map_err(|e| anyhow::anyhow!("Failed to get compute capability: {}", e))?;
    // CompileOptions.arch requires a &'static str, so the formatted arch
    // string is leaked. NOTE(review): this leaks a few bytes per `new()`
    // call — fine if one miner is created per process; confirm callers.
    let arch_str: String = format!("sm_{}{}", cc_major, cc_minor);
    let arch_static: &'static str = Box::leak(arch_str.into_boxed_str());
    let compile_opts = cudarc::nvrtc::CompileOptions {
        arch: Some(arch_static),
        fmad: Some(true),
        ..Default::default()
    };
    let ptx = cudarc::nvrtc::compile_ptx_with_opts(CUDA_KERNEL_SOURCE, compile_opts)
        .map_err(|e| anyhow::anyhow!("NVRTC compilation failed: {}", e))?;
    let module = ctx
        .load_module(ptx)
        .map_err(|e| anyhow::anyhow!("Failed to load PTX module: {}", e))?;
    let kernel = module
        .load_function("uhash_kernel")
        .map_err(|e| anyhow::anyhow!("Failed to load uhash_kernel: {}", e))?;
    // SAFETY (per cudarc API): cu_device() is a valid handle owned by `ctx`,
    // which is alive for the duration of this call.
    let total_memory = unsafe {
        cudarc::driver::result::device::total_mem(ctx.cu_device())
            .map_err(|e| anyhow::anyhow!("Failed to get total memory: {}", e))?
    } as u64;
    let max_threads_per_block =
        ctx.attribute(
            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
        )
        .map_err(|e| anyhow::anyhow!("Failed to get max threads: {}", e))? as u32;
    let multiprocessor_count =
        ctx.attribute(
            cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
        )
        .map_err(|e| anyhow::anyhow!("Failed to get SM count: {}", e))? as u32;
    let warp_size =
        ctx.attribute(cudarc::driver::sys::CUdevice_attribute::CU_DEVICE_ATTRIBUTE_WARP_SIZE)
            .map_err(|e| anyhow::anyhow!("Failed to get warp size: {}", e))? as u32;
    let stream = ctx.default_stream();
    let mut miner = Self {
        ctx,
        stream,
        kernel,
        d_params: Vec::new(),
        d_hashes: Vec::new(),
        d_scratchpads: Vec::new(),
        d_header: None,
        header_capacity: 0,
        chunk_lanes: 0,
        // Zero sentinels mean "not tuned yet"; filled lazily or from cache.
        tuned_chunk_lanes: 0,
        max_chunk_lanes: 0,
        inflight_slots: DEFAULT_INFLIGHT_SLOTS,
        block_size: 256,
        did_runtime_profile: false,
        device_name,
        compute_capability: (cc_major as u32, cc_minor as u32),
        total_memory,
        max_threads_per_block,
        multiprocessor_count,
        warp_size,
    };
    // Best-effort: keeps the defaults above when no valid cache exists.
    miner.try_load_tune_cache();
    Ok(miner)
}
/// Suggest a lane count for a batch.
///
/// With `requested == 0`, returns the tuned chunk size multiplied by the
/// number of inflight slots (at least 1); otherwise honors the caller's
/// request, floored at 1. Lazily seeds the tuning limits from the memory
/// heuristic on first use.
pub fn recommended_lanes(&mut self, requested: usize) -> usize {
    if self.tuned_chunk_lanes == 0 {
        let (tuned, cap) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = tuned;
        self.max_chunk_lanes = cap;
    }
    match requested {
        0 => self
            .tuned_chunk_lanes
            .saturating_mul(self.inflight_slots)
            .max(1),
        explicit => explicit.max(1),
    }
}
/// Produce a reporting snapshot of device properties and tuning state.
pub fn telemetry(&self) -> CudaTelemetry {
    let device_name = self.device_name.clone();
    CudaTelemetry {
        device_name,
        compute_capability: self.compute_capability,
        total_memory_bytes: self.total_memory,
        max_threads_per_block: self.max_threads_per_block,
        multiprocessor_count: self.multiprocessor_count,
        warp_size: self.warp_size,
        runtime_profiled: self.did_runtime_profile,
        inflight_slots: self.inflight_slots,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
    }
}
/// Hash `lanes` nonces and report how many hashes were produced.
///
/// Thin benchmarking wrapper over [`Self::compute_batch_hashes`]; the hash
/// values themselves are discarded.
pub fn benchmark_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<usize> {
    let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
    Ok(hashes.len())
}
/// Compute the per-device tuning cache file path.
///
/// Resolves `$XDG_CONFIG_HOME` (falling back to `$HOME/.config`), then
/// appends `uhash/cuda_tuning_<sanitized-name>_<cc>.json`. Returns `None`
/// when neither environment variable is set.
fn tune_cache_path(&self) -> Option<PathBuf> {
    let config_dir = match std::env::var_os("XDG_CONFIG_HOME") {
        Some(dir) => PathBuf::from(dir),
        None => PathBuf::from(std::env::var_os("HOME")?).join(".config"),
    };
    let (maj, min) = self.compute_capability;
    // Sanitize the device name so it is safe to embed in a file name.
    let mut safe_name = String::with_capacity(self.device_name.len());
    for ch in self.device_name.chars() {
        safe_name.push(if ch.is_alphanumeric() { ch } else { '_' });
    }
    let file_name = format!("cuda_tuning_{}_{}{}.json", safe_name, maj, min);
    Some(config_dir.join("uhash").join(file_name))
}
/// Best-effort restore of tuning parameters from the on-disk cache.
///
/// Silently returns (keeping current defaults) on any missing file, parse
/// failure, or validation failure. All cached values are re-validated
/// against the current device before being adopted.
fn try_load_tune_cache(&mut self) {
    let Some(path) = self.tune_cache_path() else {
        return;
    };
    let Ok(raw) = fs::read(&path) else {
        return;
    };
    let Ok(cache) = serde_json::from_slice::<CudaTuneCache>(&raw) else {
        return;
    };
    // Reject caches from other schema versions or other hardware. The device
    // name is checked as well: sanitized names of different GPUs can collide
    // in the cache file path, and compute capability alone is not unique.
    if cache.version != 1
        || cache.compute_capability != self.compute_capability
        || cache.device_name != self.device_name
    {
        return;
    }
    // Clamp cached lane counts against the current memory heuristic so a
    // stale or hand-edited cache cannot over-commit device memory.
    let (_heuristic_chunk, heuristic_max) = self.autotune_chunk_limits();
    let cap_max = heuristic_max.max(32);
    let chunk = cache.tuned_chunk_lanes.clamp(32, cap_max);
    let max_chunk = cache.max_chunk_lanes.clamp(chunk, cap_max);
    // Lane counts must stay warp-aligned (multiples of 32).
    if chunk % 32 != 0 || max_chunk % 32 != 0 {
        return;
    }
    self.tuned_chunk_lanes = chunk;
    self.max_chunk_lanes = max_chunk;
    self.inflight_slots = cache.inflight_slots.clamp(2, 6);
    // Fix: also cap the block size at the device limit — a cached value
    // above max_threads_per_block would make every kernel launch fail.
    self.block_size = cache
        .block_size
        .clamp(64, 1024)
        .min(self.max_threads_per_block);
}
/// Best-effort write of the current tuning parameters to the cache file.
///
/// Every failure (no resolvable path, serialization, directory creation,
/// write) is deliberately swallowed: persistence is an optimization, never
/// a requirement.
fn persist_tune_cache(&self) {
    let path = match self.tune_cache_path() {
        Some(p) => p,
        None => return,
    };
    let cache = CudaTuneCache {
        version: 1,
        device_name: self.device_name.clone(),
        compute_capability: self.compute_capability,
        tuned_chunk_lanes: self.tuned_chunk_lanes,
        max_chunk_lanes: self.max_chunk_lanes,
        inflight_slots: self.inflight_slots,
        block_size: self.block_size,
    };
    if let Ok(serialized) = serde_json::to_vec_pretty(&cache) {
        if let Some(dir) = path.parent() {
            let _ = fs::create_dir_all(dir);
        }
        let _ = fs::write(path, serialized);
    }
}
/// Derive (tuned_chunk_lanes, max_chunk_lanes) from device memory alone.
///
/// Budgets 70% of total device memory across all inflight slots, where each
/// lane needs its scratchpads plus a 32-byte hash slot. Both results are
/// rounded down to a multiple of 32 (never below 32), and the tuned value
/// is additionally capped at 1024 lanes.
fn autotune_chunk_limits(&self) -> (usize, usize) {
    let budget = (self.total_memory as usize).saturating_mul(7) / 10;
    let bytes_per_lane = TOTAL_MEMORY + 32;
    let lanes_by_memory = budget
        .saturating_div(bytes_per_lane.saturating_mul(self.inflight_slots))
        .max(1);
    let cap = lanes_by_memory.clamp(32, 8192);
    let tuned_rounded = (cap.min(1024) / 32).max(1) * 32;
    let cap_rounded = (cap / 32).max(1) * 32;
    (tuned_rounded, cap_rounded)
}
/// Round `v` down to the nearest multiple of `m`, but never below `m` itself.
fn round_to_multiple(v: usize, m: usize) -> usize {
    let multiples = (v / m).max(1);
    multiples * m
}
/// Benchmark a fixed set of candidate block sizes and keep the fastest.
///
/// For each candidate, the first dispatch is an untimed warm-up (covers
/// reallocation and first-launch overhead) and only the second dispatch is
/// timed. Candidates above the device's thread-per-block limit are skipped.
/// On exit, `chunk_lanes` is zeroed so the next dispatch rebuilds buffers.
fn runtime_profile_block_size(
    &mut self,
    header_without_nonce: &[u8],
    chunk_lanes: usize,
) -> Result<()> {
    let candidates = [128u32, 256, 512];
    let current = self.block_size;
    // Fall back to the current block size if no candidate can be measured.
    let mut best_bs = current;
    let mut best_hps = 0.0f64;
    for &bs in &candidates {
        if bs > self.max_threads_per_block {
            continue;
        }
        self.block_size = bs;
        self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
        // Warm-up dispatch (untimed).
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, chunk_lanes, header_without_nonce.len(), 1)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = chunk_lanes as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_bs = bs;
        }
    }
    self.block_size = best_bs;
    // Invalidate buffers so ensure_resources() rebuilds them on next use.
    self.chunk_lanes = 0;
    Ok(())
}
/// One-shot runtime profile: measure several chunk-lane candidates, keep the
/// fastest, then profile the block size, and persist the combined result.
///
/// Guarded by `did_runtime_profile` so it runs at most once per process.
fn runtime_profile_chunk_lanes(&mut self, header_without_nonce: &[u8]) -> Result<()> {
    if self.did_runtime_profile {
        return Ok(());
    }
    // Set the guard up front so a failing dispatch below cannot re-trigger
    // profiling on every subsequent batch.
    self.did_runtime_profile = true;
    let base = self.tuned_chunk_lanes.max(32);
    let max_chunk = self.max_chunk_lanes.max(base);
    // Candidates: 1x/2x/3x the heuristic chunk (warp-rounded, capped), plus
    // the memory-derived maximum itself.
    let mut candidates = vec![
        base,
        Self::round_to_multiple(base.saturating_mul(2), 32).min(max_chunk),
        Self::round_to_multiple(base.saturating_mul(3), 32).min(max_chunk),
        max_chunk,
    ];
    candidates.sort_unstable();
    candidates.dedup();
    let mut best_lane = base;
    let mut best_hps = 0.0f64;
    for &lane in &candidates {
        if lane == 0 {
            continue;
        }
        self.ensure_resources(header_without_nonce.len(), lane)?;
        // Warm-up dispatch (untimed), then one timed dispatch.
        self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), 0)?;
        let start = Instant::now();
        self.dispatch_chunk_sync(0, lane, header_without_nonce.len(), lane as u64)?;
        let elapsed = start.elapsed().as_secs_f64();
        if elapsed <= 0.0 {
            continue;
        }
        let hps = lane as f64 / elapsed;
        if hps > best_hps {
            best_hps = hps;
            best_lane = lane;
        }
    }
    self.tuned_chunk_lanes = best_lane.max(32);
    self.runtime_profile_block_size(header_without_nonce, self.tuned_chunk_lanes)?;
    // Force buffer reallocation at the final tuned size on next dispatch.
    self.chunk_lanes = 0;
    self.persist_tune_cache();
    Ok(())
}
/// Ensure device buffers exist for `chunk_lanes` lanes in each inflight
/// slot, plus a header buffer of at least `header_len` bytes.
///
/// The header buffer grows to the next power of two and is never shrunk;
/// growth discards the previous contents, so callers must re-upload the
/// header afterwards. Per-slot buffers are rebuilt whenever the lane count
/// or slot count differs from the current allocation.
fn ensure_resources(&mut self, header_len: usize, chunk_lanes: usize) -> Result<()> {
    if header_len > self.header_capacity {
        let new_capacity = header_len.next_power_of_two();
        // Replacing the Option drops the old buffer (if any).
        self.d_header = Some(
            self.stream
                .alloc_zeros::<u8>(new_capacity)
                .map_err(|e| anyhow::anyhow!("Failed to alloc header buffer: {}", e))?,
        );
        self.header_capacity = new_capacity;
    }
    // Fast path: everything is already sized correctly.
    if self.chunk_lanes == chunk_lanes
        && self.d_params.len() == self.inflight_slots
        && self.d_hashes.len() == self.inflight_slots
        && self.d_scratchpads.len() == self.inflight_slots
    {
        return Ok(());
    }
    let hash_bytes = chunk_lanes.saturating_mul(32); // one 32-byte digest per lane
    let scratch_bytes = chunk_lanes.saturating_mul(TOTAL_MEMORY); // 2 MiB per lane
    if hash_bytes == 0 || scratch_bytes == 0 {
        anyhow::bail!("invalid chunk_lanes={}", chunk_lanes);
    }
    // Free the old buffers before allocating replacements.
    self.d_params.clear();
    self.d_hashes.clear();
    self.d_scratchpads.clear();
    for _ in 0..self.inflight_slots {
        self.d_params.push(
            self.stream
                .alloc_zeros::<u8>(std::mem::size_of::<KernelParams>())
                .map_err(|e| anyhow::anyhow!("Failed to alloc params: {}", e))?,
        );
        self.d_hashes.push(
            self.stream
                .alloc_zeros::<u8>(hash_bytes)
                .map_err(|e| anyhow::anyhow!("Failed to alloc hashes: {}", e))?,
        );
        self.d_scratchpads.push(
            self.stream
                .alloc_zeros::<u8>(scratch_bytes)
                .map_err(|e| anyhow::anyhow!("Failed to alloc scratchpads: {}", e))?,
        );
    }
    self.chunk_lanes = chunk_lanes;
    Ok(())
}
/// Upload `KernelParams`, launch one kernel over `chunk_lanes` lanes using
/// the buffers of `slot`, synchronize the stream, and copy the resulting
/// 32-byte hashes back to the host.
///
/// Blocking: returns only after the stream drains, so the slot's buffers
/// are safe to reuse immediately after this call.
///
/// # Errors
/// Fails if the slot's buffers are missing, or on any upload, launch,
/// synchronize, or download error.
fn dispatch_chunk_sync(
    &mut self,
    slot: usize,
    chunk_lanes: usize,
    header_len: usize,
    start_nonce: u64,
) -> Result<Vec<u8>> {
    let params = KernelParams {
        lanes: chunk_lanes as u32,
        rounds: ROUNDS as u32,
        header_len: header_len as u32,
        _pad0: 0,
        start_nonce,
    };
    // Fix: the reference sigils here were mangled into `¶ms` (HTML-entity
    // corruption of `&params`); restored to valid Rust.
    // SAFETY: KernelParams is #[repr(C)] and built solely from plain integer
    // fields with explicit padding (`_pad0`), so viewing the value as a byte
    // slice of exactly size_of::<KernelParams>() is sound.
    let params_bytes: Vec<u8> = unsafe {
        std::slice::from_raw_parts(
            &params as *const KernelParams as *const u8,
            std::mem::size_of::<KernelParams>(),
        )
        .to_vec()
    };
    let d_params = self
        .d_params
        .get_mut(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    self.stream
        .memcpy_htod(&params_bytes, d_params)
        .map_err(|e| anyhow::anyhow!("Failed to upload params: {}", e))?;
    // One thread per lane, rounded up to whole blocks.
    let grid_dim = ((chunk_lanes as u32).div_ceil(self.block_size), 1, 1);
    let block_dim = (self.block_size, 1, 1);
    let cfg = LaunchConfig {
        grid_dim,
        block_dim,
        shared_mem_bytes: 0,
    };
    let d_header = self
        .d_header
        .as_ref()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    let d_params = self
        .d_params
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("params slot {} missing", slot))?;
    let d_hashes = self
        .d_hashes
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
    let d_scratch = self
        .d_scratchpads
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("scratchpad slot {} missing", slot))?;
    // SAFETY: argument order and types match the `uhash_kernel` signature in
    // CUDA_KERNEL_SOURCE (header, hashes_out, scratchpads, params_raw), and
    // all buffers are sized by ensure_resources() for `chunk_lanes` lanes.
    unsafe {
        self.stream
            .launch_builder(&self.kernel)
            .arg(d_header)
            .arg(d_hashes)
            .arg(d_scratch)
            .arg(d_params)
            .launch(cfg)
            .map_err(|e| anyhow::anyhow!("Kernel launch failed: {}", e))?;
    }
    self.stream
        .synchronize()
        .map_err(|e| anyhow::anyhow!("CUDA synchronize failed: {}", e))?;
    // The device buffer may be larger than this dispatch (e.g. a final
    // partial chunk); keep only the hashes actually produced.
    let hash_bytes = chunk_lanes * 32;
    let d_hashes = self
        .d_hashes
        .get(slot)
        .ok_or_else(|| anyhow::anyhow!("hashes slot {} missing", slot))?;
    let out: Vec<u8> = self
        .stream
        .memcpy_dtov(d_hashes)
        .map_err(|e| anyhow::anyhow!("Failed to copy hashes from device: {}", e))?;
    Ok(out[..hash_bytes].to_vec())
}
/// Hash `lanes` consecutive nonces starting at `start_nonce`, returning one
/// 32-byte digest per lane in nonce order.
///
/// Runs heuristic tuning (and, once per process, runtime profiling) lazily
/// on first use, then splits the work into chunks of at most the tuned size.
/// Dispatches are fully synchronous, so cycling `slot` over the inflight
/// buffers is currently round-robin reuse rather than true overlap.
///
/// # Errors
/// Fails on an empty header or on any device error from the dispatch path.
fn compute_batch_hashes(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
) -> Result<Vec<[u8; 32]>> {
    if lanes == 0 {
        return Ok(Vec::new());
    }
    if header_without_nonce.is_empty() {
        anyhow::bail!("header must not be empty");
    }
    // Seed tuning limits from the memory heuristic if not yet set.
    if self.tuned_chunk_lanes == 0 || self.max_chunk_lanes == 0 {
        let (chunk, max_chunk) = self.autotune_chunk_limits();
        self.tuned_chunk_lanes = chunk;
        self.max_chunk_lanes = max_chunk;
    }
    // One-shot runtime profiling refines the heuristic values.
    if !self.did_runtime_profile {
        self.runtime_profile_chunk_lanes(header_without_nonce)?;
    }
    let chunk_lanes = self
        .tuned_chunk_lanes
        .min(self.max_chunk_lanes)
        .min(lanes)
        .max(1);
    self.ensure_resources(header_without_nonce.len(), chunk_lanes)?;
    // Upload the header once; every chunk of this batch reuses it.
    let d_header = self
        .d_header
        .as_mut()
        .ok_or_else(|| anyhow::anyhow!("header buffer not allocated"))?;
    self.stream
        .memcpy_htod(header_without_nonce, d_header)
        .map_err(|e| anyhow::anyhow!("Failed to upload header: {}", e))?;
    let mut out_hashes = vec![[0u8; 32]; lanes];
    let mut lane_offset = 0usize;
    while lane_offset < lanes {
        // Final chunk may be smaller than chunk_lanes.
        let this_lanes = (lanes - lane_offset).min(chunk_lanes);
        let slot = (lane_offset / chunk_lanes) % self.inflight_slots;
        let this_start_nonce = start_nonce.saturating_add(lane_offset as u64);
        let raw = self.dispatch_chunk_sync(
            slot,
            this_lanes,
            header_without_nonce.len(),
            this_start_nonce,
        )?;
        // Copy the flat byte buffer into fixed-size output digests.
        for i in 0..this_lanes {
            let src_offset = i * 32;
            out_hashes[lane_offset + i].copy_from_slice(&raw[src_offset..src_offset + 32]);
        }
        lane_offset += this_lanes;
    }
    Ok(out_hashes)
}
/// Hash a batch of `lanes` nonces and return the first (lowest-nonce) one
/// whose digest meets `difficulty`, together with that digest, or `None`
/// when no nonce in the batch qualifies.
pub fn find_proof_batch(
    &mut self,
    header_without_nonce: &[u8],
    start_nonce: u64,
    lanes: usize,
    difficulty: u32,
) -> Result<Option<(u64, [u8; 32])>> {
    let hashes = self.compute_batch_hashes(header_without_nonce, start_nonce, lanes)?;
    let winner = hashes
        .into_iter()
        .enumerate()
        .find(|(_, hash)| meets_difficulty(hash, difficulty))
        .map(|(lane, hash)| (start_nonce + lane as u64, hash));
    Ok(winner)
}
}
/// `Solver` trait adapter: every method forwards to the inherent
/// `CudaMiner` method of the same name.
impl crate::solver::Solver for CudaMiner {
    fn backend_name(&self) -> &'static str {
        "cuda"
    }
    fn recommended_lanes(&mut self, requested: usize) -> usize {
        self.recommended_lanes(requested)
    }
    fn find_proof_batch(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
        difficulty: u32,
    ) -> Result<crate::solver::ProofResult> {
        self.find_proof_batch(header_without_nonce, start_nonce, lanes, difficulty)
    }
    fn benchmark_hashes(
        &mut self,
        header_without_nonce: &[u8],
        start_nonce: u64,
        lanes: usize,
    ) -> Result<usize> {
        self.benchmark_hashes(header_without_nonce, start_nonce, lanes)
    }
}
/// CUDA C source of the mining kernel, JIT-compiled at runtime via NVRTC.
///
/// One thread hashes one nonce: per-chain BLAKE3 seeding from header+nonce,
/// AES-based scratchpad expansion, `rounds` mixing iterations alternating
/// AES / SHA-256 / BLAKE3 compression, then SHA-256 + BLAKE3 of the XORed
/// chain states. The unit tests below assert it matches the CPU hasher.
/// `struct KernelParams` here must stay layout-identical to the Rust one.
const CUDA_KERNEL_SOURCE: &str = r#"
extern "C" {
#define CHAINS 4u
#define SCRATCHPAD_SIZE 524288u
#define BLOCK_SIZE_BYTES 64u
#define BLOCKS_PER_SCRATCHPAD 8192u
#define ADDRESS_MASK 8191u
#define GOLDEN_RATIO 0x9E3779B97F4A7C15ull
#define B3_CHUNK_START 1u
#define B3_CHUNK_END 2u
#define B3_ROOT 8u
struct KernelParams {
unsigned int lanes;
unsigned int rounds;
unsigned int header_len;
unsigned int _pad0;
unsigned long long start_nonce;
};
__constant__ unsigned char SBOX[256] = {
0x63,0x7c,0x77,0x7b,0xf2,0x6b,0x6f,0xc5,0x30,0x01,0x67,0x2b,0xfe,0xd7,0xab,0x76,
0xca,0x82,0xc9,0x7d,0xfa,0x59,0x47,0xf0,0xad,0xd4,0xa2,0xaf,0x9c,0xa4,0x72,0xc0,
0xb7,0xfd,0x93,0x26,0x36,0x3f,0xf7,0xcc,0x34,0xa5,0xe5,0xf1,0x71,0xd8,0x31,0x15,
0x04,0xc7,0x23,0xc3,0x18,0x96,0x05,0x9a,0x07,0x12,0x80,0xe2,0xeb,0x27,0xb2,0x75,
0x09,0x83,0x2c,0x1a,0x1b,0x6e,0x5a,0xa0,0x52,0x3b,0xd6,0xb3,0x29,0xe3,0x2f,0x84,
0x53,0xd1,0x00,0xed,0x20,0xfc,0xb1,0x5b,0x6a,0xcb,0xbe,0x39,0x4a,0x4c,0x58,0xcf,
0xd0,0xef,0xaa,0xfb,0x43,0x4d,0x33,0x85,0x45,0xf9,0x02,0x7f,0x50,0x3c,0x9f,0xa8,
0x51,0xa3,0x40,0x8f,0x92,0x9d,0x38,0xf5,0xbc,0xb6,0xda,0x21,0x10,0xff,0xf3,0xd2,
0xcd,0x0c,0x13,0xec,0x5f,0x97,0x44,0x17,0xc4,0xa7,0x7e,0x3d,0x64,0x5d,0x19,0x73,
0x60,0x81,0x4f,0xdc,0x22,0x2a,0x90,0x88,0x46,0xee,0xb8,0x14,0xde,0x5e,0x0b,0xdb,
0xe0,0x32,0x3a,0x0a,0x49,0x06,0x24,0x5c,0xc2,0xd3,0xac,0x62,0x91,0x95,0xe4,0x79,
0xe7,0xc8,0x37,0x6d,0x8d,0xd5,0x4e,0xa9,0x6c,0x56,0xf4,0xea,0x65,0x7a,0xae,0x08,
0xba,0x78,0x25,0x2e,0x1c,0xa6,0xb4,0xc6,0xe8,0xdd,0x74,0x1f,0x4b,0xbd,0x8b,0x8a,
0x70,0x3e,0xb5,0x66,0x48,0x03,0xf6,0x0e,0x61,0x35,0x57,0xb9,0x86,0xc1,0x1d,0x9e,
0xe1,0xf8,0x98,0x11,0x69,0xd9,0x8e,0x94,0x9b,0x1e,0x87,0xe9,0xce,0x55,0x28,0xdf,
0x8c,0xa1,0x89,0x0d,0xbf,0xe6,0x42,0x68,0x41,0x99,0x2d,0x0f,0xb0,0x54,0xbb,0x16
};
__constant__ unsigned int SHA256_K[64] = {
0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
};
__constant__ unsigned int SHA256_IV[8] = {
0x6a09e667,0xbb67ae85,0x3c6ef372,0xa54ff53a,0x510e527f,0x9b05688c,0x1f83d9ab,0x5be0cd19
};
__constant__ unsigned int B3_IV[8] = {
0x6A09E667,0xBB67AE85,0x3C6EF372,0xA54FF53A,0x510E527F,0x9B05688C,0x1F83D9AB,0x5BE0CD19
};
__constant__ unsigned short B3_S[7][16] = {
{0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15},
{2,6,3,10,7,0,4,13,1,11,12,5,9,14,15,8},
{3,4,10,12,13,2,7,14,6,5,9,0,11,15,8,1},
{10,7,12,9,14,3,13,15,4,0,11,2,5,8,1,6},
{12,13,9,11,15,10,14,8,7,2,5,3,0,1,6,4},
{9,14,11,5,8,12,15,1,13,3,0,10,2,6,4,7},
{11,15,5,0,1,9,8,6,14,10,2,12,3,4,7,13}
};
__device__ __forceinline__ unsigned char gf_mul2(unsigned char x) {
unsigned char hi = x >> 7;
return (unsigned char)((x << 1) ^ (hi * 0x1b));
}
__device__ __forceinline__ unsigned char gf_mul3(unsigned char x) {
return (unsigned char)(gf_mul2(x) ^ x);
}
__device__ __forceinline__ unsigned int rotr(unsigned int x, unsigned int n) {
return (x >> n) | (x << (32u - n));
}
__device__ __forceinline__ unsigned int rd_be32(const unsigned char* p) {
return ((unsigned int)p[0] << 24) | ((unsigned int)p[1] << 16) |
((unsigned int)p[2] << 8) | (unsigned int)p[3];
}
__device__ __forceinline__ unsigned long long rd_le64(const unsigned char* p) {
return (unsigned long long)p[0] |
((unsigned long long)p[1] << 8) |
((unsigned long long)p[2] << 16) |
((unsigned long long)p[3] << 24) |
((unsigned long long)p[4] << 32) |
((unsigned long long)p[5] << 40) |
((unsigned long long)p[6] << 48) |
((unsigned long long)p[7] << 56);
}
__device__ __forceinline__ void wr_be32(unsigned char* p, unsigned int v) {
p[0] = (unsigned char)(v >> 24);
p[1] = (unsigned char)(v >> 16);
p[2] = (unsigned char)(v >> 8);
p[3] = (unsigned char)(v);
}
__device__ __forceinline__ void wr_le32(unsigned char* p, unsigned int v) {
p[0] = (unsigned char)(v);
p[1] = (unsigned char)(v >> 8);
p[2] = (unsigned char)(v >> 16);
p[3] = (unsigned char)(v >> 24);
}
__device__ void aesenc_round(unsigned char state[16], const unsigned char rk[16]) {
unsigned char s[16];
for (unsigned int i = 0; i < 16; i++) s[i] = SBOX[state[i]];
unsigned char t[16];
for (unsigned int i = 0; i < 16; i++) t[i] = s[i];
s[1]=t[5]; s[5]=t[9]; s[9]=t[13]; s[13]=t[1];
s[2]=t[10]; s[6]=t[14]; s[10]=t[2]; s[14]=t[6];
s[3]=t[15]; s[7]=t[3]; s[11]=t[7]; s[15]=t[11];
unsigned char out[16];
for (unsigned int c = 0; c < 4; c++) {
unsigned int i = c * 4;
unsigned char a0=s[i], a1=s[i+1], a2=s[i+2], a3=s[i+3];
out[i] = (unsigned char)(gf_mul2(a0) ^ gf_mul3(a1) ^ a2 ^ a3);
out[i+1] = (unsigned char)(a0 ^ gf_mul2(a1) ^ gf_mul3(a2) ^ a3);
out[i+2] = (unsigned char)(a0 ^ a1 ^ gf_mul2(a2) ^ gf_mul3(a3));
out[i+3] = (unsigned char)(gf_mul3(a0) ^ a1 ^ a2 ^ gf_mul2(a3));
}
for (unsigned int i = 0; i < 16; i++) state[i] = (unsigned char)(out[i] ^ rk[i]);
}
__device__ void aes_expand_block(unsigned char state[16], const unsigned char key[16]) {
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
aesenc_round(state, key);
}
__device__ void sha256_compress(const unsigned char state_b[32], const unsigned char block[64], unsigned char out[32]) {
unsigned int s[8];
for (unsigned int i = 0; i < 8; i++) s[i] = rd_be32(&state_b[i*4]);
unsigned int w[64];
for (unsigned int i = 0; i < 16; i++) w[i] = rd_be32(&block[i*4]);
for (unsigned int i = 16; i < 64; i++) {
unsigned int s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
unsigned int s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
unsigned int a=s[0],b=s[1],c=s[2],d=s[3],e=s[4],f=s[5],g=s[6],h=s[7];
for (unsigned int i = 0; i < 64; i++) {
unsigned int S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
unsigned int ch = (e & f) ^ ((~e) & g);
unsigned int t1 = h + S1 + ch + SHA256_K[i] + w[i];
unsigned int S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
unsigned int t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
s[0]+=a; s[1]+=b; s[2]+=c; s[3]+=d; s[4]+=e; s[5]+=f; s[6]+=g; s[7]+=h;
for (unsigned int i = 0; i < 8; i++) wr_be32(&out[i*4], s[i]);
}
__device__ void blake3_g(unsigned int v[16], unsigned int a, unsigned int b, unsigned int c, unsigned int d, unsigned int mx, unsigned int my) {
v[a] = v[a] + v[b] + mx;
v[d] = rotr(v[d] ^ v[a], 16);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 12);
v[a] = v[a] + v[b] + my;
v[d] = rotr(v[d] ^ v[a], 8);
v[c] = v[c] + v[d];
v[b] = rotr(v[b] ^ v[c], 7);
}
__device__ void blake3_compress(const unsigned char state_b[32], const unsigned char block[64], unsigned char out[32]) {
unsigned int h[8];
unsigned int m[16];
for (unsigned int i = 0; i < 8; i++)
h[i] = (unsigned int)state_b[i*4] | ((unsigned int)state_b[i*4+1]<<8) |
((unsigned int)state_b[i*4+2]<<16) | ((unsigned int)state_b[i*4+3]<<24);
for (unsigned int i = 0; i < 16; i++)
m[i] = (unsigned int)block[i*4] | ((unsigned int)block[i*4+1]<<8) |
((unsigned int)block[i*4+2]<<16) | ((unsigned int)block[i*4+3]<<24);
unsigned int v[16];
for (unsigned int i = 0; i < 8; i++) { v[i] = h[i]; v[i+8] = B3_IV[i]; }
for (unsigned int r = 0; r < 7; r++) {
blake3_g(v,0,4,8,12,m[B3_S[r][0]],m[B3_S[r][1]]);
blake3_g(v,1,5,9,13,m[B3_S[r][2]],m[B3_S[r][3]]);
blake3_g(v,2,6,10,14,m[B3_S[r][4]],m[B3_S[r][5]]);
blake3_g(v,3,7,11,15,m[B3_S[r][6]],m[B3_S[r][7]]);
blake3_g(v,0,5,10,15,m[B3_S[r][8]],m[B3_S[r][9]]);
blake3_g(v,1,6,11,12,m[B3_S[r][10]],m[B3_S[r][11]]);
blake3_g(v,2,7,8,13,m[B3_S[r][12]],m[B3_S[r][13]]);
blake3_g(v,3,4,9,14,m[B3_S[r][14]],m[B3_S[r][15]]);
}
for (unsigned int i = 0; i < 8; i++) {
unsigned int x = v[i] ^ v[i+8];
wr_le32(&out[i*4], x);
}
}
__device__ void blake3_compress_hash(
const unsigned int cv[8],
const unsigned int block_words[16],
unsigned int counter_low,
unsigned int counter_high,
unsigned int block_len,
unsigned int flags,
unsigned int out[16]
) {
unsigned int v[16];
for (unsigned int i = 0; i < 8; i++) v[i] = cv[i];
v[8] = B3_IV[0]; v[9] = B3_IV[1]; v[10] = B3_IV[2]; v[11] = B3_IV[3];
v[12] = counter_low; v[13] = counter_high; v[14] = block_len; v[15] = flags;
for (unsigned int r = 0; r < 7; r++) {
blake3_g(v,0,4,8,12,block_words[B3_S[r][0]],block_words[B3_S[r][1]]);
blake3_g(v,1,5,9,13,block_words[B3_S[r][2]],block_words[B3_S[r][3]]);
blake3_g(v,2,6,10,14,block_words[B3_S[r][4]],block_words[B3_S[r][5]]);
blake3_g(v,3,7,11,15,block_words[B3_S[r][6]],block_words[B3_S[r][7]]);
blake3_g(v,0,5,10,15,block_words[B3_S[r][8]],block_words[B3_S[r][9]]);
blake3_g(v,1,6,11,12,block_words[B3_S[r][10]],block_words[B3_S[r][11]]);
blake3_g(v,2,7,8,13,block_words[B3_S[r][12]],block_words[B3_S[r][13]]);
blake3_g(v,3,4,9,14,block_words[B3_S[r][14]],block_words[B3_S[r][15]]);
}
for (unsigned int i = 0; i < 8; i++) {
out[i] = v[i] ^ v[i+8];
out[i+8] = v[i+8] ^ cv[i];
}
}
__device__ void blake3_compress_inplace(
unsigned int cv[8],
const unsigned int block_words[16],
unsigned int counter_low, unsigned int counter_high,
unsigned int block_len, unsigned int flags
) {
unsigned int out_words[16];
blake3_compress_hash(cv, block_words, counter_low, counter_high, block_len, flags, out_words);
for (unsigned int i = 0; i < 8; i++) cv[i] = out_words[i];
}
__device__ unsigned char header_nonce_byte(const unsigned char* header, unsigned int header_len, unsigned long long nonce, unsigned int idx) {
if (idx < header_len) return header[idx];
unsigned int j = idx - header_len;
if (j < 8) return (unsigned char)((nonce >> (j * 8)) & 0xffull);
return 0;
}
__device__ void blake3_hash_header_nonce(
const unsigned char* header, unsigned int header_len,
unsigned long long nonce, unsigned char out[32]
) {
unsigned int total_len = header_len + 8;
unsigned int blocks = (total_len + 63) / 64;
unsigned int cv[8];
for (unsigned int i = 0; i < 8; i++) cv[i] = B3_IV[i];
for (unsigned int b = 0; b < blocks; b++) {
unsigned int off = b * 64;
unsigned int blen = 64;
if (total_len - off < 64) blen = total_len - off;
unsigned int words[16];
for (unsigned int i = 0; i < 16; i++) words[i] = 0;
for (unsigned int i = 0; i < blen; i++) {
unsigned char by = header_nonce_byte(header, header_len, nonce, off + i);
words[i / 4] |= ((unsigned int)by) << ((i % 4) * 8);
}
unsigned int flags = 0;
if (b == 0) flags |= B3_CHUNK_START;
int last = (b + 1 == blocks) ? 1 : 0;
if (last) flags |= B3_CHUNK_END;
if (!last) {
blake3_compress_inplace(cv, words, 0, 0, blen, flags);
} else {
unsigned int out_words[16];
blake3_compress_hash(cv, words, 0, 0, blen, flags | B3_ROOT, out_words);
for (unsigned int i = 0; i < 8; i++) wr_le32(&out[i*4], out_words[i]);
}
}
}
__device__ void sha256_digest_32(const unsigned char input[32], unsigned char out[32]) {
unsigned int state[8];
for (unsigned int i = 0; i < 8; i++) state[i] = SHA256_IV[i];
unsigned char block[64];
for (unsigned int i = 0; i < 32; i++) block[i] = input[i];
block[32] = 0x80;
for (unsigned int i = 33; i < 56; i++) block[i] = 0;
block[56] = 0; block[57] = 0; block[58] = 0; block[59] = 0;
block[60] = 0; block[61] = 0; block[62] = 1; block[63] = 0;
unsigned int w[64];
for (unsigned int i = 0; i < 16; i++) w[i] = rd_be32(&block[i*4]);
for (unsigned int i = 16; i < 64; i++) {
unsigned int s0 = rotr(w[i-15],7) ^ rotr(w[i-15],18) ^ (w[i-15] >> 3);
unsigned int s1 = rotr(w[i-2],17) ^ rotr(w[i-2],19) ^ (w[i-2] >> 10);
w[i] = w[i-16] + s0 + w[i-7] + s1;
}
unsigned int a=state[0],b=state[1],c=state[2],d=state[3],e=state[4],f=state[5],g=state[6],h=state[7];
for (unsigned int i = 0; i < 64; i++) {
unsigned int S1 = rotr(e,6) ^ rotr(e,11) ^ rotr(e,25);
unsigned int ch = (e & f) ^ ((~e) & g);
unsigned int t1 = h + S1 + ch + SHA256_K[i] + w[i];
unsigned int S0 = rotr(a,2) ^ rotr(a,13) ^ rotr(a,22);
unsigned int maj = (a & b) ^ (a & c) ^ (b & c);
unsigned int t2 = S0 + maj;
h=g; g=f; f=e; e=d+t1; d=c; c=b; b=a; a=t1+t2;
}
state[0]+=a; state[1]+=b; state[2]+=c; state[3]+=d;
state[4]+=e; state[5]+=f; state[6]+=g; state[7]+=h;
for (unsigned int i = 0; i < 8; i++) wr_be32(&out[i*4], state[i]);
}
__device__ void blake3_hash_32(const unsigned char input[32], unsigned char out[32]) {
unsigned int cv[8];
for (unsigned int i = 0; i < 8; i++) cv[i] = B3_IV[i];
unsigned int block_words[16];
for (unsigned int i = 0; i < 8; i++) {
block_words[i] = (unsigned int)input[i*4] | ((unsigned int)input[i*4+1] << 8) |
((unsigned int)input[i*4+2] << 16) | ((unsigned int)input[i*4+3] << 24);
}
for (unsigned int i = 8; i < 16; i++) block_words[i] = 0;
unsigned int out_words[16];
blake3_compress_hash(cv, block_words, 0, 0, 32, B3_CHUNK_START | B3_CHUNK_END | B3_ROOT, out_words);
for (unsigned int i = 0; i < 8; i++) wr_le32(&out[i*4], out_words[i]);
}
__device__ unsigned int compute_addr(const unsigned char state[32], unsigned int round) {
const unsigned long long C = 0x517cc1b727220a95ull;
unsigned long long lo = rd_le64(&state[0]);
unsigned long long hi = rd_le64(&state[8]);
unsigned long long ru = (unsigned long long)round;
unsigned long long mixed = lo ^ hi ^ ((ru << 13) | (ru >> (64 - 13))) ^ (ru * C);
return (unsigned int)((mixed & ADDRESS_MASK) * BLOCK_SIZE_BYTES);
}
__device__ void uhash_round(
unsigned char* chain_scratch, unsigned char state[32],
unsigned int initial_primitive, unsigned int r
) {
unsigned int addr = compute_addr(state, r);
unsigned char block[64];
for (unsigned int i = 0; i < 64; i++) block[i] = chain_scratch[addr + i];
unsigned int primitive = (initial_primitive + r + 1) % 3;
unsigned char new_state[32];
if (primitive == 0) {
unsigned char lo[16]; unsigned char hi[16];
for (unsigned int i = 0; i < 16; i++) { lo[i] = state[i]; hi[i] = state[16+i]; }
aesenc_round(lo, &block[0]);
aesenc_round(lo, &block[16]);
aesenc_round(lo, &block[32]);
aesenc_round(lo, &block[48]);
aesenc_round(hi, &block[32]);
aesenc_round(hi, &block[48]);
aesenc_round(hi, &block[0]);
aesenc_round(hi, &block[16]);
for (unsigned int i = 0; i < 16; i++) { new_state[i] = lo[i]; new_state[16+i] = hi[i]; }
} else if (primitive == 1) {
sha256_compress(state, block, new_state);
} else {
blake3_compress(state, block, new_state);
}
for (unsigned int i = 0; i < 32; i++) chain_scratch[addr+i] = new_state[i];
for (unsigned int i = 0; i < 32; i++) state[i] = new_state[i];
}
__global__ void uhash_kernel(
const unsigned char* header,
unsigned char* hashes_out,
unsigned char* scratchpads,
const unsigned char* params_raw
) {
unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
const struct KernelParams* params = (const struct KernelParams*)params_raw;
if (gid >= params->lanes) return;
unsigned char* lane_scratch = scratchpads + (unsigned long long)gid * (unsigned long long)(CHAINS * SCRATCHPAD_SIZE);
unsigned char states[CHAINS][32];
unsigned long long nonce_gid = params->start_nonce + (unsigned long long)gid;
for (unsigned int c = 0; c < CHAINS; c++) {
unsigned long long modified_nonce = nonce_gid ^ ((unsigned long long)c * GOLDEN_RATIO);
blake3_hash_header_nonce(header, params->header_len, modified_nonce, states[c]);
unsigned char key[16]; unsigned char st[16];
for (unsigned int i = 0; i < 16; i++) { key[i] = states[c][i]; st[i] = states[c][16+i]; }
unsigned char* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
for (unsigned int b = 0; b < BLOCKS_PER_SCRATCHPAD; b++) {
aes_expand_block(st, key);
unsigned int off = b * BLOCK_SIZE_BYTES;
unsigned char st2[16];
for (unsigned int i = 0; i < 16; i++) st2[i] = st[i];
aes_expand_block(st2, key);
for (unsigned int i = 0; i < 16; i++) {
chain_scratch[off+i] = st[i];
chain_scratch[off+16+i] = st2[i];
chain_scratch[off+32+i] = st[i];
chain_scratch[off+48+i] = st2[i];
}
}
}
for (unsigned int c = 0; c < CHAINS; c++) {
unsigned char* chain_scratch = lane_scratch + c * SCRATCHPAD_SIZE;
unsigned int initial_primitive = (unsigned int)((nonce_gid + (unsigned long long)c) % 3ull);
#pragma unroll 2
for (unsigned int r = 0; r < params->rounds; r++) {
uhash_round(chain_scratch, states[c], initial_primitive, r);
}
}
unsigned char combined[32];
for (unsigned int i = 0; i < 32; i++) combined[i] = 0;
for (unsigned int c = 0; c < CHAINS; c++)
for (unsigned int i = 0; i < 32; i++) combined[i] ^= states[c][i];
unsigned char sha_out[32];
sha256_digest_32(combined, sha_out);
unsigned char final_out[32];
blake3_hash_32(sha_out, final_out);
unsigned char* out = hashes_out + gid * 32;
for (unsigned int i = 0; i < 32; i++) out[i] = final_out[i];
}
} // extern "C"
"#;
#[cfg(test)]
mod tests {
    use super::CudaMiner;
    use std::time::Instant;
    use uhash_core::UniversalHash;

    /// GPU hash of one nonce must equal the CPU reference hasher.
    /// Passes trivially (skips) when no CUDA device is available.
    #[test]
    fn cuda_hash_matches_cpu_for_single_nonce() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };
        // Header layout: 32-byte prev hash, address bytes, LE timestamp.
        let mut header = Vec::new();
        header.extend_from_slice(&[0xAB; 32]);
        header.extend_from_slice(b"bostrom1testaddress");
        header.extend_from_slice(&1_700_000_000u64.to_le_bytes());
        let nonce = 42u64;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, nonce, 1)
            .expect("gpu hash");
        assert_eq!(gpu_hashes.len(), 1);
        // CPU reference: header with the LE nonce appended.
        let mut input = header.clone();
        input.extend_from_slice(&nonce.to_le_bytes());
        let mut cpu = UniversalHash::new();
        let cpu_hash = cpu.hash(&input);
        assert_eq!(
            gpu_hashes[0],
            cpu_hash,
            "CUDA hash does not match CPU hash!\nGPU: {}\nCPU: {}",
            hex::encode(gpu_hashes[0]),
            hex::encode(cpu_hash)
        );
    }

    /// Every hash in a 64-nonce GPU batch must match the CPU hasher,
    /// exercising the chunking/nonce-offset path.
    #[test]
    fn cuda_hash_matches_cpu_for_multi_nonce() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xCD; 32]);
        header.extend_from_slice(b"bostrom1multinonce");
        header.extend_from_slice(&1_700_000_001u64.to_le_bytes());
        let start_nonce = 0u64;
        let count = 64usize;
        let gpu_hashes = miner
            .compute_batch_hashes(&header, start_nonce, count)
            .expect("gpu batch hash");
        assert_eq!(gpu_hashes.len(), count);
        for (i, gpu_hash) in gpu_hashes.iter().enumerate().take(count) {
            let nonce = start_nonce + i as u64;
            let mut input = header.clone();
            input.extend_from_slice(&nonce.to_le_bytes());
            let mut cpu = UniversalHash::new();
            let cpu_hash = cpu.hash(&input);
            assert_eq!(
                *gpu_hash,
                cpu_hash,
                "Mismatch at nonce {}: GPU={} CPU={}",
                nonce,
                hex::encode(gpu_hash),
                hex::encode(cpu_hash)
            );
        }
    }

    /// Manual throughput profile (ignored by default): prints H/s for a
    /// range of lane counts after a warm-up batch per size.
    #[test]
    #[ignore = "performance profile for local CUDA throughput validation"]
    fn cuda_release_throughput_profile() {
        let Ok(mut miner) = CudaMiner::new() else {
            eprintln!("Skipping: no CUDA GPU available");
            return;
        };
        let mut header = Vec::new();
        header.extend_from_slice(&[0xEF; 32]);
        header.extend_from_slice(b"bostrom1cudaprofile");
        header.extend_from_slice(&1_700_000_002u64.to_le_bytes());
        let lane_sets = [64usize, 128, 256, 512, 1024];
        let mut nonce = 0u64;
        for lanes in lane_sets {
            // Warm-up batch (untimed) before measuring.
            let _ = miner
                .compute_batch_hashes(&header, nonce, lanes)
                .expect("warmup batch");
            nonce = nonce.saturating_add(lanes as u64);
            let start = Instant::now();
            // Fewer timed batches for the largest lane count.
            let batches = if lanes >= 1024 { 1u64 } else { 3u64 };
            let mut computed = 0u64;
            for _ in 0..batches {
                let _ = miner
                    .compute_batch_hashes(&header, nonce, lanes)
                    .expect("profile batch");
                nonce = nonce.saturating_add(lanes as u64);
                computed += lanes as u64;
            }
            let elapsed = start.elapsed().as_secs_f64();
            let hps = if elapsed > 0.0 {
                computed as f64 / elapsed
            } else {
                0.0
            };
            eprintln!(
                "cuda-profile lanes={} hashes={} elapsed={:.3}s hashrate={:.2} H/s",
                lanes, computed, elapsed, hps
            );
        }
    }
}