use async_trait::async_trait;
use cudarc::driver::{CudaContext, CudaSlice, CudaStream, LaunchConfig, PushKernelArg};
use cudarc::nvrtc::{compile_ptx_with_opts, CompileOptions};
use std::sync::{Arc, Mutex};
use super::sha256::{leading_zero_bits_words, state_words_to_bytes, Sha256Midstate};
use super::work_unit::NonceTable;
use super::{CancelFlag, MinerBackend, MiningChunkResult, MiningResult, NONCE_SPACE_SIZE};
/// Threads per CUDA block used when launching the persistent kernel.
const CUDA_BLOCK_SIZE: u32 = 256;
/// Number of work slots in the host/device ring buffer.
const RING_SLOTS: usize = 4;
/// Size of one ring slot, in `u32` words.
const SLOT_WORDS: usize = 16;
// Word offsets inside a slot. The meanings below are taken from how the host
// code in this file reads and writes each word; the device-side use is defined
// by the kernel source, which is not part of this file.
/// First of the eight words that receive the midstate's state words.
const SLOT_S0: usize = 0;
/// Difficulty target for the work item.
const SLOT_DIFF: usize = 8;
/// `prefix_len` of the submitted midstate.
const SLOT_PFXLEN: usize = 9;
/// Slot status word: the host writes 1 when submitting and polls until it reads 2
/// (presumably written by the kernel when the slot is finished; confirm against the kernel).
const SLOT_READY: usize = 10;
/// Zeroed by the host on submit and never read by the host code here
/// (presumably a device-side progress counter; confirm against the kernel).
const SLOT_COUNTER: usize = 11;
/// Result word interpreted by the host as the best leading-zero-bit count found.
const SLOT_BEST_HI: usize = 12;
/// Result word interpreted by the host as the flat nonce that achieved `SLOT_BEST_HI`.
const SLOT_BEST_LO: usize = 13;
/// Mutable CUDA-side state of the miner; guarded by the mutex in `PersistentCudaMiner`.
struct PersistentState {
// Stream used both to launch the kernel and for every host<->device copy.
stream: Arc<CudaStream>,
// Device ring buffer of `RING_SLOTS * SLOT_WORDS` words (work in, results out).
ring_dev: CudaSlice<u32>,
// Single device word; `shutdown` writes 1 into it.
shutdown_dev: CudaSlice<u32>,
// Host mirror of the ring; `poll_slot` copies a finished slot into it.
ring_host: Vec<u32>,
}
/// CUDA mining backend that feeds work to an already-launched kernel through a
/// small ring of slots in device memory.
pub struct PersistentCudaMiner {
// Device handles and the host mirror of the ring; locked for a whole batch.
state: Mutex<PersistentState>,
// Host copy of the nonce table words; used to recompute a reported hash on the CPU.
nonce_words: Vec<u32>,
// Device name reported by the CUDA context at construction.
device_name: String,
// CUDA device ordinal this miner was created for.
ordinal: usize,
}
impl PersistentCudaMiner {
pub async fn try_new(ordinal: usize) -> Option<Self> {
let prev = std::panic::take_hook();
std::panic::set_hook(Box::new(|_| {}));
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
Self::try_new_inner(ordinal)
}));
std::panic::set_hook(prev);
result.ok().flatten()
}
/// Synchronous body of `try_new`: compiles the kernel, allocates the device
/// buffers and launches the `mine_persistent` kernel.
///
/// Every fallible step maps its error to `None`.
fn try_new_inner(ordinal: usize) -> Option<Self> {
    let ctx = CudaContext::new(ordinal).ok()?;
    let stream = ctx.default_stream();
    let device_name = ctx.name().ok()?;

    // Compile for the exact compute capability of this device. The formatted
    // architecture name is leaked to obtain the `&'static str` handed to the
    // compile options (one small allocation per call).
    let (cc_major, cc_minor) = ctx.compute_capability().ok()?;
    let arch_name = format!("sm_{cc_major}{cc_minor}");
    let arch: &'static str = Box::leak(arch_name.into_boxed_str());
    let compile_opts = CompileOptions {
        arch: Some(arch),
        ..Default::default()
    };
    let kernel_src = include_str!("shader/sha256_mine_persistent.cu");
    let ptx = compile_ptx_with_opts(kernel_src, compile_opts).ok()?;
    let module = ctx.load_module(ptx).ok()?;
    let kernel = module.load_function("mine_persistent").ok()?;

    // Upload the nonce table and create the zero-initialized ring and
    // shutdown word.
    let nonce_words = NonceTable::new().as_u32_slice();
    let nonce_table_dev = stream.clone_htod(&nonce_words).ok()?;
    let ring_len = RING_SLOTS * SLOT_WORDS;
    let mut ring_dev = stream.alloc_zeros::<u32>(ring_len).ok()?;
    let mut shutdown_dev = stream.alloc_zeros::<u32>(1).ok()?;

    // Scalar kernel arguments; one thread per nonce, rounded up to whole blocks.
    let grid_blocks = NONCE_SPACE_SIZE.div_ceil(CUDA_BLOCK_SIZE);
    let ring_slots = RING_SLOTS as u32;

    // Make sure the uploads and memsets have finished before the launch.
    stream.synchronize().ok()?;

    let launch_cfg = LaunchConfig {
        grid_dim: (grid_blocks, 1, 1),
        block_dim: (CUDA_BLOCK_SIZE, 1, 1),
        shared_mem_bytes: 0,
    };
    // Argument order must match the kernel's parameter list.
    let mut launch = stream.launch_builder(&kernel);
    launch.arg(&nonce_table_dev);
    launch.arg(&mut ring_dev);
    launch.arg(&ring_slots);
    launch.arg(&grid_blocks);
    launch.arg(&mut shutdown_dev);
    // SAFETY: NOTE(review) — soundness depends on the argument list above
    // matching the signature of `mine_persistent` in the .cu source, which is
    // not visible from this file; confirm against the kernel.
    unsafe { launch.launch(launch_cfg) }.ok()?;

    // NOTE(review): the kernel is launched on the same stream that is later
    // used for all host<->device copies — confirm the copies are not
    // serialized behind the running kernel.
    // NOTE(review): `nonce_table_dev` is dropped when this function returns
    // while the kernel may still be running — confirm the device allocation
    // outlives the kernel (or keep the slice alive alongside `ring_dev`).
    Some(Self {
        state: Mutex::new(PersistentState {
            stream,
            ring_dev,
            shutdown_dev,
            ring_host: vec![0u32; ring_len],
        }),
        nonce_words,
        device_name,
        ordinal,
    })
}
/// Returns the device name recorded when the miner was constructed.
pub fn device_name(&self) -> &str {
    self.device_name.as_str()
}
/// Splits a flat nonce into its two table indices: `(nonce / 1000, nonce % 1000)`.
fn nonce_indices(nonce: u32) -> (usize, usize) {
    const RADIX: u32 = 1000;
    let high = nonce / RADIX;
    let low = nonce % RADIX;
    (high as usize, low as usize)
}
/// Turns the result words of a finished slot into a verified `MiningResult`.
///
/// `ring_host` is the host mirror of the ring, `slot_idx` selects the slot,
/// and `midstate`/`difficulty` are the work item that was submitted to it.
///
/// Returns `None` when the slot reports fewer leading zero bits than
/// `difficulty`, when the reported nonce is outside the nonce space, or when
/// recomputing the hash on the CPU does not reach `difficulty`.
///
/// The slot contents come from the device, so every lookup is checked: a
/// corrupt slot yields `None` rather than a panic. This method runs while
/// the state mutex is held, so a panic here would poison the mutex.
fn best_result_from_slot(
    &self,
    ring_host: &[u32],
    slot_idx: usize,
    midstate: &Sha256Midstate,
    difficulty: u32,
) -> Option<MiningResult> {
    let base = slot_idx * SLOT_WORDS;
    let best_zeros = *ring_host.get(base + SLOT_BEST_HI)?;
    let nonce = *ring_host.get(base + SLOT_BEST_LO)?;
    if best_zeros < difficulty || nonce >= NONCE_SPACE_SIZE {
        return None;
    }

    let (n1, n2) = Self::nonce_indices(nonce);
    let word1 = *self.nonce_words.get(n1)?;
    let word2 = *self.nonce_words.get(n2)?;

    // Never trust the device's claim: recompute the hash and re-check it.
    let state_words = midstate.finalize_words_from_nonce_u32(word1, word2);
    let achieved = leading_zero_bits_words(&state_words);
    if achieved < difficulty {
        return None;
    }

    Some(MiningResult {
        nonce1_idx: u16::try_from(n1).ok()?,
        nonce2_idx: u16::try_from(n2).ok()?,
        hash: state_words_to_bytes(&state_words),
        difficulty_achieved: achieved,
    })
}
/// Writes one work item into ring slot `slot_idx` and marks it ready.
///
/// The slot is published in two stream-ordered copies: first the whole slot
/// with the status word still 0, then the status word alone set to 1. This
/// way the status can never be observed as 1 before the midstate, difficulty
/// and zeroed result words of the same submission have been written.
///
/// NOTE(review): this relies on the kernel treating a status of 0 as "no
/// work" (the ring is zero-initialized at start-up) — confirm against the
/// kernel source.
///
/// # Errors
/// Returns the driver error if either host-to-device copy fails.
fn submit_work(
    state: &mut PersistentState,
    slot_idx: usize,
    midstate: &Sha256Midstate,
    difficulty: u32,
) -> anyhow::Result<()> {
    let base = slot_idx * SLOT_WORDS;
    let s = midstate.state_words();

    // Payload: SLOT_READY, SLOT_COUNTER and both SLOT_BEST_* words stay zero.
    let mut slot_data = [0u32; SLOT_WORDS];
    slot_data[SLOT_S0..SLOT_S0 + 8].copy_from_slice(s);
    slot_data[SLOT_DIFF] = difficulty;
    slot_data[SLOT_PFXLEN] = midstate.prefix_len as u32;
    {
        let mut slot_view = state.ring_dev.slice_mut(base..base + SLOT_WORDS);
        state.stream.memcpy_htod(&slot_data, &mut slot_view)?;
    }

    // Publish: flip the status word only after the payload copy was issued.
    let ready = [1u32; 1];
    let ready_at = base + SLOT_READY;
    let mut ready_view = state.ring_dev.slice_mut(ready_at..ready_at + 1);
    state.stream.memcpy_htod(&ready, &mut ready_view)?;
    Ok(())
}
/// Blocks until ring slot `slot_idx` reports status 2, then copies the whole
/// slot into the host mirror (`state.ring_host`).
///
/// The status word is re-read from the device every 10 microseconds. There
/// is no timeout: the call only returns once the status is 2 or a copy or
/// synchronize call fails.
fn poll_slot(state: &mut PersistentState, slot_idx: usize) -> anyhow::Result<()> {
    let base = slot_idx * SLOT_WORDS;
    let ready_at = base + SLOT_READY;
    let pause = std::time::Duration::from_micros(10);

    let mut status = [0u32; 1];
    loop {
        let status_view = state.ring_dev.slice(ready_at..ready_at + 1);
        state.stream.memcpy_dtoh(&status_view, &mut status)?;
        state.stream.synchronize()?;
        match status[0] {
            2 => break,
            _ => std::thread::sleep(pause),
        }
    }

    // The slot is done: mirror all of its words to the host.
    let whole_slot = base..base + SLOT_WORDS;
    let slot_view = state.ring_dev.slice(whole_slot.clone());
    state
        .stream
        .memcpy_dtoh(&slot_view, &mut state.ring_host[whole_slot])?;
    state.stream.synchronize()?;
    Ok(())
}
pub fn mine_batch(
&self,
midstates: &[Sha256Midstate],
difficulty: u32,
cancel: Option<CancelFlag>,
) -> anyhow::Result<Vec<MiningChunkResult>> {
if midstates.is_empty() {
return Ok(Vec::new());
}
let mut state = self
.state
.lock()
.map_err(|_| anyhow::anyhow!("persistent cuda state mutex poisoned"))?;
let n = midstates.len();
let num_slots = RING_SLOTS;
let started = std::time::Instant::now();
let mut results: Vec<Option<MiningChunkResult>> = (0..n).map(|_| None).collect();
let mut slot_midstate: Vec<Option<usize>> = vec![None; num_slots];
let mut next_dispatch = 0usize;
for slot_idx in 0..num_slots.min(n) {
Self::submit_work(&mut state, slot_idx, &midstates[next_dispatch], difficulty)?;
slot_midstate[slot_idx] = Some(next_dispatch);
next_dispatch += 1;
}
let mut collected = 0usize;
let mut collect_slot = 0usize;
while collected < n {
if let Some(flag) = cancel.as_ref() {
if flag.load(std::sync::atomic::Ordering::Relaxed) {
for r in results.iter_mut() {
if r.is_none() {
*r = Some(MiningChunkResult::empty());
}
}
break;
}
}
let slot_idx = collect_slot % num_slots;
if let Some(mid_idx) = slot_midstate[slot_idx] {
Self::poll_slot(&mut state, slot_idx)?;
let result = self.best_result_from_slot(
&state.ring_host,
slot_idx,
&midstates[mid_idx],
difficulty,
);
results[mid_idx] = Some(MiningChunkResult {
result,
attempted: NONCE_SPACE_SIZE as u64,
elapsed: started.elapsed(),
});
collected += 1;
if next_dispatch < n {
Self::submit_work(&mut state, slot_idx, &midstates[next_dispatch], difficulty)?;
slot_midstate[slot_idx] = Some(next_dispatch);
next_dispatch += 1;
} else {
slot_midstate[slot_idx] = None;
}
}
collect_slot += 1;
}
Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
/// Signals shutdown by writing 1 into the device shutdown word, then
/// synchronizes the stream.
///
/// A poisoned state mutex is deliberately recovered instead of reported:
/// setting the flag does not depend on the consistency of the guarded data,
/// and bailing out would leave the shutdown word unset, also when this is
/// called from `Drop`.
///
/// # Errors
/// Returns the driver error if the copy or the synchronize fails.
pub fn shutdown(&self) -> anyhow::Result<()> {
    let mut state = self
        .state
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    let flag = [1u32; 1];
    // Split the guard into disjoint field borrows for the copy call.
    let PersistentState {
        ref stream,
        ref mut shutdown_dev,
        ..
    } = *state;
    stream.memcpy_htod(&flag, shutdown_dev)?;
    stream.synchronize()?;
    Ok(())
}
}
/// Signals shutdown when the miner is dropped.
impl Drop for PersistentCudaMiner {
    fn drop(&mut self) {
        // Best effort: there is nowhere to report a failure from `drop`.
        let _best_effort = self.shutdown();
    }
}
/// `MinerBackend` implementation backed by the ring-buffer miner.
///
/// The async methods here call the blocking `mine_batch` directly and contain
/// no await points of their own.
#[async_trait]
impl MinerBackend for PersistentCudaMiner {
    /// Backend name: the CUDA device name.
    fn name(&self) -> &str {
        self.device_name()
    }

    /// Key/value lines describing this backend for start-up logging.
    fn startup_summary(&self) -> Vec<String> {
        Vec::from([
            format!("persistent_cuda_device={}", self.device_name),
            format!("persistent_cuda_ordinal={}", self.ordinal),
            format!("persistent_cuda_ring_slots={}", RING_SLOTS),
            String::from("persistent_cuda_mode=ring_buffer"),
        ])
    }

    /// Measures throughput in attempts per second.
    ///
    /// Runs one untimed warm-up batch followed by six timed batches of
    /// `RING_SLOTS` identical midstates at difficulty 256, and returns the
    /// median rate (or 0.0 if no batch produced a measurable duration).
    async fn benchmark(&self) -> anyhow::Result<f64> {
        const TIMED_RUNS: usize = 6;
        const BENCH_DIFFICULTY: u32 = 256;

        let midstates = vec![Sha256Midstate::from_prefix(&[0u8; 64]); RING_SLOTS];

        // Warm-up run; its result and timing are discarded.
        self.mine_batch(&midstates, BENCH_DIFFICULTY, None)?;

        let mut rates: Vec<f64> = Vec::with_capacity(TIMED_RUNS);
        for _ in 0..TIMED_RUNS {
            let t0 = std::time::Instant::now();
            let chunks = self.mine_batch(&midstates, BENCH_DIFFICULTY, None)?;
            let secs = t0.elapsed().as_secs_f64();
            if secs > 0.0 {
                let hashes: u64 = chunks.iter().map(|chunk| chunk.attempted).sum();
                rates.push(hashes as f64 / secs);
            }
        }

        if rates.is_empty() {
            return Ok(0.0);
        }
        rates.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let median = rates[rates.len() / 2];
        Ok(median)
    }

    /// Largest useful batch: the size of the nonce space.
    fn max_batch_hint(&self) -> u32 {
        NONCE_SPACE_SIZE
    }

    /// Mines a single midstate via `mine_batch`.
    ///
    /// `_nonce_table`, `_start_nonce` and `_nonce_count` are ignored: the
    /// whole nonce space is always searched.
    async fn mine_range(
        &self,
        midstate: &Sha256Midstate,
        _nonce_table: &NonceTable,
        difficulty: u32,
        _start_nonce: u32,
        _nonce_count: u32,
        cancel: Option<CancelFlag>,
    ) -> anyhow::Result<MiningChunkResult> {
        // View the borrowed midstate as a one-element slice.
        let single = std::slice::from_ref(midstate);
        let mut chunks = self.mine_batch(single, difficulty, cancel)?.into_iter();
        Ok(chunks.next().unwrap_or_else(MiningChunkResult::empty))
    }
}