use async_trait::async_trait;
use rayon::prelude::*;
use std::sync::atomic::Ordering;
use std::sync::Arc;
use super::sha256::{leading_zero_bits_words, state_words_to_bytes, Sha256Midstate};
use super::work_unit::NonceTable;
#[cfg(test)]
use super::work_unit::WorkUnit;
use super::{CancelFlag, MinerBackend, MiningChunkResult, MiningResult, NONCE_SPACE_SIZE};
/// SHA-256 round constants `K[0..64]`.
///
/// In the code shown here the table is read by the AVX2 compression loop, which
/// adds `SHA256_K[round]` (broadcast to every lane) in each of its 64 rounds.
const SHA256_K: [u32; 64] = [
0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
];
/// Number of consecutive nonces that `mine_range` assigns to each parallel work item.
const CPU_PAR_BLOCK_NONCES: u32 = 16_384;
/// Search code path selected from the CPU features detected at runtime.
///
/// Architecture-specific variants only exist when compiling for that
/// architecture; `Scalar` is always available.
#[derive(Debug, Clone, Copy)]
enum SimdType {
/// Both `avx2` and `sha` were detected.
#[cfg(target_arch = "x86_64")]
X86Avx2Shani,
/// `avx2` was detected without `sha`.
#[cfg(target_arch = "x86_64")]
X86Avx2,
/// `sha` was detected without `avx2`.
#[cfg(target_arch = "x86_64")]
X86Shani,
/// Only `sse4.1` was detected (neither `avx2` nor `sha`).
#[cfg(target_arch = "x86_64")]
X86Sse41,
/// The aarch64 `sha2` feature was detected.
#[cfg(target_arch = "aarch64")]
Aarch64Hardware,
/// No accelerated feature was detected, or the target has no dedicated detection.
Scalar,
}
impl SimdType {
fn as_str(self) -> &'static str {
match self {
#[cfg(target_arch = "x86_64")]
SimdType::X86Avx2Shani => "x86_64_avx2+sha",
#[cfg(target_arch = "x86_64")]
SimdType::X86Avx2 => "x86_64_avx2",
#[cfg(target_arch = "x86_64")]
SimdType::X86Shani => "x86_64_sha",
#[cfg(target_arch = "x86_64")]
SimdType::X86Sse41 => "x86_64_sse4.1",
#[cfg(target_arch = "aarch64")]
SimdType::Aarch64Hardware => "aarch64_sha2",
SimdType::Scalar => "scalar",
}
}
}
/// Probes the running CPU and picks the most capable code path this file knows about.
///
/// On x86_64 the preference order is AVX2+SHA, AVX2, SHA, SSE4.1; on aarch64 the
/// `sha2` feature is checked. Everything else falls back to [`SimdType::Scalar`].
fn detect_simd() -> SimdType {
    #[cfg(target_arch = "x86_64")]
    {
        let has_avx2 = std::is_x86_feature_detected!("avx2");
        let has_sha = std::is_x86_feature_detected!("sha");
        let has_sse41 = std::is_x86_feature_detected!("sse4.1");
        match (has_avx2, has_sha, has_sse41) {
            (true, true, _) => return SimdType::X86Avx2Shani,
            (true, false, _) => return SimdType::X86Avx2,
            (false, true, _) => return SimdType::X86Shani,
            (false, false, true) => return SimdType::X86Sse41,
            // Nothing usable detected: fall through to the scalar default below.
            (false, false, false) => {}
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        let has_sha2 = std::arch::is_aarch64_feature_detected!("sha2");
        if has_sha2 {
            return SimdType::Aarch64Hardware;
        }
    }
    SimdType::Scalar
}
/// CPU mining backend that runs nonce-range searches on its own rayon thread pool.
pub struct SimdCpuMiner {
/// Number of worker threads the pool was built with (after clamping in `with_threads`).
thread_count: usize,
/// Thread pool on which `mine_range` installs its parallel search.
pool: Arc<rayon::ThreadPool>,
/// Words obtained from `NonceTable::new().as_u32_slice()`, shared with worker tasks.
nonce_words: Arc<Vec<u32>>,
/// Code path chosen by `detect_simd()` when the miner was constructed.
simd: SimdType,
}
impl SimdCpuMiner {
    /// Creates a miner with one worker per unit of available parallelism
    /// (one worker if that cannot be determined).
    pub fn new() -> Self {
        Self::with_threads(Self::detected_parallelism())
    }

    /// Creates a miner with `thread_count` workers, clamped to the range
    /// `1..=available parallelism`.
    ///
    /// # Panics
    /// Panics if the rayon thread pool cannot be built.
    pub fn with_threads(thread_count: usize) -> Self {
        let upper_bound = Self::detected_parallelism();
        let thread_count = thread_count.clamp(1, upper_bound);
        let pool = rayon::ThreadPoolBuilder::new()
            .num_threads(thread_count)
            .build()
            .map(Arc::new)
            .expect("failed to build rayon thread pool");
        Self {
            thread_count,
            pool,
            nonce_words: Arc::new(NonceTable::new().as_u32_slice()),
            simd: detect_simd(),
        }
    }

    /// Builds a miner from an optional thread count; `None` behaves like [`Self::new`].
    pub fn from_option(thread_count: Option<usize>) -> Self {
        thread_count.map_or_else(Self::new, Self::with_threads)
    }

    /// Number of worker threads actually in use.
    pub fn thread_count(&self) -> usize {
        self.thread_count
    }

    /// Parallelism reported by the standard library, or 1 when it is unavailable.
    fn detected_parallelism() -> usize {
        std::thread::available_parallelism().map_or(1, |n| n.get())
    }
}
impl Default for SimdCpuMiner {
fn default() -> Self {
Self::new()
}
}
/// Splits a flat nonce into its two table indices: `(n / 1000, n % 1000)`.
///
/// Both components are narrowed to `u16` with a truncating cast, so the first
/// index wraps once `n / 1000` exceeds `u16::MAX`.
#[inline(always)]
fn nonce_indices(n: u32) -> (u16, u16) {
    let high = n / 1000;
    // Same value as `n % 1000`, derived from the quotient computed above.
    let low = n - high * 1000;
    (high as u16, low as u16)
}
/// Reports whether a cancel flag was supplied and is currently set
/// (relaxed load); `false` when no flag was supplied.
#[inline(always)]
fn maybe_cancelled(cancel: Option<&CancelFlag>) -> bool {
    match cancel {
        Some(flag) => flag.load(Ordering::Relaxed),
        None => false,
    }
}
/// Sets the cancel flag (relaxed store) when one was supplied; otherwise does nothing.
#[inline(always)]
fn set_cancel(cancel: Option<&CancelFlag>) {
    let Some(flag) = cancel else {
        return;
    };
    flag.store(true, Ordering::Relaxed);
}
/// Scores a finished hash state and wraps it in a [`MiningResult`] when it meets
/// `difficulty`.
///
/// Returns `None` when the leading-zero-bit count reported by
/// `leading_zero_bits_words` is below `difficulty`.
#[inline(always)]
fn score_state_and_build_result(
    n1: u16,
    n2: u16,
    state_words: &[u32; 8],
    difficulty: u32,
) -> Option<MiningResult> {
    // Cheap pre-filter: for targets of 16 bits or more, any set bit in the upper
    // half of word 0 rejects the hash before the full count is taken
    // (presumably word 0 holds the most significant bits — confirm against
    // `leading_zero_bits_words`).
    let upper_half_set = (state_words[0] >> 16) != 0;
    if upper_half_set && difficulty >= 16 {
        return None;
    }

    let achieved = leading_zero_bits_words(state_words);
    (achieved >= difficulty).then(|| MiningResult {
        nonce1_idx: n1,
        nonce2_idx: n2,
        hash: state_words_to_bytes(state_words),
        difficulty_achieved: achieved,
    })
}
/// Tries every nonce in `[start_nonce, end_nonce)` one at a time.
///
/// The cancel flag is polled before each nonce; when it is set the search gives
/// up with `None`. On the first hash that meets `difficulty` the flag is set
/// (if present) and the result is returned.
fn search_scalar_range(
    midstate: &Sha256Midstate,
    nonce_words: &[u32],
    difficulty: u32,
    start_nonce: u32,
    end_nonce: u32,
    cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
    for nonce in start_nonce..end_nonce {
        if maybe_cancelled(cancel) {
            return None;
        }

        let (first_idx, second_idx) = nonce_indices(nonce);
        let first_word = nonce_words[usize::from(first_idx)];
        let second_word = nonce_words[usize::from(second_idx)];
        let words = midstate.finalize_words_from_nonce_u32(first_word, second_word);

        let hit = score_state_and_build_result(first_idx, second_idx, &words, difficulty);
        if hit.is_some() {
            // Signal the other workers that a solution has been found.
            set_cancel(cancel);
            return hit;
        }
    }
    None
}
/// Dispatches a range search to the implementation matching `simd`, falling back
/// to the scalar search when no architecture-specific path applies.
fn search_best_range(
    simd: SimdType,
    midstate: &Sha256Midstate,
    nonce_words: &[u32],
    difficulty: u32,
    start_nonce: u32,
    end_nonce: u32,
    cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
    #[cfg(target_arch = "x86_64")]
    {
        match simd {
            SimdType::X86Avx2Shani | SimdType::X86Avx2 => {
                // SAFETY: within this file both AVX2 variants are only produced by
                // `detect_simd` after `is_x86_feature_detected!("avx2")` returned true.
                return unsafe {
                    search_avx2_range(
                        midstate,
                        nonce_words,
                        difficulty,
                        start_nonce,
                        end_nonce,
                        cancel,
                    )
                };
            }
            SimdType::X86Shani => {
                return search_shani_range(
                    midstate,
                    nonce_words,
                    difficulty,
                    start_nonce,
                    end_nonce,
                    cancel,
                );
            }
            SimdType::X86Sse41 => {
                return search_sse41_range(
                    midstate,
                    nonce_words,
                    difficulty,
                    start_nonce,
                    end_nonce,
                    cancel,
                );
            }
            // Handled by the scalar fallback at the end of the function.
            SimdType::Scalar => {}
        }
    }
    #[cfg(target_arch = "aarch64")]
    {
        if let SimdType::Aarch64Hardware = simd {
            return search_aarch64_sha2_range(
                midstate,
                nonce_words,
                difficulty,
                start_nonce,
                end_nonce,
                cancel,
            );
        }
    }
    search_scalar_range(
        midstate,
        nonce_words,
        difficulty,
        start_nonce,
        end_nonce,
        cancel,
    )
}
/// Range search used when the `X86Shani` path is selected.
///
/// As written, this simply forwards every argument to `search_scalar_range`;
/// no SHA-extension-specific code is used here.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn search_shani_range(
midstate: &Sha256Midstate,
nonce_words: &[u32],
difficulty: u32,
start_nonce: u32,
end_nonce: u32,
cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
search_scalar_range(
midstate,
nonce_words,
difficulty,
start_nonce,
end_nonce,
cancel,
)
}
/// Range search used when the `X86Sse41` path is selected.
///
/// As written, this simply forwards every argument to `search_scalar_range`;
/// no SSE4.1-specific code is used here.
#[cfg(target_arch = "x86_64")]
#[inline(always)]
fn search_sse41_range(
midstate: &Sha256Midstate,
nonce_words: &[u32],
difficulty: u32,
start_nonce: u32,
end_nonce: u32,
cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
search_scalar_range(
midstate,
nonce_words,
difficulty,
start_nonce,
end_nonce,
cancel,
)
}
/// Range search used when the `Aarch64Hardware` path is selected.
///
/// As written, this simply forwards every argument to `search_scalar_range`;
/// no aarch64-specific code is used here.
#[cfg(target_arch = "aarch64")]
#[inline(always)]
fn search_aarch64_sha2_range(
midstate: &Sha256Midstate,
nonce_words: &[u32],
difficulty: u32,
start_nonce: u32,
end_nonce: u32,
cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
search_scalar_range(
midstate,
nonce_words,
difficulty,
start_nonce,
end_nonce,
cancel,
)
}
/// Searches `[start_nonce, end_nonce)` eight nonces at a time with AVX2 integer intrinsics.
///
/// Each pass of the main loop looks up the table words for eight consecutive
/// nonces, runs one 64-round compression over all eight lanes at once, and then
/// scores the lanes in ascending nonce order. Any remainder of fewer than eight
/// nonces is handed to `search_scalar_range`.
///
/// Returns the first result that meets `difficulty` (setting the cancel flag, if
/// one was supplied, before returning). Returns `None` when the range is exhausted
/// or when the cancel flag is seen set at the start of a batch.
///
/// # Safety
/// The function is compiled with `#[target_feature(enable = "avx2")]`; the caller
/// must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn search_avx2_range(
midstate: &Sha256Midstate,
nonce_words: &[u32],
difficulty: u32,
start_nonce: u32,
end_nonce: u32,
cancel: Option<&CancelFlag>,
) -> Option<MiningResult> {
use std::arch::x86_64::*;
// Broadcasts one 32-bit value into all eight lanes.
#[inline(always)]
unsafe fn set1(v: u32) -> __m256i {
_mm256_set1_epi32(v as i32)
}
// Lane-wise 32-bit addition.
#[inline(always)]
unsafe fn add(a: __m256i, b: __m256i) -> __m256i {
_mm256_add_epi32(a, b)
}
// Lane-wise 32-bit rotate-right, built from a right shift OR-ed with the
// complementary left shift. Only the rotation counts listed below are
// supported; any other count hits `unreachable!()`. Each arm spells the
// shift counts out as literals (presumably because the shift intrinsics
// need constant counts — confirm).
#[inline(always)]
unsafe fn rotr(x: __m256i, n: i32) -> __m256i {
match n {
2 => _mm256_or_si256(_mm256_srli_epi32(x, 2), _mm256_slli_epi32(x, 30)),
6 => _mm256_or_si256(_mm256_srli_epi32(x, 6), _mm256_slli_epi32(x, 26)),
7 => _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 25)),
11 => _mm256_or_si256(_mm256_srli_epi32(x, 11), _mm256_slli_epi32(x, 21)),
13 => _mm256_or_si256(_mm256_srli_epi32(x, 13), _mm256_slli_epi32(x, 19)),
17 => _mm256_or_si256(_mm256_srli_epi32(x, 17), _mm256_slli_epi32(x, 15)),
18 => _mm256_or_si256(_mm256_srli_epi32(x, 18), _mm256_slli_epi32(x, 14)),
19 => _mm256_or_si256(_mm256_srli_epi32(x, 19), _mm256_slli_epi32(x, 13)),
22 => _mm256_or_si256(_mm256_srli_epi32(x, 22), _mm256_slli_epi32(x, 10)),
25 => _mm256_or_si256(_mm256_srli_epi32(x, 25), _mm256_slli_epi32(x, 7)),
_ => unreachable!(),
}
}
// (x AND y) XOR ((NOT x) AND z), per lane.
#[inline(always)]
unsafe fn ch(x: __m256i, y: __m256i, z: __m256i) -> __m256i {
_mm256_xor_si256(_mm256_and_si256(x, y), _mm256_andnot_si256(x, z))
}
// (x AND y) XOR (x AND z) XOR (y AND z), per lane.
#[inline(always)]
unsafe fn maj(x: __m256i, y: __m256i, z: __m256i) -> __m256i {
let xy = _mm256_and_si256(x, y);
let xz = _mm256_and_si256(x, z);
let yz = _mm256_and_si256(y, z);
_mm256_xor_si256(_mm256_xor_si256(xy, xz), yz)
}
// rotr(x, 2) XOR rotr(x, 13) XOR rotr(x, 22).
#[inline(always)]
unsafe fn big_sigma0(x: __m256i) -> __m256i {
_mm256_xor_si256(_mm256_xor_si256(rotr(x, 2), rotr(x, 13)), rotr(x, 22))
}
// rotr(x, 6) XOR rotr(x, 11) XOR rotr(x, 25).
#[inline(always)]
unsafe fn big_sigma1(x: __m256i) -> __m256i {
_mm256_xor_si256(_mm256_xor_si256(rotr(x, 6), rotr(x, 11)), rotr(x, 25))
}
// rotr(x, 7) XOR rotr(x, 18) XOR (x >> 3).
#[inline(always)]
unsafe fn small_sigma0(x: __m256i) -> __m256i {
let r1 = rotr(x, 7);
let r2 = rotr(x, 18);
let s = _mm256_srli_epi32(x, 3);
_mm256_xor_si256(_mm256_xor_si256(r1, r2), s)
}
// rotr(x, 17) XOR rotr(x, 19) XOR (x >> 10).
#[inline(always)]
unsafe fn small_sigma1(x: __m256i) -> __m256i {
let r1 = rotr(x, 17);
let r2 = rotr(x, 19);
let s = _mm256_srli_epi32(x, 10);
_mm256_xor_si256(_mm256_xor_si256(r1, r2), s)
}
// Runs one 64-round compression for eight lanes in parallel, starting from
// the shared chaining value `mid`. `w0`/`w1` carry the per-lane first two
// schedule words; the rest of the initial block is the same in every lane.
// Returns the eight output state words, each holding eight lanes.
#[inline(always)]
unsafe fn sha256_block_8(mid: &[u32; 8], w0: __m256i, w1: __m256i, w15: u32) -> [__m256i; 8] {
let zero = _mm256_setzero_si256();
let mut w = [zero; 64];
w[0] = w0;
w[1] = w1;
// Fixed third word; the bytes 0x66 0x51 0x3d 0x3d are ASCII "fQ==" —
// presumably the constant end of the hashed tail (confirm against
// `WorkUnit::build_tail`).
w[2] = set1(0x6651_3d3d);
// Looks like the SHA-256 padding marker (a single 1 bit followed by zeros).
w[3] = set1(0x8000_0000);
// w[4..=13] keep their zero initialisation; w[14] is set to zero explicitly.
w[14] = zero;
w[15] = set1(w15);
// Expand the remaining schedule words 16..64.
let mut i = 16usize;
while i < 64 {
let s1 = small_sigma1(w[i - 2]);
let s0 = small_sigma0(w[i - 15]);
w[i] = add(add(add(s1, w[i - 7]), s0), w[i - 16]);
i += 1;
}
let mut a = set1(mid[0]);
let mut b = set1(mid[1]);
let mut c = set1(mid[2]);
let mut d = set1(mid[3]);
let mut e = set1(mid[4]);
let mut f = set1(mid[5]);
let mut g = set1(mid[6]);
let mut h = set1(mid[7]);
let mut round = 0usize;
while round < 64 {
let t1 = add(
add(
add(add(h, big_sigma1(e)), ch(e, f, g)),
set1(SHA256_K[round]),
),
w[round],
);
let t2 = add(big_sigma0(a), maj(a, b, c));
h = g;
g = f;
f = e;
e = add(d, t1);
d = c;
c = b;
b = a;
a = add(t1, t2);
round += 1;
}
// Add the starting chaining value back into the working variables.
[
add(a, set1(mid[0])),
add(b, set1(mid[1])),
add(c, set1(mid[2])),
add(d, set1(mid[3])),
add(e, set1(mid[4])),
add(f, set1(mid[5])),
add(g, set1(mid[6])),
add(h, set1(mid[7])),
]
}
let mut idx = start_nonce;
// Value placed in schedule word 15: (prefix_len + 12) * 8, i.e. the bit
// length of a message 12 bytes longer than the prefix.
let w15 = ((midstate.prefix_len as u32) + 12) * 8;
// NOTE(review): `idx + 8` can overflow `u32` if `end_nonce` is within 8 of
// `u32::MAX`; `mine_range` in this file clamps `end_nonce` to
// `NONCE_SPACE_SIZE` before calling.
while idx + 8 <= end_nonce {
// Cancellation is polled once per batch of eight nonces.
if maybe_cancelled(cancel) {
return None;
}
let mut n1_idx = [0u16; 8];
let mut n2_idx = [0u16; 8];
let mut n1_words = [0u32; 8];
let mut n2_words = [0u32; 8];
// Look up both table words for each of the eight consecutive nonces.
let mut lane = 0usize;
while lane < 8 {
let nonce = idx + lane as u32;
let (n1, n2) = nonce_indices(nonce);
n1_idx[lane] = n1;
n2_idx[lane] = n2;
n1_words[lane] = nonce_words[n1 as usize];
n2_words[lane] = nonce_words[n2 as usize];
lane += 1;
}
let w0 = _mm256_setr_epi32(
n1_words[0] as i32,
n1_words[1] as i32,
n1_words[2] as i32,
n1_words[3] as i32,
n1_words[4] as i32,
n1_words[5] as i32,
n1_words[6] as i32,
n1_words[7] as i32,
);
let w1 = _mm256_setr_epi32(
n2_words[0] as i32,
n2_words[1] as i32,
n2_words[2] as i32,
n2_words[3] as i32,
n2_words[4] as i32,
n2_words[5] as i32,
n2_words[6] as i32,
n2_words[7] as i32,
);
let states = sha256_block_8(&midstate.state, w0, w1, w15);
// Spill each vector to memory so lanes can be read as plain u32 values:
// lanes_by_word[word][lane].
let mut lanes_by_word = [[0u32; 8]; 8];
let mut word = 0usize;
while word < 8 {
_mm256_storeu_si256(
lanes_by_word[word].as_mut_ptr() as *mut __m256i,
states[word],
);
word += 1;
}
// Re-assemble each lane's eight state words and score lanes in nonce order.
lane = 0;
while lane < 8 {
let state_words = [
lanes_by_word[0][lane],
lanes_by_word[1][lane],
lanes_by_word[2][lane],
lanes_by_word[3][lane],
lanes_by_word[4][lane],
lanes_by_word[5][lane],
lanes_by_word[6][lane],
lanes_by_word[7][lane],
];
if let Some(result) =
score_state_and_build_result(n1_idx[lane], n2_idx[lane], &state_words, difficulty)
{
set_cancel(cancel);
return Some(result);
}
lane += 1;
}
idx += 8;
}
// Fewer than eight nonces left: finish them with the scalar search.
if idx < end_nonce {
return search_scalar_range(midstate, nonce_words, difficulty, idx, end_nonce, cancel);
}
None
}
#[async_trait]
impl MinerBackend for SimdCpuMiner {
fn name(&self) -> &str {
match self.simd {
#[cfg(target_arch = "x86_64")]
SimdType::X86Avx2Shani => "CPU (rayon+AVX2+SHANI)",
#[cfg(target_arch = "x86_64")]
SimdType::X86Avx2 => "CPU (rayon+AVX2)",
#[cfg(target_arch = "x86_64")]
SimdType::X86Shani => "CPU (rayon+SHANI)",
#[cfg(target_arch = "x86_64")]
SimdType::X86Sse41 => "CPU (rayon+SSE4.1)",
#[cfg(target_arch = "aarch64")]
SimdType::Aarch64Hardware => "CPU (rayon+ARMv8-SHA2)",
SimdType::Scalar => "CPU (rayon)",
}
}
fn startup_summary(&self) -> Vec<String> {
#[cfg(target_arch = "x86_64")]
{
vec![
format!("cpu_threads={}", self.thread_count),
format!("cpu_avx2_enabled={}", std::is_x86_feature_detected!("avx2")),
format!("cpu_shani_enabled={}", std::is_x86_feature_detected!("sha")),
format!(
"cpu_sse41_enabled={}",
std::is_x86_feature_detected!("sse4.1")
),
format!("cpu_simd_selected={}", self.simd.as_str()),
]
}
#[cfg(target_arch = "aarch64")]
{
vec![
format!("cpu_threads={}", self.thread_count),
format!(
"cpu_armv8_sha2_enabled={}",
std::arch::is_aarch64_feature_detected!("sha2")
),
format!("cpu_simd_selected={}", self.simd.as_str()),
]
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
{
vec![
format!("cpu_threads={}", self.thread_count),
format!("cpu_simd_selected={}", self.simd.as_str()),
]
}
}
async fn benchmark(&self) -> anyhow::Result<f64> {
let nonce_table = NonceTable::new();
let midstate = Sha256Midstate::from_prefix(&[0u8; 64]);
let _ = self
.mine_range(&midstate, &nonce_table, 256, 0, NONCE_SPACE_SIZE, None)
.await?;
let mut samples = Vec::with_capacity(5);
for _ in 0..5 {
let chunk = self
.mine_range(&midstate, &nonce_table, 256, 0, NONCE_SPACE_SIZE, None)
.await?;
let secs = chunk.elapsed.as_secs_f64();
if secs > 0.0 {
samples.push(chunk.attempted as f64 / secs);
}
}
if samples.is_empty() {
return Ok(0.0);
}
samples.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
Ok(samples[samples.len() / 2])
}
async fn mine_range(
&self,
midstate: &Sha256Midstate,
_nonce_table: &NonceTable,
difficulty: u32,
start_nonce: u32,
nonce_count: u32,
cancel: Option<CancelFlag>,
) -> anyhow::Result<MiningChunkResult> {
let start_nonce = start_nonce.min(NONCE_SPACE_SIZE);
let end_nonce = start_nonce
.saturating_add(nonce_count)
.min(NONCE_SPACE_SIZE);
if start_nonce >= end_nonce {
return Ok(MiningChunkResult::empty());
}
let attempted = (end_nonce - start_nonce) as u64;
let midstate = midstate.clone();
let nonce_words = self.nonce_words.clone();
let cancel = cancel.clone();
let pool = self.pool.clone();
let simd = self.simd;
tokio::task::spawn_blocking(move || {
let started = std::time::Instant::now();
let result = pool.install(|| {
let total = end_nonce - start_nonce;
let block_count = total.div_ceil(CPU_PAR_BLOCK_NONCES);
(0..block_count).into_par_iter().find_map_any(|block_idx| {
if maybe_cancelled(cancel.as_ref()) {
return None;
}
let block_start =
start_nonce.saturating_add(block_idx.saturating_mul(CPU_PAR_BLOCK_NONCES));
let block_end = block_start
.saturating_add(CPU_PAR_BLOCK_NONCES)
.min(end_nonce);
search_best_range(
simd,
&midstate,
&nonce_words,
difficulty,
block_start,
block_end,
cancel.as_ref(),
)
})
});
Ok(MiningChunkResult {
result,
attempted,
elapsed: started.elapsed(),
})
})
.await?
}
}
#[cfg(test)]
mod tests {
use super::*;
use webylib::Amount;
// End-to-end check: mining a fresh work unit at difficulty 8 must yield a
// solution whose indices are below 1000 and whose hash matches an independent
// recomputation from the reported nonce indices.
#[tokio::test]
async fn cpu_mining_low_difficulty() {
let cpu = SimdCpuMiner::new();
let nonce_table = NonceTable::new();
let wu = WorkUnit::new(
8,
Amount::from_wats(20_000_000_000_000),
Amount::from_wats(1_000_000_000_000),
);
let result = cpu
.mine_work_unit(&wu.midstate, &nonce_table, 8)
.await
.unwrap();
assert!(
result.result.is_some(),
"should find solution at difficulty 8"
);
let result = result.result.unwrap();
assert!(result.difficulty_achieved >= 8);
assert!(result.nonce1_idx < 1000);
assert!(result.nonce2_idx < 1000);
// Rebuild the tail from the reported indices and re-hash to confirm the
// miner's hash.
let tail = WorkUnit::build_tail(&nonce_table, result.nonce1_idx, result.nonce2_idx);
let verify_hash = wu.midstate.finalize(&tail);
assert_eq!(verify_hash, result.hash);
}
}