Skip to main content

zeldhash_miner_gpu/
lib.rs

1#![deny(missing_docs)]
2
3//! WebGPU mining runner.
4//!
5//! The implementation wires a compute shader that performs the full
6//! double-SHA256 over `tx_prefix || nonce || tx_suffix`. The shader mirrors the
7//! CPU implementation in `zeldhash-miner-core` and returns txids in the same byte order
8//! (big-endian hash bytes, which callers treat as txid by reversing when
//! counting leading zeros). Storage buffer layouts follow the planned design
//! so the JavaScript bindings and future optimizations can reuse them unchanged.
11
12use std::{
13    borrow::Cow,
14    num::NonZeroU64,
15    sync::{Arc, Mutex, MutexGuard},
16    time::Instant,
17};
18
19use futures::channel::oneshot;
20
21use bytemuck::{cast_slice, pod_read_unaligned, Pod, Zeroable};
22use thiserror::Error;
23use wgpu::util::DeviceExt;
24use zeldhash_miner_core::encode_nonce;
25
// Threads per workgroup dispatched by `dispatch_workgroups`.
// NOTE(review): assumed to match the @workgroup_size in shader.wgsl — confirm.
#[cfg_attr(test, allow(dead_code))]
const WORKGROUP_SIZE: u32 = 256;
// Maximum matches one dispatch can report; fixes the `ResultBuffer::results` length.
const MAX_RESULTS: usize = 8;

// WGSL compute shader source, embedded at compile time.
const SHADER_WGSL: &str = include_str!("shader.wgsl");
31
/// Minimal adapter info exposed to callers (e.g., WASM bindings).
///
/// All fields are plain strings (produced via `Debug` formatting in the
/// `From<wgpu::AdapterInfo>` impl) so the summary can cross FFI/JS boundaries
/// without exposing wgpu types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct AdapterSummary {
    /// Human-readable adapter name (vendor/device).
    pub name: String,
    /// Backend string (e.g., "Vulkan", "Metal", "Dx12", "Gl", "BrowserWebGpu").
    pub backend: String,
    /// Device class (DiscreteGpu, IntegratedGpu, Cpu, VirtualGpu, Other).
    pub device_type: String,
}
42
43impl From<wgpu::AdapterInfo> for AdapterSummary {
44    fn from(info: wgpu::AdapterInfo) -> Self {
45        Self {
46            name: info.name,
47            backend: format!("{:?}", info.backend),
48            device_type: format!("{:?}", info.device_type),
49        }
50    }
51}
52
/// Result returned when a matching nonce is found.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MineResult {
    /// Winning nonce.
    pub nonce: u64,
    /// Double SHA256 hash (big-endian bytes). Callers treat this as the txid
    /// by reversing bytes when counting leading zeros (see module docs).
    pub txid: [u8; 32],
}
61
/// GPU initialization and dispatch errors.
#[derive(Debug, Error)]
pub enum GpuError {
    /// WebGPU is not available on this platform/adapter (no adapter found,
    /// or `request_device` failed).
    #[error("WebGPU not available: {0}")]
    Unavailable(String),
    /// Internal GPU error (invalid parameters, exceeded binding limits,
    /// buffer-map failures, or a poisoned cache mutex).
    #[error("GPU error: {0}")]
    Internal(String),
}
72
/// GPU context holding the device/queue.
///
/// Cloning is cheap: every field is behind an `Arc`, so clones share the same
/// device, queue, and caches. The mutex-wrapped caches are populated lazily on
/// first use.
#[derive(Clone)]
#[cfg_attr(test, allow(dead_code))]
pub struct GpuContext {
    device: Arc<wgpu::Device>,
    queue: Arc<wgpu::Queue>,
    // Retained for adapter_summary(); allow(dead_code) for builds that never call it.
    #[allow(dead_code)]
    adapter_info: wgpu::AdapterInfo,
    // Cached calibration result (see calibrate_batch_size).
    batch_size_cache: Arc<Mutex<Option<u32>>>,
    // Lazily built compute pipeline (see get_or_create_pipeline).
    pipeline_cache: Arc<Mutex<Option<Arc<GpuPipeline>>>>,
    // Shared result + staging buffer pair (see get_or_create_fixed_buffers).
    fixed_buffers: Arc<Mutex<Option<Arc<FixedBuffers>>>>,
    // Pooled prefix/suffix/params buffers, grown on demand (see IoBuffers).
    io_buffers: Arc<Mutex<Option<IoBuffers>>>,
}
86
87impl GpuContext {
88    /// Initialize GPU context, preferring high-performance adapters.
89    pub async fn init() -> Result<Self, GpuError> {
90        let instance = if cfg!(target_arch = "wasm32") {
91            wgpu::Instance::new(wgpu::InstanceDescriptor {
92                backends: wgpu::Backends::BROWSER_WEBGPU,
93                dx12_shader_compiler: wgpu::Dx12Compiler::Fxc,
94                flags: wgpu::InstanceFlags::default(),
95                gles_minor_version: wgpu::Gles3MinorVersion::Automatic,
96            })
97        } else {
98            wgpu::Instance::new(wgpu::InstanceDescriptor {
99                backends: wgpu::Backends::PRIMARY,
100                dx12_shader_compiler: wgpu::Dx12Compiler::Fxc,
101                flags: wgpu::InstanceFlags::default(),
102                gles_minor_version: wgpu::Gles3MinorVersion::Automatic,
103            })
104        };
105        let adapter = instance
106            .request_adapter(&wgpu::RequestAdapterOptions {
107                power_preference: wgpu::PowerPreference::HighPerformance,
108                compatible_surface: None,
109                force_fallback_adapter: false,
110            })
111            .await
112            .ok_or_else(|| GpuError::Unavailable("no suitable adapter found".into()))?;
113
114        let adapter_info = adapter.get_info();
115        let required_features = wgpu::Features::empty();
116
117        // On the web, some implementations reject `requestDevice` if optional limits
118        // (like `maxInterStageShaderComponents`) are provided. Use the portable
119        // WebGPU defaults there and keep the full adapter limits on native builds.
120        // On WebAssembly, keep the requested limits minimal. Some Chrome/Dawn
121        // builds reject `requestDevice` when optional limits like
122        // `maxInterStageShaderComponents` are present, even with default values.
123        // Using the WebGL2 downlevel defaults and zeroing the problematic
124        // fields avoids sending those limits entirely and makes the request
125        // portable across browser versions.
126        let required_limits = if cfg!(target_arch = "wasm32") {
127            let mut limits = wgpu::Limits::downlevel_webgl2_defaults();
128            // Avoid optional limits that Chrome/Dawn can reject when present.
129            limits.max_inter_stage_shader_components = 0;
130            limits
131        } else {
132            adapter.limits()
133        };
134
135        let (device, queue) = adapter
136            .request_device(
137                &wgpu::DeviceDescriptor {
138                    label: Some("zeldhash-miner-gpu-device"),
139                    required_features,
140                    required_limits,
141                },
142                None,
143            )
144            .await
145            .map_err(|e| GpuError::Unavailable(format!("request_device failed: {e}")))?;
146
147        Ok(Self {
148            device: Arc::new(device),
149            queue: Arc::new(queue),
150            adapter_info,
151            batch_size_cache: Arc::new(Mutex::new(None)),
152            pipeline_cache: Arc::new(Mutex::new(None)),
153            fixed_buffers: Arc::new(Mutex::new(None)),
154            io_buffers: Arc::new(Mutex::new(None)),
155        })
156    }
157
158    /// Human-readable description of the active adapter.
159    pub fn adapter_summary(&self) -> AdapterSummary {
160        AdapterSummary::from(self.adapter_info.clone())
161    }
162}
163
/// Mining parameters for a single batch.
///
/// Each attempt hashes `tx_prefix || nonce || tx_suffix` (see module docs);
/// the nonce ranges over `batch_size` consecutive values from `start_nonce`.
#[derive(Debug, Clone)]
pub struct MiningBatch<'a> {
    /// Serialized tx prefix (pre-nonce).
    pub tx_prefix: &'a [u8],
    /// Serialized tx suffix (post-nonce).
    pub tx_suffix: &'a [u8],
    /// Starting nonce for the batch.
    pub start_nonce: u64,
    /// Number of attempts.
    pub batch_size: u32,
    /// Target leading zeros (txid view).
    pub target_zeros: u8,
    /// When true, encode the nonce as CBOR (major type 0) rather than raw big-endian bytes.
    pub use_cbor_nonce: bool,
}
180
// Uniform-buffer parameter block handed to the shader. Field order, size, and
// padding must stay in sync with the matching WGSL struct.
#[repr(C)]
#[derive(Clone, Copy, Pod, Zeroable)]
struct MiningParams {
    // 64-bit start nonce split into u32 halves (WGSL has no native u64).
    start_nonce_lo: u32,
    start_nonce_hi: u32,
    batch_size: u32,
    target_zeros: u32,
    // Byte lengths; the backing buffers are padded to whole u32 words.
    prefix_len: u32,
    suffix_len: u32,
    // Fixed nonce byte length for the entire batch (see nonce_len_for_range).
    nonce_len: u32,
    use_cbor_nonce: u32, // bool flag (0 = raw, 1 = CBOR)
    _pad2: u32,          // reserved for future fields / alignment
    _pad3: u32,
    _pad4: u32,
    _pad5: u32,
}
197
// One reported match: 48 bytes, 16-byte aligned to match the WGSL storage
// layout (checked against RESULT_ENTRY_SIZE at compile time below).
#[repr(C, align(16))]
#[derive(Clone, Copy, Pod, Zeroable)]
struct ResultEntry {
    // Winning nonce split into u32 halves (reassembled in parse_results).
    nonce_lo: u32,
    nonce_hi: u32,
    // Hash as eight big-endian u32 words (flattened by to_u8_bytes).
    txid: [u32; 8],
    _tail_pad: [u32; 2],
}
206
// Align to 16 bytes to match WGSL storage layout expectations on all targets.
// Mirrors the shader's output buffer: a match counter followed by a fixed
// array of result slots. `found_count` may exceed MAX_RESULTS when more
// matches occur than there are slots (parse_results clamps the read count).
#[repr(C, align(16))]
#[derive(Clone, Copy, Pod, Zeroable)]
struct ResultBuffer {
    found_count: u32,
    _pad: u32, // alignment to 8-byte boundary for following array
    _align_pad: [u32; 2],
    results: [ResultEntry; MAX_RESULTS],
    _tail_pad: [u32; 2],
    _final_pad: [u32; 2],
}
218
// Compile-time layout sanity checks (must stay in sync with WGSL).
// The `const _: [(); N] = [(); size_of::<T>()]` pattern fails to compile if
// the Rust struct sizes ever drift from the hand-computed byte layout below.
#[allow(dead_code)]
const RESULT_ENTRY_SIZE: usize = 48;
#[allow(dead_code)]
const RESULT_BUFFER_HEADER: usize = 16; // found_count + _pad + _align_pad
#[allow(dead_code)]
const RESULT_BUFFER_TAIL: usize = 16; // _tail_pad + _final_pad
#[allow(dead_code)]
// Total size rounded up to the next multiple of 16 bytes.
const RESULT_BUFFER_SIZE: usize =
    ((RESULT_BUFFER_HEADER + (MAX_RESULTS * RESULT_ENTRY_SIZE) + RESULT_BUFFER_TAIL + 15) / 16)
        * 16;
const _: [(); RESULT_ENTRY_SIZE] = [(); std::mem::size_of::<ResultEntry>()];
const _: [(); RESULT_BUFFER_SIZE] = [(); std::mem::size_of::<ResultBuffer>()];
232
// Compiled compute pipeline plus its bind group layout (kept so per-dispatch
// bind groups can be built); cached in GpuContext::pipeline_cache.
#[cfg_attr(test, allow(dead_code))]
struct GpuPipeline {
    pipeline: wgpu::ComputePipeline,
    layout: wgpu::BindGroupLayout,
}
238
// Fixed-size buffers reused across dispatches: the shader-written result
// buffer and the MAP_READ staging buffer results are copied into for readback.
struct FixedBuffers {
    result: wgpu::Buffer,
    staging: wgpu::Buffer,
}
243
// Pooled input buffers (prefix/suffix storage, params uniform), grown on
// demand; each `_capacity` tracks the allocated byte size of its buffer.
struct IoBuffers {
    prefix: wgpu::Buffer,
    prefix_capacity: u64,
    suffix: wgpu::Buffer,
    suffix_capacity: u64,
    params: wgpu::Buffer,
    params_capacity: u64,
}

// Guard over the lazily-initialized buffer pool; holding it keeps the pooled
// buffers stable while a dispatch's bind group references them.
type IoBuffersCacheGuard<'a> = MutexGuard<'a, Option<IoBuffers>>;
254
/// Round a requested byte size up to a reusable allocation size.
///
/// Guarantees at least 16 bytes (no zero-sized buffers) and rounds up to the
/// next power of two so pooled buffers keep serving similar-sized requests.
fn min_capacity(size: u64) -> u64 {
    let floor = if size < 16 { 16 } else { size };
    floor.next_power_of_two()
}
259
260fn create_buffer(
261    device: &wgpu::Device,
262    label: &str,
263    size: u64,
264    usage: wgpu::BufferUsages,
265) -> wgpu::Buffer {
266    device.create_buffer(&wgpu::BufferDescriptor {
267        label: Some(label),
268        size,
269        usage,
270        mapped_at_creation: false,
271    })
272}
273
impl IoBuffers {
    // Allocate the initial pool, rounding each requested byte size up via
    // min_capacity (>= 16 bytes, power of two) so buffers can be reused.
    fn new(device: &wgpu::Device, prefix: u64, suffix: u64, params: u64) -> Self {
        let prefix_capacity = min_capacity(prefix);
        let suffix_capacity = min_capacity(suffix);
        let params_capacity = min_capacity(params);

        Self {
            prefix: create_buffer(
                device,
                "zeldhash-miner-gpu-prefix-pooled",
                prefix_capacity,
                wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            ),
            prefix_capacity,
            suffix: create_buffer(
                device,
                "zeldhash-miner-gpu-suffix-pooled",
                suffix_capacity,
                wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            ),
            suffix_capacity,
            params: create_buffer(
                device,
                "zeldhash-miner-gpu-params-pooled",
                params_capacity,
                wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            ),
            params_capacity,
        }
    }

    // Grow (never shrink) the pooled buffers to hold the requested byte
    // counts, erroring when a request can never fit the device's binding
    // limits. A replaced wgpu::Buffer is dropped and released by wgpu.
    fn ensure_capacity(
        &mut self,
        device: &wgpu::Device,
        prefix: u64,
        suffix: u64,
        params: u64,
        limits: &wgpu::Limits,
    ) -> Result<(), GpuError> {
        let max_storage: u64 = limits.max_storage_buffer_binding_size.into();
        let max_uniform: u64 = limits.max_uniform_buffer_binding_size.into();

        // Reject impossible requests up-front, before any allocation.
        if prefix > max_storage {
            return Err(GpuError::Internal(format!(
                "prefix buffer exceeds max storage binding size ({} > {})",
                prefix, max_storage
            )));
        }
        if suffix > max_storage {
            return Err(GpuError::Internal(format!(
                "suffix buffer exceeds max storage binding size ({} > {})",
                suffix, max_storage
            )));
        }
        if params > max_uniform {
            return Err(GpuError::Internal(format!(
                "params buffer exceeds max uniform binding size ({} > {})",
                params, max_uniform
            )));
        }

        // Power-of-two rounding can overshoot the device limit, so clamp;
        // the checks above guarantee the clamped size still fits the request.
        let needed_prefix = min_capacity(prefix).min(max_storage);
        if needed_prefix > self.prefix_capacity {
            self.prefix = create_buffer(
                device,
                "zeldhash-miner-gpu-prefix-pooled",
                needed_prefix,
                wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            );
            self.prefix_capacity = needed_prefix;
        }

        let needed_suffix = min_capacity(suffix).min(max_storage);
        if needed_suffix > self.suffix_capacity {
            self.suffix = create_buffer(
                device,
                "zeldhash-miner-gpu-suffix-pooled",
                needed_suffix,
                wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
            );
            self.suffix_capacity = needed_suffix;
        }

        let needed_params = min_capacity(params).min(max_uniform);
        if needed_params > self.params_capacity {
            self.params = create_buffer(
                device,
                "zeldhash-miner-gpu-params-pooled",
                needed_params,
                wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
            );
            self.params_capacity = needed_params;
        }

        Ok(())
    }
}
371
372fn get_or_create_io_buffers(
373    ctx: &GpuContext,
374    prefix: u64,
375    suffix: u64,
376    params: u64,
377) -> Result<IoBuffersCacheGuard<'_>, GpuError> {
378    let limits = ctx.device.limits();
379    let mut guard = ctx
380        .io_buffers
381        .lock()
382        .map_err(|_| GpuError::Internal("buffer cache poisoned".into()))?;
383
384    let buffers = guard.get_or_insert_with(|| IoBuffers::new(&ctx.device, prefix, suffix, params));
385    buffers.ensure_capacity(&ctx.device, prefix, suffix, params, &limits)?;
386
387    Ok(guard)
388}
389
390fn fallback_batch_size(info: &wgpu::AdapterInfo) -> u32 {
391    // Provide GPU-class aware defaults so we avoid overcommitting integrated GPUs
392    // while still giving discrete GPUs a throughput-friendly starting point.
393    match info.device_type {
394        wgpu::DeviceType::IntegratedGpu => 100_000,
395        wgpu::DeviceType::DiscreteGpu => 1_000_000,
396        wgpu::DeviceType::VirtualGpu => 200_000,
397        wgpu::DeviceType::Cpu => 25_000,
398        _ => 150_000,
399    }
400}
401
/// Byte length of the CBOR major-type-0 (unsigned integer) encoding of `value`:
/// 1 byte inline for 0..=23, otherwise a header byte plus a 1/2/4/8-byte payload.
fn cbor_nonce_len(value: u64) -> u32 {
    if value <= 23 {
        1
    } else if value <= 0xFF {
        2
    } else if value <= 0xFFFF {
        3
    } else if value <= 0xFFFF_FFFF {
        5
    } else {
        9
    }
}
411
412fn nonce_len_for_range(
413    start_nonce: u64,
414    batch_size: u32,
415    use_cbor_nonce: bool,
416) -> Result<u32, GpuError> {
417    if batch_size == 0 {
418        return Err(GpuError::Internal("batch_size must be positive".into()));
419    }
420    let last = start_nonce
421        .checked_add(batch_size as u64 - 1)
422        .ok_or_else(|| GpuError::Internal("nonce range overflow".into()))?;
423
424    let (start_len, last_len) = if use_cbor_nonce {
425        (cbor_nonce_len(start_nonce), cbor_nonce_len(last))
426    } else {
427        (
428            encode_nonce(start_nonce).len() as u32,
429            encode_nonce(last).len() as u32,
430        )
431    };
432
433    if start_len != last_len {
434        return Err(GpuError::Internal(
435            "nonce range crosses byte-length boundary; split batch".into(),
436        ));
437    }
438    Ok(start_len)
439}
440
/// Pack bytes into little-endian u32 words, zero-padding the final partial word.
///
/// The shader consumes prefix/suffix data as `array<u32>`, so byte inputs are
/// widened to whole words; an empty input yields an empty vector.
///
/// Replaces the previous copy-then-push-padding approach: packing straight
/// from `chunks(4)` avoids the intermediate `Vec<u8>` allocation.
fn pad_bytes_to_words(bytes: &[u8]) -> Vec<u32> {
    bytes
        .chunks(4)
        .map(|chunk| {
            let mut word = [0u8; 4];
            word[..chunk.len()].copy_from_slice(chunk);
            u32::from_le_bytes(word)
        })
        .collect()
}
451
/// Flatten eight big-endian u32 hash words into the 32-byte txid form.
fn to_u8_bytes(words: &[u32; 8]) -> [u8; 32] {
    let mut out = [0u8; 32];
    for (dst, word) in out.chunks_exact_mut(4).zip(words.iter()) {
        dst.copy_from_slice(&word.to_be_bytes());
    }
    out
}
459
460#[cfg_attr(test, allow(dead_code))]
461fn create_shader_module(ctx: &GpuContext) -> wgpu::ShaderModule {
462    ctx.device
463        .create_shader_module(wgpu::ShaderModuleDescriptor {
464            label: Some("zeldhash-miner-gpu-miner-shader-wgsl"),
465            source: wgpu::ShaderSource::Wgsl(Cow::Borrowed(SHADER_WGSL)),
466        })
467}
468
// Build the mining compute pipeline and its bind group layout.
//
// Binding indices (must stay in sync with shader.wgsl and create_buffers):
//   0: tx prefix words  — read-only storage
//   1: tx suffix words  — read-only storage
//   2: MiningParams     — uniform
//   3: ResultBuffer     — read-write storage
fn build_pipeline(ctx: &GpuContext) -> Result<GpuPipeline, GpuError> {
    let shader = create_shader_module(ctx);

    let layout = ctx
        .device
        .create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
            label: Some("zeldhash-miner-gpu-bind-layout"),
            entries: &[
                // Binding 0: prefix bytes packed as u32 words.
                wgpu::BindGroupLayoutEntry {
                    binding: 0,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: None,
                    },
                    count: None,
                },
                // Binding 1: suffix bytes packed as u32 words.
                wgpu::BindGroupLayoutEntry {
                    binding: 1,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: true },
                        has_dynamic_offset: false,
                        min_binding_size: None,
                    },
                    count: None,
                },
                // Binding 2: params uniform; require at least the struct's size.
                wgpu::BindGroupLayoutEntry {
                    binding: 2,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Uniform,
                        has_dynamic_offset: false,
                        min_binding_size: NonZeroU64::new(
                            std::mem::size_of::<MiningParams>() as u64
                        ),
                    },
                    count: None,
                },
                // Binding 3: writable results; require at least the struct's size.
                wgpu::BindGroupLayoutEntry {
                    binding: 3,
                    visibility: wgpu::ShaderStages::COMPUTE,
                    ty: wgpu::BindingType::Buffer {
                        ty: wgpu::BufferBindingType::Storage { read_only: false },
                        has_dynamic_offset: false,
                        min_binding_size: NonZeroU64::new(
                            std::mem::size_of::<ResultBuffer>() as u64
                        ),
                    },
                    count: None,
                },
            ],
        });

    let pipeline_layout = ctx
        .device
        .create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
            label: Some("zeldhash-miner-gpu-pipeline-layout"),
            bind_group_layouts: &[&layout],
            push_constant_ranges: &[],
        });

    let pipeline = ctx
        .device
        .create_compute_pipeline(&wgpu::ComputePipelineDescriptor {
            label: Some("zeldhash-miner-gpu-miner"),
            layout: Some(&pipeline_layout),
            module: &shader,
            entry_point: "main",
        });

    Ok(GpuPipeline { pipeline, layout })
}
543
544fn get_or_create_pipeline(ctx: &GpuContext) -> Result<Arc<GpuPipeline>, GpuError> {
545    if let Ok(mut cache) = ctx.pipeline_cache.lock() {
546        if let Some(p) = cache.as_ref() {
547            return Ok(p.clone());
548        }
549        let built = Arc::new(build_pipeline(ctx)?);
550        *cache = Some(built.clone());
551        return Ok(built);
552    }
553
554    // Fallback if the mutex is poisoned.
555    Ok(Arc::new(build_pipeline(ctx)?))
556}
557
558fn get_or_create_fixed_buffers(ctx: &GpuContext) -> Result<Arc<FixedBuffers>, GpuError> {
559    let size = std::mem::size_of::<ResultBuffer>() as u64;
560
561    if let Ok(mut cache) = ctx.fixed_buffers.lock() {
562        if let Some(bufs) = cache.as_ref() {
563            return Ok(bufs.clone());
564        }
565
566        let result = ctx.device.create_buffer(&wgpu::BufferDescriptor {
567            label: Some("zeldhash-miner-gpu-results"),
568            size,
569            usage: wgpu::BufferUsages::STORAGE
570                | wgpu::BufferUsages::COPY_SRC
571                | wgpu::BufferUsages::COPY_DST,
572            mapped_at_creation: false,
573        });
574
575        let staging = ctx.device.create_buffer(&wgpu::BufferDescriptor {
576            label: Some("zeldhash-miner-gpu-result-staging"),
577            size,
578            usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
579            mapped_at_creation: false,
580        });
581
582        let fixed = Arc::new(FixedBuffers { result, staging });
583        *cache = Some(fixed.clone());
584        return Ok(fixed);
585    }
586
587    // Fallback if the mutex is poisoned.
588    let result = ctx.device.create_buffer(&wgpu::BufferDescriptor {
589        label: Some("zeldhash-miner-gpu-results"),
590        size,
591        usage: wgpu::BufferUsages::STORAGE
592            | wgpu::BufferUsages::COPY_SRC
593            | wgpu::BufferUsages::COPY_DST,
594        mapped_at_creation: false,
595    });
596    let staging = ctx.device.create_buffer(&wgpu::BufferDescriptor {
597        label: Some("zeldhash-miner-gpu-result-staging"),
598        size,
599        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
600        mapped_at_creation: false,
601    });
602    Ok(Arc::new(FixedBuffers { result, staging }))
603}
604
// Upload the batch's inputs into the pooled I/O buffers and build the bind
// group for one dispatch (binding 3 is the caller-provided result buffer).
//
// The pool guard stays alive for the whole function, so the pooled buffers
// cannot be replaced while the bind group is being built from them.
#[cfg_attr(test, allow(dead_code))]
fn create_buffers(
    ctx: &GpuContext,
    pipeline: &GpuPipeline,
    batch: &MiningBatch<'_>,
    nonce_len: u32,
    result_buf: &wgpu::Buffer,
) -> Result<wgpu::BindGroup, GpuError> {
    // The shader consumes inputs as u32 words; pad byte inputs up-front.
    let prefix_words = pad_bytes_to_words(batch.tx_prefix);
    let suffix_words = pad_bytes_to_words(batch.tx_suffix);

    let prefix_size = (prefix_words.len() * std::mem::size_of::<u32>()) as u64;
    let suffix_size = (suffix_words.len() * std::mem::size_of::<u32>()) as u64;
    let params_size = std::mem::size_of::<MiningParams>() as u64;

    let buffers_guard = get_or_create_io_buffers(ctx, prefix_size, suffix_size, params_size)?;
    // get_or_create_io_buffers always initializes the pool before returning.
    let buffers = buffers_guard
        .as_ref()
        .expect("io buffers must be initialized before use");

    // Skip zero-length uploads; actual byte lengths are conveyed to the
    // shader via `prefix_len`/`suffix_len` in MiningParams.
    if !prefix_words.is_empty() {
        ctx.queue
            .write_buffer(&buffers.prefix, 0, cast_slice(&prefix_words));
    }
    if !suffix_words.is_empty() {
        ctx.queue
            .write_buffer(&buffers.suffix, 0, cast_slice(&suffix_words));
    }

    // Split the 64-bit start nonce into u32 halves for the shader.
    let params = MiningParams {
        start_nonce_lo: batch.start_nonce as u32,
        start_nonce_hi: (batch.start_nonce >> 32) as u32,
        batch_size: batch.batch_size,
        target_zeros: batch.target_zeros as u32,
        prefix_len: batch.tx_prefix.len() as u32,
        suffix_len: batch.tx_suffix.len() as u32,
        nonce_len,
        use_cbor_nonce: batch.use_cbor_nonce as u32,
        _pad2: 0,
        _pad3: 0,
        _pad4: 0,
        _pad5: 0,
    };
    ctx.queue.write_buffer(
        &buffers.params,
        0,
        cast_slice(std::slice::from_ref(&params)),
    );

    // Binding indices must line up with the layout built in build_pipeline.
    let bind_group = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
        label: Some("zeldhash-miner-gpu-bind-group"),
        layout: &pipeline.layout,
        entries: &[
            wgpu::BindGroupEntry {
                binding: 0,
                resource: buffers.prefix.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 1,
                resource: buffers.suffix.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 2,
                resource: buffers.params.as_entire_binding(),
            },
            wgpu::BindGroupEntry {
                binding: 3,
                resource: result_buf.as_entire_binding(),
            },
        ],
    });

    Ok(bind_group)
}
679
680#[cfg_attr(test, allow(dead_code))]
681fn parse_results(mapped: &[u8]) -> Vec<MineResult> {
682    let required = std::mem::size_of::<ResultBuffer>();
683    if mapped.len() < required {
684        return Vec::new();
685    }
686
687    // Browser WebGPU can return a mapped slice that is not aligned to the
688    // 16-byte boundary required by ResultBuffer. Read with an unaligned helper
689    // to avoid panicking in bytemuck when the pointer is misaligned.
690    let buffer: ResultBuffer = pod_read_unaligned(mapped);
691    let found = buffer.found_count as usize;
692    let take = found.min(MAX_RESULTS);
693
694    let mut out = Vec::with_capacity(take);
695    for entry in buffer.results.iter().take(take) {
696        let nonce = ((entry.nonce_hi as u64) << 32) | entry.nonce_lo as u64;
697        out.push(MineResult {
698            nonce,
699            txid: to_u8_bytes(&entry.txid),
700        });
701    }
702    out
703}
704
// Execute one mining dispatch and read back any matches found.
//
// Flow: zero the shared result buffer, upload inputs (create_buffers), encode
// a compute pass covering `batch_size` threads, copy results into the staging
// buffer, map it asynchronously, then parse the entries.
async fn dispatch_gpu(
    ctx: &GpuContext,
    batch: &MiningBatch<'_>,
    nonce_len: u32,
) -> Result<Vec<MineResult>, GpuError> {
    // Nothing to do; avoids a zero-workgroup dispatch and buffer round-trip.
    if batch.batch_size == 0 {
        return Ok(Vec::new());
    }

    let pipeline = get_or_create_pipeline(ctx)?;
    let fixed = get_or_create_fixed_buffers(ctx)?;

    // Clear the shared result buffer before dispatch.
    let zero_template = vec![0u8; std::mem::size_of::<ResultBuffer>()];
    ctx.queue.write_buffer(&fixed.result, 0, &zero_template);

    let bind_group = create_buffers(ctx, &pipeline, batch, nonce_len, &fixed.result)?;

    let mut encoder = ctx
        .device
        .create_command_encoder(&wgpu::CommandEncoderDescriptor {
            label: Some("zeldhash-miner-gpu-encoder"),
        });

    // Scope the compute pass so its encoder borrow ends before finish().
    {
        let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
            label: Some("zeldhash-miner-gpu-compute-pass"),
            timestamp_writes: None,
        });
        cpass.set_pipeline(&pipeline.pipeline);
        cpass.set_bind_group(0, &bind_group, &[]);
        // Ceiling division: enough workgroups to cover every attempt.
        let groups = (batch.batch_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE;
        cpass.dispatch_workgroups(groups, 1, 1);
    }

    // Copy device-local results into the MAP_READ staging buffer.
    encoder.copy_buffer_to_buffer(
        &fixed.result,
        0,
        &fixed.staging,
        0,
        std::mem::size_of::<ResultBuffer>() as u64,
    );

    ctx.queue.submit(Some(encoder.finish()));

    // map_async completes via callback; forward its outcome over a oneshot
    // channel so this async fn can await it.
    let (sender, receiver) = oneshot::channel();
    fixed
        .staging
        .slice(..)
        .map_async(wgpu::MapMode::Read, move |res| {
            let _ = sender.send(res);
        });

    // Drive the device so the map callback fires.
    // NOTE(review): on wasm the browser drives the device and this is likely
    // a no-op — confirm behavior there.
    ctx.device.poll(wgpu::Maintain::Wait);

    receiver
        .await
        .map_err(|e| GpuError::Internal(format!("failed to receive map result: {e}")))?
        .map_err(|e| GpuError::Internal(format!("failed to map results: {e:?}")))?;

    let data = fixed.staging.slice(..).get_mapped_range();
    let parsed = parse_results(&data);
    // The mapped view must be dropped before unmap().
    drop(data);
    fixed.staging.unmap();
    Ok(parsed)
}
771
772/// Dispatch a mining batch on the GPU and return all matching nonces found.
773pub async fn dispatch_mining_batch(
774    ctx: &GpuContext,
775    batch: &MiningBatch<'_>,
776) -> Result<Vec<MineResult>, GpuError> {
777    let nonce_len = nonce_len_for_range(batch.start_nonce, batch.batch_size, batch.use_cbor_nonce)?;
778    dispatch_gpu(ctx, batch, nonce_len).await
779}
780
781/// Calibrate an approximate batch size for the current adapter.
782pub async fn calibrate_batch_size(ctx: &GpuContext) -> Result<u32, GpuError> {
783    // Return cached value when available to avoid re-running calibration.
784    if let Ok(cache) = ctx.batch_size_cache.lock() {
785        if let Some(value) = *cache {
786            return Ok(value);
787        }
788    }
789
790    // Include small and large samples to better fit a wide range of adapters.
791    let candidates = [1_000u32, 10_000, 100_000, 1_000_000];
792    let mut best = 100_000u32;
793    let mut best_hps = 0.0f64;
794
795    // Use minimal non-empty buffers so create_buffer_init never receives zero-length data.
796    const DUMMY: &[u8] = &[0u8];
797    let pipeline = get_or_create_pipeline(ctx)?;
798
799    let prefix_words = pad_bytes_to_words(DUMMY);
800    let suffix_words = pad_bytes_to_words(DUMMY);
801    let params_template = MiningParams {
802        start_nonce_lo: 0,
803        start_nonce_hi: 0,
804        batch_size: 1,
805        target_zeros: 64, // effectively impossible, keeps kernel busy
806        prefix_len: DUMMY.len() as u32,
807        suffix_len: DUMMY.len() as u32,
808        nonce_len: 1,
809        use_cbor_nonce: 0,
810        _pad2: 0,
811        _pad3: 0,
812        _pad4: 0,
813        _pad5: 0,
814    };
815
816    let prefix_buf = ctx
817        .device
818        .create_buffer_init(&wgpu::util::BufferInitDescriptor {
819            label: Some("zeldhash-miner-gpu-prefix-calibration"),
820            contents: cast_slice(&prefix_words),
821            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
822        });
823    let suffix_buf = ctx
824        .device
825        .create_buffer_init(&wgpu::util::BufferInitDescriptor {
826            label: Some("zeldhash-miner-gpu-suffix-calibration"),
827            contents: cast_slice(&suffix_words),
828            usage: wgpu::BufferUsages::STORAGE | wgpu::BufferUsages::COPY_DST,
829        });
830    let params_buf = ctx
831        .device
832        .create_buffer_init(&wgpu::util::BufferInitDescriptor {
833            label: Some("zeldhash-miner-gpu-params-calibration"),
834            contents: cast_slice(std::slice::from_ref(&params_template)),
835            usage: wgpu::BufferUsages::UNIFORM | wgpu::BufferUsages::COPY_DST,
836        });
837    let result_buf = ctx.device.create_buffer(&wgpu::BufferDescriptor {
838        label: Some("zeldhash-miner-gpu-results-calibration"),
839        size: std::mem::size_of::<ResultBuffer>() as u64,
840        usage: wgpu::BufferUsages::STORAGE
841            | wgpu::BufferUsages::COPY_SRC
842            | wgpu::BufferUsages::COPY_DST,
843        mapped_at_creation: false,
844    });
845    let staging = ctx.device.create_buffer(&wgpu::BufferDescriptor {
846        label: Some("zeldhash-miner-gpu-result-staging-calibration"),
847        size: std::mem::size_of::<ResultBuffer>() as u64,
848        usage: wgpu::BufferUsages::MAP_READ | wgpu::BufferUsages::COPY_DST,
849        mapped_at_creation: false,
850    });
851
852    let bind_group = ctx.device.create_bind_group(&wgpu::BindGroupDescriptor {
853        label: Some("zeldhash-miner-gpu-bind-group-calibration"),
854        layout: &pipeline.layout,
855        entries: &[
856            wgpu::BindGroupEntry {
857                binding: 0,
858                resource: prefix_buf.as_entire_binding(),
859            },
860            wgpu::BindGroupEntry {
861                binding: 1,
862                resource: suffix_buf.as_entire_binding(),
863            },
864            wgpu::BindGroupEntry {
865                binding: 2,
866                resource: params_buf.as_entire_binding(),
867            },
868            wgpu::BindGroupEntry {
869                binding: 3,
870                resource: result_buf.as_entire_binding(),
871            },
872        ],
873    });
874
875    let zero_template = vec![0u8; std::mem::size_of::<ResultBuffer>()];
876
877    for &size in &candidates {
878        let mut params = params_template;
879        params.batch_size = size;
880
881        ctx.queue
882            .write_buffer(&params_buf, 0, cast_slice(std::slice::from_ref(&params)));
883        ctx.queue.write_buffer(&result_buf, 0, &zero_template);
884
885        let start = Instant::now();
886        let mut encoder = ctx
887            .device
888            .create_command_encoder(&wgpu::CommandEncoderDescriptor {
889                label: Some("zeldhash-miner-gpu-calibration-encoder"),
890            });
891        {
892            let mut cpass = encoder.begin_compute_pass(&wgpu::ComputePassDescriptor {
893                label: Some("zeldhash-miner-gpu-calibration-pass"),
894                timestamp_writes: None,
895            });
896            cpass.set_pipeline(&pipeline.pipeline);
897            cpass.set_bind_group(0, &bind_group, &[]);
898            let groups = (size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE;
899            cpass.dispatch_workgroups(groups, 1, 1);
900        }
901
902        encoder.copy_buffer_to_buffer(
903            &result_buf,
904            0,
905            &staging,
906            0,
907            std::mem::size_of::<ResultBuffer>() as u64,
908        );
909
910        ctx.queue.submit(Some(encoder.finish()));
911        let (sender, receiver) = oneshot::channel();
912        staging
913            .slice(..)
914            .map_async(wgpu::MapMode::Read, move |res| {
915                let _ = sender.send(res);
916            });
917
918        ctx.device.poll(wgpu::Maintain::Wait);
919
920        receiver
921            .await
922            .map_err(|e| GpuError::Internal(format!("failed to receive map result: {e}")))?
923            .map_err(|e| GpuError::Internal(format!("failed to map results: {e:?}")))?;
924
925        // We do not parse results; mapping ensures the workload finished.
926        staging.unmap();
927
928        let elapsed = start.elapsed().as_secs_f64();
929        if elapsed == 0.0 {
930            continue;
931        }
932        let hps = size as f64 / elapsed;
933        if hps > best_hps {
934            best_hps = hps;
935            best = size;
936        }
937    }
938
939    let best_final = if best_hps == 0.0 {
940        fallback_batch_size(&ctx.adapter_info)
941    } else {
942        best
943    };
944
945    if let Ok(mut cache) = ctx.batch_size_cache.lock() {
946        *cache = Some(best_final);
947    }
948
949    Ok(best_final)
950}
951
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Reference CPU miner: hashes `tx_prefix || nonce || tx_suffix` for every
    /// nonce in the batch and collects the ones meeting the target, exactly
    /// like the shader is expected to.
    fn cpu_mine(batch: &MiningBatch<'_>) -> Vec<MineResult> {
        let nonce_len =
            nonce_len_for_range(batch.start_nonce, batch.batch_size, batch.use_cbor_nonce)
                .expect("valid nonce range") as usize;
        // One reusable message buffer across all nonces.
        let mut message = Vec::new();
        let mut winners = Vec::new();
        // Skip offsets that would overflow the 64-bit nonce space.
        let nonces = (0..batch.batch_size)
            .filter_map(|offset| batch.start_nonce.checked_add(u64::from(offset)));
        for nonce in nonces {
            message.clear();
            message.extend_from_slice(batch.tx_prefix);
            if batch.use_cbor_nonce {
                let encoded = zeldhash_miner_core::cbor::encode_cbor_uint(nonce);
                // The fixed nonce width chosen for the range must match the
                // actual CBOR encoding of every nonce in it.
                assert_eq!(encoded.len(), nonce_len);
                message.extend_from_slice(&encoded);
            } else {
                // Raw encoding: trailing `nonce_len` bytes of the big-endian u64.
                let be = nonce.to_be_bytes();
                message.extend_from_slice(&be[8 - nonce_len..]);
            }
            message.extend_from_slice(batch.tx_suffix);
            let hash = zeldhash_miner_core::double_sha256(&message);
            if zeldhash_miner_core::hash_meets_target(&hash, batch.target_zeros) {
                winners.push(MineResult { nonce, txid: hash });
            }
        }
        winners
    }

    #[test]
    fn pads_bytes_to_words() {
        // Three bytes pack little-endian into a single u32 word.
        let packed = pad_bytes_to_words(&[0x01, 0x02, 0x03]);
        assert_eq!(packed.len(), 1);
        assert_eq!(packed[0], 0x030201);
    }

    #[test]
    fn converts_words_to_bytes() {
        // Each word is unpacked big-endian (most significant byte first).
        let bytes = to_u8_bytes(&[0x11223344u32; 8]);
        assert_eq!(&bytes[..4], &[0x11, 0x22, 0x33, 0x44]);
    }

    #[test]
    fn gpu_matches_cpu_when_available() {
        let ctx = match pollster::block_on(GpuContext::init()) {
            Ok(ctx) => ctx,
            // No WebGPU adapter (e.g. headless CI) — skip silently.
            Err(_) => return,
        };

        let batch = MiningBatch {
            tx_prefix: b"hello",
            tx_suffix: b"world",
            start_nonce: 0,
            batch_size: 64,
            target_zeros: 1,
            use_cbor_nonce: false,
        };

        let mut expected = cpu_mine(&batch);
        let mut actual = pollster::block_on(dispatch_mining_batch(&ctx, &batch)).unwrap();

        // Order is not guaranteed; compare as sorted sets of results.
        expected.sort_by_key(|r| r.nonce);
        actual.sort_by_key(|r| r.nonce);
        assert_eq!(expected, actual);
    }

    #[test]
    fn gpu_collects_multiple_results_up_to_max_when_available() {
        let ctx = match pollster::block_on(GpuContext::init()) {
            Ok(ctx) => ctx,
            // No WebGPU adapter (e.g. headless CI) — skip silently.
            Err(_) => return,
        };

        // With a zero target every hash matches, so the result buffer must be
        // capped at MAX_RESULTS even though the batch produces more hits.
        let batch = MiningBatch {
            tx_prefix: b"a",
            tx_suffix: b"b",
            start_nonce: 0,
            batch_size: (MAX_RESULTS as u32) + 2,
            target_zeros: 0,
            use_cbor_nonce: false,
        };

        let results =
            pollster::block_on(dispatch_mining_batch(&ctx, &batch)).expect("gpu dispatch failed");
        assert_eq!(results.len(), MAX_RESULTS.min(batch.batch_size as usize));
    }

    #[test]
    fn integrated_gpu_target_hash_rate_calculation() {
        // Uses the integrated-GPU fallback batch size plus an assumed short
        // dispatch to check the rate math clears the 10 MH/s target, without
        // depending on real hardware.
        let make_info = |name: &str, device_type| wgpu::AdapterInfo {
            name: name.to_owned(),
            vendor: 0,
            device: 0,
            device_type,
            backend: wgpu::Backend::Vulkan,
            driver: String::new(),
            driver_info: String::new(),
        };

        let integrated =
            fallback_batch_size(&make_info("test-integrated", wgpu::DeviceType::IntegratedGpu));
        assert_eq!(integrated, 100_000);

        let discrete =
            fallback_batch_size(&make_info("test-discrete", wgpu::DeviceType::DiscreteGpu));
        assert!(discrete > integrated);

        let elapsed = Duration::from_millis(5); // 0.005s dispatch
        let rate = integrated as f64 / elapsed.as_secs_f64();
        assert!(rate >= 10_000_000.0, "expected >= 10 MH/s, got {rate} H/s");
    }
}
1085}