Skip to main content

oxiphysics_gpu/compute/
types.rs

1//! Auto-generated module
2//!
3//! 🤖 Generated with [SplitRS](https://github.com/cool-japan/splitrs)
4
5#![allow(clippy::manual_div_ceil)]
6use std::cell::RefCell;
7use std::collections::HashMap;
8
9#[allow(unused_imports)]
10use super::functions::*;
11
12/// Records a sequence of kernel dispatches for batched execution.
13///
14/// Compute passes accumulate dispatch commands and execute them in order.
15#[allow(dead_code)]
16pub struct ComputePass {
17    /// Recorded dispatch commands: (kernel_name, work_size).
18    pub(super) commands: Vec<(String, usize)>,
19}
20#[allow(dead_code)]
21impl ComputePass {
22    /// Create a new empty compute pass.
23    pub fn new() -> Self {
24        Self {
25            commands: Vec::new(),
26        }
27    }
28    /// Record a dispatch command.
29    pub fn dispatch(&mut self, kernel_name: &str, work_size: usize) {
30        self.commands.push((kernel_name.to_string(), work_size));
31    }
32    /// Return the number of recorded commands.
33    pub fn num_commands(&self) -> usize {
34        self.commands.len()
35    }
36    /// Return the recorded commands.
37    pub fn commands(&self) -> &[(String, usize)] {
38        &self.commands
39    }
40    /// Clear all recorded commands.
41    pub fn clear(&mut self) {
42        self.commands.clear();
43    }
44    /// Total work items across all recorded dispatches.
45    pub fn total_work_items(&self) -> usize {
46        self.commands.iter().map(|(_, ws)| ws).sum()
47    }
48}
49/// Describes how a buffer is used in a compute pass.
50#[derive(Debug, Clone, Copy, PartialEq, Eq)]
51#[allow(dead_code)]
52pub enum BufferUsage {
53    /// Buffer is read-only (storage, read).
54    ReadOnly,
55    /// Buffer is write-only (storage, read_write but only written).
56    WriteOnly,
57    /// Buffer is read-write.
58    ReadWrite,
59    /// Buffer is a uniform (small, read-only parameters).
60    Uniform,
61}
62/// A single GPU command entry.
63#[derive(Debug, Clone)]
64#[allow(dead_code)]
65pub enum GpuCommand {
66    /// Copy from one buffer to another.
67    CopyBuffer {
68        /// Source buffer identifier.
69        src: BufferId,
70        /// Destination buffer identifier.
71        dst: BufferId,
72        /// Number of bytes to copy.
73        size: usize,
74    },
75    /// Dispatch a compute kernel.
76    DispatchCompute {
77        /// Name of the compute kernel.
78        kernel_name: String,
79        /// Workgroup counts for each dimension.
80        workgroups: [u32; 3],
81    },
82    /// Insert a pipeline barrier.
83    Barrier(PipelineBarrier),
84    /// Set a push constant value.
85    PushConstant {
86        /// Push constant name.
87        name: String,
88        /// Push constant value.
89        value: f64,
90    },
91}
92/// Tracks buffer lifecycle (creation, writes, reads) for debugging.
93#[allow(dead_code)]
94pub struct ResourceLifecycle {
95    pub(super) events: Vec<ResourceEvent>,
96}
97#[allow(dead_code)]
98impl ResourceLifecycle {
99    /// Create a new lifecycle tracker.
100    pub fn new() -> Self {
101        Self { events: Vec::new() }
102    }
103    /// Record a creation event.
104    pub fn record_create(&mut self, id: BufferId, size: usize) {
105        self.events.push(ResourceEvent::Created(id, size));
106    }
107    /// Record a write event.
108    pub fn record_write(&mut self, id: BufferId) {
109        self.events.push(ResourceEvent::Written(id));
110    }
111    /// Record a read event.
112    pub fn record_read(&mut self, id: BufferId) {
113        self.events.push(ResourceEvent::Read(id));
114    }
115    /// Record a destroy event.
116    pub fn record_destroy(&mut self, id: BufferId) {
117        self.events.push(ResourceEvent::Destroyed(id));
118    }
119    /// Return all events.
120    pub fn events(&self) -> &[ResourceEvent] {
121        &self.events
122    }
123    /// Return the number of events recorded.
124    pub fn len(&self) -> usize {
125        self.events.len()
126    }
127    /// Check if no events have been recorded.
128    pub fn is_empty(&self) -> bool {
129        self.events.is_empty()
130    }
131    /// Clear all events.
132    pub fn clear(&mut self) {
133        self.events.clear();
134    }
135    /// Count events of a specific type for a given buffer.
136    pub fn count_writes(&self, id: BufferId) -> usize {
137        self.events
138            .iter()
139            .filter(|e| matches!(e, ResourceEvent::Written(bid) if * bid == id))
140            .count()
141    }
142    /// Count reads for a given buffer.
143    pub fn count_reads(&self, id: BufferId) -> usize {
144        self.events
145            .iter()
146            .filter(|e| matches!(e, ResourceEvent::Read(bid) if * bid == id))
147            .count()
148    }
149}
150/// Specifies the type of pipeline barrier needed between passes.
151#[derive(Debug, Clone, PartialEq, Eq)]
152#[allow(dead_code)]
153pub enum PipelineBarrier {
154    /// Ensure all writes to storage buffers are visible before reading.
155    StorageReadAfterWrite,
156    /// Ensure all writes to uniform buffers are visible.
157    UniformReadAfterWrite,
158    /// Full barrier (all types).
159    Full,
160    /// No barrier needed.
161    None,
162}
163/// GPU occupancy model (simplified).
164///
165/// Models occupancy as the ratio of active warps to maximum concurrent warps.
166#[derive(Debug, Clone)]
167#[allow(dead_code)]
168pub struct OccupancyModel {
169    /// Total number of compute units (SMs / CUs).
170    pub compute_units: u32,
171    /// Maximum warps per compute unit.
172    pub max_warps_per_cu: u32,
173    /// Warp size (threads per warp, typically 32 for NVIDIA or 64 for AMD).
174    pub warp_size: u32,
175    /// Shared memory per compute unit (bytes).
176    pub shared_mem_per_cu: u32,
177    /// Registers per compute unit.
178    pub registers_per_cu: u32,
179}
180impl OccupancyModel {
181    /// Create a model resembling a mid-range discrete GPU.
182    #[allow(dead_code)]
183    pub fn mid_range() -> Self {
184        Self {
185            compute_units: 32,
186            max_warps_per_cu: 32,
187            warp_size: 32,
188            shared_mem_per_cu: 48 * 1024,
189            registers_per_cu: 65536,
190        }
191    }
192    /// Estimate the theoretical occupancy (0.0–1.0) for a kernel.
193    ///
194    /// Occupancy is limited by:
195    /// 1. Workgroup size (must not exceed warp_size * max_warps_per_cu).
196    /// 2. Shared memory usage.
197    /// 3. Register usage.
198    #[allow(dead_code)]
199    pub fn estimate_occupancy(
200        &self,
201        workgroup_size: u32,
202        shared_mem_bytes: u32,
203        registers_per_thread: u32,
204    ) -> f64 {
205        let warps_per_wg = workgroup_size.div_ceil(self.warp_size);
206        let max_wg_by_warps = self.max_warps_per_cu / warps_per_wg.max(1);
207        let max_wg_by_smem = self
208            .shared_mem_per_cu
209            .checked_div(shared_mem_bytes)
210            .unwrap_or(u32::MAX);
211        let regs_per_wg = registers_per_thread * workgroup_size;
212        let max_wg_by_regs = self
213            .registers_per_cu
214            .checked_div(regs_per_wg)
215            .unwrap_or(u32::MAX);
216        let active_wg = max_wg_by_warps.min(max_wg_by_smem).min(max_wg_by_regs);
217        let active_warps = (active_wg * warps_per_wg).min(self.max_warps_per_cu);
218        (active_warps as f64 / self.max_warps_per_cu as f64).clamp(0.0, 1.0)
219    }
220    /// Total theoretical peak throughput in GFLOP/s (mock model).
221    ///
222    /// Assumes 2 FP32 ops per clock per SIMD unit.
223    #[allow(dead_code)]
224    pub fn peak_gflops(&self, clock_mhz: f64) -> f64 {
225        let simd_width = self.warp_size as f64;
226        2.0 * simd_width * self.compute_units as f64 * clock_mhz * 1e6 / 1e9
227    }
228}
229/// A recorded sequence of GPU commands (mock encoder).
230///
231/// Records commands for later submission; models wgpu-style recording.
232#[allow(dead_code)]
233pub struct GpuCommandEncoder {
234    pub(super) label: String,
235    pub(super) commands: Vec<GpuCommand>,
236}
237#[allow(dead_code)]
238impl GpuCommandEncoder {
239    /// Create a new command encoder with a debug label.
240    pub fn new(label: impl Into<String>) -> Self {
241        Self {
242            label: label.into(),
243            commands: Vec::new(),
244        }
245    }
246    /// Record a buffer copy command.
247    pub fn copy_buffer(&mut self, src: BufferId, dst: BufferId, size: usize) {
248        self.commands
249            .push(GpuCommand::CopyBuffer { src, dst, size });
250    }
251    /// Record a compute dispatch.
252    pub fn dispatch_compute(&mut self, kernel_name: &str, workgroups: [u32; 3]) {
253        self.commands.push(GpuCommand::DispatchCompute {
254            kernel_name: kernel_name.to_string(),
255            workgroups,
256        });
257    }
258    /// Insert a pipeline barrier.
259    pub fn insert_barrier(&mut self, barrier: PipelineBarrier) {
260        self.commands.push(GpuCommand::Barrier(barrier));
261    }
262    /// Set a named push constant.
263    pub fn push_constant(&mut self, name: &str, value: f64) {
264        self.commands.push(GpuCommand::PushConstant {
265            name: name.to_string(),
266            value,
267        });
268    }
269    /// Return the label of this encoder.
270    pub fn label(&self) -> &str {
271        &self.label
272    }
273    /// Number of recorded commands.
274    pub fn command_count(&self) -> usize {
275        self.commands.len()
276    }
277    /// Return the recorded commands.
278    pub fn commands(&self) -> &[GpuCommand] {
279        &self.commands
280    }
281    /// Reset the encoder (clear recorded commands).
282    pub fn reset(&mut self) {
283        self.commands.clear();
284    }
285    /// "Submit" the recorded commands: replay them on the dispatcher.
286    ///
287    /// For copy commands, data is transferred between buffers.
288    /// Other commands are noted but not executed (they are mock-only).
289    pub fn submit(&self, dispatcher: &mut ComputeDispatcher) -> Result<(), GpuError> {
290        for cmd in &self.commands {
291            if let GpuCommand::CopyBuffer { src, dst, .. } = cmd {
292                dispatcher.copy_buffer(*src, *dst)?;
293            }
294        }
295        Ok(())
296    }
297}
298/// Manages GPU buffers and dispatches parallel map/reduce operations.
299///
300/// This is a CPU-side simulation of GPU compute dispatch that uses a thread
301/// pool metaphor (sequential execution) for testing without a real GPU.
302pub struct ComputeDispatcher {
303    pub(super) buffers: HashMap<BufferId, GpuBuffer>,
304    pub(super) next_id: u32,
305}
306impl ComputeDispatcher {
307    /// Create a new dispatcher with no buffers.
308    pub fn new() -> Self {
309        Self {
310            buffers: HashMap::new(),
311            next_id: 0,
312        }
313    }
314    /// Allocate a new buffer of `size` f64 elements, optionally pre-loaded
315    /// with `initial_data`.  Returns the new buffer's [`BufferId`].
316    pub fn create_buffer(&mut self, size: usize, initial_data: Option<&[f64]>) -> BufferId {
317        let id = BufferId(self.next_id);
318        self.next_id += 1;
319        let buf = match initial_data {
320            Some(data) => {
321                let mut b = GpuBuffer::new(size);
322                let copy_len = data.len().min(size);
323                b.data[..copy_len].copy_from_slice(&data[..copy_len]);
324                b
325            }
326            None => GpuBuffer::new(size),
327        };
328        self.buffers.insert(id, buf);
329        id
330    }
331    /// Write `data` into the buffer identified by `id`.
332    ///
333    /// # Errors
334    /// Returns [`GpuError::InvalidBuffer`] if `id` is not registered.
335    pub fn write_buffer(&mut self, id: BufferId, data: &[f64]) -> Result<(), GpuError> {
336        match self.buffers.get_mut(&id) {
337            Some(buf) => {
338                buf.data = data.to_vec();
339                buf.size = data.len();
340                Ok(())
341            }
342            None => Err(GpuError::InvalidBuffer(id)),
343        }
344    }
345    /// Read the contents of the buffer identified by `id`.
346    ///
347    /// # Errors
348    /// Returns [`GpuError::InvalidBuffer`] if `id` is not registered.
349    pub fn read_buffer(&self, id: BufferId) -> Result<Vec<f64>, GpuError> {
350        self.buffers
351            .get(&id)
352            .map(|b| b.data.clone())
353            .ok_or(GpuError::InvalidBuffer(id))
354    }
355    /// Return the number of buffers currently managed.
356    #[allow(dead_code)]
357    pub fn num_buffers(&self) -> usize {
358        self.buffers.len()
359    }
360    /// Check if a buffer exists.
361    #[allow(dead_code)]
362    pub fn has_buffer(&self, id: BufferId) -> bool {
363        self.buffers.contains_key(&id)
364    }
365    /// Return the size of a buffer.
366    #[allow(dead_code)]
367    pub fn buffer_size(&self, id: BufferId) -> Result<usize, GpuError> {
368        self.buffers
369            .get(&id)
370            .map(|b| b.size)
371            .ok_or(GpuError::InvalidBuffer(id))
372    }
373    /// Destroy (remove) a buffer.
374    #[allow(dead_code)]
375    pub fn destroy_buffer(&mut self, id: BufferId) -> Result<(), GpuError> {
376        self.buffers
377            .remove(&id)
378            .map(|_| ())
379            .ok_or(GpuError::InvalidBuffer(id))
380    }
381    /// Copy data from one buffer to another.
382    #[allow(dead_code)]
383    pub fn copy_buffer(&mut self, src: BufferId, dst: BufferId) -> Result<(), GpuError> {
384        let src_data = self
385            .buffers
386            .get(&src)
387            .ok_or(GpuError::InvalidBuffer(src))?
388            .data
389            .clone();
390        let dst_buf = self
391            .buffers
392            .get_mut(&dst)
393            .ok_or(GpuError::InvalidBuffer(dst))?;
394        if src_data.len() != dst_buf.size {
395            return Err(GpuError::SizeMismatch {
396                expected: dst_buf.size,
397                got: src_data.len(),
398            });
399        }
400        dst_buf.data = src_data;
401        Ok(())
402    }
403    /// Dispatch a parallel map: `out[i] = f(in[i])` for every element.
404    ///
405    /// # Errors
406    /// Returns an error if either buffer is invalid or sizes differ.
407    pub fn dispatch_map(
408        &mut self,
409        buf_in: BufferId,
410        buf_out: BufferId,
411        f: impl Fn(f64) -> f64,
412    ) -> Result<(), GpuError> {
413        let input = self
414            .buffers
415            .get(&buf_in)
416            .ok_or(GpuError::InvalidBuffer(buf_in))?
417            .data
418            .clone();
419        let out_buf = self
420            .buffers
421            .get_mut(&buf_out)
422            .ok_or(GpuError::InvalidBuffer(buf_out))?;
423        if input.len() != out_buf.size {
424            return Err(GpuError::SizeMismatch {
425                expected: out_buf.size,
426                got: input.len(),
427            });
428        }
429        out_buf.data = input.iter().map(|&x| f(x)).collect();
430        Ok(())
431    }
432    /// Dispatch a parallel map with index: `out[i] = f(i, in[i])`.
433    #[allow(dead_code)]
434    pub fn dispatch_map_indexed(
435        &mut self,
436        buf_in: BufferId,
437        buf_out: BufferId,
438        f: impl Fn(usize, f64) -> f64,
439    ) -> Result<(), GpuError> {
440        let input = self
441            .buffers
442            .get(&buf_in)
443            .ok_or(GpuError::InvalidBuffer(buf_in))?
444            .data
445            .clone();
446        let out_buf = self
447            .buffers
448            .get_mut(&buf_out)
449            .ok_or(GpuError::InvalidBuffer(buf_out))?;
450        if input.len() != out_buf.size {
451            return Err(GpuError::SizeMismatch {
452                expected: out_buf.size,
453                got: input.len(),
454            });
455        }
456        out_buf.data = input.iter().enumerate().map(|(i, &x)| f(i, x)).collect();
457        Ok(())
458    }
459    /// Dispatch a zip-map: `out[i] = f(a[i], b[i])`.
460    #[allow(dead_code)]
461    pub fn dispatch_zip_map(
462        &mut self,
463        buf_a: BufferId,
464        buf_b: BufferId,
465        buf_out: BufferId,
466        f: impl Fn(f64, f64) -> f64,
467    ) -> Result<(), GpuError> {
468        let a_data = self
469            .buffers
470            .get(&buf_a)
471            .ok_or(GpuError::InvalidBuffer(buf_a))?
472            .data
473            .clone();
474        let b_data = self
475            .buffers
476            .get(&buf_b)
477            .ok_or(GpuError::InvalidBuffer(buf_b))?
478            .data
479            .clone();
480        if a_data.len() != b_data.len() {
481            return Err(GpuError::SizeMismatch {
482                expected: a_data.len(),
483                got: b_data.len(),
484            });
485        }
486        let out_buf = self
487            .buffers
488            .get_mut(&buf_out)
489            .ok_or(GpuError::InvalidBuffer(buf_out))?;
490        if a_data.len() != out_buf.size {
491            return Err(GpuError::SizeMismatch {
492                expected: out_buf.size,
493                got: a_data.len(),
494            });
495        }
496        out_buf.data = a_data
497            .iter()
498            .zip(b_data.iter())
499            .map(|(&a, &b)| f(a, b))
500            .collect();
501        Ok(())
502    }
503    /// Dispatch a parallel reduce: folds all elements in `buf` using `f`.
504    ///
505    /// Mimics a GPU tree-reduction (sequential here for correctness).
506    ///
507    /// # Errors
508    /// Returns [`GpuError::InvalidBuffer`] or [`GpuError::EmptyBuffer`].
509    pub fn dispatch_reduce(
510        &self,
511        buf: BufferId,
512        f: impl Fn(f64, f64) -> f64,
513    ) -> Result<f64, GpuError> {
514        let data = self.buffers.get(&buf).ok_or(GpuError::InvalidBuffer(buf))?;
515        let mut iter = data.data.iter().copied();
516        let first = iter.next().ok_or(GpuError::EmptyBuffer)?;
517        Ok(iter.fold(first, f))
518    }
519    /// Dispatch a mock SPH density kernel.
520    ///
521    /// Computes a simplified SPH density estimate for each particle:
522    ///
523    /// ```text
524    /// rho_i = sum_j m_j * W(|r_i - r_j|, h)
525    /// ```
526    ///
527    /// where `W(r, h) = max(0, 1 - (r/h)^2)` (simplified poly-6 mock).
528    ///
529    /// Buffer layout (flat f64):
530    /// * `pos_buf` — `[x0, y0, z0, x1, y1, z1, ...]`
531    /// * `mass_buf` — `[m0, m1, ...]`
532    /// * `out_density_buf` — written with `[rho0, rho1, ...]`
533    ///
534    /// # Errors
535    /// Returns an error if any buffer id is invalid.
536    pub fn dispatch_sph_density(
537        &mut self,
538        pos_buf: BufferId,
539        mass_buf: BufferId,
540        h: f64,
541        out_density_buf: BufferId,
542    ) -> Result<(), GpuError> {
543        let positions = self
544            .buffers
545            .get(&pos_buf)
546            .ok_or(GpuError::InvalidBuffer(pos_buf))?
547            .data
548            .clone();
549        let masses = self
550            .buffers
551            .get(&mass_buf)
552            .ok_or(GpuError::InvalidBuffer(mass_buf))?
553            .data
554            .clone();
555        let n = positions.len() / 3;
556        let h2 = h * h;
557        let mut densities = vec![0.0f64; n];
558        for i in 0..n {
559            let xi = positions[i * 3];
560            let yi = positions[i * 3 + 1];
561            let zi = positions[i * 3 + 2];
562            let mut rho = 0.0;
563            for j in 0..n {
564                let dx = xi - positions[j * 3];
565                let dy = yi - positions[j * 3 + 1];
566                let dz = zi - positions[j * 3 + 2];
567                let r2 = dx * dx + dy * dy + dz * dz;
568                if r2 < h2 {
569                    let q = 1.0 - r2 / h2;
570                    rho += masses[j] * q * q;
571                }
572            }
573            densities[i] = rho;
574        }
575        let out_buf = self
576            .buffers
577            .get_mut(&out_density_buf)
578            .ok_or(GpuError::InvalidBuffer(out_density_buf))?;
579        out_buf.data = densities;
580        out_buf.size = n;
581        Ok(())
582    }
583    /// Dispatch a tree-based parallel reduction on a buffer.
584    ///
585    /// Simulates a GPU tree reduction: repeatedly halves the active range,
586    /// summing adjacent elements until one value remains.
587    ///
588    /// Returns the reduced value (identity `0.0` for an empty buffer).
589    #[allow(dead_code)]
590    pub fn dispatch_reduction_tree(&self, buf: BufferId) -> Result<f64, GpuError> {
591        let data = self
592            .buffers
593            .get(&buf)
594            .ok_or(GpuError::InvalidBuffer(buf))?
595            .data
596            .clone();
597        if data.is_empty() {
598            return Ok(0.0);
599        }
600        let mut work = data;
601        let mut len = work.len();
602        while len > 1 {
603            let half = len / 2;
604            for i in 0..half {
605                work[i] = work[i * 2] + work[i * 2 + 1];
606            }
607            if len % 2 == 1 {
608                work[half] = work[len - 1];
609                len = half + 1;
610            } else {
611                len = half;
612            }
613        }
614        Ok(work[0])
615    }
616    /// Dispatch an inclusive prefix scan (cumulative sum) on a buffer.
617    ///
618    /// Writes `out[i] = sum(in[0..=i])` into `out_buf`.
619    ///
620    /// Uses a sequential Hillis-Steele-style scan for correctness.
621    #[allow(dead_code)]
622    pub fn dispatch_inclusive_scan(
623        &mut self,
624        buf_in: BufferId,
625        buf_out: BufferId,
626    ) -> Result<(), GpuError> {
627        let data = self
628            .buffers
629            .get(&buf_in)
630            .ok_or(GpuError::InvalidBuffer(buf_in))?
631            .data
632            .clone();
633        let n = data.len();
634        let mut result = data;
635        for i in 1..n {
636            result[i] += result[i - 1];
637        }
638        let out = self
639            .buffers
640            .get_mut(&buf_out)
641            .ok_or(GpuError::InvalidBuffer(buf_out))?;
642        out.data = result;
643        out.size = n;
644        Ok(())
645    }
646    /// Dispatch a 2-bit radix sort on a buffer of non-negative f64 values.
647    ///
648    /// Values are cast to u64 (bit-cast) and sorted by their bits using
649    /// counting sort passes with 2-bit digits.  32 passes cover all 64 bits.
650    /// For non-negative IEEE 754 doubles the bit pattern order matches numeric
651    /// order.  Returns the sorted data as a new `Vec`f64` (input unchanged).
652    #[allow(dead_code)]
653    pub fn dispatch_radix_sort(&self, buf: BufferId) -> Result<Vec<f64>, GpuError> {
654        let data = self
655            .buffers
656            .get(&buf)
657            .ok_or(GpuError::InvalidBuffer(buf))?
658            .data
659            .clone();
660        let n = data.len();
661        if n == 0 {
662            return Ok(Vec::new());
663        }
664        let mut keys: Vec<u64> = data.iter().map(|&v| v.to_bits()).collect();
665        for pass in 0..32usize {
666            let shift = pass * 2;
667            let mut counts = [0usize; 4];
668            for &k in &keys {
669                counts[((k >> shift) & 0x3) as usize] += 1;
670            }
671            let mut starts = [0usize; 4];
672            for i in 1..4 {
673                starts[i] = starts[i - 1] + counts[i - 1];
674            }
675            let mut out = vec![0u64; n];
676            let mut pos = starts;
677            for &k in &keys {
678                let digit = ((k >> shift) & 0x3) as usize;
679                out[pos[digit]] = k;
680                pos[digit] += 1;
681            }
682            keys = out;
683        }
684        Ok(keys.iter().map(|&bits| f64::from_bits(bits)).collect())
685    }
686}
687/// Opaque handle to a GPU/CPU buffer (usize-indexed, used by ComputeBackend).
688#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
689pub struct BufferHandle(pub usize);
690/// Specification for a GPU compute kernel dispatch.
691#[derive(Debug, Clone)]
692#[allow(dead_code)]
693pub struct KernelSpec {
694    /// Human-readable kernel name.
695    pub name: String,
696    /// Number of threads per workgroup `\[X, Y, Z\]`.
697    pub workgroup_size: [u32; 3],
698    /// Ordered list of buffer bindings for this kernel.
699    pub buffer_bindings: Vec<BufferId>,
700}
701impl KernelSpec {
702    /// Create a new kernel spec with a 1-D workgroup.
703    pub fn new(name: impl Into<String>, workgroup_x: u32, buffer_bindings: Vec<BufferId>) -> Self {
704        Self {
705            name: name.into(),
706            workgroup_size: [workgroup_x, 1, 1],
707            buffer_bindings,
708        }
709    }
710    /// Create a kernel spec with a 3-D workgroup size.
711    #[allow(dead_code)]
712    pub fn with_workgroup_3d(
713        name: impl Into<String>,
714        workgroup_size: [u32; 3],
715        buffer_bindings: Vec<BufferId>,
716    ) -> Self {
717        Self {
718            name: name.into(),
719            workgroup_size,
720            buffer_bindings,
721        }
722    }
723    /// Compute the number of workgroups needed for `total_items` in the X dimension.
724    #[allow(dead_code)]
725    pub fn num_workgroups_x(&self, total_items: u32) -> u32 {
726        total_items.div_ceil(self.workgroup_size[0])
727    }
728    /// Total threads per workgroup.
729    #[allow(dead_code)]
730    pub fn threads_per_workgroup(&self) -> u32 {
731        self.workgroup_size[0] * self.workgroup_size[1] * self.workgroup_size[2]
732    }
733}
734/// A CPU-resident buffer that mimics a GPU storage buffer.
735#[derive(Debug, Clone)]
736pub struct GpuBuffer {
737    /// Buffer contents as f64.
738    pub data: Vec<f64>,
739    /// Declared capacity of the buffer (may differ from `data.len()` if
740    /// the buffer was created with a fixed size but partially written).
741    pub size: usize,
742}
743impl GpuBuffer {
744    /// Create a zero-filled buffer of `size` elements.
745    pub fn new(size: usize) -> Self {
746        Self {
747            data: vec![0.0; size],
748            size,
749        }
750    }
751    /// Create a buffer pre-loaded with `initial_data`.
752    pub fn from_data(initial_data: Vec<f64>) -> Self {
753        let size = initial_data.len();
754        Self {
755            data: initial_data,
756            size,
757        }
758    }
759    /// Fill the buffer with a constant value.
760    #[allow(dead_code)]
761    pub fn fill(&mut self, value: f64) {
762        for v in &mut self.data {
763            *v = value;
764        }
765    }
766    /// Clear the buffer (set all elements to 0).
767    #[allow(dead_code)]
768    pub fn clear(&mut self) {
769        self.fill(0.0);
770    }
771    /// Get a slice of the buffer data.
772    #[allow(dead_code)]
773    pub fn as_slice(&self) -> &[f64] {
774        &self.data
775    }
776    /// Get a mutable slice of the buffer data.
777    #[allow(dead_code)]
778    pub fn as_mut_slice(&mut self) -> &mut [f64] {
779        &mut self.data
780    }
781    /// Number of bytes the buffer would occupy on GPU (f64 = 8 bytes each).
782    #[allow(dead_code)]
783    pub fn byte_size(&self) -> usize {
784        self.size * std::mem::size_of::<f64>()
785    }
786}
787/// Errors that can occur during GPU (or CPU-fallback) dispatch.
788#[derive(Debug, Clone, PartialEq)]
789pub enum GpuError {
790    /// The specified buffer id is not registered.
791    InvalidBuffer(BufferId),
792    /// Input and output buffers must have the same length.
793    SizeMismatch {
794        /// Expected buffer size.
795        expected: usize,
796        /// Actual buffer size received.
797        got: usize,
798    },
799    /// The reduction was attempted on an empty buffer.
800    EmptyBuffer,
801    /// A kernel or operation was not found.
802    #[allow(dead_code)]
803    NotFound(String),
804}
805/// CPU fallback compute backend.
806///
807/// Stores buffers as `Vec`f64` in memory and dispatches kernels on the CPU.
808pub struct CpuBackend {
809    pub(super) buffers: RefCell<Vec<Vec<f64>>>,
810}
811impl CpuBackend {
812    /// Create a new CPU backend.
813    pub fn new() -> Self {
814        Self {
815            buffers: RefCell::new(Vec::new()),
816        }
817    }
818    /// Return the number of buffers currently allocated.
819    #[allow(dead_code)]
820    pub fn num_buffers(&self) -> usize {
821        self.buffers.borrow().len()
822    }
823    /// Return the total number of f64 elements across all buffers.
824    #[allow(dead_code)]
825    pub fn total_elements(&self) -> usize {
826        self.buffers.borrow().iter().map(|b| b.len()).sum()
827    }
828}
829/// A single resource event for lifecycle tracking.
830#[derive(Debug, Clone)]
831#[allow(dead_code)]
832pub enum ResourceEvent {
833    /// Buffer was created.
834    Created(BufferId, usize),
835    /// Buffer was written to.
836    Written(BufferId),
837    /// Buffer was read from.
838    Read(BufferId),
839    /// Buffer was destroyed.
840    Destroyed(BufferId),
841}
842/// A record of divergent branches observed in a kernel.
843#[derive(Debug, Clone, Default)]
844#[allow(dead_code)]
845pub struct WarpDivergenceRecord {
846    /// Number of branch instructions encountered.
847    pub total_branches: u64,
848    /// Number of branches where threads diverged (not all took same path).
849    pub divergent_branches: u64,
850}
851#[allow(dead_code)]
852impl WarpDivergenceRecord {
853    /// Compute the divergence rate (0.0 = no divergence, 1.0 = fully divergent).
854    pub fn divergence_rate(&self) -> f64 {
855        if self.total_branches == 0 {
856            0.0
857        } else {
858            self.divergent_branches as f64 / self.total_branches as f64
859        }
860    }
861    /// Estimated performance penalty from divergence (relative slowdown factor).
862    ///
863    /// A simple model: penalty = 1 + divergence_rate * (warp_size - 1) / warp_size.
864    pub fn performance_penalty(&self, warp_size: u32) -> f64 {
865        let rate = self.divergence_rate();
866        1.0 + rate * (warp_size as f64 - 1.0) / warp_size as f64
867    }
868}
869/// A mock GPU timeline semaphore for synchronising multi-pass GPU work.
870///
871/// On real GPU APIs (Vulkan, D3D12), timeline semaphores allow the CPU to
872/// wait for a specific GPU progress point.  This mock records signal and
873/// wait operations for testing.
874#[allow(dead_code)]
875pub struct TimelineSemaphore {
876    /// Current value of the semaphore counter.
877    pub value: u64,
878    /// History of signalled values.
879    pub(super) signal_history: Vec<u64>,
880    /// History of wait requests.
881    pub(super) wait_history: Vec<u64>,
882}
883#[allow(dead_code)]
884impl TimelineSemaphore {
885    /// Create a new semaphore starting at value 0.
886    pub fn new() -> Self {
887        Self {
888            value: 0,
889            signal_history: Vec::new(),
890            wait_history: Vec::new(),
891        }
892    }
893    /// Signal the semaphore to `new_value`.  Value must be monotonically increasing.
894    pub fn signal(&mut self, new_value: u64) {
895        assert!(
896            new_value > self.value,
897            "semaphore values must increase monotonically"
898        );
899        self.value = new_value;
900        self.signal_history.push(new_value);
901    }
902    /// Record a wait-for request.  In a mock, this checks `wait_value <= current`.
903    ///
904    /// Returns `true` if the semaphore has already reached `wait_value`.
905    pub fn wait(&mut self, wait_value: u64) -> bool {
906        self.wait_history.push(wait_value);
907        self.value >= wait_value
908    }
909    /// Return the current semaphore value.
910    pub fn current_value(&self) -> u64 {
911        self.value
912    }
913    /// Number of times the semaphore has been signalled.
914    pub fn signal_count(&self) -> usize {
915        self.signal_history.len()
916    }
917}
918/// GPU memory bandwidth model.
919///
920/// Estimates the effective bandwidth and the roofline-model bound for a kernel.
921#[derive(Debug, Clone)]
922#[allow(dead_code)]
923pub struct MemoryBandwidthModel {
924    /// Peak memory bandwidth in GB/s.
925    pub peak_bandwidth_gbs: f64,
926    /// Peak compute throughput in GFLOP/s.
927    pub peak_compute_gflops: f64,
928}
929#[allow(dead_code)]
930impl MemoryBandwidthModel {
931    /// Create a model for a mid-range discrete GPU.
932    pub fn mid_range() -> Self {
933        Self {
934            peak_bandwidth_gbs: 480.0,
935            peak_compute_gflops: 10000.0,
936        }
937    }
938    /// Compute the arithmetic intensity (FLOP/byte) of a kernel.
939    ///
940    /// `flops` – total floating-point operations.
941    /// `bytes_accessed` – total bytes read/written.
942    pub fn arithmetic_intensity(flops: f64, bytes_accessed: f64) -> f64 {
943        if bytes_accessed < 1e-30 {
944            f64::INFINITY
945        } else {
946            flops / bytes_accessed
947        }
948    }
949    /// Roofline performance estimate (GFLOP/s) given arithmetic intensity.
950    pub fn roofline_performance(&self, arithmetic_intensity: f64) -> f64 {
951        let bw_bound = arithmetic_intensity * self.peak_bandwidth_gbs;
952        bw_bound.min(self.peak_compute_gflops)
953    }
954    /// Estimated kernel execution time in milliseconds.
955    ///
956    /// `flops` – total FLOPs in the kernel.
957    /// `bytes_accessed` – total bytes transferred.
958    pub fn estimated_runtime_ms(&self, flops: f64, bytes_accessed: f64) -> f64 {
959        let intensity = Self::arithmetic_intensity(flops, bytes_accessed);
960        let perf_gflops = self.roofline_performance(intensity);
961        if perf_gflops < 1e-30 {
962            return f64::INFINITY;
963        }
964        (flops / (perf_gflops * 1e9)) * 1e3
965    }
966    /// Whether the kernel is bandwidth-bound or compute-bound.
967    ///
968    /// Returns `true` if bandwidth-bound (arithmetic intensity below the ridge point).
969    pub fn is_bandwidth_bound(&self, arithmetic_intensity: f64) -> bool {
970        let ridge_point = self.peak_compute_gflops / self.peak_bandwidth_gbs;
971        arithmetic_intensity < ridge_point
972    }
973}
974/// A binding entry associating a buffer with a binding index and usage.
975#[derive(Debug, Clone, Copy)]
976#[allow(dead_code)]
977pub struct BufferBinding {
978    /// Binding index in the shader (e.g. @binding(0)).
979    pub binding: u32,
980    /// The buffer to bind.
981    pub buffer_id: BufferId,
982    /// Usage of the buffer in this binding.
983    pub usage: BufferUsage,
984}
985#[allow(dead_code)]
986impl BufferBinding {
987    /// Create a new buffer binding.
988    pub fn new(binding: u32, buffer_id: BufferId, usage: BufferUsage) -> Self {
989        Self {
990            binding,
991            buffer_id,
992            usage,
993        }
994    }
995    /// Shorthand for a read-only binding.
996    pub fn read(binding: u32, buffer_id: BufferId) -> Self {
997        Self::new(binding, buffer_id, BufferUsage::ReadOnly)
998    }
999    /// Shorthand for a write-only binding.
1000    pub fn write(binding: u32, buffer_id: BufferId) -> Self {
1001        Self::new(binding, buffer_id, BufferUsage::WriteOnly)
1002    }
1003    /// Shorthand for a read-write binding.
1004    pub fn read_write(binding: u32, buffer_id: BufferId) -> Self {
1005        Self::new(binding, buffer_id, BufferUsage::ReadWrite)
1006    }
1007    /// Shorthand for a uniform binding.
1008    pub fn uniform(binding: u32, buffer_id: BufferId) -> Self {
1009        Self::new(binding, buffer_id, BufferUsage::Uniform)
1010    }
1011}
1012/// Newtype handle for GPU buffers in the dispatcher model.
1013#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
1014pub struct BufferId(pub u32);