#![allow(clippy::manual_div_ceil)]
use std::cell::RefCell;
use std::collections::HashMap;
#[allow(unused_imports)]
use super::functions::*;
#[allow(dead_code)]
pub struct ComputePass {
pub(super) commands: Vec<(String, usize)>,
}
#[allow(dead_code)]
impl ComputePass {
pub fn new() -> Self {
Self {
commands: Vec::new(),
}
}
pub fn dispatch(&mut self, kernel_name: &str, work_size: usize) {
self.commands.push((kernel_name.to_string(), work_size));
}
pub fn num_commands(&self) -> usize {
self.commands.len()
}
pub fn commands(&self) -> &[(String, usize)] {
&self.commands
}
pub fn clear(&mut self) {
self.commands.clear();
}
pub fn total_work_items(&self) -> usize {
self.commands.iter().map(|(_, ws)| ws).sum()
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(dead_code)]
pub enum BufferUsage {
ReadOnly,
WriteOnly,
ReadWrite,
Uniform,
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub enum GpuCommand {
CopyBuffer {
src: BufferId,
dst: BufferId,
size: usize,
},
DispatchCompute {
kernel_name: String,
workgroups: [u32; 3],
},
Barrier(PipelineBarrier),
PushConstant {
name: String,
value: f64,
},
}
#[allow(dead_code)]
pub struct ResourceLifecycle {
pub(super) events: Vec<ResourceEvent>,
}
#[allow(dead_code)]
impl ResourceLifecycle {
pub fn new() -> Self {
Self { events: Vec::new() }
}
pub fn record_create(&mut self, id: BufferId, size: usize) {
self.events.push(ResourceEvent::Created(id, size));
}
pub fn record_write(&mut self, id: BufferId) {
self.events.push(ResourceEvent::Written(id));
}
pub fn record_read(&mut self, id: BufferId) {
self.events.push(ResourceEvent::Read(id));
}
pub fn record_destroy(&mut self, id: BufferId) {
self.events.push(ResourceEvent::Destroyed(id));
}
pub fn events(&self) -> &[ResourceEvent] {
&self.events
}
pub fn len(&self) -> usize {
self.events.len()
}
pub fn is_empty(&self) -> bool {
self.events.is_empty()
}
pub fn clear(&mut self) {
self.events.clear();
}
pub fn count_writes(&self, id: BufferId) -> usize {
self.events
.iter()
.filter(|e| matches!(e, ResourceEvent::Written(bid) if * bid == id))
.count()
}
pub fn count_reads(&self, id: BufferId) -> usize {
self.events
.iter()
.filter(|e| matches!(e, ResourceEvent::Read(bid) if * bid == id))
.count()
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
#[allow(dead_code)]
pub enum PipelineBarrier {
StorageReadAfterWrite,
UniformReadAfterWrite,
Full,
None,
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct OccupancyModel {
pub compute_units: u32,
pub max_warps_per_cu: u32,
pub warp_size: u32,
pub shared_mem_per_cu: u32,
pub registers_per_cu: u32,
}
impl OccupancyModel {
#[allow(dead_code)]
pub fn mid_range() -> Self {
Self {
compute_units: 32,
max_warps_per_cu: 32,
warp_size: 32,
shared_mem_per_cu: 48 * 1024,
registers_per_cu: 65536,
}
}
#[allow(dead_code)]
pub fn estimate_occupancy(
&self,
workgroup_size: u32,
shared_mem_bytes: u32,
registers_per_thread: u32,
) -> f64 {
let warps_per_wg = workgroup_size.div_ceil(self.warp_size);
let max_wg_by_warps = self.max_warps_per_cu / warps_per_wg.max(1);
let max_wg_by_smem = self
.shared_mem_per_cu
.checked_div(shared_mem_bytes)
.unwrap_or(u32::MAX);
let regs_per_wg = registers_per_thread * workgroup_size;
let max_wg_by_regs = self
.registers_per_cu
.checked_div(regs_per_wg)
.unwrap_or(u32::MAX);
let active_wg = max_wg_by_warps.min(max_wg_by_smem).min(max_wg_by_regs);
let active_warps = (active_wg * warps_per_wg).min(self.max_warps_per_cu);
(active_warps as f64 / self.max_warps_per_cu as f64).clamp(0.0, 1.0)
}
#[allow(dead_code)]
pub fn peak_gflops(&self, clock_mhz: f64) -> f64 {
let simd_width = self.warp_size as f64;
2.0 * simd_width * self.compute_units as f64 * clock_mhz * 1e6 / 1e9
}
}
#[allow(dead_code)]
pub struct GpuCommandEncoder {
pub(super) label: String,
pub(super) commands: Vec<GpuCommand>,
}
#[allow(dead_code)]
impl GpuCommandEncoder {
pub fn new(label: impl Into<String>) -> Self {
Self {
label: label.into(),
commands: Vec::new(),
}
}
pub fn copy_buffer(&mut self, src: BufferId, dst: BufferId, size: usize) {
self.commands
.push(GpuCommand::CopyBuffer { src, dst, size });
}
pub fn dispatch_compute(&mut self, kernel_name: &str, workgroups: [u32; 3]) {
self.commands.push(GpuCommand::DispatchCompute {
kernel_name: kernel_name.to_string(),
workgroups,
});
}
pub fn insert_barrier(&mut self, barrier: PipelineBarrier) {
self.commands.push(GpuCommand::Barrier(barrier));
}
pub fn push_constant(&mut self, name: &str, value: f64) {
self.commands.push(GpuCommand::PushConstant {
name: name.to_string(),
value,
});
}
pub fn label(&self) -> &str {
&self.label
}
pub fn command_count(&self) -> usize {
self.commands.len()
}
pub fn commands(&self) -> &[GpuCommand] {
&self.commands
}
pub fn reset(&mut self) {
self.commands.clear();
}
pub fn submit(&self, dispatcher: &mut ComputeDispatcher) -> Result<(), GpuError> {
for cmd in &self.commands {
if let GpuCommand::CopyBuffer { src, dst, .. } = cmd {
dispatcher.copy_buffer(*src, *dst)?;
}
}
Ok(())
}
}
pub struct ComputeDispatcher {
pub(super) buffers: HashMap<BufferId, GpuBuffer>,
pub(super) next_id: u32,
}
impl ComputeDispatcher {
pub fn new() -> Self {
Self {
buffers: HashMap::new(),
next_id: 0,
}
}
pub fn create_buffer(&mut self, size: usize, initial_data: Option<&[f64]>) -> BufferId {
let id = BufferId(self.next_id);
self.next_id += 1;
let buf = match initial_data {
Some(data) => {
let mut b = GpuBuffer::new(size);
let copy_len = data.len().min(size);
b.data[..copy_len].copy_from_slice(&data[..copy_len]);
b
}
None => GpuBuffer::new(size),
};
self.buffers.insert(id, buf);
id
}
pub fn write_buffer(&mut self, id: BufferId, data: &[f64]) -> Result<(), GpuError> {
match self.buffers.get_mut(&id) {
Some(buf) => {
buf.data = data.to_vec();
buf.size = data.len();
Ok(())
}
None => Err(GpuError::InvalidBuffer(id)),
}
}
pub fn read_buffer(&self, id: BufferId) -> Result<Vec<f64>, GpuError> {
self.buffers
.get(&id)
.map(|b| b.data.clone())
.ok_or(GpuError::InvalidBuffer(id))
}
#[allow(dead_code)]
pub fn num_buffers(&self) -> usize {
self.buffers.len()
}
#[allow(dead_code)]
pub fn has_buffer(&self, id: BufferId) -> bool {
self.buffers.contains_key(&id)
}
#[allow(dead_code)]
pub fn buffer_size(&self, id: BufferId) -> Result<usize, GpuError> {
self.buffers
.get(&id)
.map(|b| b.size)
.ok_or(GpuError::InvalidBuffer(id))
}
#[allow(dead_code)]
pub fn destroy_buffer(&mut self, id: BufferId) -> Result<(), GpuError> {
self.buffers
.remove(&id)
.map(|_| ())
.ok_or(GpuError::InvalidBuffer(id))
}
#[allow(dead_code)]
pub fn copy_buffer(&mut self, src: BufferId, dst: BufferId) -> Result<(), GpuError> {
let src_data = self
.buffers
.get(&src)
.ok_or(GpuError::InvalidBuffer(src))?
.data
.clone();
let dst_buf = self
.buffers
.get_mut(&dst)
.ok_or(GpuError::InvalidBuffer(dst))?;
if src_data.len() != dst_buf.size {
return Err(GpuError::SizeMismatch {
expected: dst_buf.size,
got: src_data.len(),
});
}
dst_buf.data = src_data;
Ok(())
}
pub fn dispatch_map(
&mut self,
buf_in: BufferId,
buf_out: BufferId,
f: impl Fn(f64) -> f64,
) -> Result<(), GpuError> {
let input = self
.buffers
.get(&buf_in)
.ok_or(GpuError::InvalidBuffer(buf_in))?
.data
.clone();
let out_buf = self
.buffers
.get_mut(&buf_out)
.ok_or(GpuError::InvalidBuffer(buf_out))?;
if input.len() != out_buf.size {
return Err(GpuError::SizeMismatch {
expected: out_buf.size,
got: input.len(),
});
}
out_buf.data = input.iter().map(|&x| f(x)).collect();
Ok(())
}
#[allow(dead_code)]
pub fn dispatch_map_indexed(
&mut self,
buf_in: BufferId,
buf_out: BufferId,
f: impl Fn(usize, f64) -> f64,
) -> Result<(), GpuError> {
let input = self
.buffers
.get(&buf_in)
.ok_or(GpuError::InvalidBuffer(buf_in))?
.data
.clone();
let out_buf = self
.buffers
.get_mut(&buf_out)
.ok_or(GpuError::InvalidBuffer(buf_out))?;
if input.len() != out_buf.size {
return Err(GpuError::SizeMismatch {
expected: out_buf.size,
got: input.len(),
});
}
out_buf.data = input.iter().enumerate().map(|(i, &x)| f(i, x)).collect();
Ok(())
}
#[allow(dead_code)]
pub fn dispatch_zip_map(
&mut self,
buf_a: BufferId,
buf_b: BufferId,
buf_out: BufferId,
f: impl Fn(f64, f64) -> f64,
) -> Result<(), GpuError> {
let a_data = self
.buffers
.get(&buf_a)
.ok_or(GpuError::InvalidBuffer(buf_a))?
.data
.clone();
let b_data = self
.buffers
.get(&buf_b)
.ok_or(GpuError::InvalidBuffer(buf_b))?
.data
.clone();
if a_data.len() != b_data.len() {
return Err(GpuError::SizeMismatch {
expected: a_data.len(),
got: b_data.len(),
});
}
let out_buf = self
.buffers
.get_mut(&buf_out)
.ok_or(GpuError::InvalidBuffer(buf_out))?;
if a_data.len() != out_buf.size {
return Err(GpuError::SizeMismatch {
expected: out_buf.size,
got: a_data.len(),
});
}
out_buf.data = a_data
.iter()
.zip(b_data.iter())
.map(|(&a, &b)| f(a, b))
.collect();
Ok(())
}
pub fn dispatch_reduce(
&self,
buf: BufferId,
f: impl Fn(f64, f64) -> f64,
) -> Result<f64, GpuError> {
let data = self.buffers.get(&buf).ok_or(GpuError::InvalidBuffer(buf))?;
let mut iter = data.data.iter().copied();
let first = iter.next().ok_or(GpuError::EmptyBuffer)?;
Ok(iter.fold(first, f))
}
pub fn dispatch_sph_density(
&mut self,
pos_buf: BufferId,
mass_buf: BufferId,
h: f64,
out_density_buf: BufferId,
) -> Result<(), GpuError> {
let positions = self
.buffers
.get(&pos_buf)
.ok_or(GpuError::InvalidBuffer(pos_buf))?
.data
.clone();
let masses = self
.buffers
.get(&mass_buf)
.ok_or(GpuError::InvalidBuffer(mass_buf))?
.data
.clone();
let n = positions.len() / 3;
let h2 = h * h;
let mut densities = vec![0.0f64; n];
for i in 0..n {
let xi = positions[i * 3];
let yi = positions[i * 3 + 1];
let zi = positions[i * 3 + 2];
let mut rho = 0.0;
for j in 0..n {
let dx = xi - positions[j * 3];
let dy = yi - positions[j * 3 + 1];
let dz = zi - positions[j * 3 + 2];
let r2 = dx * dx + dy * dy + dz * dz;
if r2 < h2 {
let q = 1.0 - r2 / h2;
rho += masses[j] * q * q;
}
}
densities[i] = rho;
}
let out_buf = self
.buffers
.get_mut(&out_density_buf)
.ok_or(GpuError::InvalidBuffer(out_density_buf))?;
out_buf.data = densities;
out_buf.size = n;
Ok(())
}
#[allow(dead_code)]
pub fn dispatch_reduction_tree(&self, buf: BufferId) -> Result<f64, GpuError> {
let data = self
.buffers
.get(&buf)
.ok_or(GpuError::InvalidBuffer(buf))?
.data
.clone();
if data.is_empty() {
return Ok(0.0);
}
let mut work = data;
let mut len = work.len();
while len > 1 {
let half = len / 2;
for i in 0..half {
work[i] = work[i * 2] + work[i * 2 + 1];
}
if len % 2 == 1 {
work[half] = work[len - 1];
len = half + 1;
} else {
len = half;
}
}
Ok(work[0])
}
#[allow(dead_code)]
pub fn dispatch_inclusive_scan(
&mut self,
buf_in: BufferId,
buf_out: BufferId,
) -> Result<(), GpuError> {
let data = self
.buffers
.get(&buf_in)
.ok_or(GpuError::InvalidBuffer(buf_in))?
.data
.clone();
let n = data.len();
let mut result = data;
for i in 1..n {
result[i] += result[i - 1];
}
let out = self
.buffers
.get_mut(&buf_out)
.ok_or(GpuError::InvalidBuffer(buf_out))?;
out.data = result;
out.size = n;
Ok(())
}
#[allow(dead_code)]
pub fn dispatch_radix_sort(&self, buf: BufferId) -> Result<Vec<f64>, GpuError> {
let data = self
.buffers
.get(&buf)
.ok_or(GpuError::InvalidBuffer(buf))?
.data
.clone();
let n = data.len();
if n == 0 {
return Ok(Vec::new());
}
let mut keys: Vec<u64> = data.iter().map(|&v| v.to_bits()).collect();
for pass in 0..32usize {
let shift = pass * 2;
let mut counts = [0usize; 4];
for &k in &keys {
counts[((k >> shift) & 0x3) as usize] += 1;
}
let mut starts = [0usize; 4];
for i in 1..4 {
starts[i] = starts[i - 1] + counts[i - 1];
}
let mut out = vec![0u64; n];
let mut pos = starts;
for &k in &keys {
let digit = ((k >> shift) & 0x3) as usize;
out[pos[digit]] = k;
pos[digit] += 1;
}
keys = out;
}
Ok(keys.iter().map(|&bits| f64::from_bits(bits)).collect())
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BufferHandle(pub usize);
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct KernelSpec {
pub name: String,
pub workgroup_size: [u32; 3],
pub buffer_bindings: Vec<BufferId>,
}
impl KernelSpec {
pub fn new(name: impl Into<String>, workgroup_x: u32, buffer_bindings: Vec<BufferId>) -> Self {
Self {
name: name.into(),
workgroup_size: [workgroup_x, 1, 1],
buffer_bindings,
}
}
#[allow(dead_code)]
pub fn with_workgroup_3d(
name: impl Into<String>,
workgroup_size: [u32; 3],
buffer_bindings: Vec<BufferId>,
) -> Self {
Self {
name: name.into(),
workgroup_size,
buffer_bindings,
}
}
#[allow(dead_code)]
pub fn num_workgroups_x(&self, total_items: u32) -> u32 {
total_items.div_ceil(self.workgroup_size[0])
}
#[allow(dead_code)]
pub fn threads_per_workgroup(&self) -> u32 {
self.workgroup_size[0] * self.workgroup_size[1] * self.workgroup_size[2]
}
}
#[derive(Debug, Clone)]
pub struct GpuBuffer {
pub data: Vec<f64>,
pub size: usize,
}
impl GpuBuffer {
pub fn new(size: usize) -> Self {
Self {
data: vec![0.0; size],
size,
}
}
pub fn from_data(initial_data: Vec<f64>) -> Self {
let size = initial_data.len();
Self {
data: initial_data,
size,
}
}
#[allow(dead_code)]
pub fn fill(&mut self, value: f64) {
for v in &mut self.data {
*v = value;
}
}
#[allow(dead_code)]
pub fn clear(&mut self) {
self.fill(0.0);
}
#[allow(dead_code)]
pub fn as_slice(&self) -> &[f64] {
&self.data
}
#[allow(dead_code)]
pub fn as_mut_slice(&mut self) -> &mut [f64] {
&mut self.data
}
#[allow(dead_code)]
pub fn byte_size(&self) -> usize {
self.size * std::mem::size_of::<f64>()
}
}
#[derive(Debug, Clone, PartialEq)]
pub enum GpuError {
InvalidBuffer(BufferId),
SizeMismatch {
expected: usize,
got: usize,
},
EmptyBuffer,
#[allow(dead_code)]
NotFound(String),
}
pub struct CpuBackend {
pub(super) buffers: RefCell<Vec<Vec<f64>>>,
}
impl CpuBackend {
pub fn new() -> Self {
Self {
buffers: RefCell::new(Vec::new()),
}
}
#[allow(dead_code)]
pub fn num_buffers(&self) -> usize {
self.buffers.borrow().len()
}
#[allow(dead_code)]
pub fn total_elements(&self) -> usize {
self.buffers.borrow().iter().map(|b| b.len()).sum()
}
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub enum ResourceEvent {
Created(BufferId, usize),
Written(BufferId),
Read(BufferId),
Destroyed(BufferId),
}
#[derive(Debug, Clone, Default)]
#[allow(dead_code)]
pub struct WarpDivergenceRecord {
pub total_branches: u64,
pub divergent_branches: u64,
}
#[allow(dead_code)]
impl WarpDivergenceRecord {
pub fn divergence_rate(&self) -> f64 {
if self.total_branches == 0 {
0.0
} else {
self.divergent_branches as f64 / self.total_branches as f64
}
}
pub fn performance_penalty(&self, warp_size: u32) -> f64 {
let rate = self.divergence_rate();
1.0 + rate * (warp_size as f64 - 1.0) / warp_size as f64
}
}
#[allow(dead_code)]
pub struct TimelineSemaphore {
pub value: u64,
pub(super) signal_history: Vec<u64>,
pub(super) wait_history: Vec<u64>,
}
#[allow(dead_code)]
impl TimelineSemaphore {
pub fn new() -> Self {
Self {
value: 0,
signal_history: Vec::new(),
wait_history: Vec::new(),
}
}
pub fn signal(&mut self, new_value: u64) {
assert!(
new_value > self.value,
"semaphore values must increase monotonically"
);
self.value = new_value;
self.signal_history.push(new_value);
}
pub fn wait(&mut self, wait_value: u64) -> bool {
self.wait_history.push(wait_value);
self.value >= wait_value
}
pub fn current_value(&self) -> u64 {
self.value
}
pub fn signal_count(&self) -> usize {
self.signal_history.len()
}
}
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct MemoryBandwidthModel {
pub peak_bandwidth_gbs: f64,
pub peak_compute_gflops: f64,
}
#[allow(dead_code)]
impl MemoryBandwidthModel {
pub fn mid_range() -> Self {
Self {
peak_bandwidth_gbs: 480.0,
peak_compute_gflops: 10000.0,
}
}
pub fn arithmetic_intensity(flops: f64, bytes_accessed: f64) -> f64 {
if bytes_accessed < 1e-30 {
f64::INFINITY
} else {
flops / bytes_accessed
}
}
pub fn roofline_performance(&self, arithmetic_intensity: f64) -> f64 {
let bw_bound = arithmetic_intensity * self.peak_bandwidth_gbs;
bw_bound.min(self.peak_compute_gflops)
}
pub fn estimated_runtime_ms(&self, flops: f64, bytes_accessed: f64) -> f64 {
let intensity = Self::arithmetic_intensity(flops, bytes_accessed);
let perf_gflops = self.roofline_performance(intensity);
if perf_gflops < 1e-30 {
return f64::INFINITY;
}
(flops / (perf_gflops * 1e9)) * 1e3
}
pub fn is_bandwidth_bound(&self, arithmetic_intensity: f64) -> bool {
let ridge_point = self.peak_compute_gflops / self.peak_bandwidth_gbs;
arithmetic_intensity < ridge_point
}
}
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)]
pub struct BufferBinding {
pub binding: u32,
pub buffer_id: BufferId,
pub usage: BufferUsage,
}
#[allow(dead_code)]
impl BufferBinding {
pub fn new(binding: u32, buffer_id: BufferId, usage: BufferUsage) -> Self {
Self {
binding,
buffer_id,
usage,
}
}
pub fn read(binding: u32, buffer_id: BufferId) -> Self {
Self::new(binding, buffer_id, BufferUsage::ReadOnly)
}
pub fn write(binding: u32, buffer_id: BufferId) -> Self {
Self::new(binding, buffer_id, BufferUsage::WriteOnly)
}
pub fn read_write(binding: u32, buffer_id: BufferId) -> Self {
Self::new(binding, buffer_id, BufferUsage::ReadWrite)
}
pub fn uniform(binding: u32, buffer_id: BufferId) -> Self {
Self::new(binding, buffer_id, BufferUsage::Uniform)
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct BufferId(pub u32);