use std::collections::{HashMap, VecDeque};
use std::time::{Duration, Instant};
// Raw OpenGL enum values used by this module.

// --- Sync objects ---------------------------------------------------------
/// Condition handed to `fence_sync` when a fence is inserted.
const GL_SYNC_GPU_COMMANDS_COMPLETE: u32 = 0x9117;
/// `client_wait_sync` result mapped to `FenceStatus::Signaled`.
const GL_ALREADY_SIGNALED: u32 = 0x911A;
/// `client_wait_sync` result: mapped to `Unsignaled` by `poll` and `TimedOut` by `wait`.
const GL_TIMEOUT_EXPIRED: u32 = 0x911B;
/// `client_wait_sync` result mapped to `FenceStatus::Signaled`.
const GL_CONDITION_SATISFIED: u32 = 0x911C;
/// `client_wait_sync` result mapped to `FenceStatus::Failed`.
const GL_WAIT_FAILED: u32 = 0x911D;
/// Flag passed to `client_wait_sync` by the blocking `FenceSync::wait`.
const GL_SYNC_FLUSH_COMMANDS_BIT: u32 = 0x00000001;

// --- Memory barrier bits (wrapped by `MemoryBarrierFlags`) ------------------
// Note: these live in a different bit namespace than the sync flag above, so
// the shared value 0x1 is not a conflict.
const GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT: u32 = 0x00000001;
const GL_ELEMENT_ARRAY_BARRIER_BIT: u32 = 0x00000002;
const GL_UNIFORM_BARRIER_BIT: u32 = 0x00000004;
const GL_TEXTURE_FETCH_BARRIER_BIT: u32 = 0x00000008;
const GL_SHADER_IMAGE_ACCESS_BARRIER_BIT: u32 = 0x00000020;
const GL_COMMAND_BARRIER_BIT: u32 = 0x00000040;
const GL_PIXEL_BUFFER_BARRIER_BIT: u32 = 0x00000080;
const GL_TEXTURE_UPDATE_BARRIER_BIT: u32 = 0x00000100;
const GL_BUFFER_UPDATE_BARRIER_BIT: u32 = 0x00000200;
const GL_FRAMEBUFFER_BARRIER_BIT: u32 = 0x00000400;
const GL_TRANSFORM_FEEDBACK_BARRIER_BIT: u32 = 0x00000800;
const GL_ATOMIC_COUNTER_BARRIER_BIT: u32 = 0x00001000;
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
/// Every barrier bit set at once.
const GL_ALL_BARRIER_BITS: u32 = 0xFFFFFFFF;
/// Last known state of a [`FenceSync`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FenceStatus {
/// A fence has been inserted and has not been observed as signaled yet.
/// Also reported by `poll` when the zero-timeout wait expires, and for
/// unrecognised wait results.
Unsignaled,
/// The wait reported "already signaled" or "condition satisfied".
Signaled,
/// A blocking `wait` ran out of time before the fence signaled.
TimedOut,
/// The wait call reported failure.
Failed,
/// No fence has been inserted yet (the state of a fresh `FenceSync`).
NotInserted,
}
pub struct FenceSync {
sync: Option<glow::NativeFence>,
status: FenceStatus,
inserted_at: Option<Instant>,
}
impl FenceSync {
pub fn new() -> Self {
Self {
sync: None,
status: FenceStatus::NotInserted,
inserted_at: None,
}
}
pub fn insert(&mut self, gl: &glow::Context) {
use glow::HasContext;
if let Some(old) = self.sync.take() {
unsafe {
gl.delete_sync(old);
}
}
let sync = unsafe { gl.fence_sync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0).unwrap() };
self.sync = Some(sync);
self.status = FenceStatus::Unsignaled;
self.inserted_at = Some(Instant::now());
}
pub fn poll(&mut self, gl: &glow::Context) -> FenceStatus {
if let Some(sync) = self.sync {
use glow::HasContext;
let result = unsafe { gl.client_wait_sync(sync, 0, 0) };
self.status = match result {
GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
GL_TIMEOUT_EXPIRED => FenceStatus::Unsignaled,
GL_WAIT_FAILED => FenceStatus::Failed,
_ => FenceStatus::Unsignaled,
};
}
self.status
}
pub fn wait(&mut self, gl: &glow::Context, timeout: Duration) -> FenceStatus {
if let Some(sync) = self.sync {
use glow::HasContext;
let timeout_ns = timeout.as_nanos() as u64;
let result = unsafe {
gl.client_wait_sync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, timeout_ns as i32)
};
self.status = match result {
GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
GL_TIMEOUT_EXPIRED => FenceStatus::TimedOut,
GL_WAIT_FAILED => FenceStatus::Failed,
_ => FenceStatus::Unsignaled,
};
}
self.status
}
pub fn wait_forever(&mut self, gl: &glow::Context) -> FenceStatus {
self.wait(gl, Duration::from_secs(30)) }
pub fn status(&self) -> FenceStatus {
self.status
}
pub fn is_signaled(&self) -> bool {
self.status == FenceStatus::Signaled
}
pub fn elapsed_since_insert(&self) -> Option<Duration> {
self.inserted_at.map(|t| t.elapsed())
}
pub fn destroy(self, gl: &glow::Context) {
if let Some(sync) = self.sync {
use glow::HasContext;
unsafe {
gl.delete_sync(sync);
}
}
}
}
impl Default for FenceSync {
fn default() -> Self {
Self::new()
}
}
/// Bit set of memory barrier flags, stored as the raw GL bitfield.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MemoryBarrierFlags(pub u32);

impl MemoryBarrierFlags {
    pub const VERTEX_ATTRIB: Self = Self(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);
    pub const ELEMENT_ARRAY: Self = Self(GL_ELEMENT_ARRAY_BARRIER_BIT);
    pub const UNIFORM: Self = Self(GL_UNIFORM_BARRIER_BIT);
    pub const TEXTURE_FETCH: Self = Self(GL_TEXTURE_FETCH_BARRIER_BIT);
    pub const SHADER_IMAGE: Self = Self(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
    pub const COMMAND: Self = Self(GL_COMMAND_BARRIER_BIT);
    pub const PIXEL_BUFFER: Self = Self(GL_PIXEL_BUFFER_BARRIER_BIT);
    pub const TEXTURE_UPDATE: Self = Self(GL_TEXTURE_UPDATE_BARRIER_BIT);
    pub const BUFFER_UPDATE: Self = Self(GL_BUFFER_UPDATE_BARRIER_BIT);
    pub const FRAMEBUFFER: Self = Self(GL_FRAMEBUFFER_BARRIER_BIT);
    pub const TRANSFORM_FEEDBACK: Self = Self(GL_TRANSFORM_FEEDBACK_BARRIER_BIT);
    pub const ATOMIC_COUNTER: Self = Self(GL_ATOMIC_COUNTER_BARRIER_BIT);
    pub const SHADER_STORAGE: Self = Self(GL_SHADER_STORAGE_BARRIER_BIT);
    pub const ALL: Self = Self(GL_ALL_BARRIER_BITS);

    /// Union of both flag sets (same result as the `|` operator).
    pub fn combine(self, other: Self) -> Self {
        let Self(lhs) = self;
        let Self(rhs) = other;
        Self(lhs | rhs)
    }

    /// `true` when every bit of `flag` is also set in `self`.
    pub fn contains(self, flag: Self) -> bool {
        let wanted = flag.0;
        self.0 & wanted == wanted
    }

    /// Issues a GL memory barrier with exactly these bits.
    pub fn issue(self, gl: &glow::Context) {
        use glow::HasContext;
        let Self(bits) = self;
        // SAFETY: plain GL call; requires `gl` to be current on this thread.
        unsafe { gl.memory_barrier(bits) };
    }
}

impl std::ops::BitOr for MemoryBarrierFlags {
    type Output = Self;

    /// Bitwise union of the two flag sets.
    fn bitor(self, rhs: Self) -> Self {
        let Self(lhs) = self;
        Self(lhs | rhs.0)
    }
}

impl std::ops::BitAnd for MemoryBarrierFlags {
    type Output = Self;

    /// Bitwise intersection of the two flag sets.
    fn bitand(self, rhs: Self) -> Self {
        let Self(lhs) = self;
        Self(lhs & rhs.0)
    }
}
/// Description of a synchronisation point: which memory barrier bits to issue
/// and whether a fence should be inserted right after them.
#[derive(Debug, Clone)]
pub struct PipelineBarrier {
    /// Bits passed to the memory barrier call.
    pub memory_flags: MemoryBarrierFlags,
    /// When `true`, `execute` also inserts and returns a fence.
    pub fence: bool,
    /// Optional label; only stored, never read in this type.
    pub label: Option<String>,
}

impl PipelineBarrier {
    /// Memory barrier only, no fence, no label.
    pub fn memory(flags: MemoryBarrierFlags) -> Self {
        Self {
            memory_flags: flags,
            fence: false,
            label: None,
        }
    }

    /// Memory barrier followed by a fence.
    pub fn memory_and_fence(flags: MemoryBarrierFlags) -> Self {
        Self {
            fence: true,
            ..Self::memory(flags)
        }
    }

    /// All barrier bits plus a fence.
    pub fn full() -> Self {
        Self::memory_and_fence(MemoryBarrierFlags::ALL)
    }

    /// Returns the barrier with `label` attached, replacing any earlier label.
    pub fn with_label(self, label: &str) -> Self {
        Self {
            label: Some(label.to_owned()),
            ..self
        }
    }

    /// Shader-storage barrier only.
    pub fn ssbo_raw() -> Self {
        Self::memory(MemoryBarrierFlags::SHADER_STORAGE)
    }

    /// Shader-storage plus vertex-attribute barrier.
    pub fn compute_to_vertex() -> Self {
        let flags = MemoryBarrierFlags::SHADER_STORAGE.combine(MemoryBarrierFlags::VERTEX_ATTRIB);
        Self::memory(flags)
    }

    /// Shader-storage plus command (indirect) barrier.
    pub fn compute_to_indirect() -> Self {
        let flags = MemoryBarrierFlags::SHADER_STORAGE.combine(MemoryBarrierFlags::COMMAND);
        Self::memory(flags)
    }

    /// Issues the memory barrier and, if requested, inserts a fence.
    ///
    /// Returns the inserted fence, which the caller must eventually destroy,
    /// or `None` when this barrier carries no fence.
    pub fn execute(&self, gl: &glow::Context) -> Option<FenceSync> {
        self.memory_flags.issue(gl);
        if !self.fence {
            return None;
        }
        let mut inserted = FenceSync::new();
        inserted.insert(gl);
        Some(inserted)
    }
}
/// Usage state tracked per resource by `ResourceTransition`.
///
/// The variant pair (old, new) selects the memory barrier bits in
/// `ResourceTransition::barrier_for_transition`; the per-variant notes below
/// are inferred from the names and that table.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceState {
/// No known usage; also assumed for resources that were never registered.
Undefined,
/// Written by a compute shader (the main source state that yields barriers).
ComputeWrite,
/// Read by a compute shader.
ComputeRead,
/// Read as vertex attribute data.
VertexRead,
/// Read as index (element array) data.
IndexRead,
/// Read as indirect command data.
IndirectRead,
/// Read as uniform data.
UniformRead,
/// Accessed from the CPU — presumably via map/readback; confirm with callers.
CpuAccess,
/// Source of a transfer/copy.
TransferSrc,
/// Destination of a transfer/copy.
TransferDst,
}
pub struct ResourceTransition {
states: HashMap<u32, ResourceState>,
}
impl ResourceTransition {
pub fn new() -> Self {
Self {
states: HashMap::new(),
}
}
pub fn register(&mut self, resource_id: u32, initial: ResourceState) {
self.states.insert(resource_id, initial);
}
pub fn current_state(&self, resource_id: u32) -> Option<ResourceState> {
self.states.get(&resource_id).copied()
}
pub fn transition(
&mut self,
resource_id: u32,
new_state: ResourceState,
) -> Option<MemoryBarrierFlags> {
let old_state = self.states.get(&resource_id).copied().unwrap_or(ResourceState::Undefined);
if old_state == new_state {
return None;
}
let flags = Self::barrier_for_transition(old_state, new_state);
self.states.insert(resource_id, new_state);
flags
}
pub fn transition_and_barrier(
&mut self,
gl: &glow::Context,
resource_id: u32,
new_state: ResourceState,
) {
if let Some(flags) = self.transition(resource_id, new_state) {
flags.issue(gl);
}
}
fn barrier_for_transition(
from: ResourceState,
to: ResourceState,
) -> Option<MemoryBarrierFlags> {
match (from, to) {
(ResourceState::ComputeWrite, ResourceState::ComputeRead) => {
Some(MemoryBarrierFlags::SHADER_STORAGE)
}
(ResourceState::ComputeWrite, ResourceState::VertexRead) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::VERTEX_ATTRIB)
}
(ResourceState::ComputeWrite, ResourceState::IndexRead) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::ELEMENT_ARRAY)
}
(ResourceState::ComputeWrite, ResourceState::IndirectRead) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::COMMAND)
}
(ResourceState::ComputeWrite, ResourceState::UniformRead) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::UNIFORM)
}
(ResourceState::ComputeWrite, ResourceState::CpuAccess) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
}
(ResourceState::ComputeWrite, ResourceState::TransferSrc) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
}
(ResourceState::ComputeWrite, ResourceState::TransferDst) => {
Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
}
(ResourceState::TransferDst, ResourceState::ComputeRead) => {
Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
}
(ResourceState::TransferDst, ResourceState::VertexRead) => {
Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::VERTEX_ATTRIB)
}
(ResourceState::CpuAccess, ResourceState::ComputeRead) => {
Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
}
(ResourceState::CpuAccess, ResourceState::ComputeWrite) => {
Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
}
_ if from == to => None,
(ResourceState::ComputeWrite, _) => Some(MemoryBarrierFlags::ALL),
_ => None,
}
}
pub fn unregister(&mut self, resource_id: u32) {
self.states.remove(&resource_id);
}
pub fn tracked_count(&self) -> usize {
self.states.len()
}
pub fn reset_all(&mut self) {
for state in self.states.values_mut() {
*state = ResourceState::Undefined;
}
}
}
impl Default for ResourceTransition {
fn default() -> Self {
Self::new()
}
}
/// One queued compute dispatch together with its progress flags.
struct ComputeJob {
    /// Queue-assigned identifier returned by `submit`.
    id: u64,
    /// Key used to look the program up in the pipeline cache.
    program_key: u64,
    dimension: super::dispatch::DispatchDimension,
    uniforms: Vec<super::dispatch::UniformValue>,
    /// Barrier bits issued right after the dispatch.
    barrier: MemoryBarrierFlags,
    /// Fence inserted after the dispatch; released once it is seen signaled.
    fence: Option<FenceSync>,
    dispatched: bool,
    completed: bool,
}

impl ComputeJob {
    /// Dispatched to the GPU but not yet observed as finished.
    fn is_in_flight(&self) -> bool {
        self.dispatched && !self.completed
    }

    /// Marks the job finished and deletes its fence while a GL context is at
    /// hand. `drain_completed` has no context, so a fence still attached at
    /// that point would be dropped without its GL sync object being deleted.
    fn mark_completed(&mut self, gl: &glow::Context) {
        self.completed = true;
        if let Some(fence) = self.fence.take() {
            fence.destroy(gl);
        }
    }
}

/// FIFO of compute jobs with a cap on how many may be in flight at once.
pub struct AsyncComputeQueue {
    jobs: VecDeque<ComputeJob>,
    next_id: u64,
    max_in_flight: usize,
}

impl AsyncComputeQueue {
    /// Creates an empty queue; job ids start at 1.
    pub fn new(max_in_flight: usize) -> Self {
        Self {
            jobs: VecDeque::new(),
            next_id: 1,
            max_in_flight,
        }
    }

    /// Appends a job and returns its id. Nothing is sent to GL until `flush`.
    pub fn submit(
        &mut self,
        program_key: u64,
        dimension: super::dispatch::DispatchDimension,
        uniforms: Vec<super::dispatch::UniformValue>,
        barrier: MemoryBarrierFlags,
    ) -> u64 {
        let id = self.next_id;
        self.next_id += 1;
        self.jobs.push_back(ComputeJob {
            id,
            program_key,
            dimension,
            uniforms,
            barrier,
            fence: None,
            dispatched: false,
            completed: false,
        });
        id
    }

    /// Dispatches pending jobs in queue order until `max_in_flight` is reached.
    ///
    /// For each job: bind the program, upload its uniforms, dispatch, issue
    /// the job's memory barrier and insert a fence.
    ///
    /// NOTE: a job whose `program_key` is missing from `cache` is skipped and
    /// stays pending. Because `drain_completed` only pops from the front,
    /// such a job also holds back every job behind it.
    pub fn flush(
        &mut self,
        gl: &glow::Context,
        cache: &super::dispatch::PipelineCache,
    ) {
        use glow::HasContext;
        let budget = self.max_in_flight.saturating_sub(self.in_flight_count());
        let mut started = 0;
        for job in self.jobs.iter_mut() {
            if started >= budget {
                break;
            }
            if job.dispatched {
                continue;
            }
            let program = match cache.cache.get(&job.program_key) {
                Some(program) => program,
                None => continue,
            };
            program.bind(gl);
            for u in &job.uniforms {
                match u {
                    super::dispatch::UniformValue::Int(name, v) => {
                        program.set_uniform_int(gl, name, *v)
                    }
                    super::dispatch::UniformValue::Uint(name, v) => {
                        program.set_uniform_uint(gl, name, *v)
                    }
                    super::dispatch::UniformValue::Float(name, v) => {
                        program.set_uniform_float(gl, name, *v)
                    }
                    super::dispatch::UniformValue::Vec2(name, x, y) => {
                        program.set_uniform_vec2(gl, name, *x, *y)
                    }
                    super::dispatch::UniformValue::Vec3(name, x, y, z) => {
                        program.set_uniform_vec3(gl, name, *x, *y, *z)
                    }
                    super::dispatch::UniformValue::Vec4(name, x, y, z, w) => {
                        program.set_uniform_vec4(gl, name, *x, *y, *z, *w)
                    }
                }
            }
            let (gx, gy, gz) = job.dimension.as_tuple();
            // SAFETY: plain GL calls; require `gl` to be current on this
            // thread and the program bound above to be a compute program.
            unsafe {
                gl.dispatch_compute(gx, gy, gz);
                gl.memory_barrier(job.barrier.0);
            }
            let mut fence = FenceSync::new();
            fence.insert(gl);
            job.fence = Some(fence);
            job.dispatched = true;
            started += 1;
        }
    }

    /// Polls the fences of all in-flight jobs without blocking and marks the
    /// signaled ones completed, releasing their fences.
    pub fn poll(&mut self, gl: &glow::Context) {
        for job in self.jobs.iter_mut().filter(|job| job.is_in_flight()) {
            let signaled = match job.fence.as_mut() {
                Some(fence) => fence.poll(gl) == FenceStatus::Signaled,
                None => false,
            };
            if signaled {
                job.mark_completed(gl);
            }
        }
    }

    /// Removes completed jobs from the front of the queue and returns their
    /// ids in submission order. Stops at the first unfinished job.
    pub fn drain_completed(&mut self) -> Vec<u64> {
        let mut completed = Vec::new();
        while self.jobs.front().map_or(false, |job| job.completed) {
            if let Some(job) = self.jobs.pop_front() {
                completed.push(job.id);
            }
        }
        completed
    }

    /// `true` when the job is completed or no longer in the queue (unknown
    /// ids are treated as already drained).
    pub fn is_complete(&self, job_id: u64) -> bool {
        self.jobs
            .iter()
            .find(|j| j.id == job_id)
            .map_or(true, |j| j.completed)
    }

    /// Blocks up to `timeout` for one job.
    ///
    /// Returns `true` when the job is complete or unknown, and `false` when
    /// the wait did not end in a signaled fence or the job has not been
    /// dispatched yet (there is no fence to wait on).
    pub fn wait_for(
        &mut self,
        gl: &glow::Context,
        job_id: u64,
        timeout: Duration,
    ) -> bool {
        let job = match self.jobs.iter_mut().find(|j| j.id == job_id) {
            Some(job) => job,
            None => return true,
        };
        if job.completed {
            return true;
        }
        let signaled = match job.fence.as_mut() {
            Some(fence) => fence.wait(gl, timeout) == FenceStatus::Signaled,
            None => false,
        };
        if signaled {
            job.mark_completed(gl);
        }
        signaled
    }

    /// Jobs submitted but not yet dispatched.
    pub fn pending_count(&self) -> usize {
        self.jobs.iter().filter(|j| !j.dispatched).count()
    }

    /// Jobs dispatched but not yet observed as completed.
    pub fn in_flight_count(&self) -> usize {
        self.jobs.iter().filter(|j| j.is_in_flight()).count()
    }

    /// All jobs still held by the queue, in any state.
    pub fn total_count(&self) -> usize {
        self.jobs.len()
    }

    /// Consumes the queue and deletes every fence still attached to a job.
    pub fn destroy(self, gl: &glow::Context) {
        for job in self.jobs {
            if let Some(fence) = job.fence {
                fence.destroy(gl);
            }
        }
    }
}
/// Ring of per-frame contexts used to keep the CPU at most `ring_size`
/// frames ahead of the GPU.
pub struct FrameTimeline {
    frames: Vec<FrameContext>,
    /// Index of the slot used by the frame currently being recorded.
    current: usize,
    /// Number of frames ended so far; also the number of the next frame.
    total_frames: u64,
}

/// State of one ring slot.
struct FrameContext {
    /// Fence inserted by `end_frame`; `NotInserted` while the slot's frame is
    /// still being recorded.
    fence: FenceSync,
    frame_number: u64,
    transient_resources: Vec<u32>,
    /// When `begin_frame` was called for the slot's current frame.
    begin_time: Option<Instant>,
    /// When the CPU first observed the slot's fence as signaled.
    complete_time: Option<Instant>,
    /// `complete_time - begin_time` of the most recent completed frame in
    /// this slot; survives the slot being reused.
    last_latency: Option<Duration>,
}

impl FrameContext {
    /// Records completion time and latency the first time the fence of the
    /// slot's current frame is seen signaled.
    fn note_completion(&mut self) {
        if self.complete_time.is_some() || !self.fence.is_signaled() {
            return;
        }
        let now = Instant::now();
        self.complete_time = Some(now);
        if let Some(begin) = self.begin_time {
            self.last_latency = Some(now.duration_since(begin));
        }
    }
}

impl FrameTimeline {
    /// Creates a ring with `ring_size` slots.
    ///
    /// A `ring_size` of 0 is raised to 1: an empty ring would make
    /// `begin_frame` index out of bounds and `end_frame` divide by zero.
    pub fn new(ring_size: usize) -> Self {
        let frames = (0..ring_size.max(1))
            .map(|_| FrameContext {
                fence: FenceSync::new(),
                frame_number: 0,
                transient_resources: Vec::new(),
                begin_time: None,
                complete_time: None,
                last_latency: None,
            })
            .collect();
        Self {
            frames,
            current: 0,
            total_frames: 0,
        }
    }

    /// Starts recording a new frame in the current slot.
    ///
    /// Waits for the frame that previously used the slot, records its
    /// latency, then retires its fence so the slot reads as `NotInserted`
    /// until `end_frame` inserts the fence for the new frame.
    pub fn begin_frame(&mut self, gl: &glow::Context) {
        let ctx = &mut self.frames[self.current];
        if ctx.fence.status() == FenceStatus::Unsignaled {
            ctx.fence.wait_forever(gl);
        }
        // Must happen before the timestamps are reset below; stamping the
        // completion after the new begin time would never yield a positive
        // latency.
        ctx.note_completion();
        std::mem::take(&mut ctx.fence).destroy(gl);
        ctx.transient_resources.clear();
        ctx.frame_number = self.total_frames;
        ctx.begin_time = Some(Instant::now());
        ctx.complete_time = None;
    }

    /// Inserts the fence for the current frame and advances to the next slot.
    pub fn end_frame(&mut self, gl: &glow::Context) {
        self.frames[self.current].fence.insert(gl);
        self.current = (self.current + 1) % self.frames.len();
        self.total_frames += 1;
    }

    /// Attaches a transient resource id to the frame being recorded. The list
    /// is cleared when the slot is reused.
    pub fn register_transient(&mut self, resource_id: u32) {
        self.frames[self.current]
            .transient_resources
            .push(resource_id);
    }

    /// Number of the frame that is being (or will next be) recorded.
    pub fn current_frame_number(&self) -> u64 {
        self.total_frames
    }

    /// Number of slots in the ring.
    pub fn ring_size(&self) -> usize {
        self.frames.len()
    }

    /// Index of the slot used by the current frame.
    pub fn current_slot(&self) -> usize {
        self.current
    }

    /// Reports whether the GPU has finished `frame_number`.
    ///
    /// Frames that have not been ended yet (including the one currently being
    /// recorded) are never complete. Frames that no longer occupy a slot have
    /// been waited for when their slot was reused and count as complete.
    pub fn is_frame_complete(&mut self, gl: &glow::Context, frame_number: u64) -> bool {
        if frame_number >= self.total_frames {
            return false;
        }
        for ctx in self.frames.iter_mut() {
            if ctx.frame_number == frame_number {
                if !ctx.fence.is_signaled() {
                    ctx.fence.poll(gl);
                }
                ctx.note_completion();
                return ctx.fence.is_signaled();
            }
        }
        true
    }

    /// Blocks until every slot with a pending fence has signaled (or its wait
    /// timed out).
    pub fn wait_all(&mut self, gl: &glow::Context) {
        for ctx in self.frames.iter_mut() {
            if ctx.fence.status() == FenceStatus::Unsignaled {
                ctx.fence.wait_forever(gl);
            }
            ctx.note_completion();
        }
    }

    /// Mean latency over the slots that have a completed frame on record, or
    /// `None` when no frame has been observed complete yet.
    ///
    /// Latency is measured from `begin_frame` to the moment the CPU first saw
    /// the frame's fence signaled, so it is an upper bound on GPU latency.
    pub fn average_latency(&self) -> Option<Duration> {
        let (total, count) = self
            .frames
            .iter()
            .filter_map(|ctx| ctx.last_latency)
            .fold((Duration::ZERO, 0u32), |(sum, n), latency| {
                (sum + latency, n + 1)
            });
        if count > 0 {
            Some(total / count)
        } else {
            None
        }
    }

    /// Consumes the timeline and deletes every slot's fence.
    pub fn destroy(self, gl: &glow::Context) {
        for ctx in self.frames {
            ctx.fence.destroy(gl);
        }
    }
}
/// CPU implementations of the compute kernels, with per-kernel timing.
pub struct CpuFallback {
// Flag toggled by `activate` / `deactivate`; the kernels themselves do not check it.
active: bool,
// Duration of the most recent run of each kernel in microseconds, keyed by kernel name.
last_execution_us: HashMap<String, u64>,
}
impl CpuFallback {
/// Creates an inactive fallback with no recorded timings.
pub fn new() -> Self {
Self {
active: false,
last_execution_us: HashMap::new(),
}
}
/// Sets the `active` flag.
pub fn activate(&mut self) {
self.active = true;
}
/// Clears the `active` flag.
pub fn deactivate(&mut self) {
self.active = false;
}
/// Returns the current value of the `active` flag.
pub fn is_active(&self) -> bool {
self.active
}
/// Advances every particle by one time step.
///
/// Layout used by the code: `positions[i]` is `[x, y, z, age]` and
/// `velocities[i]` is `[vx, vy, vz, lifetime]`. A particle whose new age
/// reaches its lifetime or `params.max_age` is retired: its age is set
/// past its lifetime and its velocity zeroed. Live particles get gravity
/// and wind applied, then damping (`damping^dt`), then the position
/// update.
///
/// # Panics
/// Panics if `velocities` is shorter than `positions`.
pub fn particle_integrate(
    &mut self,
    positions: &mut [[f32; 4]],
    velocities: &mut [[f32; 4]],
    params: &super::kernels::ParticleIntegrateParams,
) {
    let timer = Instant::now();
    let dt = params.dt;
    let gravity = params.gravity;
    let damping = params.damping;
    let max_age = params.max_age;
    let wind = params.wind;
    for (slot, position) in positions.iter_mut().enumerate() {
        let velocity = &mut velocities[slot];
        let age = position[3] + dt;
        let lifetime = velocity[3];
        let expired = age >= lifetime || age >= max_age;
        if expired {
            position[3] = lifetime + 1.0;
            for axis in 0..3 {
                velocity[axis] = 0.0;
            }
            continue;
        }
        let drag = damping.powf(dt);
        for axis in 0..3 {
            // Same operation order per axis as separate passes would give:
            // gravity, wind, damping, then the position update.
            let mut v = velocity[axis];
            v += gravity[axis] * dt;
            v += wind[axis] * dt;
            v *= drag;
            velocity[axis] = v;
            position[axis] += v * dt;
        }
        position[3] = age;
    }
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("particle_integrate".to_string(), micros);
}
/// Advances every point one step of size `dt` along the Lorenz system
/// using the classic fourth-order Runge-Kutta scheme.
///
/// The system is `dx = sigma (y - x)`, `dy = x (rho - z) - y`,
/// `dz = x y - beta z`.
pub fn lorenz_step(
    &mut self,
    points: &mut [[f32; 3]],
    sigma: f32,
    rho: f32,
    beta: f32,
    dt: f32,
) {
    let timer = Instant::now();
    // Right-hand side of the Lorenz equations at state `s`.
    let derivative = |s: &[f32; 3]| -> [f32; 3] {
        [
            sigma * (s[1] - s[0]),
            s[0] * (rho - s[2]) - s[1],
            s[0] * s[1] - beta * s[2],
        ]
    };
    // `origin + step * slope`, component-wise.
    let advance = |origin: &[f32; 3], step: f32, slope: &[f32; 3]| -> [f32; 3] {
        [
            origin[0] + step * slope[0],
            origin[1] + step * slope[1],
            origin[2] + step * slope[2],
        ]
    };
    let half_dt = 0.5 * dt;
    let sixth_dt = dt / 6.0;
    for p in points.iter_mut() {
        let k1 = derivative(&*p);
        let k2 = derivative(&advance(&*p, half_dt, &k1));
        let k3 = derivative(&advance(&*p, half_dt, &k2));
        let k4 = derivative(&advance(&*p, dt, &k3));
        for axis in 0..3 {
            p[axis] += sixth_dt * (k1[axis] + 2.0 * k2[axis] + 2.0 * k3[axis] + k4[axis]);
        }
    }
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("lorenz_step".to_string(), micros);
}
/// Applies one `z = z^2 + c` step to every element that has neither hit
/// `max_iter` nor escaped (`|z|^2 >= 4`), incrementing its iteration
/// counter.
///
/// # Panics
/// Panics if the five slices do not all have the same length.
pub fn mandelbrot_iterate(
    &mut self,
    z_re: &mut [f32],
    z_im: &mut [f32],
    c_re: &[f32],
    c_im: &[f32],
    iterations: &mut [u32],
    max_iter: u32,
) {
    let timer = Instant::now();
    assert_eq!(z_re.len(), z_im.len());
    assert_eq!(z_re.len(), c_re.len());
    assert_eq!(z_re.len(), c_im.len());
    assert_eq!(z_re.len(), iterations.len());
    let count = z_re.len();
    for i in 0..count {
        let (re, im) = (z_re[i], z_im[i]);
        let finished = iterations[i] >= max_iter;
        // Written as `>=` so a NaN magnitude does not count as escaped.
        let escaped = re * re + im * im >= 4.0;
        if finished || escaped {
            continue;
        }
        z_re[i] = re * re - im * im + c_re[i];
        z_im[i] = 2.0 * re * im + c_im[i];
        iterations[i] += 1;
    }
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("mandelbrot_iterate".to_string(), micros);
}
/// Applies one `z = z^2 + c` step (constant `c`) to every element that has
/// neither hit `max_iter` nor escaped (`|z|^2 >= 4`), incrementing its
/// iteration counter.
///
/// # Panics
/// Panics if `z_re`, `z_im` and `iterations` differ in length. The check
/// happens up front, so a mismatch can no longer abort the loop after
/// part of the data was already updated.
pub fn julia_iterate(
    &mut self,
    z_re: &mut [f32],
    z_im: &mut [f32],
    c_re: f32,
    c_im: f32,
    iterations: &mut [u32],
    max_iter: u32,
) {
    let start = Instant::now();
    assert_eq!(z_re.len(), z_im.len());
    assert_eq!(z_re.len(), iterations.len());
    for i in 0..z_re.len() {
        if iterations[i] >= max_iter {
            continue;
        }
        let zr = z_re[i];
        let zi = z_im[i];
        if zr * zr + zi * zi >= 4.0 {
            continue;
        }
        z_re[i] = zr * zr - zi * zi + c_re;
        z_im[i] = 2.0 * zr * zi + c_im;
        iterations[i] += 1;
    }
    let elapsed = start.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("julia_iterate".to_string(), elapsed);
}
/// Replaces `data` with its exclusive prefix sum (element `i` becomes the
/// sum of all original elements before `i`).
///
/// Sums wrap around on `u32` overflow in every build profile, instead of
/// panicking in debug builds and wrapping in release builds. Empty input
/// is a no-op that still records a timing entry. The timing is stored
/// under the key `"prefix_sum"`.
pub fn prefix_sum_exclusive(&mut self, data: &mut [u32]) {
    let start = Instant::now();
    let mut running = 0u32;
    for slot in data.iter_mut() {
        let value = *slot;
        *slot = running;
        running = running.wrapping_add(value);
    }
    let elapsed = start.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("prefix_sum".to_string(), elapsed);
}
/// Replaces `data` with its inclusive prefix sum (element `i` becomes the
/// sum of the original elements `0..=i`).
///
/// Sums wrap around on `u32` overflow in every build profile, instead of
/// panicking in debug builds and wrapping in release builds. Empty input
/// is a no-op that still records a timing entry.
pub fn prefix_sum_inclusive(&mut self, data: &mut [u32]) {
    let start = Instant::now();
    let mut running = 0u32;
    for slot in data.iter_mut() {
        running = running.wrapping_add(*slot);
        *slot = running;
    }
    let elapsed = start.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("prefix_sum_inclusive".to_string(), elapsed);
}
/// Sorts `keys` ascending with a stable least-significant-digit radix sort
/// (four passes of 8 bits) and permutes `values` the same way.
///
/// Empty input returns early without recording a timing.
///
/// # Panics
/// Panics if `keys` and `values` differ in length.
pub fn radix_sort(&mut self, keys: &mut [u32], values: &mut [u32]) {
    let timer = Instant::now();
    assert_eq!(keys.len(), values.len());
    let len = keys.len();
    if len == 0 {
        return;
    }
    const BUCKETS: usize = 256;
    let mut key_scratch = vec![0u32; len];
    let mut value_scratch = vec![0u32; len];
    let mut offsets = vec![0usize; BUCKETS];
    for pass in 0..4u32 {
        let shift = pass * 8;
        let bucket_of = |key: u32| ((key >> shift) & 0xFF) as usize;

        // Histogram of the current digit.
        offsets.iter_mut().for_each(|slot| *slot = 0);
        for &key in keys.iter() {
            offsets[bucket_of(key)] += 1;
        }

        // Exclusive scan: each bucket now holds its first output index.
        let mut next_free = 0usize;
        for slot in offsets.iter_mut() {
            let occupancy = *slot;
            *slot = next_free;
            next_free += occupancy;
        }

        // Stable scatter into the scratch buffers.
        for (&key, &value) in keys.iter().zip(values.iter()) {
            let bucket = bucket_of(key);
            let dest = offsets[bucket];
            key_scratch[dest] = key;
            value_scratch[dest] = value;
            offsets[bucket] += 1;
        }

        keys.copy_from_slice(&key_scratch);
        values.copy_from_slice(&value_scratch);
    }
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("radix_sort".to_string(), micros);
}
/// Returns the indices of the bounding spheres that are not entirely
/// behind any of the six planes.
///
/// Each plane is `[a, b, c, d]` and a sphere is rejected when
/// `a*x + b*y + c*z + d < -radius` for some plane. Only as many spheres
/// as the shorter of `positions` and `radii` are examined.
pub fn frustum_cull(
    &mut self,
    positions: &[[f32; 3]],
    radii: &[f32],
    planes: &[[f32; 4]; 6],
) -> Vec<usize> {
    let timer = Instant::now();
    let visible: Vec<usize> = positions
        .iter()
        .zip(radii)
        .enumerate()
        .filter_map(|(index, (center, &radius))| {
            let culled = planes.iter().any(|plane| {
                let signed_distance = plane[0] * center[0]
                    + plane[1] * center[1]
                    + plane[2] * center[2]
                    + plane[3];
                signed_distance < -radius
            });
            if culled {
                None
            } else {
                Some(index)
            }
        })
        .collect();
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("frustum_cull".to_string(), micros);
    visible
}
/// Linear blend skinning on the CPU.
///
/// For every vertex the position and normal are transformed by
/// `bone_matrices[b] * inv_bind_matrices[b]` for each of its (up to four)
/// influences and blended by weight; the blended normal is re-normalised
/// unless it is close to zero length.
///
/// Influences with a weight `<= 0` are skipped. Influences whose bone
/// index is out of range for *either* matrix slice are skipped as well,
/// so a shorter `inv_bind_matrices` no longer panics.
///
/// The per-bone matrix product is computed once up front rather than once
/// per vertex influence.
///
/// # Panics
/// Panics if `normals`, `bone_indices`, `bone_weights`, `out_positions`
/// or `out_normals` is shorter than `positions`.
pub fn skin_vertices(
    &mut self,
    positions: &[[f32; 3]],
    normals: &[[f32; 3]],
    bone_indices: &[[u32; 4]],
    bone_weights: &[[f32; 4]],
    bone_matrices: &[[f32; 16]],
    inv_bind_matrices: &[[f32; 16]],
    out_positions: &mut [[f32; 3]],
    out_normals: &mut [[f32; 3]],
) {
    let start = Instant::now();
    // `zip` stops at the shorter slice, so every entry here has both a
    // bone matrix and an inverse bind matrix.
    let skin_matrices: Vec<[f32; 16]> = bone_matrices
        .iter()
        .zip(inv_bind_matrices)
        .map(|(bone, inv_bind)| mat4_mul(bone, inv_bind))
        .collect();
    for i in 0..positions.len() {
        let pos = positions[i];
        let norm = normals[i];
        let indices = bone_indices[i];
        let weights = bone_weights[i];
        let mut skinned_pos = [0.0f32; 3];
        let mut skinned_norm = [0.0f32; 3];
        for (&bone_index, &w) in indices.iter().zip(weights.iter()) {
            if w <= 0.0 {
                continue;
            }
            let mat = match skin_matrices.get(bone_index as usize) {
                Some(mat) => mat,
                None => continue,
            };
            let tp = mat4_transform_point(mat, &pos);
            let tn = mat4_transform_normal(mat, &norm);
            for axis in 0..3 {
                skinned_pos[axis] += tp[axis] * w;
                skinned_norm[axis] += tn[axis] * w;
            }
        }
        let len = (skinned_norm[0] * skinned_norm[0]
            + skinned_norm[1] * skinned_norm[1]
            + skinned_norm[2] * skinned_norm[2])
            .sqrt();
        // Leave near-zero normals untouched to avoid dividing by ~0.
        if len > 1e-6 {
            skinned_norm[0] /= len;
            skinned_norm[1] /= len;
            skinned_norm[2] /= len;
        }
        out_positions[i] = skinned_pos;
        out_normals[i] = skinned_norm;
    }
    let elapsed = start.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("skinning".to_string(), elapsed);
}
/// Runs `iterations` smoothing passes over a `width` x `height` row-major
/// grid.
///
/// Each pass writes
/// `(cell + alpha * (left + right + down + up)) / (1 + 4 * alpha)` into
/// `scratch`, with `alpha = diffusion_rate * dt` (unit cell size), and
/// then copies `scratch` back into `grid`. At the border a missing
/// neighbour is replaced by the cell's own value.
///
/// # Panics
/// Panics if `grid` or `scratch` holds fewer than `width * height`
/// elements, or if the two slices differ in length.
pub fn fluid_diffuse(
    &mut self,
    grid: &mut [f32],
    scratch: &mut [f32],
    width: usize,
    height: usize,
    diffusion_rate: f32,
    dt: f32,
    iterations: usize,
) {
    let timer = Instant::now();
    let cell_size = 1.0f32;
    let alpha = diffusion_rate * dt / (cell_size * cell_size);
    let r_beta = 1.0 / (1.0 + 4.0 * alpha);
    for _pass in 0..iterations {
        for row in 0..height {
            for col in 0..width {
                let cell = row * width + col;
                let here = grid[cell];
                let left = if col == 0 { here } else { grid[cell - 1] };
                let right = if col + 1 >= width { here } else { grid[cell + 1] };
                let down = if row == 0 { here } else { grid[cell - width] };
                let up = if row + 1 >= height { here } else { grid[cell + width] };
                scratch[cell] = (here + alpha * (left + right + down + up)) * r_beta;
            }
        }
        grid.copy_from_slice(scratch);
    }
    let micros = timer.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("fluid_diffuse".to_string(), micros);
}
/// Histogram-equalises `data` in place over the value range
/// `[min_val, max_val]` using `bin_count` bins.
///
/// Values are clamped into the range, binned, and replaced by the
/// cumulative distribution of their bin scaled back into the range.
///
/// The call is a no-op (and records no timing) when `data` is empty,
/// when `bin_count` is 0, or when the range is not strictly positive,
/// which includes a NaN range. Previously `bin_count == 0` underflowed
/// `bin_count - 1`.
pub fn histogram_equalize(
    &mut self,
    data: &mut [f32],
    bin_count: usize,
    min_val: f32,
    max_val: f32,
) {
    let start = Instant::now();
    let range = max_val - min_val;
    // `!(range > 0.0)` rather than `range <= 0.0` so NaN is rejected too.
    if bin_count == 0 || !(range > 0.0) || data.is_empty() {
        return;
    }
    let top_bin = bin_count - 1;
    // Maps a sample to its bin index; shared by both passes.
    let bin_of = |v: f32| -> usize {
        let norm = ((v - min_val) / range).clamp(0.0, 1.0);
        ((norm * top_bin as f32) as usize).min(top_bin)
    };
    let mut histogram = vec![0u32; bin_count];
    for &v in data.iter() {
        histogram[bin_of(v)] += 1;
    }
    let sample_count = data.len() as f32;
    let mut running = 0u32;
    let cdf: Vec<f32> = histogram
        .iter()
        .map(|&count| {
            running += count;
            running as f32 / sample_count
        })
        .collect();
    for v in data.iter_mut() {
        *v = cdf[bin_of(*v)] * range + min_val;
    }
    let elapsed = start.elapsed().as_micros() as u64;
    self.last_execution_us
        .insert("histogram_equalize".to_string(), elapsed);
}
/// Duration in microseconds of the most recent run of the kernel recorded
/// under `name`, or `None` if nothing was recorded under that key.
pub fn last_execution_us(&self, name: &str) -> Option<u64> {
self.last_execution_us.get(name).copied()
}
/// Renders all recorded timings as text: a header line followed by one
/// line per kernel, sorted by kernel name.
pub fn summary(&self) -> String {
    let mut timings: Vec<(&str, u64)> = self
        .last_execution_us
        .iter()
        .map(|(name, us)| (name.as_str(), *us))
        .collect();
    // Names are map keys and therefore unique, so the order is fully
    // determined by the name.
    timings.sort_by(|a, b| a.0.cmp(b.0));
    let mut report = String::from("=== CPU Fallback Timings ===\n");
    for (name, us) in timings {
        report.push_str(&format!(" {}: {} us\n", name, us));
    }
    report
}
}
/// `Default` is the same as [`CpuFallback::new`].
impl Default for CpuFallback {
fn default() -> Self {
Self::new()
}
}
/// Multiplies two 4x4 matrices stored column-major (`m[col * 4 + row]`)
/// and returns `a * b`.
fn mat4_mul(a: &[f32; 16], b: &[f32; 16]) -> [f32; 16] {
    let mut product = [0.0f32; 16];
    for (slot, entry) in product.iter_mut().enumerate() {
        let (col, row) = (slot / 4, slot % 4);
        // Row `row` of `a` dotted with column `col` of `b`, accumulated in
        // ascending `k` starting from 0.0.
        *entry = (0..4).fold(0.0, |acc, k| acc + a[k * 4 + row] * b[col * 4 + k]);
    }
    product
}
/// Transforms point `p` by the column-major matrix `m`, including the
/// translation column (an implicit `w = 1`). The fourth output row is not
/// computed, so no perspective divide takes place.
fn mat4_transform_point(m: &[f32; 16], p: &[f32; 3]) -> [f32; 3] {
    let [x, y, z] = *p;
    let row = |r: usize| m[r] * x + m[r + 4] * y + m[r + 8] * z + m[r + 12];
    [row(0), row(1), row(2)]
}
/// Transforms direction `n` by the upper-left 3x3 part of the column-major
/// matrix `m`; the translation column is ignored.
///
/// The matrix is applied as-is (no inverse-transpose) and the result is
/// not normalised.
fn mat4_transform_normal(m: &[f32; 16], n: &[f32; 3]) -> [f32; 3] {
    let [x, y, z] = *n;
    let row = |r: usize| m[r] * x + m[r + 4] * y + m[r + 8] * z;
    [row(0), row(1), row(2)]
}
/// Compute-shader related limits reported by the GL context.
#[derive(Debug, Clone)]
pub struct ComputeCapabilities {
    /// `true` when the context version is at least 4.3.
    pub has_compute: bool,
    /// Maximum total invocations in one work group.
    pub max_work_group_invocations: u32,
    /// Maximum local size of a work group per axis.
    pub max_work_group_size: [u32; 3],
    /// Maximum number of work groups per dispatch, per axis.
    pub max_work_group_count: [u32; 3],
    /// Shared memory available to a work group, in bytes.
    pub max_shared_memory: u32,
    pub max_ssbo_bindings: u32,
    pub max_atomic_counter_bindings: u32,
    pub gl_version_major: u32,
    pub gl_version_minor: u32,
}

impl ComputeCapabilities {
    /// Queries the context version and, when compute is available, the
    /// compute limits. Without compute support all limits are reported as 0.
    ///
    /// Negative values returned by GL are clamped to 0 rather than being
    /// reinterpreted as huge unsigned numbers.
    pub fn query(gl: &glow::Context) -> Self {
        use glow::HasContext;

        const MAX_COMPUTE_WORK_GROUP_INVOCATIONS: u32 = 0x90EB;
        const MAX_COMPUTE_WORK_GROUP_COUNT: u32 = 0x91BE;
        const MAX_COMPUTE_WORK_GROUP_SIZE: u32 = 0x91BF;
        const MAX_COMPUTE_SHARED_MEMORY_SIZE: u32 = 0x8262;
        // NOTE(review): the two enums below are the *combined* block/buffer
        // limits, not the binding-point counts the field names suggest
        // (those would be 0x90DD and 0x92DC). Kept as in the original query;
        // confirm which limit callers expect.
        const MAX_COMBINED_SHADER_STORAGE_BLOCKS: u32 = 0x90DC;
        const MAX_COMBINED_ATOMIC_COUNTER_BUFFERS: u32 = 0x92D1;

        fn non_negative(value: i32) -> u32 {
            value.max(0) as u32
        }
        // SAFETY (both closures): plain GL state queries; require `gl` to be
        // current on this thread.
        let scalar = |pname: u32| non_negative(unsafe { gl.get_parameter_i32(pname) });
        let per_axis = |pname: u32| {
            [
                non_negative(unsafe { gl.get_parameter_indexed_i32(pname, 0) }),
                non_negative(unsafe { gl.get_parameter_indexed_i32(pname, 1) }),
                non_negative(unsafe { gl.get_parameter_indexed_i32(pname, 2) }),
            ]
        };

        let major = scalar(glow::MAJOR_VERSION);
        let minor = scalar(glow::MINOR_VERSION);
        let has_compute = major > 4 || (major == 4 && minor >= 3);
        if !has_compute {
            return Self {
                has_compute: false,
                max_work_group_invocations: 0,
                max_work_group_size: [0; 3],
                max_work_group_count: [0; 3],
                max_shared_memory: 0,
                max_ssbo_bindings: 0,
                max_atomic_counter_bindings: 0,
                gl_version_major: major,
                gl_version_minor: minor,
            };
        }
        Self {
            has_compute,
            max_work_group_invocations: scalar(MAX_COMPUTE_WORK_GROUP_INVOCATIONS),
            // SIZE (0x91BF) is the per-group local size limit and COUNT
            // (0x91BE) the per-dispatch group count limit; these two enums
            // were previously swapped.
            max_work_group_size: per_axis(MAX_COMPUTE_WORK_GROUP_SIZE),
            max_work_group_count: per_axis(MAX_COMPUTE_WORK_GROUP_COUNT),
            max_shared_memory: scalar(MAX_COMPUTE_SHARED_MEMORY_SIZE),
            max_ssbo_bindings: scalar(MAX_COMBINED_SHADER_STORAGE_BLOCKS),
            max_atomic_counter_bindings: scalar(MAX_COMBINED_ATOMIC_COUNTER_BUFFERS),
            gl_version_major: major,
            gl_version_minor: minor,
        }
    }

    /// `true` when `size` respects the per-axis work group size limits and
    /// the total invocation limit.
    pub fn validate_workgroup(&self, size: &super::dispatch::WorkgroupSize) -> bool {
        size.x <= self.max_work_group_size[0]
            && size.y <= self.max_work_group_size[1]
            && size.z <= self.max_work_group_size[2]
            && size.total_invocations() <= self.max_work_group_invocations
    }

    /// One-line human-readable description of the capabilities.
    pub fn summary(&self) -> String {
        if !self.has_compute {
            return format!(
                "GL {}.{}: NO compute support (requires 4.3+)",
                self.gl_version_major, self.gl_version_minor
            );
        }
        format!(
            "GL {}.{}: compute OK, max_invocations={}, max_size=[{},{},{}], max_shared={}KB, ssbo_bindings={}, atomic_bindings={}",
            self.gl_version_major, self.gl_version_minor,
            self.max_work_group_invocations,
            self.max_work_group_size[0], self.max_work_group_size[1], self.max_work_group_size[2],
            self.max_shared_memory / 1024,
            self.max_ssbo_bindings,
            self.max_atomic_counter_bindings,
        )
    }
}