// proof_engine/compute/sync.rs

1//! Compute-to-render synchronization: fences, memory barriers, async compute queue,
2//! frame timeline, CPU fallback, resource state machine.
3
4use std::collections::{HashMap, VecDeque};
5use std::time::{Duration, Instant};
6
// ---------------------------------------------------------------------------
// GL constants
//
// Numeric values mirror the OpenGL 4.x core headers; they are declared
// locally so this module does not depend on a GL header crate.
// ---------------------------------------------------------------------------

// Fence/sync constants (glFenceSync / glClientWaitSync).
const GL_SYNC_GPU_COMMANDS_COMPLETE: u32 = 0x9117;
const GL_ALREADY_SIGNALED: u32 = 0x911A;
const GL_TIMEOUT_EXPIRED: u32 = 0x911B;
const GL_CONDITION_SATISFIED: u32 = 0x911C;
const GL_WAIT_FAILED: u32 = 0x911D;
const GL_SYNC_FLUSH_COMMANDS_BIT: u32 = 0x00000001;

// Barrier bits (glMemoryBarrier).
const GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT: u32 = 0x00000001;
const GL_ELEMENT_ARRAY_BARRIER_BIT: u32 = 0x00000002;
const GL_UNIFORM_BARRIER_BIT: u32 = 0x00000004;
const GL_TEXTURE_FETCH_BARRIER_BIT: u32 = 0x00000008;
const GL_SHADER_IMAGE_ACCESS_BARRIER_BIT: u32 = 0x00000020;
const GL_COMMAND_BARRIER_BIT: u32 = 0x00000040;
const GL_PIXEL_BUFFER_BARRIER_BIT: u32 = 0x00000080;
const GL_TEXTURE_UPDATE_BARRIER_BIT: u32 = 0x00000100;
const GL_BUFFER_UPDATE_BARRIER_BIT: u32 = 0x00000200;
const GL_FRAMEBUFFER_BARRIER_BIT: u32 = 0x00000400;
const GL_TRANSFORM_FEEDBACK_BARRIER_BIT: u32 = 0x00000800;
const GL_ATOMIC_COUNTER_BARRIER_BIT: u32 = 0x00001000;
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
// "All bits set" sentinel accepted by glMemoryBarrier.
const GL_ALL_BARRIER_BITS: u32 = 0xFFFFFFFF;
33
34// ---------------------------------------------------------------------------
35// FenceSync
36// ---------------------------------------------------------------------------
37
/// Status of a GPU fence, as last observed by [`FenceSync::poll`] or
/// [`FenceSync::wait`]. The cached value may be stale until refreshed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FenceStatus {
    /// Inserted but not yet signaled by the GPU.
    Unsignaled,
    /// Already signaled — GPU work is complete.
    Signaled,
    /// A blocking wait ran out of time before the fence signaled.
    TimedOut,
    /// Wait failed (GL error).
    Failed,
    /// Fence has not been inserted into the command stream yet.
    NotInserted,
}
52
/// A GPU fence for synchronizing compute and render work.
///
/// Insert a fence after issuing GPU commands; then poll or wait for it
/// to determine when the GPU has finished processing those commands.
pub struct FenceSync {
    /// Underlying GL sync object; `None` until `insert` is called.
    sync: Option<glow::NativeFence>,
    /// Last observed status (refreshed by `poll`/`wait`).
    status: FenceStatus,
    /// Wall-clock time at which the fence was most recently inserted.
    inserted_at: Option<Instant>,
}
62
63impl FenceSync {
64    /// Create a new fence (not yet inserted).
65    pub fn new() -> Self {
66        Self {
67            sync: None,
68            status: FenceStatus::NotInserted,
69            inserted_at: None,
70        }
71    }
72
73    /// Insert a fence into the GL command stream.
74    pub fn insert(&mut self, gl: &glow::Context) {
75        use glow::HasContext;
76        // Delete old fence if any
77        if let Some(old) = self.sync.take() {
78            unsafe {
79                gl.delete_sync(old);
80            }
81        }
82        let sync = unsafe { gl.fence_sync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0).unwrap() };
83        self.sync = Some(sync);
84        self.status = FenceStatus::Unsignaled;
85        self.inserted_at = Some(Instant::now());
86    }
87
88    /// Poll the fence without blocking. Returns current status.
89    pub fn poll(&mut self, gl: &glow::Context) -> FenceStatus {
90        if let Some(sync) = self.sync {
91            use glow::HasContext;
92            let result = unsafe { gl.client_wait_sync(sync, 0, 0) };
93            self.status = match result {
94                GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
95                GL_TIMEOUT_EXPIRED => FenceStatus::Unsignaled,
96                GL_WAIT_FAILED => FenceStatus::Failed,
97                _ => FenceStatus::Unsignaled,
98            };
99        }
100        self.status
101    }
102
103    /// Wait for the fence with a timeout. Returns status after wait.
104    pub fn wait(&mut self, gl: &glow::Context, timeout: Duration) -> FenceStatus {
105        if let Some(sync) = self.sync {
106            use glow::HasContext;
107            let timeout_ns = timeout.as_nanos() as u64;
108            let result = unsafe {
109                gl.client_wait_sync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, timeout_ns as i32)
110            };
111            self.status = match result {
112                GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
113                GL_TIMEOUT_EXPIRED => FenceStatus::TimedOut,
114                GL_WAIT_FAILED => FenceStatus::Failed,
115                _ => FenceStatus::Unsignaled,
116            };
117        }
118        self.status
119    }
120
121    /// Block until the fence is signaled (infinite wait).
122    pub fn wait_forever(&mut self, gl: &glow::Context) -> FenceStatus {
123        self.wait(gl, Duration::from_secs(30)) // 30s practical "infinity"
124    }
125
126    /// Current status (may be stale — call poll() to refresh).
127    pub fn status(&self) -> FenceStatus {
128        self.status
129    }
130
131    /// Whether the fence has been signaled.
132    pub fn is_signaled(&self) -> bool {
133        self.status == FenceStatus::Signaled
134    }
135
136    /// How long ago the fence was inserted (wall-clock, not GPU time).
137    pub fn elapsed_since_insert(&self) -> Option<Duration> {
138        self.inserted_at.map(|t| t.elapsed())
139    }
140
141    /// Destroy the fence.
142    pub fn destroy(self, gl: &glow::Context) {
143        if let Some(sync) = self.sync {
144            use glow::HasContext;
145            unsafe {
146                gl.delete_sync(sync);
147            }
148        }
149    }
150}
151
152impl Default for FenceSync {
153    fn default() -> Self {
154        Self::new()
155    }
156}
157
158// ---------------------------------------------------------------------------
159// MemoryBarrierFlags
160// ---------------------------------------------------------------------------
161
/// Flags for glMemoryBarrier, wrapped for type safety.
///
/// Combine flag sets with `|` (or [`MemoryBarrierFlags::combine`]) and test
/// membership with [`MemoryBarrierFlags::contains`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MemoryBarrierFlags(pub u32);
165
166impl MemoryBarrierFlags {
167    pub const VERTEX_ATTRIB: Self = Self(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);
168    pub const ELEMENT_ARRAY: Self = Self(GL_ELEMENT_ARRAY_BARRIER_BIT);
169    pub const UNIFORM: Self = Self(GL_UNIFORM_BARRIER_BIT);
170    pub const TEXTURE_FETCH: Self = Self(GL_TEXTURE_FETCH_BARRIER_BIT);
171    pub const SHADER_IMAGE: Self = Self(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
172    pub const COMMAND: Self = Self(GL_COMMAND_BARRIER_BIT);
173    pub const PIXEL_BUFFER: Self = Self(GL_PIXEL_BUFFER_BARRIER_BIT);
174    pub const TEXTURE_UPDATE: Self = Self(GL_TEXTURE_UPDATE_BARRIER_BIT);
175    pub const BUFFER_UPDATE: Self = Self(GL_BUFFER_UPDATE_BARRIER_BIT);
176    pub const FRAMEBUFFER: Self = Self(GL_FRAMEBUFFER_BARRIER_BIT);
177    pub const TRANSFORM_FEEDBACK: Self = Self(GL_TRANSFORM_FEEDBACK_BARRIER_BIT);
178    pub const ATOMIC_COUNTER: Self = Self(GL_ATOMIC_COUNTER_BARRIER_BIT);
179    pub const SHADER_STORAGE: Self = Self(GL_SHADER_STORAGE_BARRIER_BIT);
180    pub const ALL: Self = Self(GL_ALL_BARRIER_BITS);
181
182    /// Combine two barrier flag sets.
183    pub fn combine(self, other: Self) -> Self {
184        Self(self.0 | other.0)
185    }
186
187    /// Check if a specific flag is set.
188    pub fn contains(self, flag: Self) -> bool {
189        (self.0 & flag.0) == flag.0
190    }
191
192    /// Issue this barrier on the GL context.
193    pub fn issue(self, gl: &glow::Context) {
194        use glow::HasContext;
195        unsafe {
196            gl.memory_barrier(self.0);
197        }
198    }
199}
200
201impl std::ops::BitOr for MemoryBarrierFlags {
202    type Output = Self;
203    fn bitor(self, rhs: Self) -> Self {
204        Self(self.0 | rhs.0)
205    }
206}
207
208impl std::ops::BitAnd for MemoryBarrierFlags {
209    type Output = Self;
210    fn bitand(self, rhs: Self) -> Self {
211        Self(self.0 & rhs.0)
212    }
213}
214
215// ---------------------------------------------------------------------------
216// PipelineBarrier
217// ---------------------------------------------------------------------------
218
/// A pipeline barrier specifying which stages must complete before which can start.
#[derive(Debug, Clone)]
pub struct PipelineBarrier {
    /// Memory barrier flags to issue via glMemoryBarrier.
    pub memory_flags: MemoryBarrierFlags,
    /// Whether to also insert a fence for GPU→CPU sync (see `execute`).
    pub fence: bool,
    /// Optional label for debugging.
    pub label: Option<String>,
}
229
230impl PipelineBarrier {
231    /// Create a barrier with just memory flags.
232    pub fn memory(flags: MemoryBarrierFlags) -> Self {
233        Self {
234            memory_flags: flags,
235            fence: false,
236            label: None,
237        }
238    }
239
240    /// Create a barrier with memory flags and a fence.
241    pub fn memory_and_fence(flags: MemoryBarrierFlags) -> Self {
242        Self {
243            memory_flags: flags,
244            fence: true,
245            label: None,
246        }
247    }
248
249    /// Create a full barrier (all bits + fence).
250    pub fn full() -> Self {
251        Self {
252            memory_flags: MemoryBarrierFlags::ALL,
253            fence: true,
254            label: None,
255        }
256    }
257
258    /// Set a debug label.
259    pub fn with_label(mut self, label: &str) -> Self {
260        self.label = Some(label.to_string());
261        self
262    }
263
264    /// Shader storage read-after-write barrier.
265    pub fn ssbo_raw() -> Self {
266        Self::memory(MemoryBarrierFlags::SHADER_STORAGE)
267    }
268
269    /// Shader storage + vertex attrib barrier (compute writes, vertex shader reads).
270    pub fn compute_to_vertex() -> Self {
271        Self::memory(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::VERTEX_ATTRIB)
272    }
273
274    /// Compute writes, indirect draw reads.
275    pub fn compute_to_indirect() -> Self {
276        Self::memory(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::COMMAND)
277    }
278
279    /// Execute this barrier, issuing the memory barrier and optionally a fence.
280    pub fn execute(&self, gl: &glow::Context) -> Option<FenceSync> {
281        self.memory_flags.issue(gl);
282        if self.fence {
283            let mut fence = FenceSync::new();
284            fence.insert(gl);
285            Some(fence)
286        } else {
287            None
288        }
289    }
290}
291
292// ---------------------------------------------------------------------------
293// ResourceState & ResourceTransition
294// ---------------------------------------------------------------------------
295
/// Possible states a GPU resource can be in.
///
/// GL has no explicit resource-state API; these states are tracked CPU-side
/// by [`ResourceTransition`] to decide which memory barriers to issue.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceState {
    /// Undefined / initial.
    Undefined,
    /// Being written by a compute shader.
    ComputeWrite,
    /// Being read by a compute shader.
    ComputeRead,
    /// Being read as vertex attribute data.
    VertexRead,
    /// Being read as an index buffer.
    IndexRead,
    /// Being read as an indirect command buffer.
    IndirectRead,
    /// Being read as a uniform buffer.
    UniformRead,
    /// Being read/written by the CPU (mapped).
    CpuAccess,
    /// Transfer source (copy read).
    TransferSrc,
    /// Transfer destination (copy write).
    TransferDst,
}
320
/// Tracks and validates resource state transitions, issuing barriers as needed.
pub struct ResourceTransition {
    // Current state per resource, keyed by caller-assigned resource ID.
    states: HashMap<u32, ResourceState>,
}
325
326impl ResourceTransition {
327    /// Create a new transition tracker.
328    pub fn new() -> Self {
329        Self {
330            states: HashMap::new(),
331        }
332    }
333
334    /// Register a resource with an initial state.
335    pub fn register(&mut self, resource_id: u32, initial: ResourceState) {
336        self.states.insert(resource_id, initial);
337    }
338
339    /// Get current state.
340    pub fn current_state(&self, resource_id: u32) -> Option<ResourceState> {
341        self.states.get(&resource_id).copied()
342    }
343
344    /// Transition a resource to a new state. Returns the barrier flags needed.
345    pub fn transition(
346        &mut self,
347        resource_id: u32,
348        new_state: ResourceState,
349    ) -> Option<MemoryBarrierFlags> {
350        let old_state = self.states.get(&resource_id).copied().unwrap_or(ResourceState::Undefined);
351        if old_state == new_state {
352            return None;
353        }
354        let flags = Self::barrier_for_transition(old_state, new_state);
355        self.states.insert(resource_id, new_state);
356        flags
357    }
358
359    /// Transition and immediately issue the barrier.
360    pub fn transition_and_barrier(
361        &mut self,
362        gl: &glow::Context,
363        resource_id: u32,
364        new_state: ResourceState,
365    ) {
366        if let Some(flags) = self.transition(resource_id, new_state) {
367            flags.issue(gl);
368        }
369    }
370
371    /// Determine which barrier flags are needed for a given transition.
372    fn barrier_for_transition(
373        from: ResourceState,
374        to: ResourceState,
375    ) -> Option<MemoryBarrierFlags> {
376        // Only need barriers when transitioning from a write state
377        match (from, to) {
378            (ResourceState::ComputeWrite, ResourceState::ComputeRead) => {
379                Some(MemoryBarrierFlags::SHADER_STORAGE)
380            }
381            (ResourceState::ComputeWrite, ResourceState::VertexRead) => {
382                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::VERTEX_ATTRIB)
383            }
384            (ResourceState::ComputeWrite, ResourceState::IndexRead) => {
385                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::ELEMENT_ARRAY)
386            }
387            (ResourceState::ComputeWrite, ResourceState::IndirectRead) => {
388                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::COMMAND)
389            }
390            (ResourceState::ComputeWrite, ResourceState::UniformRead) => {
391                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::UNIFORM)
392            }
393            (ResourceState::ComputeWrite, ResourceState::CpuAccess) => {
394                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
395            }
396            (ResourceState::ComputeWrite, ResourceState::TransferSrc) => {
397                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
398            }
399            (ResourceState::ComputeWrite, ResourceState::TransferDst) => {
400                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
401            }
402            (ResourceState::TransferDst, ResourceState::ComputeRead) => {
403                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
404            }
405            (ResourceState::TransferDst, ResourceState::VertexRead) => {
406                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::VERTEX_ATTRIB)
407            }
408            (ResourceState::CpuAccess, ResourceState::ComputeRead) => {
409                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
410            }
411            (ResourceState::CpuAccess, ResourceState::ComputeWrite) => {
412                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
413            }
414            // No barrier needed for same-state or read-to-read transitions
415            _ if from == to => None,
416            // Default: if transitioning from any write to any read, barrier everything
417            (ResourceState::ComputeWrite, _) => Some(MemoryBarrierFlags::ALL),
418            _ => None,
419        }
420    }
421
422    /// Remove a resource from tracking.
423    pub fn unregister(&mut self, resource_id: u32) {
424        self.states.remove(&resource_id);
425    }
426
427    /// Number of tracked resources.
428    pub fn tracked_count(&self) -> usize {
429        self.states.len()
430    }
431
432    /// Reset all resources to undefined state.
433    pub fn reset_all(&mut self) {
434        for state in self.states.values_mut() {
435            *state = ResourceState::Undefined;
436        }
437    }
438}
439
440impl Default for ResourceTransition {
441    fn default() -> Self {
442        Self::new()
443    }
444}
445
446// ---------------------------------------------------------------------------
447// AsyncComputeQueue
448// ---------------------------------------------------------------------------
449
/// A queued compute job for async execution.
///
/// Lifecycle: pending (`dispatched == false`) → dispatched (fence inserted)
/// → completed (fence observed signaled).
struct ComputeJob {
    /// Unique job ID (monotonically assigned by the queue).
    id: u64,
    /// Program cache key, looked up in the pipeline cache at dispatch time.
    program_key: u64,
    /// Dispatch dimensions.
    dimension: super::dispatch::DispatchDimension,
    /// Uniforms to set before dispatch.
    uniforms: Vec<super::dispatch::UniformValue>,
    /// Barrier flags issued immediately after dispatch.
    barrier: MemoryBarrierFlags,
    /// Fence inserted after dispatch; `None` while the job is pending.
    fence: Option<FenceSync>,
    /// Whether the job has been dispatched.
    dispatched: bool,
    /// Whether the job's fence has been observed as signaled.
    completed: bool,
}
469
/// Queue that manages overlapping compute and render work.
///
/// Submit compute jobs which are dispatched in order. Fences are inserted
/// after each job so that subsequent render work can wait for completion.
pub struct AsyncComputeQueue {
    /// Jobs in submission order (front = oldest).
    jobs: VecDeque<ComputeJob>,
    /// Next job ID to assign (starts at 1; 0 is never a valid ID).
    next_id: u64,
    /// Maximum number of dispatched-but-incomplete jobs at once.
    max_in_flight: usize,
}
479
impl AsyncComputeQueue {
    /// Create a new async queue allowing at most `max_in_flight`
    /// dispatched-but-incomplete jobs at a time.
    pub fn new(max_in_flight: usize) -> Self {
        Self {
            jobs: VecDeque::new(),
            next_id: 1,
            max_in_flight,
        }
    }

    /// Submit a compute job. Returns a job ID for tracking.
    ///
    /// The job is only queued here; no GL work happens until `flush`.
    pub fn submit(
        &mut self,
        program_key: u64,
        dimension: super::dispatch::DispatchDimension,
        uniforms: Vec<super::dispatch::UniformValue>,
        barrier: MemoryBarrierFlags,
    ) -> u64 {
        let id = self.next_id;
        self.next_id += 1;
        self.jobs.push_back(ComputeJob {
            id,
            program_key,
            dimension,
            uniforms,
            barrier,
            fence: None,
            dispatched: false,
            completed: false,
        });
        id
    }

    /// Flush: dispatch pending jobs, up to the `max_in_flight` budget.
    ///
    /// For each dispatched job the sequence is: bind program, set uniforms,
    /// dispatch, memory barrier, insert fence — this order matters for GL.
    ///
    /// NOTE(review): a job whose `program_key` is missing from the cache is
    /// silently skipped and stays pending forever (it also blocks
    /// `drain_completed`, which only drains from the front) — confirm this
    /// is the intended behavior for unknown program keys.
    pub fn flush(
        &mut self,
        gl: &glow::Context,
        cache: &super::dispatch::PipelineCache,
    ) {
        use glow::HasContext;
        // Budget = max_in_flight minus jobs already dispatched but not done.
        let in_flight = self.jobs.iter().filter(|j| j.dispatched && !j.completed).count();
        let can_dispatch = self.max_in_flight.saturating_sub(in_flight);

        let mut dispatched_count = 0;
        for job in self.jobs.iter_mut() {
            if dispatched_count >= can_dispatch {
                break;
            }
            if job.dispatched {
                continue;
            }
            // Look up the compiled program for this job.
            if let Some(program) = cache.cache.get(&job.program_key) {
                program.bind(gl);
                // Upload all uniforms while the program is bound.
                for u in &job.uniforms {
                    match u {
                        super::dispatch::UniformValue::Int(name, v) => {
                            program.set_uniform_int(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Uint(name, v) => {
                            program.set_uniform_uint(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Float(name, v) => {
                            program.set_uniform_float(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Vec2(name, x, y) => {
                            program.set_uniform_vec2(gl, name, *x, *y)
                        }
                        super::dispatch::UniformValue::Vec3(name, x, y, z) => {
                            program.set_uniform_vec3(gl, name, *x, *y, *z)
                        }
                        super::dispatch::UniformValue::Vec4(name, x, y, z, w) => {
                            program.set_uniform_vec4(gl, name, *x, *y, *z, *w)
                        }
                    }
                }
                let (gx, gy, gz) = job.dimension.as_tuple();
                unsafe {
                    gl.dispatch_compute(gx, gy, gz);
                    // Make the dispatch's writes visible per the job's flags.
                    gl.memory_barrier(job.barrier.0);
                }
                // Fence marks GPU completion of this job's commands.
                let mut fence = FenceSync::new();
                fence.insert(gl);
                job.fence = Some(fence);
                job.dispatched = true;
                dispatched_count += 1;
            }
        }
    }

    /// Poll all in-flight jobs and mark completed ones.
    pub fn poll(&mut self, gl: &glow::Context) {
        for job in self.jobs.iter_mut() {
            if job.dispatched && !job.completed {
                if let Some(ref mut fence) = job.fence {
                    if fence.poll(gl) == FenceStatus::Signaled {
                        job.completed = true;
                    }
                }
            }
        }
    }

    /// Remove and return all completed job IDs.
    ///
    /// Drains only from the front so completion is reported in submission
    /// order; a completed job behind an incomplete one stays queued.
    pub fn drain_completed(&mut self) -> Vec<u64> {
        let mut completed = Vec::new();
        while let Some(front) = self.jobs.front() {
            if front.completed {
                let job = self.jobs.pop_front().unwrap();
                completed.push(job.id);
            } else {
                break;
            }
        }
        completed
    }

    /// Check if a specific job is complete.
    ///
    /// An unknown ID counts as complete (assumed already drained).
    pub fn is_complete(&self, job_id: u64) -> bool {
        self.jobs.iter().find(|j| j.id == job_id).map_or(true, |j| j.completed)
    }

    /// Wait for a specific job to complete. Returns `true` on completion.
    ///
    /// Returns `false` for a job that has not been dispatched yet (it has no
    /// fence to wait on) or whose fence did not signal within `timeout`.
    pub fn wait_for(
        &mut self,
        gl: &glow::Context,
        job_id: u64,
        timeout: Duration,
    ) -> bool {
        if let Some(job) = self.jobs.iter_mut().find(|j| j.id == job_id) {
            if job.completed {
                return true;
            }
            if let Some(ref mut fence) = job.fence {
                let status = fence.wait(gl, timeout);
                if status == FenceStatus::Signaled {
                    job.completed = true;
                    return true;
                }
            }
            false
        } else {
            true // Job not found = already completed and drained
        }
    }

    /// Number of pending (not yet dispatched) jobs.
    pub fn pending_count(&self) -> usize {
        self.jobs.iter().filter(|j| !j.dispatched).count()
    }

    /// Number of in-flight (dispatched but not completed) jobs.
    pub fn in_flight_count(&self) -> usize {
        self.jobs.iter().filter(|j| j.dispatched && !j.completed).count()
    }

    /// Total number of jobs in the queue (pending + in-flight + completed).
    pub fn total_count(&self) -> usize {
        self.jobs.len()
    }

    /// Destroy the queue, cleaning up any fences still held by jobs.
    pub fn destroy(self, gl: &glow::Context) {
        for job in self.jobs {
            if let Some(fence) = job.fence {
                fence.destroy(gl);
            }
        }
    }
}
651
652// ---------------------------------------------------------------------------
653// FrameTimeline
654// ---------------------------------------------------------------------------
655
656/// Per-frame resource versioning for safe concurrent GPU/CPU access.
657///
658/// Maintains a ring of frame contexts. Each frame has its own fence so we
659/// know when it's safe to reuse that frame's resources.
660pub struct FrameTimeline {
661    /// Frame contexts in a ring.
662    frames: Vec<FrameContext>,
663    /// Current frame index into the ring.
664    current: usize,
665    /// Total frames processed.
666    total_frames: u64,
667}
668
669/// A single frame's synchronization context.
670struct FrameContext {
671    /// Fence signaling GPU completion of this frame.
672    fence: FenceSync,
673    /// Frame number.
674    frame_number: u64,
675    /// Resources allocated this frame (buffer IDs for cleanup).
676    transient_resources: Vec<u32>,
677    /// CPU-side timestamp for when this frame began.
678    begin_time: Option<Instant>,
679    /// CPU-side timestamp for when this frame's GPU work completed.
680    complete_time: Option<Instant>,
681}
682
683impl FrameTimeline {
684    /// Create a timeline with `ring_size` frames in flight.
685    pub fn new(ring_size: usize) -> Self {
686        let frames = (0..ring_size)
687            .map(|_| FrameContext {
688                fence: FenceSync::new(),
689                frame_number: 0,
690                transient_resources: Vec::new(),
691                begin_time: None,
692                complete_time: None,
693            })
694            .collect();
695        Self {
696            frames,
697            current: 0,
698            total_frames: 0,
699        }
700    }
701
702    /// Begin a new frame. Waits for the oldest frame to complete if necessary.
703    pub fn begin_frame(&mut self, gl: &glow::Context) {
704        let ctx = &mut self.frames[self.current];
705
706        // If this slot has an active fence, wait for it
707        if ctx.fence.status() == FenceStatus::Unsignaled {
708            ctx.fence.wait_forever(gl);
709        }
710        ctx.complete_time = Some(Instant::now());
711
712        // Clean up transient resources
713        ctx.transient_resources.clear();
714
715        // Set up for new frame
716        ctx.frame_number = self.total_frames;
717        ctx.begin_time = Some(Instant::now());
718    }
719
720    /// End the current frame: insert a fence and advance to next slot.
721    pub fn end_frame(&mut self, gl: &glow::Context) {
722        self.frames[self.current].fence.insert(gl);
723        self.current = (self.current + 1) % self.frames.len();
724        self.total_frames += 1;
725    }
726
727    /// Register a transient resource for the current frame.
728    pub fn register_transient(&mut self, resource_id: u32) {
729        self.frames[self.current]
730            .transient_resources
731            .push(resource_id);
732    }
733
734    /// Current frame number (total frames started).
735    pub fn current_frame_number(&self) -> u64 {
736        self.total_frames
737    }
738
739    /// Ring size.
740    pub fn ring_size(&self) -> usize {
741        self.frames.len()
742    }
743
744    /// Current slot index in the ring.
745    pub fn current_slot(&self) -> usize {
746        self.current
747    }
748
749    /// Check if a specific frame has completed.
750    pub fn is_frame_complete(&mut self, gl: &glow::Context, frame_number: u64) -> bool {
751        for ctx in self.frames.iter_mut() {
752            if ctx.frame_number == frame_number {
753                if ctx.fence.is_signaled() {
754                    return true;
755                }
756                return ctx.fence.poll(gl) == FenceStatus::Signaled;
757            }
758        }
759        // Frame not in ring = already completed long ago
760        true
761    }
762
763    /// Wait for all in-flight frames to complete.
764    pub fn wait_all(&mut self, gl: &glow::Context) {
765        for ctx in self.frames.iter_mut() {
766            if ctx.fence.status() == FenceStatus::Unsignaled {
767                ctx.fence.wait_forever(gl);
768            }
769        }
770    }
771
772    /// Get the average frame latency (time between begin and GPU completion).
773    pub fn average_latency(&self) -> Option<Duration> {
774        let mut total = Duration::ZERO;
775        let mut count = 0u32;
776        for ctx in &self.frames {
777            if let (Some(begin), Some(complete)) = (ctx.begin_time, ctx.complete_time) {
778                if complete > begin {
779                    total += complete - begin;
780                    count += 1;
781                }
782            }
783        }
784        if count > 0 {
785            Some(total / count)
786        } else {
787            None
788        }
789    }
790
791    /// Destroy the timeline, cleaning up all fences.
792    pub fn destroy(self, gl: &glow::Context) {
793        for ctx in self.frames {
794            ctx.fence.destroy(gl);
795        }
796    }
797}
798
799// ---------------------------------------------------------------------------
800// CpuFallback — software implementation for hardware without compute
801// ---------------------------------------------------------------------------
802
803/// CPU fallback that implements the same interface as GPU compute for
804/// hardware without compute shader support.
805///
806/// Each "kernel" is implemented as a Rust function operating on CPU-side
807/// arrays. This allows the engine to function on older hardware, albeit
808/// at lower performance.
809pub struct CpuFallback {
810    /// Whether the fallback is active.
811    active: bool,
812    /// Performance tracking: last kernel execution time.
813    last_execution_us: HashMap<String, u64>,
814}
815
816impl CpuFallback {
817    /// Create a new CPU fallback.
818    pub fn new() -> Self {
819        Self {
820            active: false,
821            last_execution_us: HashMap::new(),
822        }
823    }
824
825    /// Activate the fallback (use when compute shaders are unavailable).
826    pub fn activate(&mut self) {
827        self.active = true;
828    }
829
830    /// Deactivate (revert to GPU).
831    pub fn deactivate(&mut self) {
832        self.active = false;
833    }
834
835    /// Whether the fallback is currently active.
836    pub fn is_active(&self) -> bool {
837        self.active
838    }
839
840    /// CPU particle integration (matches particle_integrate kernel).
841    pub fn particle_integrate(
842        &mut self,
843        positions: &mut [[f32; 4]],
844        velocities: &mut [[f32; 4]],
845        params: &super::kernels::ParticleIntegrateParams,
846    ) {
847        let start = Instant::now();
848        let dt = params.dt;
849        let gravity = params.gravity;
850        let damping = params.damping;
851        let max_age = params.max_age;
852        let wind = params.wind;
853
854        for i in 0..positions.len() {
855            let age = positions[i][3] + dt;
856            let lifetime = velocities[i][3];
857
858            if age >= lifetime || age >= max_age {
859                positions[i][3] = lifetime + 1.0;
860                velocities[i][0] = 0.0;
861                velocities[i][1] = 0.0;
862                velocities[i][2] = 0.0;
863                continue;
864            }
865
866            // Apply gravity
867            velocities[i][0] += gravity[0] * dt;
868            velocities[i][1] += gravity[1] * dt;
869            velocities[i][2] += gravity[2] * dt;
870
871            // Apply wind
872            velocities[i][0] += wind[0] * dt;
873            velocities[i][1] += wind[1] * dt;
874            velocities[i][2] += wind[2] * dt;
875
876            // Damping
877            let d = damping.powf(dt);
878            velocities[i][0] *= d;
879            velocities[i][1] *= d;
880            velocities[i][2] *= d;
881
882            // Integrate position
883            positions[i][0] += velocities[i][0] * dt;
884            positions[i][1] += velocities[i][1] * dt;
885            positions[i][2] += velocities[i][2] * dt;
886            positions[i][3] = age;
887        }
888
889        let elapsed = start.elapsed().as_micros() as u64;
890        self.last_execution_us
891            .insert("particle_integrate".to_string(), elapsed);
892    }
893
894    /// CPU Lorenz attractor integration.
895    pub fn lorenz_step(
896        &mut self,
897        points: &mut [[f32; 3]],
898        sigma: f32,
899        rho: f32,
900        beta: f32,
901        dt: f32,
902    ) {
903        let start = Instant::now();
904        for p in points.iter_mut() {
905            let dx = sigma * (p[1] - p[0]);
906            let dy = p[0] * (rho - p[2]) - p[1];
907            let dz = p[0] * p[1] - beta * p[2];
908
909            // RK4
910            let k1 = [dx, dy, dz];
911            let p2 = [
912                p[0] + 0.5 * dt * k1[0],
913                p[1] + 0.5 * dt * k1[1],
914                p[2] + 0.5 * dt * k1[2],
915            ];
916            let k2 = [
917                sigma * (p2[1] - p2[0]),
918                p2[0] * (rho - p2[2]) - p2[1],
919                p2[0] * p2[1] - beta * p2[2],
920            ];
921            let p3 = [
922                p[0] + 0.5 * dt * k2[0],
923                p[1] + 0.5 * dt * k2[1],
924                p[2] + 0.5 * dt * k2[2],
925            ];
926            let k3 = [
927                sigma * (p3[1] - p3[0]),
928                p3[0] * (rho - p3[2]) - p3[1],
929                p3[0] * p3[1] - beta * p3[2],
930            ];
931            let p4 = [
932                p[0] + dt * k3[0],
933                p[1] + dt * k3[1],
934                p[2] + dt * k3[2],
935            ];
936            let k4 = [
937                sigma * (p4[1] - p4[0]),
938                p4[0] * (rho - p4[2]) - p4[1],
939                p4[0] * p4[1] - beta * p4[2],
940            ];
941
942            p[0] += (dt / 6.0) * (k1[0] + 2.0 * k2[0] + 2.0 * k3[0] + k4[0]);
943            p[1] += (dt / 6.0) * (k1[1] + 2.0 * k2[1] + 2.0 * k3[1] + k4[1]);
944            p[2] += (dt / 6.0) * (k1[2] + 2.0 * k2[2] + 2.0 * k3[2] + k4[2]);
945        }
946        let elapsed = start.elapsed().as_micros() as u64;
947        self.last_execution_us
948            .insert("lorenz_step".to_string(), elapsed);
949    }
950
951    /// CPU Mandelbrot iteration.
952    pub fn mandelbrot_iterate(
953        &mut self,
954        z_re: &mut [f32],
955        z_im: &mut [f32],
956        c_re: &[f32],
957        c_im: &[f32],
958        iterations: &mut [u32],
959        max_iter: u32,
960    ) {
961        let start = Instant::now();
962        assert_eq!(z_re.len(), z_im.len());
963        assert_eq!(z_re.len(), c_re.len());
964        assert_eq!(z_re.len(), c_im.len());
965        assert_eq!(z_re.len(), iterations.len());
966
967        for i in 0..z_re.len() {
968            if iterations[i] >= max_iter {
969                continue;
970            }
971            let zr = z_re[i];
972            let zi = z_im[i];
973            if zr * zr + zi * zi >= 4.0 {
974                continue;
975            }
976            z_re[i] = zr * zr - zi * zi + c_re[i];
977            z_im[i] = 2.0 * zr * zi + c_im[i];
978            iterations[i] += 1;
979        }
980        let elapsed = start.elapsed().as_micros() as u64;
981        self.last_execution_us
982            .insert("mandelbrot_iterate".to_string(), elapsed);
983    }
984
985    /// CPU Julia set iteration.
986    pub fn julia_iterate(
987        &mut self,
988        z_re: &mut [f32],
989        z_im: &mut [f32],
990        c_re: f32,
991        c_im: f32,
992        iterations: &mut [u32],
993        max_iter: u32,
994    ) {
995        let start = Instant::now();
996        for i in 0..z_re.len() {
997            if iterations[i] >= max_iter {
998                continue;
999            }
1000            let zr = z_re[i];
1001            let zi = z_im[i];
1002            if zr * zr + zi * zi >= 4.0 {
1003                continue;
1004            }
1005            z_re[i] = zr * zr - zi * zi + c_re;
1006            z_im[i] = 2.0 * zr * zi + c_im;
1007            iterations[i] += 1;
1008        }
1009        let elapsed = start.elapsed().as_micros() as u64;
1010        self.last_execution_us
1011            .insert("julia_iterate".to_string(), elapsed);
1012    }
1013
1014    /// CPU prefix sum (exclusive scan).
1015    pub fn prefix_sum_exclusive(&mut self, data: &mut [u32]) {
1016        let start = Instant::now();
1017        if data.is_empty() {
1018            return;
1019        }
1020        let mut sum = 0u32;
1021        for val in data.iter_mut() {
1022            let old = *val;
1023            *val = sum;
1024            sum += old;
1025        }
1026        let elapsed = start.elapsed().as_micros() as u64;
1027        self.last_execution_us
1028            .insert("prefix_sum".to_string(), elapsed);
1029    }
1030
1031    /// CPU prefix sum (inclusive scan).
1032    pub fn prefix_sum_inclusive(&mut self, data: &mut [u32]) {
1033        let start = Instant::now();
1034        if data.is_empty() {
1035            return;
1036        }
1037        for i in 1..data.len() {
1038            data[i] += data[i - 1];
1039        }
1040        let elapsed = start.elapsed().as_micros() as u64;
1041        self.last_execution_us
1042            .insert("prefix_sum_inclusive".to_string(), elapsed);
1043    }
1044
1045    /// CPU radix sort (key-value pairs, ascending).
1046    pub fn radix_sort(&mut self, keys: &mut [u32], values: &mut [u32]) {
1047        let start = Instant::now();
1048        assert_eq!(keys.len(), values.len());
1049        let n = keys.len();
1050        if n == 0 {
1051            return;
1052        }
1053
1054        let mut keys_tmp = vec![0u32; n];
1055        let mut vals_tmp = vec![0u32; n];
1056
1057        let radix = 256usize;
1058        let mut counts = vec![0usize; radix];
1059
1060        for bit_offset in (0..32).step_by(8) {
1061            // Count
1062            for c in counts.iter_mut() {
1063                *c = 0;
1064            }
1065            for &k in keys.iter() {
1066                let digit = ((k >> bit_offset) & 0xFF) as usize;
1067                counts[digit] += 1;
1068            }
1069            // Prefix sum on counts
1070            let mut total = 0;
1071            for c in counts.iter_mut() {
1072                let old = *c;
1073                *c = total;
1074                total += old;
1075            }
1076            // Scatter
1077            for i in 0..n {
1078                let digit = ((keys[i] >> bit_offset) & 0xFF) as usize;
1079                let dest = counts[digit];
1080                keys_tmp[dest] = keys[i];
1081                vals_tmp[dest] = values[i];
1082                counts[digit] += 1;
1083            }
1084            // Swap
1085            keys.copy_from_slice(&keys_tmp);
1086            values.copy_from_slice(&vals_tmp);
1087        }
1088
1089        let elapsed = start.elapsed().as_micros() as u64;
1090        self.last_execution_us
1091            .insert("radix_sort".to_string(), elapsed);
1092    }
1093
1094    /// CPU frustum culling.
1095    pub fn frustum_cull(
1096        &mut self,
1097        positions: &[[f32; 3]],
1098        radii: &[f32],
1099        planes: &[[f32; 4]; 6],
1100    ) -> Vec<usize> {
1101        let start = Instant::now();
1102        let mut visible = Vec::new();
1103
1104        for (i, (pos, &radius)) in positions.iter().zip(radii).enumerate() {
1105            let mut inside = true;
1106            for plane in planes {
1107                let dist =
1108                    plane[0] * pos[0] + plane[1] * pos[1] + plane[2] * pos[2] + plane[3];
1109                if dist < -radius {
1110                    inside = false;
1111                    break;
1112                }
1113            }
1114            if inside {
1115                visible.push(i);
1116            }
1117        }
1118
1119        let elapsed = start.elapsed().as_micros() as u64;
1120        self.last_execution_us
1121            .insert("frustum_cull".to_string(), elapsed);
1122        visible
1123    }
1124
1125    /// CPU skinning: transform vertices by bone matrices.
1126    pub fn skin_vertices(
1127        &mut self,
1128        positions: &[[f32; 3]],
1129        normals: &[[f32; 3]],
1130        bone_indices: &[[u32; 4]],
1131        bone_weights: &[[f32; 4]],
1132        bone_matrices: &[[f32; 16]],
1133        inv_bind_matrices: &[[f32; 16]],
1134        out_positions: &mut [[f32; 3]],
1135        out_normals: &mut [[f32; 3]],
1136    ) {
1137        let start = Instant::now();
1138
1139        for i in 0..positions.len() {
1140            let pos = positions[i];
1141            let norm = normals[i];
1142            let indices = bone_indices[i];
1143            let weights = bone_weights[i];
1144
1145            let mut skinned_pos = [0.0f32; 3];
1146            let mut skinned_norm = [0.0f32; 3];
1147
1148            for j in 0..4 {
1149                let w = weights[j];
1150                if w <= 0.0 {
1151                    continue;
1152                }
1153                let bi = indices[j] as usize;
1154                if bi >= bone_matrices.len() {
1155                    continue;
1156                }
1157                // Compute final_matrix = bone * inv_bind
1158                let bone = &bone_matrices[bi];
1159                let inv = &inv_bind_matrices[bi];
1160                let mat = mat4_mul(bone, inv);
1161
1162                // Transform position
1163                let tp = mat4_transform_point(&mat, &pos);
1164                skinned_pos[0] += tp[0] * w;
1165                skinned_pos[1] += tp[1] * w;
1166                skinned_pos[2] += tp[2] * w;
1167
1168                // Transform normal (upper-left 3x3)
1169                let tn = mat4_transform_normal(&mat, &norm);
1170                skinned_norm[0] += tn[0] * w;
1171                skinned_norm[1] += tn[1] * w;
1172                skinned_norm[2] += tn[2] * w;
1173            }
1174
1175            // Normalize the normal
1176            let len = (skinned_norm[0] * skinned_norm[0]
1177                + skinned_norm[1] * skinned_norm[1]
1178                + skinned_norm[2] * skinned_norm[2])
1179                .sqrt();
1180            if len > 1e-6 {
1181                skinned_norm[0] /= len;
1182                skinned_norm[1] /= len;
1183                skinned_norm[2] /= len;
1184            }
1185
1186            out_positions[i] = skinned_pos;
1187            out_normals[i] = skinned_norm;
1188        }
1189
1190        let elapsed = start.elapsed().as_micros() as u64;
1191        self.last_execution_us
1192            .insert("skinning".to_string(), elapsed);
1193    }
1194
1195    /// CPU fluid diffusion (Jacobi iteration).
1196    pub fn fluid_diffuse(
1197        &mut self,
1198        grid: &mut [f32],
1199        scratch: &mut [f32],
1200        width: usize,
1201        height: usize,
1202        diffusion_rate: f32,
1203        dt: f32,
1204        iterations: usize,
1205    ) {
1206        let start = Instant::now();
1207        let dx = 1.0f32;
1208        let alpha = diffusion_rate * dt / (dx * dx);
1209        let r_beta = 1.0 / (1.0 + 4.0 * alpha);
1210
1211        for _ in 0..iterations {
1212            for y in 0..height {
1213                for x in 0..width {
1214                    let idx = y * width + x;
1215                    let left = if x > 0 { grid[idx - 1] } else { grid[idx] };
1216                    let right = if x + 1 < width { grid[idx + 1] } else { grid[idx] };
1217                    let down = if y > 0 { grid[idx - width] } else { grid[idx] };
1218                    let up = if y + 1 < height { grid[idx + width] } else { grid[idx] };
1219                    scratch[idx] = (grid[idx] + alpha * (left + right + down + up)) * r_beta;
1220                }
1221            }
1222            grid.copy_from_slice(scratch);
1223        }
1224
1225        let elapsed = start.elapsed().as_micros() as u64;
1226        self.last_execution_us
1227            .insert("fluid_diffuse".to_string(), elapsed);
1228    }
1229
1230    /// CPU histogram equalization.
1231    pub fn histogram_equalize(
1232        &mut self,
1233        data: &mut [f32],
1234        bin_count: usize,
1235        min_val: f32,
1236        max_val: f32,
1237    ) {
1238        let start = Instant::now();
1239        let range = max_val - min_val;
1240        if range <= 0.0 || data.is_empty() {
1241            return;
1242        }
1243
1244        // Build histogram
1245        let mut histogram = vec![0u32; bin_count];
1246        for &v in data.iter() {
1247            let norm = ((v - min_val) / range).clamp(0.0, 1.0);
1248            let bin = ((norm * (bin_count - 1) as f32) as usize).min(bin_count - 1);
1249            histogram[bin] += 1;
1250        }
1251
1252        // Build CDF
1253        let mut cdf = vec![0.0f32; bin_count];
1254        let mut running = 0u32;
1255        for i in 0..bin_count {
1256            running += histogram[i];
1257            cdf[i] = running as f32 / data.len() as f32;
1258        }
1259
1260        // Apply equalization
1261        for v in data.iter_mut() {
1262            let norm = ((*v - min_val) / range).clamp(0.0, 1.0);
1263            let bin = ((norm * (bin_count - 1) as f32) as usize).min(bin_count - 1);
1264            *v = cdf[bin] * range + min_val;
1265        }
1266
1267        let elapsed = start.elapsed().as_micros() as u64;
1268        self.last_execution_us
1269            .insert("histogram_equalize".to_string(), elapsed);
1270    }
1271
    /// Get the last execution time for a named kernel, in microseconds.
    ///
    /// Returns `None` if that kernel has never run through the fallback.
    /// Keys recorded by this type: "particle_integrate", "lorenz_step",
    /// "mandelbrot_iterate", "julia_iterate", "prefix_sum",
    /// "prefix_sum_inclusive", "radix_sort", "frustum_cull", "skinning",
    /// "fluid_diffuse", "histogram_equalize".
    pub fn last_execution_us(&self, name: &str) -> Option<u64> {
        self.last_execution_us.get(name).copied()
    }
1276
1277    /// Summary of all kernel execution times.
1278    pub fn summary(&self) -> String {
1279        let mut s = String::from("=== CPU Fallback Timings ===\n");
1280        let mut names: Vec<&str> = self.last_execution_us.keys().map(|s| s.as_str()).collect();
1281        names.sort();
1282        for name in names {
1283            if let Some(us) = self.last_execution_us.get(name) {
1284                s.push_str(&format!("  {}: {} us\n", name, us));
1285            }
1286        }
1287        s
1288    }
1289}
1290
// Default is an inactive fallback with no recorded timings (delegates to `new`).
impl Default for CpuFallback {
    fn default() -> Self {
        Self::new()
    }
}
1296
1297// ---------------------------------------------------------------------------
1298// Matrix math helpers for CPU fallback
1299// ---------------------------------------------------------------------------
1300
/// Multiply two column-major 4x4 matrices, returning `a * b`.
///
/// `result(row, col) = sum_k a(row, k) * b(k, col)`, with element `(r, c)`
/// stored at index `c * 4 + r` (column-major).
fn mat4_mul(a: &[f32; 16], b: &[f32; 16]) -> [f32; 16] {
    let mut out = [0.0f32; 16];
    for c in 0..4 {
        // Hoist b's element for the (c, k) pair and accumulate down column c.
        for k in 0..4 {
            let b_kc = b[c * 4 + k];
            for r in 0..4 {
                out[c * 4 + r] += a[k * 4 + r] * b_kc;
            }
        }
    }
    out
}
1315
/// Transform a point by a column-major 4x4 matrix (implicit w = 1, so the
/// translation column `m[12..15]` is applied).
fn mat4_transform_point(m: &[f32; 16], p: &[f32; 3]) -> [f32; 3] {
    let mut out = [0.0f32; 3];
    for r in 0..3 {
        out[r] = m[r] * p[0] + m[4 + r] * p[1] + m[8 + r] * p[2] + m[12 + r];
    }
    out
}
1324
/// Transform a direction by the upper-left 3x3 of a column-major 4x4 matrix
/// (translation ignored; no inverse-transpose is applied here).
fn mat4_transform_normal(m: &[f32; 16], n: &[f32; 3]) -> [f32; 3] {
    let mut out = [0.0f32; 3];
    for r in 0..3 {
        out[r] = m[r] * n[0] + m[4 + r] * n[1] + m[8 + r] * n[2];
    }
    out
}
1333
1334// ---------------------------------------------------------------------------
1335// ComputeCapabilities — detect hardware support
1336// ---------------------------------------------------------------------------
1337
/// Detected GPU compute capabilities.
#[derive(Debug, Clone)]
pub struct ComputeCapabilities {
    /// True when the context is GL 4.3+ (compute shader support).
    pub has_compute: bool,
    /// Max total invocations per workgroup (compared against x*y*z).
    pub max_work_group_invocations: u32,
    /// Per-axis workgroup size limits [x, y, z].
    pub max_work_group_size: [u32; 3],
    /// Per-axis dispatch (workgroup count) limits [x, y, z].
    pub max_work_group_count: [u32; 3],
    /// Max shared ("local") memory per workgroup, in bytes.
    pub max_shared_memory: u32,
    /// Number of SSBO binding points.
    pub max_ssbo_bindings: u32,
    /// Number of atomic counter buffer binding points.
    pub max_atomic_counter_bindings: u32,
    /// Context major version (all limit fields are 0 when below 4.3).
    pub gl_version_major: u32,
    /// Context minor version.
    pub gl_version_minor: u32,
}
1351
1352impl ComputeCapabilities {
1353    /// Query capabilities from the GL context.
1354    pub fn query(gl: &glow::Context) -> Self {
1355        use glow::HasContext;
1356        unsafe {
1357            let major = gl.get_parameter_i32(glow::MAJOR_VERSION) as u32;
1358            let minor = gl.get_parameter_i32(glow::MINOR_VERSION) as u32;
1359            let has_compute = major > 4 || (major == 4 && minor >= 3);
1360
1361            if !has_compute {
1362                return Self {
1363                    has_compute: false,
1364                    max_work_group_invocations: 0,
1365                    max_work_group_size: [0; 3],
1366                    max_work_group_count: [0; 3],
1367                    max_shared_memory: 0,
1368                    max_ssbo_bindings: 0,
1369                    max_atomic_counter_bindings: 0,
1370                    gl_version_major: major,
1371                    gl_version_minor: minor,
1372                };
1373            }
1374
1375            let max_invocations = gl.get_parameter_i32(0x90EB) as u32;
1376            let max_size = [
1377                gl.get_parameter_indexed_i32(0x91BE, 0) as u32,
1378                gl.get_parameter_indexed_i32(0x91BE, 1) as u32,
1379                gl.get_parameter_indexed_i32(0x91BE, 2) as u32,
1380            ];
1381            let max_count = [
1382                gl.get_parameter_indexed_i32(0x91BF, 0) as u32,
1383                gl.get_parameter_indexed_i32(0x91BF, 1) as u32,
1384                gl.get_parameter_indexed_i32(0x91BF, 2) as u32,
1385            ];
1386            let max_shared = gl.get_parameter_i32(0x8262) as u32;
1387            let max_ssbo = gl.get_parameter_i32(0x90DC) as u32; // GL_MAX_SHADER_STORAGE_BUFFER_BINDINGS
1388            let max_atomic = gl.get_parameter_i32(0x92D1) as u32; // GL_MAX_ATOMIC_COUNTER_BUFFER_BINDINGS
1389
1390            Self {
1391                has_compute,
1392                max_work_group_invocations: max_invocations,
1393                max_work_group_size: max_size,
1394                max_work_group_count: max_count,
1395                max_shared_memory: max_shared,
1396                max_ssbo_bindings: max_ssbo,
1397                max_atomic_counter_bindings: max_atomic,
1398                gl_version_major: major,
1399                gl_version_minor: minor,
1400            }
1401        }
1402    }
1403
1404    /// Check if a specific workgroup size fits within limits.
1405    pub fn validate_workgroup(&self, size: &super::dispatch::WorkgroupSize) -> bool {
1406        size.x <= self.max_work_group_size[0]
1407            && size.y <= self.max_work_group_size[1]
1408            && size.z <= self.max_work_group_size[2]
1409            && size.total_invocations() <= self.max_work_group_invocations
1410    }
1411
1412    /// Summary string.
1413    pub fn summary(&self) -> String {
1414        if !self.has_compute {
1415            return format!(
1416                "GL {}.{}: NO compute support (requires 4.3+)",
1417                self.gl_version_major, self.gl_version_minor
1418            );
1419        }
1420        format!(
1421            "GL {}.{}: compute OK, max_invocations={}, max_size=[{},{},{}], max_shared={}KB, ssbo_bindings={}, atomic_bindings={}",
1422            self.gl_version_major, self.gl_version_minor,
1423            self.max_work_group_invocations,
1424            self.max_work_group_size[0], self.max_work_group_size[1], self.max_work_group_size[2],
1425            self.max_shared_memory / 1024,
1426            self.max_ssbo_bindings,
1427            self.max_atomic_counter_bindings,
1428        )
1429    }
1430}