1use std::collections::{HashMap, VecDeque};
5use std::time::{Duration, Instant};
6
// glFenceSync condition: the fence signals once all prior GL commands complete.
const GL_SYNC_GPU_COMMANDS_COMPLETE: u32 = 0x9117;
// glClientWaitSync return codes.
const GL_ALREADY_SIGNALED: u32 = 0x911A;
const GL_TIMEOUT_EXPIRED: u32 = 0x911B;
const GL_CONDITION_SATISFIED: u32 = 0x911C;
const GL_WAIT_FAILED: u32 = 0x911D;
// glClientWaitSync flag: flush the command queue before waiting.
const GL_SYNC_FLUSH_COMMANDS_BIT: u32 = 0x00000001;

// glMemoryBarrier bitfield values (GL 4.2+ core enums).
const GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT: u32 = 0x00000001;
const GL_ELEMENT_ARRAY_BARRIER_BIT: u32 = 0x00000002;
const GL_UNIFORM_BARRIER_BIT: u32 = 0x00000004;
const GL_TEXTURE_FETCH_BARRIER_BIT: u32 = 0x00000008;
const GL_SHADER_IMAGE_ACCESS_BARRIER_BIT: u32 = 0x00000020;
const GL_COMMAND_BARRIER_BIT: u32 = 0x00000040;
const GL_PIXEL_BUFFER_BARRIER_BIT: u32 = 0x00000080;
const GL_TEXTURE_UPDATE_BARRIER_BIT: u32 = 0x00000100;
const GL_BUFFER_UPDATE_BARRIER_BIT: u32 = 0x00000200;
const GL_FRAMEBUFFER_BARRIER_BIT: u32 = 0x00000400;
const GL_TRANSFORM_FEEDBACK_BARRIER_BIT: u32 = 0x00000800;
const GL_ATOMIC_COUNTER_BARRIER_BIT: u32 = 0x00001000;
const GL_SHADER_STORAGE_BARRIER_BIT: u32 = 0x00002000;
// All barrier bits set (GL_ALL_BARRIER_BITS).
const GL_ALL_BARRIER_BITS: u32 = 0xFFFFFFFF;
33
/// Observed state of a [`FenceSync`], cached from the last GL query.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FenceStatus {
    /// Fence inserted but not yet seen signaled.
    Unsignaled,
    /// GPU has passed the fence.
    Signaled,
    /// A bounded wait elapsed before the fence signaled.
    TimedOut,
    /// glClientWaitSync reported GL_WAIT_FAILED.
    Failed,
    /// No sync object has been inserted yet.
    NotInserted,
}
52
/// Wrapper around a GL fence sync object with a cached status and
/// insertion timestamp. Has no `Drop` impl — call `destroy` with a live
/// GL context to free the sync object.
pub struct FenceSync {
    /// Underlying GL sync object, if one has been inserted.
    sync: Option<glow::NativeFence>,
    /// Last status observed by `poll`/`wait` (not refreshed automatically).
    status: FenceStatus,
    /// Wall-clock time at which the fence was inserted.
    inserted_at: Option<Instant>,
}
62
63impl FenceSync {
64 pub fn new() -> Self {
66 Self {
67 sync: None,
68 status: FenceStatus::NotInserted,
69 inserted_at: None,
70 }
71 }
72
73 pub fn insert(&mut self, gl: &glow::Context) {
75 use glow::HasContext;
76 if let Some(old) = self.sync.take() {
78 unsafe {
79 gl.delete_sync(old);
80 }
81 }
82 let sync = unsafe { gl.fence_sync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0).unwrap() };
83 self.sync = Some(sync);
84 self.status = FenceStatus::Unsignaled;
85 self.inserted_at = Some(Instant::now());
86 }
87
88 pub fn poll(&mut self, gl: &glow::Context) -> FenceStatus {
90 if let Some(sync) = self.sync {
91 use glow::HasContext;
92 let result = unsafe { gl.client_wait_sync(sync, 0, 0) };
93 self.status = match result {
94 GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
95 GL_TIMEOUT_EXPIRED => FenceStatus::Unsignaled,
96 GL_WAIT_FAILED => FenceStatus::Failed,
97 _ => FenceStatus::Unsignaled,
98 };
99 }
100 self.status
101 }
102
103 pub fn wait(&mut self, gl: &glow::Context, timeout: Duration) -> FenceStatus {
105 if let Some(sync) = self.sync {
106 use glow::HasContext;
107 let timeout_ns = timeout.as_nanos() as u64;
108 let result = unsafe {
109 gl.client_wait_sync(sync, GL_SYNC_FLUSH_COMMANDS_BIT, timeout_ns as i32)
110 };
111 self.status = match result {
112 GL_ALREADY_SIGNALED | GL_CONDITION_SATISFIED => FenceStatus::Signaled,
113 GL_TIMEOUT_EXPIRED => FenceStatus::TimedOut,
114 GL_WAIT_FAILED => FenceStatus::Failed,
115 _ => FenceStatus::Unsignaled,
116 };
117 }
118 self.status
119 }
120
121 pub fn wait_forever(&mut self, gl: &glow::Context) -> FenceStatus {
123 self.wait(gl, Duration::from_secs(30)) }
125
126 pub fn status(&self) -> FenceStatus {
128 self.status
129 }
130
131 pub fn is_signaled(&self) -> bool {
133 self.status == FenceStatus::Signaled
134 }
135
136 pub fn elapsed_since_insert(&self) -> Option<Duration> {
138 self.inserted_at.map(|t| t.elapsed())
139 }
140
141 pub fn destroy(self, gl: &glow::Context) {
143 if let Some(sync) = self.sync {
144 use glow::HasContext;
145 unsafe {
146 gl.delete_sync(sync);
147 }
148 }
149 }
150}
151
152impl Default for FenceSync {
153 fn default() -> Self {
154 Self::new()
155 }
156}
157
/// Newtype over the raw `glMemoryBarrier` bitfield.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct MemoryBarrierFlags(pub u32);
165
166impl MemoryBarrierFlags {
167 pub const VERTEX_ATTRIB: Self = Self(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT);
168 pub const ELEMENT_ARRAY: Self = Self(GL_ELEMENT_ARRAY_BARRIER_BIT);
169 pub const UNIFORM: Self = Self(GL_UNIFORM_BARRIER_BIT);
170 pub const TEXTURE_FETCH: Self = Self(GL_TEXTURE_FETCH_BARRIER_BIT);
171 pub const SHADER_IMAGE: Self = Self(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT);
172 pub const COMMAND: Self = Self(GL_COMMAND_BARRIER_BIT);
173 pub const PIXEL_BUFFER: Self = Self(GL_PIXEL_BUFFER_BARRIER_BIT);
174 pub const TEXTURE_UPDATE: Self = Self(GL_TEXTURE_UPDATE_BARRIER_BIT);
175 pub const BUFFER_UPDATE: Self = Self(GL_BUFFER_UPDATE_BARRIER_BIT);
176 pub const FRAMEBUFFER: Self = Self(GL_FRAMEBUFFER_BARRIER_BIT);
177 pub const TRANSFORM_FEEDBACK: Self = Self(GL_TRANSFORM_FEEDBACK_BARRIER_BIT);
178 pub const ATOMIC_COUNTER: Self = Self(GL_ATOMIC_COUNTER_BARRIER_BIT);
179 pub const SHADER_STORAGE: Self = Self(GL_SHADER_STORAGE_BARRIER_BIT);
180 pub const ALL: Self = Self(GL_ALL_BARRIER_BITS);
181
182 pub fn combine(self, other: Self) -> Self {
184 Self(self.0 | other.0)
185 }
186
187 pub fn contains(self, flag: Self) -> bool {
189 (self.0 & flag.0) == flag.0
190 }
191
192 pub fn issue(self, gl: &glow::Context) {
194 use glow::HasContext;
195 unsafe {
196 gl.memory_barrier(self.0);
197 }
198 }
199}
200
201impl std::ops::BitOr for MemoryBarrierFlags {
202 type Output = Self;
203 fn bitor(self, rhs: Self) -> Self {
204 Self(self.0 | rhs.0)
205 }
206}
207
208impl std::ops::BitAnd for MemoryBarrierFlags {
209 type Output = Self;
210 fn bitand(self, rhs: Self) -> Self {
211 Self(self.0 & rhs.0)
212 }
213}
214
/// Declarative description of a synchronization point: which memory
/// barrier bits to issue and whether to insert a fence afterwards.
#[derive(Debug, Clone)]
pub struct PipelineBarrier {
    /// glMemoryBarrier bits issued by `execute`.
    pub memory_flags: MemoryBarrierFlags,
    /// When true, `execute` also inserts and returns a fence.
    pub fence: bool,
    /// Optional debug label; not used by `execute` itself.
    pub label: Option<String>,
}
229
230impl PipelineBarrier {
231 pub fn memory(flags: MemoryBarrierFlags) -> Self {
233 Self {
234 memory_flags: flags,
235 fence: false,
236 label: None,
237 }
238 }
239
240 pub fn memory_and_fence(flags: MemoryBarrierFlags) -> Self {
242 Self {
243 memory_flags: flags,
244 fence: true,
245 label: None,
246 }
247 }
248
249 pub fn full() -> Self {
251 Self {
252 memory_flags: MemoryBarrierFlags::ALL,
253 fence: true,
254 label: None,
255 }
256 }
257
258 pub fn with_label(mut self, label: &str) -> Self {
260 self.label = Some(label.to_string());
261 self
262 }
263
264 pub fn ssbo_raw() -> Self {
266 Self::memory(MemoryBarrierFlags::SHADER_STORAGE)
267 }
268
269 pub fn compute_to_vertex() -> Self {
271 Self::memory(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::VERTEX_ATTRIB)
272 }
273
274 pub fn compute_to_indirect() -> Self {
276 Self::memory(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::COMMAND)
277 }
278
279 pub fn execute(&self, gl: &glow::Context) -> Option<FenceSync> {
281 self.memory_flags.issue(gl);
282 if self.fence {
283 let mut fence = FenceSync::new();
284 fence.insert(gl);
285 Some(fence)
286 } else {
287 None
288 }
289 }
290}
291
/// Logical usage state of a tracked GPU resource; transitions between
/// states determine which memory barrier bits are required.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ResourceState {
    /// No known prior usage.
    Undefined,
    /// Written by a compute shader (SSBO/image store).
    ComputeWrite,
    /// Read by a compute shader.
    ComputeRead,
    /// Read as a vertex attribute buffer.
    VertexRead,
    /// Read as an element/index buffer.
    IndexRead,
    /// Read as an indirect draw/dispatch command buffer.
    IndirectRead,
    /// Read as a uniform buffer.
    UniformRead,
    /// Mapped or read/written by the CPU.
    CpuAccess,
    /// Source of a buffer copy.
    TransferSrc,
    /// Destination of a buffer copy.
    TransferDst,
}
320
/// Tracks the last known [`ResourceState`] per resource id and computes
/// the barrier needed when a resource changes state.
pub struct ResourceTransition {
    /// resource id -> last recorded state.
    states: HashMap<u32, ResourceState>,
}
325
impl ResourceTransition {
    /// Creates an empty tracker.
    pub fn new() -> Self {
        Self {
            states: HashMap::new(),
        }
    }

    /// Records (or overwrites) the state of a resource without emitting
    /// any barrier.
    pub fn register(&mut self, resource_id: u32, initial: ResourceState) {
        self.states.insert(resource_id, initial);
    }

    /// Last recorded state, or None if the resource is not tracked.
    pub fn current_state(&self, resource_id: u32) -> Option<ResourceState> {
        self.states.get(&resource_id).copied()
    }

    /// Moves a resource to `new_state` and returns the barrier bits the
    /// CALLER must issue (None when no barrier is required). Untracked
    /// resources are treated as `Undefined`.
    pub fn transition(
        &mut self,
        resource_id: u32,
        new_state: ResourceState,
    ) -> Option<MemoryBarrierFlags> {
        let old_state = self.states.get(&resource_id).copied().unwrap_or(ResourceState::Undefined);
        // Same state: nothing to do and the map is left untouched.
        if old_state == new_state {
            return None;
        }
        let flags = Self::barrier_for_transition(old_state, new_state);
        self.states.insert(resource_id, new_state);
        flags
    }

    /// Convenience: `transition` followed by issuing the barrier, if any.
    pub fn transition_and_barrier(
        &mut self,
        gl: &glow::Context,
        resource_id: u32,
        new_state: ResourceState,
    ) {
        if let Some(flags) = self.transition(resource_id, new_state) {
            flags.issue(gl);
        }
    }

    /// Barrier table: maps an (old, new) state pair to the memory barrier
    /// bits protecting that hazard. Arm ORDER matters: specific pairs are
    /// listed first, then a defensive same-state guard, then a
    /// conservative catch-all for any other write-to-X transition.
    fn barrier_for_transition(
        from: ResourceState,
        to: ResourceState,
    ) -> Option<MemoryBarrierFlags> {
        match (from, to) {
            // Compute write followed by a read of some kind.
            (ResourceState::ComputeWrite, ResourceState::ComputeRead) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE)
            }
            (ResourceState::ComputeWrite, ResourceState::VertexRead) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::VERTEX_ATTRIB)
            }
            (ResourceState::ComputeWrite, ResourceState::IndexRead) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::ELEMENT_ARRAY)
            }
            (ResourceState::ComputeWrite, ResourceState::IndirectRead) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::COMMAND)
            }
            (ResourceState::ComputeWrite, ResourceState::UniformRead) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::UNIFORM)
            }
            (ResourceState::ComputeWrite, ResourceState::CpuAccess) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
            }
            (ResourceState::ComputeWrite, ResourceState::TransferSrc) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
            }
            (ResourceState::ComputeWrite, ResourceState::TransferDst) => {
                Some(MemoryBarrierFlags::SHADER_STORAGE | MemoryBarrierFlags::BUFFER_UPDATE)
            }
            // Transfer/CPU writes consumed by shaders or vertex fetch.
            (ResourceState::TransferDst, ResourceState::ComputeRead) => {
                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
            }
            (ResourceState::TransferDst, ResourceState::VertexRead) => {
                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::VERTEX_ATTRIB)
            }
            (ResourceState::CpuAccess, ResourceState::ComputeRead) => {
                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
            }
            (ResourceState::CpuAccess, ResourceState::ComputeWrite) => {
                Some(MemoryBarrierFlags::BUFFER_UPDATE | MemoryBarrierFlags::SHADER_STORAGE)
            }
            // Defensive: `transition` already filters equal states, but a
            // direct caller of this fn still gets None for a no-op.
            _ if from == to => None,
            // Any other transition out of a compute write (i.e. to
            // Undefined) is covered by the conservative full barrier.
            (ResourceState::ComputeWrite, _) => Some(MemoryBarrierFlags::ALL),
            // Read-to-read and other transitions need no barrier.
            _ => None,
        }
    }

    /// Stops tracking a resource (e.g. on deletion).
    pub fn unregister(&mut self, resource_id: u32) {
        self.states.remove(&resource_id);
    }

    /// Number of resources currently tracked.
    pub fn tracked_count(&self) -> usize {
        self.states.len()
    }

    /// Resets every tracked resource to `Undefined` (ids stay tracked).
    pub fn reset_all(&mut self) {
        for state in self.states.values_mut() {
            *state = ResourceState::Undefined;
        }
    }
}
439
440impl Default for ResourceTransition {
441 fn default() -> Self {
442 Self::new()
443 }
444}
445
/// Internal record of one submitted compute dispatch.
struct ComputeJob {
    /// Queue-assigned id returned by `submit`.
    id: u64,
    /// Key into the pipeline cache identifying the compute program.
    program_key: u64,
    /// Workgroup counts passed to glDispatchCompute.
    dimension: super::dispatch::DispatchDimension,
    /// Uniforms uploaded just before dispatching.
    uniforms: Vec<super::dispatch::UniformValue>,
    /// Memory barrier issued immediately after the dispatch.
    barrier: MemoryBarrierFlags,
    /// Completion fence; None until the job is dispatched.
    fence: Option<FenceSync>,
    /// True once glDispatchCompute has been issued for this job.
    dispatched: bool,
    /// True once the fence has been observed signaled.
    completed: bool,
}
469
/// FIFO queue of compute dispatches with fence-based completion tracking
/// and a cap on concurrently in-flight jobs.
pub struct AsyncComputeQueue {
    /// Jobs in submission order (front = oldest).
    jobs: VecDeque<ComputeJob>,
    /// Next id handed out by `submit` (starts at 1).
    next_id: u64,
    /// Maximum number of dispatched-but-unfinished jobs at once.
    max_in_flight: usize,
}
479
impl AsyncComputeQueue {
    /// Creates an empty queue allowing at most `max_in_flight`
    /// dispatched-but-unfinished jobs at a time.
    pub fn new(max_in_flight: usize) -> Self {
        Self {
            jobs: VecDeque::new(),
            next_id: 1,
            max_in_flight,
        }
    }

    /// Enqueues a compute job (not dispatched until `flush`) and returns
    /// its id. Ids start at 1 and increase monotonically.
    pub fn submit(
        &mut self,
        program_key: u64,
        dimension: super::dispatch::DispatchDimension,
        uniforms: Vec<super::dispatch::UniformValue>,
        barrier: MemoryBarrierFlags,
    ) -> u64 {
        let id = self.next_id;
        self.next_id += 1;
        self.jobs.push_back(ComputeJob {
            id,
            program_key,
            dimension,
            uniforms,
            barrier,
            fence: None,
            dispatched: false,
            completed: false,
        });
        id
    }

    /// Dispatches pending jobs in submission order, respecting the
    /// in-flight cap. Each dispatch is followed by the job's memory
    /// barrier and a fence used later for completion tracking.
    ///
    /// NOTE(review): a job whose `program_key` is missing from `cache`
    /// is skipped but remains pending forever; because `drain_completed`
    /// only pops from the front, such a job also blocks later completed
    /// jobs from draining — confirm this starvation is intended.
    pub fn flush(
        &mut self,
        gl: &glow::Context,
        cache: &super::dispatch::PipelineCache,
    ) {
        use glow::HasContext;
        let in_flight = self.jobs.iter().filter(|j| j.dispatched && !j.completed).count();
        let can_dispatch = self.max_in_flight.saturating_sub(in_flight);

        let mut dispatched_count = 0;
        for job in self.jobs.iter_mut() {
            if dispatched_count >= can_dispatch {
                break;
            }
            if job.dispatched {
                continue;
            }
            if let Some(program) = cache.cache.get(&job.program_key) {
                program.bind(gl);
                // Upload this job's uniforms before dispatching.
                for u in &job.uniforms {
                    match u {
                        super::dispatch::UniformValue::Int(name, v) => {
                            program.set_uniform_int(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Uint(name, v) => {
                            program.set_uniform_uint(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Float(name, v) => {
                            program.set_uniform_float(gl, name, *v)
                        }
                        super::dispatch::UniformValue::Vec2(name, x, y) => {
                            program.set_uniform_vec2(gl, name, *x, *y)
                        }
                        super::dispatch::UniformValue::Vec3(name, x, y, z) => {
                            program.set_uniform_vec3(gl, name, *x, *y, *z)
                        }
                        super::dispatch::UniformValue::Vec4(name, x, y, z, w) => {
                            program.set_uniform_vec4(gl, name, *x, *y, *z, *w)
                        }
                    }
                }
                let (gx, gy, gz) = job.dimension.as_tuple();
                unsafe {
                    gl.dispatch_compute(gx, gy, gz);
                    gl.memory_barrier(job.barrier.0);
                }
                // The fence lets poll()/wait_for() detect completion.
                let mut fence = FenceSync::new();
                fence.insert(gl);
                job.fence = Some(fence);
                job.dispatched = true;
                dispatched_count += 1;
            }
        }
    }

    /// Non-blocking check of every in-flight fence, marking finished
    /// jobs as completed.
    pub fn poll(&mut self, gl: &glow::Context) {
        for job in self.jobs.iter_mut() {
            if job.dispatched && !job.completed {
                if let Some(ref mut fence) = job.fence {
                    if fence.poll(gl) == FenceStatus::Signaled {
                        job.completed = true;
                    }
                }
            }
        }
    }

    /// Pops completed jobs from the FRONT of the queue only (so results
    /// are surfaced in submission order) and returns their ids.
    /// NOTE: the popped jobs' fences are dropped without `destroy`,
    /// leaking the GL sync objects — presumably acceptable here, but
    /// worth confirming.
    pub fn drain_completed(&mut self) -> Vec<u64> {
        let mut completed = Vec::new();
        while let Some(front) = self.jobs.front() {
            if front.completed {
                let job = self.jobs.pop_front().unwrap();
                completed.push(job.id);
            } else {
                break;
            }
        }
        completed
    }

    /// True when the job has completed — or is no longer tracked
    /// (already drained), which is treated as complete.
    pub fn is_complete(&self, job_id: u64) -> bool {
        self.jobs.iter().find(|j| j.id == job_id).map_or(true, |j| j.completed)
    }

    /// Blocks up to `timeout` for a single job. Returns true when the
    /// job is complete; unknown ids count as already complete.
    pub fn wait_for(
        &mut self,
        gl: &glow::Context,
        job_id: u64,
        timeout: Duration,
    ) -> bool {
        if let Some(job) = self.jobs.iter_mut().find(|j| j.id == job_id) {
            if job.completed {
                return true;
            }
            if let Some(ref mut fence) = job.fence {
                let status = fence.wait(gl, timeout);
                if status == FenceStatus::Signaled {
                    job.completed = true;
                    return true;
                }
            }
            false
        } else {
            true
        }
    }

    /// Jobs submitted but not yet dispatched.
    pub fn pending_count(&self) -> usize {
        self.jobs.iter().filter(|j| !j.dispatched).count()
    }

    /// Jobs dispatched but not yet observed complete.
    pub fn in_flight_count(&self) -> usize {
        self.jobs.iter().filter(|j| j.dispatched && !j.completed).count()
    }

    /// All jobs currently tracked by the queue.
    pub fn total_count(&self) -> usize {
        self.jobs.len()
    }

    /// Deletes every outstanding fence sync object.
    pub fn destroy(self, gl: &glow::Context) {
        for job in self.jobs {
            if let Some(fence) = job.fence {
                fence.destroy(gl);
            }
        }
    }
}
651
/// Ring of per-frame fences for CPU/GPU frame pacing (double/triple
/// buffering style): `begin_frame` blocks until the slot's previous
/// frame finished on the GPU, `end_frame` fences the current frame.
pub struct FrameTimeline {
    /// One context per ring slot.
    frames: Vec<FrameContext>,
    /// Index of the slot currently being recorded.
    current: usize,
    /// Total frames ended so far; also the id of the current frame.
    total_frames: u64,
}
668
/// Per-slot bookkeeping for one in-flight frame.
struct FrameContext {
    /// Fence inserted at `end_frame` for this slot's frame.
    fence: FenceSync,
    /// Frame number last recorded into this slot (0 until first use).
    frame_number: u64,
    /// Resource ids registered as transient for this frame; cleared when
    /// the slot is recycled.
    transient_resources: Vec<u32>,
    /// When the slot's frame began recording.
    begin_time: Option<Instant>,
    /// When the slot's previous frame was observed GPU-complete.
    complete_time: Option<Instant>,
}
682
impl FrameTimeline {
    /// Creates a ring of `ring_size` frame slots, none fenced yet.
    pub fn new(ring_size: usize) -> Self {
        let frames = (0..ring_size)
            .map(|_| FrameContext {
                fence: FenceSync::new(),
                frame_number: 0,
                transient_resources: Vec::new(),
                begin_time: None,
                complete_time: None,
            })
            .collect();
        Self {
            frames,
            current: 0,
            total_frames: 0,
        }
    }

    /// Begins recording into the current slot, first blocking until the
    /// GPU finished the frame previously recorded in this slot.
    pub fn begin_frame(&mut self, gl: &glow::Context) {
        let ctx = &mut self.frames[self.current];

        // Only wait when the cached status says work is outstanding;
        // fresh slots (NotInserted) skip the wait entirely.
        if ctx.fence.status() == FenceStatus::Unsignaled {
            ctx.fence.wait_forever(gl);
        }
        // NOTE(review): complete_time is stamped even when no wait
        // occurred (fresh slot), which feeds near-zero samples into
        // average_latency — confirm intended.
        ctx.complete_time = Some(Instant::now());

        // The slot is recycled: its transient resources are now safe to
        // reuse/free by the caller.
        ctx.transient_resources.clear();

        ctx.frame_number = self.total_frames;
        ctx.begin_time = Some(Instant::now());
    }

    /// Fences the commands recorded for this frame and advances to the
    /// next ring slot.
    pub fn end_frame(&mut self, gl: &glow::Context) {
        self.frames[self.current].fence.insert(gl);
        self.current = (self.current + 1) % self.frames.len();
        self.total_frames += 1;
    }

    /// Tags a transient resource as in use by the current frame; cleared
    /// automatically when the slot is recycled by `begin_frame`.
    pub fn register_transient(&mut self, resource_id: u32) {
        self.frames[self.current]
            .transient_resources
            .push(resource_id);
    }

    /// Number of the frame currently being recorded.
    pub fn current_frame_number(&self) -> u64 {
        self.total_frames
    }

    /// Number of slots in the ring.
    pub fn ring_size(&self) -> usize {
        self.frames.len()
    }

    /// Index of the slot currently being recorded.
    pub fn current_slot(&self) -> usize {
        self.current
    }

    /// Whether the GPU finished the given frame. Frames no longer held
    /// by any slot are assumed long complete.
    /// NOTE(review): unused slots keep frame_number 0, so querying for
    /// frame 0 can match a slot that never recorded it — confirm ok.
    pub fn is_frame_complete(&mut self, gl: &glow::Context, frame_number: u64) -> bool {
        for ctx in self.frames.iter_mut() {
            if ctx.frame_number == frame_number {
                if ctx.fence.is_signaled() {
                    return true;
                }
                return ctx.fence.poll(gl) == FenceStatus::Signaled;
            }
        }
        true
    }

    /// Blocks until every slot's outstanding fence has signaled.
    pub fn wait_all(&mut self, gl: &glow::Context) {
        for ctx in self.frames.iter_mut() {
            if ctx.fence.status() == FenceStatus::Unsignaled {
                ctx.fence.wait_forever(gl);
            }
        }
    }

    /// Mean begin→complete duration over slots holding both timestamps
    /// (approximates per-slot GPU latency across one ring cycle).
    pub fn average_latency(&self) -> Option<Duration> {
        let mut total = Duration::ZERO;
        let mut count = 0u32;
        for ctx in &self.frames {
            if let (Some(begin), Some(complete)) = (ctx.begin_time, ctx.complete_time) {
                if complete > begin {
                    total += complete - begin;
                    count += 1;
                }
            }
        }
        if count > 0 {
            Some(total / count)
        } else {
            None
        }
    }

    /// Deletes all fence sync objects; call before the GL context dies.
    pub fn destroy(self, gl: &glow::Context) {
        for ctx in self.frames {
            ctx.fence.destroy(gl);
        }
    }
}
798
/// CPU implementations of the compute kernels, used when the GL context
/// lacks compute-shader support. Records per-kernel wall-clock timings.
pub struct CpuFallback {
    /// Whether the CPU path is the active backend.
    active: bool,
    /// kernel name -> last execution time in microseconds.
    last_execution_us: HashMap<String, u64>,
}
815
816impl CpuFallback {
817 pub fn new() -> Self {
819 Self {
820 active: false,
821 last_execution_us: HashMap::new(),
822 }
823 }
824
825 pub fn activate(&mut self) {
827 self.active = true;
828 }
829
830 pub fn deactivate(&mut self) {
832 self.active = false;
833 }
834
835 pub fn is_active(&self) -> bool {
837 self.active
838 }
839
840 pub fn particle_integrate(
842 &mut self,
843 positions: &mut [[f32; 4]],
844 velocities: &mut [[f32; 4]],
845 params: &super::kernels::ParticleIntegrateParams,
846 ) {
847 let start = Instant::now();
848 let dt = params.dt;
849 let gravity = params.gravity;
850 let damping = params.damping;
851 let max_age = params.max_age;
852 let wind = params.wind;
853
854 for i in 0..positions.len() {
855 let age = positions[i][3] + dt;
856 let lifetime = velocities[i][3];
857
858 if age >= lifetime || age >= max_age {
859 positions[i][3] = lifetime + 1.0;
860 velocities[i][0] = 0.0;
861 velocities[i][1] = 0.0;
862 velocities[i][2] = 0.0;
863 continue;
864 }
865
866 velocities[i][0] += gravity[0] * dt;
868 velocities[i][1] += gravity[1] * dt;
869 velocities[i][2] += gravity[2] * dt;
870
871 velocities[i][0] += wind[0] * dt;
873 velocities[i][1] += wind[1] * dt;
874 velocities[i][2] += wind[2] * dt;
875
876 let d = damping.powf(dt);
878 velocities[i][0] *= d;
879 velocities[i][1] *= d;
880 velocities[i][2] *= d;
881
882 positions[i][0] += velocities[i][0] * dt;
884 positions[i][1] += velocities[i][1] * dt;
885 positions[i][2] += velocities[i][2] * dt;
886 positions[i][3] = age;
887 }
888
889 let elapsed = start.elapsed().as_micros() as u64;
890 self.last_execution_us
891 .insert("particle_integrate".to_string(), elapsed);
892 }
893
894 pub fn lorenz_step(
896 &mut self,
897 points: &mut [[f32; 3]],
898 sigma: f32,
899 rho: f32,
900 beta: f32,
901 dt: f32,
902 ) {
903 let start = Instant::now();
904 for p in points.iter_mut() {
905 let dx = sigma * (p[1] - p[0]);
906 let dy = p[0] * (rho - p[2]) - p[1];
907 let dz = p[0] * p[1] - beta * p[2];
908
909 let k1 = [dx, dy, dz];
911 let p2 = [
912 p[0] + 0.5 * dt * k1[0],
913 p[1] + 0.5 * dt * k1[1],
914 p[2] + 0.5 * dt * k1[2],
915 ];
916 let k2 = [
917 sigma * (p2[1] - p2[0]),
918 p2[0] * (rho - p2[2]) - p2[1],
919 p2[0] * p2[1] - beta * p2[2],
920 ];
921 let p3 = [
922 p[0] + 0.5 * dt * k2[0],
923 p[1] + 0.5 * dt * k2[1],
924 p[2] + 0.5 * dt * k2[2],
925 ];
926 let k3 = [
927 sigma * (p3[1] - p3[0]),
928 p3[0] * (rho - p3[2]) - p3[1],
929 p3[0] * p3[1] - beta * p3[2],
930 ];
931 let p4 = [
932 p[0] + dt * k3[0],
933 p[1] + dt * k3[1],
934 p[2] + dt * k3[2],
935 ];
936 let k4 = [
937 sigma * (p4[1] - p4[0]),
938 p4[0] * (rho - p4[2]) - p4[1],
939 p4[0] * p4[1] - beta * p4[2],
940 ];
941
942 p[0] += (dt / 6.0) * (k1[0] + 2.0 * k2[0] + 2.0 * k3[0] + k4[0]);
943 p[1] += (dt / 6.0) * (k1[1] + 2.0 * k2[1] + 2.0 * k3[1] + k4[1]);
944 p[2] += (dt / 6.0) * (k1[2] + 2.0 * k2[2] + 2.0 * k3[2] + k4[2]);
945 }
946 let elapsed = start.elapsed().as_micros() as u64;
947 self.last_execution_us
948 .insert("lorenz_step".to_string(), elapsed);
949 }
950
951 pub fn mandelbrot_iterate(
953 &mut self,
954 z_re: &mut [f32],
955 z_im: &mut [f32],
956 c_re: &[f32],
957 c_im: &[f32],
958 iterations: &mut [u32],
959 max_iter: u32,
960 ) {
961 let start = Instant::now();
962 assert_eq!(z_re.len(), z_im.len());
963 assert_eq!(z_re.len(), c_re.len());
964 assert_eq!(z_re.len(), c_im.len());
965 assert_eq!(z_re.len(), iterations.len());
966
967 for i in 0..z_re.len() {
968 if iterations[i] >= max_iter {
969 continue;
970 }
971 let zr = z_re[i];
972 let zi = z_im[i];
973 if zr * zr + zi * zi >= 4.0 {
974 continue;
975 }
976 z_re[i] = zr * zr - zi * zi + c_re[i];
977 z_im[i] = 2.0 * zr * zi + c_im[i];
978 iterations[i] += 1;
979 }
980 let elapsed = start.elapsed().as_micros() as u64;
981 self.last_execution_us
982 .insert("mandelbrot_iterate".to_string(), elapsed);
983 }
984
985 pub fn julia_iterate(
987 &mut self,
988 z_re: &mut [f32],
989 z_im: &mut [f32],
990 c_re: f32,
991 c_im: f32,
992 iterations: &mut [u32],
993 max_iter: u32,
994 ) {
995 let start = Instant::now();
996 for i in 0..z_re.len() {
997 if iterations[i] >= max_iter {
998 continue;
999 }
1000 let zr = z_re[i];
1001 let zi = z_im[i];
1002 if zr * zr + zi * zi >= 4.0 {
1003 continue;
1004 }
1005 z_re[i] = zr * zr - zi * zi + c_re;
1006 z_im[i] = 2.0 * zr * zi + c_im;
1007 iterations[i] += 1;
1008 }
1009 let elapsed = start.elapsed().as_micros() as u64;
1010 self.last_execution_us
1011 .insert("julia_iterate".to_string(), elapsed);
1012 }
1013
1014 pub fn prefix_sum_exclusive(&mut self, data: &mut [u32]) {
1016 let start = Instant::now();
1017 if data.is_empty() {
1018 return;
1019 }
1020 let mut sum = 0u32;
1021 for val in data.iter_mut() {
1022 let old = *val;
1023 *val = sum;
1024 sum += old;
1025 }
1026 let elapsed = start.elapsed().as_micros() as u64;
1027 self.last_execution_us
1028 .insert("prefix_sum".to_string(), elapsed);
1029 }
1030
1031 pub fn prefix_sum_inclusive(&mut self, data: &mut [u32]) {
1033 let start = Instant::now();
1034 if data.is_empty() {
1035 return;
1036 }
1037 for i in 1..data.len() {
1038 data[i] += data[i - 1];
1039 }
1040 let elapsed = start.elapsed().as_micros() as u64;
1041 self.last_execution_us
1042 .insert("prefix_sum_inclusive".to_string(), elapsed);
1043 }
1044
1045 pub fn radix_sort(&mut self, keys: &mut [u32], values: &mut [u32]) {
1047 let start = Instant::now();
1048 assert_eq!(keys.len(), values.len());
1049 let n = keys.len();
1050 if n == 0 {
1051 return;
1052 }
1053
1054 let mut keys_tmp = vec![0u32; n];
1055 let mut vals_tmp = vec![0u32; n];
1056
1057 let radix = 256usize;
1058 let mut counts = vec![0usize; radix];
1059
1060 for bit_offset in (0..32).step_by(8) {
1061 for c in counts.iter_mut() {
1063 *c = 0;
1064 }
1065 for &k in keys.iter() {
1066 let digit = ((k >> bit_offset) & 0xFF) as usize;
1067 counts[digit] += 1;
1068 }
1069 let mut total = 0;
1071 for c in counts.iter_mut() {
1072 let old = *c;
1073 *c = total;
1074 total += old;
1075 }
1076 for i in 0..n {
1078 let digit = ((keys[i] >> bit_offset) & 0xFF) as usize;
1079 let dest = counts[digit];
1080 keys_tmp[dest] = keys[i];
1081 vals_tmp[dest] = values[i];
1082 counts[digit] += 1;
1083 }
1084 keys.copy_from_slice(&keys_tmp);
1086 values.copy_from_slice(&vals_tmp);
1087 }
1088
1089 let elapsed = start.elapsed().as_micros() as u64;
1090 self.last_execution_us
1091 .insert("radix_sort".to_string(), elapsed);
1092 }
1093
1094 pub fn frustum_cull(
1096 &mut self,
1097 positions: &[[f32; 3]],
1098 radii: &[f32],
1099 planes: &[[f32; 4]; 6],
1100 ) -> Vec<usize> {
1101 let start = Instant::now();
1102 let mut visible = Vec::new();
1103
1104 for (i, (pos, &radius)) in positions.iter().zip(radii).enumerate() {
1105 let mut inside = true;
1106 for plane in planes {
1107 let dist =
1108 plane[0] * pos[0] + plane[1] * pos[1] + plane[2] * pos[2] + plane[3];
1109 if dist < -radius {
1110 inside = false;
1111 break;
1112 }
1113 }
1114 if inside {
1115 visible.push(i);
1116 }
1117 }
1118
1119 let elapsed = start.elapsed().as_micros() as u64;
1120 self.last_execution_us
1121 .insert("frustum_cull".to_string(), elapsed);
1122 visible
1123 }
1124
1125 pub fn skin_vertices(
1127 &mut self,
1128 positions: &[[f32; 3]],
1129 normals: &[[f32; 3]],
1130 bone_indices: &[[u32; 4]],
1131 bone_weights: &[[f32; 4]],
1132 bone_matrices: &[[f32; 16]],
1133 inv_bind_matrices: &[[f32; 16]],
1134 out_positions: &mut [[f32; 3]],
1135 out_normals: &mut [[f32; 3]],
1136 ) {
1137 let start = Instant::now();
1138
1139 for i in 0..positions.len() {
1140 let pos = positions[i];
1141 let norm = normals[i];
1142 let indices = bone_indices[i];
1143 let weights = bone_weights[i];
1144
1145 let mut skinned_pos = [0.0f32; 3];
1146 let mut skinned_norm = [0.0f32; 3];
1147
1148 for j in 0..4 {
1149 let w = weights[j];
1150 if w <= 0.0 {
1151 continue;
1152 }
1153 let bi = indices[j] as usize;
1154 if bi >= bone_matrices.len() {
1155 continue;
1156 }
1157 let bone = &bone_matrices[bi];
1159 let inv = &inv_bind_matrices[bi];
1160 let mat = mat4_mul(bone, inv);
1161
1162 let tp = mat4_transform_point(&mat, &pos);
1164 skinned_pos[0] += tp[0] * w;
1165 skinned_pos[1] += tp[1] * w;
1166 skinned_pos[2] += tp[2] * w;
1167
1168 let tn = mat4_transform_normal(&mat, &norm);
1170 skinned_norm[0] += tn[0] * w;
1171 skinned_norm[1] += tn[1] * w;
1172 skinned_norm[2] += tn[2] * w;
1173 }
1174
1175 let len = (skinned_norm[0] * skinned_norm[0]
1177 + skinned_norm[1] * skinned_norm[1]
1178 + skinned_norm[2] * skinned_norm[2])
1179 .sqrt();
1180 if len > 1e-6 {
1181 skinned_norm[0] /= len;
1182 skinned_norm[1] /= len;
1183 skinned_norm[2] /= len;
1184 }
1185
1186 out_positions[i] = skinned_pos;
1187 out_normals[i] = skinned_norm;
1188 }
1189
1190 let elapsed = start.elapsed().as_micros() as u64;
1191 self.last_execution_us
1192 .insert("skinning".to_string(), elapsed);
1193 }
1194
1195 pub fn fluid_diffuse(
1197 &mut self,
1198 grid: &mut [f32],
1199 scratch: &mut [f32],
1200 width: usize,
1201 height: usize,
1202 diffusion_rate: f32,
1203 dt: f32,
1204 iterations: usize,
1205 ) {
1206 let start = Instant::now();
1207 let dx = 1.0f32;
1208 let alpha = diffusion_rate * dt / (dx * dx);
1209 let r_beta = 1.0 / (1.0 + 4.0 * alpha);
1210
1211 for _ in 0..iterations {
1212 for y in 0..height {
1213 for x in 0..width {
1214 let idx = y * width + x;
1215 let left = if x > 0 { grid[idx - 1] } else { grid[idx] };
1216 let right = if x + 1 < width { grid[idx + 1] } else { grid[idx] };
1217 let down = if y > 0 { grid[idx - width] } else { grid[idx] };
1218 let up = if y + 1 < height { grid[idx + width] } else { grid[idx] };
1219 scratch[idx] = (grid[idx] + alpha * (left + right + down + up)) * r_beta;
1220 }
1221 }
1222 grid.copy_from_slice(scratch);
1223 }
1224
1225 let elapsed = start.elapsed().as_micros() as u64;
1226 self.last_execution_us
1227 .insert("fluid_diffuse".to_string(), elapsed);
1228 }
1229
1230 pub fn histogram_equalize(
1232 &mut self,
1233 data: &mut [f32],
1234 bin_count: usize,
1235 min_val: f32,
1236 max_val: f32,
1237 ) {
1238 let start = Instant::now();
1239 let range = max_val - min_val;
1240 if range <= 0.0 || data.is_empty() {
1241 return;
1242 }
1243
1244 let mut histogram = vec![0u32; bin_count];
1246 for &v in data.iter() {
1247 let norm = ((v - min_val) / range).clamp(0.0, 1.0);
1248 let bin = ((norm * (bin_count - 1) as f32) as usize).min(bin_count - 1);
1249 histogram[bin] += 1;
1250 }
1251
1252 let mut cdf = vec![0.0f32; bin_count];
1254 let mut running = 0u32;
1255 for i in 0..bin_count {
1256 running += histogram[i];
1257 cdf[i] = running as f32 / data.len() as f32;
1258 }
1259
1260 for v in data.iter_mut() {
1262 let norm = ((*v - min_val) / range).clamp(0.0, 1.0);
1263 let bin = ((norm * (bin_count - 1) as f32) as usize).min(bin_count - 1);
1264 *v = cdf[bin] * range + min_val;
1265 }
1266
1267 let elapsed = start.elapsed().as_micros() as u64;
1268 self.last_execution_us
1269 .insert("histogram_equalize".to_string(), elapsed);
1270 }
1271
1272 pub fn last_execution_us(&self, name: &str) -> Option<u64> {
1274 self.last_execution_us.get(name).copied()
1275 }
1276
1277 pub fn summary(&self) -> String {
1279 let mut s = String::from("=== CPU Fallback Timings ===\n");
1280 let mut names: Vec<&str> = self.last_execution_us.keys().map(|s| s.as_str()).collect();
1281 names.sort();
1282 for name in names {
1283 if let Some(us) = self.last_execution_us.get(name) {
1284 s.push_str(&format!(" {}: {} us\n", name, us));
1285 }
1286 }
1287 s
1288 }
1289}
1290
1291impl Default for CpuFallback {
1292 fn default() -> Self {
1293 Self::new()
1294 }
1295}
1296
/// Multiplies two column-major 4x4 matrices: `a * b`.
fn mat4_mul(a: &[f32; 16], b: &[f32; 16]) -> [f32; 16] {
    let mut out = [0.0f32; 16];
    for col in 0..4 {
        for row in 0..4 {
            // Dot product of a's row with b's column.
            out[col * 4 + row] = (0..4).map(|k| a[k * 4 + row] * b[col * 4 + k]).sum();
        }
    }
    out
}
1315
/// Transforms a point by a column-major 4x4 matrix (w assumed 1, so the
/// translation column m[12..15] is applied).
fn mat4_transform_point(m: &[f32; 16], p: &[f32; 3]) -> [f32; 3] {
    let mut out = [0.0f32; 3];
    for row in 0..3 {
        out[row] = m[row] * p[0] + m[4 + row] * p[1] + m[8 + row] * p[2] + m[12 + row];
    }
    out
}
1324
/// Transforms a direction by the upper 3x3 of a column-major 4x4 matrix
/// (translation ignored). Note this is NOT the inverse-transpose, so it
/// is only correct for normals under rigid/uniform-scale transforms.
fn mat4_transform_normal(m: &[f32; 16], n: &[f32; 3]) -> [f32; 3] {
    let mut out = [0.0f32; 3];
    for row in 0..3 {
        out[row] = m[row] * n[0] + m[4 + row] * n[1] + m[8 + row] * n[2];
    }
    out
}
1333
/// Compute-shader limits queried from a GL context.
#[derive(Debug, Clone)]
pub struct ComputeCapabilities {
    /// True when the context is GL 4.3+ (compute shaders available).
    pub has_compute: bool,
    /// GL_MAX_COMPUTE_WORK_GROUP_INVOCATIONS.
    pub max_work_group_invocations: u32,
    /// Per-axis local workgroup size limits.
    pub max_work_group_size: [u32; 3],
    /// Per-axis dispatch (workgroup count) limits.
    pub max_work_group_count: [u32; 3],
    /// Shared (local) memory bytes available per workgroup.
    pub max_shared_memory: u32,
    /// Maximum shader storage buffer binding points.
    pub max_ssbo_bindings: u32,
    /// Maximum atomic counter buffer binding points.
    pub max_atomic_counter_bindings: u32,
    /// Context major version.
    pub gl_version_major: u32,
    /// Context minor version.
    pub gl_version_minor: u32,
}
1351
1352impl ComputeCapabilities {
1353 pub fn query(gl: &glow::Context) -> Self {
1355 use glow::HasContext;
1356 unsafe {
1357 let major = gl.get_parameter_i32(glow::MAJOR_VERSION) as u32;
1358 let minor = gl.get_parameter_i32(glow::MINOR_VERSION) as u32;
1359 let has_compute = major > 4 || (major == 4 && minor >= 3);
1360
1361 if !has_compute {
1362 return Self {
1363 has_compute: false,
1364 max_work_group_invocations: 0,
1365 max_work_group_size: [0; 3],
1366 max_work_group_count: [0; 3],
1367 max_shared_memory: 0,
1368 max_ssbo_bindings: 0,
1369 max_atomic_counter_bindings: 0,
1370 gl_version_major: major,
1371 gl_version_minor: minor,
1372 };
1373 }
1374
1375 let max_invocations = gl.get_parameter_i32(0x90EB) as u32;
1376 let max_size = [
1377 gl.get_parameter_indexed_i32(0x91BE, 0) as u32,
1378 gl.get_parameter_indexed_i32(0x91BE, 1) as u32,
1379 gl.get_parameter_indexed_i32(0x91BE, 2) as u32,
1380 ];
1381 let max_count = [
1382 gl.get_parameter_indexed_i32(0x91BF, 0) as u32,
1383 gl.get_parameter_indexed_i32(0x91BF, 1) as u32,
1384 gl.get_parameter_indexed_i32(0x91BF, 2) as u32,
1385 ];
1386 let max_shared = gl.get_parameter_i32(0x8262) as u32;
1387 let max_ssbo = gl.get_parameter_i32(0x90DC) as u32; let max_atomic = gl.get_parameter_i32(0x92D1) as u32; Self {
1391 has_compute,
1392 max_work_group_invocations: max_invocations,
1393 max_work_group_size: max_size,
1394 max_work_group_count: max_count,
1395 max_shared_memory: max_shared,
1396 max_ssbo_bindings: max_ssbo,
1397 max_atomic_counter_bindings: max_atomic,
1398 gl_version_major: major,
1399 gl_version_minor: minor,
1400 }
1401 }
1402 }
1403
1404 pub fn validate_workgroup(&self, size: &super::dispatch::WorkgroupSize) -> bool {
1406 size.x <= self.max_work_group_size[0]
1407 && size.y <= self.max_work_group_size[1]
1408 && size.z <= self.max_work_group_size[2]
1409 && size.total_invocations() <= self.max_work_group_invocations
1410 }
1411
1412 pub fn summary(&self) -> String {
1414 if !self.has_compute {
1415 return format!(
1416 "GL {}.{}: NO compute support (requires 4.3+)",
1417 self.gl_version_major, self.gl_version_minor
1418 );
1419 }
1420 format!(
1421 "GL {}.{}: compute OK, max_invocations={}, max_size=[{},{},{}], max_shared={}KB, ssbo_bindings={}, atomic_bindings={}",
1422 self.gl_version_major, self.gl_version_minor,
1423 self.max_work_group_invocations,
1424 self.max_work_group_size[0], self.max_work_group_size[1], self.max_work_group_size[2],
1425 self.max_shared_memory / 1024,
1426 self.max_ssbo_bindings,
1427 self.max_atomic_counter_bindings,
1428 )
1429 }
1430}