use ringkernel_core::error::{Result, RingKernelError};
use ringkernel_core::memory::GpuBuffer;
use ringkernel_cuda::{CudaBuffer, CudaDevice};

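// Kernel source selection: produced by the kernel code generator when the
// `cuda-codegen` feature is enabled, otherwise read from the bundled .cu file.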
#[cfg(feature = "cuda-codegen")]
fn get_cuda_packed_source() -> String {
    super::kernels::generate_packed_kernels()
}

#[cfg(not(feature = "cuda-codegen"))]
fn get_cuda_packed_source() -> String {
    include_str!("../shaders/fdtd_packed.cu").to_string()
}

const MODULE_NAME: &str = "fdtd_packed";

const FN_EXCHANGE_HALOS: &str = "exchange_all_halos";
const FN_FDTD_ALL_TILES: &str = "fdtd_all_tiles";
const FN_READ_ALL_INTERIORS: &str = "read_all_interiors";
const FN_INJECT_IMPULSE: &str = "inject_impulse";
const FN_APPLY_BOUNDARY: &str = "apply_boundary_conditions";
const FN_UPLOAD_TILE: &str = "upload_tile_data";

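/// CUDA FDTD backend that packs every tile (interior plus a one-cell halo
/// ring) into one contiguous device buffer, so a time step is a fixed number
/// of kernel launches over all tiles at once rather than one launch per tile.
///
/// Typical usage (requires a CUDA device; grid and tile sizes are illustrative):
///
/// ```ignore
/// let mut backend = CudaPackedBackend::new(256, 256, 16)?;
/// backend.set_params(0.25, 0.99);
/// backend.inject_impulse(128, 128, 1.0)?;
/// backend.step_batch(100)?;
/// let pressure = backend.read_pressure_grid()?;
/// ```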
pub struct CudaPackedBackend {
    device: CudaDevice,

    /// Tile grid dimensions.
    tiles_x: u32,
    tiles_y: u32,

    /// Interior tile edge length in cells.
    tile_size: u32,
    /// Padded tile edge length: `tile_size + 2` (one halo cell per side).
    buffer_width: u32,

    /// Logical simulation grid dimensions in cells.
    grid_width: u32,
    grid_height: u32,

    /// Double-buffered packed state: every tile's padded buffer laid out contiguously.
    packed_a: CudaBuffer,
    packed_b: CudaBuffer,

    /// Which packed buffer holds the current state.
    current_is_a: bool,

    /// Precomputed (src, dst) float-index pairs consumed by the halo-exchange kernel.
    halo_copies: CudaBuffer,
    num_halo_copies: u32,

    /// Dense `grid_width * grid_height` buffer for readback.
    output_buffer: CudaBuffer,

    /// Scratch buffer used when uploading a single tile.
    staging_buffer: CudaBuffer,

    /// FDTD update parameters.
    c2: f32,
    damping: f32,
}

impl std::fmt::Debug for CudaPackedBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CudaPackedBackend")
            .field("tiles_x", &self.tiles_x)
            .field("tiles_y", &self.tiles_y)
            .field("tile_size", &self.tile_size)
            .field("grid_width", &self.grid_width)
            .field("grid_height", &self.grid_height)
            .field("num_halo_copies", &self.num_halo_copies)
            .field("c2", &self.c2)
            .field("damping", &self.damping)
            .finish_non_exhaustive()
    }
}

impl CudaPackedBackend {
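    /// Creates a backend on the default CUDA device (ordinal 0).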
    pub fn new(grid_width: u32, grid_height: u32, tile_size: u32) -> Result<Self> {
        Self::with_device(0, grid_width, grid_height, tile_size)
    }

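    /// Creates a backend on the CUDA device with the given ordinal: compiles
    /// the packed kernels with NVRTC, allocates the double-buffered packed
    /// state, and uploads the precomputed halo-copy list.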
    pub fn with_device(
        ordinal: usize,
        grid_width: u32,
        grid_height: u32,
        tile_size: u32,
    ) -> Result<Self> {
        let device = CudaDevice::new(ordinal)?;

        tracing::info!(
            "CUDA packed backend: {} (CC {}.{})",
            device.name(),
            device.compute_capability().0,
            device.compute_capability().1
        );

        let tiles_x = grid_width.div_ceil(tile_size);
        let tiles_y = grid_height.div_ceil(tile_size);
        let buffer_width = tile_size + 2;
        let tile_buffer_size = buffer_width * buffer_width;
        let num_tiles = tiles_x * tiles_y;
        let total_buffer_size =
            (num_tiles * tile_buffer_size) as usize * std::mem::size_of::<f32>();

        tracing::info!(
            "Packed layout: {}x{} tiles, {} total floats ({:.1} KB)",
            tiles_x,
            tiles_y,
            num_tiles * tile_buffer_size,
            total_buffer_size as f64 / 1024.0
        );

        let cuda_source = get_cuda_packed_source();
        let ptx = cudarc::nvrtc::compile_ptx(&cuda_source).map_err(|e| {
            RingKernelError::BackendError(format!("NVRTC compilation failed: {}", e))
        })?;

        // Every kernel fetched later via get_func() must be registered here.
        device
            .inner()
            .load_ptx(
                ptx,
                MODULE_NAME,
                &[
                    FN_EXCHANGE_HALOS,
                    FN_FDTD_ALL_TILES,
                    FN_READ_ALL_INTERIORS,
                    FN_INJECT_IMPULSE,
                    FN_APPLY_BOUNDARY,
                    FN_UPLOAD_TILE,
                ],
            )
            .map_err(|e| RingKernelError::BackendError(format!("Failed to load PTX: {}", e)))?;

        let packed_a = CudaBuffer::new(&device, total_buffer_size)?;
        let packed_b = CudaBuffer::new(&device, total_buffer_size)?;

        let zeros = vec![0u8; total_buffer_size];
        packed_a.copy_from_host(&zeros)?;
        packed_b.copy_from_host(&zeros)?;

        let halo_copy_list = Self::compute_halo_copies(tiles_x, tiles_y, tile_size, buffer_width);
        let num_halo_copies = halo_copy_list.len() as u32;

        tracing::info!(
            "Halo copy list: {} copies ({:.1} KB on GPU)",
            num_halo_copies,
            (num_halo_copies as usize * 2 * std::mem::size_of::<u32>()) as f64 / 1024.0
        );

        let halo_copy_bytes: Vec<u8> = halo_copy_list
            .iter()
            .flat_map(|(src, dst)| {
                let mut bytes = Vec::with_capacity(8);
                bytes.extend_from_slice(&src.to_ne_bytes());
                bytes.extend_from_slice(&dst.to_ne_bytes());
                bytes
            })
            .collect();

        let halo_copies = CudaBuffer::new(&device, halo_copy_bytes.len())?;
        halo_copies.copy_from_host(&halo_copy_bytes)?;

        let output_size = (grid_width * grid_height) as usize * std::mem::size_of::<f32>();
        let output_buffer = CudaBuffer::new(&device, output_size)?;

        let staging_size = tile_buffer_size as usize * std::mem::size_of::<f32>();
        let staging_buffer = CudaBuffer::new(&device, staging_size)?;

        Ok(Self {
            device,
            tiles_x,
            tiles_y,
            tile_size,
            buffer_width,
            grid_width,
            grid_height,
            packed_a,
            packed_b,
            current_is_a: true,
            halo_copies,
            num_halo_copies,
            output_buffer,
            staging_buffer,
            // Default parameters; callers can override via set_params().
            c2: 0.25,
            damping: 0.99,
        })
    }

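    /// Builds the flat list of (src, dst) float-index pairs consumed by the
    /// halo-exchange kernel. For every pair of adjacent tiles, each cell on a
    /// shared interior edge is copied into the neighbouring tile's halo ring.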
    fn compute_halo_copies(
        tiles_x: u32,
        tiles_y: u32,
        tile_size: u32,
        buffer_width: u32,
    ) -> Vec<(u32, u32)> {
        let tile_buffer_size = buffer_width * buffer_width;
        let mut copies = Vec::new();

        for ty in 0..tiles_y {
            for tx in 0..tiles_x {
                let tile_offset = (ty * tiles_x + tx) * tile_buffer_size;

                // North neighbour: top interior row -> neighbour's bottom halo row.
                if ty > 0 {
                    let north_tile_offset = ((ty - 1) * tiles_x + tx) * tile_buffer_size;
                    for x in 0..tile_size {
                        let src = tile_offset + buffer_width + (x + 1);
                        let dst = north_tile_offset + (buffer_width - 1) * buffer_width + (x + 1);
                        copies.push((src, dst));
                    }
                }

                // South neighbour: bottom interior row -> neighbour's top halo row.
                if ty < tiles_y - 1 {
                    let south_tile_offset = ((ty + 1) * tiles_x + tx) * tile_buffer_size;
                    for x in 0..tile_size {
                        let src = tile_offset + tile_size * buffer_width + (x + 1);
                        let dst = south_tile_offset + (x + 1);
                        copies.push((src, dst));
                    }
                }

                // West neighbour: left interior column -> neighbour's right halo column.
                if tx > 0 {
                    let west_tile_offset = (ty * tiles_x + (tx - 1)) * tile_buffer_size;
                    for y in 0..tile_size {
                        let src = tile_offset + (y + 1) * buffer_width + 1;
                        let dst = west_tile_offset + (y + 1) * buffer_width + (buffer_width - 1);
                        copies.push((src, dst));
                    }
                }

                // East neighbour: right interior column -> neighbour's left halo column.
                if tx < tiles_x - 1 {
                    let east_tile_offset = (ty * tiles_x + (tx + 1)) * tile_buffer_size;
                    for y in 0..tile_size {
                        let src = tile_offset + (y + 1) * buffer_width + tile_size;
                        let dst = east_tile_offset + (y + 1) * buffer_width;
                        copies.push((src, dst));
                    }
                }
            }
        }

        copies
    }

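    /// Sets the simulation parameters passed to the FDTD update kernel.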
    pub fn set_params(&mut self, c2: f32, damping: f32) {
        self.c2 = c2;
        self.damping = damping;
    }

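    /// Uploads a single tile's pressure data into the current packed buffer
    /// via the staging buffer and the `upload_tile_data` kernel.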
    pub fn upload_tile(&self, tile_x: u32, tile_y: u32, pressure: &[f32]) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        let pressure_bytes: &[u8] = bytemuck::cast_slice(pressure);
        self.staging_buffer.copy_from_host(pressure_bytes)?;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_UPLOAD_TILE)
            .ok_or_else(|| RingKernelError::BackendError("upload_tile_data not found".into()))?;

        let packed_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (self.buffer_width, self.buffer_width, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    packed_ptr,
                    self.staging_buffer.device_ptr(),
                    tile_x as i32,
                    tile_y as i32,
                    self.tiles_x as i32,
                    self.buffer_width as i32,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Upload failed: {}", e)))?;

        Ok(())
    }

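    /// Uploads multiple tiles, then mirrors the current packed buffer into the
    /// other buffer so that "current" and "previous" start out identical.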
    pub fn upload_all_tiles(&self, tiles: &[(u32, u32, Vec<f32>)]) -> Result<()> {
        for (tx, ty, pressure) in tiles {
            self.upload_tile(*tx, *ty, pressure)?;
        }

        let size = self.packed_a.size();
        let mut data = vec![0u8; size];
        if self.current_is_a {
            self.packed_a.copy_to_host(&mut data)?;
            self.packed_b.copy_from_host(&data)?;
        } else {
            self.packed_b.copy_to_host(&mut data)?;
            self.packed_a.copy_from_host(&data)?;
        }
        Ok(())
    }

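    /// Advances the simulation by one time step: exchanges halos between
    /// adjacent tiles, applies the outer boundary conditions, runs the FDTD
    /// update over all tiles, then flips the current/previous buffer roles.
    /// Launches are asynchronous; call [`Self::synchronize`] to wait for them.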
    pub fn step(&mut self) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        let (curr_ptr, prev_ptr) = if self.current_is_a {
            (self.packed_a.device_ptr(), self.packed_b.device_ptr())
        } else {
            (self.packed_b.device_ptr(), self.packed_a.device_ptr())
        };

        if self.num_halo_copies > 0 {
            let exchange_kernel = self
                .device
                .inner()
                .get_func(MODULE_NAME, FN_EXCHANGE_HALOS)
                .ok_or_else(|| {
                    RingKernelError::BackendError("exchange_all_halos not found".into())
                })?;

            let threads_per_block = 256u32;
            let num_blocks = self.num_halo_copies.div_ceil(threads_per_block);

            let cfg = cudarc::driver::LaunchConfig {
                grid_dim: (num_blocks, 1, 1),
                block_dim: (threads_per_block, 1, 1),
                shared_mem_bytes: 0,
            };

            unsafe {
                exchange_kernel.launch(
                    cfg,
                    (
                        curr_ptr,
                        self.halo_copies.device_ptr(),
                        self.num_halo_copies as i32,
                    ),
                )
            }
            .map_err(|e| RingKernelError::BackendError(format!("Halo exchange failed: {}", e)))?;
        }

        let boundary_kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_APPLY_BOUNDARY)
            .ok_or_else(|| {
                RingKernelError::BackendError("apply_boundary_conditions not found".into())
            })?;

        // Enough threads to cover the longest grid edge, capped at the CUDA
        // limit of 1024 threads per block.
        let max_edge_threads = std::cmp::max(self.tiles_x, self.tiles_y) * self.tile_size;

        let boundary_cfg = cudarc::driver::LaunchConfig {
            grid_dim: (4, 1, 1), // one block per outer edge
            block_dim: (max_edge_threads.min(1024), 1, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            boundary_kernel.launch(
                boundary_cfg,
                (
                    curr_ptr,
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    0.95f32, // boundary coefficient
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Boundary failed: {}", e)))?;

        let fdtd_kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_FDTD_ALL_TILES)
            .ok_or_else(|| RingKernelError::BackendError("fdtd_all_tiles not found".into()))?;

        let fdtd_cfg = cudarc::driver::LaunchConfig {
            grid_dim: (self.tiles_x, self.tiles_y, 1),
            block_dim: (self.tile_size, self.tile_size, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            fdtd_kernel.launch(
                fdtd_cfg,
                (
                    curr_ptr,
                    prev_ptr,
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    self.c2,
                    self.damping,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("FDTD failed: {}", e)))?;

        // Flip buffer roles for the next step.
        self.current_is_a = !self.current_is_a;

        Ok(())
    }

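    /// Runs `count` steps back to back and synchronizes once at the end.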
    pub fn step_batch(&mut self, count: u32) -> Result<()> {
        for _ in 0..count {
            self.step()?;
        }
        self.device.synchronize()?;
        Ok(())
    }

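    /// Injects an impulse of the given amplitude at grid position (x, y),
    /// mapping it to the owning tile's local coordinates. Out-of-range
    /// positions are ignored.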
    pub fn inject_impulse(&self, x: u32, y: u32, amplitude: f32) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        if x >= self.grid_width || y >= self.grid_height {
            return Ok(());
        }

        let tile_x = x / self.tile_size;
        let tile_y = y / self.tile_size;
        let local_x = x % self.tile_size;
        let local_y = y % self.tile_size;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_INJECT_IMPULSE)
            .ok_or_else(|| RingKernelError::BackendError("inject_impulse not found".into()))?;

        let curr_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (1, 1, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    curr_ptr,
                    tile_x as i32,
                    tile_y as i32,
                    local_x as i32,
                    local_y as i32,
                    self.tiles_x as i32,
                    self.buffer_width as i32,
                    amplitude,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Impulse failed: {}", e)))?;

        Ok(())
    }

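    /// Gathers the interior of every tile into a dense `grid_width * grid_height`
    /// pressure field and copies it back to the host.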
    pub fn read_pressure_grid(&self) -> Result<Vec<f32>> {
        use cudarc::driver::LaunchAsync;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_READ_ALL_INTERIORS)
            .ok_or_else(|| RingKernelError::BackendError("read_all_interiors not found".into()))?;

        let curr_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let threads = 16u32;
        let blocks_x = self.grid_width.div_ceil(threads);
        let blocks_y = self.grid_height.div_ceil(threads);

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (blocks_x, blocks_y, 1),
            block_dim: (threads, threads, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    curr_ptr,
                    self.output_buffer.device_ptr(),
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    self.grid_width as i32,
                    self.grid_height as i32,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Read failed: {}", e)))?;

        self.device.synchronize()?;

        let output_size =
            (self.grid_width * self.grid_height) as usize * std::mem::size_of::<f32>();
        let mut output_bytes = vec![0u8; output_size];
        self.output_buffer.copy_to_host(&mut output_bytes)?;

        Ok(bytemuck::cast_slice(&output_bytes).to_vec())
    }

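    /// Blocks until all queued GPU work has completed.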
    pub fn synchronize(&self) -> Result<()> {
        self.device.synchronize()
    }

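    /// Returns the logical grid dimensions (width, height) in cells.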
    pub fn grid_size(&self) -> (u32, u32) {
        (self.grid_width, self.grid_height)
    }

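    /// Returns the tile grid dimensions (tiles_x, tiles_y).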
    pub fn tile_grid_size(&self) -> (u32, u32) {
        (self.tiles_x, self.tiles_y)
    }

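    /// Returns the total number of tiles.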
    pub fn tile_count(&self) -> u32 {
        self.tiles_x * self.tiles_y
    }

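    /// Returns the number of per-cell halo copies performed each step.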
    pub fn halo_copy_count(&self) -> u32 {
        self.num_halo_copies
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[ignore] // requires a CUDA-capable GPU
    fn test_packed_backend_creation() {
        let backend = CudaPackedBackend::new(64, 64, 16).unwrap();
        assert_eq!(backend.tiles_x, 4);
        assert_eq!(backend.tiles_y, 4);
        assert_eq!(backend.tile_count(), 16);
    }

    #[test]
    #[ignore] // requires a CUDA-capable GPU
    fn test_packed_backend_step() {
        let mut backend = CudaPackedBackend::new(32, 32, 16).unwrap();
        backend.set_params(0.25, 0.99);

        backend.inject_impulse(16, 16, 1.0).unwrap();

        for _ in 0..10 {
            backend.step().unwrap();
        }

        let pressure = backend.read_pressure_grid().unwrap();
        assert_eq!(pressure.len(), 32 * 32);

        let center = pressure[16 * 32 + 16];
        assert!(center.abs() < 1.0, "Wave should have propagated");
    }

    #[test]
    fn test_halo_copy_computation() {
        let copies = CudaPackedBackend::compute_halo_copies(2, 2, 16, 18);

        // 2x2 tile grid: each of the 4 tiles has 2 neighbours, and each shared
        // edge contributes tile_size = 16 copies, so 4 * 2 * 16 = 128.
        assert_eq!(copies.len(), 128);
    }
}