use ringkernel_core::error::{Result, RingKernelError};
use ringkernel_core::memory::GpuBuffer;
use ringkernel_cuda::{CudaBuffer, CudaDevice};

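// Kernel source selection: produced by the kernel code generator when the
// `cuda-codegen` feature is enabled, otherwise read from the bundled .cu file.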
#[cfg(feature = "cuda-codegen")]
fn get_cuda_packed_source() -> String {
    super::kernels::generate_packed_kernels()
}

#[cfg(not(feature = "cuda-codegen"))]
fn get_cuda_packed_source() -> String {
    include_str!("../shaders/fdtd_packed.cu").to_string()
}

const MODULE_NAME: &str = "fdtd_packed";

const FN_EXCHANGE_HALOS: &str = "exchange_all_halos";
const FN_FDTD_ALL_TILES: &str = "fdtd_all_tiles";
const FN_READ_ALL_INTERIORS: &str = "read_all_interiors";
const FN_INJECT_IMPULSE: &str = "inject_impulse";
const FN_APPLY_BOUNDARY: &str = "apply_boundary_conditions";
const FN_UPLOAD_TILE: &str = "upload_tile_data";

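/// CUDA FDTD backend that packs every tile (interior plus a one-cell halo
/// ring) into one contiguous device buffer, so a time step is a fixed number
/// of kernel launches over all tiles at once rather than one launch per tile.
///
/// Typical usage (requires a CUDA device; grid and tile sizes are illustrative):
///
/// ```ignore
/// let mut backend = CudaPackedBackend::new(256, 256, 16)?;
/// backend.set_params(0.25, 0.99);
/// backend.inject_impulse(128, 128, 1.0)?;
/// backend.step_batch(100)?;
/// let pressure = backend.read_pressure_grid()?;
/// ```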
pub struct CudaPackedBackend {
    device: CudaDevice,

    /// Tile grid dimensions.
    tiles_x: u32,
    tiles_y: u32,

    /// Interior tile edge length in cells.
    tile_size: u32,
    /// Padded tile edge length: `tile_size + 2` (one halo cell per side).
    buffer_width: u32,

    /// Logical simulation grid dimensions in cells.
    grid_width: u32,
    grid_height: u32,

    /// Double-buffered packed state: every tile's padded buffer laid out contiguously.
    packed_a: CudaBuffer,
    packed_b: CudaBuffer,

    /// Which packed buffer holds the current state.
    current_is_a: bool,

    /// Precomputed (src, dst) float-index pairs consumed by the halo-exchange kernel.
    halo_copies: CudaBuffer,
    num_halo_copies: u32,

    /// Dense `grid_width * grid_height` buffer for readback.
    output_buffer: CudaBuffer,

    /// Scratch buffer used when uploading a single tile.
    staging_buffer: CudaBuffer,

    /// FDTD update parameters.
    c2: f32,
    damping: f32,
}

impl std::fmt::Debug for CudaPackedBackend {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("CudaPackedBackend")
            .field("tiles_x", &self.tiles_x)
            .field("tiles_y", &self.tiles_y)
            .field("tile_size", &self.tile_size)
            .field("grid_width", &self.grid_width)
            .field("grid_height", &self.grid_height)
            .field("num_halo_copies", &self.num_halo_copies)
            .field("c2", &self.c2)
            .field("damping", &self.damping)
            .finish_non_exhaustive()
    }
}

impl CudaPackedBackend {
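    /// Creates a backend on the default CUDA device (ordinal 0).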
    pub fn new(grid_width: u32, grid_height: u32, tile_size: u32) -> Result<Self> {
        Self::with_device(0, grid_width, grid_height, tile_size)
    }

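    /// Creates a backend on the CUDA device with the given ordinal: compiles
    /// the packed kernels with NVRTC, allocates the double-buffered packed
    /// state, and uploads the precomputed halo-copy list.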
    pub fn with_device(
        ordinal: usize,
        grid_width: u32,
        grid_height: u32,
        tile_size: u32,
    ) -> Result<Self> {
        let device = CudaDevice::new(ordinal)?;

        tracing::info!(
            "CUDA packed backend: {} (CC {}.{})",
            device.name(),
            device.compute_capability().0,
            device.compute_capability().1
        );

        let tiles_x = grid_width.div_ceil(tile_size);
        let tiles_y = grid_height.div_ceil(tile_size);
        let buffer_width = tile_size + 2;
        let tile_buffer_size = buffer_width * buffer_width;
        let num_tiles = tiles_x * tiles_y;
        let total_buffer_size =
            (num_tiles * tile_buffer_size) as usize * std::mem::size_of::<f32>();

        tracing::info!(
            "Packed layout: {}x{} tiles, {} total floats ({:.1} KB)",
            tiles_x,
            tiles_y,
            num_tiles * tile_buffer_size,
            total_buffer_size as f64 / 1024.0
        );

        let cuda_source = get_cuda_packed_source();
        let ptx = cudarc::nvrtc::compile_ptx(&cuda_source).map_err(|e| {
            RingKernelError::BackendError(format!("NVRTC compilation failed: {}", e))
        })?;

        // Every kernel fetched later via get_func() must be registered here.
        device
            .inner()
            .load_ptx(
                ptx,
                MODULE_NAME,
                &[
                    FN_EXCHANGE_HALOS,
                    FN_FDTD_ALL_TILES,
                    FN_READ_ALL_INTERIORS,
                    FN_INJECT_IMPULSE,
                    FN_APPLY_BOUNDARY,
                    FN_UPLOAD_TILE,
                ],
            )
            .map_err(|e| RingKernelError::BackendError(format!("Failed to load PTX: {}", e)))?;

        let packed_a = CudaBuffer::new(&device, total_buffer_size)?;
        let packed_b = CudaBuffer::new(&device, total_buffer_size)?;

        let zeros = vec![0u8; total_buffer_size];
        packed_a.copy_from_host(&zeros)?;
        packed_b.copy_from_host(&zeros)?;

        let halo_copy_list = Self::compute_halo_copies(tiles_x, tiles_y, tile_size, buffer_width);
        let num_halo_copies = halo_copy_list.len() as u32;

        tracing::info!(
            "Halo copy list: {} copies ({:.1} KB on GPU)",
            num_halo_copies,
            (num_halo_copies as usize * 2 * std::mem::size_of::<u32>()) as f64 / 1024.0
        );

        let halo_copy_bytes: Vec<u8> = halo_copy_list
            .iter()
            .flat_map(|(src, dst)| {
                let mut bytes = Vec::with_capacity(8);
                bytes.extend_from_slice(&src.to_ne_bytes());
                bytes.extend_from_slice(&dst.to_ne_bytes());
                bytes
            })
            .collect();

        let halo_copies = CudaBuffer::new(&device, halo_copy_bytes.len())?;
        halo_copies.copy_from_host(&halo_copy_bytes)?;

        let output_size = (grid_width * grid_height) as usize * std::mem::size_of::<f32>();
        let output_buffer = CudaBuffer::new(&device, output_size)?;

        let staging_size = tile_buffer_size as usize * std::mem::size_of::<f32>();
        let staging_buffer = CudaBuffer::new(&device, staging_size)?;

        Ok(Self {
            device,
            tiles_x,
            tiles_y,
            tile_size,
            buffer_width,
            grid_width,
            grid_height,
            packed_a,
            packed_b,
            current_is_a: true,
            halo_copies,
            num_halo_copies,
            output_buffer,
            staging_buffer,
            // Default parameters; callers can override via set_params().
            c2: 0.25,
            damping: 0.99,
        })
    }

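    /// Builds the flat list of (src, dst) float-index pairs consumed by the
    /// halo-exchange kernel. For every pair of adjacent tiles, each cell on a
    /// shared interior edge is copied into the neighbouring tile's halo ring.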
    fn compute_halo_copies(
        tiles_x: u32,
        tiles_y: u32,
        tile_size: u32,
        buffer_width: u32,
    ) -> Vec<(u32, u32)> {
        let tile_buffer_size = buffer_width * buffer_width;
        let mut copies = Vec::new();

        for ty in 0..tiles_y {
            for tx in 0..tiles_x {
                let tile_offset = (ty * tiles_x + tx) * tile_buffer_size;

                // North neighbour: top interior row -> neighbour's bottom halo row.
                if ty > 0 {
                    let north_tile_offset = ((ty - 1) * tiles_x + tx) * tile_buffer_size;
                    for x in 0..tile_size {
                        let src = tile_offset + buffer_width + (x + 1);
                        let dst = north_tile_offset + (buffer_width - 1) * buffer_width + (x + 1);
                        copies.push((src, dst));
                    }
                }

                // South neighbour: bottom interior row -> neighbour's top halo row.
                if ty < tiles_y - 1 {
                    let south_tile_offset = ((ty + 1) * tiles_x + tx) * tile_buffer_size;
                    for x in 0..tile_size {
                        let src = tile_offset + tile_size * buffer_width + (x + 1);
                        let dst = south_tile_offset + (x + 1);
                        copies.push((src, dst));
                    }
                }

                // West neighbour: left interior column -> neighbour's right halo column.
                if tx > 0 {
                    let west_tile_offset = (ty * tiles_x + (tx - 1)) * tile_buffer_size;
                    for y in 0..tile_size {
                        let src = tile_offset + (y + 1) * buffer_width + 1;
                        let dst = west_tile_offset + (y + 1) * buffer_width + (buffer_width - 1);
                        copies.push((src, dst));
                    }
                }

                // East neighbour: right interior column -> neighbour's left halo column.
                if tx < tiles_x - 1 {
                    let east_tile_offset = (ty * tiles_x + (tx + 1)) * tile_buffer_size;
                    for y in 0..tile_size {
                        let src = tile_offset + (y + 1) * buffer_width + tile_size;
                        let dst = east_tile_offset + (y + 1) * buffer_width;
                        copies.push((src, dst));
                    }
                }
            }
        }

        copies
    }

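    /// Sets the simulation parameters passed to the FDTD update kernel.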
    pub fn set_params(&mut self, c2: f32, damping: f32) {
        self.c2 = c2;
        self.damping = damping;
    }

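    /// Uploads a single tile's pressure data into the current packed buffer
    /// via the staging buffer and the `upload_tile_data` kernel.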
    pub fn upload_tile(&self, tile_x: u32, tile_y: u32, pressure: &[f32]) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        let pressure_bytes: &[u8] = bytemuck::cast_slice(pressure);
        self.staging_buffer.copy_from_host(pressure_bytes)?;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_UPLOAD_TILE)
            .ok_or_else(|| RingKernelError::BackendError("upload_tile_data not found".into()))?;

        let packed_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (self.buffer_width, self.buffer_width, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    packed_ptr,
                    self.staging_buffer.device_ptr(),
                    tile_x as i32,
                    tile_y as i32,
                    self.tiles_x as i32,
                    self.buffer_width as i32,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Upload failed: {}", e)))?;

        Ok(())
    }

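    /// Uploads multiple tiles, then mirrors the current packed buffer into the
    /// other buffer so that "current" and "previous" start out identical.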
    pub fn upload_all_tiles(&self, tiles: &[(u32, u32, Vec<f32>)]) -> Result<()> {
        for (tx, ty, pressure) in tiles {
            self.upload_tile(*tx, *ty, pressure)?;
        }

        let size = self.packed_a.size();
        let mut data = vec![0u8; size];
        if self.current_is_a {
            self.packed_a.copy_to_host(&mut data)?;
            self.packed_b.copy_from_host(&data)?;
        } else {
            self.packed_b.copy_to_host(&mut data)?;
            self.packed_a.copy_from_host(&data)?;
        }
        Ok(())
    }

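    /// Advances the simulation by one time step: exchanges halos between
    /// adjacent tiles, applies the outer boundary conditions, runs the FDTD
    /// update over all tiles, then flips the current/previous buffer roles.
    /// Launches are asynchronous; call [`Self::synchronize`] to wait for them.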
    pub fn step(&mut self) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        let (curr_ptr, prev_ptr) = if self.current_is_a {
            (self.packed_a.device_ptr(), self.packed_b.device_ptr())
        } else {
            (self.packed_b.device_ptr(), self.packed_a.device_ptr())
        };

        if self.num_halo_copies > 0 {
            let exchange_kernel = self
                .device
                .inner()
                .get_func(MODULE_NAME, FN_EXCHANGE_HALOS)
                .ok_or_else(|| {
                    RingKernelError::BackendError("exchange_all_halos not found".into())
                })?;

            let threads_per_block = 256u32;
            let num_blocks = self.num_halo_copies.div_ceil(threads_per_block);

            let cfg = cudarc::driver::LaunchConfig {
                grid_dim: (num_blocks, 1, 1),
                block_dim: (threads_per_block, 1, 1),
                shared_mem_bytes: 0,
            };

            unsafe {
                exchange_kernel.launch(
                    cfg,
                    (
                        curr_ptr,
                        self.halo_copies.device_ptr(),
                        self.num_halo_copies as i32,
                    ),
                )
            }
            .map_err(|e| RingKernelError::BackendError(format!("Halo exchange failed: {}", e)))?;
        }

        let boundary_kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_APPLY_BOUNDARY)
            .ok_or_else(|| {
                RingKernelError::BackendError("apply_boundary_conditions not found".into())
            })?;

        // Enough threads to cover the longest grid edge, capped at the CUDA
        // limit of 1024 threads per block.
        let max_edge_threads = std::cmp::max(self.tiles_x, self.tiles_y) * self.tile_size;

        let boundary_cfg = cudarc::driver::LaunchConfig {
            grid_dim: (4, 1, 1), // one block per outer edge
            block_dim: (max_edge_threads.min(1024), 1, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            boundary_kernel.launch(
                boundary_cfg,
                (
                    curr_ptr,
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    0.95f32, // boundary coefficient
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Boundary failed: {}", e)))?;

        let fdtd_kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_FDTD_ALL_TILES)
            .ok_or_else(|| RingKernelError::BackendError("fdtd_all_tiles not found".into()))?;

        let fdtd_cfg = cudarc::driver::LaunchConfig {
            grid_dim: (self.tiles_x, self.tiles_y, 1),
            block_dim: (self.tile_size, self.tile_size, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            fdtd_kernel.launch(
                fdtd_cfg,
                (
                    curr_ptr,
                    prev_ptr,
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    self.c2,
                    self.damping,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("FDTD failed: {}", e)))?;

        // Flip buffer roles for the next step.
        self.current_is_a = !self.current_is_a;

        Ok(())
    }

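    /// Runs `count` steps back to back and synchronizes once at the end.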
    pub fn step_batch(&mut self, count: u32) -> Result<()> {
        for _ in 0..count {
            self.step()?;
        }
        self.device.synchronize()?;
        Ok(())
    }

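    /// Injects an impulse of the given amplitude at grid position (x, y),
    /// mapping it to the owning tile's local coordinates. Out-of-range
    /// positions are ignored.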
    pub fn inject_impulse(&self, x: u32, y: u32, amplitude: f32) -> Result<()> {
        use cudarc::driver::LaunchAsync;

        if x >= self.grid_width || y >= self.grid_height {
            return Ok(());
        }

        let tile_x = x / self.tile_size;
        let tile_y = y / self.tile_size;
        let local_x = x % self.tile_size;
        let local_y = y % self.tile_size;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_INJECT_IMPULSE)
            .ok_or_else(|| RingKernelError::BackendError("inject_impulse not found".into()))?;

        let curr_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (1, 1, 1),
            block_dim: (1, 1, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    curr_ptr,
                    tile_x as i32,
                    tile_y as i32,
                    local_x as i32,
                    local_y as i32,
                    self.tiles_x as i32,
                    self.buffer_width as i32,
                    amplitude,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Impulse failed: {}", e)))?;

        Ok(())
    }

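    /// Gathers the interior of every tile into a dense `grid_width * grid_height`
    /// pressure field and copies it back to the host.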
    pub fn read_pressure_grid(&self) -> Result<Vec<f32>> {
        use cudarc::driver::LaunchAsync;

        let kernel = self
            .device
            .inner()
            .get_func(MODULE_NAME, FN_READ_ALL_INTERIORS)
            .ok_or_else(|| RingKernelError::BackendError("read_all_interiors not found".into()))?;

        let curr_ptr = if self.current_is_a {
            self.packed_a.device_ptr()
        } else {
            self.packed_b.device_ptr()
        };

        let threads = 16u32;
        let blocks_x = self.grid_width.div_ceil(threads);
        let blocks_y = self.grid_height.div_ceil(threads);

        let cfg = cudarc::driver::LaunchConfig {
            grid_dim: (blocks_x, blocks_y, 1),
            block_dim: (threads, threads, 1),
            shared_mem_bytes: 0,
        };

        unsafe {
            kernel.launch(
                cfg,
                (
                    curr_ptr,
                    self.output_buffer.device_ptr(),
                    self.tiles_x as i32,
                    self.tiles_y as i32,
                    self.tile_size as i32,
                    self.buffer_width as i32,
                    self.grid_width as i32,
                    self.grid_height as i32,
                ),
            )
        }
        .map_err(|e| RingKernelError::BackendError(format!("Read failed: {}", e)))?;

        self.device.synchronize()?;

        let output_size =
            (self.grid_width * self.grid_height) as usize * std::mem::size_of::<f32>();
        let mut output_bytes = vec![0u8; output_size];
        self.output_buffer.copy_to_host(&mut output_bytes)?;

        Ok(bytemuck::cast_slice(&output_bytes).to_vec())
    }

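    /// Blocks until all queued GPU work has completed.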
    pub fn synchronize(&self) -> Result<()> {
        self.device.synchronize()
    }

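    /// Returns the logical grid dimensions (width, height) in cells.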
    pub fn grid_size(&self) -> (u32, u32) {
        (self.grid_width, self.grid_height)
    }

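    /// Returns the tile grid dimensions (tiles_x, tiles_y).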
    pub fn tile_grid_size(&self) -> (u32, u32) {
        (self.tiles_x, self.tiles_y)
    }

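    /// Returns the total number of tiles.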
    pub fn tile_count(&self) -> u32 {
        self.tiles_x * self.tiles_y
    }

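    /// Returns the number of per-cell halo copies performed each step.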
    pub fn halo_copy_count(&self) -> u32 {
        self.num_halo_copies
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[ignore] // requires a CUDA-capable GPU
    fn test_packed_backend_creation() {
        let backend = CudaPackedBackend::new(64, 64, 16).unwrap();
        assert_eq!(backend.tiles_x, 4);
        assert_eq!(backend.tiles_y, 4);
        assert_eq!(backend.tile_count(), 16);
    }

    #[test]
    #[ignore] // requires a CUDA-capable GPU
    fn test_packed_backend_step() {
        let mut backend = CudaPackedBackend::new(32, 32, 16).unwrap();
        backend.set_params(0.25, 0.99);

        backend.inject_impulse(16, 16, 1.0).unwrap();

        for _ in 0..10 {
            backend.step().unwrap();
        }

        let pressure = backend.read_pressure_grid().unwrap();
        assert_eq!(pressure.len(), 32 * 32);

        let center = pressure[16 * 32 + 16];
        assert!(center.abs() < 1.0, "Wave should have propagated");
    }

    #[test]
    fn test_halo_copy_computation() {
        let copies = CudaPackedBackend::compute_halo_copies(2, 2, 16, 18);

        // 2x2 tile grid: each of the 4 tiles has 2 neighbours, and each shared
        // edge contributes tile_size = 16 copies, so 4 * 2 * 16 = 128.
        assert_eq!(copies.len(), 128);
    }
}