ringkernel_wavesim/simulation/
kernels.rs

1//! CUDA kernel definitions using the Rust DSL.
2//!
3//! This module contains all CUDA kernels for the wave simulation, defined in
4//! a Rust DSL that gets transpiled to CUDA C at compile time.
5//!
6//! The generated CUDA code is designed to match the handwritten versions in
7//! `shaders/fdtd_tile.cu` and `shaders/fdtd_packed.cu` exactly.
8
9#[cfg(feature = "cuda-codegen")]
10use ringkernel_cuda_codegen::{
11    transpile_global_kernel, transpile_stencil_kernel, Grid, StencilConfig,
12};
13
14// ============================================================================
15// Tile-Based Kernels (fdtd_tile.cu equivalent)
16// ============================================================================
17
18/// Generate the complete CUDA source for tile-based kernels.
19///
20/// This generates CUDA code equivalent to `shaders/fdtd_tile.cu`:
21/// - `fdtd_tile_step`: Main FDTD wave equation kernel
22/// - `extract_halo`: Extract halo from interior edge
23/// - `inject_halo`: Inject halo to boundary region
24/// - `read_interior`: Read interior cells to output buffer
25/// - `apply_boundary_reflection`: Apply boundary conditions
26#[cfg(feature = "cuda-codegen")]
27pub fn generate_tile_kernels() -> String {
28    let mut output = String::new();
29
30    output.push_str(TILE_KERNELS_HEADER);
31    output.push_str("\nextern \"C\" {\n\n");
32
33    // Generate fdtd_tile_step kernel
34    output.push_str(&generate_fdtd_tile_step());
35    output.push('\n');
36
37    // Generate extract_halo kernel
38    output.push_str(&generate_extract_halo());
39    output.push('\n');
40
41    // Generate inject_halo kernel
42    output.push_str(&generate_inject_halo());
43    output.push('\n');
44
45    // Generate read_interior kernel
46    output.push_str(&generate_read_interior());
47    output.push('\n');
48
49    // Generate apply_boundary_reflection kernel
50    output.push_str(&generate_apply_boundary_reflection());
51
52    output.push_str("\n}  // extern \"C\"\n");
53
54    output
55}
56
57/// Header comment for generated tile kernels.
58pub const TILE_KERNELS_HEADER: &str = r#"// CUDA Kernels for Tile-Based FDTD Wave Simulation
59// Generated by ringkernel-cuda-codegen from Rust DSL
60//
61// Buffer Layout (18x18 = 324 floats):
62//   +---+----------------+---+
63//   | NW|   North Halo   |NE |  <- Row 0
64//   +---+----------------+---+
65//   |   |   16x16 Tile   |   |  <- Rows 1-16
66//   | W |    Interior    | E |
67//   +---+----------------+---+
68//   | SW|   South Halo   |SE |  <- Row 17
69//   +---+----------------+---+
70//
71// Index: idx = y * 18 + x
72// Interior cell (lx, ly): idx = (ly + 1) * 18 + (lx + 1)
73"#;
74
75/// Generate the main FDTD tile step kernel.
76#[cfg(feature = "cuda-codegen")]
77fn generate_fdtd_tile_step() -> String {
78    use syn::parse_quote;
79
80    let kernel_fn: syn::ItemFn = parse_quote! {
81        fn fdtd_tile_step(
82            pressure: &[f32],
83            pressure_prev: &mut [f32],
84            c2: f32,
85            damping: f32,
86            pos: GridPos,
87        ) {
88            let p = pressure[pos.idx()];
89            let p_prev = pressure_prev[pos.idx()];
90
91            let p_n = pos.north(pressure);
92            let p_s = pos.south(pressure);
93            let p_w = pos.west(pressure);
94            let p_e = pos.east(pressure);
95
96            let laplacian = p_n + p_s + p_e + p_w - 4.0 * p;
97            let p_new = 2.0 * p - p_prev + c2 * laplacian;
98
99            pressure_prev[pos.idx()] = p_new * damping;
100        }
101    };
102
103    let config = StencilConfig::new("fdtd_tile_step")
104        .with_grid(Grid::Grid2D)
105        .with_tile_size(16, 16)
106        .with_halo(1);
107
108    match transpile_stencil_kernel(&kernel_fn, &config) {
109        Ok(cuda) => cuda,
110        Err(e) => format!("// Transpilation error: {}\n", e),
111    }
112}
113
114/// Generate the extract_halo kernel.
115#[cfg(feature = "cuda-codegen")]
116fn generate_extract_halo() -> String {
117    use syn::parse_quote;
118
119    let kernel_fn: syn::ItemFn = parse_quote! {
120        fn extract_halo(
121            pressure: &[f32],
122            halo_out: &mut [f32],
123            edge: i32,
124        ) {
125            let i = thread_idx_x();
126            if i >= 16 {
127                return;
128            }
129
130            let idx = match edge {
131                0 => 1 * 18 + (i + 1),       // North - extract row 1
132                1 => 16 * 18 + (i + 1),      // South - extract row 16
133                2 => (i + 1) * 18 + 1,       // West - extract col 1
134                _ => (i + 1) * 18 + 16,      // East - extract col 16
135            };
136
137            halo_out[i as usize] = pressure[idx as usize];
138        }
139    };
140
141    match transpile_global_kernel(&kernel_fn) {
142        Ok(cuda) => format!("// Extract Halo - extracts halo data from interior edge\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
143        Err(e) => format!("// Transpilation error: {}\n", e),
144    }
145}
146
147/// Generate the inject_halo kernel.
148#[cfg(feature = "cuda-codegen")]
149fn generate_inject_halo() -> String {
150    use syn::parse_quote;
151
152    let kernel_fn: syn::ItemFn = parse_quote! {
153        fn inject_halo(
154            pressure: &mut [f32],
155            halo_in: &[f32],
156            edge: i32,
157        ) {
158            let i = thread_idx_x();
159            if i >= 16 {
160                return;
161            }
162
163            let idx = match edge {
164                0 => 0 * 18 + (i + 1),       // North - inject to row 0
165                1 => 17 * 18 + (i + 1),      // South - inject to row 17
166                2 => (i + 1) * 18 + 0,       // West - inject to col 0
167                _ => (i + 1) * 18 + 17,      // East - inject to col 17
168            };
169
170            pressure[idx as usize] = halo_in[i as usize];
171        }
172    };
173
174    match transpile_global_kernel(&kernel_fn) {
175        Ok(cuda) => format!("// Inject Halo - injects halo data from linear buffer to halo region\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
176        Err(e) => format!("// Transpilation error: {}\n", e),
177    }
178}
179
180/// Generate the read_interior kernel.
181#[cfg(feature = "cuda-codegen")]
182fn generate_read_interior() -> String {
183    use syn::parse_quote;
184
185    let kernel_fn: syn::ItemFn = parse_quote! {
186        fn read_interior(
187            pressure: &[f32],
188            output: &mut [f32],
189        ) {
190            let lx = thread_idx_x();
191            let ly = thread_idx_y();
192
193            if lx >= 16 || ly >= 16 {
194                return;
195            }
196
197            let src_idx = (ly + 1) * 18 + (lx + 1);
198            let dst_idx = ly * 16 + lx;
199
200            output[dst_idx as usize] = pressure[src_idx as usize];
201        }
202    };
203
204    match transpile_global_kernel(&kernel_fn) {
205        Ok(cuda) => format!(
206            "// Read Interior - reads interior pressure to linear buffer for visualization\n{}",
207            cuda
208        ),
209        Err(e) => format!("// Transpilation error: {}\n", e),
210    }
211}
212
213/// Generate the apply_boundary_reflection kernel.
214#[cfg(feature = "cuda-codegen")]
215fn generate_apply_boundary_reflection() -> String {
216    use syn::parse_quote;
217
218    let kernel_fn: syn::ItemFn = parse_quote! {
219        fn apply_boundary_reflection(
220            pressure: &mut [f32],
221            edge: i32,
222            reflection_coeff: f32,
223        ) {
224            let i = thread_idx_x();
225            if i >= 16 {
226                return;
227            }
228
229            // Calculate source and destination indices based on edge
230            let src_idx = match edge {
231                0 => 1 * 18 + (i + 1),       // North - reflect row 1
232                1 => 16 * 18 + (i + 1),      // South - reflect row 16
233                2 => (i + 1) * 18 + 1,       // West - reflect col 1
234                _ => (i + 1) * 18 + 16,      // East - reflect col 16
235            };
236
237            let dst_idx = match edge {
238                0 => 0 * 18 + (i + 1),       // North - to row 0
239                1 => 17 * 18 + (i + 1),      // South - to row 17
240                2 => (i + 1) * 18 + 0,       // West - to col 0
241                _ => (i + 1) * 18 + 17,      // East - to col 17
242            };
243
244            pressure[dst_idx as usize] = pressure[src_idx as usize] * reflection_coeff;
245        }
246    };
247
248    match transpile_global_kernel(&kernel_fn) {
249        Ok(cuda) => format!("// Apply Boundary Reflection - applies boundary conditions for tiles at grid edges\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
250        Err(e) => format!("// Transpilation error: {}\n", e),
251    }
252}
253
254// ============================================================================
255// Packed Tile Kernels (fdtd_packed.cu equivalent)
256// ============================================================================
257
258/// Generate the complete CUDA source for packed tile kernels.
259///
260/// This generates CUDA code equivalent to `shaders/fdtd_packed.cu`:
261/// - `exchange_all_halos`: Copy halos between adjacent tiles
262/// - `fdtd_all_tiles`: Batched FDTD for all tiles in parallel
263/// - `upload_tile_data`: Upload initial state to a tile
264/// - `read_all_interiors`: Read all tile interiors to output
265/// - `inject_impulse`: Add impulse to specific cell
266/// - `apply_boundary_conditions`: Apply boundary conditions to edge tiles
267#[cfg(feature = "cuda-codegen")]
268pub fn generate_packed_kernels() -> String {
269    let mut output = String::new();
270
271    output.push_str(PACKED_KERNELS_HEADER);
272    output.push_str("\nextern \"C\" {\n\n");
273
274    output.push_str(&generate_exchange_all_halos());
275    output.push('\n');
276
277    output.push_str(&generate_fdtd_all_tiles());
278    output.push('\n');
279
280    output.push_str(&generate_upload_tile_data());
281    output.push('\n');
282
283    output.push_str(&generate_read_all_interiors());
284    output.push('\n');
285
286    output.push_str(&generate_inject_impulse());
287    output.push('\n');
288
289    output.push_str(&generate_apply_boundary_conditions());
290
291    output.push_str("\n}  // extern \"C\"\n");
292
293    output
294}
295
296/// Header comment for generated packed kernels.
297pub const PACKED_KERNELS_HEADER: &str = r#"// CUDA Kernels for Packed Tile-Based FDTD Wave Simulation
298// Generated by ringkernel-cuda-codegen from Rust DSL
299//
300// All tiles packed contiguously: [Tile(0,0)][Tile(1,0)]...[Tile(n,m)]
301// Each tile is 18x18 floats (16x16 interior + 1-cell halo)
302//
303// Benefits:
304// - Zero host<->GPU transfers during simulation
305// - All tiles computed in parallel
306// - Halo exchange is just GPU memory copies
307"#;
308
309/// Generate exchange_all_halos kernel.
310#[cfg(feature = "cuda-codegen")]
311fn generate_exchange_all_halos() -> String {
312    use syn::parse_quote;
313
314    let kernel_fn: syn::ItemFn = parse_quote! {
315        fn exchange_all_halos(
316            packed_buffer: &mut [f32],
317            copies: &[u32],
318            num_copies: i32,
319        ) {
320            let idx = block_idx_x() * block_dim_x() + thread_idx_x();
321            if idx >= num_copies {
322                return;
323            }
324
325            let src_idx = copies[(idx * 2) as usize];
326            let dst_idx = copies[(idx * 2 + 1) as usize];
327
328            packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize];
329        }
330    };
331
332    match transpile_global_kernel(&kernel_fn) {
333        Ok(cuda) => format!(
334            "// Halo Exchange Kernel - copies all halo edges between adjacent tiles\n{}",
335            cuda
336        ),
337        Err(e) => format!("// Transpilation error: {}\n", e),
338    }
339}
340
341/// Generate fdtd_all_tiles kernel.
342#[cfg(feature = "cuda-codegen")]
343fn generate_fdtd_all_tiles() -> String {
344    use syn::parse_quote;
345
346    let kernel_fn: syn::ItemFn = parse_quote! {
347        fn fdtd_all_tiles(
348            packed_curr: &[f32],
349            packed_prev: &mut [f32],
350            tiles_x: i32,
351            tiles_y: i32,
352            tile_size: i32,
353            buffer_width: i32,
354            c2: f32,
355            damping: f32,
356        ) {
357            let tile_x = block_idx_x();
358            let tile_y = block_idx_y();
359            let lx = thread_idx_x();
360            let ly = thread_idx_y();
361
362            if tile_x >= tiles_x || tile_y >= tiles_y {
363                return;
364            }
365            if lx >= tile_size || ly >= tile_size {
366                return;
367            }
368
369            let tile_buffer_size = buffer_width * buffer_width;
370            let tile_idx = tile_y * tiles_x + tile_x;
371            let tile_offset = tile_idx * tile_buffer_size;
372
373            let idx = tile_offset + (ly + 1) * buffer_width + (lx + 1);
374
375            let p = packed_curr[idx as usize];
376            let p_prev_val = packed_prev[idx as usize];
377
378            let p_n = packed_curr[(idx - buffer_width) as usize];
379            let p_s = packed_curr[(idx + buffer_width) as usize];
380            let p_w = packed_curr[(idx - 1) as usize];
381            let p_e = packed_curr[(idx + 1) as usize];
382
383            let laplacian = p_n + p_s + p_e + p_w - 4.0 * p;
384            let p_new = 2.0 * p - p_prev_val + c2 * laplacian;
385
386            packed_prev[idx as usize] = p_new * damping;
387        }
388    };
389
390    match transpile_global_kernel(&kernel_fn) {
391        Ok(cuda) => format!(
392            "// Batched FDTD Kernel - computes FDTD for ALL tiles in single launch\n{}",
393            cuda
394        ),
395        Err(e) => format!("// Transpilation error: {}\n", e),
396    }
397}
398
399/// Generate upload_tile_data kernel.
400#[cfg(feature = "cuda-codegen")]
401fn generate_upload_tile_data() -> String {
402    use syn::parse_quote;
403
404    let kernel_fn: syn::ItemFn = parse_quote! {
405        fn upload_tile_data(
406            packed_buffer: &mut [f32],
407            staging: &[f32],
408            tile_x: i32,
409            tile_y: i32,
410            tiles_x: i32,
411            buffer_width: i32,
412        ) {
413            let lx = thread_idx_x();
414            let ly = thread_idx_y();
415
416            if lx >= buffer_width || ly >= buffer_width {
417                return;
418            }
419
420            let tile_buffer_size = buffer_width * buffer_width;
421            let tile_idx = tile_y * tiles_x + tile_x;
422            let tile_offset = tile_idx * tile_buffer_size;
423
424            let local_idx = ly * buffer_width + lx;
425            let global_idx = tile_offset + local_idx;
426
427            packed_buffer[global_idx as usize] = staging[local_idx as usize];
428        }
429    };
430
431    match transpile_global_kernel(&kernel_fn) {
432        Ok(cuda) => format!(
433            "// Upload Initial State - copies initial data to packed buffer\n{}",
434            cuda
435        ),
436        Err(e) => format!("// Transpilation error: {}\n", e),
437    }
438}
439
440/// Generate read_all_interiors kernel.
441#[cfg(feature = "cuda-codegen")]
442fn generate_read_all_interiors() -> String {
443    use syn::parse_quote;
444
445    let kernel_fn: syn::ItemFn = parse_quote! {
446        fn read_all_interiors(
447            packed_buffer: &[f32],
448            output: &mut [f32],
449            tiles_x: i32,
450            tiles_y: i32,
451            tile_size: i32,
452            buffer_width: i32,
453            grid_width: i32,
454            grid_height: i32,
455        ) {
456            let gx = block_idx_x() * block_dim_x() + thread_idx_x();
457            let gy = block_idx_y() * block_dim_y() + thread_idx_y();
458
459            if gx >= grid_width || gy >= grid_height {
460                return;
461            }
462
463            let tile_x = gx / tile_size;
464            let tile_y = gy / tile_size;
465
466            let lx = gx % tile_size;
467            let ly = gy % tile_size;
468
469            let tile_buffer_size = buffer_width * buffer_width;
470            let tile_idx = tile_y * tiles_x + tile_x;
471            let tile_offset = tile_idx * tile_buffer_size;
472            let src_idx = tile_offset + (ly + 1) * buffer_width + (lx + 1);
473
474            let dst_idx = gy * grid_width + gx;
475
476            output[dst_idx as usize] = packed_buffer[src_idx as usize];
477        }
478    };
479
480    match transpile_global_kernel(&kernel_fn) {
481        Ok(cuda) => format!(
482            "// Read All Interiors - extracts all tile interiors for visualization\n{}",
483            cuda
484        ),
485        Err(e) => format!("// Transpilation error: {}\n", e),
486    }
487}
488
489/// Generate inject_impulse kernel.
490#[cfg(feature = "cuda-codegen")]
491fn generate_inject_impulse() -> String {
492    use syn::parse_quote;
493
494    let kernel_fn: syn::ItemFn = parse_quote! {
495        fn inject_impulse(
496            packed_buffer: &mut [f32],
497            tile_x: i32,
498            tile_y: i32,
499            local_x: i32,
500            local_y: i32,
501            tiles_x: i32,
502            buffer_width: i32,
503            amplitude: f32,
504        ) {
505            let tile_buffer_size = buffer_width * buffer_width;
506            let tile_idx = tile_y * tiles_x + tile_x;
507            let tile_offset = tile_idx * tile_buffer_size;
508            let idx = tile_offset + (local_y + 1) * buffer_width + (local_x + 1);
509
510            packed_buffer[idx as usize] = packed_buffer[idx as usize] + amplitude;
511        }
512    };
513
514    match transpile_global_kernel(&kernel_fn) {
515        Ok(cuda) => format!("// Inject Impulse - adds energy to specific cell\n{}", cuda),
516        Err(e) => format!("// Transpilation error: {}\n", e),
517    }
518}
519
520/// Generate apply_boundary_conditions kernel.
521#[cfg(feature = "cuda-codegen")]
522fn generate_apply_boundary_conditions() -> String {
523    use syn::parse_quote;
524
525    let kernel_fn: syn::ItemFn = parse_quote! {
526        fn apply_boundary_conditions(
527            packed_buffer: &mut [f32],
528            tiles_x: i32,
529            tiles_y: i32,
530            tile_size: i32,
531            buffer_width: i32,
532            reflection_coeff: f32,
533        ) {
534            let edge = block_idx_x();
535            let idx = thread_idx_x();
536
537            let tile_buffer_size = buffer_width * buffer_width;
538
539            if edge == 0 {
540                // North boundary: tiles with tile_y == 0
541                let tile_x = idx / tile_size;
542                let cell_x = idx % tile_size;
543                if tile_x >= tiles_x {
544                    return;
545                }
546
547                let tile_idx = 0 * tiles_x + tile_x;
548                let tile_offset = tile_idx * tile_buffer_size;
549                let src_idx = tile_offset + 1 * buffer_width + (cell_x + 1);
550                let dst_idx = tile_offset + 0 * buffer_width + (cell_x + 1);
551                packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
552            } else if edge == 1 {
553                // South boundary: tiles with tile_y == tiles_y - 1
554                let tile_x = idx / tile_size;
555                let cell_x = idx % tile_size;
556                if tile_x >= tiles_x {
557                    return;
558                }
559
560                let tile_idx = (tiles_y - 1) * tiles_x + tile_x;
561                let tile_offset = tile_idx * tile_buffer_size;
562                let src_idx = tile_offset + tile_size * buffer_width + (cell_x + 1);
563                let dst_idx = tile_offset + (tile_size + 1) * buffer_width + (cell_x + 1);
564                packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
565            } else if edge == 2 {
566                // West boundary: tiles with tile_x == 0
567                let tile_y = idx / tile_size;
568                let cell_y = idx % tile_size;
569                if tile_y >= tiles_y {
570                    return;
571                }
572
573                let tile_idx = tile_y * tiles_x + 0;
574                let tile_offset = tile_idx * tile_buffer_size;
575                let src_idx = tile_offset + (cell_y + 1) * buffer_width + 1;
576                let dst_idx = tile_offset + (cell_y + 1) * buffer_width + 0;
577                packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
578            } else if edge == 3 {
579                // East boundary: tiles with tile_x == tiles_x - 1
580                let tile_y = idx / tile_size;
581                let cell_y = idx % tile_size;
582                if tile_y >= tiles_y {
583                    return;
584                }
585
586                let tile_idx = tile_y * tiles_x + (tiles_x - 1);
587                let tile_offset = tile_idx * tile_buffer_size;
588                let src_idx = tile_offset + (cell_y + 1) * buffer_width + tile_size;
589                let dst_idx = tile_offset + (cell_y + 1) * buffer_width + (tile_size + 1);
590                packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
591            }
592        }
593    };
594
595    match transpile_global_kernel(&kernel_fn) {
596        Ok(cuda) => format!(
597            "// Apply Boundary Conditions - handles domain edges for packed tiles\n{}",
598            cuda
599        ),
600        Err(e) => format!("// Transpilation error: {}\n", e),
601    }
602}
603
604// ============================================================================
605// Fallback implementations (when cuda-codegen is not enabled)
606// ============================================================================
607
608#[cfg(not(feature = "cuda-codegen"))]
609pub fn generate_tile_kernels() -> String {
610    "// CUDA codegen not enabled - use handwritten shaders/fdtd_tile.cu".to_string()
611}
612
613#[cfg(not(feature = "cuda-codegen"))]
614pub fn generate_packed_kernels() -> String {
615    "// CUDA codegen not enabled - use handwritten shaders/fdtd_packed.cu".to_string()
616}
617
618// ============================================================================
619// Tests
620// ============================================================================
621
622#[cfg(test)]
623mod tests {
624    #[allow(unused_imports)]
625    use super::*;
626
627    #[test]
628    #[cfg(feature = "cuda-codegen")]
629    fn test_tile_kernels_structure() {
630        let source = generate_tile_kernels();
631
632        // Check all kernels are present
633        assert!(source.contains("fdtd_tile_step"), "Missing fdtd_tile_step");
634        assert!(source.contains("extract_halo"), "Missing extract_halo");
635        assert!(source.contains("inject_halo"), "Missing inject_halo");
636        assert!(source.contains("read_interior"), "Missing read_interior");
637        assert!(
638            source.contains("apply_boundary_reflection"),
639            "Missing apply_boundary_reflection"
640        );
641
642        // Check extern "C" wrapper
643        assert!(source.contains("extern \"C\""), "Missing extern C");
644
645        // Check FDTD kernel has correct structure
646        assert!(source.contains("__global__ void fdtd_tile_step"));
647        assert!(source.contains("threadIdx.x"));
648        assert!(source.contains("threadIdx.y"));
649        assert!(source.contains("buffer_width = 18") || source.contains("* 18"));
650    }
651
652    #[test]
653    #[cfg(feature = "cuda-codegen")]
654    fn test_packed_kernels_structure() {
655        let source = generate_packed_kernels();
656
657        // Check all kernels are present
658        assert!(
659            source.contains("exchange_all_halos"),
660            "Missing exchange_all_halos"
661        );
662        assert!(source.contains("fdtd_all_tiles"), "Missing fdtd_all_tiles");
663        assert!(
664            source.contains("upload_tile_data"),
665            "Missing upload_tile_data"
666        );
667        assert!(
668            source.contains("read_all_interiors"),
669            "Missing read_all_interiors"
670        );
671        assert!(source.contains("inject_impulse"), "Missing inject_impulse");
672        assert!(
673            source.contains("apply_boundary_conditions"),
674            "Missing apply_boundary_conditions"
675        );
676
677        // Check batched FDTD uses blockIdx
678        assert!(source.contains("blockIdx.x"), "Missing blockIdx usage");
679        assert!(source.contains("blockIdx.y"), "Missing blockIdx.y usage");
680    }
681
682    #[test]
683    #[cfg(feature = "cuda-codegen")]
684    fn test_fdtd_tile_step_matches_handwritten() {
685        let generated = generate_fdtd_tile_step();
686
687        // Verify key structural elements match handwritten version
688        assert!(generated.contains("const float* __restrict__ pressure"));
689        assert!(generated.contains("float* __restrict__ pressure_prev"));
690        assert!(generated.contains("float c2"));
691        assert!(generated.contains("float damping"));
692        assert!(generated.contains("if (lx >= 16 || ly >= 16) return;"));
693        assert!(
694            generated.contains("idx = (ly + 1) * buffer_width + (lx + 1)")
695                || generated.contains("(ly + 1) * 18 + (lx + 1)")
696        );
697        assert!(generated.contains("laplacian"));
698        assert!(generated.contains("* damping"));
699
700        println!("Generated fdtd_tile_step:\n{}", generated);
701    }
702
703    #[test]
704    #[cfg(feature = "cuda-codegen")]
705    fn test_generated_vs_handwritten_tile() {
706        let generated = generate_tile_kernels();
707        let handwritten = include_str!("../shaders/fdtd_tile.cu");
708
709        // Count kernels in both
710        let gen_kernel_count = generated.matches("__global__").count();
711        let hw_kernel_count = handwritten.matches("__global__").count();
712
713        assert_eq!(
714            gen_kernel_count, hw_kernel_count,
715            "Kernel count mismatch: generated={}, handwritten={}",
716            gen_kernel_count, hw_kernel_count
717        );
718    }
719
720    #[test]
721    #[cfg(feature = "cuda-codegen")]
722    fn test_generated_vs_handwritten_packed() {
723        let generated = generate_packed_kernels();
724        let handwritten = include_str!("../shaders/fdtd_packed.cu");
725
726        // Count kernels in both
727        let gen_kernel_count = generated.matches("__global__").count();
728        let hw_kernel_count = handwritten.matches("__global__").count();
729
730        assert_eq!(
731            gen_kernel_count, hw_kernel_count,
732            "Kernel count mismatch: generated={}, handwritten={}",
733            gen_kernel_count, hw_kernel_count
734        );
735    }
736
737    #[test]
738    #[cfg(feature = "cuda-codegen")]
739    fn test_match_expression_transpiles_to_switch() {
740        // Test extract_halo uses switch for edge selection
741        let extract = generate_extract_halo();
742        assert!(
743            extract.contains("switch (edge)"),
744            "extract_halo should use switch: {}",
745            extract
746        );
747        assert!(
748            extract.contains("case 0:"),
749            "extract_halo should have case 0"
750        );
751        assert!(
752            extract.contains("case 1:"),
753            "extract_halo should have case 1"
754        );
755        assert!(
756            extract.contains("case 2:"),
757            "extract_halo should have case 2"
758        );
759        assert!(
760            extract.contains("default:"),
761            "extract_halo should have default"
762        );
763
764        // Test inject_halo uses switch for edge selection
765        let inject = generate_inject_halo();
766        assert!(
767            inject.contains("switch (edge)"),
768            "inject_halo should use switch: {}",
769            inject
770        );
771
772        // Test apply_boundary_reflection uses switch
773        let boundary = generate_apply_boundary_reflection();
774        assert!(
775            boundary.contains("switch (edge)"),
776            "apply_boundary_reflection should use switch: {}",
777            boundary
778        );
779
780        println!("Generated extract_halo:\n{}", extract);
781    }
782
783    #[test]
784    #[cfg(feature = "cuda-codegen")]
785    fn test_all_kernels_transpile_successfully() {
786        // Verify all tile kernels transpile without errors
787        let tile_source = generate_tile_kernels();
788        assert!(
789            !tile_source.contains("Transpilation error"),
790            "Tile kernels had transpilation errors:\n{}",
791            tile_source
792        );
793
794        // Verify all packed kernels transpile without errors
795        let packed_source = generate_packed_kernels();
796        assert!(
797            !packed_source.contains("Transpilation error"),
798            "Packed kernels had transpilation errors:\n{}",
799            packed_source
800        );
801
802        // Count __global__ functions to ensure all generated
803        let tile_count = tile_source.matches("__global__").count();
804        let packed_count = packed_source.matches("__global__").count();
805
806        assert_eq!(tile_count, 5, "Expected 5 tile kernels, got {}", tile_count);
807        assert_eq!(
808            packed_count, 6,
809            "Expected 6 packed kernels, got {}",
810            packed_count
811        );
812
813        println!(
814            "Successfully generated {} tile kernels and {} packed kernels",
815            tile_count, packed_count
816        );
817    }
818}