1#[cfg(feature = "cuda-codegen")]
10use ringkernel_cuda_codegen::{
11 transpile_global_kernel, transpile_stencil_kernel, Grid, StencilConfig,
12};
13
14#[cfg(feature = "cuda-codegen")]
27pub fn generate_tile_kernels() -> String {
28 let mut output = String::new();
29
30 output.push_str(TILE_KERNELS_HEADER);
31 output.push_str("\nextern \"C\" {\n\n");
32
33 output.push_str(&generate_fdtd_tile_step());
35 output.push('\n');
36
37 output.push_str(&generate_extract_halo());
39 output.push('\n');
40
41 output.push_str(&generate_inject_halo());
43 output.push('\n');
44
45 output.push_str(&generate_read_interior());
47 output.push('\n');
48
49 output.push_str(&generate_apply_boundary_reflection());
51
52 output.push_str("\n} // extern \"C\"\n");
53
54 output
55}
56
57pub const TILE_KERNELS_HEADER: &str = r#"// CUDA Kernels for Tile-Based FDTD Wave Simulation
59// Generated by ringkernel-cuda-codegen from Rust DSL
60//
61// Buffer Layout (18x18 = 324 floats):
62// +---+----------------+---+
63// | NW| North Halo |NE | <- Row 0
64// +---+----------------+---+
65// | | 16x16 Tile | | <- Rows 1-16
66// | W | Interior | E |
67// +---+----------------+---+
68// | SW| South Halo |SE | <- Row 17
69// +---+----------------+---+
70//
71// Index: idx = y * 18 + x
72// Interior cell (lx, ly): idx = (ly + 1) * 18 + (lx + 1)
73"#;
74
75#[cfg(feature = "cuda-codegen")]
77fn generate_fdtd_tile_step() -> String {
78 use syn::parse_quote;
79
80 let kernel_fn: syn::ItemFn = parse_quote! {
81 fn fdtd_tile_step(
82 pressure: &[f32],
83 pressure_prev: &mut [f32],
84 c2: f32,
85 damping: f32,
86 pos: GridPos,
87 ) {
88 let p = pressure[pos.idx()];
89 let p_prev = pressure_prev[pos.idx()];
90
91 let p_n = pos.north(pressure);
92 let p_s = pos.south(pressure);
93 let p_w = pos.west(pressure);
94 let p_e = pos.east(pressure);
95
96 let laplacian = p_n + p_s + p_e + p_w - 4.0 * p;
97 let p_new = 2.0 * p - p_prev + c2 * laplacian;
98
99 pressure_prev[pos.idx()] = p_new * damping;
100 }
101 };
102
103 let config = StencilConfig::new("fdtd_tile_step")
104 .with_grid(Grid::Grid2D)
105 .with_tile_size(16, 16)
106 .with_halo(1);
107
108 match transpile_stencil_kernel(&kernel_fn, &config) {
109 Ok(cuda) => cuda,
110 Err(e) => format!("// Transpilation error: {}\n", e),
111 }
112}
113
114#[cfg(feature = "cuda-codegen")]
116fn generate_extract_halo() -> String {
117 use syn::parse_quote;
118
119 let kernel_fn: syn::ItemFn = parse_quote! {
120 fn extract_halo(
121 pressure: &[f32],
122 halo_out: &mut [f32],
123 edge: i32,
124 ) {
125 let i = thread_idx_x();
126 if i >= 16 {
127 return;
128 }
129
130 let idx = match edge {
131 0 => 1 * 18 + (i + 1), 1 => 16 * 18 + (i + 1), 2 => (i + 1) * 18 + 1, _ => (i + 1) * 18 + 16, };
136
137 halo_out[i as usize] = pressure[idx as usize];
138 }
139 };
140
141 match transpile_global_kernel(&kernel_fn) {
142 Ok(cuda) => format!("// Extract Halo - extracts halo data from interior edge\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
143 Err(e) => format!("// Transpilation error: {}\n", e),
144 }
145}
146
147#[cfg(feature = "cuda-codegen")]
149fn generate_inject_halo() -> String {
150 use syn::parse_quote;
151
152 let kernel_fn: syn::ItemFn = parse_quote! {
153 fn inject_halo(
154 pressure: &mut [f32],
155 halo_in: &[f32],
156 edge: i32,
157 ) {
158 let i = thread_idx_x();
159 if i >= 16 {
160 return;
161 }
162
163 let idx = match edge {
164 0 => 0 * 18 + (i + 1), 1 => 17 * 18 + (i + 1), 2 => (i + 1) * 18 + 0, _ => (i + 1) * 18 + 17, };
169
170 pressure[idx as usize] = halo_in[i as usize];
171 }
172 };
173
174 match transpile_global_kernel(&kernel_fn) {
175 Ok(cuda) => format!("// Inject Halo - injects halo data from linear buffer to halo region\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
176 Err(e) => format!("// Transpilation error: {}\n", e),
177 }
178}
179
180#[cfg(feature = "cuda-codegen")]
182fn generate_read_interior() -> String {
183 use syn::parse_quote;
184
185 let kernel_fn: syn::ItemFn = parse_quote! {
186 fn read_interior(
187 pressure: &[f32],
188 output: &mut [f32],
189 ) {
190 let lx = thread_idx_x();
191 let ly = thread_idx_y();
192
193 if lx >= 16 || ly >= 16 {
194 return;
195 }
196
197 let src_idx = (ly + 1) * 18 + (lx + 1);
198 let dst_idx = ly * 16 + lx;
199
200 output[dst_idx as usize] = pressure[src_idx as usize];
201 }
202 };
203
204 match transpile_global_kernel(&kernel_fn) {
205 Ok(cuda) => format!(
206 "// Read Interior - reads interior pressure to linear buffer for visualization\n{}",
207 cuda
208 ),
209 Err(e) => format!("// Transpilation error: {}\n", e),
210 }
211}
212
213#[cfg(feature = "cuda-codegen")]
215fn generate_apply_boundary_reflection() -> String {
216 use syn::parse_quote;
217
218 let kernel_fn: syn::ItemFn = parse_quote! {
219 fn apply_boundary_reflection(
220 pressure: &mut [f32],
221 edge: i32,
222 reflection_coeff: f32,
223 ) {
224 let i = thread_idx_x();
225 if i >= 16 {
226 return;
227 }
228
229 let src_idx = match edge {
231 0 => 1 * 18 + (i + 1), 1 => 16 * 18 + (i + 1), 2 => (i + 1) * 18 + 1, _ => (i + 1) * 18 + 16, };
236
237 let dst_idx = match edge {
238 0 => 0 * 18 + (i + 1), 1 => 17 * 18 + (i + 1), 2 => (i + 1) * 18 + 0, _ => (i + 1) * 18 + 17, };
243
244 pressure[dst_idx as usize] = pressure[src_idx as usize] * reflection_coeff;
245 }
246 };
247
248 match transpile_global_kernel(&kernel_fn) {
249 Ok(cuda) => format!("// Apply Boundary Reflection - applies boundary conditions for tiles at grid edges\n// edge: 0=North, 1=South, 2=West, 3=East\n{}", cuda),
250 Err(e) => format!("// Transpilation error: {}\n", e),
251 }
252}
253
254#[cfg(feature = "cuda-codegen")]
268pub fn generate_packed_kernels() -> String {
269 let mut output = String::new();
270
271 output.push_str(PACKED_KERNELS_HEADER);
272 output.push_str("\nextern \"C\" {\n\n");
273
274 output.push_str(&generate_exchange_all_halos());
275 output.push('\n');
276
277 output.push_str(&generate_fdtd_all_tiles());
278 output.push('\n');
279
280 output.push_str(&generate_upload_tile_data());
281 output.push('\n');
282
283 output.push_str(&generate_read_all_interiors());
284 output.push('\n');
285
286 output.push_str(&generate_inject_impulse());
287 output.push('\n');
288
289 output.push_str(&generate_apply_boundary_conditions());
290
291 output.push_str("\n} // extern \"C\"\n");
292
293 output
294}
295
296pub const PACKED_KERNELS_HEADER: &str = r#"// CUDA Kernels for Packed Tile-Based FDTD Wave Simulation
298// Generated by ringkernel-cuda-codegen from Rust DSL
299//
300// All tiles packed contiguously: [Tile(0,0)][Tile(1,0)]...[Tile(n,m)]
301// Each tile is 18x18 floats (16x16 interior + 1-cell halo)
302//
303// Benefits:
304// - Zero host<->GPU transfers during simulation
305// - All tiles computed in parallel
306// - Halo exchange is just GPU memory copies
307"#;
308
309#[cfg(feature = "cuda-codegen")]
311fn generate_exchange_all_halos() -> String {
312 use syn::parse_quote;
313
314 let kernel_fn: syn::ItemFn = parse_quote! {
315 fn exchange_all_halos(
316 packed_buffer: &mut [f32],
317 copies: &[u32],
318 num_copies: i32,
319 ) {
320 let idx = block_idx_x() * block_dim_x() + thread_idx_x();
321 if idx >= num_copies {
322 return;
323 }
324
325 let src_idx = copies[(idx * 2) as usize];
326 let dst_idx = copies[(idx * 2 + 1) as usize];
327
328 packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize];
329 }
330 };
331
332 match transpile_global_kernel(&kernel_fn) {
333 Ok(cuda) => format!(
334 "// Halo Exchange Kernel - copies all halo edges between adjacent tiles\n{}",
335 cuda
336 ),
337 Err(e) => format!("// Transpilation error: {}\n", e),
338 }
339}
340
341#[cfg(feature = "cuda-codegen")]
343fn generate_fdtd_all_tiles() -> String {
344 use syn::parse_quote;
345
346 let kernel_fn: syn::ItemFn = parse_quote! {
347 fn fdtd_all_tiles(
348 packed_curr: &[f32],
349 packed_prev: &mut [f32],
350 tiles_x: i32,
351 tiles_y: i32,
352 tile_size: i32,
353 buffer_width: i32,
354 c2: f32,
355 damping: f32,
356 ) {
357 let tile_x = block_idx_x();
358 let tile_y = block_idx_y();
359 let lx = thread_idx_x();
360 let ly = thread_idx_y();
361
362 if tile_x >= tiles_x || tile_y >= tiles_y {
363 return;
364 }
365 if lx >= tile_size || ly >= tile_size {
366 return;
367 }
368
369 let tile_buffer_size = buffer_width * buffer_width;
370 let tile_idx = tile_y * tiles_x + tile_x;
371 let tile_offset = tile_idx * tile_buffer_size;
372
373 let idx = tile_offset + (ly + 1) * buffer_width + (lx + 1);
374
375 let p = packed_curr[idx as usize];
376 let p_prev_val = packed_prev[idx as usize];
377
378 let p_n = packed_curr[(idx - buffer_width) as usize];
379 let p_s = packed_curr[(idx + buffer_width) as usize];
380 let p_w = packed_curr[(idx - 1) as usize];
381 let p_e = packed_curr[(idx + 1) as usize];
382
383 let laplacian = p_n + p_s + p_e + p_w - 4.0 * p;
384 let p_new = 2.0 * p - p_prev_val + c2 * laplacian;
385
386 packed_prev[idx as usize] = p_new * damping;
387 }
388 };
389
390 match transpile_global_kernel(&kernel_fn) {
391 Ok(cuda) => format!(
392 "// Batched FDTD Kernel - computes FDTD for ALL tiles in single launch\n{}",
393 cuda
394 ),
395 Err(e) => format!("// Transpilation error: {}\n", e),
396 }
397}
398
399#[cfg(feature = "cuda-codegen")]
401fn generate_upload_tile_data() -> String {
402 use syn::parse_quote;
403
404 let kernel_fn: syn::ItemFn = parse_quote! {
405 fn upload_tile_data(
406 packed_buffer: &mut [f32],
407 staging: &[f32],
408 tile_x: i32,
409 tile_y: i32,
410 tiles_x: i32,
411 buffer_width: i32,
412 ) {
413 let lx = thread_idx_x();
414 let ly = thread_idx_y();
415
416 if lx >= buffer_width || ly >= buffer_width {
417 return;
418 }
419
420 let tile_buffer_size = buffer_width * buffer_width;
421 let tile_idx = tile_y * tiles_x + tile_x;
422 let tile_offset = tile_idx * tile_buffer_size;
423
424 let local_idx = ly * buffer_width + lx;
425 let global_idx = tile_offset + local_idx;
426
427 packed_buffer[global_idx as usize] = staging[local_idx as usize];
428 }
429 };
430
431 match transpile_global_kernel(&kernel_fn) {
432 Ok(cuda) => format!(
433 "// Upload Initial State - copies initial data to packed buffer\n{}",
434 cuda
435 ),
436 Err(e) => format!("// Transpilation error: {}\n", e),
437 }
438}
439
440#[cfg(feature = "cuda-codegen")]
442fn generate_read_all_interiors() -> String {
443 use syn::parse_quote;
444
445 let kernel_fn: syn::ItemFn = parse_quote! {
446 fn read_all_interiors(
447 packed_buffer: &[f32],
448 output: &mut [f32],
449 tiles_x: i32,
450 tiles_y: i32,
451 tile_size: i32,
452 buffer_width: i32,
453 grid_width: i32,
454 grid_height: i32,
455 ) {
456 let gx = block_idx_x() * block_dim_x() + thread_idx_x();
457 let gy = block_idx_y() * block_dim_y() + thread_idx_y();
458
459 if gx >= grid_width || gy >= grid_height {
460 return;
461 }
462
463 let tile_x = gx / tile_size;
464 let tile_y = gy / tile_size;
465
466 let lx = gx % tile_size;
467 let ly = gy % tile_size;
468
469 let tile_buffer_size = buffer_width * buffer_width;
470 let tile_idx = tile_y * tiles_x + tile_x;
471 let tile_offset = tile_idx * tile_buffer_size;
472 let src_idx = tile_offset + (ly + 1) * buffer_width + (lx + 1);
473
474 let dst_idx = gy * grid_width + gx;
475
476 output[dst_idx as usize] = packed_buffer[src_idx as usize];
477 }
478 };
479
480 match transpile_global_kernel(&kernel_fn) {
481 Ok(cuda) => format!(
482 "// Read All Interiors - extracts all tile interiors for visualization\n{}",
483 cuda
484 ),
485 Err(e) => format!("// Transpilation error: {}\n", e),
486 }
487}
488
489#[cfg(feature = "cuda-codegen")]
491fn generate_inject_impulse() -> String {
492 use syn::parse_quote;
493
494 let kernel_fn: syn::ItemFn = parse_quote! {
495 fn inject_impulse(
496 packed_buffer: &mut [f32],
497 tile_x: i32,
498 tile_y: i32,
499 local_x: i32,
500 local_y: i32,
501 tiles_x: i32,
502 buffer_width: i32,
503 amplitude: f32,
504 ) {
505 let tile_buffer_size = buffer_width * buffer_width;
506 let tile_idx = tile_y * tiles_x + tile_x;
507 let tile_offset = tile_idx * tile_buffer_size;
508 let idx = tile_offset + (local_y + 1) * buffer_width + (local_x + 1);
509
510 packed_buffer[idx as usize] = packed_buffer[idx as usize] + amplitude;
511 }
512 };
513
514 match transpile_global_kernel(&kernel_fn) {
515 Ok(cuda) => format!("// Inject Impulse - adds energy to specific cell\n{}", cuda),
516 Err(e) => format!("// Transpilation error: {}\n", e),
517 }
518}
519
520#[cfg(feature = "cuda-codegen")]
522fn generate_apply_boundary_conditions() -> String {
523 use syn::parse_quote;
524
525 let kernel_fn: syn::ItemFn = parse_quote! {
526 fn apply_boundary_conditions(
527 packed_buffer: &mut [f32],
528 tiles_x: i32,
529 tiles_y: i32,
530 tile_size: i32,
531 buffer_width: i32,
532 reflection_coeff: f32,
533 ) {
534 let edge = block_idx_x();
535 let idx = thread_idx_x();
536
537 let tile_buffer_size = buffer_width * buffer_width;
538
539 if edge == 0 {
540 let tile_x = idx / tile_size;
542 let cell_x = idx % tile_size;
543 if tile_x >= tiles_x {
544 return;
545 }
546
547 let tile_idx = 0 * tiles_x + tile_x;
548 let tile_offset = tile_idx * tile_buffer_size;
549 let src_idx = tile_offset + 1 * buffer_width + (cell_x + 1);
550 let dst_idx = tile_offset + 0 * buffer_width + (cell_x + 1);
551 packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
552 } else if edge == 1 {
553 let tile_x = idx / tile_size;
555 let cell_x = idx % tile_size;
556 if tile_x >= tiles_x {
557 return;
558 }
559
560 let tile_idx = (tiles_y - 1) * tiles_x + tile_x;
561 let tile_offset = tile_idx * tile_buffer_size;
562 let src_idx = tile_offset + tile_size * buffer_width + (cell_x + 1);
563 let dst_idx = tile_offset + (tile_size + 1) * buffer_width + (cell_x + 1);
564 packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
565 } else if edge == 2 {
566 let tile_y = idx / tile_size;
568 let cell_y = idx % tile_size;
569 if tile_y >= tiles_y {
570 return;
571 }
572
573 let tile_idx = tile_y * tiles_x + 0;
574 let tile_offset = tile_idx * tile_buffer_size;
575 let src_idx = tile_offset + (cell_y + 1) * buffer_width + 1;
576 let dst_idx = tile_offset + (cell_y + 1) * buffer_width + 0;
577 packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
578 } else if edge == 3 {
579 let tile_y = idx / tile_size;
581 let cell_y = idx % tile_size;
582 if tile_y >= tiles_y {
583 return;
584 }
585
586 let tile_idx = tile_y * tiles_x + (tiles_x - 1);
587 let tile_offset = tile_idx * tile_buffer_size;
588 let src_idx = tile_offset + (cell_y + 1) * buffer_width + tile_size;
589 let dst_idx = tile_offset + (cell_y + 1) * buffer_width + (tile_size + 1);
590 packed_buffer[dst_idx as usize] = packed_buffer[src_idx as usize] * reflection_coeff;
591 }
592 }
593 };
594
595 match transpile_global_kernel(&kernel_fn) {
596 Ok(cuda) => format!(
597 "// Apply Boundary Conditions - handles domain edges for packed tiles\n{}",
598 cuda
599 ),
600 Err(e) => format!("// Transpilation error: {}\n", e),
601 }
602}
603
604#[cfg(not(feature = "cuda-codegen"))]
609pub fn generate_tile_kernels() -> String {
610 "// CUDA codegen not enabled - use handwritten shaders/fdtd_tile.cu".to_string()
611}
612
613#[cfg(not(feature = "cuda-codegen"))]
614pub fn generate_packed_kernels() -> String {
615 "// CUDA codegen not enabled - use handwritten shaders/fdtd_packed.cu".to_string()
616}
617
618#[cfg(test)]
623mod tests {
624 #[allow(unused_imports)]
625 use super::*;
626
627 #[test]
628 #[cfg(feature = "cuda-codegen")]
629 fn test_tile_kernels_structure() {
630 let source = generate_tile_kernels();
631
632 assert!(source.contains("fdtd_tile_step"), "Missing fdtd_tile_step");
634 assert!(source.contains("extract_halo"), "Missing extract_halo");
635 assert!(source.contains("inject_halo"), "Missing inject_halo");
636 assert!(source.contains("read_interior"), "Missing read_interior");
637 assert!(
638 source.contains("apply_boundary_reflection"),
639 "Missing apply_boundary_reflection"
640 );
641
642 assert!(source.contains("extern \"C\""), "Missing extern C");
644
645 assert!(source.contains("__global__ void fdtd_tile_step"));
647 assert!(source.contains("threadIdx.x"));
648 assert!(source.contains("threadIdx.y"));
649 assert!(source.contains("buffer_width = 18") || source.contains("* 18"));
650 }
651
652 #[test]
653 #[cfg(feature = "cuda-codegen")]
654 fn test_packed_kernels_structure() {
655 let source = generate_packed_kernels();
656
657 assert!(
659 source.contains("exchange_all_halos"),
660 "Missing exchange_all_halos"
661 );
662 assert!(source.contains("fdtd_all_tiles"), "Missing fdtd_all_tiles");
663 assert!(
664 source.contains("upload_tile_data"),
665 "Missing upload_tile_data"
666 );
667 assert!(
668 source.contains("read_all_interiors"),
669 "Missing read_all_interiors"
670 );
671 assert!(source.contains("inject_impulse"), "Missing inject_impulse");
672 assert!(
673 source.contains("apply_boundary_conditions"),
674 "Missing apply_boundary_conditions"
675 );
676
677 assert!(source.contains("blockIdx.x"), "Missing blockIdx usage");
679 assert!(source.contains("blockIdx.y"), "Missing blockIdx.y usage");
680 }
681
682 #[test]
683 #[cfg(feature = "cuda-codegen")]
684 fn test_fdtd_tile_step_matches_handwritten() {
685 let generated = generate_fdtd_tile_step();
686
687 assert!(generated.contains("const float* __restrict__ pressure"));
689 assert!(generated.contains("float* __restrict__ pressure_prev"));
690 assert!(generated.contains("float c2"));
691 assert!(generated.contains("float damping"));
692 assert!(generated.contains("if (lx >= 16 || ly >= 16) return;"));
693 assert!(
694 generated.contains("idx = (ly + 1) * buffer_width + (lx + 1)")
695 || generated.contains("(ly + 1) * 18 + (lx + 1)")
696 );
697 assert!(generated.contains("laplacian"));
698 assert!(generated.contains("* damping"));
699
700 println!("Generated fdtd_tile_step:\n{}", generated);
701 }
702
703 #[test]
704 #[cfg(feature = "cuda-codegen")]
705 fn test_generated_vs_handwritten_tile() {
706 let generated = generate_tile_kernels();
707 let handwritten = include_str!("../shaders/fdtd_tile.cu");
708
709 let gen_kernel_count = generated.matches("__global__").count();
711 let hw_kernel_count = handwritten.matches("__global__").count();
712
713 assert_eq!(
714 gen_kernel_count, hw_kernel_count,
715 "Kernel count mismatch: generated={}, handwritten={}",
716 gen_kernel_count, hw_kernel_count
717 );
718 }
719
720 #[test]
721 #[cfg(feature = "cuda-codegen")]
722 fn test_generated_vs_handwritten_packed() {
723 let generated = generate_packed_kernels();
724 let handwritten = include_str!("../shaders/fdtd_packed.cu");
725
726 let gen_kernel_count = generated.matches("__global__").count();
728 let hw_kernel_count = handwritten.matches("__global__").count();
729
730 assert_eq!(
731 gen_kernel_count, hw_kernel_count,
732 "Kernel count mismatch: generated={}, handwritten={}",
733 gen_kernel_count, hw_kernel_count
734 );
735 }
736
737 #[test]
738 #[cfg(feature = "cuda-codegen")]
739 fn test_match_expression_transpiles_to_switch() {
740 let extract = generate_extract_halo();
742 assert!(
743 extract.contains("switch (edge)"),
744 "extract_halo should use switch: {}",
745 extract
746 );
747 assert!(
748 extract.contains("case 0:"),
749 "extract_halo should have case 0"
750 );
751 assert!(
752 extract.contains("case 1:"),
753 "extract_halo should have case 1"
754 );
755 assert!(
756 extract.contains("case 2:"),
757 "extract_halo should have case 2"
758 );
759 assert!(
760 extract.contains("default:"),
761 "extract_halo should have default"
762 );
763
764 let inject = generate_inject_halo();
766 assert!(
767 inject.contains("switch (edge)"),
768 "inject_halo should use switch: {}",
769 inject
770 );
771
772 let boundary = generate_apply_boundary_reflection();
774 assert!(
775 boundary.contains("switch (edge)"),
776 "apply_boundary_reflection should use switch: {}",
777 boundary
778 );
779
780 println!("Generated extract_halo:\n{}", extract);
781 }
782
783 #[test]
784 #[cfg(feature = "cuda-codegen")]
785 fn test_all_kernels_transpile_successfully() {
786 let tile_source = generate_tile_kernels();
788 assert!(
789 !tile_source.contains("Transpilation error"),
790 "Tile kernels had transpilation errors:\n{}",
791 tile_source
792 );
793
794 let packed_source = generate_packed_kernels();
796 assert!(
797 !packed_source.contains("Transpilation error"),
798 "Packed kernels had transpilation errors:\n{}",
799 packed_source
800 );
801
802 let tile_count = tile_source.matches("__global__").count();
804 let packed_count = packed_source.matches("__global__").count();
805
806 assert_eq!(tile_count, 5, "Expected 5 tile kernels, got {}", tile_count);
807 assert_eq!(
808 packed_count, 6,
809 "Expected 6 packed kernels, got {}",
810 packed_count
811 );
812
813 println!(
814 "Successfully generated {} tile kernels and {} packed kernels",
815 tile_count, packed_count
816 );
817 }
818}