1use std::collections::HashMap;
7
8#[derive(Debug, Clone, PartialEq)]
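/// A GPU intrinsic recognized by the CUDA code generator.
///
/// Each variant corresponds to a CUDA builtin, math-library function, or
/// special register; [`GpuIntrinsic::to_cuda_string`] gives its CUDA spelling.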
10pub enum GpuIntrinsic {
11 SyncThreads,
14 ThreadFence,
16 ThreadFenceBlock,
18 ThreadFenceSystem,
20 SyncThreadsCount,
22 SyncThreadsAnd,
24 SyncThreadsOr,
26
27 AtomicAdd,
30 AtomicSub,
32 AtomicMin,
34 AtomicMax,
36 AtomicExch,
38 AtomicCas,
40 AtomicAnd,
42 AtomicOr,
44 AtomicXor,
46 AtomicInc,
48 AtomicDec,
50
51 Sqrt,
54 Rsqrt,
56 Abs,
58 Fabs,
60 Floor,
62 Ceil,
64 Round,
66 Trunc,
68 Fma,
70 Min,
72 Max,
74 Fmod,
76 Remainder,
78 Copysign,
80 Cbrt,
82 Hypot,
84
85 Sin,
88 Cos,
90 Tan,
92 Asin,
94 Acos,
96 Atan,
98 Atan2,
100 Sincos,
102 Sinpi,
104 Cospi,
106
107 Sinh,
110 Cosh,
112 Tanh,
114 Asinh,
116 Acosh,
118 Atanh,
120
121 Exp,
124 Exp2,
126 Exp10,
128 Expm1,
130 Log,
132 Log2,
134 Log10,
136 Log1p,
138 Pow,
140 Ldexp,
142 Scalbn,
144 Ilogb,
146 Lgamma,
148 Tgamma,
150 Erf,
152 Erfc,
154 Erfinv,
156 Erfcinv,
158
159 Isnan,
162 Isinf,
164 Isfinite,
166 Isnormal,
168 Signbit,
170 Nextafter,
172 Fdim,
174 Nan,
176
177 WarpShfl,
180 WarpShflUp,
182 WarpShflDown,
184 WarpShflXor,
186 WarpActiveMask,
188 WarpBallot,
190 WarpAll,
192 WarpAny,
194 WarpMatchAny,
196 WarpMatchAll,
198 WarpReduceAdd,
200 WarpReduceMin,
202 WarpReduceMax,
204 WarpReduceAnd,
206 WarpReduceOr,
208 WarpReduceXor,
210
211 Popc,
214 Clz,
216 Ctz,
218 Ffs,
220 Brev,
222 BytePerm,
224 FunnelShiftLeft,
226 FunnelShiftRight,
228
229 Ldg,
232 PrefetchL1,
234 PrefetchL2,
236
237 Rcp,
240 Fdividef,
242 Saturate,
244 J0,
246 J1,
248 Jn,
250 Y0,
252 Y1,
254 Yn,
256 Normcdf,
258 Normcdfinv,
260 CylBesselI0,
262 CylBesselI1,
264
265 ThreadIdxX,
268 ThreadIdxY,
270 ThreadIdxZ,
272 BlockIdxX,
274 BlockIdxY,
276 BlockIdxZ,
278 BlockDimX,
280 BlockDimY,
282 BlockDimZ,
284 GridDimX,
286 GridDimY,
288 GridDimZ,
290 WarpSize,
292
293 Clock,
296 Clock64,
298 Nanosleep,
300}
301
302impl GpuIntrinsic {
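    /// Returns the CUDA spelling of this intrinsic.
    ///
    /// Value intrinsics (`threadIdx.x`, `warpSize`, ...) and zero-argument
    /// calls (`__syncthreads()`, `clock()`, ...) come back as complete
    /// expressions; everything else is returned as a bare function name
    /// without an argument list.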
303 pub fn to_cuda_string(&self) -> &'static str {
305 match self {
306 GpuIntrinsic::SyncThreads => "__syncthreads()",
308 GpuIntrinsic::ThreadFence => "__threadfence()",
309 GpuIntrinsic::ThreadFenceBlock => "__threadfence_block()",
310 GpuIntrinsic::ThreadFenceSystem => "__threadfence_system()",
311 GpuIntrinsic::SyncThreadsCount => "__syncthreads_count",
312 GpuIntrinsic::SyncThreadsAnd => "__syncthreads_and",
313 GpuIntrinsic::SyncThreadsOr => "__syncthreads_or",
314
315 GpuIntrinsic::AtomicAdd => "atomicAdd",
317 GpuIntrinsic::AtomicSub => "atomicSub",
318 GpuIntrinsic::AtomicMin => "atomicMin",
319 GpuIntrinsic::AtomicMax => "atomicMax",
320 GpuIntrinsic::AtomicExch => "atomicExch",
321 GpuIntrinsic::AtomicCas => "atomicCAS",
322 GpuIntrinsic::AtomicAnd => "atomicAnd",
323 GpuIntrinsic::AtomicOr => "atomicOr",
324 GpuIntrinsic::AtomicXor => "atomicXor",
325 GpuIntrinsic::AtomicInc => "atomicInc",
326 GpuIntrinsic::AtomicDec => "atomicDec",
327
328 GpuIntrinsic::Sqrt => "sqrtf",
330 GpuIntrinsic::Rsqrt => "rsqrtf",
331 GpuIntrinsic::Abs => "abs",
332 GpuIntrinsic::Fabs => "fabsf",
333 GpuIntrinsic::Floor => "floorf",
334 GpuIntrinsic::Ceil => "ceilf",
335 GpuIntrinsic::Round => "roundf",
336 GpuIntrinsic::Trunc => "truncf",
337 GpuIntrinsic::Fma => "fmaf",
338 GpuIntrinsic::Min => "fminf",
339 GpuIntrinsic::Max => "fmaxf",
340 GpuIntrinsic::Fmod => "fmodf",
341 GpuIntrinsic::Remainder => "remainderf",
342 GpuIntrinsic::Copysign => "copysignf",
343 GpuIntrinsic::Cbrt => "cbrtf",
344 GpuIntrinsic::Hypot => "hypotf",
345
346 GpuIntrinsic::Sin => "sinf",
348 GpuIntrinsic::Cos => "cosf",
349 GpuIntrinsic::Tan => "tanf",
350 GpuIntrinsic::Asin => "asinf",
351 GpuIntrinsic::Acos => "acosf",
352 GpuIntrinsic::Atan => "atanf",
353 GpuIntrinsic::Atan2 => "atan2f",
354 GpuIntrinsic::Sincos => "sincosf",
355 GpuIntrinsic::Sinpi => "sinpif",
356 GpuIntrinsic::Cospi => "cospif",
357
358 GpuIntrinsic::Sinh => "sinhf",
360 GpuIntrinsic::Cosh => "coshf",
361 GpuIntrinsic::Tanh => "tanhf",
362 GpuIntrinsic::Asinh => "asinhf",
363 GpuIntrinsic::Acosh => "acoshf",
364 GpuIntrinsic::Atanh => "atanhf",
365
366 GpuIntrinsic::Exp => "expf",
368 GpuIntrinsic::Exp2 => "exp2f",
369 GpuIntrinsic::Exp10 => "exp10f",
370 GpuIntrinsic::Expm1 => "expm1f",
371 GpuIntrinsic::Log => "logf",
372 GpuIntrinsic::Log2 => "log2f",
373 GpuIntrinsic::Log10 => "log10f",
374 GpuIntrinsic::Log1p => "log1pf",
375 GpuIntrinsic::Pow => "powf",
376 GpuIntrinsic::Ldexp => "ldexpf",
377 GpuIntrinsic::Scalbn => "scalbnf",
378 GpuIntrinsic::Ilogb => "ilogbf",
379 GpuIntrinsic::Lgamma => "lgammaf",
380 GpuIntrinsic::Tgamma => "tgammaf",
381 GpuIntrinsic::Erf => "erff",
382 GpuIntrinsic::Erfc => "erfcf",
383 GpuIntrinsic::Erfinv => "erfinvf",
384 GpuIntrinsic::Erfcinv => "erfcinvf",
385
386 GpuIntrinsic::Isnan => "isnan",
388 GpuIntrinsic::Isinf => "isinf",
389 GpuIntrinsic::Isfinite => "isfinite",
390 GpuIntrinsic::Isnormal => "isnormal",
391 GpuIntrinsic::Signbit => "signbit",
392 GpuIntrinsic::Nextafter => "nextafterf",
393 GpuIntrinsic::Fdim => "fdimf",
394 GpuIntrinsic::Nan => "nanf",
395
396 GpuIntrinsic::WarpShfl => "__shfl_sync",
398 GpuIntrinsic::WarpShflUp => "__shfl_up_sync",
399 GpuIntrinsic::WarpShflDown => "__shfl_down_sync",
400 GpuIntrinsic::WarpShflXor => "__shfl_xor_sync",
401 GpuIntrinsic::WarpActiveMask => "__activemask()",
402 GpuIntrinsic::WarpBallot => "__ballot_sync",
403 GpuIntrinsic::WarpAll => "__all_sync",
404 GpuIntrinsic::WarpAny => "__any_sync",
405 GpuIntrinsic::WarpMatchAny => "__match_any_sync",
406 GpuIntrinsic::WarpMatchAll => "__match_all_sync",
407 GpuIntrinsic::WarpReduceAdd => "__reduce_add_sync",
408 GpuIntrinsic::WarpReduceMin => "__reduce_min_sync",
409 GpuIntrinsic::WarpReduceMax => "__reduce_max_sync",
410 GpuIntrinsic::WarpReduceAnd => "__reduce_and_sync",
411 GpuIntrinsic::WarpReduceOr => "__reduce_or_sync",
412 GpuIntrinsic::WarpReduceXor => "__reduce_xor_sync",
413
414 GpuIntrinsic::Popc => "__popc",
416 GpuIntrinsic::Clz => "__clz",
            // CUDA exposes no dedicated count-trailing-zeros intrinsic, so Ctz
            // shares the __ffs (1-based find-first-set) mapping.
            GpuIntrinsic::Ctz => "__ffs",
            GpuIntrinsic::Ffs => "__ffs",
419 GpuIntrinsic::Brev => "__brev",
420 GpuIntrinsic::BytePerm => "__byte_perm",
421 GpuIntrinsic::FunnelShiftLeft => "__funnelshift_l",
422 GpuIntrinsic::FunnelShiftRight => "__funnelshift_r",
423
424 GpuIntrinsic::Ldg => "__ldg",
426 GpuIntrinsic::PrefetchL1 => "__prefetch_l1",
427 GpuIntrinsic::PrefetchL2 => "__prefetch_l2",
428
429 GpuIntrinsic::Rcp => "__frcp_rn",
431 GpuIntrinsic::Fdividef => "__fdividef",
432 GpuIntrinsic::Saturate => "__saturatef",
433 GpuIntrinsic::J0 => "j0f",
434 GpuIntrinsic::J1 => "j1f",
435 GpuIntrinsic::Jn => "jnf",
436 GpuIntrinsic::Y0 => "y0f",
437 GpuIntrinsic::Y1 => "y1f",
438 GpuIntrinsic::Yn => "ynf",
439 GpuIntrinsic::Normcdf => "normcdff",
440 GpuIntrinsic::Normcdfinv => "normcdfinvf",
441 GpuIntrinsic::CylBesselI0 => "cyl_bessel_i0f",
442 GpuIntrinsic::CylBesselI1 => "cyl_bessel_i1f",
443
444 GpuIntrinsic::ThreadIdxX => "threadIdx.x",
446 GpuIntrinsic::ThreadIdxY => "threadIdx.y",
447 GpuIntrinsic::ThreadIdxZ => "threadIdx.z",
448 GpuIntrinsic::BlockIdxX => "blockIdx.x",
449 GpuIntrinsic::BlockIdxY => "blockIdx.y",
450 GpuIntrinsic::BlockIdxZ => "blockIdx.z",
451 GpuIntrinsic::BlockDimX => "blockDim.x",
452 GpuIntrinsic::BlockDimY => "blockDim.y",
453 GpuIntrinsic::BlockDimZ => "blockDim.z",
454 GpuIntrinsic::GridDimX => "gridDim.x",
455 GpuIntrinsic::GridDimY => "gridDim.y",
456 GpuIntrinsic::GridDimZ => "gridDim.z",
457 GpuIntrinsic::WarpSize => "warpSize",
458
459 GpuIntrinsic::Clock => "clock()",
461 GpuIntrinsic::Clock64 => "clock64()",
462 GpuIntrinsic::Nanosleep => "__nanosleep",
463 }
464 }
465
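    /// True for intrinsics that are read as plain values (thread/block/grid
    /// indices and dimensions, plus `warpSize`) rather than called.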
466 pub fn is_value_intrinsic(&self) -> bool {
468 matches!(
469 self,
470 GpuIntrinsic::ThreadIdxX
471 | GpuIntrinsic::ThreadIdxY
472 | GpuIntrinsic::ThreadIdxZ
473 | GpuIntrinsic::BlockIdxX
474 | GpuIntrinsic::BlockIdxY
475 | GpuIntrinsic::BlockIdxZ
476 | GpuIntrinsic::BlockDimX
477 | GpuIntrinsic::BlockDimY
478 | GpuIntrinsic::BlockDimZ
479 | GpuIntrinsic::GridDimX
480 | GpuIntrinsic::GridDimY
481 | GpuIntrinsic::GridDimZ
482 | GpuIntrinsic::WarpSize
483 )
484 }
485
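    /// True for intrinsics emitted as complete zero-argument calls, e.g.
    /// `__syncthreads()` or `clock()`.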
486 pub fn is_zero_arg_function(&self) -> bool {
488 matches!(
489 self,
490 GpuIntrinsic::SyncThreads
491 | GpuIntrinsic::ThreadFence
492 | GpuIntrinsic::ThreadFenceBlock
493 | GpuIntrinsic::ThreadFenceSystem
494 | GpuIntrinsic::WarpActiveMask
495 | GpuIntrinsic::Clock
496 | GpuIntrinsic::Clock64
497 )
498 }
499
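    /// True for warp-level primitives whose CUDA `*_sync` form takes an
    /// explicit participation mask as the first argument.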
500 pub fn requires_mask(&self) -> bool {
502 matches!(
503 self,
504 GpuIntrinsic::WarpShfl
505 | GpuIntrinsic::WarpShflUp
506 | GpuIntrinsic::WarpShflDown
507 | GpuIntrinsic::WarpShflXor
508 | GpuIntrinsic::WarpBallot
509 | GpuIntrinsic::WarpAll
510 | GpuIntrinsic::WarpAny
511 | GpuIntrinsic::WarpMatchAny
512 | GpuIntrinsic::WarpMatchAll
513 | GpuIntrinsic::WarpReduceAdd
514 | GpuIntrinsic::WarpReduceMin
515 | GpuIntrinsic::WarpReduceMax
516 | GpuIntrinsic::WarpReduceAnd
517 | GpuIntrinsic::WarpReduceOr
518 | GpuIntrinsic::WarpReduceXor
519 )
520 }
521
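    /// Returns a coarse category name for this intrinsic, e.g. `"atomic"`,
    /// `"warp"`, or `"math"`.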
522 pub fn category(&self) -> &'static str {
524 match self {
525 GpuIntrinsic::SyncThreads
526 | GpuIntrinsic::ThreadFence
527 | GpuIntrinsic::ThreadFenceBlock
528 | GpuIntrinsic::ThreadFenceSystem
529 | GpuIntrinsic::SyncThreadsCount
530 | GpuIntrinsic::SyncThreadsAnd
531 | GpuIntrinsic::SyncThreadsOr => "synchronization",
532
533 GpuIntrinsic::AtomicAdd
534 | GpuIntrinsic::AtomicSub
535 | GpuIntrinsic::AtomicMin
536 | GpuIntrinsic::AtomicMax
537 | GpuIntrinsic::AtomicExch
538 | GpuIntrinsic::AtomicCas
539 | GpuIntrinsic::AtomicAnd
540 | GpuIntrinsic::AtomicOr
541 | GpuIntrinsic::AtomicXor
542 | GpuIntrinsic::AtomicInc
543 | GpuIntrinsic::AtomicDec => "atomic",
544
545 GpuIntrinsic::Sqrt
546 | GpuIntrinsic::Rsqrt
547 | GpuIntrinsic::Abs
548 | GpuIntrinsic::Fabs
549 | GpuIntrinsic::Floor
550 | GpuIntrinsic::Ceil
551 | GpuIntrinsic::Round
552 | GpuIntrinsic::Trunc
553 | GpuIntrinsic::Fma
554 | GpuIntrinsic::Min
555 | GpuIntrinsic::Max
556 | GpuIntrinsic::Fmod
557 | GpuIntrinsic::Remainder
558 | GpuIntrinsic::Copysign
559 | GpuIntrinsic::Cbrt
560 | GpuIntrinsic::Hypot => "math",
561
562 GpuIntrinsic::Sin
563 | GpuIntrinsic::Cos
564 | GpuIntrinsic::Tan
565 | GpuIntrinsic::Asin
566 | GpuIntrinsic::Acos
567 | GpuIntrinsic::Atan
568 | GpuIntrinsic::Atan2
569 | GpuIntrinsic::Sincos
570 | GpuIntrinsic::Sinpi
571 | GpuIntrinsic::Cospi => "trigonometric",
572
573 GpuIntrinsic::Sinh
574 | GpuIntrinsic::Cosh
575 | GpuIntrinsic::Tanh
576 | GpuIntrinsic::Asinh
577 | GpuIntrinsic::Acosh
578 | GpuIntrinsic::Atanh => "hyperbolic",
579
580 GpuIntrinsic::Exp
581 | GpuIntrinsic::Exp2
582 | GpuIntrinsic::Exp10
583 | GpuIntrinsic::Expm1
584 | GpuIntrinsic::Log
585 | GpuIntrinsic::Log2
586 | GpuIntrinsic::Log10
587 | GpuIntrinsic::Log1p
588 | GpuIntrinsic::Pow
589 | GpuIntrinsic::Ldexp
590 | GpuIntrinsic::Scalbn
591 | GpuIntrinsic::Ilogb
592 | GpuIntrinsic::Lgamma
593 | GpuIntrinsic::Tgamma
594 | GpuIntrinsic::Erf
595 | GpuIntrinsic::Erfc
596 | GpuIntrinsic::Erfinv
597 | GpuIntrinsic::Erfcinv => "exponential",
598
599 GpuIntrinsic::Isnan
600 | GpuIntrinsic::Isinf
601 | GpuIntrinsic::Isfinite
602 | GpuIntrinsic::Isnormal
603 | GpuIntrinsic::Signbit
604 | GpuIntrinsic::Nextafter
605 | GpuIntrinsic::Fdim
606 | GpuIntrinsic::Nan => "classification",
607
608 GpuIntrinsic::WarpShfl
609 | GpuIntrinsic::WarpShflUp
610 | GpuIntrinsic::WarpShflDown
611 | GpuIntrinsic::WarpShflXor
612 | GpuIntrinsic::WarpActiveMask
613 | GpuIntrinsic::WarpBallot
614 | GpuIntrinsic::WarpAll
615 | GpuIntrinsic::WarpAny
616 | GpuIntrinsic::WarpMatchAny
617 | GpuIntrinsic::WarpMatchAll
618 | GpuIntrinsic::WarpReduceAdd
619 | GpuIntrinsic::WarpReduceMin
620 | GpuIntrinsic::WarpReduceMax
621 | GpuIntrinsic::WarpReduceAnd
622 | GpuIntrinsic::WarpReduceOr
623 | GpuIntrinsic::WarpReduceXor => "warp",
624
625 GpuIntrinsic::Popc
626 | GpuIntrinsic::Clz
627 | GpuIntrinsic::Ctz
628 | GpuIntrinsic::Ffs
629 | GpuIntrinsic::Brev
630 | GpuIntrinsic::BytePerm
631 | GpuIntrinsic::FunnelShiftLeft
632 | GpuIntrinsic::FunnelShiftRight => "bit",
633
634 GpuIntrinsic::Ldg | GpuIntrinsic::PrefetchL1 | GpuIntrinsic::PrefetchL2 => "memory",
635
636 GpuIntrinsic::Rcp
637 | GpuIntrinsic::Fdividef
638 | GpuIntrinsic::Saturate
639 | GpuIntrinsic::J0
640 | GpuIntrinsic::J1
641 | GpuIntrinsic::Jn
642 | GpuIntrinsic::Y0
643 | GpuIntrinsic::Y1
644 | GpuIntrinsic::Yn
645 | GpuIntrinsic::Normcdf
646 | GpuIntrinsic::Normcdfinv
647 | GpuIntrinsic::CylBesselI0
648 | GpuIntrinsic::CylBesselI1 => "special",
649
650 GpuIntrinsic::ThreadIdxX
651 | GpuIntrinsic::ThreadIdxY
652 | GpuIntrinsic::ThreadIdxZ
653 | GpuIntrinsic::BlockIdxX
654 | GpuIntrinsic::BlockIdxY
655 | GpuIntrinsic::BlockIdxZ
656 | GpuIntrinsic::BlockDimX
657 | GpuIntrinsic::BlockDimY
658 | GpuIntrinsic::BlockDimZ
659 | GpuIntrinsic::GridDimX
660 | GpuIntrinsic::GridDimY
661 | GpuIntrinsic::GridDimZ
662 | GpuIntrinsic::WarpSize => "index",
663
664 GpuIntrinsic::Clock | GpuIntrinsic::Clock64 | GpuIntrinsic::Nanosleep => "timing",
665 }
666 }
667}
668
669#[derive(Debug)]
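/// Maps Rust-side function and method names to [`GpuIntrinsic`] variants.
///
/// [`IntrinsicRegistry::new`] pre-populates the table with the built-in
/// mappings and common aliases; additional names can be added at runtime
/// with [`IntrinsicRegistry::register`].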
671pub struct IntrinsicRegistry {
672 mappings: HashMap<String, GpuIntrinsic>,
673}
674
675impl Default for IntrinsicRegistry {
676 fn default() -> Self {
677 Self::new()
678 }
679}
680
681impl IntrinsicRegistry {
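    /// Builds a registry pre-populated with the default name mappings,
    /// including aliases such as `ln`/`log`, `count_ones`/`popc`, and
    /// `mul_add`/`fma`.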
682 pub fn new() -> Self {
684 let mut mappings = HashMap::new();
685
686 mappings.insert("sync_threads".to_string(), GpuIntrinsic::SyncThreads);
688 mappings.insert("thread_fence".to_string(), GpuIntrinsic::ThreadFence);
689 mappings.insert(
690 "thread_fence_block".to_string(),
691 GpuIntrinsic::ThreadFenceBlock,
692 );
693 mappings.insert(
694 "thread_fence_system".to_string(),
695 GpuIntrinsic::ThreadFenceSystem,
696 );
697 mappings.insert(
698 "sync_threads_count".to_string(),
699 GpuIntrinsic::SyncThreadsCount,
700 );
701 mappings.insert("sync_threads_and".to_string(), GpuIntrinsic::SyncThreadsAnd);
702 mappings.insert("sync_threads_or".to_string(), GpuIntrinsic::SyncThreadsOr);
703
704 mappings.insert("atomic_add".to_string(), GpuIntrinsic::AtomicAdd);
706 mappings.insert("atomic_sub".to_string(), GpuIntrinsic::AtomicSub);
707 mappings.insert("atomic_min".to_string(), GpuIntrinsic::AtomicMin);
708 mappings.insert("atomic_max".to_string(), GpuIntrinsic::AtomicMax);
709 mappings.insert("atomic_exchange".to_string(), GpuIntrinsic::AtomicExch);
710 mappings.insert("atomic_exch".to_string(), GpuIntrinsic::AtomicExch);
711 mappings.insert("atomic_cas".to_string(), GpuIntrinsic::AtomicCas);
712 mappings.insert("atomic_compare_swap".to_string(), GpuIntrinsic::AtomicCas);
713 mappings.insert("atomic_and".to_string(), GpuIntrinsic::AtomicAnd);
714 mappings.insert("atomic_or".to_string(), GpuIntrinsic::AtomicOr);
715 mappings.insert("atomic_xor".to_string(), GpuIntrinsic::AtomicXor);
716 mappings.insert("atomic_inc".to_string(), GpuIntrinsic::AtomicInc);
717 mappings.insert("atomic_dec".to_string(), GpuIntrinsic::AtomicDec);
718
719 mappings.insert("sqrt".to_string(), GpuIntrinsic::Sqrt);
721 mappings.insert("rsqrt".to_string(), GpuIntrinsic::Rsqrt);
722 mappings.insert("abs".to_string(), GpuIntrinsic::Fabs);
723 mappings.insert("fabs".to_string(), GpuIntrinsic::Fabs);
724 mappings.insert("floor".to_string(), GpuIntrinsic::Floor);
725 mappings.insert("ceil".to_string(), GpuIntrinsic::Ceil);
726 mappings.insert("round".to_string(), GpuIntrinsic::Round);
727 mappings.insert("trunc".to_string(), GpuIntrinsic::Trunc);
728 mappings.insert("mul_add".to_string(), GpuIntrinsic::Fma);
729 mappings.insert("fma".to_string(), GpuIntrinsic::Fma);
730 mappings.insert("min".to_string(), GpuIntrinsic::Min);
731 mappings.insert("max".to_string(), GpuIntrinsic::Max);
732 mappings.insert("fmin".to_string(), GpuIntrinsic::Min);
733 mappings.insert("fmax".to_string(), GpuIntrinsic::Max);
734 mappings.insert("fmod".to_string(), GpuIntrinsic::Fmod);
735 mappings.insert("remainder".to_string(), GpuIntrinsic::Remainder);
736 mappings.insert("copysign".to_string(), GpuIntrinsic::Copysign);
737 mappings.insert("cbrt".to_string(), GpuIntrinsic::Cbrt);
738 mappings.insert("hypot".to_string(), GpuIntrinsic::Hypot);
739
740 mappings.insert("sin".to_string(), GpuIntrinsic::Sin);
742 mappings.insert("cos".to_string(), GpuIntrinsic::Cos);
743 mappings.insert("tan".to_string(), GpuIntrinsic::Tan);
744 mappings.insert("asin".to_string(), GpuIntrinsic::Asin);
745 mappings.insert("acos".to_string(), GpuIntrinsic::Acos);
746 mappings.insert("atan".to_string(), GpuIntrinsic::Atan);
747 mappings.insert("atan2".to_string(), GpuIntrinsic::Atan2);
748 mappings.insert("sincos".to_string(), GpuIntrinsic::Sincos);
749 mappings.insert("sinpi".to_string(), GpuIntrinsic::Sinpi);
750 mappings.insert("cospi".to_string(), GpuIntrinsic::Cospi);
751
752 mappings.insert("sinh".to_string(), GpuIntrinsic::Sinh);
754 mappings.insert("cosh".to_string(), GpuIntrinsic::Cosh);
755 mappings.insert("tanh".to_string(), GpuIntrinsic::Tanh);
756 mappings.insert("asinh".to_string(), GpuIntrinsic::Asinh);
757 mappings.insert("acosh".to_string(), GpuIntrinsic::Acosh);
758 mappings.insert("atanh".to_string(), GpuIntrinsic::Atanh);
759
760 mappings.insert("exp".to_string(), GpuIntrinsic::Exp);
762 mappings.insert("exp2".to_string(), GpuIntrinsic::Exp2);
763 mappings.insert("exp10".to_string(), GpuIntrinsic::Exp10);
764 mappings.insert("expm1".to_string(), GpuIntrinsic::Expm1);
765 mappings.insert("ln".to_string(), GpuIntrinsic::Log);
766 mappings.insert("log".to_string(), GpuIntrinsic::Log);
767 mappings.insert("log2".to_string(), GpuIntrinsic::Log2);
768 mappings.insert("log10".to_string(), GpuIntrinsic::Log10);
769 mappings.insert("log1p".to_string(), GpuIntrinsic::Log1p);
770 mappings.insert("powf".to_string(), GpuIntrinsic::Pow);
771 mappings.insert("powi".to_string(), GpuIntrinsic::Pow);
772 mappings.insert("pow".to_string(), GpuIntrinsic::Pow);
773 mappings.insert("ldexp".to_string(), GpuIntrinsic::Ldexp);
774 mappings.insert("scalbn".to_string(), GpuIntrinsic::Scalbn);
775 mappings.insert("ilogb".to_string(), GpuIntrinsic::Ilogb);
776 mappings.insert("lgamma".to_string(), GpuIntrinsic::Lgamma);
777 mappings.insert("tgamma".to_string(), GpuIntrinsic::Tgamma);
778 mappings.insert("gamma".to_string(), GpuIntrinsic::Tgamma);
779 mappings.insert("erf".to_string(), GpuIntrinsic::Erf);
780 mappings.insert("erfc".to_string(), GpuIntrinsic::Erfc);
781 mappings.insert("erfinv".to_string(), GpuIntrinsic::Erfinv);
782 mappings.insert("erfcinv".to_string(), GpuIntrinsic::Erfcinv);
783
784 mappings.insert("is_nan".to_string(), GpuIntrinsic::Isnan);
786 mappings.insert("isnan".to_string(), GpuIntrinsic::Isnan);
787 mappings.insert("is_infinite".to_string(), GpuIntrinsic::Isinf);
788 mappings.insert("isinf".to_string(), GpuIntrinsic::Isinf);
789 mappings.insert("is_finite".to_string(), GpuIntrinsic::Isfinite);
790 mappings.insert("isfinite".to_string(), GpuIntrinsic::Isfinite);
791 mappings.insert("is_normal".to_string(), GpuIntrinsic::Isnormal);
792 mappings.insert("isnormal".to_string(), GpuIntrinsic::Isnormal);
793 mappings.insert("is_sign_negative".to_string(), GpuIntrinsic::Signbit);
794 mappings.insert("signbit".to_string(), GpuIntrinsic::Signbit);
795 mappings.insert("nextafter".to_string(), GpuIntrinsic::Nextafter);
796 mappings.insert("fdim".to_string(), GpuIntrinsic::Fdim);
797 mappings.insert("nan".to_string(), GpuIntrinsic::Nan);
798
799 mappings.insert("warp_shfl".to_string(), GpuIntrinsic::WarpShfl);
801 mappings.insert("warp_shuffle".to_string(), GpuIntrinsic::WarpShfl);
802 mappings.insert("warp_shfl_up".to_string(), GpuIntrinsic::WarpShflUp);
803 mappings.insert("warp_shuffle_up".to_string(), GpuIntrinsic::WarpShflUp);
804 mappings.insert("warp_shfl_down".to_string(), GpuIntrinsic::WarpShflDown);
805 mappings.insert("warp_shuffle_down".to_string(), GpuIntrinsic::WarpShflDown);
806 mappings.insert("warp_shfl_xor".to_string(), GpuIntrinsic::WarpShflXor);
807 mappings.insert("warp_shuffle_xor".to_string(), GpuIntrinsic::WarpShflXor);
808 mappings.insert("warp_active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
809 mappings.insert("active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
810 mappings.insert("warp_ballot".to_string(), GpuIntrinsic::WarpBallot);
811 mappings.insert("ballot".to_string(), GpuIntrinsic::WarpBallot);
812 mappings.insert("warp_all".to_string(), GpuIntrinsic::WarpAll);
813 mappings.insert("warp_any".to_string(), GpuIntrinsic::WarpAny);
814 mappings.insert("warp_match_any".to_string(), GpuIntrinsic::WarpMatchAny);
815 mappings.insert("warp_match_all".to_string(), GpuIntrinsic::WarpMatchAll);
816 mappings.insert("warp_reduce_add".to_string(), GpuIntrinsic::WarpReduceAdd);
817 mappings.insert("warp_reduce_min".to_string(), GpuIntrinsic::WarpReduceMin);
818 mappings.insert("warp_reduce_max".to_string(), GpuIntrinsic::WarpReduceMax);
819 mappings.insert("warp_reduce_and".to_string(), GpuIntrinsic::WarpReduceAnd);
820 mappings.insert("warp_reduce_or".to_string(), GpuIntrinsic::WarpReduceOr);
821 mappings.insert("warp_reduce_xor".to_string(), GpuIntrinsic::WarpReduceXor);
822
823 mappings.insert("popc".to_string(), GpuIntrinsic::Popc);
825 mappings.insert("popcount".to_string(), GpuIntrinsic::Popc);
826 mappings.insert("count_ones".to_string(), GpuIntrinsic::Popc);
827 mappings.insert("clz".to_string(), GpuIntrinsic::Clz);
828 mappings.insert("leading_zeros".to_string(), GpuIntrinsic::Clz);
829 mappings.insert("ctz".to_string(), GpuIntrinsic::Ctz);
830 mappings.insert("trailing_zeros".to_string(), GpuIntrinsic::Ctz);
831 mappings.insert("ffs".to_string(), GpuIntrinsic::Ffs);
832 mappings.insert("brev".to_string(), GpuIntrinsic::Brev);
833 mappings.insert("reverse_bits".to_string(), GpuIntrinsic::Brev);
834 mappings.insert("byte_perm".to_string(), GpuIntrinsic::BytePerm);
835 mappings.insert(
836 "funnel_shift_left".to_string(),
837 GpuIntrinsic::FunnelShiftLeft,
838 );
839 mappings.insert(
840 "funnel_shift_right".to_string(),
841 GpuIntrinsic::FunnelShiftRight,
842 );
843
844 mappings.insert("ldg".to_string(), GpuIntrinsic::Ldg);
846 mappings.insert("load_global".to_string(), GpuIntrinsic::Ldg);
847 mappings.insert("prefetch_l1".to_string(), GpuIntrinsic::PrefetchL1);
848 mappings.insert("prefetch_l2".to_string(), GpuIntrinsic::PrefetchL2);
849
850 mappings.insert("rcp".to_string(), GpuIntrinsic::Rcp);
852 mappings.insert("recip".to_string(), GpuIntrinsic::Rcp);
853 mappings.insert("fdividef".to_string(), GpuIntrinsic::Fdividef);
854 mappings.insert("fast_div".to_string(), GpuIntrinsic::Fdividef);
855 mappings.insert("saturate".to_string(), GpuIntrinsic::Saturate);
856 mappings.insert("clamp_01".to_string(), GpuIntrinsic::Saturate);
857 mappings.insert("j0".to_string(), GpuIntrinsic::J0);
858 mappings.insert("j1".to_string(), GpuIntrinsic::J1);
859 mappings.insert("jn".to_string(), GpuIntrinsic::Jn);
860 mappings.insert("y0".to_string(), GpuIntrinsic::Y0);
861 mappings.insert("y1".to_string(), GpuIntrinsic::Y1);
862 mappings.insert("yn".to_string(), GpuIntrinsic::Yn);
863 mappings.insert("normcdf".to_string(), GpuIntrinsic::Normcdf);
864 mappings.insert("norm_cdf".to_string(), GpuIntrinsic::Normcdf);
865 mappings.insert("normcdfinv".to_string(), GpuIntrinsic::Normcdfinv);
866 mappings.insert("norm_cdf_inv".to_string(), GpuIntrinsic::Normcdfinv);
867 mappings.insert("cyl_bessel_i0".to_string(), GpuIntrinsic::CylBesselI0);
868 mappings.insert("cyl_bessel_i1".to_string(), GpuIntrinsic::CylBesselI1);
869
870 mappings.insert("thread_idx_x".to_string(), GpuIntrinsic::ThreadIdxX);
872 mappings.insert("thread_idx_y".to_string(), GpuIntrinsic::ThreadIdxY);
873 mappings.insert("thread_idx_z".to_string(), GpuIntrinsic::ThreadIdxZ);
874 mappings.insert("block_idx_x".to_string(), GpuIntrinsic::BlockIdxX);
875 mappings.insert("block_idx_y".to_string(), GpuIntrinsic::BlockIdxY);
876 mappings.insert("block_idx_z".to_string(), GpuIntrinsic::BlockIdxZ);
877 mappings.insert("block_dim_x".to_string(), GpuIntrinsic::BlockDimX);
878 mappings.insert("block_dim_y".to_string(), GpuIntrinsic::BlockDimY);
879 mappings.insert("block_dim_z".to_string(), GpuIntrinsic::BlockDimZ);
880 mappings.insert("grid_dim_x".to_string(), GpuIntrinsic::GridDimX);
881 mappings.insert("grid_dim_y".to_string(), GpuIntrinsic::GridDimY);
882 mappings.insert("grid_dim_z".to_string(), GpuIntrinsic::GridDimZ);
883 mappings.insert("warp_size".to_string(), GpuIntrinsic::WarpSize);
884
885 mappings.insert("clock".to_string(), GpuIntrinsic::Clock);
887 mappings.insert("clock64".to_string(), GpuIntrinsic::Clock64);
888 mappings.insert("nanosleep".to_string(), GpuIntrinsic::Nanosleep);
889
890 Self { mappings }
891 }
892
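    /// Looks up the intrinsic registered under `name`, if any.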
893 pub fn lookup(&self, name: &str) -> Option<&GpuIntrinsic> {
895 self.mappings.get(name)
896 }
897
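    /// Registers (or overrides) a mapping from `rust_name` to `intrinsic`.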
898 pub fn register(&mut self, rust_name: &str, intrinsic: GpuIntrinsic) {
900 self.mappings.insert(rust_name.to_string(), intrinsic);
901 }
902
903 pub fn is_intrinsic(&self, name: &str) -> bool {
905 self.mappings.contains_key(name)
906 }
907}
908
909#[derive(Debug, Clone, PartialEq)]
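/// Neighbor-access intrinsics for stencil kernels.
///
/// 2-D offsets are `(row, col)` with `North` one row up; 3-D offsets are
/// `(slice, row, col)` with `Up` meaning the previous slice. `At` takes
/// runtime offsets, and `Up`/`Down` exist only in 3-D stencils.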
914pub enum StencilIntrinsic {
915 Index,
917 North,
919 South,
921 East,
923 West,
925 At,
927 Up,
929 Down,
931}
932
933impl StencilIntrinsic {
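    /// Parses a stencil accessor from its method name (`idx`, `north`,
    /// `south`, `east`, `west`, `at`, `up`, `down`); returns `None` otherwise.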
934 pub fn from_method_name(name: &str) -> Option<Self> {
936 match name {
937 "idx" => Some(StencilIntrinsic::Index),
938 "north" => Some(StencilIntrinsic::North),
939 "south" => Some(StencilIntrinsic::South),
940 "east" => Some(StencilIntrinsic::East),
941 "west" => Some(StencilIntrinsic::West),
942 "at" => Some(StencilIntrinsic::At),
943 "up" => Some(StencilIntrinsic::Up),
944 "down" => Some(StencilIntrinsic::Down),
945 _ => None,
946 }
947 }
948}
949
950#[derive(Debug, Clone, Copy, PartialEq, Eq)]
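/// Intrinsics available inside ring kernels: control-block state queries,
/// input/output queue operations, HLC timestamp helpers
/// (`hlc_physical`/`hlc_logical`), kernel-to-kernel (K2K) messaging, and
/// `nanosleep` backoff.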
955pub enum RingKernelIntrinsic {
956 IsActive,
959 ShouldTerminate,
961 MarkTerminated,
963 GetMessagesProcessed,
965
966 InputQueueSize,
969 OutputQueueSize,
971 InputQueueEmpty,
973 OutputQueueEmpty,
975 EnqueueResponse,
977
978 HlcTick,
981 HlcUpdate,
983 HlcNow,
985
986 K2kSend,
989 K2kTryRecv,
991 K2kHasMessage,
993 K2kPeek,
995 K2kPendingCount,
997
998 Nanosleep,
1001}
1002
1003impl RingKernelIntrinsic {
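    /// Expands this intrinsic into a CUDA source snippet.
    ///
    /// `args` holds already-rendered argument expressions; intrinsics that
    /// need arguments fall back to an explanatory comment when too few are
    /// supplied. Control-block counters are read with `atomicAdd(ptr, 0)`.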
1004 pub fn to_cuda(&self, args: &[String]) -> String {
1006 match self {
1007 Self::IsActive => "atomicAdd(&control->is_active, 0) != 0".to_string(),
1008 Self::ShouldTerminate => "atomicAdd(&control->should_terminate, 0) != 0".to_string(),
1009 Self::MarkTerminated => "atomicExch(&control->has_terminated, 1)".to_string(),
1010 Self::GetMessagesProcessed => "atomicAdd(&control->messages_processed, 0)".to_string(),
1011
1012 Self::InputQueueSize => {
1013 "(atomicAdd(&control->input_head, 0) - atomicAdd(&control->input_tail, 0))"
1014 .to_string()
1015 }
1016 Self::OutputQueueSize => {
1017 "(atomicAdd(&control->output_head, 0) - atomicAdd(&control->output_tail, 0))"
1018 .to_string()
1019 }
1020 Self::InputQueueEmpty => {
1021 "(atomicAdd(&control->input_head, 0) == atomicAdd(&control->input_tail, 0))"
1022 .to_string()
1023 }
1024 Self::OutputQueueEmpty => {
1025 "(atomicAdd(&control->output_head, 0) == atomicAdd(&control->output_tail, 0))"
1026 .to_string()
1027 }
1028 Self::EnqueueResponse => {
1029 if !args.is_empty() {
1030 format!(
1031 "{{ unsigned long long _out_idx = atomicAdd(&control->output_head, 1) & control->output_mask; \
1032 memcpy(&output_buffer[_out_idx * RESP_SIZE], {}, RESP_SIZE); }}",
1033 args[0]
1034 )
1035 } else {
1036 "/* enqueue_response requires response pointer */".to_string()
1037 }
1038 }
1039
1040 Self::HlcTick => "hlc_logical++".to_string(),
1041 Self::HlcUpdate => {
1042 if !args.is_empty() {
1043 format!(
1044 "{{ if ({} > hlc_physical) {{ hlc_physical = {}; hlc_logical = 0; }} else {{ hlc_logical++; }} }}",
1045 args[0], args[0]
1046 )
1047 } else {
1048 "hlc_logical++".to_string()
1049 }
1050 }
1051 Self::HlcNow => "(hlc_physical << 32) | (hlc_logical & 0xFFFFFFFF)".to_string(),
1052
1053 Self::K2kSend => {
1054 if args.len() >= 2 {
1055 format!(
1057 "k2k_send(k2k_routes, {}, {}, sizeof(*{}))",
1058 args[0], args[1], args[1]
1059 )
1060 } else {
1061 "/* k2k_send requires target_id and msg_ptr */".to_string()
1062 }
1063 }
1064 Self::K2kTryRecv => "k2k_try_recv(k2k_inbox)".to_string(),
1065 Self::K2kHasMessage => "k2k_has_message(k2k_inbox)".to_string(),
1066 Self::K2kPeek => "k2k_peek(k2k_inbox)".to_string(),
1067 Self::K2kPendingCount => "k2k_pending_count(k2k_inbox)".to_string(),
1068
1069 Self::Nanosleep => {
1070 if !args.is_empty() {
1071 format!("__nanosleep({})", args[0])
1072 } else {
1073 "__nanosleep(1000)".to_string()
1074 }
1075 }
1076 }
1077 }
1078
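    /// Resolves a ring-kernel intrinsic from its Rust-side name, accepting
    /// aliases such as `enqueue`, `k2k_pending`, and `is_kernel_active`.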
1079 pub fn from_name(name: &str) -> Option<Self> {
1081 match name {
1082 "is_active" | "is_kernel_active" => Some(Self::IsActive),
1083 "should_terminate" => Some(Self::ShouldTerminate),
1084 "mark_terminated" => Some(Self::MarkTerminated),
1085 "messages_processed" | "get_messages_processed" => Some(Self::GetMessagesProcessed),
1086
1087 "input_queue_size" => Some(Self::InputQueueSize),
1088 "output_queue_size" => Some(Self::OutputQueueSize),
1089 "input_queue_empty" => Some(Self::InputQueueEmpty),
1090 "output_queue_empty" => Some(Self::OutputQueueEmpty),
1091 "enqueue_response" | "enqueue" => Some(Self::EnqueueResponse),
1092
1093 "hlc_tick" => Some(Self::HlcTick),
1094 "hlc_update" => Some(Self::HlcUpdate),
1095 "hlc_now" => Some(Self::HlcNow),
1096
1097 "k2k_send" => Some(Self::K2kSend),
1098 "k2k_try_recv" => Some(Self::K2kTryRecv),
1099 "k2k_has_message" => Some(Self::K2kHasMessage),
1100 "k2k_peek" => Some(Self::K2kPeek),
1101 "k2k_pending_count" | "k2k_pending" => Some(Self::K2kPendingCount),
1102
1103 "nanosleep" => Some(Self::Nanosleep),
1104
1105 _ => None,
1106 }
1107 }
1108
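    /// True for intrinsics that touch the kernel control block (activity and
    /// termination flags, message counters, and queue head/tail indices).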
1109 pub fn requires_control_block(&self) -> bool {
1111 matches!(
1112 self,
1113 Self::IsActive
1114 | Self::ShouldTerminate
1115 | Self::MarkTerminated
1116 | Self::GetMessagesProcessed
1117 | Self::InputQueueSize
1118 | Self::OutputQueueSize
1119 | Self::InputQueueEmpty
1120 | Self::OutputQueueEmpty
1121 | Self::EnqueueResponse
1122 )
1123 }
1124
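    /// True for intrinsics that read or advance the HLC timestamp state.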
1125 pub fn requires_hlc(&self) -> bool {
1127 matches!(self, Self::HlcTick | Self::HlcUpdate | Self::HlcNow)
1128 }
1129
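    /// True for intrinsics that use the kernel-to-kernel routing/inbox state.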
1130 pub fn requires_k2k(&self) -> bool {
1132 matches!(
1133 self,
1134 Self::K2kSend
1135 | Self::K2kTryRecv
1136 | Self::K2kHasMessage
1137 | Self::K2kPeek
1138 | Self::K2kPendingCount
1139 )
1140 }
1141}
1142
1143impl StencilIntrinsic {
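    /// Static `(row, col)` offset for a 2-D stencil access, or `None` for
    /// accessors without a fixed offset (`At`, and the 3-D-only `Up`/`Down`).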
1144 pub fn get_offset_2d(&self) -> Option<(i32, i32)> {
1149 match self {
1150 StencilIntrinsic::Index => Some((0, 0)),
1151 StencilIntrinsic::North => Some((-1, 0)),
1152 StencilIntrinsic::South => Some((1, 0)),
1153 StencilIntrinsic::East => Some((0, 1)),
1154 StencilIntrinsic::West => Some((0, -1)),
            // `At` takes runtime offsets, so no static offset is available.
            StencilIntrinsic::At => None,
            // `Up` and `Down` only exist in 3-D stencils.
            StencilIntrinsic::Up | StencilIntrinsic::Down => None,
        }
1158 }
1159
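    /// Static `(slice, row, col)` offset for a 3-D stencil access, or `None`
    /// for `At`, whose offsets are supplied at runtime.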
1160 pub fn get_offset_3d(&self) -> Option<(i32, i32, i32)> {
1165 match self {
1166 StencilIntrinsic::Index => Some((0, 0, 0)),
1167 StencilIntrinsic::North => Some((0, -1, 0)),
1168 StencilIntrinsic::South => Some((0, 1, 0)),
1169 StencilIntrinsic::East => Some((0, 0, 1)),
1170 StencilIntrinsic::West => Some((0, 0, -1)),
1171 StencilIntrinsic::Up => Some((-1, 0, 0)),
1172 StencilIntrinsic::Down => Some((1, 0, 0)),
            // `At` takes runtime offsets, so no static offset is available.
            StencilIntrinsic::At => None,
        }
1175 }
1176
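    /// True for accessors that are only meaningful in 3-D stencils.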
1177 pub fn is_3d_only(&self) -> bool {
1179 matches!(self, StencilIntrinsic::Up | StencilIntrinsic::Down)
1180 }
1181
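    /// Emits the CUDA indexing expression for a 2-D, row-major stencil access
    /// centered on `idx_var`, e.g. `North` becomes `buf[idx - width]`.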
1182 pub fn to_cuda_index_2d(&self, buffer_name: &str, buffer_width: &str, idx_var: &str) -> String {
1189 match self {
1190 StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1191 StencilIntrinsic::North => {
1192 format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1193 }
1194 StencilIntrinsic::South => {
1195 format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1196 }
1197 StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1198 StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1199 StencilIntrinsic::At => {
1200 format!("{}[{}]", buffer_name, idx_var)
1202 }
            // `Up`/`Down` are 3-D only; in 2-D they fall back to the center element.
            _ => format!("{}[{}]", buffer_name, idx_var),
1204 }
1205 }
1206
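    /// Emits the CUDA indexing expression for a 3-D stencil access, using
    /// `buffer_width` for row steps (`North`/`South`) and `buffer_slice` for
    /// slice steps (`Up`/`Down`).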
1207 pub fn to_cuda_index_3d(
1215 &self,
1216 buffer_name: &str,
1217 buffer_width: &str,
1218 buffer_slice: &str,
1219 idx_var: &str,
1220 ) -> String {
1221 match self {
1222 StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1223 StencilIntrinsic::North => {
1224 format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1225 }
1226 StencilIntrinsic::South => {
1227 format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1228 }
1229 StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1230 StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1231 StencilIntrinsic::Up => {
1232 format!("{}[{} - {}]", buffer_name, idx_var, buffer_slice)
1233 }
1234 StencilIntrinsic::Down => {
1235 format!("{}[{} + {}]", buffer_name, idx_var, buffer_slice)
1236 }
1237 StencilIntrinsic::At => {
1238 format!("{}[{}]", buffer_name, idx_var)
1240 }
1241 }
1242 }
1243}
1244
1245#[cfg(test)]
1246mod tests {
1247 use super::*;
1248
1249 #[test]
1250 fn test_intrinsic_lookup() {
1251 let registry = IntrinsicRegistry::new();
1252
1253 assert_eq!(
1254 registry.lookup("sync_threads"),
1255 Some(&GpuIntrinsic::SyncThreads)
1256 );
1257 assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
1258 assert_eq!(registry.lookup("unknown_func"), None);
1259 }
1260
1261 #[test]
1262 fn test_intrinsic_cuda_output() {
1263 assert_eq!(
1264 GpuIntrinsic::SyncThreads.to_cuda_string(),
1265 "__syncthreads()"
1266 );
1267 assert_eq!(GpuIntrinsic::AtomicAdd.to_cuda_string(), "atomicAdd");
1268 assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
1269 }
1270
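    // Illustrative end-to-end flow (documentation-style test): resolve a
    // Rust-side name through the registry, then query the matched intrinsic
    // for its CUDA spelling, category, and flags. Uses only items defined in
    // this module.
    #[test]
    fn test_lookup_to_cuda_roundtrip() {
        let registry = IntrinsicRegistry::new();

        let fma = registry.lookup("mul_add").expect("mul_add is registered");
        assert_eq!(fma, &GpuIntrinsic::Fma);
        assert_eq!(fma.to_cuda_string(), "fmaf");
        assert_eq!(fma.category(), "math");
        assert!(!fma.is_value_intrinsic());
        assert!(!fma.requires_mask());

        let tid = registry
            .lookup("thread_idx_x")
            .expect("thread_idx_x is registered");
        assert_eq!(tid.to_cuda_string(), "threadIdx.x");
        assert!(tid.is_value_intrinsic());
    }
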
1271 #[test]
1272 fn test_stencil_intrinsic_parsing() {
1273 assert_eq!(
1274 StencilIntrinsic::from_method_name("north"),
1275 Some(StencilIntrinsic::North)
1276 );
1277 assert_eq!(
1278 StencilIntrinsic::from_method_name("idx"),
1279 Some(StencilIntrinsic::Index)
1280 );
1281 assert_eq!(StencilIntrinsic::from_method_name("unknown"), None);
1282 }
1283
1284 #[test]
1285 fn test_stencil_cuda_index() {
1286 let north = StencilIntrinsic::North;
1287 assert_eq!(
1288 north.to_cuda_index_2d("p", "buffer_width", "idx"),
1289 "p[idx - buffer_width]"
1290 );
1291
1292 let east = StencilIntrinsic::East;
1293 assert_eq!(east.to_cuda_index_2d("p", "18", "idx"), "p[idx + 1]");
1294 }
1295
1296 #[test]
1297 fn test_stencil_offset() {
1298 assert_eq!(StencilIntrinsic::North.get_offset_2d(), Some((-1, 0)));
1299 assert_eq!(StencilIntrinsic::East.get_offset_2d(), Some((0, 1)));
1300 assert_eq!(StencilIntrinsic::Index.get_offset_2d(), Some((0, 0)));
1301 }
1302
1303 #[test]
1304 fn test_ring_kernel_intrinsic_lookup() {
1305 assert_eq!(
1306 RingKernelIntrinsic::from_name("is_active"),
1307 Some(RingKernelIntrinsic::IsActive)
1308 );
1309 assert_eq!(
1310 RingKernelIntrinsic::from_name("should_terminate"),
1311 Some(RingKernelIntrinsic::ShouldTerminate)
1312 );
1313 assert_eq!(
1314 RingKernelIntrinsic::from_name("hlc_tick"),
1315 Some(RingKernelIntrinsic::HlcTick)
1316 );
1317 assert_eq!(
1318 RingKernelIntrinsic::from_name("enqueue_response"),
1319 Some(RingKernelIntrinsic::EnqueueResponse)
1320 );
1321 assert_eq!(RingKernelIntrinsic::from_name("unknown"), None);
1322 }
1323
1324 #[test]
1325 fn test_ring_kernel_intrinsic_cuda_output() {
1326 assert!(RingKernelIntrinsic::IsActive
1327 .to_cuda(&[])
1328 .contains("is_active"));
1329 assert!(RingKernelIntrinsic::ShouldTerminate
1330 .to_cuda(&[])
1331 .contains("should_terminate"));
1332 assert!(RingKernelIntrinsic::HlcTick
1333 .to_cuda(&[])
1334 .contains("hlc_logical"));
1335 assert!(RingKernelIntrinsic::InputQueueEmpty
1336 .to_cuda(&[])
1337 .contains("input_head"));
1338 }
1339
1340 #[test]
1341 fn test_ring_kernel_queue_intrinsics() {
1342 let enqueue = RingKernelIntrinsic::EnqueueResponse;
1343 let cuda = enqueue.to_cuda(&["&response".to_string()]);
1344 assert!(cuda.contains("output_head"));
1345 assert!(cuda.contains("memcpy"));
1346 }
1347
1348 #[test]
1349 fn test_k2k_intrinsics() {
1350 let send = RingKernelIntrinsic::K2kSend;
1352 let cuda = send.to_cuda(&["target_id".to_string(), "&msg".to_string()]);
1353 assert!(cuda.contains("k2k_send"));
1354 assert!(cuda.contains("k2k_routes"));
1355 assert!(cuda.contains("target_id"));
1356
1357 assert_eq!(
1359 RingKernelIntrinsic::K2kTryRecv.to_cuda(&[]),
1360 "k2k_try_recv(k2k_inbox)"
1361 );
1362
1363 assert_eq!(
1365 RingKernelIntrinsic::K2kHasMessage.to_cuda(&[]),
1366 "k2k_has_message(k2k_inbox)"
1367 );
1368
1369 assert_eq!(
1371 RingKernelIntrinsic::K2kPeek.to_cuda(&[]),
1372 "k2k_peek(k2k_inbox)"
1373 );
1374
1375 assert_eq!(
1377 RingKernelIntrinsic::K2kPendingCount.to_cuda(&[]),
1378 "k2k_pending_count(k2k_inbox)"
1379 );
1380 }
1381
1382 #[test]
1383 fn test_k2k_intrinsic_lookup() {
1384 assert_eq!(
1385 RingKernelIntrinsic::from_name("k2k_send"),
1386 Some(RingKernelIntrinsic::K2kSend)
1387 );
1388 assert_eq!(
1389 RingKernelIntrinsic::from_name("k2k_try_recv"),
1390 Some(RingKernelIntrinsic::K2kTryRecv)
1391 );
1392 assert_eq!(
1393 RingKernelIntrinsic::from_name("k2k_has_message"),
1394 Some(RingKernelIntrinsic::K2kHasMessage)
1395 );
1396 assert_eq!(
1397 RingKernelIntrinsic::from_name("k2k_peek"),
1398 Some(RingKernelIntrinsic::K2kPeek)
1399 );
1400 assert_eq!(
1401 RingKernelIntrinsic::from_name("k2k_pending_count"),
1402 Some(RingKernelIntrinsic::K2kPendingCount)
1403 );
1404 }
1405
1406 #[test]
1407 fn test_intrinsic_requirements() {
1408 assert!(RingKernelIntrinsic::K2kSend.requires_k2k());
1410 assert!(RingKernelIntrinsic::K2kTryRecv.requires_k2k());
1411 assert!(RingKernelIntrinsic::K2kPeek.requires_k2k());
1412 assert!(!RingKernelIntrinsic::HlcTick.requires_k2k());
1413
1414 assert!(RingKernelIntrinsic::HlcTick.requires_hlc());
1416 assert!(RingKernelIntrinsic::HlcNow.requires_hlc());
1417 assert!(!RingKernelIntrinsic::K2kSend.requires_hlc());
1418
1419 assert!(RingKernelIntrinsic::IsActive.requires_control_block());
1421 assert!(RingKernelIntrinsic::EnqueueResponse.requires_control_block());
1422 assert!(!RingKernelIntrinsic::HlcTick.requires_control_block());
1423 }
1424
1425 #[test]
1428 fn test_new_atomic_intrinsics() {
1429 let registry = IntrinsicRegistry::new();
1430
1431 assert_eq!(
1433 registry.lookup("atomic_and"),
1434 Some(&GpuIntrinsic::AtomicAnd)
1435 );
1436 assert_eq!(registry.lookup("atomic_or"), Some(&GpuIntrinsic::AtomicOr));
1437 assert_eq!(
1438 registry.lookup("atomic_xor"),
1439 Some(&GpuIntrinsic::AtomicXor)
1440 );
1441 assert_eq!(
1442 registry.lookup("atomic_inc"),
1443 Some(&GpuIntrinsic::AtomicInc)
1444 );
1445 assert_eq!(
1446 registry.lookup("atomic_dec"),
1447 Some(&GpuIntrinsic::AtomicDec)
1448 );
1449
1450 assert_eq!(GpuIntrinsic::AtomicAnd.to_cuda_string(), "atomicAnd");
1452 assert_eq!(GpuIntrinsic::AtomicOr.to_cuda_string(), "atomicOr");
1453 assert_eq!(GpuIntrinsic::AtomicXor.to_cuda_string(), "atomicXor");
1454 assert_eq!(GpuIntrinsic::AtomicInc.to_cuda_string(), "atomicInc");
1455 assert_eq!(GpuIntrinsic::AtomicDec.to_cuda_string(), "atomicDec");
1456 }
1457
1458 #[test]
1459 fn test_trigonometric_intrinsics() {
1460 let registry = IntrinsicRegistry::new();
1461
1462 assert_eq!(registry.lookup("asin"), Some(&GpuIntrinsic::Asin));
1464 assert_eq!(registry.lookup("acos"), Some(&GpuIntrinsic::Acos));
1465 assert_eq!(registry.lookup("atan"), Some(&GpuIntrinsic::Atan));
1466 assert_eq!(registry.lookup("atan2"), Some(&GpuIntrinsic::Atan2));
1467
1468 assert_eq!(GpuIntrinsic::Asin.to_cuda_string(), "asinf");
1470 assert_eq!(GpuIntrinsic::Acos.to_cuda_string(), "acosf");
1471 assert_eq!(GpuIntrinsic::Atan.to_cuda_string(), "atanf");
1472 assert_eq!(GpuIntrinsic::Atan2.to_cuda_string(), "atan2f");
1473 }
1474
1475 #[test]
1476 fn test_hyperbolic_intrinsics() {
1477 let registry = IntrinsicRegistry::new();
1478
1479 assert_eq!(registry.lookup("sinh"), Some(&GpuIntrinsic::Sinh));
1481 assert_eq!(registry.lookup("cosh"), Some(&GpuIntrinsic::Cosh));
1482 assert_eq!(registry.lookup("tanh"), Some(&GpuIntrinsic::Tanh));
1483 assert_eq!(registry.lookup("asinh"), Some(&GpuIntrinsic::Asinh));
1484 assert_eq!(registry.lookup("acosh"), Some(&GpuIntrinsic::Acosh));
1485 assert_eq!(registry.lookup("atanh"), Some(&GpuIntrinsic::Atanh));
1486
1487 assert_eq!(GpuIntrinsic::Sinh.to_cuda_string(), "sinhf");
1489 assert_eq!(GpuIntrinsic::Cosh.to_cuda_string(), "coshf");
1490 assert_eq!(GpuIntrinsic::Tanh.to_cuda_string(), "tanhf");
1491 }
1492
1493 #[test]
1494 fn test_exponential_logarithmic_intrinsics() {
1495 let registry = IntrinsicRegistry::new();
1496
1497 assert_eq!(registry.lookup("exp2"), Some(&GpuIntrinsic::Exp2));
1499 assert_eq!(registry.lookup("exp10"), Some(&GpuIntrinsic::Exp10));
1500 assert_eq!(registry.lookup("expm1"), Some(&GpuIntrinsic::Expm1));
1501
1502 assert_eq!(registry.lookup("log2"), Some(&GpuIntrinsic::Log2));
1504 assert_eq!(registry.lookup("log10"), Some(&GpuIntrinsic::Log10));
1505 assert_eq!(registry.lookup("log1p"), Some(&GpuIntrinsic::Log1p));
1506
1507 assert_eq!(GpuIntrinsic::Exp2.to_cuda_string(), "exp2f");
1509 assert_eq!(GpuIntrinsic::Log2.to_cuda_string(), "log2f");
1510 assert_eq!(GpuIntrinsic::Log10.to_cuda_string(), "log10f");
1511 }
1512
1513 #[test]
1514 fn test_classification_intrinsics() {
1515 let registry = IntrinsicRegistry::new();
1516
1517 assert_eq!(registry.lookup("is_nan"), Some(&GpuIntrinsic::Isnan));
1519 assert_eq!(registry.lookup("isnan"), Some(&GpuIntrinsic::Isnan));
1520 assert_eq!(registry.lookup("is_infinite"), Some(&GpuIntrinsic::Isinf));
1521 assert_eq!(registry.lookup("is_finite"), Some(&GpuIntrinsic::Isfinite));
1522 assert_eq!(registry.lookup("is_normal"), Some(&GpuIntrinsic::Isnormal));
1523 assert_eq!(registry.lookup("signbit"), Some(&GpuIntrinsic::Signbit));
1524
1525 assert_eq!(GpuIntrinsic::Isnan.to_cuda_string(), "isnan");
1527 assert_eq!(GpuIntrinsic::Isinf.to_cuda_string(), "isinf");
1528 assert_eq!(GpuIntrinsic::Isfinite.to_cuda_string(), "isfinite");
1529 }
1530
1531 #[test]
1532 fn test_warp_reduce_intrinsics() {
1533 let registry = IntrinsicRegistry::new();
1534
1535 assert_eq!(
1537 registry.lookup("warp_reduce_add"),
1538 Some(&GpuIntrinsic::WarpReduceAdd)
1539 );
1540 assert_eq!(
1541 registry.lookup("warp_reduce_min"),
1542 Some(&GpuIntrinsic::WarpReduceMin)
1543 );
1544 assert_eq!(
1545 registry.lookup("warp_reduce_max"),
1546 Some(&GpuIntrinsic::WarpReduceMax)
1547 );
1548 assert_eq!(
1549 registry.lookup("warp_reduce_and"),
1550 Some(&GpuIntrinsic::WarpReduceAnd)
1551 );
1552 assert_eq!(
1553 registry.lookup("warp_reduce_or"),
1554 Some(&GpuIntrinsic::WarpReduceOr)
1555 );
1556 assert_eq!(
1557 registry.lookup("warp_reduce_xor"),
1558 Some(&GpuIntrinsic::WarpReduceXor)
1559 );
1560
1561 assert_eq!(
1563 GpuIntrinsic::WarpReduceAdd.to_cuda_string(),
1564 "__reduce_add_sync"
1565 );
1566 assert_eq!(
1567 GpuIntrinsic::WarpReduceMin.to_cuda_string(),
1568 "__reduce_min_sync"
1569 );
1570 assert_eq!(
1571 GpuIntrinsic::WarpReduceMax.to_cuda_string(),
1572 "__reduce_max_sync"
1573 );
1574 }
1575
1576 #[test]
1577 fn test_warp_match_intrinsics() {
1578 let registry = IntrinsicRegistry::new();
1579
1580 assert_eq!(
1581 registry.lookup("warp_match_any"),
1582 Some(&GpuIntrinsic::WarpMatchAny)
1583 );
1584 assert_eq!(
1585 registry.lookup("warp_match_all"),
1586 Some(&GpuIntrinsic::WarpMatchAll)
1587 );
1588
1589 assert_eq!(
1590 GpuIntrinsic::WarpMatchAny.to_cuda_string(),
1591 "__match_any_sync"
1592 );
1593 assert_eq!(
1594 GpuIntrinsic::WarpMatchAll.to_cuda_string(),
1595 "__match_all_sync"
1596 );
1597 }
1598
1599 #[test]
1600 fn test_bit_manipulation_intrinsics() {
1601 let registry = IntrinsicRegistry::new();
1602
1603 assert_eq!(registry.lookup("popc"), Some(&GpuIntrinsic::Popc));
1605 assert_eq!(registry.lookup("popcount"), Some(&GpuIntrinsic::Popc));
1606 assert_eq!(registry.lookup("count_ones"), Some(&GpuIntrinsic::Popc));
1607 assert_eq!(registry.lookup("clz"), Some(&GpuIntrinsic::Clz));
1608 assert_eq!(registry.lookup("leading_zeros"), Some(&GpuIntrinsic::Clz));
1609 assert_eq!(registry.lookup("ctz"), Some(&GpuIntrinsic::Ctz));
1610 assert_eq!(registry.lookup("ffs"), Some(&GpuIntrinsic::Ffs));
1611 assert_eq!(registry.lookup("brev"), Some(&GpuIntrinsic::Brev));
1612 assert_eq!(registry.lookup("reverse_bits"), Some(&GpuIntrinsic::Brev));
1613
1614 assert_eq!(GpuIntrinsic::Popc.to_cuda_string(), "__popc");
1616 assert_eq!(GpuIntrinsic::Clz.to_cuda_string(), "__clz");
1617 assert_eq!(GpuIntrinsic::Ffs.to_cuda_string(), "__ffs");
1618 assert_eq!(GpuIntrinsic::Brev.to_cuda_string(), "__brev");
1619 }
1620
1621 #[test]
1622 fn test_funnel_shift_intrinsics() {
1623 let registry = IntrinsicRegistry::new();
1624
1625 assert_eq!(
1626 registry.lookup("funnel_shift_left"),
1627 Some(&GpuIntrinsic::FunnelShiftLeft)
1628 );
1629 assert_eq!(
1630 registry.lookup("funnel_shift_right"),
1631 Some(&GpuIntrinsic::FunnelShiftRight)
1632 );
1633
1634 assert_eq!(
1635 GpuIntrinsic::FunnelShiftLeft.to_cuda_string(),
1636 "__funnelshift_l"
1637 );
1638 assert_eq!(
1639 GpuIntrinsic::FunnelShiftRight.to_cuda_string(),
1640 "__funnelshift_r"
1641 );
1642 }
1643
1644 #[test]
1645 fn test_memory_intrinsics() {
1646 let registry = IntrinsicRegistry::new();
1647
1648 assert_eq!(registry.lookup("ldg"), Some(&GpuIntrinsic::Ldg));
1649 assert_eq!(registry.lookup("load_global"), Some(&GpuIntrinsic::Ldg));
1650 assert_eq!(
1651 registry.lookup("prefetch_l1"),
1652 Some(&GpuIntrinsic::PrefetchL1)
1653 );
1654 assert_eq!(
1655 registry.lookup("prefetch_l2"),
1656 Some(&GpuIntrinsic::PrefetchL2)
1657 );
1658
1659 assert_eq!(GpuIntrinsic::Ldg.to_cuda_string(), "__ldg");
1660 assert_eq!(GpuIntrinsic::PrefetchL1.to_cuda_string(), "__prefetch_l1");
1661 assert_eq!(GpuIntrinsic::PrefetchL2.to_cuda_string(), "__prefetch_l2");
1662 }
1663
1664 #[test]
1665 fn test_clock_intrinsics() {
1666 let registry = IntrinsicRegistry::new();
1667
1668 assert_eq!(registry.lookup("clock"), Some(&GpuIntrinsic::Clock));
1669 assert_eq!(registry.lookup("clock64"), Some(&GpuIntrinsic::Clock64));
1670 assert_eq!(registry.lookup("nanosleep"), Some(&GpuIntrinsic::Nanosleep));
1671
1672 assert_eq!(GpuIntrinsic::Clock.to_cuda_string(), "clock()");
1673 assert_eq!(GpuIntrinsic::Clock64.to_cuda_string(), "clock64()");
1674 assert_eq!(GpuIntrinsic::Nanosleep.to_cuda_string(), "__nanosleep");
1675 }
1676
1677 #[test]
1678 fn test_special_function_intrinsics() {
1679 let registry = IntrinsicRegistry::new();
1680
1681 assert_eq!(registry.lookup("rcp"), Some(&GpuIntrinsic::Rcp));
1682 assert_eq!(registry.lookup("recip"), Some(&GpuIntrinsic::Rcp));
1683 assert_eq!(registry.lookup("saturate"), Some(&GpuIntrinsic::Saturate));
1684 assert_eq!(registry.lookup("clamp_01"), Some(&GpuIntrinsic::Saturate));
1685
1686 assert_eq!(GpuIntrinsic::Rcp.to_cuda_string(), "__frcp_rn");
1687 assert_eq!(GpuIntrinsic::Saturate.to_cuda_string(), "__saturatef");
1688 }
1689
1690 #[test]
1691 fn test_intrinsic_categories() {
1692 assert_eq!(GpuIntrinsic::SyncThreads.category(), "synchronization");
1694 assert_eq!(GpuIntrinsic::AtomicAdd.category(), "atomic");
1695 assert_eq!(GpuIntrinsic::Sqrt.category(), "math");
1696 assert_eq!(GpuIntrinsic::Sin.category(), "trigonometric");
1697 assert_eq!(GpuIntrinsic::Sinh.category(), "hyperbolic");
1698 assert_eq!(GpuIntrinsic::Exp.category(), "exponential");
1699 assert_eq!(GpuIntrinsic::Isnan.category(), "classification");
1700 assert_eq!(GpuIntrinsic::WarpShfl.category(), "warp");
1701 assert_eq!(GpuIntrinsic::Popc.category(), "bit");
1702 assert_eq!(GpuIntrinsic::Ldg.category(), "memory");
1703 assert_eq!(GpuIntrinsic::Rcp.category(), "special");
1704 assert_eq!(GpuIntrinsic::ThreadIdxX.category(), "index");
1705 assert_eq!(GpuIntrinsic::Clock.category(), "timing");
1706 }
1707
1708 #[test]
1709 fn test_intrinsic_flags() {
1710 assert!(GpuIntrinsic::ThreadIdxX.is_value_intrinsic());
1712 assert!(GpuIntrinsic::BlockDimX.is_value_intrinsic());
1713 assert!(GpuIntrinsic::WarpSize.is_value_intrinsic());
1714 assert!(!GpuIntrinsic::Sin.is_value_intrinsic());
1715 assert!(!GpuIntrinsic::AtomicAdd.is_value_intrinsic());
1716
1717 assert!(GpuIntrinsic::SyncThreads.is_zero_arg_function());
1719 assert!(GpuIntrinsic::ThreadFence.is_zero_arg_function());
1720 assert!(GpuIntrinsic::WarpActiveMask.is_zero_arg_function());
1721 assert!(GpuIntrinsic::Clock.is_zero_arg_function());
1722 assert!(!GpuIntrinsic::Sin.is_zero_arg_function());
1723
1724 assert!(GpuIntrinsic::WarpShfl.requires_mask());
1726 assert!(GpuIntrinsic::WarpBallot.requires_mask());
1727 assert!(GpuIntrinsic::WarpReduceAdd.requires_mask());
1728 assert!(!GpuIntrinsic::Sin.requires_mask());
1729 assert!(!GpuIntrinsic::AtomicAdd.requires_mask());
1730 }
1731
1732 #[test]
1733 fn test_3d_stencil_intrinsics() {
1734 assert_eq!(
1735 StencilIntrinsic::from_method_name("up"),
1736 Some(StencilIntrinsic::Up)
1737 );
1738 assert_eq!(
1739 StencilIntrinsic::from_method_name("down"),
1740 Some(StencilIntrinsic::Down)
1741 );
1742
1743 assert!(StencilIntrinsic::Up.is_3d_only());
1745 assert!(StencilIntrinsic::Down.is_3d_only());
1746 assert!(!StencilIntrinsic::North.is_3d_only());
1747 assert!(!StencilIntrinsic::East.is_3d_only());
1748 assert!(!StencilIntrinsic::Index.is_3d_only());
1749
1750 assert_eq!(StencilIntrinsic::Up.get_offset_3d(), Some((-1, 0, 0)));
1752 assert_eq!(StencilIntrinsic::Down.get_offset_3d(), Some((1, 0, 0)));
1753 assert_eq!(StencilIntrinsic::North.get_offset_3d(), Some((0, -1, 0)));
1754 assert_eq!(StencilIntrinsic::South.get_offset_3d(), Some((0, 1, 0)));
1755 assert_eq!(StencilIntrinsic::East.get_offset_3d(), Some((0, 0, 1)));
1756 assert_eq!(StencilIntrinsic::West.get_offset_3d(), Some((0, 0, -1)));
1757
1758 let up = StencilIntrinsic::Up;
1760 assert_eq!(up.to_cuda_index_3d("p", "18", "324", "idx"), "p[idx - 324]");
1761
1762 let down = StencilIntrinsic::Down;
1763 assert_eq!(
1764 down.to_cuda_index_3d("p", "18", "324", "idx"),
1765 "p[idx + 324]"
1766 );
1767 }
1768
1769 #[test]
1770 fn test_sync_intrinsics() {
1771 let registry = IntrinsicRegistry::new();
1772
1773 assert_eq!(
1774 registry.lookup("sync_threads_count"),
1775 Some(&GpuIntrinsic::SyncThreadsCount)
1776 );
1777 assert_eq!(
1778 registry.lookup("sync_threads_and"),
1779 Some(&GpuIntrinsic::SyncThreadsAnd)
1780 );
1781 assert_eq!(
1782 registry.lookup("sync_threads_or"),
1783 Some(&GpuIntrinsic::SyncThreadsOr)
1784 );
1785
1786 assert_eq!(
1787 GpuIntrinsic::SyncThreadsCount.to_cuda_string(),
1788 "__syncthreads_count"
1789 );
1790 assert_eq!(
1791 GpuIntrinsic::SyncThreadsAnd.to_cuda_string(),
1792 "__syncthreads_and"
1793 );
1794 assert_eq!(
1795 GpuIntrinsic::SyncThreadsOr.to_cuda_string(),
1796 "__syncthreads_or"
1797 );
1798 }
1799
1800 #[test]
1801 fn test_math_extras() {
1802 let registry = IntrinsicRegistry::new();
1803
1804 assert_eq!(registry.lookup("trunc"), Some(&GpuIntrinsic::Trunc));
1805 assert_eq!(registry.lookup("cbrt"), Some(&GpuIntrinsic::Cbrt));
1806 assert_eq!(registry.lookup("hypot"), Some(&GpuIntrinsic::Hypot));
1807 assert_eq!(registry.lookup("copysign"), Some(&GpuIntrinsic::Copysign));
1808 assert_eq!(registry.lookup("fmod"), Some(&GpuIntrinsic::Fmod));
1809
1810 assert_eq!(GpuIntrinsic::Trunc.to_cuda_string(), "truncf");
1811 assert_eq!(GpuIntrinsic::Cbrt.to_cuda_string(), "cbrtf");
1812 assert_eq!(GpuIntrinsic::Hypot.to_cuda_string(), "hypotf");
1813 }
1814}