use std::collections::HashMap;
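/// GPU intrinsics recognized by the transpiler, grouped below by category
/// (synchronization, atomics, math, warp operations, indexing, reductions, and so on).
/// Each variant maps to its CUDA spelling via [`GpuIntrinsic::to_cuda_string`].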
#[derive(Debug, Clone, PartialEq)]
pub enum GpuIntrinsic {
    // Synchronization and memory fences
    SyncThreads,
    ThreadFence,
    ThreadFenceBlock,
    ThreadFenceSystem,
    SyncThreadsCount,
    SyncThreadsAnd,
    SyncThreadsOr,

    // Atomic operations
    AtomicAdd,
    AtomicSub,
    AtomicMin,
    AtomicMax,
    AtomicExch,
    AtomicCas,
    AtomicAnd,
    AtomicOr,
    AtomicXor,
    AtomicInc,
    AtomicDec,

    // Basic math
    Sqrt,
    Rsqrt,
    Abs,
    Fabs,
    Floor,
    Ceil,
    Round,
    Trunc,
    Fma,
    Min,
    Max,
    Fmod,
    Remainder,
    Copysign,
    Cbrt,
    Hypot,

    // Trigonometric functions
    Sin,
    Cos,
    Tan,
    Asin,
    Acos,
    Atan,
    Atan2,
    Sincos,
    Sinpi,
    Cospi,

    // Hyperbolic functions
    Sinh,
    Cosh,
    Tanh,
    Asinh,
    Acosh,
    Atanh,

    // Exponential, logarithmic, gamma, and error functions
    Exp,
    Exp2,
    Exp10,
    Expm1,
    Log,
    Log2,
    Log10,
    Log1p,
    Pow,
    Ldexp,
    Scalbn,
    Ilogb,
    Lgamma,
    Tgamma,
    Erf,
    Erfc,
    Erfinv,
    Erfcinv,

    // Floating-point classification and manipulation
    Isnan,
    Isinf,
    Isfinite,
    Isnormal,
    Signbit,
    Nextafter,
    Fdim,
    Nan,

    // Warp-level primitives
    WarpShfl,
    WarpShflUp,
    WarpShflDown,
    WarpShflXor,
    WarpActiveMask,
    WarpBallot,
    WarpAll,
    WarpAny,
    WarpMatchAny,
    WarpMatchAll,
    WarpReduceAdd,
    WarpReduceMin,
    WarpReduceMax,
    WarpReduceAnd,
    WarpReduceOr,
    WarpReduceXor,

    // Bit manipulation
    Popc,
    Clz,
    Ctz,
    Ffs,
    Brev,
    BytePerm,
    FunnelShiftLeft,
    FunnelShiftRight,

    // Memory access hints
    Ldg,
    PrefetchL1,
    PrefetchL2,

    // Fast device math and special functions
    Rcp,
    Fdividef,
    Saturate,
    J0,
    J1,
    Jn,
    Y0,
    Y1,
    Yn,
    Normcdf,
    Normcdfinv,
    CylBesselI0,
    CylBesselI1,

    // Thread/block/grid indexing values
    ThreadIdxX,
    ThreadIdxY,
    ThreadIdxZ,
    BlockIdxX,
    BlockIdxY,
    BlockIdxZ,
    BlockDimX,
    BlockDimY,
    BlockDimZ,
    GridDimX,
    GridDimY,
    GridDimZ,
    WarpSize,

    // Timing
    Clock,
    Clock64,
    Nanosleep,

    // Block-level reductions
    BlockReduceSum,
    BlockReduceMin,
    BlockReduceMax,
    BlockReduceAnd,
    BlockReduceOr,

    // Grid-level reductions
    GridReduceSum,
    GridReduceMin,
    GridReduceMax,

    // Combined reduce-and-broadcast
    ReduceAndBroadcast,
}

impl GpuIntrinsic {
    /// Returns the CUDA spelling of this intrinsic.
    pub fn to_cuda_string(&self) -> &'static str {
        match self {
            GpuIntrinsic::SyncThreads => "__syncthreads()",
            GpuIntrinsic::ThreadFence => "__threadfence()",
            GpuIntrinsic::ThreadFenceBlock => "__threadfence_block()",
            GpuIntrinsic::ThreadFenceSystem => "__threadfence_system()",
            GpuIntrinsic::SyncThreadsCount => "__syncthreads_count",
            GpuIntrinsic::SyncThreadsAnd => "__syncthreads_and",
            GpuIntrinsic::SyncThreadsOr => "__syncthreads_or",

            GpuIntrinsic::AtomicAdd => "atomicAdd",
            GpuIntrinsic::AtomicSub => "atomicSub",
            GpuIntrinsic::AtomicMin => "atomicMin",
            GpuIntrinsic::AtomicMax => "atomicMax",
            GpuIntrinsic::AtomicExch => "atomicExch",
            GpuIntrinsic::AtomicCas => "atomicCAS",
            GpuIntrinsic::AtomicAnd => "atomicAnd",
            GpuIntrinsic::AtomicOr => "atomicOr",
            GpuIntrinsic::AtomicXor => "atomicXor",
            GpuIntrinsic::AtomicInc => "atomicInc",
            GpuIntrinsic::AtomicDec => "atomicDec",

            GpuIntrinsic::Sqrt => "sqrtf",
            GpuIntrinsic::Rsqrt => "rsqrtf",
            GpuIntrinsic::Abs => "abs",
            GpuIntrinsic::Fabs => "fabsf",
            GpuIntrinsic::Floor => "floorf",
            GpuIntrinsic::Ceil => "ceilf",
            GpuIntrinsic::Round => "roundf",
            GpuIntrinsic::Trunc => "truncf",
            GpuIntrinsic::Fma => "fmaf",
            GpuIntrinsic::Min => "fminf",
            GpuIntrinsic::Max => "fmaxf",
            GpuIntrinsic::Fmod => "fmodf",
            GpuIntrinsic::Remainder => "remainderf",
            GpuIntrinsic::Copysign => "copysignf",
            GpuIntrinsic::Cbrt => "cbrtf",
            GpuIntrinsic::Hypot => "hypotf",

            GpuIntrinsic::Sin => "sinf",
            GpuIntrinsic::Cos => "cosf",
            GpuIntrinsic::Tan => "tanf",
            GpuIntrinsic::Asin => "asinf",
            GpuIntrinsic::Acos => "acosf",
            GpuIntrinsic::Atan => "atanf",
            GpuIntrinsic::Atan2 => "atan2f",
            GpuIntrinsic::Sincos => "sincosf",
            GpuIntrinsic::Sinpi => "sinpif",
            GpuIntrinsic::Cospi => "cospif",

            GpuIntrinsic::Sinh => "sinhf",
            GpuIntrinsic::Cosh => "coshf",
            GpuIntrinsic::Tanh => "tanhf",
            GpuIntrinsic::Asinh => "asinhf",
            GpuIntrinsic::Acosh => "acoshf",
            GpuIntrinsic::Atanh => "atanhf",

            GpuIntrinsic::Exp => "expf",
            GpuIntrinsic::Exp2 => "exp2f",
            GpuIntrinsic::Exp10 => "exp10f",
            GpuIntrinsic::Expm1 => "expm1f",
            GpuIntrinsic::Log => "logf",
            GpuIntrinsic::Log2 => "log2f",
            GpuIntrinsic::Log10 => "log10f",
            GpuIntrinsic::Log1p => "log1pf",
            GpuIntrinsic::Pow => "powf",
            GpuIntrinsic::Ldexp => "ldexpf",
            GpuIntrinsic::Scalbn => "scalbnf",
            GpuIntrinsic::Ilogb => "ilogbf",
            GpuIntrinsic::Lgamma => "lgammaf",
            GpuIntrinsic::Tgamma => "tgammaf",
            GpuIntrinsic::Erf => "erff",
            GpuIntrinsic::Erfc => "erfcf",
            GpuIntrinsic::Erfinv => "erfinvf",
            GpuIntrinsic::Erfcinv => "erfcinvf",

            GpuIntrinsic::Isnan => "isnan",
            GpuIntrinsic::Isinf => "isinf",
            GpuIntrinsic::Isfinite => "isfinite",
            GpuIntrinsic::Isnormal => "isnormal",
            GpuIntrinsic::Signbit => "signbit",
            GpuIntrinsic::Nextafter => "nextafterf",
            GpuIntrinsic::Fdim => "fdimf",
            GpuIntrinsic::Nan => "nanf",

            GpuIntrinsic::WarpShfl => "__shfl_sync",
            GpuIntrinsic::WarpShflUp => "__shfl_up_sync",
            GpuIntrinsic::WarpShflDown => "__shfl_down_sync",
            GpuIntrinsic::WarpShflXor => "__shfl_xor_sync",
            GpuIntrinsic::WarpActiveMask => "__activemask()",
            GpuIntrinsic::WarpBallot => "__ballot_sync",
            GpuIntrinsic::WarpAll => "__all_sync",
            GpuIntrinsic::WarpAny => "__any_sync",
            GpuIntrinsic::WarpMatchAny => "__match_any_sync",
            GpuIntrinsic::WarpMatchAll => "__match_all_sync",
            GpuIntrinsic::WarpReduceAdd => "__reduce_add_sync",
            GpuIntrinsic::WarpReduceMin => "__reduce_min_sync",
            GpuIntrinsic::WarpReduceMax => "__reduce_max_sync",
            GpuIntrinsic::WarpReduceAnd => "__reduce_and_sync",
            GpuIntrinsic::WarpReduceOr => "__reduce_or_sync",
            GpuIntrinsic::WarpReduceXor => "__reduce_xor_sync",

            GpuIntrinsic::Popc => "__popc",
            GpuIntrinsic::Clz => "__clz",
            // CUDA has no dedicated ctz builtin; __ffs (1-based find-first-set) is the closest.
            GpuIntrinsic::Ctz => "__ffs",
            GpuIntrinsic::Ffs => "__ffs",
            GpuIntrinsic::Brev => "__brev",
            GpuIntrinsic::BytePerm => "__byte_perm",
            GpuIntrinsic::FunnelShiftLeft => "__funnelshift_l",
            GpuIntrinsic::FunnelShiftRight => "__funnelshift_r",

            GpuIntrinsic::Ldg => "__ldg",
            GpuIntrinsic::PrefetchL1 => "__prefetch_l1",
            GpuIntrinsic::PrefetchL2 => "__prefetch_l2",

            GpuIntrinsic::Rcp => "__frcp_rn",
            GpuIntrinsic::Fdividef => "__fdividef",
            GpuIntrinsic::Saturate => "__saturatef",
            GpuIntrinsic::J0 => "j0f",
            GpuIntrinsic::J1 => "j1f",
            GpuIntrinsic::Jn => "jnf",
            GpuIntrinsic::Y0 => "y0f",
            GpuIntrinsic::Y1 => "y1f",
            GpuIntrinsic::Yn => "ynf",
            GpuIntrinsic::Normcdf => "normcdff",
            GpuIntrinsic::Normcdfinv => "normcdfinvf",
            GpuIntrinsic::CylBesselI0 => "cyl_bessel_i0f",
            GpuIntrinsic::CylBesselI1 => "cyl_bessel_i1f",

            GpuIntrinsic::ThreadIdxX => "threadIdx.x",
            GpuIntrinsic::ThreadIdxY => "threadIdx.y",
            GpuIntrinsic::ThreadIdxZ => "threadIdx.z",
            GpuIntrinsic::BlockIdxX => "blockIdx.x",
            GpuIntrinsic::BlockIdxY => "blockIdx.y",
            GpuIntrinsic::BlockIdxZ => "blockIdx.z",
            GpuIntrinsic::BlockDimX => "blockDim.x",
            GpuIntrinsic::BlockDimY => "blockDim.y",
            GpuIntrinsic::BlockDimZ => "blockDim.z",
            GpuIntrinsic::GridDimX => "gridDim.x",
            GpuIntrinsic::GridDimY => "gridDim.y",
            GpuIntrinsic::GridDimZ => "gridDim.z",
            GpuIntrinsic::WarpSize => "warpSize",

            GpuIntrinsic::Clock => "clock()",
            GpuIntrinsic::Clock64 => "clock64()",
            GpuIntrinsic::Nanosleep => "__nanosleep",

            GpuIntrinsic::BlockReduceSum => "__block_reduce_sum",
            GpuIntrinsic::BlockReduceMin => "__block_reduce_min",
            GpuIntrinsic::BlockReduceMax => "__block_reduce_max",
            GpuIntrinsic::BlockReduceAnd => "__block_reduce_and",
            GpuIntrinsic::BlockReduceOr => "__block_reduce_or",

            GpuIntrinsic::GridReduceSum => "__grid_reduce_sum",
            GpuIntrinsic::GridReduceMin => "__grid_reduce_min",
            GpuIntrinsic::GridReduceMax => "__grid_reduce_max",

            GpuIntrinsic::ReduceAndBroadcast => "__reduce_and_broadcast",
        }
    }

    /// True if the intrinsic is read as a bare value (e.g. `threadIdx.x`) rather than called.
    pub fn is_value_intrinsic(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::ThreadIdxX
                | GpuIntrinsic::ThreadIdxY
                | GpuIntrinsic::ThreadIdxZ
                | GpuIntrinsic::BlockIdxX
                | GpuIntrinsic::BlockIdxY
                | GpuIntrinsic::BlockIdxZ
                | GpuIntrinsic::BlockDimX
                | GpuIntrinsic::BlockDimY
                | GpuIntrinsic::BlockDimZ
                | GpuIntrinsic::GridDimX
                | GpuIntrinsic::GridDimY
                | GpuIntrinsic::GridDimZ
                | GpuIntrinsic::WarpSize
        )
    }

    /// True if the intrinsic is emitted as a call that takes no arguments.
    pub fn is_zero_arg_function(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::SyncThreads
                | GpuIntrinsic::ThreadFence
                | GpuIntrinsic::ThreadFenceBlock
                | GpuIntrinsic::ThreadFenceSystem
                | GpuIntrinsic::WarpActiveMask
                | GpuIntrinsic::Clock
                | GpuIntrinsic::Clock64
        )
    }

    /// True if the CUDA form takes a warp mask as its first argument (the `*_sync` builtins).
    pub fn requires_mask(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::WarpShfl
                | GpuIntrinsic::WarpShflUp
                | GpuIntrinsic::WarpShflDown
                | GpuIntrinsic::WarpShflXor
                | GpuIntrinsic::WarpBallot
                | GpuIntrinsic::WarpAll
                | GpuIntrinsic::WarpAny
                | GpuIntrinsic::WarpMatchAny
                | GpuIntrinsic::WarpMatchAll
                | GpuIntrinsic::WarpReduceAdd
                | GpuIntrinsic::WarpReduceMin
                | GpuIntrinsic::WarpReduceMax
                | GpuIntrinsic::WarpReduceAnd
                | GpuIntrinsic::WarpReduceOr
                | GpuIntrinsic::WarpReduceXor
        )
    }

    /// Coarse category name for this intrinsic (e.g. "atomic", "warp", "reduction").
    pub fn category(&self) -> &'static str {
        match self {
            GpuIntrinsic::SyncThreads
            | GpuIntrinsic::ThreadFence
            | GpuIntrinsic::ThreadFenceBlock
            | GpuIntrinsic::ThreadFenceSystem
            | GpuIntrinsic::SyncThreadsCount
            | GpuIntrinsic::SyncThreadsAnd
            | GpuIntrinsic::SyncThreadsOr => "synchronization",

            GpuIntrinsic::AtomicAdd
            | GpuIntrinsic::AtomicSub
            | GpuIntrinsic::AtomicMin
            | GpuIntrinsic::AtomicMax
            | GpuIntrinsic::AtomicExch
            | GpuIntrinsic::AtomicCas
            | GpuIntrinsic::AtomicAnd
            | GpuIntrinsic::AtomicOr
            | GpuIntrinsic::AtomicXor
            | GpuIntrinsic::AtomicInc
            | GpuIntrinsic::AtomicDec => "atomic",

            GpuIntrinsic::Sqrt
            | GpuIntrinsic::Rsqrt
            | GpuIntrinsic::Abs
            | GpuIntrinsic::Fabs
            | GpuIntrinsic::Floor
            | GpuIntrinsic::Ceil
            | GpuIntrinsic::Round
            | GpuIntrinsic::Trunc
            | GpuIntrinsic::Fma
            | GpuIntrinsic::Min
            | GpuIntrinsic::Max
            | GpuIntrinsic::Fmod
            | GpuIntrinsic::Remainder
            | GpuIntrinsic::Copysign
            | GpuIntrinsic::Cbrt
            | GpuIntrinsic::Hypot => "math",

            GpuIntrinsic::Sin
            | GpuIntrinsic::Cos
            | GpuIntrinsic::Tan
            | GpuIntrinsic::Asin
            | GpuIntrinsic::Acos
            | GpuIntrinsic::Atan
            | GpuIntrinsic::Atan2
            | GpuIntrinsic::Sincos
            | GpuIntrinsic::Sinpi
            | GpuIntrinsic::Cospi => "trigonometric",

            GpuIntrinsic::Sinh
            | GpuIntrinsic::Cosh
            | GpuIntrinsic::Tanh
            | GpuIntrinsic::Asinh
            | GpuIntrinsic::Acosh
            | GpuIntrinsic::Atanh => "hyperbolic",

            GpuIntrinsic::Exp
            | GpuIntrinsic::Exp2
            | GpuIntrinsic::Exp10
            | GpuIntrinsic::Expm1
            | GpuIntrinsic::Log
            | GpuIntrinsic::Log2
            | GpuIntrinsic::Log10
            | GpuIntrinsic::Log1p
            | GpuIntrinsic::Pow
            | GpuIntrinsic::Ldexp
            | GpuIntrinsic::Scalbn
            | GpuIntrinsic::Ilogb
            | GpuIntrinsic::Lgamma
            | GpuIntrinsic::Tgamma
            | GpuIntrinsic::Erf
            | GpuIntrinsic::Erfc
            | GpuIntrinsic::Erfinv
            | GpuIntrinsic::Erfcinv => "exponential",

            GpuIntrinsic::Isnan
            | GpuIntrinsic::Isinf
            | GpuIntrinsic::Isfinite
            | GpuIntrinsic::Isnormal
            | GpuIntrinsic::Signbit
            | GpuIntrinsic::Nextafter
            | GpuIntrinsic::Fdim
            | GpuIntrinsic::Nan => "classification",

            GpuIntrinsic::WarpShfl
            | GpuIntrinsic::WarpShflUp
            | GpuIntrinsic::WarpShflDown
            | GpuIntrinsic::WarpShflXor
            | GpuIntrinsic::WarpActiveMask
            | GpuIntrinsic::WarpBallot
            | GpuIntrinsic::WarpAll
            | GpuIntrinsic::WarpAny
            | GpuIntrinsic::WarpMatchAny
            | GpuIntrinsic::WarpMatchAll
            | GpuIntrinsic::WarpReduceAdd
            | GpuIntrinsic::WarpReduceMin
            | GpuIntrinsic::WarpReduceMax
            | GpuIntrinsic::WarpReduceAnd
            | GpuIntrinsic::WarpReduceOr
            | GpuIntrinsic::WarpReduceXor => "warp",

            GpuIntrinsic::Popc
            | GpuIntrinsic::Clz
            | GpuIntrinsic::Ctz
            | GpuIntrinsic::Ffs
            | GpuIntrinsic::Brev
            | GpuIntrinsic::BytePerm
            | GpuIntrinsic::FunnelShiftLeft
            | GpuIntrinsic::FunnelShiftRight => "bit",

            GpuIntrinsic::Ldg | GpuIntrinsic::PrefetchL1 | GpuIntrinsic::PrefetchL2 => "memory",

            GpuIntrinsic::Rcp
            | GpuIntrinsic::Fdividef
            | GpuIntrinsic::Saturate
            | GpuIntrinsic::J0
            | GpuIntrinsic::J1
            | GpuIntrinsic::Jn
            | GpuIntrinsic::Y0
            | GpuIntrinsic::Y1
            | GpuIntrinsic::Yn
            | GpuIntrinsic::Normcdf
            | GpuIntrinsic::Normcdfinv
            | GpuIntrinsic::CylBesselI0
            | GpuIntrinsic::CylBesselI1 => "special",

            GpuIntrinsic::ThreadIdxX
            | GpuIntrinsic::ThreadIdxY
            | GpuIntrinsic::ThreadIdxZ
            | GpuIntrinsic::BlockIdxX
            | GpuIntrinsic::BlockIdxY
            | GpuIntrinsic::BlockIdxZ
            | GpuIntrinsic::BlockDimX
            | GpuIntrinsic::BlockDimY
            | GpuIntrinsic::BlockDimZ
            | GpuIntrinsic::GridDimX
            | GpuIntrinsic::GridDimY
            | GpuIntrinsic::GridDimZ
            | GpuIntrinsic::WarpSize => "index",

            GpuIntrinsic::Clock | GpuIntrinsic::Clock64 | GpuIntrinsic::Nanosleep => "timing",

            GpuIntrinsic::BlockReduceSum
            | GpuIntrinsic::BlockReduceMin
            | GpuIntrinsic::BlockReduceMax
            | GpuIntrinsic::BlockReduceAnd
            | GpuIntrinsic::BlockReduceOr
            | GpuIntrinsic::GridReduceSum
            | GpuIntrinsic::GridReduceMin
            | GpuIntrinsic::GridReduceMax
            | GpuIntrinsic::ReduceAndBroadcast => "reduction",
        }
    }
}
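/// Registry mapping Rust-level function names (including aliases such as
/// `"count_ones"` or `"is_nan"`) to [`GpuIntrinsic`] variants.
///
/// A minimal lookup sketch (illustrative only; the import path depends on how
/// this module is re-exported, so the snippet is not compiled as a doctest):
///
/// ```ignore
/// let registry = IntrinsicRegistry::new();
/// assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
/// assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
/// ```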
#[derive(Debug)]
pub struct IntrinsicRegistry {
    mappings: HashMap<String, GpuIntrinsic>,
}

impl Default for IntrinsicRegistry {
    fn default() -> Self {
        Self::new()
    }
}
impl IntrinsicRegistry {
    /// Builds a registry pre-populated with all built-in name mappings.
    pub fn new() -> Self {
        let mut mappings = HashMap::new();

        // Synchronization and memory fences
        mappings.insert("sync_threads".to_string(), GpuIntrinsic::SyncThreads);
        mappings.insert("thread_fence".to_string(), GpuIntrinsic::ThreadFence);
        mappings.insert(
            "thread_fence_block".to_string(),
            GpuIntrinsic::ThreadFenceBlock,
        );
        mappings.insert(
            "thread_fence_system".to_string(),
            GpuIntrinsic::ThreadFenceSystem,
        );
        mappings.insert(
            "sync_threads_count".to_string(),
            GpuIntrinsic::SyncThreadsCount,
        );
        mappings.insert("sync_threads_and".to_string(), GpuIntrinsic::SyncThreadsAnd);
        mappings.insert("sync_threads_or".to_string(), GpuIntrinsic::SyncThreadsOr);

        // Atomic operations
        mappings.insert("atomic_add".to_string(), GpuIntrinsic::AtomicAdd);
        mappings.insert("atomic_sub".to_string(), GpuIntrinsic::AtomicSub);
        mappings.insert("atomic_min".to_string(), GpuIntrinsic::AtomicMin);
        mappings.insert("atomic_max".to_string(), GpuIntrinsic::AtomicMax);
        mappings.insert("atomic_exchange".to_string(), GpuIntrinsic::AtomicExch);
        mappings.insert("atomic_exch".to_string(), GpuIntrinsic::AtomicExch);
        mappings.insert("atomic_cas".to_string(), GpuIntrinsic::AtomicCas);
        mappings.insert("atomic_compare_swap".to_string(), GpuIntrinsic::AtomicCas);
        mappings.insert("atomic_and".to_string(), GpuIntrinsic::AtomicAnd);
        mappings.insert("atomic_or".to_string(), GpuIntrinsic::AtomicOr);
        mappings.insert("atomic_xor".to_string(), GpuIntrinsic::AtomicXor);
        mappings.insert("atomic_inc".to_string(), GpuIntrinsic::AtomicInc);
        mappings.insert("atomic_dec".to_string(), GpuIntrinsic::AtomicDec);

        // Basic math (note that `abs` resolves to the float `Fabs` intrinsic)
        mappings.insert("sqrt".to_string(), GpuIntrinsic::Sqrt);
        mappings.insert("rsqrt".to_string(), GpuIntrinsic::Rsqrt);
        mappings.insert("abs".to_string(), GpuIntrinsic::Fabs);
        mappings.insert("fabs".to_string(), GpuIntrinsic::Fabs);
        mappings.insert("floor".to_string(), GpuIntrinsic::Floor);
        mappings.insert("ceil".to_string(), GpuIntrinsic::Ceil);
        mappings.insert("round".to_string(), GpuIntrinsic::Round);
        mappings.insert("trunc".to_string(), GpuIntrinsic::Trunc);
        mappings.insert("mul_add".to_string(), GpuIntrinsic::Fma);
        mappings.insert("fma".to_string(), GpuIntrinsic::Fma);
        mappings.insert("min".to_string(), GpuIntrinsic::Min);
        mappings.insert("max".to_string(), GpuIntrinsic::Max);
        mappings.insert("fmin".to_string(), GpuIntrinsic::Min);
        mappings.insert("fmax".to_string(), GpuIntrinsic::Max);
        mappings.insert("fmod".to_string(), GpuIntrinsic::Fmod);
        mappings.insert("remainder".to_string(), GpuIntrinsic::Remainder);
        mappings.insert("copysign".to_string(), GpuIntrinsic::Copysign);
        mappings.insert("cbrt".to_string(), GpuIntrinsic::Cbrt);
        mappings.insert("hypot".to_string(), GpuIntrinsic::Hypot);

        // Trigonometric functions
        mappings.insert("sin".to_string(), GpuIntrinsic::Sin);
        mappings.insert("cos".to_string(), GpuIntrinsic::Cos);
        mappings.insert("tan".to_string(), GpuIntrinsic::Tan);
        mappings.insert("asin".to_string(), GpuIntrinsic::Asin);
        mappings.insert("acos".to_string(), GpuIntrinsic::Acos);
        mappings.insert("atan".to_string(), GpuIntrinsic::Atan);
        mappings.insert("atan2".to_string(), GpuIntrinsic::Atan2);
        mappings.insert("sincos".to_string(), GpuIntrinsic::Sincos);
        mappings.insert("sinpi".to_string(), GpuIntrinsic::Sinpi);
        mappings.insert("cospi".to_string(), GpuIntrinsic::Cospi);

        // Hyperbolic functions
        mappings.insert("sinh".to_string(), GpuIntrinsic::Sinh);
        mappings.insert("cosh".to_string(), GpuIntrinsic::Cosh);
        mappings.insert("tanh".to_string(), GpuIntrinsic::Tanh);
        mappings.insert("asinh".to_string(), GpuIntrinsic::Asinh);
        mappings.insert("acosh".to_string(), GpuIntrinsic::Acosh);
        mappings.insert("atanh".to_string(), GpuIntrinsic::Atanh);

        // Exponential, logarithmic, gamma, and error functions
        mappings.insert("exp".to_string(), GpuIntrinsic::Exp);
        mappings.insert("exp2".to_string(), GpuIntrinsic::Exp2);
        mappings.insert("exp10".to_string(), GpuIntrinsic::Exp10);
        mappings.insert("expm1".to_string(), GpuIntrinsic::Expm1);
        mappings.insert("ln".to_string(), GpuIntrinsic::Log);
        mappings.insert("log".to_string(), GpuIntrinsic::Log);
        mappings.insert("log2".to_string(), GpuIntrinsic::Log2);
        mappings.insert("log10".to_string(), GpuIntrinsic::Log10);
        mappings.insert("log1p".to_string(), GpuIntrinsic::Log1p);
        mappings.insert("powf".to_string(), GpuIntrinsic::Pow);
        mappings.insert("powi".to_string(), GpuIntrinsic::Pow);
        mappings.insert("pow".to_string(), GpuIntrinsic::Pow);
        mappings.insert("ldexp".to_string(), GpuIntrinsic::Ldexp);
        mappings.insert("scalbn".to_string(), GpuIntrinsic::Scalbn);
        mappings.insert("ilogb".to_string(), GpuIntrinsic::Ilogb);
        mappings.insert("lgamma".to_string(), GpuIntrinsic::Lgamma);
        mappings.insert("tgamma".to_string(), GpuIntrinsic::Tgamma);
        mappings.insert("gamma".to_string(), GpuIntrinsic::Tgamma);
        mappings.insert("erf".to_string(), GpuIntrinsic::Erf);
        mappings.insert("erfc".to_string(), GpuIntrinsic::Erfc);
        mappings.insert("erfinv".to_string(), GpuIntrinsic::Erfinv);
        mappings.insert("erfcinv".to_string(), GpuIntrinsic::Erfcinv);

        // Floating-point classification
        mappings.insert("is_nan".to_string(), GpuIntrinsic::Isnan);
        mappings.insert("isnan".to_string(), GpuIntrinsic::Isnan);
        mappings.insert("is_infinite".to_string(), GpuIntrinsic::Isinf);
        mappings.insert("isinf".to_string(), GpuIntrinsic::Isinf);
        mappings.insert("is_finite".to_string(), GpuIntrinsic::Isfinite);
        mappings.insert("isfinite".to_string(), GpuIntrinsic::Isfinite);
        mappings.insert("is_normal".to_string(), GpuIntrinsic::Isnormal);
        mappings.insert("isnormal".to_string(), GpuIntrinsic::Isnormal);
        mappings.insert("is_sign_negative".to_string(), GpuIntrinsic::Signbit);
        mappings.insert("signbit".to_string(), GpuIntrinsic::Signbit);
        mappings.insert("nextafter".to_string(), GpuIntrinsic::Nextafter);
        mappings.insert("fdim".to_string(), GpuIntrinsic::Fdim);
        mappings.insert("nan".to_string(), GpuIntrinsic::Nan);

        // Warp-level primitives
        mappings.insert("warp_shfl".to_string(), GpuIntrinsic::WarpShfl);
        mappings.insert("warp_shuffle".to_string(), GpuIntrinsic::WarpShfl);
        mappings.insert("warp_shfl_up".to_string(), GpuIntrinsic::WarpShflUp);
        mappings.insert("warp_shuffle_up".to_string(), GpuIntrinsic::WarpShflUp);
        mappings.insert("warp_shfl_down".to_string(), GpuIntrinsic::WarpShflDown);
        mappings.insert("warp_shuffle_down".to_string(), GpuIntrinsic::WarpShflDown);
        mappings.insert("warp_shfl_xor".to_string(), GpuIntrinsic::WarpShflXor);
        mappings.insert("warp_shuffle_xor".to_string(), GpuIntrinsic::WarpShflXor);
        mappings.insert("warp_active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
        mappings.insert("active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
        mappings.insert("warp_ballot".to_string(), GpuIntrinsic::WarpBallot);
        mappings.insert("ballot".to_string(), GpuIntrinsic::WarpBallot);
        mappings.insert("warp_all".to_string(), GpuIntrinsic::WarpAll);
        mappings.insert("warp_any".to_string(), GpuIntrinsic::WarpAny);
        mappings.insert("warp_match_any".to_string(), GpuIntrinsic::WarpMatchAny);
        mappings.insert("warp_match_all".to_string(), GpuIntrinsic::WarpMatchAll);
        mappings.insert("warp_reduce_add".to_string(), GpuIntrinsic::WarpReduceAdd);
        mappings.insert("warp_reduce_min".to_string(), GpuIntrinsic::WarpReduceMin);
        mappings.insert("warp_reduce_max".to_string(), GpuIntrinsic::WarpReduceMax);
        mappings.insert("warp_reduce_and".to_string(), GpuIntrinsic::WarpReduceAnd);
        mappings.insert("warp_reduce_or".to_string(), GpuIntrinsic::WarpReduceOr);
        mappings.insert("warp_reduce_xor".to_string(), GpuIntrinsic::WarpReduceXor);

        // Bit manipulation
        mappings.insert("popc".to_string(), GpuIntrinsic::Popc);
        mappings.insert("popcount".to_string(), GpuIntrinsic::Popc);
        mappings.insert("count_ones".to_string(), GpuIntrinsic::Popc);
        mappings.insert("clz".to_string(), GpuIntrinsic::Clz);
        mappings.insert("leading_zeros".to_string(), GpuIntrinsic::Clz);
        mappings.insert("ctz".to_string(), GpuIntrinsic::Ctz);
        mappings.insert("trailing_zeros".to_string(), GpuIntrinsic::Ctz);
        mappings.insert("ffs".to_string(), GpuIntrinsic::Ffs);
        mappings.insert("brev".to_string(), GpuIntrinsic::Brev);
        mappings.insert("reverse_bits".to_string(), GpuIntrinsic::Brev);
        mappings.insert("byte_perm".to_string(), GpuIntrinsic::BytePerm);
        mappings.insert(
            "funnel_shift_left".to_string(),
            GpuIntrinsic::FunnelShiftLeft,
        );
        mappings.insert(
            "funnel_shift_right".to_string(),
            GpuIntrinsic::FunnelShiftRight,
        );

        // Memory access hints
        mappings.insert("ldg".to_string(), GpuIntrinsic::Ldg);
        mappings.insert("load_global".to_string(), GpuIntrinsic::Ldg);
        mappings.insert("prefetch_l1".to_string(), GpuIntrinsic::PrefetchL1);
        mappings.insert("prefetch_l2".to_string(), GpuIntrinsic::PrefetchL2);

        // Fast device math and special functions
        mappings.insert("rcp".to_string(), GpuIntrinsic::Rcp);
        mappings.insert("recip".to_string(), GpuIntrinsic::Rcp);
        mappings.insert("fdividef".to_string(), GpuIntrinsic::Fdividef);
        mappings.insert("fast_div".to_string(), GpuIntrinsic::Fdividef);
        mappings.insert("saturate".to_string(), GpuIntrinsic::Saturate);
        mappings.insert("clamp_01".to_string(), GpuIntrinsic::Saturate);
        mappings.insert("j0".to_string(), GpuIntrinsic::J0);
        mappings.insert("j1".to_string(), GpuIntrinsic::J1);
        mappings.insert("jn".to_string(), GpuIntrinsic::Jn);
        mappings.insert("y0".to_string(), GpuIntrinsic::Y0);
        mappings.insert("y1".to_string(), GpuIntrinsic::Y1);
        mappings.insert("yn".to_string(), GpuIntrinsic::Yn);
        mappings.insert("normcdf".to_string(), GpuIntrinsic::Normcdf);
        mappings.insert("norm_cdf".to_string(), GpuIntrinsic::Normcdf);
        mappings.insert("normcdfinv".to_string(), GpuIntrinsic::Normcdfinv);
        mappings.insert("norm_cdf_inv".to_string(), GpuIntrinsic::Normcdfinv);
        mappings.insert("cyl_bessel_i0".to_string(), GpuIntrinsic::CylBesselI0);
        mappings.insert("cyl_bessel_i1".to_string(), GpuIntrinsic::CylBesselI1);

        // Thread/block/grid indexing values
        mappings.insert("thread_idx_x".to_string(), GpuIntrinsic::ThreadIdxX);
        mappings.insert("thread_idx_y".to_string(), GpuIntrinsic::ThreadIdxY);
        mappings.insert("thread_idx_z".to_string(), GpuIntrinsic::ThreadIdxZ);
        mappings.insert("block_idx_x".to_string(), GpuIntrinsic::BlockIdxX);
        mappings.insert("block_idx_y".to_string(), GpuIntrinsic::BlockIdxY);
        mappings.insert("block_idx_z".to_string(), GpuIntrinsic::BlockIdxZ);
        mappings.insert("block_dim_x".to_string(), GpuIntrinsic::BlockDimX);
        mappings.insert("block_dim_y".to_string(), GpuIntrinsic::BlockDimY);
        mappings.insert("block_dim_z".to_string(), GpuIntrinsic::BlockDimZ);
        mappings.insert("grid_dim_x".to_string(), GpuIntrinsic::GridDimX);
        mappings.insert("grid_dim_y".to_string(), GpuIntrinsic::GridDimY);
        mappings.insert("grid_dim_z".to_string(), GpuIntrinsic::GridDimZ);
        mappings.insert("warp_size".to_string(), GpuIntrinsic::WarpSize);

        // Timing
        mappings.insert("clock".to_string(), GpuIntrinsic::Clock);
        mappings.insert("clock64".to_string(), GpuIntrinsic::Clock64);
        mappings.insert("nanosleep".to_string(), GpuIntrinsic::Nanosleep);

        // Block-level reductions
        mappings.insert("block_reduce_sum".to_string(), GpuIntrinsic::BlockReduceSum);
        mappings.insert("block_reduce_min".to_string(), GpuIntrinsic::BlockReduceMin);
        mappings.insert("block_reduce_max".to_string(), GpuIntrinsic::BlockReduceMax);
        mappings.insert("block_reduce_and".to_string(), GpuIntrinsic::BlockReduceAnd);
        mappings.insert("block_reduce_or".to_string(), GpuIntrinsic::BlockReduceOr);

        // Grid-level reductions
        mappings.insert("grid_reduce_sum".to_string(), GpuIntrinsic::GridReduceSum);
        mappings.insert("grid_reduce_min".to_string(), GpuIntrinsic::GridReduceMin);
        mappings.insert("grid_reduce_max".to_string(), GpuIntrinsic::GridReduceMax);

        // Combined reduce-and-broadcast
        mappings.insert(
            "reduce_and_broadcast".to_string(),
            GpuIntrinsic::ReduceAndBroadcast,
        );
        mappings.insert(
            "reduce_broadcast".to_string(),
            GpuIntrinsic::ReduceAndBroadcast,
        );

        Self { mappings }
    }

    /// Looks up the intrinsic registered under a Rust-level function name.
    pub fn lookup(&self, name: &str) -> Option<&GpuIntrinsic> {
        self.mappings.get(name)
    }

    /// Registers an additional Rust name to intrinsic mapping.
    pub fn register(&mut self, rust_name: &str, intrinsic: GpuIntrinsic) {
        self.mappings.insert(rust_name.to_string(), intrinsic);
    }

    /// Returns true if `name` maps to a known intrinsic.
    pub fn is_intrinsic(&self, name: &str) -> bool {
        self.mappings.contains_key(name)
    }
}
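/// Stencil neighbor accessors recognized by method name (`idx`, `north`, `south`,
/// `east`, `west`, `at`, `up`, `down`) and lowered to index arithmetic on the
/// stencil's backing buffer.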
#[derive(Debug, Clone, PartialEq)]
pub enum StencilIntrinsic {
    /// The current cell (`idx`).
    Index,
    /// Neighbor in the previous row.
    North,
    /// Neighbor in the next row.
    South,
    /// Neighbor in the next column.
    East,
    /// Neighbor in the previous column.
    West,
    /// Access at an explicit offset.
    At,
    /// Neighbor in the previous slice (3D only).
    Up,
    /// Neighbor in the next slice (3D only).
    Down,
}

impl StencilIntrinsic {
    /// Maps a method name to a stencil accessor, if it is one.
    pub fn from_method_name(name: &str) -> Option<Self> {
        match name {
            "idx" => Some(StencilIntrinsic::Index),
            "north" => Some(StencilIntrinsic::North),
            "south" => Some(StencilIntrinsic::South),
            "east" => Some(StencilIntrinsic::East),
            "west" => Some(StencilIntrinsic::West),
            "at" => Some(StencilIntrinsic::At),
            "up" => Some(StencilIntrinsic::Up),
            "down" => Some(StencilIntrinsic::Down),
            _ => None,
        }
    }
}
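/// Intrinsics available inside persistent ring kernels: control-block queries,
/// request/response queue operations, hybrid logical clock (HLC) updates,
/// kernel-to-kernel (K2K) messaging, and timing.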
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RingKernelIntrinsic {
    // Control block state
    IsActive,
    ShouldTerminate,
    MarkTerminated,
    GetMessagesProcessed,

    // Input/output queue operations
    InputQueueSize,
    OutputQueueSize,
    InputQueueEmpty,
    OutputQueueEmpty,
    EnqueueResponse,

    // Hybrid logical clock (HLC)
    HlcTick,
    HlcUpdate,
    HlcNow,

    // Kernel-to-kernel (K2K) messaging
    K2kSend,
    K2kTryRecv,
    K2kHasMessage,
    K2kPeek,
    K2kPendingCount,

    // Timing
    Nanosleep,
}

impl RingKernelIntrinsic {
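    /// Emits the CUDA snippet for this intrinsic, substituting `args` where the
    /// intrinsic takes arguments. The generated code assumes the kernel prelude
    /// declares the names it references: `control` (plus `output_buffer` and
    /// `RESP_SIZE` for responses), `hlc_physical`/`hlc_logical`, and the `k2k_*`
    /// handles.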
    pub fn to_cuda(&self, args: &[String]) -> String {
        match self {
            Self::IsActive => "atomicAdd(&control->is_active, 0) != 0".to_string(),
            Self::ShouldTerminate => "atomicAdd(&control->should_terminate, 0) != 0".to_string(),
            Self::MarkTerminated => "atomicExch(&control->has_terminated, 1)".to_string(),
            Self::GetMessagesProcessed => "atomicAdd(&control->messages_processed, 0)".to_string(),

            Self::InputQueueSize => {
                "(atomicAdd(&control->input_head, 0) - atomicAdd(&control->input_tail, 0))"
                    .to_string()
            }
            Self::OutputQueueSize => {
                "(atomicAdd(&control->output_head, 0) - atomicAdd(&control->output_tail, 0))"
                    .to_string()
            }
            Self::InputQueueEmpty => {
                "(atomicAdd(&control->input_head, 0) == atomicAdd(&control->input_tail, 0))"
                    .to_string()
            }
            Self::OutputQueueEmpty => {
                "(atomicAdd(&control->output_head, 0) == atomicAdd(&control->output_tail, 0))"
                    .to_string()
            }
            Self::EnqueueResponse => {
                if !args.is_empty() {
                    format!(
                        "{{ unsigned long long _out_idx = atomicAdd(&control->output_head, 1) & control->output_mask; \
                         memcpy(&output_buffer[_out_idx * RESP_SIZE], {}, RESP_SIZE); }}",
                        args[0]
                    )
                } else {
                    "/* enqueue_response requires response pointer */".to_string()
                }
            }

            Self::HlcTick => "hlc_logical++".to_string(),
            Self::HlcUpdate => {
                if !args.is_empty() {
                    format!(
                        "{{ if ({} > hlc_physical) {{ hlc_physical = {}; hlc_logical = 0; }} else {{ hlc_logical++; }} }}",
                        args[0], args[0]
                    )
                } else {
                    "hlc_logical++".to_string()
                }
            }
            Self::HlcNow => "(hlc_physical << 32) | (hlc_logical & 0xFFFFFFFF)".to_string(),

            Self::K2kSend => {
                if args.len() >= 2 {
                    format!(
                        "k2k_send(k2k_routes, {}, {}, sizeof(*{}))",
                        args[0], args[1], args[1]
                    )
                } else {
                    "/* k2k_send requires target_id and msg_ptr */".to_string()
                }
            }
            Self::K2kTryRecv => "k2k_try_recv(k2k_inbox)".to_string(),
            Self::K2kHasMessage => "k2k_has_message(k2k_inbox)".to_string(),
            Self::K2kPeek => "k2k_peek(k2k_inbox)".to_string(),
            Self::K2kPendingCount => "k2k_pending_count(k2k_inbox)".to_string(),

            Self::Nanosleep => {
                if !args.is_empty() {
                    format!("__nanosleep({})", args[0])
                } else {
                    "__nanosleep(1000)".to_string()
                }
            }
        }
    }
    /// Resolves a ring-kernel intrinsic from its Rust-level method name (aliases included).
    pub fn from_name(name: &str) -> Option<Self> {
        match name {
            "is_active" | "is_kernel_active" => Some(Self::IsActive),
            "should_terminate" => Some(Self::ShouldTerminate),
            "mark_terminated" => Some(Self::MarkTerminated),
            "messages_processed" | "get_messages_processed" => Some(Self::GetMessagesProcessed),

            "input_queue_size" => Some(Self::InputQueueSize),
            "output_queue_size" => Some(Self::OutputQueueSize),
            "input_queue_empty" => Some(Self::InputQueueEmpty),
            "output_queue_empty" => Some(Self::OutputQueueEmpty),
            "enqueue_response" | "enqueue" => Some(Self::EnqueueResponse),

            "hlc_tick" => Some(Self::HlcTick),
            "hlc_update" => Some(Self::HlcUpdate),
            "hlc_now" => Some(Self::HlcNow),

            "k2k_send" => Some(Self::K2kSend),
            "k2k_try_recv" => Some(Self::K2kTryRecv),
            "k2k_has_message" => Some(Self::K2kHasMessage),
            "k2k_peek" => Some(Self::K2kPeek),
            "k2k_pending_count" | "k2k_pending" => Some(Self::K2kPendingCount),

            "nanosleep" => Some(Self::Nanosleep),

            _ => None,
        }
    }

    /// True if the generated code reads or writes the kernel control block.
    pub fn requires_control_block(&self) -> bool {
        matches!(
            self,
            Self::IsActive
                | Self::ShouldTerminate
                | Self::MarkTerminated
                | Self::GetMessagesProcessed
                | Self::InputQueueSize
                | Self::OutputQueueSize
                | Self::InputQueueEmpty
                | Self::OutputQueueEmpty
                | Self::EnqueueResponse
        )
    }

    /// True if the generated code touches the hybrid logical clock state.
    pub fn requires_hlc(&self) -> bool {
        matches!(self, Self::HlcTick | Self::HlcUpdate | Self::HlcNow)
    }

    /// True if the generated code uses the kernel-to-kernel messaging handles.
    pub fn requires_k2k(&self) -> bool {
        matches!(
            self,
            Self::K2kSend
                | Self::K2kTryRecv
                | Self::K2kHasMessage
                | Self::K2kPeek
                | Self::K2kPendingCount
        )
    }
}
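// Offset and CUDA index-generation helpers for stencil accessors. 2D offsets are
// (row, col); 3D offsets are (slice, row, col), matching the linear-index
// arithmetic emitted by `to_cuda_index_2d` / `to_cuda_index_3d`.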
impl StencilIntrinsic {
    /// Returns the (row, col) offset for 2D stencils, or `None` for accessors
    /// that take an explicit offset or only exist in 3D.
    pub fn get_offset_2d(&self) -> Option<(i32, i32)> {
        match self {
            StencilIntrinsic::Index => Some((0, 0)),
            StencilIntrinsic::North => Some((-1, 0)),
            StencilIntrinsic::South => Some((1, 0)),
            StencilIntrinsic::East => Some((0, 1)),
            StencilIntrinsic::West => Some((0, -1)),
            StencilIntrinsic::At => None,
            StencilIntrinsic::Up | StencilIntrinsic::Down => None,
        }
    }

    /// Returns the (slice, row, col) offset for 3D stencils, or `None` for `At`.
    pub fn get_offset_3d(&self) -> Option<(i32, i32, i32)> {
        match self {
            StencilIntrinsic::Index => Some((0, 0, 0)),
            StencilIntrinsic::North => Some((0, -1, 0)),
            StencilIntrinsic::South => Some((0, 1, 0)),
            StencilIntrinsic::East => Some((0, 0, 1)),
            StencilIntrinsic::West => Some((0, 0, -1)),
            StencilIntrinsic::Up => Some((-1, 0, 0)),
            StencilIntrinsic::Down => Some((1, 0, 0)),
            StencilIntrinsic::At => None,
        }
    }

    /// True for accessors that only make sense in 3D stencils (`Up`/`Down`).
    pub fn is_3d_only(&self) -> bool {
        matches!(self, StencilIntrinsic::Up | StencilIntrinsic::Down)
    }

    /// Builds the CUDA indexing expression for a row-major 2D buffer.
    pub fn to_cuda_index_2d(&self, buffer_name: &str, buffer_width: &str, idx_var: &str) -> String {
        match self {
            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
            StencilIntrinsic::North => {
                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
            }
            StencilIntrinsic::South => {
                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
            }
            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
            StencilIntrinsic::At => {
                format!("{}[{}]", buffer_name, idx_var)
            }
            _ => format!("{}[{}]", buffer_name, idx_var),
        }
    }

    /// Builds the CUDA indexing expression for a row-major 3D buffer, where
    /// `buffer_slice` is the number of elements in one 2D slice.
    pub fn to_cuda_index_3d(
        &self,
        buffer_name: &str,
        buffer_width: &str,
        buffer_slice: &str,
        idx_var: &str,
    ) -> String {
        match self {
            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
            StencilIntrinsic::North => {
                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
            }
            StencilIntrinsic::South => {
                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
            }
            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
            StencilIntrinsic::Up => {
                format!("{}[{} - {}]", buffer_name, idx_var, buffer_slice)
            }
            StencilIntrinsic::Down => {
                format!("{}[{} + {}]", buffer_name, idx_var, buffer_slice)
            }
            StencilIntrinsic::At => {
                format!("{}[{}]", buffer_name, idx_var)
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_intrinsic_lookup() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("sync_threads"),
            Some(&GpuIntrinsic::SyncThreads)
        );
        assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
        assert_eq!(registry.lookup("unknown_func"), None);
    }

    #[test]
    fn test_intrinsic_cuda_output() {
        assert_eq!(
            GpuIntrinsic::SyncThreads.to_cuda_string(),
            "__syncthreads()"
        );
        assert_eq!(GpuIntrinsic::AtomicAdd.to_cuda_string(), "atomicAdd");
        assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
    }

    #[test]
    fn test_stencil_intrinsic_parsing() {
        assert_eq!(
            StencilIntrinsic::from_method_name("north"),
            Some(StencilIntrinsic::North)
        );
        assert_eq!(
            StencilIntrinsic::from_method_name("idx"),
            Some(StencilIntrinsic::Index)
        );
        assert_eq!(StencilIntrinsic::from_method_name("unknown"), None);
    }

    #[test]
    fn test_stencil_cuda_index() {
        let north = StencilIntrinsic::North;
        assert_eq!(
            north.to_cuda_index_2d("p", "buffer_width", "idx"),
            "p[idx - buffer_width]"
        );

        let east = StencilIntrinsic::East;
        assert_eq!(east.to_cuda_index_2d("p", "18", "idx"), "p[idx + 1]");
    }

    #[test]
    fn test_stencil_offset() {
        assert_eq!(StencilIntrinsic::North.get_offset_2d(), Some((-1, 0)));
        assert_eq!(StencilIntrinsic::East.get_offset_2d(), Some((0, 1)));
        assert_eq!(StencilIntrinsic::Index.get_offset_2d(), Some((0, 0)));
    }

    #[test]
    fn test_ring_kernel_intrinsic_lookup() {
        assert_eq!(
            RingKernelIntrinsic::from_name("is_active"),
            Some(RingKernelIntrinsic::IsActive)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("should_terminate"),
            Some(RingKernelIntrinsic::ShouldTerminate)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("hlc_tick"),
            Some(RingKernelIntrinsic::HlcTick)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("enqueue_response"),
            Some(RingKernelIntrinsic::EnqueueResponse)
        );
        assert_eq!(RingKernelIntrinsic::from_name("unknown"), None);
    }

    #[test]
    fn test_ring_kernel_intrinsic_cuda_output() {
        assert!(RingKernelIntrinsic::IsActive
            .to_cuda(&[])
            .contains("is_active"));
        assert!(RingKernelIntrinsic::ShouldTerminate
            .to_cuda(&[])
            .contains("should_terminate"));
        assert!(RingKernelIntrinsic::HlcTick
            .to_cuda(&[])
            .contains("hlc_logical"));
        assert!(RingKernelIntrinsic::InputQueueEmpty
            .to_cuda(&[])
            .contains("input_head"));
    }

    #[test]
    fn test_ring_kernel_queue_intrinsics() {
        let enqueue = RingKernelIntrinsic::EnqueueResponse;
        let cuda = enqueue.to_cuda(&["&response".to_string()]);
        assert!(cuda.contains("output_head"));
        assert!(cuda.contains("memcpy"));
    }

    #[test]
    fn test_k2k_intrinsics() {
        let send = RingKernelIntrinsic::K2kSend;
        let cuda = send.to_cuda(&["target_id".to_string(), "&msg".to_string()]);
        assert!(cuda.contains("k2k_send"));
        assert!(cuda.contains("k2k_routes"));
        assert!(cuda.contains("target_id"));

        assert_eq!(
            RingKernelIntrinsic::K2kTryRecv.to_cuda(&[]),
            "k2k_try_recv(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kHasMessage.to_cuda(&[]),
            "k2k_has_message(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kPeek.to_cuda(&[]),
            "k2k_peek(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kPendingCount.to_cuda(&[]),
            "k2k_pending_count(k2k_inbox)"
        );
    }

    #[test]
    fn test_k2k_intrinsic_lookup() {
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_send"),
            Some(RingKernelIntrinsic::K2kSend)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_try_recv"),
            Some(RingKernelIntrinsic::K2kTryRecv)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_has_message"),
            Some(RingKernelIntrinsic::K2kHasMessage)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_peek"),
            Some(RingKernelIntrinsic::K2kPeek)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_pending_count"),
            Some(RingKernelIntrinsic::K2kPendingCount)
        );
    }

    #[test]
    fn test_intrinsic_requirements() {
        assert!(RingKernelIntrinsic::K2kSend.requires_k2k());
        assert!(RingKernelIntrinsic::K2kTryRecv.requires_k2k());
        assert!(RingKernelIntrinsic::K2kPeek.requires_k2k());
        assert!(!RingKernelIntrinsic::HlcTick.requires_k2k());

        assert!(RingKernelIntrinsic::HlcTick.requires_hlc());
        assert!(RingKernelIntrinsic::HlcNow.requires_hlc());
        assert!(!RingKernelIntrinsic::K2kSend.requires_hlc());

        assert!(RingKernelIntrinsic::IsActive.requires_control_block());
        assert!(RingKernelIntrinsic::EnqueueResponse.requires_control_block());
        assert!(!RingKernelIntrinsic::HlcTick.requires_control_block());
    }

    #[test]
    fn test_new_atomic_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("atomic_and"),
            Some(&GpuIntrinsic::AtomicAnd)
        );
        assert_eq!(registry.lookup("atomic_or"), Some(&GpuIntrinsic::AtomicOr));
        assert_eq!(
            registry.lookup("atomic_xor"),
            Some(&GpuIntrinsic::AtomicXor)
        );
        assert_eq!(
            registry.lookup("atomic_inc"),
            Some(&GpuIntrinsic::AtomicInc)
        );
        assert_eq!(
            registry.lookup("atomic_dec"),
            Some(&GpuIntrinsic::AtomicDec)
        );

        assert_eq!(GpuIntrinsic::AtomicAnd.to_cuda_string(), "atomicAnd");
        assert_eq!(GpuIntrinsic::AtomicOr.to_cuda_string(), "atomicOr");
        assert_eq!(GpuIntrinsic::AtomicXor.to_cuda_string(), "atomicXor");
        assert_eq!(GpuIntrinsic::AtomicInc.to_cuda_string(), "atomicInc");
        assert_eq!(GpuIntrinsic::AtomicDec.to_cuda_string(), "atomicDec");
    }

    #[test]
    fn test_trigonometric_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("asin"), Some(&GpuIntrinsic::Asin));
        assert_eq!(registry.lookup("acos"), Some(&GpuIntrinsic::Acos));
        assert_eq!(registry.lookup("atan"), Some(&GpuIntrinsic::Atan));
        assert_eq!(registry.lookup("atan2"), Some(&GpuIntrinsic::Atan2));

        assert_eq!(GpuIntrinsic::Asin.to_cuda_string(), "asinf");
        assert_eq!(GpuIntrinsic::Acos.to_cuda_string(), "acosf");
        assert_eq!(GpuIntrinsic::Atan.to_cuda_string(), "atanf");
        assert_eq!(GpuIntrinsic::Atan2.to_cuda_string(), "atan2f");
    }

    #[test]
    fn test_hyperbolic_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("sinh"), Some(&GpuIntrinsic::Sinh));
        assert_eq!(registry.lookup("cosh"), Some(&GpuIntrinsic::Cosh));
        assert_eq!(registry.lookup("tanh"), Some(&GpuIntrinsic::Tanh));
        assert_eq!(registry.lookup("asinh"), Some(&GpuIntrinsic::Asinh));
        assert_eq!(registry.lookup("acosh"), Some(&GpuIntrinsic::Acosh));
        assert_eq!(registry.lookup("atanh"), Some(&GpuIntrinsic::Atanh));

        assert_eq!(GpuIntrinsic::Sinh.to_cuda_string(), "sinhf");
        assert_eq!(GpuIntrinsic::Cosh.to_cuda_string(), "coshf");
        assert_eq!(GpuIntrinsic::Tanh.to_cuda_string(), "tanhf");
    }

    #[test]
    fn test_exponential_logarithmic_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("exp2"), Some(&GpuIntrinsic::Exp2));
        assert_eq!(registry.lookup("exp10"), Some(&GpuIntrinsic::Exp10));
        assert_eq!(registry.lookup("expm1"), Some(&GpuIntrinsic::Expm1));

        assert_eq!(registry.lookup("log2"), Some(&GpuIntrinsic::Log2));
        assert_eq!(registry.lookup("log10"), Some(&GpuIntrinsic::Log10));
        assert_eq!(registry.lookup("log1p"), Some(&GpuIntrinsic::Log1p));

        assert_eq!(GpuIntrinsic::Exp2.to_cuda_string(), "exp2f");
        assert_eq!(GpuIntrinsic::Log2.to_cuda_string(), "log2f");
        assert_eq!(GpuIntrinsic::Log10.to_cuda_string(), "log10f");
    }

    #[test]
    fn test_classification_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("is_nan"), Some(&GpuIntrinsic::Isnan));
        assert_eq!(registry.lookup("isnan"), Some(&GpuIntrinsic::Isnan));
        assert_eq!(registry.lookup("is_infinite"), Some(&GpuIntrinsic::Isinf));
        assert_eq!(registry.lookup("is_finite"), Some(&GpuIntrinsic::Isfinite));
        assert_eq!(registry.lookup("is_normal"), Some(&GpuIntrinsic::Isnormal));
        assert_eq!(registry.lookup("signbit"), Some(&GpuIntrinsic::Signbit));

        assert_eq!(GpuIntrinsic::Isnan.to_cuda_string(), "isnan");
        assert_eq!(GpuIntrinsic::Isinf.to_cuda_string(), "isinf");
        assert_eq!(GpuIntrinsic::Isfinite.to_cuda_string(), "isfinite");
    }
    #[test]
    fn test_warp_reduce_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("warp_reduce_add"),
            Some(&GpuIntrinsic::WarpReduceAdd)
        );
        assert_eq!(
            registry.lookup("warp_reduce_min"),
            Some(&GpuIntrinsic::WarpReduceMin)
        );
        assert_eq!(
            registry.lookup("warp_reduce_max"),
            Some(&GpuIntrinsic::WarpReduceMax)
        );
        assert_eq!(
            registry.lookup("warp_reduce_and"),
            Some(&GpuIntrinsic::WarpReduceAnd)
        );
        assert_eq!(
            registry.lookup("warp_reduce_or"),
            Some(&GpuIntrinsic::WarpReduceOr)
        );
        assert_eq!(
            registry.lookup("warp_reduce_xor"),
            Some(&GpuIntrinsic::WarpReduceXor)
        );

        assert_eq!(
            GpuIntrinsic::WarpReduceAdd.to_cuda_string(),
            "__reduce_add_sync"
        );
        assert_eq!(
            GpuIntrinsic::WarpReduceMin.to_cuda_string(),
            "__reduce_min_sync"
        );
        assert_eq!(
            GpuIntrinsic::WarpReduceMax.to_cuda_string(),
            "__reduce_max_sync"
        );
    }

    #[test]
    fn test_warp_match_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("warp_match_any"),
            Some(&GpuIntrinsic::WarpMatchAny)
        );
        assert_eq!(
            registry.lookup("warp_match_all"),
            Some(&GpuIntrinsic::WarpMatchAll)
        );

        assert_eq!(
            GpuIntrinsic::WarpMatchAny.to_cuda_string(),
            "__match_any_sync"
        );
        assert_eq!(
            GpuIntrinsic::WarpMatchAll.to_cuda_string(),
            "__match_all_sync"
        );
    }

    #[test]
    fn test_bit_manipulation_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("popc"), Some(&GpuIntrinsic::Popc));
        assert_eq!(registry.lookup("popcount"), Some(&GpuIntrinsic::Popc));
        assert_eq!(registry.lookup("count_ones"), Some(&GpuIntrinsic::Popc));
        assert_eq!(registry.lookup("clz"), Some(&GpuIntrinsic::Clz));
        assert_eq!(registry.lookup("leading_zeros"), Some(&GpuIntrinsic::Clz));
        assert_eq!(registry.lookup("ctz"), Some(&GpuIntrinsic::Ctz));
        assert_eq!(registry.lookup("ffs"), Some(&GpuIntrinsic::Ffs));
        assert_eq!(registry.lookup("brev"), Some(&GpuIntrinsic::Brev));
        assert_eq!(registry.lookup("reverse_bits"), Some(&GpuIntrinsic::Brev));

        assert_eq!(GpuIntrinsic::Popc.to_cuda_string(), "__popc");
        assert_eq!(GpuIntrinsic::Clz.to_cuda_string(), "__clz");
        assert_eq!(GpuIntrinsic::Ffs.to_cuda_string(), "__ffs");
        assert_eq!(GpuIntrinsic::Brev.to_cuda_string(), "__brev");
    }

    #[test]
    fn test_funnel_shift_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("funnel_shift_left"),
            Some(&GpuIntrinsic::FunnelShiftLeft)
        );
        assert_eq!(
            registry.lookup("funnel_shift_right"),
            Some(&GpuIntrinsic::FunnelShiftRight)
        );

        assert_eq!(
            GpuIntrinsic::FunnelShiftLeft.to_cuda_string(),
            "__funnelshift_l"
        );
        assert_eq!(
            GpuIntrinsic::FunnelShiftRight.to_cuda_string(),
            "__funnelshift_r"
        );
    }

    #[test]
    fn test_memory_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("ldg"), Some(&GpuIntrinsic::Ldg));
        assert_eq!(registry.lookup("load_global"), Some(&GpuIntrinsic::Ldg));
        assert_eq!(
            registry.lookup("prefetch_l1"),
            Some(&GpuIntrinsic::PrefetchL1)
        );
        assert_eq!(
            registry.lookup("prefetch_l2"),
            Some(&GpuIntrinsic::PrefetchL2)
        );

        assert_eq!(GpuIntrinsic::Ldg.to_cuda_string(), "__ldg");
        assert_eq!(GpuIntrinsic::PrefetchL1.to_cuda_string(), "__prefetch_l1");
        assert_eq!(GpuIntrinsic::PrefetchL2.to_cuda_string(), "__prefetch_l2");
    }

    #[test]
    fn test_clock_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("clock"), Some(&GpuIntrinsic::Clock));
        assert_eq!(registry.lookup("clock64"), Some(&GpuIntrinsic::Clock64));
        assert_eq!(registry.lookup("nanosleep"), Some(&GpuIntrinsic::Nanosleep));

        assert_eq!(GpuIntrinsic::Clock.to_cuda_string(), "clock()");
        assert_eq!(GpuIntrinsic::Clock64.to_cuda_string(), "clock64()");
        assert_eq!(GpuIntrinsic::Nanosleep.to_cuda_string(), "__nanosleep");
    }

    #[test]
    fn test_special_function_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("rcp"), Some(&GpuIntrinsic::Rcp));
        assert_eq!(registry.lookup("recip"), Some(&GpuIntrinsic::Rcp));
        assert_eq!(registry.lookup("saturate"), Some(&GpuIntrinsic::Saturate));
        assert_eq!(registry.lookup("clamp_01"), Some(&GpuIntrinsic::Saturate));

        assert_eq!(GpuIntrinsic::Rcp.to_cuda_string(), "__frcp_rn");
        assert_eq!(GpuIntrinsic::Saturate.to_cuda_string(), "__saturatef");
    }

    #[test]
    fn test_intrinsic_categories() {
        assert_eq!(GpuIntrinsic::SyncThreads.category(), "synchronization");
        assert_eq!(GpuIntrinsic::AtomicAdd.category(), "atomic");
        assert_eq!(GpuIntrinsic::Sqrt.category(), "math");
        assert_eq!(GpuIntrinsic::Sin.category(), "trigonometric");
        assert_eq!(GpuIntrinsic::Sinh.category(), "hyperbolic");
        assert_eq!(GpuIntrinsic::Exp.category(), "exponential");
        assert_eq!(GpuIntrinsic::Isnan.category(), "classification");
        assert_eq!(GpuIntrinsic::WarpShfl.category(), "warp");
        assert_eq!(GpuIntrinsic::Popc.category(), "bit");
        assert_eq!(GpuIntrinsic::Ldg.category(), "memory");
        assert_eq!(GpuIntrinsic::Rcp.category(), "special");
        assert_eq!(GpuIntrinsic::ThreadIdxX.category(), "index");
        assert_eq!(GpuIntrinsic::Clock.category(), "timing");
    }
    #[test]
    fn test_intrinsic_flags() {
        assert!(GpuIntrinsic::ThreadIdxX.is_value_intrinsic());
        assert!(GpuIntrinsic::BlockDimX.is_value_intrinsic());
        assert!(GpuIntrinsic::WarpSize.is_value_intrinsic());
        assert!(!GpuIntrinsic::Sin.is_value_intrinsic());
        assert!(!GpuIntrinsic::AtomicAdd.is_value_intrinsic());

        assert!(GpuIntrinsic::SyncThreads.is_zero_arg_function());
        assert!(GpuIntrinsic::ThreadFence.is_zero_arg_function());
        assert!(GpuIntrinsic::WarpActiveMask.is_zero_arg_function());
        assert!(GpuIntrinsic::Clock.is_zero_arg_function());
        assert!(!GpuIntrinsic::Sin.is_zero_arg_function());

        assert!(GpuIntrinsic::WarpShfl.requires_mask());
        assert!(GpuIntrinsic::WarpBallot.requires_mask());
        assert!(GpuIntrinsic::WarpReduceAdd.requires_mask());
        assert!(!GpuIntrinsic::Sin.requires_mask());
        assert!(!GpuIntrinsic::AtomicAdd.requires_mask());
    }

    #[test]
    fn test_3d_stencil_intrinsics() {
        assert_eq!(
            StencilIntrinsic::from_method_name("up"),
            Some(StencilIntrinsic::Up)
        );
        assert_eq!(
            StencilIntrinsic::from_method_name("down"),
            Some(StencilIntrinsic::Down)
        );

        assert!(StencilIntrinsic::Up.is_3d_only());
        assert!(StencilIntrinsic::Down.is_3d_only());
        assert!(!StencilIntrinsic::North.is_3d_only());
        assert!(!StencilIntrinsic::East.is_3d_only());
        assert!(!StencilIntrinsic::Index.is_3d_only());

        assert_eq!(StencilIntrinsic::Up.get_offset_3d(), Some((-1, 0, 0)));
        assert_eq!(StencilIntrinsic::Down.get_offset_3d(), Some((1, 0, 0)));
        assert_eq!(StencilIntrinsic::North.get_offset_3d(), Some((0, -1, 0)));
        assert_eq!(StencilIntrinsic::South.get_offset_3d(), Some((0, 1, 0)));
        assert_eq!(StencilIntrinsic::East.get_offset_3d(), Some((0, 0, 1)));
        assert_eq!(StencilIntrinsic::West.get_offset_3d(), Some((0, 0, -1)));

        let up = StencilIntrinsic::Up;
        assert_eq!(up.to_cuda_index_3d("p", "18", "324", "idx"), "p[idx - 324]");

        let down = StencilIntrinsic::Down;
        assert_eq!(
            down.to_cuda_index_3d("p", "18", "324", "idx"),
            "p[idx + 324]"
        );
    }

    #[test]
    fn test_sync_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("sync_threads_count"),
            Some(&GpuIntrinsic::SyncThreadsCount)
        );
        assert_eq!(
            registry.lookup("sync_threads_and"),
            Some(&GpuIntrinsic::SyncThreadsAnd)
        );
        assert_eq!(
            registry.lookup("sync_threads_or"),
            Some(&GpuIntrinsic::SyncThreadsOr)
        );

        assert_eq!(
            GpuIntrinsic::SyncThreadsCount.to_cuda_string(),
            "__syncthreads_count"
        );
        assert_eq!(
            GpuIntrinsic::SyncThreadsAnd.to_cuda_string(),
            "__syncthreads_and"
        );
        assert_eq!(
            GpuIntrinsic::SyncThreadsOr.to_cuda_string(),
            "__syncthreads_or"
        );
    }

    #[test]
    fn test_math_extras() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(registry.lookup("trunc"), Some(&GpuIntrinsic::Trunc));
        assert_eq!(registry.lookup("cbrt"), Some(&GpuIntrinsic::Cbrt));
        assert_eq!(registry.lookup("hypot"), Some(&GpuIntrinsic::Hypot));
        assert_eq!(registry.lookup("copysign"), Some(&GpuIntrinsic::Copysign));
        assert_eq!(registry.lookup("fmod"), Some(&GpuIntrinsic::Fmod));

        assert_eq!(GpuIntrinsic::Trunc.to_cuda_string(), "truncf");
        assert_eq!(GpuIntrinsic::Cbrt.to_cuda_string(), "cbrtf");
        assert_eq!(GpuIntrinsic::Hypot.to_cuda_string(), "hypotf");
    }

    #[test]
    fn test_block_reduction_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("block_reduce_sum"),
            Some(&GpuIntrinsic::BlockReduceSum)
        );
        assert_eq!(
            registry.lookup("block_reduce_min"),
            Some(&GpuIntrinsic::BlockReduceMin)
        );
        assert_eq!(
            registry.lookup("block_reduce_max"),
            Some(&GpuIntrinsic::BlockReduceMax)
        );
        assert_eq!(
            registry.lookup("block_reduce_and"),
            Some(&GpuIntrinsic::BlockReduceAnd)
        );
        assert_eq!(
            registry.lookup("block_reduce_or"),
            Some(&GpuIntrinsic::BlockReduceOr)
        );

        assert_eq!(
            GpuIntrinsic::BlockReduceSum.to_cuda_string(),
            "__block_reduce_sum"
        );
        assert_eq!(
            GpuIntrinsic::BlockReduceMin.to_cuda_string(),
            "__block_reduce_min"
        );
        assert_eq!(
            GpuIntrinsic::BlockReduceMax.to_cuda_string(),
            "__block_reduce_max"
        );

        assert_eq!(GpuIntrinsic::BlockReduceSum.category(), "reduction");
        assert_eq!(GpuIntrinsic::BlockReduceMin.category(), "reduction");
    }

    #[test]
    fn test_grid_reduction_intrinsics() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("grid_reduce_sum"),
            Some(&GpuIntrinsic::GridReduceSum)
        );
        assert_eq!(
            registry.lookup("grid_reduce_min"),
            Some(&GpuIntrinsic::GridReduceMin)
        );
        assert_eq!(
            registry.lookup("grid_reduce_max"),
            Some(&GpuIntrinsic::GridReduceMax)
        );

        assert_eq!(
            GpuIntrinsic::GridReduceSum.to_cuda_string(),
            "__grid_reduce_sum"
        );
        assert_eq!(
            GpuIntrinsic::GridReduceMin.to_cuda_string(),
            "__grid_reduce_min"
        );
        assert_eq!(
            GpuIntrinsic::GridReduceMax.to_cuda_string(),
            "__grid_reduce_max"
        );

        assert_eq!(GpuIntrinsic::GridReduceSum.category(), "reduction");
    }

    #[test]
    fn test_reduce_and_broadcast_intrinsic() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("reduce_and_broadcast"),
            Some(&GpuIntrinsic::ReduceAndBroadcast)
        );
        assert_eq!(
            registry.lookup("reduce_broadcast"),
            Some(&GpuIntrinsic::ReduceAndBroadcast)
        );

        assert_eq!(
            GpuIntrinsic::ReduceAndBroadcast.to_cuda_string(),
            "__reduce_and_broadcast"
        );

        assert_eq!(GpuIntrinsic::ReduceAndBroadcast.category(), "reduction");
    }
}