ringkernel_cuda_codegen/
intrinsics.rs

1//! GPU intrinsic mapping for CUDA code generation.
2//!
3//! This module provides mappings from high-level Rust operations to
4//! CUDA intrinsics and built-in functions.
5
6use std::collections::HashMap;
7
8/// GPU intrinsic operations.
9#[derive(Debug, Clone, PartialEq)]
10pub enum GpuIntrinsic {
11    // === Synchronization ===
12    /// Thread synchronization within a block.
13    SyncThreads,
14    /// Thread fence (memory ordering across device).
15    ThreadFence,
16    /// Thread fence within block.
17    ThreadFenceBlock,
18    /// Thread fence across system.
19    ThreadFenceSystem,
20    /// Synchronize threads with predicate.
21    SyncThreadsCount,
22    /// Synchronize threads with AND predicate.
23    SyncThreadsAnd,
24    /// Synchronize threads with OR predicate.
25    SyncThreadsOr,
26
27    // === Atomic Operations (Integer) ===
28    /// Atomic add.
29    AtomicAdd,
30    /// Atomic subtract.
31    AtomicSub,
32    /// Atomic minimum.
33    AtomicMin,
34    /// Atomic maximum.
35    AtomicMax,
36    /// Atomic exchange.
37    AtomicExch,
38    /// Atomic compare-and-swap.
39    AtomicCas,
40    /// Atomic bitwise AND.
41    AtomicAnd,
42    /// Atomic bitwise OR.
43    AtomicOr,
44    /// Atomic bitwise XOR.
45    AtomicXor,
46    /// Atomic increment (with wrap).
47    AtomicInc,
48    /// Atomic decrement (with wrap).
49    AtomicDec,
50
51    // === Basic Math Functions ===
52    /// Square root.
53    Sqrt,
54    /// Reciprocal square root.
55    Rsqrt,
56    /// Absolute value (integer).
57    Abs,
58    /// Absolute value (floating point).
59    Fabs,
60    /// Floor.
61    Floor,
62    /// Ceiling.
63    Ceil,
64    /// Round to nearest.
65    Round,
66    /// Truncate toward zero.
67    Trunc,
68    /// Fused multiply-add.
69    Fma,
70    /// Minimum.
71    Min,
72    /// Maximum.
73    Max,
74    /// Floating-point modulo.
75    Fmod,
76    /// Remainder.
77    Remainder,
78    /// Copy sign.
79    Copysign,
80    /// Cube root.
81    Cbrt,
82    /// Hypotenuse.
83    Hypot,
84
85    // === Trigonometric Functions ===
86    /// Sine.
87    Sin,
88    /// Cosine.
89    Cos,
90    /// Tangent.
91    Tan,
92    /// Arcsine.
93    Asin,
94    /// Arccosine.
95    Acos,
96    /// Arctangent.
97    Atan,
98    /// Two-argument arctangent.
99    Atan2,
100    /// Sine and cosine (combined).
101    Sincos,
102    /// Sine of pi*x.
103    Sinpi,
104    /// Cosine of pi*x.
105    Cospi,
106
107    // === Hyperbolic Functions ===
108    /// Hyperbolic sine.
109    Sinh,
110    /// Hyperbolic cosine.
111    Cosh,
112    /// Hyperbolic tangent.
113    Tanh,
114    /// Inverse hyperbolic sine.
115    Asinh,
116    /// Inverse hyperbolic cosine.
117    Acosh,
118    /// Inverse hyperbolic tangent.
119    Atanh,
120
121    // === Exponential and Logarithmic Functions ===
122    /// Exponential (base e).
123    Exp,
124    /// Exponential (base 2).
125    Exp2,
126    /// Exponential (base 10).
127    Exp10,
128    /// exp(x) - 1 (accurate for small x).
129    Expm1,
130    /// Natural logarithm (base e).
131    Log,
132    /// Logarithm (base 2).
133    Log2,
134    /// Logarithm (base 10).
135    Log10,
136    /// log(1 + x) (accurate for small x).
137    Log1p,
138    /// Power.
139    Pow,
140    /// Load exponent.
141    Ldexp,
142    /// Scale by power of 2.
143    Scalbn,
144    /// Extract exponent.
145    Ilogb,
146    /// Logarithm of gamma function.
147    Lgamma,
148    /// Gamma function.
149    Tgamma,
150    /// Error function.
151    Erf,
152    /// Complementary error function.
153    Erfc,
154    /// Inverse error function.
155    Erfinv,
156    /// Inverse complementary error function.
157    Erfcinv,
158
159    // === Classification and Comparison ===
160    /// Check if NaN.
161    Isnan,
162    /// Check if infinite.
163    Isinf,
164    /// Check if finite.
165    Isfinite,
166    /// Check if normal.
167    Isnormal,
168    /// Check sign bit.
169    Signbit,
170    /// Next representable value.
171    Nextafter,
172    /// Floating-point difference.
173    Fdim,
174    /// Not-a-Number.
175    Nan,
176
177    // === Warp-Level Operations ===
178    /// Warp shuffle.
179    WarpShfl,
180    /// Warp shuffle up.
181    WarpShflUp,
182    /// Warp shuffle down.
183    WarpShflDown,
184    /// Warp shuffle XOR.
185    WarpShflXor,
186    /// Get active thread mask.
187    WarpActiveMask,
188    /// Warp ballot.
189    WarpBallot,
190    /// Warp all predicate.
191    WarpAll,
192    /// Warp any predicate.
193    WarpAny,
194    /// Warp match any.
195    WarpMatchAny,
196    /// Warp match all.
197    WarpMatchAll,
198    /// Warp reduce add.
199    WarpReduceAdd,
200    /// Warp reduce min.
201    WarpReduceMin,
202    /// Warp reduce max.
203    WarpReduceMax,
204    /// Warp reduce AND.
205    WarpReduceAnd,
206    /// Warp reduce OR.
207    WarpReduceOr,
208    /// Warp reduce XOR.
209    WarpReduceXor,
210
211    // === Bit Manipulation ===
212    /// Population count (count set bits).
213    Popc,
214    /// Count leading zeros.
215    Clz,
216    /// Count trailing zeros (via ffs).
217    Ctz,
218    /// Find first set bit.
219    Ffs,
220    /// Bit reverse.
221    Brev,
222    /// Byte permute.
223    BytePerm,
224    /// Funnel shift left.
225    FunnelShiftLeft,
226    /// Funnel shift right.
227    FunnelShiftRight,
228
229    // === Memory Operations ===
230    /// Read-only cache load.
231    Ldg,
232    /// Prefetch L1.
233    PrefetchL1,
234    /// Prefetch L2.
235    PrefetchL2,
236
237    // === Special Functions ===
238    /// Reciprocal.
239    Rcp,
240    /// Division (fast).
241    Fdividef,
242    /// Saturate to [0,1].
243    Saturate,
244    /// Bessel J0.
245    J0,
246    /// Bessel J1.
247    J1,
248    /// Bessel Jn.
249    Jn,
250    /// Bessel Y0.
251    Y0,
252    /// Bessel Y1.
253    Y1,
254    /// Bessel Yn.
255    Yn,
256    /// Normal CDF.
257    Normcdf,
258    /// Inverse normal CDF.
259    Normcdfinv,
260    /// Cylindrical Bessel I0.
261    CylBesselI0,
262    /// Cylindrical Bessel I1.
263    CylBesselI1,
264
265    // === CUDA Thread/Block Indices ===
266    /// Thread index X.
267    ThreadIdxX,
268    /// Thread index Y.
269    ThreadIdxY,
270    /// Thread index Z.
271    ThreadIdxZ,
272    /// Block index X.
273    BlockIdxX,
274    /// Block index Y.
275    BlockIdxY,
276    /// Block index Z.
277    BlockIdxZ,
278    /// Block dimension X.
279    BlockDimX,
280    /// Block dimension Y.
281    BlockDimY,
282    /// Block dimension Z.
283    BlockDimZ,
284    /// Grid dimension X.
285    GridDimX,
286    /// Grid dimension Y.
287    GridDimY,
288    /// Grid dimension Z.
289    GridDimZ,
290    /// Warp size (always 32).
291    WarpSize,
292
293    // === Clock and Timing ===
294    /// Read clock counter.
295    Clock,
296    /// Read 64-bit clock counter.
297    Clock64,
298    /// Nanosleep.
299    Nanosleep,
300}
301
302impl GpuIntrinsic {
303    /// Convert to CUDA function/intrinsic name.
304    pub fn to_cuda_string(&self) -> &'static str {
305        match self {
306            // Synchronization
307            GpuIntrinsic::SyncThreads => "__syncthreads()",
308            GpuIntrinsic::ThreadFence => "__threadfence()",
309            GpuIntrinsic::ThreadFenceBlock => "__threadfence_block()",
310            GpuIntrinsic::ThreadFenceSystem => "__threadfence_system()",
311            GpuIntrinsic::SyncThreadsCount => "__syncthreads_count",
312            GpuIntrinsic::SyncThreadsAnd => "__syncthreads_and",
313            GpuIntrinsic::SyncThreadsOr => "__syncthreads_or",
314
315            // Atomic operations
316            GpuIntrinsic::AtomicAdd => "atomicAdd",
317            GpuIntrinsic::AtomicSub => "atomicSub",
318            GpuIntrinsic::AtomicMin => "atomicMin",
319            GpuIntrinsic::AtomicMax => "atomicMax",
320            GpuIntrinsic::AtomicExch => "atomicExch",
321            GpuIntrinsic::AtomicCas => "atomicCAS",
322            GpuIntrinsic::AtomicAnd => "atomicAnd",
323            GpuIntrinsic::AtomicOr => "atomicOr",
324            GpuIntrinsic::AtomicXor => "atomicXor",
325            GpuIntrinsic::AtomicInc => "atomicInc",
326            GpuIntrinsic::AtomicDec => "atomicDec",
327
328            // Basic math
329            GpuIntrinsic::Sqrt => "sqrtf",
330            GpuIntrinsic::Rsqrt => "rsqrtf",
331            GpuIntrinsic::Abs => "abs",
332            GpuIntrinsic::Fabs => "fabsf",
333            GpuIntrinsic::Floor => "floorf",
334            GpuIntrinsic::Ceil => "ceilf",
335            GpuIntrinsic::Round => "roundf",
336            GpuIntrinsic::Trunc => "truncf",
337            GpuIntrinsic::Fma => "fmaf",
338            GpuIntrinsic::Min => "fminf",
339            GpuIntrinsic::Max => "fmaxf",
340            GpuIntrinsic::Fmod => "fmodf",
341            GpuIntrinsic::Remainder => "remainderf",
342            GpuIntrinsic::Copysign => "copysignf",
343            GpuIntrinsic::Cbrt => "cbrtf",
344            GpuIntrinsic::Hypot => "hypotf",
345
346            // Trigonometric
347            GpuIntrinsic::Sin => "sinf",
348            GpuIntrinsic::Cos => "cosf",
349            GpuIntrinsic::Tan => "tanf",
350            GpuIntrinsic::Asin => "asinf",
351            GpuIntrinsic::Acos => "acosf",
352            GpuIntrinsic::Atan => "atanf",
353            GpuIntrinsic::Atan2 => "atan2f",
354            GpuIntrinsic::Sincos => "sincosf",
355            GpuIntrinsic::Sinpi => "sinpif",
356            GpuIntrinsic::Cospi => "cospif",
357
358            // Hyperbolic
359            GpuIntrinsic::Sinh => "sinhf",
360            GpuIntrinsic::Cosh => "coshf",
361            GpuIntrinsic::Tanh => "tanhf",
362            GpuIntrinsic::Asinh => "asinhf",
363            GpuIntrinsic::Acosh => "acoshf",
364            GpuIntrinsic::Atanh => "atanhf",
365
366            // Exponential and logarithmic
367            GpuIntrinsic::Exp => "expf",
368            GpuIntrinsic::Exp2 => "exp2f",
369            GpuIntrinsic::Exp10 => "exp10f",
370            GpuIntrinsic::Expm1 => "expm1f",
371            GpuIntrinsic::Log => "logf",
372            GpuIntrinsic::Log2 => "log2f",
373            GpuIntrinsic::Log10 => "log10f",
374            GpuIntrinsic::Log1p => "log1pf",
375            GpuIntrinsic::Pow => "powf",
376            GpuIntrinsic::Ldexp => "ldexpf",
377            GpuIntrinsic::Scalbn => "scalbnf",
378            GpuIntrinsic::Ilogb => "ilogbf",
379            GpuIntrinsic::Lgamma => "lgammaf",
380            GpuIntrinsic::Tgamma => "tgammaf",
381            GpuIntrinsic::Erf => "erff",
382            GpuIntrinsic::Erfc => "erfcf",
383            GpuIntrinsic::Erfinv => "erfinvf",
384            GpuIntrinsic::Erfcinv => "erfcinvf",
385
386            // Classification and comparison
387            GpuIntrinsic::Isnan => "isnan",
388            GpuIntrinsic::Isinf => "isinf",
389            GpuIntrinsic::Isfinite => "isfinite",
390            GpuIntrinsic::Isnormal => "isnormal",
391            GpuIntrinsic::Signbit => "signbit",
392            GpuIntrinsic::Nextafter => "nextafterf",
393            GpuIntrinsic::Fdim => "fdimf",
394            GpuIntrinsic::Nan => "nanf",
395
396            // Warp-level operations
397            GpuIntrinsic::WarpShfl => "__shfl_sync",
398            GpuIntrinsic::WarpShflUp => "__shfl_up_sync",
399            GpuIntrinsic::WarpShflDown => "__shfl_down_sync",
400            GpuIntrinsic::WarpShflXor => "__shfl_xor_sync",
401            GpuIntrinsic::WarpActiveMask => "__activemask()",
402            GpuIntrinsic::WarpBallot => "__ballot_sync",
403            GpuIntrinsic::WarpAll => "__all_sync",
404            GpuIntrinsic::WarpAny => "__any_sync",
405            GpuIntrinsic::WarpMatchAny => "__match_any_sync",
406            GpuIntrinsic::WarpMatchAll => "__match_all_sync",
407            GpuIntrinsic::WarpReduceAdd => "__reduce_add_sync",
408            GpuIntrinsic::WarpReduceMin => "__reduce_min_sync",
409            GpuIntrinsic::WarpReduceMax => "__reduce_max_sync",
410            GpuIntrinsic::WarpReduceAnd => "__reduce_and_sync",
411            GpuIntrinsic::WarpReduceOr => "__reduce_or_sync",
412            GpuIntrinsic::WarpReduceXor => "__reduce_xor_sync",
413
414            // Bit manipulation
415            GpuIntrinsic::Popc => "__popc",
416            GpuIntrinsic::Clz => "__clz",
417            GpuIntrinsic::Ctz => "__ffs", // ffs returns 1 + ctz, but commonly used
418            GpuIntrinsic::Ffs => "__ffs",
419            GpuIntrinsic::Brev => "__brev",
420            GpuIntrinsic::BytePerm => "__byte_perm",
421            GpuIntrinsic::FunnelShiftLeft => "__funnelshift_l",
422            GpuIntrinsic::FunnelShiftRight => "__funnelshift_r",
423
424            // Memory operations
425            GpuIntrinsic::Ldg => "__ldg",
426            GpuIntrinsic::PrefetchL1 => "__prefetch_l1",
427            GpuIntrinsic::PrefetchL2 => "__prefetch_l2",
428
429            // Special functions
430            GpuIntrinsic::Rcp => "__frcp_rn",
431            GpuIntrinsic::Fdividef => "__fdividef",
432            GpuIntrinsic::Saturate => "__saturatef",
433            GpuIntrinsic::J0 => "j0f",
434            GpuIntrinsic::J1 => "j1f",
435            GpuIntrinsic::Jn => "jnf",
436            GpuIntrinsic::Y0 => "y0f",
437            GpuIntrinsic::Y1 => "y1f",
438            GpuIntrinsic::Yn => "ynf",
439            GpuIntrinsic::Normcdf => "normcdff",
440            GpuIntrinsic::Normcdfinv => "normcdfinvf",
441            GpuIntrinsic::CylBesselI0 => "cyl_bessel_i0f",
442            GpuIntrinsic::CylBesselI1 => "cyl_bessel_i1f",
443
444            // Thread/block indices
445            GpuIntrinsic::ThreadIdxX => "threadIdx.x",
446            GpuIntrinsic::ThreadIdxY => "threadIdx.y",
447            GpuIntrinsic::ThreadIdxZ => "threadIdx.z",
448            GpuIntrinsic::BlockIdxX => "blockIdx.x",
449            GpuIntrinsic::BlockIdxY => "blockIdx.y",
450            GpuIntrinsic::BlockIdxZ => "blockIdx.z",
451            GpuIntrinsic::BlockDimX => "blockDim.x",
452            GpuIntrinsic::BlockDimY => "blockDim.y",
453            GpuIntrinsic::BlockDimZ => "blockDim.z",
454            GpuIntrinsic::GridDimX => "gridDim.x",
455            GpuIntrinsic::GridDimY => "gridDim.y",
456            GpuIntrinsic::GridDimZ => "gridDim.z",
457            GpuIntrinsic::WarpSize => "warpSize",
458
459            // Clock and timing
460            GpuIntrinsic::Clock => "clock()",
461            GpuIntrinsic::Clock64 => "clock64()",
462            GpuIntrinsic::Nanosleep => "__nanosleep",
463        }
464    }
465
466    /// Check if this intrinsic is a value (no parentheses needed).
467    pub fn is_value_intrinsic(&self) -> bool {
468        matches!(
469            self,
470            GpuIntrinsic::ThreadIdxX
471                | GpuIntrinsic::ThreadIdxY
472                | GpuIntrinsic::ThreadIdxZ
473                | GpuIntrinsic::BlockIdxX
474                | GpuIntrinsic::BlockIdxY
475                | GpuIntrinsic::BlockIdxZ
476                | GpuIntrinsic::BlockDimX
477                | GpuIntrinsic::BlockDimY
478                | GpuIntrinsic::BlockDimZ
479                | GpuIntrinsic::GridDimX
480                | GpuIntrinsic::GridDimY
481                | GpuIntrinsic::GridDimZ
482                | GpuIntrinsic::WarpSize
483        )
484    }
485
486    /// Check if this intrinsic is a zero-argument function (ends with ()).
487    pub fn is_zero_arg_function(&self) -> bool {
488        matches!(
489            self,
490            GpuIntrinsic::SyncThreads
491                | GpuIntrinsic::ThreadFence
492                | GpuIntrinsic::ThreadFenceBlock
493                | GpuIntrinsic::ThreadFenceSystem
494                | GpuIntrinsic::WarpActiveMask
495                | GpuIntrinsic::Clock
496                | GpuIntrinsic::Clock64
497        )
498    }
499
500    /// Check if this intrinsic requires a mask argument (warp operations).
501    pub fn requires_mask(&self) -> bool {
502        matches!(
503            self,
504            GpuIntrinsic::WarpShfl
505                | GpuIntrinsic::WarpShflUp
506                | GpuIntrinsic::WarpShflDown
507                | GpuIntrinsic::WarpShflXor
508                | GpuIntrinsic::WarpBallot
509                | GpuIntrinsic::WarpAll
510                | GpuIntrinsic::WarpAny
511                | GpuIntrinsic::WarpMatchAny
512                | GpuIntrinsic::WarpMatchAll
513                | GpuIntrinsic::WarpReduceAdd
514                | GpuIntrinsic::WarpReduceMin
515                | GpuIntrinsic::WarpReduceMax
516                | GpuIntrinsic::WarpReduceAnd
517                | GpuIntrinsic::WarpReduceOr
518                | GpuIntrinsic::WarpReduceXor
519        )
520    }
521
522    /// Get the category of this intrinsic for documentation purposes.
523    pub fn category(&self) -> &'static str {
524        match self {
525            GpuIntrinsic::SyncThreads
526            | GpuIntrinsic::ThreadFence
527            | GpuIntrinsic::ThreadFenceBlock
528            | GpuIntrinsic::ThreadFenceSystem
529            | GpuIntrinsic::SyncThreadsCount
530            | GpuIntrinsic::SyncThreadsAnd
531            | GpuIntrinsic::SyncThreadsOr => "synchronization",
532
533            GpuIntrinsic::AtomicAdd
534            | GpuIntrinsic::AtomicSub
535            | GpuIntrinsic::AtomicMin
536            | GpuIntrinsic::AtomicMax
537            | GpuIntrinsic::AtomicExch
538            | GpuIntrinsic::AtomicCas
539            | GpuIntrinsic::AtomicAnd
540            | GpuIntrinsic::AtomicOr
541            | GpuIntrinsic::AtomicXor
542            | GpuIntrinsic::AtomicInc
543            | GpuIntrinsic::AtomicDec => "atomic",
544
545            GpuIntrinsic::Sqrt
546            | GpuIntrinsic::Rsqrt
547            | GpuIntrinsic::Abs
548            | GpuIntrinsic::Fabs
549            | GpuIntrinsic::Floor
550            | GpuIntrinsic::Ceil
551            | GpuIntrinsic::Round
552            | GpuIntrinsic::Trunc
553            | GpuIntrinsic::Fma
554            | GpuIntrinsic::Min
555            | GpuIntrinsic::Max
556            | GpuIntrinsic::Fmod
557            | GpuIntrinsic::Remainder
558            | GpuIntrinsic::Copysign
559            | GpuIntrinsic::Cbrt
560            | GpuIntrinsic::Hypot => "math",
561
562            GpuIntrinsic::Sin
563            | GpuIntrinsic::Cos
564            | GpuIntrinsic::Tan
565            | GpuIntrinsic::Asin
566            | GpuIntrinsic::Acos
567            | GpuIntrinsic::Atan
568            | GpuIntrinsic::Atan2
569            | GpuIntrinsic::Sincos
570            | GpuIntrinsic::Sinpi
571            | GpuIntrinsic::Cospi => "trigonometric",
572
573            GpuIntrinsic::Sinh
574            | GpuIntrinsic::Cosh
575            | GpuIntrinsic::Tanh
576            | GpuIntrinsic::Asinh
577            | GpuIntrinsic::Acosh
578            | GpuIntrinsic::Atanh => "hyperbolic",
579
580            GpuIntrinsic::Exp
581            | GpuIntrinsic::Exp2
582            | GpuIntrinsic::Exp10
583            | GpuIntrinsic::Expm1
584            | GpuIntrinsic::Log
585            | GpuIntrinsic::Log2
586            | GpuIntrinsic::Log10
587            | GpuIntrinsic::Log1p
588            | GpuIntrinsic::Pow
589            | GpuIntrinsic::Ldexp
590            | GpuIntrinsic::Scalbn
591            | GpuIntrinsic::Ilogb
592            | GpuIntrinsic::Lgamma
593            | GpuIntrinsic::Tgamma
594            | GpuIntrinsic::Erf
595            | GpuIntrinsic::Erfc
596            | GpuIntrinsic::Erfinv
597            | GpuIntrinsic::Erfcinv => "exponential",
598
599            GpuIntrinsic::Isnan
600            | GpuIntrinsic::Isinf
601            | GpuIntrinsic::Isfinite
602            | GpuIntrinsic::Isnormal
603            | GpuIntrinsic::Signbit
604            | GpuIntrinsic::Nextafter
605            | GpuIntrinsic::Fdim
606            | GpuIntrinsic::Nan => "classification",
607
608            GpuIntrinsic::WarpShfl
609            | GpuIntrinsic::WarpShflUp
610            | GpuIntrinsic::WarpShflDown
611            | GpuIntrinsic::WarpShflXor
612            | GpuIntrinsic::WarpActiveMask
613            | GpuIntrinsic::WarpBallot
614            | GpuIntrinsic::WarpAll
615            | GpuIntrinsic::WarpAny
616            | GpuIntrinsic::WarpMatchAny
617            | GpuIntrinsic::WarpMatchAll
618            | GpuIntrinsic::WarpReduceAdd
619            | GpuIntrinsic::WarpReduceMin
620            | GpuIntrinsic::WarpReduceMax
621            | GpuIntrinsic::WarpReduceAnd
622            | GpuIntrinsic::WarpReduceOr
623            | GpuIntrinsic::WarpReduceXor => "warp",
624
625            GpuIntrinsic::Popc
626            | GpuIntrinsic::Clz
627            | GpuIntrinsic::Ctz
628            | GpuIntrinsic::Ffs
629            | GpuIntrinsic::Brev
630            | GpuIntrinsic::BytePerm
631            | GpuIntrinsic::FunnelShiftLeft
632            | GpuIntrinsic::FunnelShiftRight => "bit",
633
634            GpuIntrinsic::Ldg | GpuIntrinsic::PrefetchL1 | GpuIntrinsic::PrefetchL2 => "memory",
635
636            GpuIntrinsic::Rcp
637            | GpuIntrinsic::Fdividef
638            | GpuIntrinsic::Saturate
639            | GpuIntrinsic::J0
640            | GpuIntrinsic::J1
641            | GpuIntrinsic::Jn
642            | GpuIntrinsic::Y0
643            | GpuIntrinsic::Y1
644            | GpuIntrinsic::Yn
645            | GpuIntrinsic::Normcdf
646            | GpuIntrinsic::Normcdfinv
647            | GpuIntrinsic::CylBesselI0
648            | GpuIntrinsic::CylBesselI1 => "special",
649
650            GpuIntrinsic::ThreadIdxX
651            | GpuIntrinsic::ThreadIdxY
652            | GpuIntrinsic::ThreadIdxZ
653            | GpuIntrinsic::BlockIdxX
654            | GpuIntrinsic::BlockIdxY
655            | GpuIntrinsic::BlockIdxZ
656            | GpuIntrinsic::BlockDimX
657            | GpuIntrinsic::BlockDimY
658            | GpuIntrinsic::BlockDimZ
659            | GpuIntrinsic::GridDimX
660            | GpuIntrinsic::GridDimY
661            | GpuIntrinsic::GridDimZ
662            | GpuIntrinsic::WarpSize => "index",
663
664            GpuIntrinsic::Clock | GpuIntrinsic::Clock64 | GpuIntrinsic::Nanosleep => "timing",
665        }
666    }
667}
668
669/// Registry for mapping Rust function names to GPU intrinsics.
670#[derive(Debug)]
671pub struct IntrinsicRegistry {
672    mappings: HashMap<String, GpuIntrinsic>,
673}
674
675impl Default for IntrinsicRegistry {
676    fn default() -> Self {
677        Self::new()
678    }
679}
680
681impl IntrinsicRegistry {
682    /// Create a new registry with default mappings.
683    pub fn new() -> Self {
684        let mut mappings = HashMap::new();
685
686        // === Synchronization ===
687        mappings.insert("sync_threads".to_string(), GpuIntrinsic::SyncThreads);
688        mappings.insert("thread_fence".to_string(), GpuIntrinsic::ThreadFence);
689        mappings.insert(
690            "thread_fence_block".to_string(),
691            GpuIntrinsic::ThreadFenceBlock,
692        );
693        mappings.insert(
694            "thread_fence_system".to_string(),
695            GpuIntrinsic::ThreadFenceSystem,
696        );
697        mappings.insert(
698            "sync_threads_count".to_string(),
699            GpuIntrinsic::SyncThreadsCount,
700        );
701        mappings.insert("sync_threads_and".to_string(), GpuIntrinsic::SyncThreadsAnd);
702        mappings.insert("sync_threads_or".to_string(), GpuIntrinsic::SyncThreadsOr);
703
704        // === Atomic operations ===
705        mappings.insert("atomic_add".to_string(), GpuIntrinsic::AtomicAdd);
706        mappings.insert("atomic_sub".to_string(), GpuIntrinsic::AtomicSub);
707        mappings.insert("atomic_min".to_string(), GpuIntrinsic::AtomicMin);
708        mappings.insert("atomic_max".to_string(), GpuIntrinsic::AtomicMax);
709        mappings.insert("atomic_exchange".to_string(), GpuIntrinsic::AtomicExch);
710        mappings.insert("atomic_exch".to_string(), GpuIntrinsic::AtomicExch);
711        mappings.insert("atomic_cas".to_string(), GpuIntrinsic::AtomicCas);
712        mappings.insert("atomic_compare_swap".to_string(), GpuIntrinsic::AtomicCas);
713        mappings.insert("atomic_and".to_string(), GpuIntrinsic::AtomicAnd);
714        mappings.insert("atomic_or".to_string(), GpuIntrinsic::AtomicOr);
715        mappings.insert("atomic_xor".to_string(), GpuIntrinsic::AtomicXor);
716        mappings.insert("atomic_inc".to_string(), GpuIntrinsic::AtomicInc);
717        mappings.insert("atomic_dec".to_string(), GpuIntrinsic::AtomicDec);
718
719        // === Basic math functions ===
720        mappings.insert("sqrt".to_string(), GpuIntrinsic::Sqrt);
721        mappings.insert("rsqrt".to_string(), GpuIntrinsic::Rsqrt);
722        mappings.insert("abs".to_string(), GpuIntrinsic::Fabs);
723        mappings.insert("fabs".to_string(), GpuIntrinsic::Fabs);
724        mappings.insert("floor".to_string(), GpuIntrinsic::Floor);
725        mappings.insert("ceil".to_string(), GpuIntrinsic::Ceil);
726        mappings.insert("round".to_string(), GpuIntrinsic::Round);
727        mappings.insert("trunc".to_string(), GpuIntrinsic::Trunc);
728        mappings.insert("mul_add".to_string(), GpuIntrinsic::Fma);
729        mappings.insert("fma".to_string(), GpuIntrinsic::Fma);
730        mappings.insert("min".to_string(), GpuIntrinsic::Min);
731        mappings.insert("max".to_string(), GpuIntrinsic::Max);
732        mappings.insert("fmin".to_string(), GpuIntrinsic::Min);
733        mappings.insert("fmax".to_string(), GpuIntrinsic::Max);
734        mappings.insert("fmod".to_string(), GpuIntrinsic::Fmod);
735        mappings.insert("remainder".to_string(), GpuIntrinsic::Remainder);
736        mappings.insert("copysign".to_string(), GpuIntrinsic::Copysign);
737        mappings.insert("cbrt".to_string(), GpuIntrinsic::Cbrt);
738        mappings.insert("hypot".to_string(), GpuIntrinsic::Hypot);
739
740        // === Trigonometric functions ===
741        mappings.insert("sin".to_string(), GpuIntrinsic::Sin);
742        mappings.insert("cos".to_string(), GpuIntrinsic::Cos);
743        mappings.insert("tan".to_string(), GpuIntrinsic::Tan);
744        mappings.insert("asin".to_string(), GpuIntrinsic::Asin);
745        mappings.insert("acos".to_string(), GpuIntrinsic::Acos);
746        mappings.insert("atan".to_string(), GpuIntrinsic::Atan);
747        mappings.insert("atan2".to_string(), GpuIntrinsic::Atan2);
748        mappings.insert("sincos".to_string(), GpuIntrinsic::Sincos);
749        mappings.insert("sinpi".to_string(), GpuIntrinsic::Sinpi);
750        mappings.insert("cospi".to_string(), GpuIntrinsic::Cospi);
751
752        // === Hyperbolic functions ===
753        mappings.insert("sinh".to_string(), GpuIntrinsic::Sinh);
754        mappings.insert("cosh".to_string(), GpuIntrinsic::Cosh);
755        mappings.insert("tanh".to_string(), GpuIntrinsic::Tanh);
756        mappings.insert("asinh".to_string(), GpuIntrinsic::Asinh);
757        mappings.insert("acosh".to_string(), GpuIntrinsic::Acosh);
758        mappings.insert("atanh".to_string(), GpuIntrinsic::Atanh);
759
760        // === Exponential and logarithmic ===
761        mappings.insert("exp".to_string(), GpuIntrinsic::Exp);
762        mappings.insert("exp2".to_string(), GpuIntrinsic::Exp2);
763        mappings.insert("exp10".to_string(), GpuIntrinsic::Exp10);
764        mappings.insert("expm1".to_string(), GpuIntrinsic::Expm1);
765        mappings.insert("ln".to_string(), GpuIntrinsic::Log);
766        mappings.insert("log".to_string(), GpuIntrinsic::Log);
767        mappings.insert("log2".to_string(), GpuIntrinsic::Log2);
768        mappings.insert("log10".to_string(), GpuIntrinsic::Log10);
769        mappings.insert("log1p".to_string(), GpuIntrinsic::Log1p);
770        mappings.insert("powf".to_string(), GpuIntrinsic::Pow);
771        mappings.insert("powi".to_string(), GpuIntrinsic::Pow);
772        mappings.insert("pow".to_string(), GpuIntrinsic::Pow);
773        mappings.insert("ldexp".to_string(), GpuIntrinsic::Ldexp);
774        mappings.insert("scalbn".to_string(), GpuIntrinsic::Scalbn);
775        mappings.insert("ilogb".to_string(), GpuIntrinsic::Ilogb);
776        mappings.insert("lgamma".to_string(), GpuIntrinsic::Lgamma);
777        mappings.insert("tgamma".to_string(), GpuIntrinsic::Tgamma);
778        mappings.insert("gamma".to_string(), GpuIntrinsic::Tgamma);
779        mappings.insert("erf".to_string(), GpuIntrinsic::Erf);
780        mappings.insert("erfc".to_string(), GpuIntrinsic::Erfc);
781        mappings.insert("erfinv".to_string(), GpuIntrinsic::Erfinv);
782        mappings.insert("erfcinv".to_string(), GpuIntrinsic::Erfcinv);
783
784        // === Classification and comparison ===
785        mappings.insert("is_nan".to_string(), GpuIntrinsic::Isnan);
786        mappings.insert("isnan".to_string(), GpuIntrinsic::Isnan);
787        mappings.insert("is_infinite".to_string(), GpuIntrinsic::Isinf);
788        mappings.insert("isinf".to_string(), GpuIntrinsic::Isinf);
789        mappings.insert("is_finite".to_string(), GpuIntrinsic::Isfinite);
790        mappings.insert("isfinite".to_string(), GpuIntrinsic::Isfinite);
791        mappings.insert("is_normal".to_string(), GpuIntrinsic::Isnormal);
792        mappings.insert("isnormal".to_string(), GpuIntrinsic::Isnormal);
793        mappings.insert("is_sign_negative".to_string(), GpuIntrinsic::Signbit);
794        mappings.insert("signbit".to_string(), GpuIntrinsic::Signbit);
795        mappings.insert("nextafter".to_string(), GpuIntrinsic::Nextafter);
796        mappings.insert("fdim".to_string(), GpuIntrinsic::Fdim);
797        mappings.insert("nan".to_string(), GpuIntrinsic::Nan);
798
799        // === Warp operations ===
800        mappings.insert("warp_shfl".to_string(), GpuIntrinsic::WarpShfl);
801        mappings.insert("warp_shuffle".to_string(), GpuIntrinsic::WarpShfl);
802        mappings.insert("warp_shfl_up".to_string(), GpuIntrinsic::WarpShflUp);
803        mappings.insert("warp_shuffle_up".to_string(), GpuIntrinsic::WarpShflUp);
804        mappings.insert("warp_shfl_down".to_string(), GpuIntrinsic::WarpShflDown);
805        mappings.insert("warp_shuffle_down".to_string(), GpuIntrinsic::WarpShflDown);
806        mappings.insert("warp_shfl_xor".to_string(), GpuIntrinsic::WarpShflXor);
807        mappings.insert("warp_shuffle_xor".to_string(), GpuIntrinsic::WarpShflXor);
808        mappings.insert("warp_active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
809        mappings.insert("active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
810        mappings.insert("warp_ballot".to_string(), GpuIntrinsic::WarpBallot);
811        mappings.insert("ballot".to_string(), GpuIntrinsic::WarpBallot);
812        mappings.insert("warp_all".to_string(), GpuIntrinsic::WarpAll);
813        mappings.insert("warp_any".to_string(), GpuIntrinsic::WarpAny);
814        mappings.insert("warp_match_any".to_string(), GpuIntrinsic::WarpMatchAny);
815        mappings.insert("warp_match_all".to_string(), GpuIntrinsic::WarpMatchAll);
816        mappings.insert("warp_reduce_add".to_string(), GpuIntrinsic::WarpReduceAdd);
817        mappings.insert("warp_reduce_min".to_string(), GpuIntrinsic::WarpReduceMin);
818        mappings.insert("warp_reduce_max".to_string(), GpuIntrinsic::WarpReduceMax);
819        mappings.insert("warp_reduce_and".to_string(), GpuIntrinsic::WarpReduceAnd);
820        mappings.insert("warp_reduce_or".to_string(), GpuIntrinsic::WarpReduceOr);
821        mappings.insert("warp_reduce_xor".to_string(), GpuIntrinsic::WarpReduceXor);
822
823        // === Bit manipulation ===
824        mappings.insert("popc".to_string(), GpuIntrinsic::Popc);
825        mappings.insert("popcount".to_string(), GpuIntrinsic::Popc);
826        mappings.insert("count_ones".to_string(), GpuIntrinsic::Popc);
827        mappings.insert("clz".to_string(), GpuIntrinsic::Clz);
828        mappings.insert("leading_zeros".to_string(), GpuIntrinsic::Clz);
829        mappings.insert("ctz".to_string(), GpuIntrinsic::Ctz);
830        mappings.insert("trailing_zeros".to_string(), GpuIntrinsic::Ctz);
831        mappings.insert("ffs".to_string(), GpuIntrinsic::Ffs);
832        mappings.insert("brev".to_string(), GpuIntrinsic::Brev);
833        mappings.insert("reverse_bits".to_string(), GpuIntrinsic::Brev);
834        mappings.insert("byte_perm".to_string(), GpuIntrinsic::BytePerm);
835        mappings.insert(
836            "funnel_shift_left".to_string(),
837            GpuIntrinsic::FunnelShiftLeft,
838        );
839        mappings.insert(
840            "funnel_shift_right".to_string(),
841            GpuIntrinsic::FunnelShiftRight,
842        );
843
844        // === Memory operations ===
845        mappings.insert("ldg".to_string(), GpuIntrinsic::Ldg);
846        mappings.insert("load_global".to_string(), GpuIntrinsic::Ldg);
847        mappings.insert("prefetch_l1".to_string(), GpuIntrinsic::PrefetchL1);
848        mappings.insert("prefetch_l2".to_string(), GpuIntrinsic::PrefetchL2);
849
850        // === Special functions ===
851        mappings.insert("rcp".to_string(), GpuIntrinsic::Rcp);
852        mappings.insert("recip".to_string(), GpuIntrinsic::Rcp);
853        mappings.insert("fdividef".to_string(), GpuIntrinsic::Fdividef);
854        mappings.insert("fast_div".to_string(), GpuIntrinsic::Fdividef);
855        mappings.insert("saturate".to_string(), GpuIntrinsic::Saturate);
856        mappings.insert("clamp_01".to_string(), GpuIntrinsic::Saturate);
857        mappings.insert("j0".to_string(), GpuIntrinsic::J0);
858        mappings.insert("j1".to_string(), GpuIntrinsic::J1);
859        mappings.insert("jn".to_string(), GpuIntrinsic::Jn);
860        mappings.insert("y0".to_string(), GpuIntrinsic::Y0);
861        mappings.insert("y1".to_string(), GpuIntrinsic::Y1);
862        mappings.insert("yn".to_string(), GpuIntrinsic::Yn);
863        mappings.insert("normcdf".to_string(), GpuIntrinsic::Normcdf);
864        mappings.insert("norm_cdf".to_string(), GpuIntrinsic::Normcdf);
865        mappings.insert("normcdfinv".to_string(), GpuIntrinsic::Normcdfinv);
866        mappings.insert("norm_cdf_inv".to_string(), GpuIntrinsic::Normcdfinv);
867        mappings.insert("cyl_bessel_i0".to_string(), GpuIntrinsic::CylBesselI0);
868        mappings.insert("cyl_bessel_i1".to_string(), GpuIntrinsic::CylBesselI1);
869
870        // === Thread/block indices ===
871        mappings.insert("thread_idx_x".to_string(), GpuIntrinsic::ThreadIdxX);
872        mappings.insert("thread_idx_y".to_string(), GpuIntrinsic::ThreadIdxY);
873        mappings.insert("thread_idx_z".to_string(), GpuIntrinsic::ThreadIdxZ);
874        mappings.insert("block_idx_x".to_string(), GpuIntrinsic::BlockIdxX);
875        mappings.insert("block_idx_y".to_string(), GpuIntrinsic::BlockIdxY);
876        mappings.insert("block_idx_z".to_string(), GpuIntrinsic::BlockIdxZ);
877        mappings.insert("block_dim_x".to_string(), GpuIntrinsic::BlockDimX);
878        mappings.insert("block_dim_y".to_string(), GpuIntrinsic::BlockDimY);
879        mappings.insert("block_dim_z".to_string(), GpuIntrinsic::BlockDimZ);
880        mappings.insert("grid_dim_x".to_string(), GpuIntrinsic::GridDimX);
881        mappings.insert("grid_dim_y".to_string(), GpuIntrinsic::GridDimY);
882        mappings.insert("grid_dim_z".to_string(), GpuIntrinsic::GridDimZ);
883        mappings.insert("warp_size".to_string(), GpuIntrinsic::WarpSize);
884
885        // === Clock and timing ===
886        mappings.insert("clock".to_string(), GpuIntrinsic::Clock);
887        mappings.insert("clock64".to_string(), GpuIntrinsic::Clock64);
888        mappings.insert("nanosleep".to_string(), GpuIntrinsic::Nanosleep);
889
890        Self { mappings }
891    }
892
893    /// Look up an intrinsic by Rust function name.
894    pub fn lookup(&self, name: &str) -> Option<&GpuIntrinsic> {
895        self.mappings.get(name)
896    }
897
898    /// Register a custom intrinsic mapping.
899    pub fn register(&mut self, rust_name: &str, intrinsic: GpuIntrinsic) {
900        self.mappings.insert(rust_name.to_string(), intrinsic);
901    }
902
903    /// Check if a name is a known intrinsic.
904    pub fn is_intrinsic(&self, name: &str) -> bool {
905        self.mappings.contains_key(name)
906    }
907}
908
909/// Stencil-specific intrinsics for neighbor access.
910///
911/// These are special intrinsics that the transpiler handles
912/// differently based on stencil configuration.
913#[derive(Debug, Clone, PartialEq)]
914pub enum StencilIntrinsic {
915    /// Get current cell index: `pos.idx()`
916    Index,
917    /// Access north neighbor: `pos.north(buf)`
918    North,
919    /// Access south neighbor: `pos.south(buf)`
920    South,
921    /// Access east neighbor: `pos.east(buf)`
922    East,
923    /// Access west neighbor: `pos.west(buf)`
924    West,
925    /// Access neighbor at offset: `pos.at(buf, dx, dy)`
926    At,
927    /// 3D: Access neighbor above: `pos.up(buf)`
928    Up,
929    /// 3D: Access neighbor below: `pos.down(buf)`
930    Down,
931}
932
933impl StencilIntrinsic {
934    /// Parse a method name to stencil intrinsic.
935    pub fn from_method_name(name: &str) -> Option<Self> {
936        match name {
937            "idx" => Some(StencilIntrinsic::Index),
938            "north" => Some(StencilIntrinsic::North),
939            "south" => Some(StencilIntrinsic::South),
940            "east" => Some(StencilIntrinsic::East),
941            "west" => Some(StencilIntrinsic::West),
942            "at" => Some(StencilIntrinsic::At),
943            "up" => Some(StencilIntrinsic::Up),
944            "down" => Some(StencilIntrinsic::Down),
945            _ => None,
946        }
947    }
948}
949
950/// Ring kernel intrinsics for persistent actor kernels.
951///
952/// These intrinsics provide access to control block state, queue operations,
953/// and HLC (Hybrid Logical Clock) functionality within ring kernel handlers.
954#[derive(Debug, Clone, Copy, PartialEq, Eq)]
955pub enum RingKernelIntrinsic {
956    // === Control Block Access ===
957    /// Check if kernel is active: `is_active()`
958    IsActive,
959    /// Check if termination requested: `should_terminate()`
960    ShouldTerminate,
961    /// Mark kernel as terminated: `mark_terminated()`
962    MarkTerminated,
963    /// Get messages processed count: `messages_processed()`
964    GetMessagesProcessed,
965
966    // === Queue Operations ===
967    /// Get input queue size: `input_queue_size()`
968    InputQueueSize,
969    /// Get output queue size: `output_queue_size()`
970    OutputQueueSize,
971    /// Check if input queue empty: `input_queue_empty()`
972    InputQueueEmpty,
973    /// Check if output queue empty: `output_queue_empty()`
974    OutputQueueEmpty,
975    /// Enqueue a response: `enqueue_response(&response)`
976    EnqueueResponse,
977
978    // === HLC Operations ===
979    /// Increment HLC logical counter: `hlc_tick()`
980    HlcTick,
981    /// Update HLC with received timestamp: `hlc_update(received_ts)`
982    HlcUpdate,
983    /// Get current HLC timestamp: `hlc_now()`
984    HlcNow,
985
986    // === K2K Operations ===
987    /// Send message to another kernel: `k2k_send(target_id, &msg)`
988    K2kSend,
989    /// Try to receive K2K message: `k2k_try_recv()`
990    K2kTryRecv,
991    /// Check for K2K messages: `k2k_has_message()`
992    K2kHasMessage,
993    /// Peek at next K2K message without consuming: `k2k_peek()`
994    K2kPeek,
995    /// Get number of pending K2K messages: `k2k_pending_count()`
996    K2kPendingCount,
997
998    // === Timing ===
999    /// Sleep for nanoseconds: `nanosleep(ns)`
1000    Nanosleep,
1001}
1002
1003impl RingKernelIntrinsic {
1004    /// Get the CUDA code for this intrinsic.
1005    pub fn to_cuda(&self, args: &[String]) -> String {
1006        match self {
1007            Self::IsActive => "atomicAdd(&control->is_active, 0) != 0".to_string(),
1008            Self::ShouldTerminate => "atomicAdd(&control->should_terminate, 0) != 0".to_string(),
1009            Self::MarkTerminated => "atomicExch(&control->has_terminated, 1)".to_string(),
1010            Self::GetMessagesProcessed => "atomicAdd(&control->messages_processed, 0)".to_string(),
1011
1012            Self::InputQueueSize => {
1013                "(atomicAdd(&control->input_head, 0) - atomicAdd(&control->input_tail, 0))"
1014                    .to_string()
1015            }
1016            Self::OutputQueueSize => {
1017                "(atomicAdd(&control->output_head, 0) - atomicAdd(&control->output_tail, 0))"
1018                    .to_string()
1019            }
1020            Self::InputQueueEmpty => {
1021                "(atomicAdd(&control->input_head, 0) == atomicAdd(&control->input_tail, 0))"
1022                    .to_string()
1023            }
1024            Self::OutputQueueEmpty => {
1025                "(atomicAdd(&control->output_head, 0) == atomicAdd(&control->output_tail, 0))"
1026                    .to_string()
1027            }
1028            Self::EnqueueResponse => {
1029                if !args.is_empty() {
1030                    format!(
1031                        "{{ unsigned long long _out_idx = atomicAdd(&control->output_head, 1) & control->output_mask; \
1032                         memcpy(&output_buffer[_out_idx * RESP_SIZE], {}, RESP_SIZE); }}",
1033                        args[0]
1034                    )
1035                } else {
1036                    "/* enqueue_response requires response pointer */".to_string()
1037                }
1038            }
1039
1040            Self::HlcTick => "hlc_logical++".to_string(),
1041            Self::HlcUpdate => {
1042                if !args.is_empty() {
1043                    format!(
1044                        "{{ if ({} > hlc_physical) {{ hlc_physical = {}; hlc_logical = 0; }} else {{ hlc_logical++; }} }}",
1045                        args[0], args[0]
1046                    )
1047                } else {
1048                    "hlc_logical++".to_string()
1049                }
1050            }
1051            Self::HlcNow => "(hlc_physical << 32) | (hlc_logical & 0xFFFFFFFF)".to_string(),
1052
1053            Self::K2kSend => {
1054                if args.len() >= 2 {
1055                    // k2k_send(target_id, msg_ptr) -> k2k_send(k2k_routes, target_id, msg_ptr, sizeof(*msg_ptr))
1056                    format!(
1057                        "k2k_send(k2k_routes, {}, {}, sizeof(*{}))",
1058                        args[0], args[1], args[1]
1059                    )
1060                } else {
1061                    "/* k2k_send requires target_id and msg_ptr */".to_string()
1062                }
1063            }
1064            Self::K2kTryRecv => "k2k_try_recv(k2k_inbox)".to_string(),
1065            Self::K2kHasMessage => "k2k_has_message(k2k_inbox)".to_string(),
1066            Self::K2kPeek => "k2k_peek(k2k_inbox)".to_string(),
1067            Self::K2kPendingCount => "k2k_pending_count(k2k_inbox)".to_string(),
1068
1069            Self::Nanosleep => {
1070                if !args.is_empty() {
1071                    format!("__nanosleep({})", args[0])
1072                } else {
1073                    "__nanosleep(1000)".to_string()
1074                }
1075            }
1076        }
1077    }
1078
1079    /// Parse a function name to get the intrinsic.
1080    pub fn from_name(name: &str) -> Option<Self> {
1081        match name {
1082            "is_active" | "is_kernel_active" => Some(Self::IsActive),
1083            "should_terminate" => Some(Self::ShouldTerminate),
1084            "mark_terminated" => Some(Self::MarkTerminated),
1085            "messages_processed" | "get_messages_processed" => Some(Self::GetMessagesProcessed),
1086
1087            "input_queue_size" => Some(Self::InputQueueSize),
1088            "output_queue_size" => Some(Self::OutputQueueSize),
1089            "input_queue_empty" => Some(Self::InputQueueEmpty),
1090            "output_queue_empty" => Some(Self::OutputQueueEmpty),
1091            "enqueue_response" | "enqueue" => Some(Self::EnqueueResponse),
1092
1093            "hlc_tick" => Some(Self::HlcTick),
1094            "hlc_update" => Some(Self::HlcUpdate),
1095            "hlc_now" => Some(Self::HlcNow),
1096
1097            "k2k_send" => Some(Self::K2kSend),
1098            "k2k_try_recv" => Some(Self::K2kTryRecv),
1099            "k2k_has_message" => Some(Self::K2kHasMessage),
1100            "k2k_peek" => Some(Self::K2kPeek),
1101            "k2k_pending_count" | "k2k_pending" => Some(Self::K2kPendingCount),
1102
1103            "nanosleep" => Some(Self::Nanosleep),
1104
1105            _ => None,
1106        }
1107    }
1108
1109    /// Check if this intrinsic requires the control block.
1110    pub fn requires_control_block(&self) -> bool {
1111        matches!(
1112            self,
1113            Self::IsActive
1114                | Self::ShouldTerminate
1115                | Self::MarkTerminated
1116                | Self::GetMessagesProcessed
1117                | Self::InputQueueSize
1118                | Self::OutputQueueSize
1119                | Self::InputQueueEmpty
1120                | Self::OutputQueueEmpty
1121                | Self::EnqueueResponse
1122        )
1123    }
1124
1125    /// Check if this intrinsic requires HLC state.
1126    pub fn requires_hlc(&self) -> bool {
1127        matches!(self, Self::HlcTick | Self::HlcUpdate | Self::HlcNow)
1128    }
1129
1130    /// Check if this intrinsic requires K2K support.
1131    pub fn requires_k2k(&self) -> bool {
1132        matches!(
1133            self,
1134            Self::K2kSend
1135                | Self::K2kTryRecv
1136                | Self::K2kHasMessage
1137                | Self::K2kPeek
1138                | Self::K2kPendingCount
1139        )
1140    }
1141}
1142
1143impl StencilIntrinsic {
1144    /// Get the index offset for 2D stencil (relative to buffer_width).
1145    ///
1146    /// Returns (row_offset, col_offset) where final offset is:
1147    /// `row_offset * buffer_width + col_offset`
1148    pub fn get_offset_2d(&self) -> Option<(i32, i32)> {
1149        match self {
1150            StencilIntrinsic::Index => Some((0, 0)),
1151            StencilIntrinsic::North => Some((-1, 0)),
1152            StencilIntrinsic::South => Some((1, 0)),
1153            StencilIntrinsic::East => Some((0, 1)),
1154            StencilIntrinsic::West => Some((0, -1)),
1155            StencilIntrinsic::At => None, // Requires runtime offset
1156            StencilIntrinsic::Up | StencilIntrinsic::Down => None, // 3D only
1157        }
1158    }
1159
1160    /// Get the index offset for 3D stencil.
1161    ///
1162    /// Returns (z_offset, row_offset, col_offset) where final offset is:
1163    /// `z_offset * buffer_slice + row_offset * buffer_width + col_offset`
1164    pub fn get_offset_3d(&self) -> Option<(i32, i32, i32)> {
1165        match self {
1166            StencilIntrinsic::Index => Some((0, 0, 0)),
1167            StencilIntrinsic::North => Some((0, -1, 0)),
1168            StencilIntrinsic::South => Some((0, 1, 0)),
1169            StencilIntrinsic::East => Some((0, 0, 1)),
1170            StencilIntrinsic::West => Some((0, 0, -1)),
1171            StencilIntrinsic::Up => Some((-1, 0, 0)),
1172            StencilIntrinsic::Down => Some((1, 0, 0)),
1173            StencilIntrinsic::At => None, // Requires runtime offset
1174        }
1175    }
1176
1177    /// Check if this is a 3D-only intrinsic.
1178    pub fn is_3d_only(&self) -> bool {
1179        matches!(self, StencilIntrinsic::Up | StencilIntrinsic::Down)
1180    }
1181
1182    /// Generate CUDA index expression for 2D stencil.
1183    ///
1184    /// # Arguments
1185    /// * `buffer_name` - Name of the buffer variable
1186    /// * `buffer_width` - Width expression (e.g., "18" for tile_size + 2*halo)
1187    /// * `idx_var` - Name of the current index variable
1188    pub fn to_cuda_index_2d(&self, buffer_name: &str, buffer_width: &str, idx_var: &str) -> String {
1189        match self {
1190            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1191            StencilIntrinsic::North => {
1192                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1193            }
1194            StencilIntrinsic::South => {
1195                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1196            }
1197            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1198            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1199            StencilIntrinsic::At => {
1200                // This should be handled specially with provided offsets
1201                format!("{}[{}]", buffer_name, idx_var)
1202            }
1203            _ => format!("{}[{}]", buffer_name, idx_var),
1204        }
1205    }
1206
1207    /// Generate CUDA index expression for 3D stencil.
1208    ///
1209    /// # Arguments
1210    /// * `buffer_name` - Name of the buffer variable
1211    /// * `buffer_width` - Width expression
1212    /// * `buffer_slice` - Slice size expression (width * height)
1213    /// * `idx_var` - Name of the current index variable
1214    pub fn to_cuda_index_3d(
1215        &self,
1216        buffer_name: &str,
1217        buffer_width: &str,
1218        buffer_slice: &str,
1219        idx_var: &str,
1220    ) -> String {
1221        match self {
1222            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1223            StencilIntrinsic::North => {
1224                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1225            }
1226            StencilIntrinsic::South => {
1227                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1228            }
1229            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1230            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1231            StencilIntrinsic::Up => {
1232                format!("{}[{} - {}]", buffer_name, idx_var, buffer_slice)
1233            }
1234            StencilIntrinsic::Down => {
1235                format!("{}[{} + {}]", buffer_name, idx_var, buffer_slice)
1236            }
1237            StencilIntrinsic::At => {
1238                // This should be handled specially with provided offsets
1239                format!("{}[{}]", buffer_name, idx_var)
1240            }
1241        }
1242    }
1243}
1244
1245#[cfg(test)]
1246mod tests {
1247    use super::*;
1248
1249    #[test]
1250    fn test_intrinsic_lookup() {
1251        let registry = IntrinsicRegistry::new();
1252
1253        assert_eq!(
1254            registry.lookup("sync_threads"),
1255            Some(&GpuIntrinsic::SyncThreads)
1256        );
1257        assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
1258        assert_eq!(registry.lookup("unknown_func"), None);
1259    }
1260
1261    #[test]
1262    fn test_intrinsic_cuda_output() {
1263        assert_eq!(
1264            GpuIntrinsic::SyncThreads.to_cuda_string(),
1265            "__syncthreads()"
1266        );
1267        assert_eq!(GpuIntrinsic::AtomicAdd.to_cuda_string(), "atomicAdd");
1268        assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
1269    }
1270
1271    #[test]
1272    fn test_stencil_intrinsic_parsing() {
1273        assert_eq!(
1274            StencilIntrinsic::from_method_name("north"),
1275            Some(StencilIntrinsic::North)
1276        );
1277        assert_eq!(
1278            StencilIntrinsic::from_method_name("idx"),
1279            Some(StencilIntrinsic::Index)
1280        );
1281        assert_eq!(StencilIntrinsic::from_method_name("unknown"), None);
1282    }
1283
1284    #[test]
1285    fn test_stencil_cuda_index() {
1286        let north = StencilIntrinsic::North;
1287        assert_eq!(
1288            north.to_cuda_index_2d("p", "buffer_width", "idx"),
1289            "p[idx - buffer_width]"
1290        );
1291
1292        let east = StencilIntrinsic::East;
1293        assert_eq!(east.to_cuda_index_2d("p", "18", "idx"), "p[idx + 1]");
1294    }
1295
1296    #[test]
1297    fn test_stencil_offset() {
1298        assert_eq!(StencilIntrinsic::North.get_offset_2d(), Some((-1, 0)));
1299        assert_eq!(StencilIntrinsic::East.get_offset_2d(), Some((0, 1)));
1300        assert_eq!(StencilIntrinsic::Index.get_offset_2d(), Some((0, 0)));
1301    }
1302
1303    #[test]
1304    fn test_ring_kernel_intrinsic_lookup() {
1305        assert_eq!(
1306            RingKernelIntrinsic::from_name("is_active"),
1307            Some(RingKernelIntrinsic::IsActive)
1308        );
1309        assert_eq!(
1310            RingKernelIntrinsic::from_name("should_terminate"),
1311            Some(RingKernelIntrinsic::ShouldTerminate)
1312        );
1313        assert_eq!(
1314            RingKernelIntrinsic::from_name("hlc_tick"),
1315            Some(RingKernelIntrinsic::HlcTick)
1316        );
1317        assert_eq!(
1318            RingKernelIntrinsic::from_name("enqueue_response"),
1319            Some(RingKernelIntrinsic::EnqueueResponse)
1320        );
1321        assert_eq!(RingKernelIntrinsic::from_name("unknown"), None);
1322    }
1323
1324    #[test]
1325    fn test_ring_kernel_intrinsic_cuda_output() {
1326        assert!(RingKernelIntrinsic::IsActive
1327            .to_cuda(&[])
1328            .contains("is_active"));
1329        assert!(RingKernelIntrinsic::ShouldTerminate
1330            .to_cuda(&[])
1331            .contains("should_terminate"));
1332        assert!(RingKernelIntrinsic::HlcTick
1333            .to_cuda(&[])
1334            .contains("hlc_logical"));
1335        assert!(RingKernelIntrinsic::InputQueueEmpty
1336            .to_cuda(&[])
1337            .contains("input_head"));
1338    }
1339
1340    #[test]
1341    fn test_ring_kernel_queue_intrinsics() {
1342        let enqueue = RingKernelIntrinsic::EnqueueResponse;
1343        let cuda = enqueue.to_cuda(&["&response".to_string()]);
1344        assert!(cuda.contains("output_head"));
1345        assert!(cuda.contains("memcpy"));
1346    }
1347
1348    #[test]
1349    fn test_k2k_intrinsics() {
1350        // Test k2k_send
1351        let send = RingKernelIntrinsic::K2kSend;
1352        let cuda = send.to_cuda(&["target_id".to_string(), "&msg".to_string()]);
1353        assert!(cuda.contains("k2k_send"));
1354        assert!(cuda.contains("k2k_routes"));
1355        assert!(cuda.contains("target_id"));
1356
1357        // Test k2k_try_recv
1358        assert_eq!(
1359            RingKernelIntrinsic::K2kTryRecv.to_cuda(&[]),
1360            "k2k_try_recv(k2k_inbox)"
1361        );
1362
1363        // Test k2k_has_message
1364        assert_eq!(
1365            RingKernelIntrinsic::K2kHasMessage.to_cuda(&[]),
1366            "k2k_has_message(k2k_inbox)"
1367        );
1368
1369        // Test k2k_peek
1370        assert_eq!(
1371            RingKernelIntrinsic::K2kPeek.to_cuda(&[]),
1372            "k2k_peek(k2k_inbox)"
1373        );
1374
1375        // Test k2k_pending_count
1376        assert_eq!(
1377            RingKernelIntrinsic::K2kPendingCount.to_cuda(&[]),
1378            "k2k_pending_count(k2k_inbox)"
1379        );
1380    }
1381
1382    #[test]
1383    fn test_k2k_intrinsic_lookup() {
1384        assert_eq!(
1385            RingKernelIntrinsic::from_name("k2k_send"),
1386            Some(RingKernelIntrinsic::K2kSend)
1387        );
1388        assert_eq!(
1389            RingKernelIntrinsic::from_name("k2k_try_recv"),
1390            Some(RingKernelIntrinsic::K2kTryRecv)
1391        );
1392        assert_eq!(
1393            RingKernelIntrinsic::from_name("k2k_has_message"),
1394            Some(RingKernelIntrinsic::K2kHasMessage)
1395        );
1396        assert_eq!(
1397            RingKernelIntrinsic::from_name("k2k_peek"),
1398            Some(RingKernelIntrinsic::K2kPeek)
1399        );
1400        assert_eq!(
1401            RingKernelIntrinsic::from_name("k2k_pending_count"),
1402            Some(RingKernelIntrinsic::K2kPendingCount)
1403        );
1404    }
1405
1406    #[test]
1407    fn test_intrinsic_requirements() {
1408        // K2K intrinsics require K2K
1409        assert!(RingKernelIntrinsic::K2kSend.requires_k2k());
1410        assert!(RingKernelIntrinsic::K2kTryRecv.requires_k2k());
1411        assert!(RingKernelIntrinsic::K2kPeek.requires_k2k());
1412        assert!(!RingKernelIntrinsic::HlcTick.requires_k2k());
1413
1414        // HLC intrinsics require HLC
1415        assert!(RingKernelIntrinsic::HlcTick.requires_hlc());
1416        assert!(RingKernelIntrinsic::HlcNow.requires_hlc());
1417        assert!(!RingKernelIntrinsic::K2kSend.requires_hlc());
1418
1419        // Control block intrinsics require control block
1420        assert!(RingKernelIntrinsic::IsActive.requires_control_block());
1421        assert!(RingKernelIntrinsic::EnqueueResponse.requires_control_block());
1422        assert!(!RingKernelIntrinsic::HlcTick.requires_control_block());
1423    }
1424
1425    // === NEW INTRINSIC TESTS ===
1426
1427    #[test]
1428    fn test_new_atomic_intrinsics() {
1429        let registry = IntrinsicRegistry::new();
1430
1431        // Test bitwise atomics
1432        assert_eq!(
1433            registry.lookup("atomic_and"),
1434            Some(&GpuIntrinsic::AtomicAnd)
1435        );
1436        assert_eq!(registry.lookup("atomic_or"), Some(&GpuIntrinsic::AtomicOr));
1437        assert_eq!(
1438            registry.lookup("atomic_xor"),
1439            Some(&GpuIntrinsic::AtomicXor)
1440        );
1441        assert_eq!(
1442            registry.lookup("atomic_inc"),
1443            Some(&GpuIntrinsic::AtomicInc)
1444        );
1445        assert_eq!(
1446            registry.lookup("atomic_dec"),
1447            Some(&GpuIntrinsic::AtomicDec)
1448        );
1449
1450        // Test CUDA output
1451        assert_eq!(GpuIntrinsic::AtomicAnd.to_cuda_string(), "atomicAnd");
1452        assert_eq!(GpuIntrinsic::AtomicOr.to_cuda_string(), "atomicOr");
1453        assert_eq!(GpuIntrinsic::AtomicXor.to_cuda_string(), "atomicXor");
1454        assert_eq!(GpuIntrinsic::AtomicInc.to_cuda_string(), "atomicInc");
1455        assert_eq!(GpuIntrinsic::AtomicDec.to_cuda_string(), "atomicDec");
1456    }
1457
1458    #[test]
1459    fn test_trigonometric_intrinsics() {
1460        let registry = IntrinsicRegistry::new();
1461
1462        // Test inverse trig
1463        assert_eq!(registry.lookup("asin"), Some(&GpuIntrinsic::Asin));
1464        assert_eq!(registry.lookup("acos"), Some(&GpuIntrinsic::Acos));
1465        assert_eq!(registry.lookup("atan"), Some(&GpuIntrinsic::Atan));
1466        assert_eq!(registry.lookup("atan2"), Some(&GpuIntrinsic::Atan2));
1467
1468        // Test CUDA output
1469        assert_eq!(GpuIntrinsic::Asin.to_cuda_string(), "asinf");
1470        assert_eq!(GpuIntrinsic::Acos.to_cuda_string(), "acosf");
1471        assert_eq!(GpuIntrinsic::Atan.to_cuda_string(), "atanf");
1472        assert_eq!(GpuIntrinsic::Atan2.to_cuda_string(), "atan2f");
1473    }
1474
1475    #[test]
1476    fn test_hyperbolic_intrinsics() {
1477        let registry = IntrinsicRegistry::new();
1478
1479        // Test hyperbolic functions
1480        assert_eq!(registry.lookup("sinh"), Some(&GpuIntrinsic::Sinh));
1481        assert_eq!(registry.lookup("cosh"), Some(&GpuIntrinsic::Cosh));
1482        assert_eq!(registry.lookup("tanh"), Some(&GpuIntrinsic::Tanh));
1483        assert_eq!(registry.lookup("asinh"), Some(&GpuIntrinsic::Asinh));
1484        assert_eq!(registry.lookup("acosh"), Some(&GpuIntrinsic::Acosh));
1485        assert_eq!(registry.lookup("atanh"), Some(&GpuIntrinsic::Atanh));
1486
1487        // Test CUDA output
1488        assert_eq!(GpuIntrinsic::Sinh.to_cuda_string(), "sinhf");
1489        assert_eq!(GpuIntrinsic::Cosh.to_cuda_string(), "coshf");
1490        assert_eq!(GpuIntrinsic::Tanh.to_cuda_string(), "tanhf");
1491    }
1492
1493    #[test]
1494    fn test_exponential_logarithmic_intrinsics() {
1495        let registry = IntrinsicRegistry::new();
1496
1497        // Test exp variants
1498        assert_eq!(registry.lookup("exp2"), Some(&GpuIntrinsic::Exp2));
1499        assert_eq!(registry.lookup("exp10"), Some(&GpuIntrinsic::Exp10));
1500        assert_eq!(registry.lookup("expm1"), Some(&GpuIntrinsic::Expm1));
1501
1502        // Test log variants
1503        assert_eq!(registry.lookup("log2"), Some(&GpuIntrinsic::Log2));
1504        assert_eq!(registry.lookup("log10"), Some(&GpuIntrinsic::Log10));
1505        assert_eq!(registry.lookup("log1p"), Some(&GpuIntrinsic::Log1p));
1506
1507        // Test CUDA output
1508        assert_eq!(GpuIntrinsic::Exp2.to_cuda_string(), "exp2f");
1509        assert_eq!(GpuIntrinsic::Log2.to_cuda_string(), "log2f");
1510        assert_eq!(GpuIntrinsic::Log10.to_cuda_string(), "log10f");
1511    }
1512
1513    #[test]
1514    fn test_classification_intrinsics() {
1515        let registry = IntrinsicRegistry::new();
1516
1517        // Test classification functions
1518        assert_eq!(registry.lookup("is_nan"), Some(&GpuIntrinsic::Isnan));
1519        assert_eq!(registry.lookup("isnan"), Some(&GpuIntrinsic::Isnan));
1520        assert_eq!(registry.lookup("is_infinite"), Some(&GpuIntrinsic::Isinf));
1521        assert_eq!(registry.lookup("is_finite"), Some(&GpuIntrinsic::Isfinite));
1522        assert_eq!(registry.lookup("is_normal"), Some(&GpuIntrinsic::Isnormal));
1523        assert_eq!(registry.lookup("signbit"), Some(&GpuIntrinsic::Signbit));
1524
1525        // Test CUDA output
1526        assert_eq!(GpuIntrinsic::Isnan.to_cuda_string(), "isnan");
1527        assert_eq!(GpuIntrinsic::Isinf.to_cuda_string(), "isinf");
1528        assert_eq!(GpuIntrinsic::Isfinite.to_cuda_string(), "isfinite");
1529    }
1530
1531    #[test]
1532    fn test_warp_reduce_intrinsics() {
1533        let registry = IntrinsicRegistry::new();
1534
1535        // Test warp reduce operations
1536        assert_eq!(
1537            registry.lookup("warp_reduce_add"),
1538            Some(&GpuIntrinsic::WarpReduceAdd)
1539        );
1540        assert_eq!(
1541            registry.lookup("warp_reduce_min"),
1542            Some(&GpuIntrinsic::WarpReduceMin)
1543        );
1544        assert_eq!(
1545            registry.lookup("warp_reduce_max"),
1546            Some(&GpuIntrinsic::WarpReduceMax)
1547        );
1548        assert_eq!(
1549            registry.lookup("warp_reduce_and"),
1550            Some(&GpuIntrinsic::WarpReduceAnd)
1551        );
1552        assert_eq!(
1553            registry.lookup("warp_reduce_or"),
1554            Some(&GpuIntrinsic::WarpReduceOr)
1555        );
1556        assert_eq!(
1557            registry.lookup("warp_reduce_xor"),
1558            Some(&GpuIntrinsic::WarpReduceXor)
1559        );
1560
1561        // Test CUDA output
1562        assert_eq!(
1563            GpuIntrinsic::WarpReduceAdd.to_cuda_string(),
1564            "__reduce_add_sync"
1565        );
1566        assert_eq!(
1567            GpuIntrinsic::WarpReduceMin.to_cuda_string(),
1568            "__reduce_min_sync"
1569        );
1570        assert_eq!(
1571            GpuIntrinsic::WarpReduceMax.to_cuda_string(),
1572            "__reduce_max_sync"
1573        );
1574    }
1575
1576    #[test]
1577    fn test_warp_match_intrinsics() {
1578        let registry = IntrinsicRegistry::new();
1579
1580        assert_eq!(
1581            registry.lookup("warp_match_any"),
1582            Some(&GpuIntrinsic::WarpMatchAny)
1583        );
1584        assert_eq!(
1585            registry.lookup("warp_match_all"),
1586            Some(&GpuIntrinsic::WarpMatchAll)
1587        );
1588
1589        assert_eq!(
1590            GpuIntrinsic::WarpMatchAny.to_cuda_string(),
1591            "__match_any_sync"
1592        );
1593        assert_eq!(
1594            GpuIntrinsic::WarpMatchAll.to_cuda_string(),
1595            "__match_all_sync"
1596        );
1597    }
1598
1599    #[test]
1600    fn test_bit_manipulation_intrinsics() {
1601        let registry = IntrinsicRegistry::new();
1602
1603        // Test bit manipulation
1604        assert_eq!(registry.lookup("popc"), Some(&GpuIntrinsic::Popc));
1605        assert_eq!(registry.lookup("popcount"), Some(&GpuIntrinsic::Popc));
1606        assert_eq!(registry.lookup("count_ones"), Some(&GpuIntrinsic::Popc));
1607        assert_eq!(registry.lookup("clz"), Some(&GpuIntrinsic::Clz));
1608        assert_eq!(registry.lookup("leading_zeros"), Some(&GpuIntrinsic::Clz));
1609        assert_eq!(registry.lookup("ctz"), Some(&GpuIntrinsic::Ctz));
1610        assert_eq!(registry.lookup("ffs"), Some(&GpuIntrinsic::Ffs));
1611        assert_eq!(registry.lookup("brev"), Some(&GpuIntrinsic::Brev));
1612        assert_eq!(registry.lookup("reverse_bits"), Some(&GpuIntrinsic::Brev));
1613
1614        // Test CUDA output
1615        assert_eq!(GpuIntrinsic::Popc.to_cuda_string(), "__popc");
1616        assert_eq!(GpuIntrinsic::Clz.to_cuda_string(), "__clz");
1617        assert_eq!(GpuIntrinsic::Ffs.to_cuda_string(), "__ffs");
1618        assert_eq!(GpuIntrinsic::Brev.to_cuda_string(), "__brev");
1619    }
1620
1621    #[test]
1622    fn test_funnel_shift_intrinsics() {
1623        let registry = IntrinsicRegistry::new();
1624
1625        assert_eq!(
1626            registry.lookup("funnel_shift_left"),
1627            Some(&GpuIntrinsic::FunnelShiftLeft)
1628        );
1629        assert_eq!(
1630            registry.lookup("funnel_shift_right"),
1631            Some(&GpuIntrinsic::FunnelShiftRight)
1632        );
1633
1634        assert_eq!(
1635            GpuIntrinsic::FunnelShiftLeft.to_cuda_string(),
1636            "__funnelshift_l"
1637        );
1638        assert_eq!(
1639            GpuIntrinsic::FunnelShiftRight.to_cuda_string(),
1640            "__funnelshift_r"
1641        );
1642    }
1643
1644    #[test]
1645    fn test_memory_intrinsics() {
1646        let registry = IntrinsicRegistry::new();
1647
1648        assert_eq!(registry.lookup("ldg"), Some(&GpuIntrinsic::Ldg));
1649        assert_eq!(registry.lookup("load_global"), Some(&GpuIntrinsic::Ldg));
1650        assert_eq!(
1651            registry.lookup("prefetch_l1"),
1652            Some(&GpuIntrinsic::PrefetchL1)
1653        );
1654        assert_eq!(
1655            registry.lookup("prefetch_l2"),
1656            Some(&GpuIntrinsic::PrefetchL2)
1657        );
1658
1659        assert_eq!(GpuIntrinsic::Ldg.to_cuda_string(), "__ldg");
1660        assert_eq!(GpuIntrinsic::PrefetchL1.to_cuda_string(), "__prefetch_l1");
1661        assert_eq!(GpuIntrinsic::PrefetchL2.to_cuda_string(), "__prefetch_l2");
1662    }
1663
1664    #[test]
1665    fn test_clock_intrinsics() {
1666        let registry = IntrinsicRegistry::new();
1667
1668        assert_eq!(registry.lookup("clock"), Some(&GpuIntrinsic::Clock));
1669        assert_eq!(registry.lookup("clock64"), Some(&GpuIntrinsic::Clock64));
1670        assert_eq!(registry.lookup("nanosleep"), Some(&GpuIntrinsic::Nanosleep));
1671
1672        assert_eq!(GpuIntrinsic::Clock.to_cuda_string(), "clock()");
1673        assert_eq!(GpuIntrinsic::Clock64.to_cuda_string(), "clock64()");
1674        assert_eq!(GpuIntrinsic::Nanosleep.to_cuda_string(), "__nanosleep");
1675    }
1676
1677    #[test]
1678    fn test_special_function_intrinsics() {
1679        let registry = IntrinsicRegistry::new();
1680
1681        assert_eq!(registry.lookup("rcp"), Some(&GpuIntrinsic::Rcp));
1682        assert_eq!(registry.lookup("recip"), Some(&GpuIntrinsic::Rcp));
1683        assert_eq!(registry.lookup("saturate"), Some(&GpuIntrinsic::Saturate));
1684        assert_eq!(registry.lookup("clamp_01"), Some(&GpuIntrinsic::Saturate));
1685
1686        assert_eq!(GpuIntrinsic::Rcp.to_cuda_string(), "__frcp_rn");
1687        assert_eq!(GpuIntrinsic::Saturate.to_cuda_string(), "__saturatef");
1688    }
1689
1690    #[test]
1691    fn test_intrinsic_categories() {
1692        // Test category assignment
1693        assert_eq!(GpuIntrinsic::SyncThreads.category(), "synchronization");
1694        assert_eq!(GpuIntrinsic::AtomicAdd.category(), "atomic");
1695        assert_eq!(GpuIntrinsic::Sqrt.category(), "math");
1696        assert_eq!(GpuIntrinsic::Sin.category(), "trigonometric");
1697        assert_eq!(GpuIntrinsic::Sinh.category(), "hyperbolic");
1698        assert_eq!(GpuIntrinsic::Exp.category(), "exponential");
1699        assert_eq!(GpuIntrinsic::Isnan.category(), "classification");
1700        assert_eq!(GpuIntrinsic::WarpShfl.category(), "warp");
1701        assert_eq!(GpuIntrinsic::Popc.category(), "bit");
1702        assert_eq!(GpuIntrinsic::Ldg.category(), "memory");
1703        assert_eq!(GpuIntrinsic::Rcp.category(), "special");
1704        assert_eq!(GpuIntrinsic::ThreadIdxX.category(), "index");
1705        assert_eq!(GpuIntrinsic::Clock.category(), "timing");
1706    }
1707
1708    #[test]
1709    fn test_intrinsic_flags() {
1710        // Test is_value_intrinsic
1711        assert!(GpuIntrinsic::ThreadIdxX.is_value_intrinsic());
1712        assert!(GpuIntrinsic::BlockDimX.is_value_intrinsic());
1713        assert!(GpuIntrinsic::WarpSize.is_value_intrinsic());
1714        assert!(!GpuIntrinsic::Sin.is_value_intrinsic());
1715        assert!(!GpuIntrinsic::AtomicAdd.is_value_intrinsic());
1716
1717        // Test is_zero_arg_function
1718        assert!(GpuIntrinsic::SyncThreads.is_zero_arg_function());
1719        assert!(GpuIntrinsic::ThreadFence.is_zero_arg_function());
1720        assert!(GpuIntrinsic::WarpActiveMask.is_zero_arg_function());
1721        assert!(GpuIntrinsic::Clock.is_zero_arg_function());
1722        assert!(!GpuIntrinsic::Sin.is_zero_arg_function());
1723
1724        // Test requires_mask
1725        assert!(GpuIntrinsic::WarpShfl.requires_mask());
1726        assert!(GpuIntrinsic::WarpBallot.requires_mask());
1727        assert!(GpuIntrinsic::WarpReduceAdd.requires_mask());
1728        assert!(!GpuIntrinsic::Sin.requires_mask());
1729        assert!(!GpuIntrinsic::AtomicAdd.requires_mask());
1730    }
1731
1732    #[test]
1733    fn test_3d_stencil_intrinsics() {
1734        assert_eq!(
1735            StencilIntrinsic::from_method_name("up"),
1736            Some(StencilIntrinsic::Up)
1737        );
1738        assert_eq!(
1739            StencilIntrinsic::from_method_name("down"),
1740            Some(StencilIntrinsic::Down)
1741        );
1742
1743        // Test 3D only flag
1744        assert!(StencilIntrinsic::Up.is_3d_only());
1745        assert!(StencilIntrinsic::Down.is_3d_only());
1746        assert!(!StencilIntrinsic::North.is_3d_only());
1747        assert!(!StencilIntrinsic::East.is_3d_only());
1748        assert!(!StencilIntrinsic::Index.is_3d_only());
1749
1750        // Test 3D offsets
1751        assert_eq!(StencilIntrinsic::Up.get_offset_3d(), Some((-1, 0, 0)));
1752        assert_eq!(StencilIntrinsic::Down.get_offset_3d(), Some((1, 0, 0)));
1753        assert_eq!(StencilIntrinsic::North.get_offset_3d(), Some((0, -1, 0)));
1754        assert_eq!(StencilIntrinsic::South.get_offset_3d(), Some((0, 1, 0)));
1755        assert_eq!(StencilIntrinsic::East.get_offset_3d(), Some((0, 0, 1)));
1756        assert_eq!(StencilIntrinsic::West.get_offset_3d(), Some((0, 0, -1)));
1757
1758        // Test 3D index generation
1759        let up = StencilIntrinsic::Up;
1760        assert_eq!(up.to_cuda_index_3d("p", "18", "324", "idx"), "p[idx - 324]");
1761
1762        let down = StencilIntrinsic::Down;
1763        assert_eq!(
1764            down.to_cuda_index_3d("p", "18", "324", "idx"),
1765            "p[idx + 324]"
1766        );
1767    }
1768
1769    #[test]
1770    fn test_sync_intrinsics() {
1771        let registry = IntrinsicRegistry::new();
1772
1773        assert_eq!(
1774            registry.lookup("sync_threads_count"),
1775            Some(&GpuIntrinsic::SyncThreadsCount)
1776        );
1777        assert_eq!(
1778            registry.lookup("sync_threads_and"),
1779            Some(&GpuIntrinsic::SyncThreadsAnd)
1780        );
1781        assert_eq!(
1782            registry.lookup("sync_threads_or"),
1783            Some(&GpuIntrinsic::SyncThreadsOr)
1784        );
1785
1786        assert_eq!(
1787            GpuIntrinsic::SyncThreadsCount.to_cuda_string(),
1788            "__syncthreads_count"
1789        );
1790        assert_eq!(
1791            GpuIntrinsic::SyncThreadsAnd.to_cuda_string(),
1792            "__syncthreads_and"
1793        );
1794        assert_eq!(
1795            GpuIntrinsic::SyncThreadsOr.to_cuda_string(),
1796            "__syncthreads_or"
1797        );
1798    }
1799
1800    #[test]
1801    fn test_math_extras() {
1802        let registry = IntrinsicRegistry::new();
1803
1804        assert_eq!(registry.lookup("trunc"), Some(&GpuIntrinsic::Trunc));
1805        assert_eq!(registry.lookup("cbrt"), Some(&GpuIntrinsic::Cbrt));
1806        assert_eq!(registry.lookup("hypot"), Some(&GpuIntrinsic::Hypot));
1807        assert_eq!(registry.lookup("copysign"), Some(&GpuIntrinsic::Copysign));
1808        assert_eq!(registry.lookup("fmod"), Some(&GpuIntrinsic::Fmod));
1809
1810        assert_eq!(GpuIntrinsic::Trunc.to_cuda_string(), "truncf");
1811        assert_eq!(GpuIntrinsic::Cbrt.to_cuda_string(), "cbrtf");
1812        assert_eq!(GpuIntrinsic::Hypot.to_cuda_string(), "hypotf");
1813    }
1814}