ringkernel_cuda_codegen/
intrinsics.rs

1//! GPU intrinsic mapping for CUDA code generation.
2//!
3//! This module provides mappings from high-level Rust operations to
4//! CUDA intrinsics and built-in functions.
5
6use std::collections::HashMap;
7
/// GPU intrinsic operations.
///
/// Each variant names a CUDA built-in, device math function, or a
/// codegen-expanded pseudo-intrinsic (the block/grid reduction variants).
/// The enum is fieldless, so it is `Copy` and usable as a `HashSet`/`HashMap`
/// key via the `Eq`/`Hash` derives.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum GpuIntrinsic {
    // === Synchronization ===
    /// Thread synchronization within a block.
    SyncThreads,
    /// Thread fence (memory ordering across device).
    ThreadFence,
    /// Thread fence within block.
    ThreadFenceBlock,
    /// Thread fence across system.
    ThreadFenceSystem,
    /// Synchronize threads with predicate.
    SyncThreadsCount,
    /// Synchronize threads with AND predicate.
    SyncThreadsAnd,
    /// Synchronize threads with OR predicate.
    SyncThreadsOr,

    // === Atomic Operations (Integer) ===
    /// Atomic add.
    AtomicAdd,
    /// Atomic subtract.
    AtomicSub,
    /// Atomic minimum.
    AtomicMin,
    /// Atomic maximum.
    AtomicMax,
    /// Atomic exchange.
    AtomicExch,
    /// Atomic compare-and-swap.
    AtomicCas,
    /// Atomic bitwise AND.
    AtomicAnd,
    /// Atomic bitwise OR.
    AtomicOr,
    /// Atomic bitwise XOR.
    AtomicXor,
    /// Atomic increment (with wrap).
    AtomicInc,
    /// Atomic decrement (with wrap).
    AtomicDec,

    // === Basic Math Functions ===
    /// Square root.
    Sqrt,
    /// Reciprocal square root.
    Rsqrt,
    /// Absolute value (integer).
    Abs,
    /// Absolute value (floating point).
    Fabs,
    /// Floor.
    Floor,
    /// Ceiling.
    Ceil,
    /// Round to nearest.
    Round,
    /// Truncate toward zero.
    Trunc,
    /// Fused multiply-add.
    Fma,
    /// Minimum.
    Min,
    /// Maximum.
    Max,
    /// Floating-point modulo.
    Fmod,
    /// Remainder.
    Remainder,
    /// Copy sign.
    Copysign,
    /// Cube root.
    Cbrt,
    /// Hypotenuse.
    Hypot,

    // === Trigonometric Functions ===
    /// Sine.
    Sin,
    /// Cosine.
    Cos,
    /// Tangent.
    Tan,
    /// Arcsine.
    Asin,
    /// Arccosine.
    Acos,
    /// Arctangent.
    Atan,
    /// Two-argument arctangent.
    Atan2,
    /// Sine and cosine (combined).
    Sincos,
    /// Sine of pi*x.
    Sinpi,
    /// Cosine of pi*x.
    Cospi,

    // === Hyperbolic Functions ===
    /// Hyperbolic sine.
    Sinh,
    /// Hyperbolic cosine.
    Cosh,
    /// Hyperbolic tangent.
    Tanh,
    /// Inverse hyperbolic sine.
    Asinh,
    /// Inverse hyperbolic cosine.
    Acosh,
    /// Inverse hyperbolic tangent.
    Atanh,

    // === Exponential and Logarithmic Functions ===
    /// Exponential (base e).
    Exp,
    /// Exponential (base 2).
    Exp2,
    /// Exponential (base 10).
    Exp10,
    /// exp(x) - 1 (accurate for small x).
    Expm1,
    /// Natural logarithm (base e).
    Log,
    /// Logarithm (base 2).
    Log2,
    /// Logarithm (base 10).
    Log10,
    /// log(1 + x) (accurate for small x).
    Log1p,
    /// Power.
    Pow,
    /// Load exponent.
    Ldexp,
    /// Scale by power of 2.
    Scalbn,
    /// Extract exponent.
    Ilogb,
    /// Logarithm of gamma function.
    Lgamma,
    /// Gamma function.
    Tgamma,
    /// Error function.
    Erf,
    /// Complementary error function.
    Erfc,
    /// Inverse error function.
    Erfinv,
    /// Inverse complementary error function.
    Erfcinv,

    // === Classification and Comparison ===
    /// Check if NaN.
    Isnan,
    /// Check if infinite.
    Isinf,
    /// Check if finite.
    Isfinite,
    /// Check if normal.
    Isnormal,
    /// Check sign bit.
    Signbit,
    /// Next representable value.
    Nextafter,
    /// Floating-point difference.
    Fdim,
    /// Not-a-Number.
    Nan,

    // === Warp-Level Operations ===
    /// Warp shuffle.
    WarpShfl,
    /// Warp shuffle up.
    WarpShflUp,
    /// Warp shuffle down.
    WarpShflDown,
    /// Warp shuffle XOR.
    WarpShflXor,
    /// Get active thread mask.
    WarpActiveMask,
    /// Warp ballot.
    WarpBallot,
    /// Warp all predicate.
    WarpAll,
    /// Warp any predicate.
    WarpAny,
    /// Warp match any.
    WarpMatchAny,
    /// Warp match all.
    WarpMatchAll,
    /// Warp reduce add.
    WarpReduceAdd,
    /// Warp reduce min.
    WarpReduceMin,
    /// Warp reduce max.
    WarpReduceMax,
    /// Warp reduce AND.
    WarpReduceAnd,
    /// Warp reduce OR.
    WarpReduceOr,
    /// Warp reduce XOR.
    WarpReduceXor,

    // === Bit Manipulation ===
    /// Population count (count set bits).
    Popc,
    /// Count leading zeros.
    Clz,
    /// Count trailing zeros (via ffs).
    Ctz,
    /// Find first set bit.
    Ffs,
    /// Bit reverse.
    Brev,
    /// Byte permute.
    BytePerm,
    /// Funnel shift left.
    FunnelShiftLeft,
    /// Funnel shift right.
    FunnelShiftRight,

    // === Memory Operations ===
    /// Read-only cache load.
    Ldg,
    /// Prefetch L1.
    PrefetchL1,
    /// Prefetch L2.
    PrefetchL2,

    // === Special Functions ===
    /// Reciprocal.
    Rcp,
    /// Division (fast).
    Fdividef,
    /// Saturate to [0,1].
    Saturate,
    /// Bessel J0.
    J0,
    /// Bessel J1.
    J1,
    /// Bessel Jn.
    Jn,
    /// Bessel Y0.
    Y0,
    /// Bessel Y1.
    Y1,
    /// Bessel Yn.
    Yn,
    /// Normal CDF.
    Normcdf,
    /// Inverse normal CDF.
    Normcdfinv,
    /// Cylindrical Bessel I0.
    CylBesselI0,
    /// Cylindrical Bessel I1.
    CylBesselI1,

    // === CUDA Thread/Block Indices ===
    /// Thread index X.
    ThreadIdxX,
    /// Thread index Y.
    ThreadIdxY,
    /// Thread index Z.
    ThreadIdxZ,
    /// Block index X.
    BlockIdxX,
    /// Block index Y.
    BlockIdxY,
    /// Block index Z.
    BlockIdxZ,
    /// Block dimension X.
    BlockDimX,
    /// Block dimension Y.
    BlockDimY,
    /// Block dimension Z.
    BlockDimZ,
    /// Grid dimension X.
    GridDimX,
    /// Grid dimension Y.
    GridDimY,
    /// Grid dimension Z.
    GridDimZ,
    /// Warp size (always 32).
    WarpSize,

    // === Clock and Timing ===
    /// Read clock counter.
    Clock,
    /// Read 64-bit clock counter.
    Clock64,
    /// Nanosleep.
    Nanosleep,

    // === Block-Level Reductions ===
    /// Block-level sum reduction using shared memory.
    BlockReduceSum,
    /// Block-level minimum reduction using shared memory.
    BlockReduceMin,
    /// Block-level maximum reduction using shared memory.
    BlockReduceMax,
    /// Block-level AND reduction using shared memory.
    BlockReduceAnd,
    /// Block-level OR reduction using shared memory.
    BlockReduceOr,

    // === Grid-Level Reductions ===
    /// Grid-level sum reduction with atomic accumulation.
    GridReduceSum,
    /// Grid-level minimum reduction with atomic accumulation.
    GridReduceMin,
    /// Grid-level maximum reduction with atomic accumulation.
    GridReduceMax,

    // === Reduce and Broadcast ===
    /// Reduce-and-broadcast: all threads get the global reduction result.
    /// This combines block reduction, atomic accumulation, grid sync, and broadcast.
    ReduceAndBroadcast,
}
326
impl GpuIntrinsic {
    /// Convert to CUDA function/intrinsic name.
    ///
    /// Math functions map to the single-precision (`f32`) CUDA variants
    /// (`sqrtf`, `sinf`, ...); emitting double-precision code would need the
    /// unsuffixed names instead.
    ///
    /// Zero-argument intrinsics (see [`GpuIntrinsic::is_zero_arg_function`])
    /// include a trailing `()` so the returned string is already a complete
    /// call expression; all other names are bare identifiers the caller must
    /// apply arguments to. Thread/block index variants return member-access
    /// expressions (e.g. `threadIdx.x`), not function names.
    pub fn to_cuda_string(&self) -> &'static str {
        match self {
            // Synchronization
            GpuIntrinsic::SyncThreads => "__syncthreads()",
            GpuIntrinsic::ThreadFence => "__threadfence()",
            GpuIntrinsic::ThreadFenceBlock => "__threadfence_block()",
            GpuIntrinsic::ThreadFenceSystem => "__threadfence_system()",
            GpuIntrinsic::SyncThreadsCount => "__syncthreads_count",
            GpuIntrinsic::SyncThreadsAnd => "__syncthreads_and",
            GpuIntrinsic::SyncThreadsOr => "__syncthreads_or",

            // Atomic operations
            GpuIntrinsic::AtomicAdd => "atomicAdd",
            GpuIntrinsic::AtomicSub => "atomicSub",
            GpuIntrinsic::AtomicMin => "atomicMin",
            GpuIntrinsic::AtomicMax => "atomicMax",
            GpuIntrinsic::AtomicExch => "atomicExch",
            GpuIntrinsic::AtomicCas => "atomicCAS",
            GpuIntrinsic::AtomicAnd => "atomicAnd",
            GpuIntrinsic::AtomicOr => "atomicOr",
            GpuIntrinsic::AtomicXor => "atomicXor",
            GpuIntrinsic::AtomicInc => "atomicInc",
            GpuIntrinsic::AtomicDec => "atomicDec",

            // Basic math (single-precision variants)
            GpuIntrinsic::Sqrt => "sqrtf",
            GpuIntrinsic::Rsqrt => "rsqrtf",
            GpuIntrinsic::Abs => "abs",
            GpuIntrinsic::Fabs => "fabsf",
            GpuIntrinsic::Floor => "floorf",
            GpuIntrinsic::Ceil => "ceilf",
            GpuIntrinsic::Round => "roundf",
            GpuIntrinsic::Trunc => "truncf",
            GpuIntrinsic::Fma => "fmaf",
            GpuIntrinsic::Min => "fminf",
            GpuIntrinsic::Max => "fmaxf",
            GpuIntrinsic::Fmod => "fmodf",
            GpuIntrinsic::Remainder => "remainderf",
            GpuIntrinsic::Copysign => "copysignf",
            GpuIntrinsic::Cbrt => "cbrtf",
            GpuIntrinsic::Hypot => "hypotf",

            // Trigonometric
            GpuIntrinsic::Sin => "sinf",
            GpuIntrinsic::Cos => "cosf",
            GpuIntrinsic::Tan => "tanf",
            GpuIntrinsic::Asin => "asinf",
            GpuIntrinsic::Acos => "acosf",
            GpuIntrinsic::Atan => "atanf",
            GpuIntrinsic::Atan2 => "atan2f",
            GpuIntrinsic::Sincos => "sincosf",
            GpuIntrinsic::Sinpi => "sinpif",
            GpuIntrinsic::Cospi => "cospif",

            // Hyperbolic
            GpuIntrinsic::Sinh => "sinhf",
            GpuIntrinsic::Cosh => "coshf",
            GpuIntrinsic::Tanh => "tanhf",
            GpuIntrinsic::Asinh => "asinhf",
            GpuIntrinsic::Acosh => "acoshf",
            GpuIntrinsic::Atanh => "atanhf",

            // Exponential and logarithmic
            GpuIntrinsic::Exp => "expf",
            GpuIntrinsic::Exp2 => "exp2f",
            GpuIntrinsic::Exp10 => "exp10f",
            GpuIntrinsic::Expm1 => "expm1f",
            GpuIntrinsic::Log => "logf",
            GpuIntrinsic::Log2 => "log2f",
            GpuIntrinsic::Log10 => "log10f",
            GpuIntrinsic::Log1p => "log1pf",
            GpuIntrinsic::Pow => "powf",
            GpuIntrinsic::Ldexp => "ldexpf",
            GpuIntrinsic::Scalbn => "scalbnf",
            GpuIntrinsic::Ilogb => "ilogbf",
            GpuIntrinsic::Lgamma => "lgammaf",
            GpuIntrinsic::Tgamma => "tgammaf",
            GpuIntrinsic::Erf => "erff",
            GpuIntrinsic::Erfc => "erfcf",
            GpuIntrinsic::Erfinv => "erfinvf",
            GpuIntrinsic::Erfcinv => "erfcinvf",

            // Classification and comparison (macro-like names, no f suffix)
            GpuIntrinsic::Isnan => "isnan",
            GpuIntrinsic::Isinf => "isinf",
            GpuIntrinsic::Isfinite => "isfinite",
            GpuIntrinsic::Isnormal => "isnormal",
            GpuIntrinsic::Signbit => "signbit",
            GpuIntrinsic::Nextafter => "nextafterf",
            GpuIntrinsic::Fdim => "fdimf",
            GpuIntrinsic::Nan => "nanf",

            // Warp-level operations (all *_sync forms take a lane mask first).
            // NOTE(review): the __reduce_*_sync intrinsics are only available
            // on newer GPU architectures — presumably gated by the build's
            // target compute capability; confirm where that check happens.
            GpuIntrinsic::WarpShfl => "__shfl_sync",
            GpuIntrinsic::WarpShflUp => "__shfl_up_sync",
            GpuIntrinsic::WarpShflDown => "__shfl_down_sync",
            GpuIntrinsic::WarpShflXor => "__shfl_xor_sync",
            GpuIntrinsic::WarpActiveMask => "__activemask()",
            GpuIntrinsic::WarpBallot => "__ballot_sync",
            GpuIntrinsic::WarpAll => "__all_sync",
            GpuIntrinsic::WarpAny => "__any_sync",
            GpuIntrinsic::WarpMatchAny => "__match_any_sync",
            GpuIntrinsic::WarpMatchAll => "__match_all_sync",
            GpuIntrinsic::WarpReduceAdd => "__reduce_add_sync",
            GpuIntrinsic::WarpReduceMin => "__reduce_min_sync",
            GpuIntrinsic::WarpReduceMax => "__reduce_max_sync",
            GpuIntrinsic::WarpReduceAnd => "__reduce_and_sync",
            GpuIntrinsic::WarpReduceOr => "__reduce_or_sync",
            GpuIntrinsic::WarpReduceXor => "__reduce_xor_sync",

            // Bit manipulation
            GpuIntrinsic::Popc => "__popc",
            GpuIntrinsic::Clz => "__clz",
            // ffs returns 1 + ctz for nonzero input (and 0 for input 0), so
            // generated code must subtract 1 if true ctz semantics are needed.
            GpuIntrinsic::Ctz => "__ffs", // ffs returns 1 + ctz, but commonly used
            GpuIntrinsic::Ffs => "__ffs",
            GpuIntrinsic::Brev => "__brev",
            GpuIntrinsic::BytePerm => "__byte_perm",
            GpuIntrinsic::FunnelShiftLeft => "__funnelshift_l",
            GpuIntrinsic::FunnelShiftRight => "__funnelshift_r",

            // Memory operations.
            // NOTE(review): __prefetch_l1/__prefetch_l2 are not standard CUDA C
            // intrinsics — presumably project-provided helpers emitted into the
            // kernel preamble; verify against the codegen prelude.
            GpuIntrinsic::Ldg => "__ldg",
            GpuIntrinsic::PrefetchL1 => "__prefetch_l1",
            GpuIntrinsic::PrefetchL2 => "__prefetch_l2",

            // Special functions
            GpuIntrinsic::Rcp => "__frcp_rn",
            GpuIntrinsic::Fdividef => "__fdividef",
            GpuIntrinsic::Saturate => "__saturatef",
            GpuIntrinsic::J0 => "j0f",
            GpuIntrinsic::J1 => "j1f",
            GpuIntrinsic::Jn => "jnf",
            GpuIntrinsic::Y0 => "y0f",
            GpuIntrinsic::Y1 => "y1f",
            GpuIntrinsic::Yn => "ynf",
            GpuIntrinsic::Normcdf => "normcdff",
            GpuIntrinsic::Normcdfinv => "normcdfinvf",
            GpuIntrinsic::CylBesselI0 => "cyl_bessel_i0f",
            GpuIntrinsic::CylBesselI1 => "cyl_bessel_i1f",

            // Thread/block indices (member-access expressions, not calls)
            GpuIntrinsic::ThreadIdxX => "threadIdx.x",
            GpuIntrinsic::ThreadIdxY => "threadIdx.y",
            GpuIntrinsic::ThreadIdxZ => "threadIdx.z",
            GpuIntrinsic::BlockIdxX => "blockIdx.x",
            GpuIntrinsic::BlockIdxY => "blockIdx.y",
            GpuIntrinsic::BlockIdxZ => "blockIdx.z",
            GpuIntrinsic::BlockDimX => "blockDim.x",
            GpuIntrinsic::BlockDimY => "blockDim.y",
            GpuIntrinsic::BlockDimZ => "blockDim.z",
            GpuIntrinsic::GridDimX => "gridDim.x",
            GpuIntrinsic::GridDimY => "gridDim.y",
            GpuIntrinsic::GridDimZ => "gridDim.z",
            GpuIntrinsic::WarpSize => "warpSize",

            // Clock and timing
            GpuIntrinsic::Clock => "clock()",
            GpuIntrinsic::Clock64 => "clock64()",
            GpuIntrinsic::Nanosleep => "__nanosleep",

            // Block-level reductions (require code generation, not simple function calls)
            GpuIntrinsic::BlockReduceSum => "__block_reduce_sum",
            GpuIntrinsic::BlockReduceMin => "__block_reduce_min",
            GpuIntrinsic::BlockReduceMax => "__block_reduce_max",
            GpuIntrinsic::BlockReduceAnd => "__block_reduce_and",
            GpuIntrinsic::BlockReduceOr => "__block_reduce_or",

            // Grid-level reductions (require code generation with atomics)
            GpuIntrinsic::GridReduceSum => "__grid_reduce_sum",
            GpuIntrinsic::GridReduceMin => "__grid_reduce_min",
            GpuIntrinsic::GridReduceMax => "__grid_reduce_max",

            // Reduce-and-broadcast (requires grid sync)
            GpuIntrinsic::ReduceAndBroadcast => "__reduce_and_broadcast",
        }
    }

    /// Check if this intrinsic is a value (no parentheses needed).
    ///
    /// True for the built-in index/dimension variables (`threadIdx.x`, ...,
    /// `warpSize`): these are emitted as plain expressions, never as calls.
    pub fn is_value_intrinsic(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::ThreadIdxX
                | GpuIntrinsic::ThreadIdxY
                | GpuIntrinsic::ThreadIdxZ
                | GpuIntrinsic::BlockIdxX
                | GpuIntrinsic::BlockIdxY
                | GpuIntrinsic::BlockIdxZ
                | GpuIntrinsic::BlockDimX
                | GpuIntrinsic::BlockDimY
                | GpuIntrinsic::BlockDimZ
                | GpuIntrinsic::GridDimX
                | GpuIntrinsic::GridDimY
                | GpuIntrinsic::GridDimZ
                | GpuIntrinsic::WarpSize
        )
    }

    /// Check if this intrinsic is a zero-argument function (ends with ()).
    ///
    /// These are exactly the variants whose [`GpuIntrinsic::to_cuda_string`]
    /// mapping already includes the trailing `()` — keep this list in sync
    /// with that table when adding variants.
    pub fn is_zero_arg_function(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::SyncThreads
                | GpuIntrinsic::ThreadFence
                | GpuIntrinsic::ThreadFenceBlock
                | GpuIntrinsic::ThreadFenceSystem
                | GpuIntrinsic::WarpActiveMask
                | GpuIntrinsic::Clock
                | GpuIntrinsic::Clock64
        )
    }

    /// Check if this intrinsic requires a mask argument (warp operations).
    ///
    /// All `*_sync` warp intrinsics take a participating-lane mask as their
    /// first argument; codegen must supply one (e.g. from `__activemask()`).
    pub fn requires_mask(&self) -> bool {
        matches!(
            self,
            GpuIntrinsic::WarpShfl
                | GpuIntrinsic::WarpShflUp
                | GpuIntrinsic::WarpShflDown
                | GpuIntrinsic::WarpShflXor
                | GpuIntrinsic::WarpBallot
                | GpuIntrinsic::WarpAll
                | GpuIntrinsic::WarpAny
                | GpuIntrinsic::WarpMatchAny
                | GpuIntrinsic::WarpMatchAll
                | GpuIntrinsic::WarpReduceAdd
                | GpuIntrinsic::WarpReduceMin
                | GpuIntrinsic::WarpReduceMax
                | GpuIntrinsic::WarpReduceAnd
                | GpuIntrinsic::WarpReduceOr
                | GpuIntrinsic::WarpReduceXor
        )
    }

    /// Get the category of this intrinsic for documentation purposes.
    ///
    /// Returns a short lowercase tag (e.g. `"atomic"`, `"warp"`, `"math"`)
    /// mirroring the section groupings in the enum declaration.
    pub fn category(&self) -> &'static str {
        match self {
            GpuIntrinsic::SyncThreads
            | GpuIntrinsic::ThreadFence
            | GpuIntrinsic::ThreadFenceBlock
            | GpuIntrinsic::ThreadFenceSystem
            | GpuIntrinsic::SyncThreadsCount
            | GpuIntrinsic::SyncThreadsAnd
            | GpuIntrinsic::SyncThreadsOr => "synchronization",

            GpuIntrinsic::AtomicAdd
            | GpuIntrinsic::AtomicSub
            | GpuIntrinsic::AtomicMin
            | GpuIntrinsic::AtomicMax
            | GpuIntrinsic::AtomicExch
            | GpuIntrinsic::AtomicCas
            | GpuIntrinsic::AtomicAnd
            | GpuIntrinsic::AtomicOr
            | GpuIntrinsic::AtomicXor
            | GpuIntrinsic::AtomicInc
            | GpuIntrinsic::AtomicDec => "atomic",

            GpuIntrinsic::Sqrt
            | GpuIntrinsic::Rsqrt
            | GpuIntrinsic::Abs
            | GpuIntrinsic::Fabs
            | GpuIntrinsic::Floor
            | GpuIntrinsic::Ceil
            | GpuIntrinsic::Round
            | GpuIntrinsic::Trunc
            | GpuIntrinsic::Fma
            | GpuIntrinsic::Min
            | GpuIntrinsic::Max
            | GpuIntrinsic::Fmod
            | GpuIntrinsic::Remainder
            | GpuIntrinsic::Copysign
            | GpuIntrinsic::Cbrt
            | GpuIntrinsic::Hypot => "math",

            GpuIntrinsic::Sin
            | GpuIntrinsic::Cos
            | GpuIntrinsic::Tan
            | GpuIntrinsic::Asin
            | GpuIntrinsic::Acos
            | GpuIntrinsic::Atan
            | GpuIntrinsic::Atan2
            | GpuIntrinsic::Sincos
            | GpuIntrinsic::Sinpi
            | GpuIntrinsic::Cospi => "trigonometric",

            GpuIntrinsic::Sinh
            | GpuIntrinsic::Cosh
            | GpuIntrinsic::Tanh
            | GpuIntrinsic::Asinh
            | GpuIntrinsic::Acosh
            | GpuIntrinsic::Atanh => "hyperbolic",

            GpuIntrinsic::Exp
            | GpuIntrinsic::Exp2
            | GpuIntrinsic::Exp10
            | GpuIntrinsic::Expm1
            | GpuIntrinsic::Log
            | GpuIntrinsic::Log2
            | GpuIntrinsic::Log10
            | GpuIntrinsic::Log1p
            | GpuIntrinsic::Pow
            | GpuIntrinsic::Ldexp
            | GpuIntrinsic::Scalbn
            | GpuIntrinsic::Ilogb
            | GpuIntrinsic::Lgamma
            | GpuIntrinsic::Tgamma
            | GpuIntrinsic::Erf
            | GpuIntrinsic::Erfc
            | GpuIntrinsic::Erfinv
            | GpuIntrinsic::Erfcinv => "exponential",

            GpuIntrinsic::Isnan
            | GpuIntrinsic::Isinf
            | GpuIntrinsic::Isfinite
            | GpuIntrinsic::Isnormal
            | GpuIntrinsic::Signbit
            | GpuIntrinsic::Nextafter
            | GpuIntrinsic::Fdim
            | GpuIntrinsic::Nan => "classification",

            GpuIntrinsic::WarpShfl
            | GpuIntrinsic::WarpShflUp
            | GpuIntrinsic::WarpShflDown
            | GpuIntrinsic::WarpShflXor
            | GpuIntrinsic::WarpActiveMask
            | GpuIntrinsic::WarpBallot
            | GpuIntrinsic::WarpAll
            | GpuIntrinsic::WarpAny
            | GpuIntrinsic::WarpMatchAny
            | GpuIntrinsic::WarpMatchAll
            | GpuIntrinsic::WarpReduceAdd
            | GpuIntrinsic::WarpReduceMin
            | GpuIntrinsic::WarpReduceMax
            | GpuIntrinsic::WarpReduceAnd
            | GpuIntrinsic::WarpReduceOr
            | GpuIntrinsic::WarpReduceXor => "warp",

            GpuIntrinsic::Popc
            | GpuIntrinsic::Clz
            | GpuIntrinsic::Ctz
            | GpuIntrinsic::Ffs
            | GpuIntrinsic::Brev
            | GpuIntrinsic::BytePerm
            | GpuIntrinsic::FunnelShiftLeft
            | GpuIntrinsic::FunnelShiftRight => "bit",

            GpuIntrinsic::Ldg | GpuIntrinsic::PrefetchL1 | GpuIntrinsic::PrefetchL2 => "memory",

            GpuIntrinsic::Rcp
            | GpuIntrinsic::Fdividef
            | GpuIntrinsic::Saturate
            | GpuIntrinsic::J0
            | GpuIntrinsic::J1
            | GpuIntrinsic::Jn
            | GpuIntrinsic::Y0
            | GpuIntrinsic::Y1
            | GpuIntrinsic::Yn
            | GpuIntrinsic::Normcdf
            | GpuIntrinsic::Normcdfinv
            | GpuIntrinsic::CylBesselI0
            | GpuIntrinsic::CylBesselI1 => "special",

            GpuIntrinsic::ThreadIdxX
            | GpuIntrinsic::ThreadIdxY
            | GpuIntrinsic::ThreadIdxZ
            | GpuIntrinsic::BlockIdxX
            | GpuIntrinsic::BlockIdxY
            | GpuIntrinsic::BlockIdxZ
            | GpuIntrinsic::BlockDimX
            | GpuIntrinsic::BlockDimY
            | GpuIntrinsic::BlockDimZ
            | GpuIntrinsic::GridDimX
            | GpuIntrinsic::GridDimY
            | GpuIntrinsic::GridDimZ
            | GpuIntrinsic::WarpSize => "index",

            GpuIntrinsic::Clock | GpuIntrinsic::Clock64 | GpuIntrinsic::Nanosleep => "timing",

            // Reduction intrinsics
            GpuIntrinsic::BlockReduceSum
            | GpuIntrinsic::BlockReduceMin
            | GpuIntrinsic::BlockReduceMax
            | GpuIntrinsic::BlockReduceAnd
            | GpuIntrinsic::BlockReduceOr
            | GpuIntrinsic::GridReduceSum
            | GpuIntrinsic::GridReduceMin
            | GpuIntrinsic::GridReduceMax
            | GpuIntrinsic::ReduceAndBroadcast => "reduction",
        }
    }
}
719
720/// Registry for mapping Rust function names to GPU intrinsics.
721#[derive(Debug)]
722pub struct IntrinsicRegistry {
723    mappings: HashMap<String, GpuIntrinsic>,
724}
725
726impl Default for IntrinsicRegistry {
727    fn default() -> Self {
728        Self::new()
729    }
730}
731
732impl IntrinsicRegistry {
733    /// Create a new registry with default mappings.
734    pub fn new() -> Self {
735        let mut mappings = HashMap::new();
736
737        // === Synchronization ===
738        mappings.insert("sync_threads".to_string(), GpuIntrinsic::SyncThreads);
739        mappings.insert("thread_fence".to_string(), GpuIntrinsic::ThreadFence);
740        mappings.insert(
741            "thread_fence_block".to_string(),
742            GpuIntrinsic::ThreadFenceBlock,
743        );
744        mappings.insert(
745            "thread_fence_system".to_string(),
746            GpuIntrinsic::ThreadFenceSystem,
747        );
748        mappings.insert(
749            "sync_threads_count".to_string(),
750            GpuIntrinsic::SyncThreadsCount,
751        );
752        mappings.insert("sync_threads_and".to_string(), GpuIntrinsic::SyncThreadsAnd);
753        mappings.insert("sync_threads_or".to_string(), GpuIntrinsic::SyncThreadsOr);
754
755        // === Atomic operations ===
756        mappings.insert("atomic_add".to_string(), GpuIntrinsic::AtomicAdd);
757        mappings.insert("atomic_sub".to_string(), GpuIntrinsic::AtomicSub);
758        mappings.insert("atomic_min".to_string(), GpuIntrinsic::AtomicMin);
759        mappings.insert("atomic_max".to_string(), GpuIntrinsic::AtomicMax);
760        mappings.insert("atomic_exchange".to_string(), GpuIntrinsic::AtomicExch);
761        mappings.insert("atomic_exch".to_string(), GpuIntrinsic::AtomicExch);
762        mappings.insert("atomic_cas".to_string(), GpuIntrinsic::AtomicCas);
763        mappings.insert("atomic_compare_swap".to_string(), GpuIntrinsic::AtomicCas);
764        mappings.insert("atomic_and".to_string(), GpuIntrinsic::AtomicAnd);
765        mappings.insert("atomic_or".to_string(), GpuIntrinsic::AtomicOr);
766        mappings.insert("atomic_xor".to_string(), GpuIntrinsic::AtomicXor);
767        mappings.insert("atomic_inc".to_string(), GpuIntrinsic::AtomicInc);
768        mappings.insert("atomic_dec".to_string(), GpuIntrinsic::AtomicDec);
769
770        // === Basic math functions ===
771        mappings.insert("sqrt".to_string(), GpuIntrinsic::Sqrt);
772        mappings.insert("rsqrt".to_string(), GpuIntrinsic::Rsqrt);
773        mappings.insert("abs".to_string(), GpuIntrinsic::Fabs);
774        mappings.insert("fabs".to_string(), GpuIntrinsic::Fabs);
775        mappings.insert("floor".to_string(), GpuIntrinsic::Floor);
776        mappings.insert("ceil".to_string(), GpuIntrinsic::Ceil);
777        mappings.insert("round".to_string(), GpuIntrinsic::Round);
778        mappings.insert("trunc".to_string(), GpuIntrinsic::Trunc);
779        mappings.insert("mul_add".to_string(), GpuIntrinsic::Fma);
780        mappings.insert("fma".to_string(), GpuIntrinsic::Fma);
781        mappings.insert("min".to_string(), GpuIntrinsic::Min);
782        mappings.insert("max".to_string(), GpuIntrinsic::Max);
783        mappings.insert("fmin".to_string(), GpuIntrinsic::Min);
784        mappings.insert("fmax".to_string(), GpuIntrinsic::Max);
785        mappings.insert("fmod".to_string(), GpuIntrinsic::Fmod);
786        mappings.insert("remainder".to_string(), GpuIntrinsic::Remainder);
787        mappings.insert("copysign".to_string(), GpuIntrinsic::Copysign);
788        mappings.insert("cbrt".to_string(), GpuIntrinsic::Cbrt);
789        mappings.insert("hypot".to_string(), GpuIntrinsic::Hypot);
790
791        // === Trigonometric functions ===
792        mappings.insert("sin".to_string(), GpuIntrinsic::Sin);
793        mappings.insert("cos".to_string(), GpuIntrinsic::Cos);
794        mappings.insert("tan".to_string(), GpuIntrinsic::Tan);
795        mappings.insert("asin".to_string(), GpuIntrinsic::Asin);
796        mappings.insert("acos".to_string(), GpuIntrinsic::Acos);
797        mappings.insert("atan".to_string(), GpuIntrinsic::Atan);
798        mappings.insert("atan2".to_string(), GpuIntrinsic::Atan2);
799        mappings.insert("sincos".to_string(), GpuIntrinsic::Sincos);
800        mappings.insert("sinpi".to_string(), GpuIntrinsic::Sinpi);
801        mappings.insert("cospi".to_string(), GpuIntrinsic::Cospi);
802
803        // === Hyperbolic functions ===
804        mappings.insert("sinh".to_string(), GpuIntrinsic::Sinh);
805        mappings.insert("cosh".to_string(), GpuIntrinsic::Cosh);
806        mappings.insert("tanh".to_string(), GpuIntrinsic::Tanh);
807        mappings.insert("asinh".to_string(), GpuIntrinsic::Asinh);
808        mappings.insert("acosh".to_string(), GpuIntrinsic::Acosh);
809        mappings.insert("atanh".to_string(), GpuIntrinsic::Atanh);
810
811        // === Exponential and logarithmic ===
812        mappings.insert("exp".to_string(), GpuIntrinsic::Exp);
813        mappings.insert("exp2".to_string(), GpuIntrinsic::Exp2);
814        mappings.insert("exp10".to_string(), GpuIntrinsic::Exp10);
815        mappings.insert("expm1".to_string(), GpuIntrinsic::Expm1);
816        mappings.insert("ln".to_string(), GpuIntrinsic::Log);
817        mappings.insert("log".to_string(), GpuIntrinsic::Log);
818        mappings.insert("log2".to_string(), GpuIntrinsic::Log2);
819        mappings.insert("log10".to_string(), GpuIntrinsic::Log10);
820        mappings.insert("log1p".to_string(), GpuIntrinsic::Log1p);
821        mappings.insert("powf".to_string(), GpuIntrinsic::Pow);
822        mappings.insert("powi".to_string(), GpuIntrinsic::Pow);
823        mappings.insert("pow".to_string(), GpuIntrinsic::Pow);
824        mappings.insert("ldexp".to_string(), GpuIntrinsic::Ldexp);
825        mappings.insert("scalbn".to_string(), GpuIntrinsic::Scalbn);
826        mappings.insert("ilogb".to_string(), GpuIntrinsic::Ilogb);
827        mappings.insert("lgamma".to_string(), GpuIntrinsic::Lgamma);
828        mappings.insert("tgamma".to_string(), GpuIntrinsic::Tgamma);
829        mappings.insert("gamma".to_string(), GpuIntrinsic::Tgamma);
830        mappings.insert("erf".to_string(), GpuIntrinsic::Erf);
831        mappings.insert("erfc".to_string(), GpuIntrinsic::Erfc);
832        mappings.insert("erfinv".to_string(), GpuIntrinsic::Erfinv);
833        mappings.insert("erfcinv".to_string(), GpuIntrinsic::Erfcinv);
834
835        // === Classification and comparison ===
836        mappings.insert("is_nan".to_string(), GpuIntrinsic::Isnan);
837        mappings.insert("isnan".to_string(), GpuIntrinsic::Isnan);
838        mappings.insert("is_infinite".to_string(), GpuIntrinsic::Isinf);
839        mappings.insert("isinf".to_string(), GpuIntrinsic::Isinf);
840        mappings.insert("is_finite".to_string(), GpuIntrinsic::Isfinite);
841        mappings.insert("isfinite".to_string(), GpuIntrinsic::Isfinite);
842        mappings.insert("is_normal".to_string(), GpuIntrinsic::Isnormal);
843        mappings.insert("isnormal".to_string(), GpuIntrinsic::Isnormal);
844        mappings.insert("is_sign_negative".to_string(), GpuIntrinsic::Signbit);
845        mappings.insert("signbit".to_string(), GpuIntrinsic::Signbit);
846        mappings.insert("nextafter".to_string(), GpuIntrinsic::Nextafter);
847        mappings.insert("fdim".to_string(), GpuIntrinsic::Fdim);
848        mappings.insert("nan".to_string(), GpuIntrinsic::Nan);
849
850        // === Warp operations ===
851        mappings.insert("warp_shfl".to_string(), GpuIntrinsic::WarpShfl);
852        mappings.insert("warp_shuffle".to_string(), GpuIntrinsic::WarpShfl);
853        mappings.insert("warp_shfl_up".to_string(), GpuIntrinsic::WarpShflUp);
854        mappings.insert("warp_shuffle_up".to_string(), GpuIntrinsic::WarpShflUp);
855        mappings.insert("warp_shfl_down".to_string(), GpuIntrinsic::WarpShflDown);
856        mappings.insert("warp_shuffle_down".to_string(), GpuIntrinsic::WarpShflDown);
857        mappings.insert("warp_shfl_xor".to_string(), GpuIntrinsic::WarpShflXor);
858        mappings.insert("warp_shuffle_xor".to_string(), GpuIntrinsic::WarpShflXor);
859        mappings.insert("warp_active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
860        mappings.insert("active_mask".to_string(), GpuIntrinsic::WarpActiveMask);
861        mappings.insert("warp_ballot".to_string(), GpuIntrinsic::WarpBallot);
862        mappings.insert("ballot".to_string(), GpuIntrinsic::WarpBallot);
863        mappings.insert("warp_all".to_string(), GpuIntrinsic::WarpAll);
864        mappings.insert("warp_any".to_string(), GpuIntrinsic::WarpAny);
865        mappings.insert("warp_match_any".to_string(), GpuIntrinsic::WarpMatchAny);
866        mappings.insert("warp_match_all".to_string(), GpuIntrinsic::WarpMatchAll);
867        mappings.insert("warp_reduce_add".to_string(), GpuIntrinsic::WarpReduceAdd);
868        mappings.insert("warp_reduce_min".to_string(), GpuIntrinsic::WarpReduceMin);
869        mappings.insert("warp_reduce_max".to_string(), GpuIntrinsic::WarpReduceMax);
870        mappings.insert("warp_reduce_and".to_string(), GpuIntrinsic::WarpReduceAnd);
871        mappings.insert("warp_reduce_or".to_string(), GpuIntrinsic::WarpReduceOr);
872        mappings.insert("warp_reduce_xor".to_string(), GpuIntrinsic::WarpReduceXor);
873
874        // === Bit manipulation ===
875        mappings.insert("popc".to_string(), GpuIntrinsic::Popc);
876        mappings.insert("popcount".to_string(), GpuIntrinsic::Popc);
877        mappings.insert("count_ones".to_string(), GpuIntrinsic::Popc);
878        mappings.insert("clz".to_string(), GpuIntrinsic::Clz);
879        mappings.insert("leading_zeros".to_string(), GpuIntrinsic::Clz);
880        mappings.insert("ctz".to_string(), GpuIntrinsic::Ctz);
881        mappings.insert("trailing_zeros".to_string(), GpuIntrinsic::Ctz);
882        mappings.insert("ffs".to_string(), GpuIntrinsic::Ffs);
883        mappings.insert("brev".to_string(), GpuIntrinsic::Brev);
884        mappings.insert("reverse_bits".to_string(), GpuIntrinsic::Brev);
885        mappings.insert("byte_perm".to_string(), GpuIntrinsic::BytePerm);
886        mappings.insert(
887            "funnel_shift_left".to_string(),
888            GpuIntrinsic::FunnelShiftLeft,
889        );
890        mappings.insert(
891            "funnel_shift_right".to_string(),
892            GpuIntrinsic::FunnelShiftRight,
893        );
894
895        // === Memory operations ===
896        mappings.insert("ldg".to_string(), GpuIntrinsic::Ldg);
897        mappings.insert("load_global".to_string(), GpuIntrinsic::Ldg);
898        mappings.insert("prefetch_l1".to_string(), GpuIntrinsic::PrefetchL1);
899        mappings.insert("prefetch_l2".to_string(), GpuIntrinsic::PrefetchL2);
900
901        // === Special functions ===
902        mappings.insert("rcp".to_string(), GpuIntrinsic::Rcp);
903        mappings.insert("recip".to_string(), GpuIntrinsic::Rcp);
904        mappings.insert("fdividef".to_string(), GpuIntrinsic::Fdividef);
905        mappings.insert("fast_div".to_string(), GpuIntrinsic::Fdividef);
906        mappings.insert("saturate".to_string(), GpuIntrinsic::Saturate);
907        mappings.insert("clamp_01".to_string(), GpuIntrinsic::Saturate);
908        mappings.insert("j0".to_string(), GpuIntrinsic::J0);
909        mappings.insert("j1".to_string(), GpuIntrinsic::J1);
910        mappings.insert("jn".to_string(), GpuIntrinsic::Jn);
911        mappings.insert("y0".to_string(), GpuIntrinsic::Y0);
912        mappings.insert("y1".to_string(), GpuIntrinsic::Y1);
913        mappings.insert("yn".to_string(), GpuIntrinsic::Yn);
914        mappings.insert("normcdf".to_string(), GpuIntrinsic::Normcdf);
915        mappings.insert("norm_cdf".to_string(), GpuIntrinsic::Normcdf);
916        mappings.insert("normcdfinv".to_string(), GpuIntrinsic::Normcdfinv);
917        mappings.insert("norm_cdf_inv".to_string(), GpuIntrinsic::Normcdfinv);
918        mappings.insert("cyl_bessel_i0".to_string(), GpuIntrinsic::CylBesselI0);
919        mappings.insert("cyl_bessel_i1".to_string(), GpuIntrinsic::CylBesselI1);
920
921        // === Thread/block indices ===
922        mappings.insert("thread_idx_x".to_string(), GpuIntrinsic::ThreadIdxX);
923        mappings.insert("thread_idx_y".to_string(), GpuIntrinsic::ThreadIdxY);
924        mappings.insert("thread_idx_z".to_string(), GpuIntrinsic::ThreadIdxZ);
925        mappings.insert("block_idx_x".to_string(), GpuIntrinsic::BlockIdxX);
926        mappings.insert("block_idx_y".to_string(), GpuIntrinsic::BlockIdxY);
927        mappings.insert("block_idx_z".to_string(), GpuIntrinsic::BlockIdxZ);
928        mappings.insert("block_dim_x".to_string(), GpuIntrinsic::BlockDimX);
929        mappings.insert("block_dim_y".to_string(), GpuIntrinsic::BlockDimY);
930        mappings.insert("block_dim_z".to_string(), GpuIntrinsic::BlockDimZ);
931        mappings.insert("grid_dim_x".to_string(), GpuIntrinsic::GridDimX);
932        mappings.insert("grid_dim_y".to_string(), GpuIntrinsic::GridDimY);
933        mappings.insert("grid_dim_z".to_string(), GpuIntrinsic::GridDimZ);
934        mappings.insert("warp_size".to_string(), GpuIntrinsic::WarpSize);
935
936        // === Clock and timing ===
937        mappings.insert("clock".to_string(), GpuIntrinsic::Clock);
938        mappings.insert("clock64".to_string(), GpuIntrinsic::Clock64);
939        mappings.insert("nanosleep".to_string(), GpuIntrinsic::Nanosleep);
940
941        // === Block-level reductions ===
942        mappings.insert("block_reduce_sum".to_string(), GpuIntrinsic::BlockReduceSum);
943        mappings.insert("block_reduce_min".to_string(), GpuIntrinsic::BlockReduceMin);
944        mappings.insert("block_reduce_max".to_string(), GpuIntrinsic::BlockReduceMax);
945        mappings.insert("block_reduce_and".to_string(), GpuIntrinsic::BlockReduceAnd);
946        mappings.insert("block_reduce_or".to_string(), GpuIntrinsic::BlockReduceOr);
947
948        // === Grid-level reductions ===
949        mappings.insert("grid_reduce_sum".to_string(), GpuIntrinsic::GridReduceSum);
950        mappings.insert("grid_reduce_min".to_string(), GpuIntrinsic::GridReduceMin);
951        mappings.insert("grid_reduce_max".to_string(), GpuIntrinsic::GridReduceMax);
952
953        // === Reduce and broadcast ===
954        mappings.insert(
955            "reduce_and_broadcast".to_string(),
956            GpuIntrinsic::ReduceAndBroadcast,
957        );
958        // Alternative naming convention
959        mappings.insert(
960            "reduce_broadcast".to_string(),
961            GpuIntrinsic::ReduceAndBroadcast,
962        );
963
964        Self { mappings }
965    }
966
967    /// Look up an intrinsic by Rust function name.
968    pub fn lookup(&self, name: &str) -> Option<&GpuIntrinsic> {
969        self.mappings.get(name)
970    }
971
972    /// Register a custom intrinsic mapping.
973    pub fn register(&mut self, rust_name: &str, intrinsic: GpuIntrinsic) {
974        self.mappings.insert(rust_name.to_string(), intrinsic);
975    }
976
977    /// Check if a name is a known intrinsic.
978    pub fn is_intrinsic(&self, name: &str) -> bool {
979        self.mappings.contains_key(name)
980    }
981}
982
/// Stencil-specific intrinsics for neighbor access.
///
/// The transpiler lowers these specially, based on the active stencil
/// configuration, rather than through the generic intrinsic registry.
#[derive(Debug, Clone, PartialEq)]
pub enum StencilIntrinsic {
    /// Linear index of the current cell: `pos.idx()`
    Index,
    /// Neighbor one row back: `pos.north(buf)`
    North,
    /// Neighbor one row forward: `pos.south(buf)`
    South,
    /// Neighbor one column forward: `pos.east(buf)`
    East,
    /// Neighbor one column back: `pos.west(buf)`
    West,
    /// Neighbor at an arbitrary offset: `pos.at(buf, dx, dy)`
    At,
    /// 3D only: neighbor in the previous slice: `pos.up(buf)`
    Up,
    /// 3D only: neighbor in the next slice: `pos.down(buf)`
    Down,
}

impl StencilIntrinsic {
    /// Map a stencil accessor method name to its intrinsic.
    ///
    /// Returns `None` for names that are not stencil accessors.
    pub fn from_method_name(name: &str) -> Option<Self> {
        let intrinsic = match name {
            "idx" => Self::Index,
            "north" => Self::North,
            "south" => Self::South,
            "east" => Self::East,
            "west" => Self::West,
            "at" => Self::At,
            "up" => Self::Up,
            "down" => Self::Down,
            _ => return None,
        };
        Some(intrinsic)
    }
}
1023
/// Ring kernel intrinsics for persistent actor kernels.
///
/// These intrinsics provide access to control block state, queue operations,
/// and HLC (Hybrid Logical Clock) functionality within ring kernel handlers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RingKernelIntrinsic {
    // === Control Block Access ===
    /// Check if kernel is active: `is_active()`
    IsActive,
    /// Check if termination requested: `should_terminate()`
    ShouldTerminate,
    /// Mark kernel as terminated: `mark_terminated()`
    MarkTerminated,
    /// Get messages processed count: `messages_processed()`
    GetMessagesProcessed,

    // === Queue Operations ===
    /// Get input queue size: `input_queue_size()`
    InputQueueSize,
    /// Get output queue size: `output_queue_size()`
    OutputQueueSize,
    /// Check if input queue empty: `input_queue_empty()`
    InputQueueEmpty,
    /// Check if output queue empty: `output_queue_empty()`
    OutputQueueEmpty,
    /// Enqueue a response: `enqueue_response(&response)`
    EnqueueResponse,

    // === HLC Operations ===
    /// Increment HLC logical counter: `hlc_tick()`
    HlcTick,
    /// Update HLC with received timestamp: `hlc_update(received_ts)`
    HlcUpdate,
    /// Get current HLC timestamp: `hlc_now()`
    HlcNow,

    // === K2K Operations ===
    /// Send message to another kernel: `k2k_send(target_id, &msg)`
    K2kSend,
    /// Try to receive K2K message: `k2k_try_recv()`
    K2kTryRecv,
    /// Check for K2K messages: `k2k_has_message()`
    K2kHasMessage,
    /// Peek at next K2K message without consuming: `k2k_peek()`
    K2kPeek,
    /// Get number of pending K2K messages: `k2k_pending_count()`
    K2kPendingCount,

    // === Timing ===
    /// Sleep for nanoseconds: `nanosleep(ns)`
    Nanosleep,
}

impl RingKernelIntrinsic {
    /// Generate the CUDA source text for this intrinsic.
    ///
    /// `args` holds the already-transpiled argument expressions. Intrinsics
    /// that require arguments emit an explanatory C comment when `args` is
    /// empty, so the omission is visible in the generated kernel source.
    ///
    /// `atomicAdd(ptr, 0)` is used throughout as an atomic load of
    /// control-block counters (it returns the old value unchanged).
    pub fn to_cuda(&self, args: &[String]) -> String {
        match self {
            // NOTE(review): these comparison expressions are not wrapped in
            // outer parens; `!=` binds tighter than the bitwise/logical
            // operators callers are likely to apply, but confirm the
            // transpiler never splices them into e.g. a cast or `?:`.
            Self::IsActive => "atomicAdd(&control->is_active, 0) != 0".to_string(),
            Self::ShouldTerminate => "atomicAdd(&control->should_terminate, 0) != 0".to_string(),
            Self::MarkTerminated => "atomicExch(&control->has_terminated, 1)".to_string(),
            Self::GetMessagesProcessed => "atomicAdd(&control->messages_processed, 0)".to_string(),

            // Queue sizes are head - tail of monotonic counters; the whole
            // expression is parenthesized so it composes safely in C.
            Self::InputQueueSize => {
                "(atomicAdd(&control->input_head, 0) - atomicAdd(&control->input_tail, 0))"
                    .to_string()
            }
            Self::OutputQueueSize => {
                "(atomicAdd(&control->output_head, 0) - atomicAdd(&control->output_tail, 0))"
                    .to_string()
            }
            Self::InputQueueEmpty => {
                "(atomicAdd(&control->input_head, 0) == atomicAdd(&control->input_tail, 0))"
                    .to_string()
            }
            Self::OutputQueueEmpty => {
                "(atomicAdd(&control->output_head, 0) == atomicAdd(&control->output_tail, 0))"
                    .to_string()
            }
            Self::EnqueueResponse => {
                if !args.is_empty() {
                    // Claim a slot by bumping the head (masked to ring size),
                    // then copy the response payload into that slot.
                    format!(
                        "{{ unsigned long long _out_idx = atomicAdd(&control->output_head, 1) & control->output_mask; \
                         memcpy(&output_buffer[_out_idx * RESP_SIZE], {}, RESP_SIZE); }}",
                        args[0]
                    )
                } else {
                    "/* enqueue_response requires response pointer */".to_string()
                }
            }

            Self::HlcTick => "hlc_logical++".to_string(),
            Self::HlcUpdate => {
                if !args.is_empty() {
                    // Standard HLC merge: adopt a newer physical time and
                    // reset the logical counter, otherwise just tick.
                    format!(
                        "{{ if ({} > hlc_physical) {{ hlc_physical = {}; hlc_logical = 0; }} else {{ hlc_logical++; }} }}",
                        args[0], args[0]
                    )
                } else {
                    // No timestamp supplied: degrade to a plain tick.
                    "hlc_logical++".to_string()
                }
            }
            // FIX: wrap the packed timestamp in outer parens, matching the
            // queue-size expressions. Without them, splicing into a larger
            // host expression (e.g. `hlc_now() + 1`) regroups around the
            // low-precedence `|` and silently changes the generated value.
            Self::HlcNow => "((hlc_physical << 32) | (hlc_logical & 0xFFFFFFFF))".to_string(),

            Self::K2kSend => {
                if args.len() >= 2 {
                    // k2k_send(target_id, msg_ptr) -> k2k_send(k2k_routes, target_id, msg_ptr, sizeof(*msg_ptr))
                    format!(
                        "k2k_send(k2k_routes, {}, {}, sizeof(*{}))",
                        args[0], args[1], args[1]
                    )
                } else {
                    "/* k2k_send requires target_id and msg_ptr */".to_string()
                }
            }
            Self::K2kTryRecv => "k2k_try_recv(k2k_inbox)".to_string(),
            Self::K2kHasMessage => "k2k_has_message(k2k_inbox)".to_string(),
            Self::K2kPeek => "k2k_peek(k2k_inbox)".to_string(),
            Self::K2kPendingCount => "k2k_pending_count(k2k_inbox)".to_string(),

            Self::Nanosleep => {
                if !args.is_empty() {
                    format!("__nanosleep({})", args[0])
                } else {
                    // Default backoff of 1 microsecond when no duration given.
                    "__nanosleep(1000)".to_string()
                }
            }
        }
    }

    /// Parse a handler-visible function name (including aliases) into its
    /// intrinsic. Returns `None` for unrecognized names.
    pub fn from_name(name: &str) -> Option<Self> {
        match name {
            "is_active" | "is_kernel_active" => Some(Self::IsActive),
            "should_terminate" => Some(Self::ShouldTerminate),
            "mark_terminated" => Some(Self::MarkTerminated),
            "messages_processed" | "get_messages_processed" => Some(Self::GetMessagesProcessed),

            "input_queue_size" => Some(Self::InputQueueSize),
            "output_queue_size" => Some(Self::OutputQueueSize),
            "input_queue_empty" => Some(Self::InputQueueEmpty),
            "output_queue_empty" => Some(Self::OutputQueueEmpty),
            "enqueue_response" | "enqueue" => Some(Self::EnqueueResponse),

            "hlc_tick" => Some(Self::HlcTick),
            "hlc_update" => Some(Self::HlcUpdate),
            "hlc_now" => Some(Self::HlcNow),

            "k2k_send" => Some(Self::K2kSend),
            "k2k_try_recv" => Some(Self::K2kTryRecv),
            "k2k_has_message" => Some(Self::K2kHasMessage),
            "k2k_peek" => Some(Self::K2kPeek),
            "k2k_pending_count" | "k2k_pending" => Some(Self::K2kPendingCount),

            "nanosleep" => Some(Self::Nanosleep),

            _ => None,
        }
    }

    /// Check if this intrinsic requires the control block (i.e. its generated
    /// code dereferences `control`).
    pub fn requires_control_block(&self) -> bool {
        matches!(
            self,
            Self::IsActive
                | Self::ShouldTerminate
                | Self::MarkTerminated
                | Self::GetMessagesProcessed
                | Self::InputQueueSize
                | Self::OutputQueueSize
                | Self::InputQueueEmpty
                | Self::OutputQueueEmpty
                | Self::EnqueueResponse
        )
    }

    /// Check if this intrinsic requires HLC state (`hlc_physical` /
    /// `hlc_logical` variables in the kernel).
    pub fn requires_hlc(&self) -> bool {
        matches!(self, Self::HlcTick | Self::HlcUpdate | Self::HlcNow)
    }

    /// Check if this intrinsic requires K2K support (`k2k_routes` /
    /// `k2k_inbox` in scope).
    pub fn requires_k2k(&self) -> bool {
        matches!(
            self,
            Self::K2kSend
                | Self::K2kTryRecv
                | Self::K2kHasMessage
                | Self::K2kPeek
                | Self::K2kPendingCount
        )
    }
}
1216
1217impl StencilIntrinsic {
1218    /// Get the index offset for 2D stencil (relative to buffer_width).
1219    ///
1220    /// Returns (row_offset, col_offset) where final offset is:
1221    /// `row_offset * buffer_width + col_offset`
1222    pub fn get_offset_2d(&self) -> Option<(i32, i32)> {
1223        match self {
1224            StencilIntrinsic::Index => Some((0, 0)),
1225            StencilIntrinsic::North => Some((-1, 0)),
1226            StencilIntrinsic::South => Some((1, 0)),
1227            StencilIntrinsic::East => Some((0, 1)),
1228            StencilIntrinsic::West => Some((0, -1)),
1229            StencilIntrinsic::At => None, // Requires runtime offset
1230            StencilIntrinsic::Up | StencilIntrinsic::Down => None, // 3D only
1231        }
1232    }
1233
1234    /// Get the index offset for 3D stencil.
1235    ///
1236    /// Returns (z_offset, row_offset, col_offset) where final offset is:
1237    /// `z_offset * buffer_slice + row_offset * buffer_width + col_offset`
1238    pub fn get_offset_3d(&self) -> Option<(i32, i32, i32)> {
1239        match self {
1240            StencilIntrinsic::Index => Some((0, 0, 0)),
1241            StencilIntrinsic::North => Some((0, -1, 0)),
1242            StencilIntrinsic::South => Some((0, 1, 0)),
1243            StencilIntrinsic::East => Some((0, 0, 1)),
1244            StencilIntrinsic::West => Some((0, 0, -1)),
1245            StencilIntrinsic::Up => Some((-1, 0, 0)),
1246            StencilIntrinsic::Down => Some((1, 0, 0)),
1247            StencilIntrinsic::At => None, // Requires runtime offset
1248        }
1249    }
1250
1251    /// Check if this is a 3D-only intrinsic.
1252    pub fn is_3d_only(&self) -> bool {
1253        matches!(self, StencilIntrinsic::Up | StencilIntrinsic::Down)
1254    }
1255
1256    /// Generate CUDA index expression for 2D stencil.
1257    ///
1258    /// # Arguments
1259    /// * `buffer_name` - Name of the buffer variable
1260    /// * `buffer_width` - Width expression (e.g., "18" for tile_size + 2*halo)
1261    /// * `idx_var` - Name of the current index variable
1262    pub fn to_cuda_index_2d(&self, buffer_name: &str, buffer_width: &str, idx_var: &str) -> String {
1263        match self {
1264            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1265            StencilIntrinsic::North => {
1266                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1267            }
1268            StencilIntrinsic::South => {
1269                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1270            }
1271            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1272            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1273            StencilIntrinsic::At => {
1274                // This should be handled specially with provided offsets
1275                format!("{}[{}]", buffer_name, idx_var)
1276            }
1277            _ => format!("{}[{}]", buffer_name, idx_var),
1278        }
1279    }
1280
1281    /// Generate CUDA index expression for 3D stencil.
1282    ///
1283    /// # Arguments
1284    /// * `buffer_name` - Name of the buffer variable
1285    /// * `buffer_width` - Width expression
1286    /// * `buffer_slice` - Slice size expression (width * height)
1287    /// * `idx_var` - Name of the current index variable
1288    pub fn to_cuda_index_3d(
1289        &self,
1290        buffer_name: &str,
1291        buffer_width: &str,
1292        buffer_slice: &str,
1293        idx_var: &str,
1294    ) -> String {
1295        match self {
1296            StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
1297            StencilIntrinsic::North => {
1298                format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
1299            }
1300            StencilIntrinsic::South => {
1301                format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
1302            }
1303            StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
1304            StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
1305            StencilIntrinsic::Up => {
1306                format!("{}[{} - {}]", buffer_name, idx_var, buffer_slice)
1307            }
1308            StencilIntrinsic::Down => {
1309                format!("{}[{} + {}]", buffer_name, idx_var, buffer_slice)
1310            }
1311            StencilIntrinsic::At => {
1312                // This should be handled specially with provided offsets
1313                format!("{}[{}]", buffer_name, idx_var)
1314            }
1315        }
1316    }
1317}
1318
1319#[cfg(test)]
1320mod tests {
1321    use super::*;
1322
1323    #[test]
1324    fn test_intrinsic_lookup() {
1325        let registry = IntrinsicRegistry::new();
1326
1327        assert_eq!(
1328            registry.lookup("sync_threads"),
1329            Some(&GpuIntrinsic::SyncThreads)
1330        );
1331        assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
1332        assert_eq!(registry.lookup("unknown_func"), None);
1333    }
1334
1335    #[test]
1336    fn test_intrinsic_cuda_output() {
1337        assert_eq!(
1338            GpuIntrinsic::SyncThreads.to_cuda_string(),
1339            "__syncthreads()"
1340        );
1341        assert_eq!(GpuIntrinsic::AtomicAdd.to_cuda_string(), "atomicAdd");
1342        assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
1343    }
1344
1345    #[test]
1346    fn test_stencil_intrinsic_parsing() {
1347        assert_eq!(
1348            StencilIntrinsic::from_method_name("north"),
1349            Some(StencilIntrinsic::North)
1350        );
1351        assert_eq!(
1352            StencilIntrinsic::from_method_name("idx"),
1353            Some(StencilIntrinsic::Index)
1354        );
1355        assert_eq!(StencilIntrinsic::from_method_name("unknown"), None);
1356    }
1357
1358    #[test]
1359    fn test_stencil_cuda_index() {
1360        let north = StencilIntrinsic::North;
1361        assert_eq!(
1362            north.to_cuda_index_2d("p", "buffer_width", "idx"),
1363            "p[idx - buffer_width]"
1364        );
1365
1366        let east = StencilIntrinsic::East;
1367        assert_eq!(east.to_cuda_index_2d("p", "18", "idx"), "p[idx + 1]");
1368    }
1369
1370    #[test]
1371    fn test_stencil_offset() {
1372        assert_eq!(StencilIntrinsic::North.get_offset_2d(), Some((-1, 0)));
1373        assert_eq!(StencilIntrinsic::East.get_offset_2d(), Some((0, 1)));
1374        assert_eq!(StencilIntrinsic::Index.get_offset_2d(), Some((0, 0)));
1375    }
1376
1377    #[test]
1378    fn test_ring_kernel_intrinsic_lookup() {
1379        assert_eq!(
1380            RingKernelIntrinsic::from_name("is_active"),
1381            Some(RingKernelIntrinsic::IsActive)
1382        );
1383        assert_eq!(
1384            RingKernelIntrinsic::from_name("should_terminate"),
1385            Some(RingKernelIntrinsic::ShouldTerminate)
1386        );
1387        assert_eq!(
1388            RingKernelIntrinsic::from_name("hlc_tick"),
1389            Some(RingKernelIntrinsic::HlcTick)
1390        );
1391        assert_eq!(
1392            RingKernelIntrinsic::from_name("enqueue_response"),
1393            Some(RingKernelIntrinsic::EnqueueResponse)
1394        );
1395        assert_eq!(RingKernelIntrinsic::from_name("unknown"), None);
1396    }
1397
1398    #[test]
1399    fn test_ring_kernel_intrinsic_cuda_output() {
1400        assert!(RingKernelIntrinsic::IsActive
1401            .to_cuda(&[])
1402            .contains("is_active"));
1403        assert!(RingKernelIntrinsic::ShouldTerminate
1404            .to_cuda(&[])
1405            .contains("should_terminate"));
1406        assert!(RingKernelIntrinsic::HlcTick
1407            .to_cuda(&[])
1408            .contains("hlc_logical"));
1409        assert!(RingKernelIntrinsic::InputQueueEmpty
1410            .to_cuda(&[])
1411            .contains("input_head"));
1412    }
1413
1414    #[test]
1415    fn test_ring_kernel_queue_intrinsics() {
1416        let enqueue = RingKernelIntrinsic::EnqueueResponse;
1417        let cuda = enqueue.to_cuda(&["&response".to_string()]);
1418        assert!(cuda.contains("output_head"));
1419        assert!(cuda.contains("memcpy"));
1420    }
1421
1422    #[test]
1423    fn test_k2k_intrinsics() {
1424        // Test k2k_send
1425        let send = RingKernelIntrinsic::K2kSend;
1426        let cuda = send.to_cuda(&["target_id".to_string(), "&msg".to_string()]);
1427        assert!(cuda.contains("k2k_send"));
1428        assert!(cuda.contains("k2k_routes"));
1429        assert!(cuda.contains("target_id"));
1430
1431        // Test k2k_try_recv
1432        assert_eq!(
1433            RingKernelIntrinsic::K2kTryRecv.to_cuda(&[]),
1434            "k2k_try_recv(k2k_inbox)"
1435        );
1436
1437        // Test k2k_has_message
1438        assert_eq!(
1439            RingKernelIntrinsic::K2kHasMessage.to_cuda(&[]),
1440            "k2k_has_message(k2k_inbox)"
1441        );
1442
1443        // Test k2k_peek
1444        assert_eq!(
1445            RingKernelIntrinsic::K2kPeek.to_cuda(&[]),
1446            "k2k_peek(k2k_inbox)"
1447        );
1448
1449        // Test k2k_pending_count
1450        assert_eq!(
1451            RingKernelIntrinsic::K2kPendingCount.to_cuda(&[]),
1452            "k2k_pending_count(k2k_inbox)"
1453        );
1454    }
1455
1456    #[test]
1457    fn test_k2k_intrinsic_lookup() {
1458        assert_eq!(
1459            RingKernelIntrinsic::from_name("k2k_send"),
1460            Some(RingKernelIntrinsic::K2kSend)
1461        );
1462        assert_eq!(
1463            RingKernelIntrinsic::from_name("k2k_try_recv"),
1464            Some(RingKernelIntrinsic::K2kTryRecv)
1465        );
1466        assert_eq!(
1467            RingKernelIntrinsic::from_name("k2k_has_message"),
1468            Some(RingKernelIntrinsic::K2kHasMessage)
1469        );
1470        assert_eq!(
1471            RingKernelIntrinsic::from_name("k2k_peek"),
1472            Some(RingKernelIntrinsic::K2kPeek)
1473        );
1474        assert_eq!(
1475            RingKernelIntrinsic::from_name("k2k_pending_count"),
1476            Some(RingKernelIntrinsic::K2kPendingCount)
1477        );
1478    }
1479
1480    #[test]
1481    fn test_intrinsic_requirements() {
1482        // K2K intrinsics require K2K
1483        assert!(RingKernelIntrinsic::K2kSend.requires_k2k());
1484        assert!(RingKernelIntrinsic::K2kTryRecv.requires_k2k());
1485        assert!(RingKernelIntrinsic::K2kPeek.requires_k2k());
1486        assert!(!RingKernelIntrinsic::HlcTick.requires_k2k());
1487
1488        // HLC intrinsics require HLC
1489        assert!(RingKernelIntrinsic::HlcTick.requires_hlc());
1490        assert!(RingKernelIntrinsic::HlcNow.requires_hlc());
1491        assert!(!RingKernelIntrinsic::K2kSend.requires_hlc());
1492
1493        // Control block intrinsics require control block
1494        assert!(RingKernelIntrinsic::IsActive.requires_control_block());
1495        assert!(RingKernelIntrinsic::EnqueueResponse.requires_control_block());
1496        assert!(!RingKernelIntrinsic::HlcTick.requires_control_block());
1497    }
1498
    // === Tests for the extended intrinsic set (atomics, trig, exp/log, classification, warp reduce) ===
1500
1501    #[test]
1502    fn test_new_atomic_intrinsics() {
1503        let registry = IntrinsicRegistry::new();
1504
1505        // Test bitwise atomics
1506        assert_eq!(
1507            registry.lookup("atomic_and"),
1508            Some(&GpuIntrinsic::AtomicAnd)
1509        );
1510        assert_eq!(registry.lookup("atomic_or"), Some(&GpuIntrinsic::AtomicOr));
1511        assert_eq!(
1512            registry.lookup("atomic_xor"),
1513            Some(&GpuIntrinsic::AtomicXor)
1514        );
1515        assert_eq!(
1516            registry.lookup("atomic_inc"),
1517            Some(&GpuIntrinsic::AtomicInc)
1518        );
1519        assert_eq!(
1520            registry.lookup("atomic_dec"),
1521            Some(&GpuIntrinsic::AtomicDec)
1522        );
1523
1524        // Test CUDA output
1525        assert_eq!(GpuIntrinsic::AtomicAnd.to_cuda_string(), "atomicAnd");
1526        assert_eq!(GpuIntrinsic::AtomicOr.to_cuda_string(), "atomicOr");
1527        assert_eq!(GpuIntrinsic::AtomicXor.to_cuda_string(), "atomicXor");
1528        assert_eq!(GpuIntrinsic::AtomicInc.to_cuda_string(), "atomicInc");
1529        assert_eq!(GpuIntrinsic::AtomicDec.to_cuda_string(), "atomicDec");
1530    }
1531
1532    #[test]
1533    fn test_trigonometric_intrinsics() {
1534        let registry = IntrinsicRegistry::new();
1535
1536        // Test inverse trig
1537        assert_eq!(registry.lookup("asin"), Some(&GpuIntrinsic::Asin));
1538        assert_eq!(registry.lookup("acos"), Some(&GpuIntrinsic::Acos));
1539        assert_eq!(registry.lookup("atan"), Some(&GpuIntrinsic::Atan));
1540        assert_eq!(registry.lookup("atan2"), Some(&GpuIntrinsic::Atan2));
1541
1542        // Test CUDA output
1543        assert_eq!(GpuIntrinsic::Asin.to_cuda_string(), "asinf");
1544        assert_eq!(GpuIntrinsic::Acos.to_cuda_string(), "acosf");
1545        assert_eq!(GpuIntrinsic::Atan.to_cuda_string(), "atanf");
1546        assert_eq!(GpuIntrinsic::Atan2.to_cuda_string(), "atan2f");
1547    }
1548
1549    #[test]
1550    fn test_hyperbolic_intrinsics() {
1551        let registry = IntrinsicRegistry::new();
1552
1553        // Test hyperbolic functions
1554        assert_eq!(registry.lookup("sinh"), Some(&GpuIntrinsic::Sinh));
1555        assert_eq!(registry.lookup("cosh"), Some(&GpuIntrinsic::Cosh));
1556        assert_eq!(registry.lookup("tanh"), Some(&GpuIntrinsic::Tanh));
1557        assert_eq!(registry.lookup("asinh"), Some(&GpuIntrinsic::Asinh));
1558        assert_eq!(registry.lookup("acosh"), Some(&GpuIntrinsic::Acosh));
1559        assert_eq!(registry.lookup("atanh"), Some(&GpuIntrinsic::Atanh));
1560
1561        // Test CUDA output
1562        assert_eq!(GpuIntrinsic::Sinh.to_cuda_string(), "sinhf");
1563        assert_eq!(GpuIntrinsic::Cosh.to_cuda_string(), "coshf");
1564        assert_eq!(GpuIntrinsic::Tanh.to_cuda_string(), "tanhf");
1565    }
1566
1567    #[test]
1568    fn test_exponential_logarithmic_intrinsics() {
1569        let registry = IntrinsicRegistry::new();
1570
1571        // Test exp variants
1572        assert_eq!(registry.lookup("exp2"), Some(&GpuIntrinsic::Exp2));
1573        assert_eq!(registry.lookup("exp10"), Some(&GpuIntrinsic::Exp10));
1574        assert_eq!(registry.lookup("expm1"), Some(&GpuIntrinsic::Expm1));
1575
1576        // Test log variants
1577        assert_eq!(registry.lookup("log2"), Some(&GpuIntrinsic::Log2));
1578        assert_eq!(registry.lookup("log10"), Some(&GpuIntrinsic::Log10));
1579        assert_eq!(registry.lookup("log1p"), Some(&GpuIntrinsic::Log1p));
1580
1581        // Test CUDA output
1582        assert_eq!(GpuIntrinsic::Exp2.to_cuda_string(), "exp2f");
1583        assert_eq!(GpuIntrinsic::Log2.to_cuda_string(), "log2f");
1584        assert_eq!(GpuIntrinsic::Log10.to_cuda_string(), "log10f");
1585    }
1586
1587    #[test]
1588    fn test_classification_intrinsics() {
1589        let registry = IntrinsicRegistry::new();
1590
1591        // Test classification functions
1592        assert_eq!(registry.lookup("is_nan"), Some(&GpuIntrinsic::Isnan));
1593        assert_eq!(registry.lookup("isnan"), Some(&GpuIntrinsic::Isnan));
1594        assert_eq!(registry.lookup("is_infinite"), Some(&GpuIntrinsic::Isinf));
1595        assert_eq!(registry.lookup("is_finite"), Some(&GpuIntrinsic::Isfinite));
1596        assert_eq!(registry.lookup("is_normal"), Some(&GpuIntrinsic::Isnormal));
1597        assert_eq!(registry.lookup("signbit"), Some(&GpuIntrinsic::Signbit));
1598
1599        // Test CUDA output
1600        assert_eq!(GpuIntrinsic::Isnan.to_cuda_string(), "isnan");
1601        assert_eq!(GpuIntrinsic::Isinf.to_cuda_string(), "isinf");
1602        assert_eq!(GpuIntrinsic::Isfinite.to_cuda_string(), "isfinite");
1603    }
1604
1605    #[test]
1606    fn test_warp_reduce_intrinsics() {
1607        let registry = IntrinsicRegistry::new();
1608
1609        // Test warp reduce operations
1610        assert_eq!(
1611            registry.lookup("warp_reduce_add"),
1612            Some(&GpuIntrinsic::WarpReduceAdd)
1613        );
1614        assert_eq!(
1615            registry.lookup("warp_reduce_min"),
1616            Some(&GpuIntrinsic::WarpReduceMin)
1617        );
1618        assert_eq!(
1619            registry.lookup("warp_reduce_max"),
1620            Some(&GpuIntrinsic::WarpReduceMax)
1621        );
1622        assert_eq!(
1623            registry.lookup("warp_reduce_and"),
1624            Some(&GpuIntrinsic::WarpReduceAnd)
1625        );
1626        assert_eq!(
1627            registry.lookup("warp_reduce_or"),
1628            Some(&GpuIntrinsic::WarpReduceOr)
1629        );
1630        assert_eq!(
1631            registry.lookup("warp_reduce_xor"),
1632            Some(&GpuIntrinsic::WarpReduceXor)
1633        );
1634
1635        // Test CUDA output
1636        assert_eq!(
1637            GpuIntrinsic::WarpReduceAdd.to_cuda_string(),
1638            "__reduce_add_sync"
1639        );
1640        assert_eq!(
1641            GpuIntrinsic::WarpReduceMin.to_cuda_string(),
1642            "__reduce_min_sync"
1643        );
1644        assert_eq!(
1645            GpuIntrinsic::WarpReduceMax.to_cuda_string(),
1646            "__reduce_max_sync"
1647        );
1648    }
1649
1650    #[test]
1651    fn test_warp_match_intrinsics() {
1652        let registry = IntrinsicRegistry::new();
1653
1654        assert_eq!(
1655            registry.lookup("warp_match_any"),
1656            Some(&GpuIntrinsic::WarpMatchAny)
1657        );
1658        assert_eq!(
1659            registry.lookup("warp_match_all"),
1660            Some(&GpuIntrinsic::WarpMatchAll)
1661        );
1662
1663        assert_eq!(
1664            GpuIntrinsic::WarpMatchAny.to_cuda_string(),
1665            "__match_any_sync"
1666        );
1667        assert_eq!(
1668            GpuIntrinsic::WarpMatchAll.to_cuda_string(),
1669            "__match_all_sync"
1670        );
1671    }
1672
1673    #[test]
1674    fn test_bit_manipulation_intrinsics() {
1675        let registry = IntrinsicRegistry::new();
1676
1677        // Test bit manipulation
1678        assert_eq!(registry.lookup("popc"), Some(&GpuIntrinsic::Popc));
1679        assert_eq!(registry.lookup("popcount"), Some(&GpuIntrinsic::Popc));
1680        assert_eq!(registry.lookup("count_ones"), Some(&GpuIntrinsic::Popc));
1681        assert_eq!(registry.lookup("clz"), Some(&GpuIntrinsic::Clz));
1682        assert_eq!(registry.lookup("leading_zeros"), Some(&GpuIntrinsic::Clz));
1683        assert_eq!(registry.lookup("ctz"), Some(&GpuIntrinsic::Ctz));
1684        assert_eq!(registry.lookup("ffs"), Some(&GpuIntrinsic::Ffs));
1685        assert_eq!(registry.lookup("brev"), Some(&GpuIntrinsic::Brev));
1686        assert_eq!(registry.lookup("reverse_bits"), Some(&GpuIntrinsic::Brev));
1687
1688        // Test CUDA output
1689        assert_eq!(GpuIntrinsic::Popc.to_cuda_string(), "__popc");
1690        assert_eq!(GpuIntrinsic::Clz.to_cuda_string(), "__clz");
1691        assert_eq!(GpuIntrinsic::Ffs.to_cuda_string(), "__ffs");
1692        assert_eq!(GpuIntrinsic::Brev.to_cuda_string(), "__brev");
1693    }
1694
1695    #[test]
1696    fn test_funnel_shift_intrinsics() {
1697        let registry = IntrinsicRegistry::new();
1698
1699        assert_eq!(
1700            registry.lookup("funnel_shift_left"),
1701            Some(&GpuIntrinsic::FunnelShiftLeft)
1702        );
1703        assert_eq!(
1704            registry.lookup("funnel_shift_right"),
1705            Some(&GpuIntrinsic::FunnelShiftRight)
1706        );
1707
1708        assert_eq!(
1709            GpuIntrinsic::FunnelShiftLeft.to_cuda_string(),
1710            "__funnelshift_l"
1711        );
1712        assert_eq!(
1713            GpuIntrinsic::FunnelShiftRight.to_cuda_string(),
1714            "__funnelshift_r"
1715        );
1716    }
1717
1718    #[test]
1719    fn test_memory_intrinsics() {
1720        let registry = IntrinsicRegistry::new();
1721
1722        assert_eq!(registry.lookup("ldg"), Some(&GpuIntrinsic::Ldg));
1723        assert_eq!(registry.lookup("load_global"), Some(&GpuIntrinsic::Ldg));
1724        assert_eq!(
1725            registry.lookup("prefetch_l1"),
1726            Some(&GpuIntrinsic::PrefetchL1)
1727        );
1728        assert_eq!(
1729            registry.lookup("prefetch_l2"),
1730            Some(&GpuIntrinsic::PrefetchL2)
1731        );
1732
1733        assert_eq!(GpuIntrinsic::Ldg.to_cuda_string(), "__ldg");
1734        assert_eq!(GpuIntrinsic::PrefetchL1.to_cuda_string(), "__prefetch_l1");
1735        assert_eq!(GpuIntrinsic::PrefetchL2.to_cuda_string(), "__prefetch_l2");
1736    }
1737
1738    #[test]
1739    fn test_clock_intrinsics() {
1740        let registry = IntrinsicRegistry::new();
1741
1742        assert_eq!(registry.lookup("clock"), Some(&GpuIntrinsic::Clock));
1743        assert_eq!(registry.lookup("clock64"), Some(&GpuIntrinsic::Clock64));
1744        assert_eq!(registry.lookup("nanosleep"), Some(&GpuIntrinsic::Nanosleep));
1745
1746        assert_eq!(GpuIntrinsic::Clock.to_cuda_string(), "clock()");
1747        assert_eq!(GpuIntrinsic::Clock64.to_cuda_string(), "clock64()");
1748        assert_eq!(GpuIntrinsic::Nanosleep.to_cuda_string(), "__nanosleep");
1749    }
1750
1751    #[test]
1752    fn test_special_function_intrinsics() {
1753        let registry = IntrinsicRegistry::new();
1754
1755        assert_eq!(registry.lookup("rcp"), Some(&GpuIntrinsic::Rcp));
1756        assert_eq!(registry.lookup("recip"), Some(&GpuIntrinsic::Rcp));
1757        assert_eq!(registry.lookup("saturate"), Some(&GpuIntrinsic::Saturate));
1758        assert_eq!(registry.lookup("clamp_01"), Some(&GpuIntrinsic::Saturate));
1759
1760        assert_eq!(GpuIntrinsic::Rcp.to_cuda_string(), "__frcp_rn");
1761        assert_eq!(GpuIntrinsic::Saturate.to_cuda_string(), "__saturatef");
1762    }
1763
1764    #[test]
1765    fn test_intrinsic_categories() {
1766        // Test category assignment
1767        assert_eq!(GpuIntrinsic::SyncThreads.category(), "synchronization");
1768        assert_eq!(GpuIntrinsic::AtomicAdd.category(), "atomic");
1769        assert_eq!(GpuIntrinsic::Sqrt.category(), "math");
1770        assert_eq!(GpuIntrinsic::Sin.category(), "trigonometric");
1771        assert_eq!(GpuIntrinsic::Sinh.category(), "hyperbolic");
1772        assert_eq!(GpuIntrinsic::Exp.category(), "exponential");
1773        assert_eq!(GpuIntrinsic::Isnan.category(), "classification");
1774        assert_eq!(GpuIntrinsic::WarpShfl.category(), "warp");
1775        assert_eq!(GpuIntrinsic::Popc.category(), "bit");
1776        assert_eq!(GpuIntrinsic::Ldg.category(), "memory");
1777        assert_eq!(GpuIntrinsic::Rcp.category(), "special");
1778        assert_eq!(GpuIntrinsic::ThreadIdxX.category(), "index");
1779        assert_eq!(GpuIntrinsic::Clock.category(), "timing");
1780    }
1781
1782    #[test]
1783    fn test_intrinsic_flags() {
1784        // Test is_value_intrinsic
1785        assert!(GpuIntrinsic::ThreadIdxX.is_value_intrinsic());
1786        assert!(GpuIntrinsic::BlockDimX.is_value_intrinsic());
1787        assert!(GpuIntrinsic::WarpSize.is_value_intrinsic());
1788        assert!(!GpuIntrinsic::Sin.is_value_intrinsic());
1789        assert!(!GpuIntrinsic::AtomicAdd.is_value_intrinsic());
1790
1791        // Test is_zero_arg_function
1792        assert!(GpuIntrinsic::SyncThreads.is_zero_arg_function());
1793        assert!(GpuIntrinsic::ThreadFence.is_zero_arg_function());
1794        assert!(GpuIntrinsic::WarpActiveMask.is_zero_arg_function());
1795        assert!(GpuIntrinsic::Clock.is_zero_arg_function());
1796        assert!(!GpuIntrinsic::Sin.is_zero_arg_function());
1797
1798        // Test requires_mask
1799        assert!(GpuIntrinsic::WarpShfl.requires_mask());
1800        assert!(GpuIntrinsic::WarpBallot.requires_mask());
1801        assert!(GpuIntrinsic::WarpReduceAdd.requires_mask());
1802        assert!(!GpuIntrinsic::Sin.requires_mask());
1803        assert!(!GpuIntrinsic::AtomicAdd.requires_mask());
1804    }
1805
1806    #[test]
1807    fn test_3d_stencil_intrinsics() {
1808        assert_eq!(
1809            StencilIntrinsic::from_method_name("up"),
1810            Some(StencilIntrinsic::Up)
1811        );
1812        assert_eq!(
1813            StencilIntrinsic::from_method_name("down"),
1814            Some(StencilIntrinsic::Down)
1815        );
1816
1817        // Test 3D only flag
1818        assert!(StencilIntrinsic::Up.is_3d_only());
1819        assert!(StencilIntrinsic::Down.is_3d_only());
1820        assert!(!StencilIntrinsic::North.is_3d_only());
1821        assert!(!StencilIntrinsic::East.is_3d_only());
1822        assert!(!StencilIntrinsic::Index.is_3d_only());
1823
1824        // Test 3D offsets
1825        assert_eq!(StencilIntrinsic::Up.get_offset_3d(), Some((-1, 0, 0)));
1826        assert_eq!(StencilIntrinsic::Down.get_offset_3d(), Some((1, 0, 0)));
1827        assert_eq!(StencilIntrinsic::North.get_offset_3d(), Some((0, -1, 0)));
1828        assert_eq!(StencilIntrinsic::South.get_offset_3d(), Some((0, 1, 0)));
1829        assert_eq!(StencilIntrinsic::East.get_offset_3d(), Some((0, 0, 1)));
1830        assert_eq!(StencilIntrinsic::West.get_offset_3d(), Some((0, 0, -1)));
1831
1832        // Test 3D index generation
1833        let up = StencilIntrinsic::Up;
1834        assert_eq!(up.to_cuda_index_3d("p", "18", "324", "idx"), "p[idx - 324]");
1835
1836        let down = StencilIntrinsic::Down;
1837        assert_eq!(
1838            down.to_cuda_index_3d("p", "18", "324", "idx"),
1839            "p[idx + 324]"
1840        );
1841    }
1842
1843    #[test]
1844    fn test_sync_intrinsics() {
1845        let registry = IntrinsicRegistry::new();
1846
1847        assert_eq!(
1848            registry.lookup("sync_threads_count"),
1849            Some(&GpuIntrinsic::SyncThreadsCount)
1850        );
1851        assert_eq!(
1852            registry.lookup("sync_threads_and"),
1853            Some(&GpuIntrinsic::SyncThreadsAnd)
1854        );
1855        assert_eq!(
1856            registry.lookup("sync_threads_or"),
1857            Some(&GpuIntrinsic::SyncThreadsOr)
1858        );
1859
1860        assert_eq!(
1861            GpuIntrinsic::SyncThreadsCount.to_cuda_string(),
1862            "__syncthreads_count"
1863        );
1864        assert_eq!(
1865            GpuIntrinsic::SyncThreadsAnd.to_cuda_string(),
1866            "__syncthreads_and"
1867        );
1868        assert_eq!(
1869            GpuIntrinsic::SyncThreadsOr.to_cuda_string(),
1870            "__syncthreads_or"
1871        );
1872    }
1873
1874    #[test]
1875    fn test_math_extras() {
1876        let registry = IntrinsicRegistry::new();
1877
1878        assert_eq!(registry.lookup("trunc"), Some(&GpuIntrinsic::Trunc));
1879        assert_eq!(registry.lookup("cbrt"), Some(&GpuIntrinsic::Cbrt));
1880        assert_eq!(registry.lookup("hypot"), Some(&GpuIntrinsic::Hypot));
1881        assert_eq!(registry.lookup("copysign"), Some(&GpuIntrinsic::Copysign));
1882        assert_eq!(registry.lookup("fmod"), Some(&GpuIntrinsic::Fmod));
1883
1884        assert_eq!(GpuIntrinsic::Trunc.to_cuda_string(), "truncf");
1885        assert_eq!(GpuIntrinsic::Cbrt.to_cuda_string(), "cbrtf");
1886        assert_eq!(GpuIntrinsic::Hypot.to_cuda_string(), "hypotf");
1887    }
1888
1889    #[test]
1890    fn test_block_reduction_intrinsics() {
1891        let registry = IntrinsicRegistry::new();
1892
1893        // Test block-level reductions
1894        assert_eq!(
1895            registry.lookup("block_reduce_sum"),
1896            Some(&GpuIntrinsic::BlockReduceSum)
1897        );
1898        assert_eq!(
1899            registry.lookup("block_reduce_min"),
1900            Some(&GpuIntrinsic::BlockReduceMin)
1901        );
1902        assert_eq!(
1903            registry.lookup("block_reduce_max"),
1904            Some(&GpuIntrinsic::BlockReduceMax)
1905        );
1906        assert_eq!(
1907            registry.lookup("block_reduce_and"),
1908            Some(&GpuIntrinsic::BlockReduceAnd)
1909        );
1910        assert_eq!(
1911            registry.lookup("block_reduce_or"),
1912            Some(&GpuIntrinsic::BlockReduceOr)
1913        );
1914
1915        // Test CUDA output
1916        assert_eq!(
1917            GpuIntrinsic::BlockReduceSum.to_cuda_string(),
1918            "__block_reduce_sum"
1919        );
1920        assert_eq!(
1921            GpuIntrinsic::BlockReduceMin.to_cuda_string(),
1922            "__block_reduce_min"
1923        );
1924        assert_eq!(
1925            GpuIntrinsic::BlockReduceMax.to_cuda_string(),
1926            "__block_reduce_max"
1927        );
1928
1929        // Test category
1930        assert_eq!(GpuIntrinsic::BlockReduceSum.category(), "reduction");
1931        assert_eq!(GpuIntrinsic::BlockReduceMin.category(), "reduction");
1932    }
1933
1934    #[test]
1935    fn test_grid_reduction_intrinsics() {
1936        let registry = IntrinsicRegistry::new();
1937
1938        // Test grid-level reductions
1939        assert_eq!(
1940            registry.lookup("grid_reduce_sum"),
1941            Some(&GpuIntrinsic::GridReduceSum)
1942        );
1943        assert_eq!(
1944            registry.lookup("grid_reduce_min"),
1945            Some(&GpuIntrinsic::GridReduceMin)
1946        );
1947        assert_eq!(
1948            registry.lookup("grid_reduce_max"),
1949            Some(&GpuIntrinsic::GridReduceMax)
1950        );
1951
1952        // Test CUDA output
1953        assert_eq!(
1954            GpuIntrinsic::GridReduceSum.to_cuda_string(),
1955            "__grid_reduce_sum"
1956        );
1957        assert_eq!(
1958            GpuIntrinsic::GridReduceMin.to_cuda_string(),
1959            "__grid_reduce_min"
1960        );
1961        assert_eq!(
1962            GpuIntrinsic::GridReduceMax.to_cuda_string(),
1963            "__grid_reduce_max"
1964        );
1965
1966        // Test category
1967        assert_eq!(GpuIntrinsic::GridReduceSum.category(), "reduction");
1968    }
1969
1970    #[test]
1971    fn test_reduce_and_broadcast_intrinsic() {
1972        let registry = IntrinsicRegistry::new();
1973
1974        // Test reduce-and-broadcast
1975        assert_eq!(
1976            registry.lookup("reduce_and_broadcast"),
1977            Some(&GpuIntrinsic::ReduceAndBroadcast)
1978        );
1979        // Alternative name
1980        assert_eq!(
1981            registry.lookup("reduce_broadcast"),
1982            Some(&GpuIntrinsic::ReduceAndBroadcast)
1983        );
1984
1985        // Test CUDA output
1986        assert_eq!(
1987            GpuIntrinsic::ReduceAndBroadcast.to_cuda_string(),
1988            "__reduce_and_broadcast"
1989        );
1990
1991        // Test category
1992        assert_eq!(GpuIntrinsic::ReduceAndBroadcast.category(), "reduction");
1993    }
1994}