1use std::collections::HashMap;
7
/// Device-side intrinsics that Rust calls can be lowered to in generated
/// CUDA code; see [`GpuIntrinsic::to_cuda_string`] for the CUDA spelling.
#[derive(Debug, Clone, PartialEq)]
pub enum GpuIntrinsic {
    // Synchronization barriers and memory fences.
    SyncThreads,
    ThreadFence,
    ThreadFenceBlock,
    ThreadFenceSystem,

    // Atomic read-modify-write operations (atomicAdd, atomicCAS, ...).
    AtomicAdd,
    AtomicSub,
    AtomicMin,
    AtomicMax,
    AtomicExch,
    AtomicCas,

    // Math functions. These lower to the single-precision CUDA variants
    // (sqrtf, fabsf, fminf, ...) except `Abs`, which lowers to plain `abs`.
    Sqrt,
    Rsqrt,
    Abs,
    Fabs,
    Floor,
    Ceil,
    Round,
    Sin,
    Cos,
    Tan,
    Exp,
    Log,
    Pow,
    Fma,
    Min,
    Max,

    // Warp-level shuffle and vote primitives (the `*_sync` forms).
    WarpShfl,
    WarpShflUp,
    WarpShflDown,
    WarpShflXor,
    WarpActiveMask,
    WarpBallot,
    WarpAll,
    WarpAny,

    // Thread/block/grid coordinate accessors (threadIdx.x, gridDim.z, ...).
    ThreadIdxX,
    ThreadIdxY,
    ThreadIdxZ,
    BlockIdxX,
    BlockIdxY,
    BlockIdxZ,
    BlockDimX,
    BlockDimY,
    BlockDimZ,
    GridDimX,
    GridDimY,
    GridDimZ,
}
68
69impl GpuIntrinsic {
70 pub fn to_cuda_string(&self) -> &'static str {
72 match self {
73 GpuIntrinsic::SyncThreads => "__syncthreads()",
74 GpuIntrinsic::ThreadFence => "__threadfence()",
75 GpuIntrinsic::ThreadFenceBlock => "__threadfence_block()",
76 GpuIntrinsic::ThreadFenceSystem => "__threadfence_system()",
77 GpuIntrinsic::AtomicAdd => "atomicAdd",
78 GpuIntrinsic::AtomicSub => "atomicSub",
79 GpuIntrinsic::AtomicMin => "atomicMin",
80 GpuIntrinsic::AtomicMax => "atomicMax",
81 GpuIntrinsic::AtomicExch => "atomicExch",
82 GpuIntrinsic::AtomicCas => "atomicCAS",
83 GpuIntrinsic::Sqrt => "sqrtf",
84 GpuIntrinsic::Rsqrt => "rsqrtf",
85 GpuIntrinsic::Abs => "abs",
86 GpuIntrinsic::Fabs => "fabsf",
87 GpuIntrinsic::Floor => "floorf",
88 GpuIntrinsic::Ceil => "ceilf",
89 GpuIntrinsic::Round => "roundf",
90 GpuIntrinsic::Sin => "sinf",
91 GpuIntrinsic::Cos => "cosf",
92 GpuIntrinsic::Tan => "tanf",
93 GpuIntrinsic::Exp => "expf",
94 GpuIntrinsic::Log => "logf",
95 GpuIntrinsic::Pow => "powf",
96 GpuIntrinsic::Fma => "fmaf",
97 GpuIntrinsic::Min => "fminf",
98 GpuIntrinsic::Max => "fmaxf",
99 GpuIntrinsic::WarpShfl => "__shfl_sync",
100 GpuIntrinsic::WarpShflUp => "__shfl_up_sync",
101 GpuIntrinsic::WarpShflDown => "__shfl_down_sync",
102 GpuIntrinsic::WarpShflXor => "__shfl_xor_sync",
103 GpuIntrinsic::WarpActiveMask => "__activemask()",
104 GpuIntrinsic::WarpBallot => "__ballot_sync",
105 GpuIntrinsic::WarpAll => "__all_sync",
106 GpuIntrinsic::WarpAny => "__any_sync",
107 GpuIntrinsic::ThreadIdxX => "threadIdx.x",
108 GpuIntrinsic::ThreadIdxY => "threadIdx.y",
109 GpuIntrinsic::ThreadIdxZ => "threadIdx.z",
110 GpuIntrinsic::BlockIdxX => "blockIdx.x",
111 GpuIntrinsic::BlockIdxY => "blockIdx.y",
112 GpuIntrinsic::BlockIdxZ => "blockIdx.z",
113 GpuIntrinsic::BlockDimX => "blockDim.x",
114 GpuIntrinsic::BlockDimY => "blockDim.y",
115 GpuIntrinsic::BlockDimZ => "blockDim.z",
116 GpuIntrinsic::GridDimX => "gridDim.x",
117 GpuIntrinsic::GridDimY => "gridDim.y",
118 GpuIntrinsic::GridDimZ => "gridDim.z",
119 }
120 }
121}
122
/// Maps Rust-side function/method names to the [`GpuIntrinsic`] they lower to.
#[derive(Debug)]
pub struct IntrinsicRegistry {
    // Keyed by the Rust-side name (e.g. "sync_threads", "sqrt").
    mappings: HashMap<String, GpuIntrinsic>,
}
128
impl Default for IntrinsicRegistry {
    /// Delegates to [`IntrinsicRegistry::new`], which pre-populates the
    /// built-in name mappings.
    fn default() -> Self {
        Self::new()
    }
}
134
135impl IntrinsicRegistry {
136 pub fn new() -> Self {
138 let mut mappings = HashMap::new();
139
140 mappings.insert("sync_threads".to_string(), GpuIntrinsic::SyncThreads);
142 mappings.insert("thread_fence".to_string(), GpuIntrinsic::ThreadFence);
143 mappings.insert(
144 "thread_fence_block".to_string(),
145 GpuIntrinsic::ThreadFenceBlock,
146 );
147 mappings.insert(
148 "thread_fence_system".to_string(),
149 GpuIntrinsic::ThreadFenceSystem,
150 );
151
152 mappings.insert("atomic_add".to_string(), GpuIntrinsic::AtomicAdd);
154 mappings.insert("atomic_sub".to_string(), GpuIntrinsic::AtomicSub);
155 mappings.insert("atomic_min".to_string(), GpuIntrinsic::AtomicMin);
156 mappings.insert("atomic_max".to_string(), GpuIntrinsic::AtomicMax);
157 mappings.insert("atomic_exchange".to_string(), GpuIntrinsic::AtomicExch);
158 mappings.insert("atomic_cas".to_string(), GpuIntrinsic::AtomicCas);
159
160 mappings.insert("sqrt".to_string(), GpuIntrinsic::Sqrt);
162 mappings.insert("abs".to_string(), GpuIntrinsic::Fabs);
163 mappings.insert("floor".to_string(), GpuIntrinsic::Floor);
164 mappings.insert("ceil".to_string(), GpuIntrinsic::Ceil);
165 mappings.insert("round".to_string(), GpuIntrinsic::Round);
166 mappings.insert("sin".to_string(), GpuIntrinsic::Sin);
167 mappings.insert("cos".to_string(), GpuIntrinsic::Cos);
168 mappings.insert("tan".to_string(), GpuIntrinsic::Tan);
169 mappings.insert("exp".to_string(), GpuIntrinsic::Exp);
170 mappings.insert("ln".to_string(), GpuIntrinsic::Log);
171 mappings.insert("log".to_string(), GpuIntrinsic::Log);
172 mappings.insert("powf".to_string(), GpuIntrinsic::Pow);
173 mappings.insert("powi".to_string(), GpuIntrinsic::Pow);
174 mappings.insert("mul_add".to_string(), GpuIntrinsic::Fma);
175 mappings.insert("min".to_string(), GpuIntrinsic::Min);
176 mappings.insert("max".to_string(), GpuIntrinsic::Max);
177
178 mappings.insert("thread_idx_x".to_string(), GpuIntrinsic::ThreadIdxX);
180 mappings.insert("thread_idx_y".to_string(), GpuIntrinsic::ThreadIdxY);
181 mappings.insert("thread_idx_z".to_string(), GpuIntrinsic::ThreadIdxZ);
182 mappings.insert("block_idx_x".to_string(), GpuIntrinsic::BlockIdxX);
183 mappings.insert("block_idx_y".to_string(), GpuIntrinsic::BlockIdxY);
184 mappings.insert("block_idx_z".to_string(), GpuIntrinsic::BlockIdxZ);
185 mappings.insert("block_dim_x".to_string(), GpuIntrinsic::BlockDimX);
186 mappings.insert("block_dim_y".to_string(), GpuIntrinsic::BlockDimY);
187 mappings.insert("block_dim_z".to_string(), GpuIntrinsic::BlockDimZ);
188 mappings.insert("grid_dim_x".to_string(), GpuIntrinsic::GridDimX);
189 mappings.insert("grid_dim_y".to_string(), GpuIntrinsic::GridDimY);
190 mappings.insert("grid_dim_z".to_string(), GpuIntrinsic::GridDimZ);
191
192 Self { mappings }
193 }
194
195 pub fn lookup(&self, name: &str) -> Option<&GpuIntrinsic> {
197 self.mappings.get(name)
198 }
199
200 pub fn register(&mut self, rust_name: &str, intrinsic: GpuIntrinsic) {
202 self.mappings.insert(rust_name.to_string(), intrinsic);
203 }
204
205 pub fn is_intrinsic(&self, name: &str) -> bool {
207 self.mappings.contains_key(name)
208 }
209}
210
/// Neighbor-access intrinsics recognized on stencil buffers, parsed from
/// method names by [`StencilIntrinsic::from_method_name`].
#[derive(Debug, Clone, PartialEq)]
pub enum StencilIntrinsic {
    // The current cell (`idx`); 2D offset (0, 0).
    Index,
    // Previous row: 2D offset (-1, 0).
    North,
    // Next row: 2D offset (1, 0).
    South,
    // Next column: 2D offset (0, 1).
    East,
    // Previous column: 2D offset (0, -1).
    West,
    // `at(...)`: presumably an arbitrary caller-supplied offset — no static
    // 2D offset; 2D lowering currently falls back to the current cell.
    At,
    // `up()`: no 2D offset — presumably the third axis for 3D stencils
    // (TODO confirm); 2D lowering falls back to the current cell.
    Up,
    // `down()`: see `Up`.
    Down,
}
234
235impl StencilIntrinsic {
236 pub fn from_method_name(name: &str) -> Option<Self> {
238 match name {
239 "idx" => Some(StencilIntrinsic::Index),
240 "north" => Some(StencilIntrinsic::North),
241 "south" => Some(StencilIntrinsic::South),
242 "east" => Some(StencilIntrinsic::East),
243 "west" => Some(StencilIntrinsic::West),
244 "at" => Some(StencilIntrinsic::At),
245 "up" => Some(StencilIntrinsic::Up),
246 "down" => Some(StencilIntrinsic::Down),
247 _ => None,
248 }
249 }
250}
251
/// Intrinsics available inside persistent ring kernels; each lowers to a
/// CUDA expression/statement via [`RingKernelIntrinsic::to_cuda`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RingKernelIntrinsic {
    // Lifecycle flags, read/written through the `control` block.
    IsActive,
    ShouldTerminate,
    MarkTerminated,
    GetMessagesProcessed,

    // Ring-buffer queue state (head/tail counters in `control`) and the
    // response-enqueue operation (writes into `output_buffer`).
    InputQueueSize,
    OutputQueueSize,
    InputQueueEmpty,
    OutputQueueEmpty,
    EnqueueResponse,

    // Hybrid logical clock, kept as `hlc_physical`/`hlc_logical` parts.
    HlcTick,
    HlcUpdate,
    HlcNow,

    // Kernel-to-kernel messaging via `k2k_routes` / `k2k_inbox` helpers.
    K2kSend,
    K2kTryRecv,
    K2kHasMessage,
    K2kPeek,
    K2kPendingCount,

    // `__nanosleep`; defaults to 1000 ns when no argument is supplied.
    Nanosleep,
}
304
305impl RingKernelIntrinsic {
306 pub fn to_cuda(&self, args: &[String]) -> String {
308 match self {
309 Self::IsActive => "atomicAdd(&control->is_active, 0) != 0".to_string(),
310 Self::ShouldTerminate => "atomicAdd(&control->should_terminate, 0) != 0".to_string(),
311 Self::MarkTerminated => "atomicExch(&control->has_terminated, 1)".to_string(),
312 Self::GetMessagesProcessed => "atomicAdd(&control->messages_processed, 0)".to_string(),
313
314 Self::InputQueueSize => {
315 "(atomicAdd(&control->input_head, 0) - atomicAdd(&control->input_tail, 0))"
316 .to_string()
317 }
318 Self::OutputQueueSize => {
319 "(atomicAdd(&control->output_head, 0) - atomicAdd(&control->output_tail, 0))"
320 .to_string()
321 }
322 Self::InputQueueEmpty => {
323 "(atomicAdd(&control->input_head, 0) == atomicAdd(&control->input_tail, 0))"
324 .to_string()
325 }
326 Self::OutputQueueEmpty => {
327 "(atomicAdd(&control->output_head, 0) == atomicAdd(&control->output_tail, 0))"
328 .to_string()
329 }
330 Self::EnqueueResponse => {
331 if !args.is_empty() {
332 format!(
333 "{{ unsigned long long _out_idx = atomicAdd(&control->output_head, 1) & control->output_mask; \
334 memcpy(&output_buffer[_out_idx * RESP_SIZE], {}, RESP_SIZE); }}",
335 args[0]
336 )
337 } else {
338 "/* enqueue_response requires response pointer */".to_string()
339 }
340 }
341
342 Self::HlcTick => "hlc_logical++".to_string(),
343 Self::HlcUpdate => {
344 if !args.is_empty() {
345 format!(
346 "{{ if ({} > hlc_physical) {{ hlc_physical = {}; hlc_logical = 0; }} else {{ hlc_logical++; }} }}",
347 args[0], args[0]
348 )
349 } else {
350 "hlc_logical++".to_string()
351 }
352 }
353 Self::HlcNow => "(hlc_physical << 32) | (hlc_logical & 0xFFFFFFFF)".to_string(),
354
355 Self::K2kSend => {
356 if args.len() >= 2 {
357 format!(
359 "k2k_send(k2k_routes, {}, {}, sizeof(*{}))",
360 args[0], args[1], args[1]
361 )
362 } else {
363 "/* k2k_send requires target_id and msg_ptr */".to_string()
364 }
365 }
366 Self::K2kTryRecv => "k2k_try_recv(k2k_inbox)".to_string(),
367 Self::K2kHasMessage => "k2k_has_message(k2k_inbox)".to_string(),
368 Self::K2kPeek => "k2k_peek(k2k_inbox)".to_string(),
369 Self::K2kPendingCount => "k2k_pending_count(k2k_inbox)".to_string(),
370
371 Self::Nanosleep => {
372 if !args.is_empty() {
373 format!("__nanosleep({})", args[0])
374 } else {
375 "__nanosleep(1000)".to_string()
376 }
377 }
378 }
379 }
380
381 pub fn from_name(name: &str) -> Option<Self> {
383 match name {
384 "is_active" | "is_kernel_active" => Some(Self::IsActive),
385 "should_terminate" => Some(Self::ShouldTerminate),
386 "mark_terminated" => Some(Self::MarkTerminated),
387 "messages_processed" | "get_messages_processed" => Some(Self::GetMessagesProcessed),
388
389 "input_queue_size" => Some(Self::InputQueueSize),
390 "output_queue_size" => Some(Self::OutputQueueSize),
391 "input_queue_empty" => Some(Self::InputQueueEmpty),
392 "output_queue_empty" => Some(Self::OutputQueueEmpty),
393 "enqueue_response" | "enqueue" => Some(Self::EnqueueResponse),
394
395 "hlc_tick" => Some(Self::HlcTick),
396 "hlc_update" => Some(Self::HlcUpdate),
397 "hlc_now" => Some(Self::HlcNow),
398
399 "k2k_send" => Some(Self::K2kSend),
400 "k2k_try_recv" => Some(Self::K2kTryRecv),
401 "k2k_has_message" => Some(Self::K2kHasMessage),
402 "k2k_peek" => Some(Self::K2kPeek),
403 "k2k_pending_count" | "k2k_pending" => Some(Self::K2kPendingCount),
404
405 "nanosleep" => Some(Self::Nanosleep),
406
407 _ => None,
408 }
409 }
410
411 pub fn requires_control_block(&self) -> bool {
413 matches!(
414 self,
415 Self::IsActive
416 | Self::ShouldTerminate
417 | Self::MarkTerminated
418 | Self::GetMessagesProcessed
419 | Self::InputQueueSize
420 | Self::OutputQueueSize
421 | Self::InputQueueEmpty
422 | Self::OutputQueueEmpty
423 | Self::EnqueueResponse
424 )
425 }
426
427 pub fn requires_hlc(&self) -> bool {
429 matches!(self, Self::HlcTick | Self::HlcUpdate | Self::HlcNow)
430 }
431
432 pub fn requires_k2k(&self) -> bool {
434 matches!(
435 self,
436 Self::K2kSend
437 | Self::K2kTryRecv
438 | Self::K2kHasMessage
439 | Self::K2kPeek
440 | Self::K2kPendingCount
441 )
442 }
443}
444
445impl StencilIntrinsic {
446 pub fn get_offset_2d(&self) -> Option<(i32, i32)> {
451 match self {
452 StencilIntrinsic::Index => Some((0, 0)),
453 StencilIntrinsic::North => Some((-1, 0)),
454 StencilIntrinsic::South => Some((1, 0)),
455 StencilIntrinsic::East => Some((0, 1)),
456 StencilIntrinsic::West => Some((0, -1)),
457 StencilIntrinsic::At => None, StencilIntrinsic::Up | StencilIntrinsic::Down => None, }
460 }
461
462 pub fn to_cuda_index_2d(&self, buffer_name: &str, buffer_width: &str, idx_var: &str) -> String {
469 match self {
470 StencilIntrinsic::Index => format!("{}[{}]", buffer_name, idx_var),
471 StencilIntrinsic::North => {
472 format!("{}[{} - {}]", buffer_name, idx_var, buffer_width)
473 }
474 StencilIntrinsic::South => {
475 format!("{}[{} + {}]", buffer_name, idx_var, buffer_width)
476 }
477 StencilIntrinsic::East => format!("{}[{} + 1]", buffer_name, idx_var),
478 StencilIntrinsic::West => format!("{}[{} - 1]", buffer_name, idx_var),
479 StencilIntrinsic::At => {
480 format!("{}[{}]", buffer_name, idx_var)
482 }
483 _ => format!("{}[{}]", buffer_name, idx_var),
484 }
485 }
486}
487
#[cfg(test)]
mod tests {
    use super::*;

    // Built-in registry resolves known Rust names; unknown names yield None.
    #[test]
    fn test_intrinsic_lookup() {
        let registry = IntrinsicRegistry::new();

        assert_eq!(
            registry.lookup("sync_threads"),
            Some(&GpuIntrinsic::SyncThreads)
        );
        assert_eq!(registry.lookup("sqrt"), Some(&GpuIntrinsic::Sqrt));
        assert_eq!(registry.lookup("unknown_func"), None);
    }

    // Spot-checks the CUDA spellings: barriers include parentheses,
    // functions are bare names.
    #[test]
    fn test_intrinsic_cuda_output() {
        assert_eq!(
            GpuIntrinsic::SyncThreads.to_cuda_string(),
            "__syncthreads()"
        );
        assert_eq!(GpuIntrinsic::AtomicAdd.to_cuda_string(), "atomicAdd");
        assert_eq!(GpuIntrinsic::Sqrt.to_cuda_string(), "sqrtf");
    }

    // Method-name parsing for stencil accessors, including the unknown case.
    #[test]
    fn test_stencil_intrinsic_parsing() {
        assert_eq!(
            StencilIntrinsic::from_method_name("north"),
            Some(StencilIntrinsic::North)
        );
        assert_eq!(
            StencilIntrinsic::from_method_name("idx"),
            Some(StencilIntrinsic::Index)
        );
        assert_eq!(StencilIntrinsic::from_method_name("unknown"), None);
    }

    // North subtracts the row stride; East is a +1 column move regardless
    // of the width argument.
    #[test]
    fn test_stencil_cuda_index() {
        let north = StencilIntrinsic::North;
        assert_eq!(
            north.to_cuda_index_2d("p", "buffer_width", "idx"),
            "p[idx - buffer_width]"
        );

        let east = StencilIntrinsic::East;
        assert_eq!(east.to_cuda_index_2d("p", "18", "idx"), "p[idx + 1]");
    }

    // Static 2D offsets for the fixed-offset accessors.
    #[test]
    fn test_stencil_offset() {
        assert_eq!(StencilIntrinsic::North.get_offset_2d(), Some((-1, 0)));
        assert_eq!(StencilIntrinsic::East.get_offset_2d(), Some((0, 1)));
        assert_eq!(StencilIntrinsic::Index.get_offset_2d(), Some((0, 0)));
    }

    // Ring-kernel name resolution, including an unknown name.
    #[test]
    fn test_ring_kernel_intrinsic_lookup() {
        assert_eq!(
            RingKernelIntrinsic::from_name("is_active"),
            Some(RingKernelIntrinsic::IsActive)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("should_terminate"),
            Some(RingKernelIntrinsic::ShouldTerminate)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("hlc_tick"),
            Some(RingKernelIntrinsic::HlcTick)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("enqueue_response"),
            Some(RingKernelIntrinsic::EnqueueResponse)
        );
        assert_eq!(RingKernelIntrinsic::from_name("unknown"), None);
    }

    // Lowered CUDA mentions the control-block fields / HLC state it reads.
    #[test]
    fn test_ring_kernel_intrinsic_cuda_output() {
        assert!(RingKernelIntrinsic::IsActive
            .to_cuda(&[])
            .contains("is_active"));
        assert!(RingKernelIntrinsic::ShouldTerminate
            .to_cuda(&[])
            .contains("should_terminate"));
        assert!(RingKernelIntrinsic::HlcTick
            .to_cuda(&[])
            .contains("hlc_logical"));
        assert!(RingKernelIntrinsic::InputQueueEmpty
            .to_cuda(&[])
            .contains("input_head"));
    }

    // enqueue_response with a response pointer claims a slot and memcpys.
    #[test]
    fn test_ring_kernel_queue_intrinsics() {
        let enqueue = RingKernelIntrinsic::EnqueueResponse;
        let cuda = enqueue.to_cuda(&["&response".to_string()]);
        assert!(cuda.contains("output_head"));
        assert!(cuda.contains("memcpy"));
    }

    // K2K send embeds routes/target; the receive-side helpers lower to
    // fixed inbox calls.
    #[test]
    fn test_k2k_intrinsics() {
        let send = RingKernelIntrinsic::K2kSend;
        let cuda = send.to_cuda(&["target_id".to_string(), "&msg".to_string()]);
        assert!(cuda.contains("k2k_send"));
        assert!(cuda.contains("k2k_routes"));
        assert!(cuda.contains("target_id"));

        assert_eq!(
            RingKernelIntrinsic::K2kTryRecv.to_cuda(&[]),
            "k2k_try_recv(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kHasMessage.to_cuda(&[]),
            "k2k_has_message(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kPeek.to_cuda(&[]),
            "k2k_peek(k2k_inbox)"
        );

        assert_eq!(
            RingKernelIntrinsic::K2kPendingCount.to_cuda(&[]),
            "k2k_pending_count(k2k_inbox)"
        );
    }

    // K2K name resolution.
    #[test]
    fn test_k2k_intrinsic_lookup() {
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_send"),
            Some(RingKernelIntrinsic::K2kSend)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_try_recv"),
            Some(RingKernelIntrinsic::K2kTryRecv)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_has_message"),
            Some(RingKernelIntrinsic::K2kHasMessage)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_peek"),
            Some(RingKernelIntrinsic::K2kPeek)
        );
        assert_eq!(
            RingKernelIntrinsic::from_name("k2k_pending_count"),
            Some(RingKernelIntrinsic::K2kPendingCount)
        );
    }

    // The requires_* predicates partition the intrinsics by the kernel
    // state they need (K2K structures, HLC state, control block).
    #[test]
    fn test_intrinsic_requirements() {
        assert!(RingKernelIntrinsic::K2kSend.requires_k2k());
        assert!(RingKernelIntrinsic::K2kTryRecv.requires_k2k());
        assert!(RingKernelIntrinsic::K2kPeek.requires_k2k());
        assert!(!RingKernelIntrinsic::HlcTick.requires_k2k());

        assert!(RingKernelIntrinsic::HlcTick.requires_hlc());
        assert!(RingKernelIntrinsic::HlcNow.requires_hlc());
        assert!(!RingKernelIntrinsic::K2kSend.requires_hlc());

        assert!(RingKernelIntrinsic::IsActive.requires_control_block());
        assert!(RingKernelIntrinsic::EnqueueResponse.requires_control_block());
        assert!(!RingKernelIntrinsic::HlcTick.requires_control_block());
    }
}