//! Private module for proc macro integration.
//!
//! This module contains types that are used by the `ringkernel-derive` proc macros
//! but are not part of the public API. They are exposed for macro-generated code only.
use crate::types::KernelMode;
/// Kernel registration information collected by `#[ring_kernel]` macro.
///
/// This struct is used with the `inventory` crate to collect kernel registrations
/// at compile time. It should not be used directly by user code; instances are
/// submitted by macro-generated code and enumerated via `registered_kernels()`.
#[derive(Debug, Clone)]
pub struct KernelRegistration {
    /// Unique kernel identifier.
    pub id: &'static str,
    /// Execution mode (persistent or event-driven).
    pub mode: KernelMode,
    /// Grid size (number of blocks).
    pub grid_size: u32,
    /// Block size (threads per block).
    pub block_size: u32,
    /// Target kernels this kernel publishes to (for K2K routing).
    /// NOTE(review): presumably entries are kernel `id`s of other registrations — confirm.
    pub publishes_to: &'static [&'static str],
}
// Register the type with inventory for static collection
inventory::collect!(KernelRegistration);
/// Get all registered kernels.
///
/// Yields every `KernelRegistration` collected through `inventory`, i.e. one
/// entry per `#[ring_kernel]` expansion linked into the final binary.
pub fn registered_kernels() -> impl Iterator<Item = &'static KernelRegistration> {
    inventory::iter::<KernelRegistration>()
}
/// Stencil kernel registration information collected by `#[stencil_kernel]` macro.
///
/// This struct stores the generated CUDA source code and stencil configuration
/// for runtime compilation and execution. Instances are submitted by
/// macro-generated code and looked up via `find_stencil_kernel()`.
#[derive(Debug, Clone)]
pub struct StencilKernelRegistration {
    /// Unique kernel identifier.
    pub id: &'static str,
    /// Grid dimensionality ("1d", "2d", or "3d").
    pub grid: &'static str,
    /// Tile width (block X dimension).
    pub tile_width: u32,
    /// Tile height (block Y dimension).
    pub tile_height: u32,
    /// Halo/ghost cell width (stencil radius).
    pub halo: u32,
    /// Generated CUDA C source code, compiled at runtime (e.g. via NVRTC —
    /// NOTE(review): confirm the actual runtime-compilation path).
    pub cuda_source: &'static str,
}
// Register stencil kernels with inventory
inventory::collect!(StencilKernelRegistration);
/// Get all registered stencil kernels.
///
/// Yields every `StencilKernelRegistration` collected through `inventory`.
pub fn registered_stencil_kernels() -> impl Iterator<Item = &'static StencilKernelRegistration> {
    inventory::iter::<StencilKernelRegistration>()
}
/// Find a stencil kernel registration by ID.
///
/// Returns `None` when no registered stencil kernel has a matching `id`.
pub fn find_stencil_kernel(id: &str) -> Option<&'static StencilKernelRegistration> {
    for registration in registered_stencil_kernels() {
        if registration.id == id {
            return Some(registration);
        }
    }
    None
}
/// GPU kernel registration for multi-backend kernels collected by `#[gpu_kernel]` macro.
///
/// This struct stores backend-independent kernel metadata and capability requirements.
/// The actual backend-specific source code is stored in constants generated by the macro.
/// Instances are submitted by macro-generated code and looked up via `find_gpu_kernel()`.
#[derive(Debug, Clone)]
pub struct GpuKernelRegistration {
    /// Unique kernel identifier.
    pub id: &'static str,
    /// Block/workgroup size.
    pub block_size: u32,
    /// Required GPU capabilities (e.g., "f64", "atomic64", "subgroups").
    pub capabilities: &'static [&'static str],
    /// Compatible backends that support all required capabilities.
    pub backends: &'static [&'static str],
    /// Fallback order for runtime backend selection; consumed by `select_backend()`.
    pub fallback_order: &'static [&'static str],
}
// Register GPU kernels with inventory
inventory::collect!(GpuKernelRegistration);
/// Get all registered GPU kernels.
///
/// Yields every `GpuKernelRegistration` collected through `inventory`.
pub fn registered_gpu_kernels() -> impl Iterator<Item = &'static GpuKernelRegistration> {
    inventory::iter::<GpuKernelRegistration>()
}
/// Find a GPU kernel registration by ID.
///
/// Returns `None` when no registered GPU kernel has a matching `id`.
pub fn find_gpu_kernel(id: &str) -> Option<&'static GpuKernelRegistration> {
    for registration in registered_gpu_kernels() {
        if registration.id == id {
            return Some(registration);
        }
    }
    None
}
/// Check if a backend supports a specific capability.
///
/// Capability names are free-form strings (e.g. "f64", "atomic64"); any
/// capability not explicitly denied for a known backend is assumed supported.
/// Unknown backends support nothing.
pub fn backend_supports_capability(backend: &str, capability: &str) -> bool {
    match backend {
        // CUDA and CPU (emulation) support everything.
        "cuda" | "cpu" => true,
        // Metal: no f64, cooperative groups, or dynamic parallelism.
        "metal" => !matches!(
            capability,
            "f64" | "cooperative_groups" | "dynamic_parallelism"
        ),
        // WebGPU: no 64-bit types (atomic64 is emulated only), cooperative
        // groups, or dynamic parallelism.
        "wgpu" => !matches!(
            capability,
            "f64" | "i64" | "atomic64" | "cooperative_groups" | "dynamic_parallelism"
        ),
        // Unknown backend.
        _ => false,
    }
}
/// Select the best backend from fallback order that supports all capabilities.
///
/// Walks `fallback_order` in priority order and returns the first backend that
/// is present in `available_backends` and supports every capability in
/// `required_capabilities` (per [`backend_supports_capability`]).
///
/// Returns the canonical `'static` name of the selected backend, or `None`
/// when no backend qualifies.
pub fn select_backend(
    fallback_order: &[&str],
    required_capabilities: &[&str],
    available_backends: &[&str],
) -> Option<&'static str> {
    for backend in fallback_order {
        // Skip backends that are not available at runtime.
        if !available_backends.contains(backend) {
            continue;
        }
        // Skip backends missing any required capability.
        let supports_all = required_capabilities
            .iter()
            .all(|cap| backend_supports_capability(backend, cap));
        if !supports_all {
            continue;
        }
        // Map the borrowed name to its canonical 'static string. An unknown
        // name can only reach this point when `required_capabilities` is empty
        // (unknown backends support no capabilities); previously such a name
        // aborted the whole search by returning `None` — now it is skipped so
        // later entries in the fallback order are still considered.
        match *backend {
            "cuda" => return Some("cuda"),
            "metal" => return Some("metal"),
            "wgpu" => return Some("wgpu"),
            "cpu" => return Some("cpu"),
            _ => continue,
        }
    }
    None
}