amdgpu_device_libs/intrinsics.rs
//! amdgpu compiler intrinsics.
//!
//! Intrinsics defined for the amdgpu LLVM backend.
//! Availability of intrinsics varies depending on the target architecture.
5
6unsafe extern "C" {
7 /// Returns the x coordinate of the workitem index within the workgroup.
8 #[link_name = "llvm.amdgcn.workitem.id.x"]
9 pub safe fn workitem_id_x() -> u32;
10 /// Returns the y coordinate of the workitem index within the workgroup.
11 #[link_name = "llvm.amdgcn.workitem.id.y"]
12 pub safe fn workitem_id_y() -> u32;
13 /// Returns the z coordinate of the workitem index within the workgroup.
14 #[link_name = "llvm.amdgcn.workitem.id.z"]
15 pub safe fn workitem_id_z() -> u32;
16
17 /// Returns the x coordinate of the workgroup index within the dispatch.
18 #[link_name = "llvm.amdgcn.workgroup.id.x"]
19 pub safe fn workgroup_id_x() -> u32;
20 /// Returns the y coordinate of the workgroup index within the dispatch.
21 #[link_name = "llvm.amdgcn.workgroup.id.y"]
22 pub safe fn workgroup_id_y() -> u32;
23 /// Returns the z coordinate of the workgroup index within the dispatch.
24 #[link_name = "llvm.amdgcn.workgroup.id.z"]
25 pub safe fn workgroup_id_z() -> u32;
26
27 /// Returns the number of LDS bytes statically allocated for this program.
28 #[link_name = "llvm.amdgcn.groupstaticsize"]
29 pub safe fn groupstaticsize() -> u32;
30 /// Returns the id of the dispatch that is currently executed.
31 #[link_name = "llvm.amdgcn.dispatch.id"]
32 pub safe fn dispatch_id() -> u64;
33
34 /// Returns the number of threads in a wavefront.
35 ///
36 /// Is always a power of 2.
37 #[link_name = "llvm.amdgcn.wavefrontsize"]
38 pub safe fn wavefrontsize() -> u32;
39
40 /// Synchronize all wavefronts in a workgroup.
41 ///
42 /// Each wavefronts in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier.
43 #[link_name = "llvm.amdgcn.s.barrier"]
44 pub safe fn s_barrier();
45
46 /// Sleeps for approximately `count * 64` cycles.
47 ///
48 /// `count` must be a constant.
49 /// Only the lower 7 bits of `count` are used.
50 #[link_name = "llvm.amdgcn.s.sleep"]
51 pub safe fn s_sleep(count: u32);
52
53 /// Stop execution of the kernel.
54 ///
55 /// This usually signals an error state.
56 #[link_name = "llvm.amdgcn.s.sethalt"]
57 pub safe fn s_sethalt(value: u32) -> !;
58
59 /// Masked bit count, low 32 lanes.
60 ///
61 /// Computes the number of bits set in `value`, masked with a thread mask
62 /// which contains 1 for all active threads less than the current thread within a wavefront.
63 /// `init` is added to the result.
64 #[link_name = "llvm.amdgcn.mbcnt.lo"]
65 pub safe fn mbcnt_lo(value: u32, init: u32) -> u32;
66 /// Masked bit count, high 32 lanes.
67 ///
68 /// Computes the number of bits set in `value`, masked with a thread mask
69 /// which contains 1 for all active threads less than the current thread within a wavefront.
70 /// `init` is added to the result.
71 #[link_name = "llvm.amdgcn.mbcnt.hi"]
72 pub safe fn mbcnt_hi(value: u32, init: u32) -> u32;
73
74 /// Returns a bitfield (`i32` or `i64`) containing the result of its i1 argument
75 /// in all active lanes, and zero in all inactive lanes.
76 #[link_name = "llvm.amdgcn.ballot"]
77 pub safe fn ballot(b: bool) -> u64;
78
79 /// Indexes into the `value` with the current lane id and returns for each lane
80 /// if the corresponding bit is set.
81 ///
82 /// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
83 /// This means `inverse_ballot(ballot(b)) == b`.
84 /// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`.
85 #[link_name = "llvm.amdgcn.inverse.ballot"]
86 pub safe fn inverse_ballot(value: u64) -> bool;
87
88 // The following intrinsics can have multiple sizes
89
90 /// Get `value` from the first active lane in the wavefront.
91 #[link_name = "llvm.amdgcn.readfirstlane.i32"]
92 pub safe fn readfirstlane_u32(value: u32) -> u32;
93 /// Get `value` from the first active lane in the wavefront.
94 #[link_name = "llvm.amdgcn.readfirstlane.i64"]
95 pub safe fn readfirstlane_u64(value: u64) -> u64;
96 /// Get `value` from the lane at index `lane` in the wavefront.
97 ///
98 /// The lane argument must be uniform across the currently active threads
99 /// of the current wavefront. Otherwise, the result is undefined.
100 #[link_name = "llvm.amdgcn.readlane.i32"]
101 pub fn readlane_u32(value: u32, lane: u32) -> u32;
102 /// Get `value` from the lane at index `lane` in the wavefront.
103 ///
104 /// The lane argument must be uniform across the currently active threads
105 /// of the current wavefront. Otherwise, the result is undefined.
106 #[link_name = "llvm.amdgcn.readlane.i64"]
107 pub fn readlane_u64(value: u64, lane: u64) -> u64;
108 /// Return `value` for the lane at index `lane` in the wavefront.
109 /// Return `default` for all other lanes.
110 ///
111 /// The value to write and lane select arguments must be uniform across the
112 /// currently active threads of the current wavefront. Otherwise, the result is
113 /// undefined.
114 ///
115 /// `value` is the value returned by `lane`.
116 /// `default` is the value returned by all lanes other than `lane`.
117 #[link_name = "llvm.amdgcn.writelane.i32"]
118 pub fn writelane_u32(value: u32, lane: u32, default: u32) -> u32;
119 /// Return `value` for the lane at index `lane` in the wavefront.
120 /// Return `default` for all other lanes.
121 ///
122 /// The value to write and lane select arguments must be uniform across the
123 /// currently active threads of the current wavefront. Otherwise, the result is
124 /// undefined.
125 ///
126 /// `value` is the value returned by `lane`.
127 /// `default` is the value returned by all lanes other than `lane`.
128 #[link_name = "llvm.amdgcn.writelane.i64"]
129 pub fn writelane_u64(value: u64, lane: u64, default: u64) -> u64;
130
131 /// Stop execution of the wavefront.
132 ///
133 /// This usually signals the end of a successful execution.
134 #[link_name = "llvm.amdgcn.endpgm"]
135 pub safe fn endpgm() -> !;
136
137 /// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
138 /// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control.
139 /// This operation is equivalent to a sequence of `v_mov_b32` operations.
140 ///
141 /// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
142 /// Should be equivalent to:
143 /// ```asm
144 /// v_mov_b32 <dest> <old>
145 /// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
146 /// ```
147 #[link_name = "llvm.amdgcn.update.dpp.i32"]
148 pub fn update_dpp(
149 old: u32,
150 src: u32,
151 dpp_ctrl: u32,
152 row_mask: u32,
153 bank_mask: u32,
154 bound_control: bool,
155 ) -> u32;
156
157 /// Measures time based on a fixed frequency.
158 ///
159 /// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds.
160 /// The clock is consistent across the chip, so can be used for measuring between different wavefronts.
161 #[link_name = "llvm.amdgcn.s.memrealtime"]
162 pub safe fn s_memrealtime() -> u64;
163
164 /// Scatter data across all lanes in a wavefront.
165 ///
166 /// Writes `value` to the lane `lane`.
167 ///
168 /// Reading from inactive lanes returns `0`.
169 /// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken.
170 #[link_name = "llvm.amdgcn.ds.permute"]
171 pub fn ds_permute(lane: u32, value: u32) -> u32;
172 /// Gather data across all lanes in a wavefront.
173 ///
174 /// Returns the `value` given to `ds_permute` by lane `lane`.
175 ///
176 /// Reading from inactive lanes returns `0`.
177 #[link_name = "llvm.amdgcn.ds.bpermute"]
178 pub fn ds_bpermute(lane: u32, value: u32) -> u32;
179 /// Permute a 64-bit value.
180 ///
181 /// `selector` selects between different patterns in which the 64-bit value represented by `src0` and `src1` are permuted.
182 #[link_name = "llvm.amdgcn.perm"]
183 pub fn perm(src0: u32, src1: u32, selector: u32) -> u32;
184
185 // gfx10
186 /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
187 ///
188 /// The third and fourth inputs must be uniform across the current wavefront.
189 /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
190 #[link_name = "llvm.amdgcn.permlane16.i32"]
191 pub fn permlane16_u32(
192 old: u32,
193 src0: u32,
194 src1: u32,
195 src2: u32,
196 fi: bool,
197 bound_control: bool,
198 ) -> u32;
199
200 // gfx10
201 /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
202 ///
203 /// The third and fourth inputs must be uniform across the current wavefront.
204 /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
205 #[link_name = "llvm.amdgcn.permlanex16.i32"]
206 pub fn permlanex16_u32(
207 old: u32,
208 src0: u32,
209 src1: u32,
210 src2: u32,
211 fi: bool,
212 bound_control: bool,
213 ) -> u32;
214
215 /// Get the index of the current wavefront in the workgroup.
216 #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
217 pub safe fn s_get_waveid_in_workgroup() -> u32;
218
219 // gfx10
220 /// Clamping atomic subtraction
221 ///
222 /// Subtract `val` from the value at `addr`, clamping at `0` if the value would become negative.
223 /// Returns the value at `addr` before the subtraction.
224 #[link_name = "llvm.amdgcn.global.atomic.csub"]
225 pub fn global_atomic_csub(addr: *mut u32, val: u32) -> u32;
226
227 // gfx11
228 /// Swap `value` between upper and lower 32 lanes in a wavefront.
229 ///
230 /// Does nothing for wave32.
231 #[link_name = "llvm.amdgcn.permlane64"]
232 pub fn permlane64_u32(value: u32) -> u32;
233
234 // gfx12
235 /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
236 ///
237 /// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
238 #[link_name = "llvm.amdgcn.permlane16.var"]
239 pub fn permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
240
241 // gfx12
242 /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
243 ///
244 /// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
245 #[link_name = "llvm.amdgcn.permlanex16.var"]
246 pub fn permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
247
248 // gfx12
249 /// Conditional atomic subtraction
250 ///
251 /// If the value at `addr` is greater or equal than `val`, subtracts `val` from the `value`.
252 /// If the value at `addr` is less than `val`, does nothing.
253 /// Returns the value at `addr` before the subtraction.
254 #[link_name = "llvm.amdgcn.global.atomic.cond.sub"]
255 pub fn global_atomic_cond_sub(addr: *mut u32, val: u32) -> u32;
256
257 /// Get the index of the current wavefront in the workgroup.
258 #[link_name = "llvm.amdgcn.wave.id"]
259 pub safe fn wave_id() -> u32;
260
261 // gfx950
262 /// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
263 ///
264 /// Swaps the values across lanes of first 2 operands.
265 /// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
266 /// Returns a pair for the swapped registers.
267 /// The first element of the return corresponds to the swapped element of the first argument.
268 #[allow(improper_ctypes)]
269 #[link_name = "llvm.amdgcn.permlane16.swap"]
270 pub fn permlane16_swap(
271 vdst_old: u32,
272 vsrc_src0: u32,
273 fi: bool,
274 bound_control: bool,
275 ) -> (u32, u32);
276
277 // gfx950
278 /// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
279 ///
280 /// Swaps the values across lanes of first 2 operands.
281 /// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
282 /// Returns a pair for the swapped registers.
283 /// The first element of the return corresponds to the swapped element of the first argument.
284 #[allow(improper_ctypes)]
285 #[link_name = "llvm.amdgcn.permlane32.swap"]
286 pub fn permlane32_swap(
287 vdst_old: u32,
288 vsrc_src0: u32,
289 fi: bool,
290 bound_control: bool,
291 ) -> (u32, u32);
292}