amdgpu_device_libs/
intrinsics.rs

1//! amdgpu compiler intrinsics.
2//!
3//! Intrinsics defined for the amdgpu LLVM backend.
4//! Availability of intrinsics varies depending on the target architecture.
5
6unsafe extern "C" {
7    /// Returns the x coordinate of the workitem index within the workgroup.
8    #[link_name = "llvm.amdgcn.workitem.id.x"]
9    pub safe fn workitem_id_x() -> u32;
10    /// Returns the y coordinate of the workitem index within the workgroup.
11    #[link_name = "llvm.amdgcn.workitem.id.y"]
12    pub safe fn workitem_id_y() -> u32;
13    /// Returns the z coordinate of the workitem index within the workgroup.
14    #[link_name = "llvm.amdgcn.workitem.id.z"]
15    pub safe fn workitem_id_z() -> u32;
16
17    /// Returns the x coordinate of the workgroup index within the dispatch.
18    #[link_name = "llvm.amdgcn.workgroup.id.x"]
19    pub safe fn workgroup_id_x() -> u32;
20    /// Returns the y coordinate of the workgroup index within the dispatch.
21    #[link_name = "llvm.amdgcn.workgroup.id.y"]
22    pub safe fn workgroup_id_y() -> u32;
23    /// Returns the z coordinate of the workgroup index within the dispatch.
24    #[link_name = "llvm.amdgcn.workgroup.id.z"]
25    pub safe fn workgroup_id_z() -> u32;
26
27    /// Returns the number of LDS bytes statically allocated for this program.
28    #[link_name = "llvm.amdgcn.groupstaticsize"]
29    pub safe fn groupstaticsize() -> u32;
30    /// Returns the id of the dispatch that is currently executed.
31    #[link_name = "llvm.amdgcn.dispatch.id"]
32    pub safe fn dispatch_id() -> u64;
33
34    /// Returns the number of threads in a wavefront.
35    ///
36    /// Is always a power of 2.
37    #[link_name = "llvm.amdgcn.wavefrontsize"]
38    pub safe fn wavefrontsize() -> u32;
39
40    /// Synchronize all wavefronts in a workgroup.
41    ///
42    /// Each wavefronts in a workgroup waits at the barrier until all wavefronts in the workgroup arrive at a barrier.
43    #[link_name = "llvm.amdgcn.s.barrier"]
44    pub safe fn s_barrier();
45
46    /// Sleeps for approximately `count * 64` cycles.
47    ///
48    /// `count` must be a constant.
49    /// Only the lower 7 bits of `count` are used.
50    #[link_name = "llvm.amdgcn.s.sleep"]
51    pub safe fn s_sleep(count: u32);
52
53    /// Stop execution of the kernel.
54    ///
55    /// This usually signals an error state.
56    #[link_name = "llvm.amdgcn.s.sethalt"]
57    pub safe fn s_sethalt(value: u32) -> !;
58
59    /// Masked bit count, low 32 lanes.
60    ///
61    /// Computes the number of bits set in `value`, masked with a thread mask
62    /// which contains 1 for all active threads less than the current thread within a wavefront.
63    /// `init` is added to the result.
64    #[link_name = "llvm.amdgcn.mbcnt.lo"]
65    pub safe fn mbcnt_lo(value: u32, init: u32) -> u32;
66    /// Masked bit count, high 32 lanes.
67    ///
68    /// Computes the number of bits set in `value`, masked with a thread mask
69    /// which contains 1 for all active threads less than the current thread within a wavefront.
70    /// `init` is added to the result.
71    #[link_name = "llvm.amdgcn.mbcnt.hi"]
72    pub safe fn mbcnt_hi(value: u32, init: u32) -> u32;
73
74    /// Returns a bitfield (`i32` or `i64`) containing the result of its i1 argument
75    /// in all active lanes, and zero in all inactive lanes.
76    #[link_name = "llvm.amdgcn.ballot"]
77    pub safe fn ballot(b: bool) -> u64;
78
79    /// Indexes into the `value` with the current lane id and returns for each lane
80    /// if the corresponding bit is set.
81    ///
82    /// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back to a `bool`.
83    /// This means `inverse_ballot(ballot(b)) == b`.
84    /// The inverse of `ballot(inverse_ballot(value)) ~= value` is not always true as inactive lanes are set to zero by `ballot`.
85    #[link_name = "llvm.amdgcn.inverse.ballot"]
86    pub safe fn inverse_ballot(value: u64) -> bool;
87
88    // The following intrinsics can have multiple sizes
89
90    /// Get `value` from the first active lane in the wavefront.
91    #[link_name = "llvm.amdgcn.readfirstlane.i32"]
92    pub safe fn readfirstlane_u32(value: u32) -> u32;
93    /// Get `value` from the first active lane in the wavefront.
94    #[link_name = "llvm.amdgcn.readfirstlane.i64"]
95    pub safe fn readfirstlane_u64(value: u64) -> u64;
96    /// Get `value` from the lane at index `lane` in the wavefront.
97    ///
98    /// The lane argument must be uniform across the currently active threads
99    /// of the current wavefront. Otherwise, the result is undefined.
100    #[link_name = "llvm.amdgcn.readlane.i32"]
101    pub fn readlane_u32(value: u32, lane: u32) -> u32;
102    /// Get `value` from the lane at index `lane` in the wavefront.
103    ///
104    /// The lane argument must be uniform across the currently active threads
105    /// of the current wavefront. Otherwise, the result is undefined.
106    #[link_name = "llvm.amdgcn.readlane.i64"]
107    pub fn readlane_u64(value: u64, lane: u64) -> u64;
108    /// Return `value` for the lane at index `lane` in the wavefront.
109    /// Return `default` for all other lanes.
110    ///
111    /// The value to write and lane select arguments must be uniform across the
112    /// currently active threads of the current wavefront. Otherwise, the result is
113    /// undefined.
114    ///
115    /// `value` is the value returned by `lane`.
116    /// `default` is the value returned by all lanes other than `lane`.
117    #[link_name = "llvm.amdgcn.writelane.i32"]
118    pub fn writelane_u32(value: u32, lane: u32, default: u32) -> u32;
119    /// Return `value` for the lane at index `lane` in the wavefront.
120    /// Return `default` for all other lanes.
121    ///
122    /// The value to write and lane select arguments must be uniform across the
123    /// currently active threads of the current wavefront. Otherwise, the result is
124    /// undefined.
125    ///
126    /// `value` is the value returned by `lane`.
127    /// `default` is the value returned by all lanes other than `lane`.
128    #[link_name = "llvm.amdgcn.writelane.i64"]
129    pub fn writelane_u64(value: u64, lane: u64, default: u64) -> u64;
130
131    /// Stop execution of the wavefront.
132    ///
133    /// This usually signals the end of a successful execution.
134    #[link_name = "llvm.amdgcn.endpgm"]
135    pub safe fn endpgm() -> !;
136
137    /// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
138    /// It takes an old value, a source operand, a DPP control operand, a row mask, a bank mask, and a bound control.
139    /// This operation is equivalent to a sequence of `v_mov_b32` operations.
140    ///
141    /// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
142    /// Should be equivalent to:
143    /// ```asm
144    /// v_mov_b32 <dest> <old>
145    /// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
146    /// ```
147    #[link_name = "llvm.amdgcn.update.dpp.i32"]
148    pub fn update_dpp(
149        old: u32,
150        src: u32,
151        dpp_ctrl: u32,
152        row_mask: u32,
153        bank_mask: u32,
154        bound_control: bool,
155    ) -> u32;
156
157    /// Measures time based on a fixed frequency.
158    ///
159    /// Provides a real-time clock counter that runs at constant speed (typically 100 MHz) independent of ALU clock speeds.
160    /// The clock is consistent across the chip, so can be used for measuring between different wavefronts.
161    #[link_name = "llvm.amdgcn.s.memrealtime"]
162    pub safe fn s_memrealtime() -> u64;
163
164    /// Scatter data across all lanes in a wavefront.
165    ///
166    /// Writes `value` to the lane `lane`.
167    ///
168    /// Reading from inactive lanes returns `0`.
169    /// In case multiple values get written to the same `lane`, the value from the source lane with the higher index is taken.
170    #[link_name = "llvm.amdgcn.ds.permute"]
171    pub fn ds_permute(lane: u32, value: u32) -> u32;
172    /// Gather data across all lanes in a wavefront.
173    ///
174    /// Returns the `value` given to `ds_permute` by lane `lane`.
175    ///
176    /// Reading from inactive lanes returns `0`.
177    #[link_name = "llvm.amdgcn.ds.bpermute"]
178    pub fn ds_bpermute(lane: u32, value: u32) -> u32;
179    /// Permute a 64-bit value.
180    ///
181    /// `selector` selects between different patterns in which the 64-bit value represented by `src0` and `src1` are permuted.
182    #[link_name = "llvm.amdgcn.perm"]
183    pub fn perm(src0: u32, src1: u32, selector: u32) -> u32;
184
185    // gfx10
186    /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
187    ///
188    /// The third and fourth inputs must be uniform across the current wavefront.
189    /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
190    #[link_name = "llvm.amdgcn.permlane16.i32"]
191    pub fn permlane16_u32(
192        old: u32,
193        src0: u32,
194        src1: u32,
195        src2: u32,
196        fi: bool,
197        bound_control: bool,
198    ) -> u32;
199
200    // gfx10
201    /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
202    ///
203    /// The third and fourth inputs must be uniform across the current wavefront.
204    /// These are combined into a single 64-bit value representing lane selects used to swizzle within each row.
205    #[link_name = "llvm.amdgcn.permlanex16.i32"]
206    pub fn permlanex16_u32(
207        old: u32,
208        src0: u32,
209        src1: u32,
210        src2: u32,
211        fi: bool,
212        bound_control: bool,
213    ) -> u32;
214
215    /// Get the index of the current wavefront in the workgroup.
216    #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
217    pub safe fn s_get_waveid_in_workgroup() -> u32;
218
219    // gfx10
220    /// Clamping atomic subtraction
221    ///
222    /// Subtract `val` from the value at `addr`, clamping at `0` if the value would become negative.
223    /// Returns the value at `addr` before the subtraction.
224    #[link_name = "llvm.amdgcn.global.atomic.csub"]
225    pub fn global_atomic_csub(addr: *mut u32, val: u32) -> u32;
226
227    // gfx11
228    /// Swap `value` between upper and lower 32 lanes in a wavefront.
229    ///
230    /// Does nothing for wave32.
231    #[link_name = "llvm.amdgcn.permlane64"]
232    pub fn permlane64_u32(value: u32) -> u32;
233
234    // gfx12
235    /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the second input operand.
236    ///
237    /// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
238    #[link_name = "llvm.amdgcn.permlane16.var"]
239    pub fn permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
240
241    // gfx12
242    /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the second input operand.
243    ///
244    /// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
245    #[link_name = "llvm.amdgcn.permlanex16.var"]
246    pub fn permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;
247
248    // gfx12
249    /// Conditional atomic subtraction
250    ///
251    /// If the value at `addr` is greater or equal than `val`, subtracts `val` from the `value`.
252    /// If the value at `addr` is less than `val`, does nothing.
253    /// Returns the value at `addr` before the subtraction.
254    #[link_name = "llvm.amdgcn.global.atomic.cond.sub"]
255    pub fn global_atomic_cond_sub(addr: *mut u32, val: u32) -> u32;
256
257    /// Get the index of the current wavefront in the workgroup.
258    #[link_name = "llvm.amdgcn.wave.id"]
259    pub safe fn wave_id() -> u32;
260
261    // gfx950
262    /// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
263    ///
264    /// Swaps the values across lanes of first 2 operands.
265    /// Odd rows of the first operand are swapped with even rows of the second operand (one row is 16 lanes).
266    /// Returns a pair for the swapped registers.
267    /// The first element of the return corresponds to the swapped element of the first argument.
268    #[allow(improper_ctypes)]
269    #[link_name = "llvm.amdgcn.permlane16.swap"]
270    pub fn permlane16_swap(
271        vdst_old: u32,
272        vsrc_src0: u32,
273        fi: bool,
274        bound_control: bool,
275    ) -> (u32, u32);
276
277    // gfx950
278    /// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
279    ///
280    /// Swaps the values across lanes of first 2 operands.
281    /// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand (one row is 16 lanes).
282    /// Returns a pair for the swapped registers.
283    /// The first element of the return corresponds to the swapped element of the first argument.
284    #[allow(improper_ctypes)]
285    #[link_name = "llvm.amdgcn.permlane32.swap"]
286    pub fn permlane32_swap(
287        vdst_old: u32,
288        vsrc_src0: u32,
289        fi: bool,
290        bound_control: bool,
291    ) -> (u32, u32);
292}
amdgpu_device_libs/intrinsics.rs

amdgpu_device_libs/
intrinsics.rs