1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
//! amdgpu compiler intrinsics.
//!
//! Intrinsics defined for the amdgpu LLVM backend.
//! Availability of intrinsics varies depending on the target architecture.
unsafe extern "C" {
    /// Returns the x coordinate of the workitem index within the workgroup.
    #[link_name = "llvm.amdgcn.workitem.id.x"]
    pub safe fn workitem_id_x() -> u32;

    /// Returns the y coordinate of the workitem index within the workgroup.
    #[link_name = "llvm.amdgcn.workitem.id.y"]
    pub safe fn workitem_id_y() -> u32;

    /// Returns the z coordinate of the workitem index within the workgroup.
    #[link_name = "llvm.amdgcn.workitem.id.z"]
    pub safe fn workitem_id_z() -> u32;

    /// Returns the x coordinate of the workgroup index within the dispatch.
    #[link_name = "llvm.amdgcn.workgroup.id.x"]
    pub safe fn workgroup_id_x() -> u32;

    /// Returns the y coordinate of the workgroup index within the dispatch.
    #[link_name = "llvm.amdgcn.workgroup.id.y"]
    pub safe fn workgroup_id_y() -> u32;

    /// Returns the z coordinate of the workgroup index within the dispatch.
    #[link_name = "llvm.amdgcn.workgroup.id.z"]
    pub safe fn workgroup_id_z() -> u32;

    /// Returns the number of LDS bytes statically allocated for this program.
    #[link_name = "llvm.amdgcn.groupstaticsize"]
    pub safe fn groupstaticsize() -> u32;

    /// Returns the id of the dispatch that is currently executed.
    #[link_name = "llvm.amdgcn.dispatch.id"]
    pub safe fn dispatch_id() -> u64;

    /// Returns the number of threads in a wavefront.
    ///
    /// Is always a power of 2.
    #[link_name = "llvm.amdgcn.wavefrontsize"]
    pub safe fn wavefrontsize() -> u32;

    /// Synchronize all wavefronts in a workgroup.
    ///
    /// Each wavefront in a workgroup waits at the barrier until all wavefronts in the
    /// workgroup arrive at a barrier.
    #[link_name = "llvm.amdgcn.s.barrier"]
    pub safe fn s_barrier();

    // NOTE(review): the doc requires `count` to be a constant, but the signature accepts a
    // runtime `u32` — confirm how a non-constant argument is rejected or handled.
    /// Sleeps for approximately `count * 64` cycles.
    ///
    /// `count` must be a constant.
    /// Only the lower 7 bits of `count` are used.
    #[link_name = "llvm.amdgcn.s.sleep"]
    pub safe fn s_sleep(count: u32);

    // NOTE(review): this is declared as diverging for every `value`. The underlying
    // instruction is understood to set *or clear* the halt bit depending on `value` —
    // confirm that `-> !` on a `safe fn` is sound for all arguments.
    /// Stop execution of the kernel.
    ///
    /// This usually signals an error state.
    #[link_name = "llvm.amdgcn.s.sethalt"]
    pub safe fn s_sethalt(value: u32) -> !;

    /// Masked bit count, low 32 lanes.
    ///
    /// Computes the number of bits set in `value`, masked with a thread mask
    /// which contains 1 for all active threads less than the current thread within a wavefront.
    /// `init` is added to the result.
    #[link_name = "llvm.amdgcn.mbcnt.lo"]
    pub safe fn mbcnt_lo(value: u32, init: u32) -> u32;

    /// Masked bit count, high 32 lanes.
    ///
    /// Computes the number of bits set in `value`, masked with a thread mask
    /// which contains 1 for all active threads less than the current thread within a wavefront.
    /// `init` is added to the result.
    #[link_name = "llvm.amdgcn.mbcnt.hi"]
    pub safe fn mbcnt_hi(value: u32, init: u32) -> u32;

    /// Returns a bitfield containing the value of `b` in all active lanes,
    /// and zero in all inactive lanes.
    ///
    /// This declaration returns the bitfield as a `u64`.
    #[link_name = "llvm.amdgcn.ballot"]
    pub safe fn ballot(b: bool) -> u64;

    /// Indexes into `value` with the current lane id and returns, for each lane,
    /// whether the corresponding bit is set.
    ///
    /// While [`ballot`] converts a `bool` to a mask, `inverse_ballot` converts a mask back
    /// to a `bool`. This means `inverse_ballot(ballot(b)) == b`.
    /// The reverse, `ballot(inverse_ballot(value)) == value`, does not always hold, because
    /// [`ballot`] sets the bits of inactive lanes to zero.
    #[link_name = "llvm.amdgcn.inverse.ballot"]
    pub safe fn inverse_ballot(value: u64) -> bool;

    // The following intrinsics can have multiple sizes

    /// Get `value` from the first active lane in the wavefront.
    #[link_name = "llvm.amdgcn.readfirstlane.i32"]
    pub safe fn readfirstlane_u32(value: u32) -> u32;

    /// Get `value` from the first active lane in the wavefront.
    #[link_name = "llvm.amdgcn.readfirstlane.i64"]
    pub safe fn readfirstlane_u64(value: u64) -> u64;

    /// Get `value` from the lane at index `lane` in the wavefront.
    ///
    /// # Safety
    ///
    /// The lane argument must be uniform across the currently active threads
    /// of the current wavefront. Otherwise, the result is undefined.
    #[link_name = "llvm.amdgcn.readlane.i32"]
    pub fn readlane_u32(value: u32, lane: u32) -> u32;

    /// Get `value` from the lane at index `lane` in the wavefront.
    ///
    /// The lane select is a 32-bit index regardless of the width of `value`; only the
    /// data operand and the result are 64 bits wide.
    ///
    /// # Safety
    ///
    /// The lane argument must be uniform across the currently active threads
    /// of the current wavefront. Otherwise, the result is undefined.
    #[link_name = "llvm.amdgcn.readlane.i64"]
    pub fn readlane_u64(value: u64, lane: u32) -> u64;

    /// Return `value` for the lane at index `lane` in the wavefront.
    /// Return `default` for all other lanes.
    ///
    /// `value` is the value returned by `lane`.
    /// `default` is the value returned by all lanes other than `lane`.
    ///
    /// # Safety
    ///
    /// The value to write and lane select arguments must be uniform across the
    /// currently active threads of the current wavefront. Otherwise, the result is
    /// undefined.
    #[link_name = "llvm.amdgcn.writelane.i32"]
    pub fn writelane_u32(value: u32, lane: u32, default: u32) -> u32;

    /// Return `value` for the lane at index `lane` in the wavefront.
    /// Return `default` for all other lanes.
    ///
    /// `value` is the value returned by `lane`.
    /// `default` is the value returned by all lanes other than `lane`.
    /// The lane select is a 32-bit index regardless of the width of `value`; only the
    /// data operands and the result are 64 bits wide.
    ///
    /// # Safety
    ///
    /// The value to write and lane select arguments must be uniform across the
    /// currently active threads of the current wavefront. Otherwise, the result is
    /// undefined.
    #[link_name = "llvm.amdgcn.writelane.i64"]
    pub fn writelane_u64(value: u64, lane: u32, default: u64) -> u64;

    /// Stop execution of the wavefront.
    ///
    /// This usually signals the end of a successful execution.
    #[link_name = "llvm.amdgcn.endpgm"]
    pub safe fn endpgm() -> !;

    /// The `update_dpp` intrinsic represents the `update.dpp` operation in AMDGPU.
    /// It takes an old value, a source operand, a DPP control operand, a row mask,
    /// a bank mask, and a bound control.
    /// This operation is equivalent to a sequence of `v_mov_b32` operations.
    ///
    /// `llvm.amdgcn.update.dpp.i32 <old> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>`
    /// should be equivalent to:
    /// ```asm
    /// v_mov_b32 <dest> <old>
    /// v_mov_b32 <dest> <src> <dpp_ctrl> <row_mask> <bank_mask> <bound_ctrl>
    /// ```
    #[link_name = "llvm.amdgcn.update.dpp.i32"]
    pub fn update_dpp(
        old: u32,
        src: u32,
        dpp_ctrl: u32,
        row_mask: u32,
        bank_mask: u32,
        bound_control: bool,
    ) -> u32;

    /// Measures time based on a fixed frequency.
    ///
    /// Provides a real-time clock counter that runs at constant speed (typically 100 MHz)
    /// independent of ALU clock speeds.
    /// The clock is consistent across the chip, so it can be used for measuring between
    /// different wavefronts.
    #[link_name = "llvm.amdgcn.s.memrealtime"]
    pub safe fn s_memrealtime() -> u64;

    // NOTE(review): the hardware instruction is understood to address lanes in bytes
    // (lane index * 4) — confirm whether callers are expected to pre-scale `lane`.
    /// Scatter data across all lanes in a wavefront.
    ///
    /// Writes `value` to the lane `lane`.
    ///
    /// Reading from inactive lanes returns `0`.
    /// In case multiple values get written to the same `lane`, the value from the source lane
    /// with the higher index is taken.
    #[link_name = "llvm.amdgcn.ds.permute"]
    pub fn ds_permute(lane: u32, value: u32) -> u32;

    // NOTE(review): the hardware instruction is understood to address lanes in bytes
    // (lane index * 4) — confirm whether callers are expected to pre-scale `lane`.
    /// Gather data across all lanes in a wavefront.
    ///
    /// Returns the `value` given to `ds_bpermute` by lane `lane`.
    ///
    /// Reading from inactive lanes returns `0`.
    #[link_name = "llvm.amdgcn.ds.bpermute"]
    pub fn ds_bpermute(lane: u32, value: u32) -> u32;

    /// Permute a 64-bit value.
    ///
    /// `selector` selects between different patterns in which the 64-bit value represented
    /// by `src0` and `src1` is permuted.
    #[link_name = "llvm.amdgcn.perm"]
    pub fn perm(src0: u32, src1: u32, selector: u32) -> u32;

    // gfx10
    /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the
    /// second input operand.
    ///
    /// The third and fourth inputs must be uniform across the current wavefront.
    /// These are combined into a single 64-bit value representing lane selects used to
    /// swizzle within each row.
    #[link_name = "llvm.amdgcn.permlane16.i32"]
    pub fn permlane16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    // gfx10
    /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the
    /// second input operand.
    ///
    /// The third and fourth inputs must be uniform across the current wavefront.
    /// These are combined into a single 64-bit value representing lane selects used to
    /// swizzle within each row.
    #[link_name = "llvm.amdgcn.permlanex16.i32"]
    pub fn permlanex16_u32(
        old: u32,
        src0: u32,
        src1: u32,
        src2: u32,
        fi: bool,
        bound_control: bool,
    ) -> u32;

    /// Get the index of the current wavefront in the workgroup.
    #[link_name = "llvm.amdgcn.s.get.waveid.in.workgroup"]
    pub safe fn s_get_waveid_in_workgroup() -> u32;

    // gfx10
    // NOTE(review): presumably `addr` must point to valid, suitably aligned global memory —
    // confirm and document the exact requirement.
    /// Clamping atomic subtraction.
    ///
    /// Subtract `val` from the value at `addr`, clamping at `0` if the value would become
    /// negative.
    /// Returns the value at `addr` before the subtraction.
    #[link_name = "llvm.amdgcn.global.atomic.csub"]
    pub fn global_atomic_csub(addr: *mut u32, val: u32) -> u32;

    // gfx11
    /// Swap `value` between upper and lower 32 lanes in a wavefront.
    ///
    /// Does nothing for wave32.
    #[link_name = "llvm.amdgcn.permlane64"]
    pub fn permlane64_u32(value: u32) -> u32;

    // gfx12
    /// Performs arbitrary gather-style operation within a row (16 contiguous lanes) of the
    /// second input operand.
    ///
    /// In contrast to [`permlane16_u32`], allows each lane to specify its own gather lane.
    #[link_name = "llvm.amdgcn.permlane16.var"]
    pub fn permlane16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    // gfx12
    /// Performs arbitrary gather-style operation across two rows (16 contiguous lanes) of the
    /// second input operand.
    ///
    /// In contrast to [`permlanex16_u32`], allows each lane to specify its own gather lane.
    #[link_name = "llvm.amdgcn.permlanex16.var"]
    pub fn permlanex16_var(old: u32, src0: u32, src1: u32, fi: bool, bound_control: bool) -> u32;

    // gfx12
    // NOTE(review): presumably `addr` must point to valid, suitably aligned global memory —
    // confirm and document the exact requirement.
    /// Conditional atomic subtraction.
    ///
    /// If the value at `addr` is greater than or equal to `val`, subtracts `val` from the
    /// value at `addr`.
    /// If the value at `addr` is less than `val`, does nothing.
    /// Returns the value at `addr` before the subtraction.
    #[link_name = "llvm.amdgcn.global.atomic.cond.sub"]
    pub fn global_atomic_cond_sub(addr: *mut u32, val: u32) -> u32;

    /// Get the index of the current wavefront in the workgroup.
    #[link_name = "llvm.amdgcn.wave.id"]
    pub safe fn wave_id() -> u32;

    // gfx950
    /// Provide direct access to `v_permlane16_swap_b32` instruction on supported targets.
    ///
    /// Swaps the values across lanes of the first 2 operands.
    /// Odd rows of the first operand are swapped with even rows of the second operand
    /// (one row is 16 lanes).
    /// Returns a pair for the swapped registers.
    /// The first element of the return corresponds to the swapped element of the first argument.
    // The `(u32, u32)` return type is a Rust tuple, which the `improper_ctypes` lint flags as
    // not FFI-safe; the lint is silenced because this binds an intrinsic, not a C function.
    #[allow(improper_ctypes)]
    #[link_name = "llvm.amdgcn.permlane16.swap"]
    pub fn permlane16_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);

    // gfx950
    /// Provide direct access to `v_permlane32_swap_b32` instruction on supported targets.
    ///
    /// Swaps the values across lanes of the first 2 operands.
    /// Rows 2 and 3 of the first operand are swapped with rows 0 and 1 of the second operand
    /// (one row is 16 lanes).
    /// Returns a pair for the swapped registers.
    /// The first element of the return corresponds to the swapped element of the first argument.
    // The `(u32, u32)` return type is a Rust tuple, which the `improper_ctypes` lint flags as
    // not FFI-safe; the lint is silenced because this binds an intrinsic, not a C function.
    #[allow(improper_ctypes)]
    #[link_name = "llvm.amdgcn.permlane32.swap"]
    pub fn permlane32_swap(
        vdst_old: u32,
        vsrc_src0: u32,
        fi: bool,
        bound_control: bool,
    ) -> (u32, u32);
}