// amdgpu_device_libs/lib.rs
// Update:
// cargo readme > README.md
// cargo readme --template ../README.tpl > ../README.md

//! Support library for the amdgpu target.
//!
//! By default, the amdgpu target supports `core`, but not `std`.
//! `alloc` is supported when a global allocator is specified.
//!
//! `amdgpu-device-libs` brings some std-like features to the amdgpu target:
//!
//! - `print!()` and `println!()` macros for printing on the host stdout
//! - A global allocator to support `alloc`
//! - A panic handler
//! - Access to more intrinsics and device-libs functions
//!
//! All these features are enabled by default, but can be turned on selectively with `default-features = false, features = […]`.
//!
//! `amdgpu-device-libs` works by linking to the [ROCm device-libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) and a pre-compiled helper library.
//! The libraries are linked from a ROCm installation.
//! To make sure the libraries are found, set the environment variable `ROCM_PATH` or `ROCM_DEVICE_LIB_PATH` (higher priority if it is set).
//! It looks for `amdgcn/bitcode/*.bc` files in this path.
//!
//! ## Usage
//!
//! Create a new cargo library project and change it to compile a cdylib:
//! ```toml
//! # Cargo.toml
//! # Force lto
//! [profile.dev]
//! lto = true
//! [profile.release]
//! lto = true
//!
//! [lib]
//! # Compile a cdylib
//! crate-type = ["cdylib"]
//!
//! [build-dependencies]
//! # Used in build script to specify linker flags and link in device-libs
//! amdgpu-device-libs-build = { path = "../../amdgpu-device-libs-build" }
//!
//! [dependencies]
//! amdgpu-device-libs = { path = "../../amdgpu-device-libs" }
//! ```
//!
//! Add extra flags in `.cargo/config.toml`:
//! ```toml
//! # .cargo/config.toml
//! [build]
//! target = "amdgcn-amd-amdhsa"
//! # Enable linker-plugin-lto and workarounds
//! # Either add -Ctarget-cpu=gfx<version> here or specify it in CARGO_BUILD_RUSTFLAGS='-Ctarget-cpu=gfx<version>'
//! rustflags = ["-Clinker-plugin-lto", "-Zemit-thin-lto=no"]
//!
//! [unstable]
//! build-std = ["core", "alloc"]
//! ```
//!
//! And add a `build.rs` build script that links to the required libraries:
//! ```rust,ignore
//! // build.rs
//! fn main() {
//!     amdgpu_device_libs_build::build();
//! }
//! ```
//!
//! ## Example
//!
//! Minimal usage sample, see [`examples/println`](https://github.com/Flakebi/amdgpu-rs/tree/main/examples/println) for the full code.
//! ```rust
//! #![feature(abi_gpu_kernel)]
//! #![no_std]
//!
//! extern crate alloc;
//!
//! use alloc::vec::Vec;
//!
//! use amdgpu_device_libs::prelude::*;
//!
//! #[unsafe(no_mangle)]
//! pub extern "gpu-kernel" fn kernel(output: *mut u32) {
//!     let wg_id = workgroup_id_x();
//!     let id = workitem_id_x();
//!     let dispatch = dispatch_ptr();
//!     let complete_id = wg_id as usize * dispatch.workgroup_size_x as usize + id as usize;
//!
//!     println!("Hello world from the GPU! (thread {wg_id}-{id})");
//!
//!     let mut v = Vec::<u32>::new();
//!     for i in 0..100 {
//!         v.push(100 + i);
//!     }
//!
//!     unsafe {
//!         *output.add(complete_id) = v[complete_id];
//!     }
//! }
//! ```
100#![deny(missing_docs)]
101#![allow(internal_features)]
102#![feature(core_intrinsics, link_llvm_intrinsics)]
103#![no_std]
104
105#[cfg(feature = "alloc")]
106extern crate alloc;
107
108#[cfg(feature = "alloc")]
109use core::alloc::{GlobalAlloc, Layout};
110use core::ffi;
111
/// Prints to the standard output.
///
/// Formats all arguments to [`format!`](alloc::format!).
#[cfg(feature = "print")]
#[macro_export]
macro_rules! print {
    ($($arg:tt)*) => {
        // `format!` yields a `String`; `crate::print` takes `&str`, so pass a
        // reference (a `String` argument would not coerce and fail to compile).
        $crate::print(&::alloc::format!($($arg)*));
    };
}
122
/// Prints to the standard output, with a newline.
///
/// Formats all arguments to [`format!`](alloc::format!) and appends a `\n`.
#[cfg(feature = "print")]
#[macro_export]
macro_rules! println {
    // Double braces: the expansion is a single block expression, so the macro
    // is usable in expression position and the temporary `s` cannot leak into
    // the caller's scope (same pattern as std's `println!`).
    ($($arg:tt)*) => {{
        let mut s = ::alloc::format!($($arg)*);
        s.push('\n');
        $crate::print(&s);
    }};
}
135
136pub mod intrinsics;
137
/// Prelude for functions that are generally useful when writing kernels.
///
/// Use as
/// ```rust
/// # #![no_std]
/// use amdgpu_device_libs::prelude::*;
/// ```
///
/// Contains `print!`, `println!`, intrinsics to get workitem and workgroup id among others.
pub mod prelude {
    // Only available with the helper library, which provides the underlying call.
    #[cfg(feature = "device_libs")]
    pub use crate::dispatch_ptr;
    pub use crate::intrinsics::{
        s_barrier, workgroup_id_x, workgroup_id_y, workgroup_id_z, workitem_id_x, workitem_id_y,
        workitem_id_z,
    };
    // The macros are `#[macro_export]`ed at the crate root; re-export them so a
    // glob import of the prelude brings them into scope as well.
    #[cfg(feature = "print")]
    pub use print;
    #[cfg(feature = "print")]
    pub use println;
}
159
// Foreign functions: `__ockl_*` comes from the ROCm device-libs bitcode,
// `__amdgpu_util_*` from the pre-compiled helper library of this crate.
#[cfg(feature = "device_libs")]
unsafe extern "C" {
    /// ockl hostcall service: invoke a function pointer on the host CPU with
    /// up to seven 64-bit arguments; the two 64-bit results come back packed
    /// in a `u128`.
    ///
    /// NOTE(review): `u128` is not FFI-safe in general (hence
    /// `allow(improper_ctypes)`); assumes the ABI matches what ockl's
    /// services.cl produces — confirm against the device-libs sources.
    #[cfg(feature = "hostcall")]
    #[allow(improper_ctypes)]
    fn __ockl_call_host_function(
        fptr: ffi::c_ulong,
        arg0: ffi::c_ulong,
        arg1: ffi::c_ulong,
        arg2: ffi::c_ulong,
        arg3: ffi::c_ulong,
        arg4: ffi::c_ulong,
        arg5: ffi::c_ulong,
        arg6: ffi::c_ulong,
    ) -> u128;

    // Functions implemented in HIP
    fn __amdgpu_util_alloc(size: ffi::c_ulong) -> *mut ffi::c_void;
    fn __amdgpu_util_dealloc(addr: *mut ffi::c_void);
    fn __amdgpu_util_print_stdout(s: *const ffi::c_char, size: ffi::c_int);

    // Intrinsics that return special addrspaces and therefore cannot be pure rust at the moment
    // Declared `safe`: callable without an `unsafe` block (Rust 2024 `unsafe extern`).
    safe fn __amdgpu_util_dispatch_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_queue_ptr() -> *mut ffi::c_void;
    safe fn __amdgpu_util_kernarg_segment_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_implicitarg_ptr() -> *const ffi::c_void;
}
186
/// Handle to an HSA signal.
///
/// `#[repr(C)]` wrapper around the raw 64-bit handle, matching the HSA ABI.
#[cfg(feature = "device_libs")]
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaSignal {
    /// The internal representation of an HSA signal.
    /// A value of 0 means "no signal" (see `completion_signal` usage).
    pub handle: u64,
}
195
/// HSA packet to dispatch a kernel.
///
/// A pointer to the packet that was used to dispatch the currently running kernel can be obtained with [`dispatch_ptr`].
///
/// `#[repr(C)]`: the layout must match the HSA runtime's
/// `hsa_kernel_dispatch_packet_t` exactly.
#[cfg(feature = "device_libs")]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaKernelDispatchPacket {
    /// Packet header. Used to configure multiple packet parameters such as the
    /// packet type. The parameters are described by hsa_packet_header_t.
    pub header: u16,
    /// Dispatch setup parameters. Used to configure kernel dispatch parameters
    /// such as the number of dimensions in the grid. The parameters are described
    /// by hsa_kernel_dispatch_packet_setup_t.
    pub setup: u16,
    /// X dimension of work-group, in work-items. Must be greater than 0.
    pub workgroup_size_x: u16,
    /// Y dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 dimension, the only valid value is 1.
    pub workgroup_size_y: u16,
    /// Z dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
    pub workgroup_size_z: u16,
    /// Reserved. Must be 0.
    pub reserved0: u16,
    /// X dimension of grid, in work-items. Must be greater than 0. Must
    /// not be smaller than @a workgroup_size_x.
    pub grid_size_x: u32,
    /// Y dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 dimension, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_y.
    pub grid_size_y: u32,
    /// Z dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_z.
    pub grid_size_z: u32,
    /// Size in bytes of private memory allocation request (per work-item).
    pub private_segment_size: u32,
    /// Size in bytes of group memory allocation request (per work-group). Must not
    /// be less than the sum of the group memory used by the kernel (and the
    /// functions it calls directly or indirectly) and the dynamically allocated
    /// group segment variables.
    pub group_segment_size: u32,
    /// Opaque handle to a code object that includes an implementation-defined
    /// executable code for the kernel.
    pub kernel_object: u64,
    /// Pointer to the kernel arguments.
    pub kernarg_address: *mut ffi::c_void,
    /// Reserved. Must be 0.
    pub reserved2: u64,
    /// Signal used to indicate completion of the job. The application can use the
    /// special signal handle 0 to indicate that no signal is used.
    pub completion_signal: HsaSignal,
}
249
/// Panic handler.
///
/// Prints the panic message if the `print` feature is enabled.
/// Aborts the kernel.
#[cfg(feature = "panic_handler")]
#[cfg_attr(not(feature = "print"), allow(unused_variables))]
#[panic_handler]
#[inline]
fn panic(panic_info: &core::panic::PanicInfo) -> ! {
    #[cfg(feature = "print")]
    {
        use prelude::*;
        // Identify the faulting workgroup/workitem, then append the payload:
        // workgroup x thread y panicked at …
        let (wg_x, wg_y, wg_z) = (workgroup_id_x(), workgroup_id_y(), workgroup_id_z());
        let (it_x, it_y, it_z) = (workitem_id_x(), workitem_id_y(), workitem_id_z());
        println!(
            "workgroup {},{},{} thread {},{},{} {panic_info}",
            wg_x, wg_y, wg_z, it_x, it_y, it_z
        );
    }

    core::intrinsics::abort();
}
276
/// The memory allocator of `device-libs`.
///
/// Allocates memory from the host through hostcalls to the HIP runtime in larger chunks and the subdivides them on the GPU.
///
/// Unit struct: all state lives in the helper library, not in this type.
#[cfg(feature = "alloc")]
pub struct Allocator;
282
#[cfg(feature = "alloc")]
unsafe impl GlobalAlloc for Allocator {
    // Allocate `layout.size()` bytes from the device-libs heap.
    //
    // NOTE(review): `layout.align()` is ignored here. This is only sound if
    // `__amdgpu_util_alloc` guarantees an alignment at least as large as any
    // alignment requested through this allocator — confirm against the helper
    // library's implementation.
    #[inline]
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        unsafe { __amdgpu_util_alloc(layout.size() as ffi::c_ulong) as *mut _ }
    }

    // Free a pointer previously returned by `alloc`; the layout is not needed
    // by the underlying deallocator.
    #[inline]
    unsafe fn dealloc(&self, ptr: *mut u8, _: Layout) {
        unsafe { __amdgpu_util_dealloc(ptr as *mut _) };
    }
}
295
/// The bare print function underlying the `print!` macros.
///
/// Sends a string to the host console using the `printf` support of the HIP runtime.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// amdgpu_device_libs::print("Printed on the host terminal\n");
/// # }
/// ```
#[cfg(feature = "print")]
#[inline]
pub fn print(s: &str) {
    // The helper takes a C int length; refuse strings that do not fit.
    let len: ffi::c_int = s.len().try_into().expect("String too long to print");
    // SAFETY: `s` points to `len` initialized bytes for the duration of the call.
    unsafe { __amdgpu_util_print_stdout(s.as_ptr().cast(), len) };
}
318
/// Get the packet for this dispatch.
///
/// Get a reference to the packet that was used to dispatch this kernel.
/// The dispatch packet contains information like the workgroup size and dispatch size.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # extern crate alloc;
/// # fn main() {
/// use amdgpu_device_libs::prelude::*;
///
/// let dispatch = dispatch_ptr();
/// println!("Workgroup size {}x{}x{}", dispatch.workgroup_size_x, dispatch.workgroup_size_y, dispatch.workgroup_size_z);
/// # }
/// ```
#[cfg(feature = "device_libs")]
#[inline]
pub fn dispatch_ptr() -> &'static HsaKernelDispatchPacket {
    let packet = __amdgpu_util_dispatch_ptr().cast::<HsaKernelDispatchPacket>();
    // SAFETY: the pointer is provided by the runtime; assumed valid and
    // unmodified for the whole kernel execution, justifying the 'static borrow.
    unsafe { &*packet }
}
341
/// Call a function on the host.
///
/// This allows calling functions on the CPU from the GPU.
/// `function` must be the address of a function on the CPU.
/// Up to 7 64-bit arguments can be passed; two 64-bit values are returned,
/// packed into the `u128` result.
///
/// The signature of the CPU function must be `fn(output: *mut u64, input: *const u64)`:
/// `output` points to two `u64` return slots and `input` to the seven `u64` arguments.
///
/// The `function` pointer must be passed to the GPU through some mechanism like kernel arguments.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// # let host_func = core::panic!();
/// // Get host_func from somewhere, e.g. arguments.
/// let arg0 = 42;
/// unsafe {
///     amdgpu_device_libs::call_host_function(host_func, arg0, 0, 0, 0, 0, 0, 0);
/// }
/// # }
/// ```
///
/// # Additional information
///
/// The CPU side is implemented here (`SERVICE_FUNCTION_CALL`): [ROCm/clr/rocclr/device/devhostcall.cpp](https://github.com/ROCm/clr/blob/f5b2516f5d8a44b06ad1907594db1be25a9fe57b/rocclr/device/devhostcall.cpp)
/// The GPU side here: [ROCm/llvm-project/amd/device-libs/ockl/src/services.cl](https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/amd/device-libs/ockl/src/services.cl)
#[cfg(feature = "hostcall")]
#[allow(clippy::missing_safety_doc, clippy::too_many_arguments)]
pub unsafe fn call_host_function(
    function: u64,
    arg0: u64,
    arg1: u64,
    arg2: u64,
    arg3: u64,
    arg4: u64,
    arg5: u64,
    arg6: u64,
) -> u128 {
    // Thin wrapper: forward everything to the ockl hostcall service.
    unsafe { __ockl_call_host_function(function, arg0, arg1, arg2, arg3, arg4, arg5, arg6) }
}
385
// Define here, otherwise we may get undefined symbols.
// TODO implement in LLVM?
//
// C-style `memcmp`: compares `n` bytes at `s1` and `s2` and returns 0 on
// equality, otherwise the signed difference of the first differing byte pair.
#[unsafe(no_mangle)]
#[inline]
extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
    let mut idx = 0;
    while idx < n {
        // SAFETY: the caller guarantees both pointers are valid for `n` bytes.
        let (lhs, rhs) = unsafe { (s1.add(idx).read(), s2.add(idx).read()) };
        if lhs != rhs {
            return i32::from(lhs) - i32::from(rhs);
        }
        idx += 1;
    }
    0
}
399
/// Define global allocator.
///
/// Routes all `alloc` allocations (`Vec`, `String`, `format!`, …) through
/// [`Allocator`]; gated so users can opt out and supply their own.
#[cfg(feature = "global_allocator")]
#[global_allocator]
static HEAP: Allocator = Allocator;