// amdgpu_device_libs/lib.rs
// Update:
// cargo readme > README.md
// cargo readme --template ../README.tpl > ../README.md

//! Support library for the amdgpu target.
//!
//! By default, the amdgpu target supports `core`, but not `std`.
//! `alloc` is supported when a global allocator is specified.
//!
//! `amdgpu-device-libs` brings some std-like features to the amdgpu target:
//!
//! - `print!()` and `println!()` macros for printing on the host stdout
//! - A global allocator to support `alloc`
//! - A panic handler
//! - Access to more intrinsics and device-libs functions
//!
//! All these features are enabled by default, but can be turned on selectively with `default-features = false, features = […]`.
//!
//! `amdgpu-device-libs` works by linking to the [ROCm device-libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) and a pre-compiled helper library.
//! The libraries are linked from a ROCm installation.
//! To make sure the libraries are found, set the environment variable `ROCM_PATH` or `ROCM_DEVICE_LIB_PATH` (higher priority if it is set).
//! It looks for `amdgcn/bitcode/*.bc` files in this path.
//!
//! ## Usage
//!
//! Create a new cargo library project and change it to compile a cdylib:
//! ```toml
//! # Cargo.toml
//! # Force lto
//! [profile.dev]
//! lto = true
//! [profile.release]
//! lto = true
//!
//! [lib]
//! # Compile a cdylib
//! crate-type = ["cdylib"]
//!
//! [build-dependencies]
//! # Used in build script to specify linker flags and link in device-libs
//! amdgpu-device-libs-build = { path = "../../amdgpu-device-libs-build" }
//!
//! [dependencies]
//! amdgpu-device-libs = { path = "../../amdgpu-device-libs" }
//! ```
//!
//! Add extra flags in `.cargo/config.toml`:
//! ```toml
//! # .cargo/config.toml
//! [build]
//! target = "amdgcn-amd-amdhsa"
//! # Enable linker-plugin-lto and workarounds
//! # Either add -Ctarget-cpu=gfx<version> here or specify it in CARGO_BUILD_RUSTFLAGS='-Ctarget-cpu=gfx<version>'
//! rustflags = ["-Clinker-plugin-lto", "-Zemit-thin-lto=no"]
//!
//! [unstable]
//! build-std = ["core", "alloc"]
//! ```
//!
//! And add a `build.rs` build script that links to the required libraries:
//! ```rust,ignore
//! // build.rs
//! fn main() {
//!     amdgpu_device_libs_build::build();
//! }
//! ```
//!
//! ## Example
//!
//! Minimal usage sample, see [`examples/println`](https://github.com/Flakebi/amdgpu-rs/tree/main/examples/println) for the full code.
//! ```rust
//! #![feature(abi_gpu_kernel)]
//! #![no_std]
//!
//! extern crate alloc;
//!
//! use alloc::vec::Vec;
//!
//! use amdgpu_device_libs::prelude::*;
//!
//! #[unsafe(no_mangle)]
//! pub extern "gpu-kernel" fn kernel(output: *mut u32) {
//!     let wg_id = workgroup_id_x();
//!     let id = workitem_id_x();
//!     let dispatch = dispatch_ptr();
//!     let complete_id = wg_id as usize * dispatch.workgroup_size_x as usize + id as usize;
//!
//!     println!("Hello world from the GPU! (thread {wg_id}-{id})");
//!
//!     let mut v = Vec::<u32>::new();
//!     for i in 0..100 {
//!         v.push(100 + i);
//!     }
//!
//!     unsafe {
//!         *output.add(complete_id) = v[complete_id];
//!     }
//! }
//! ```
100#![deny(missing_docs)]
101#![allow(internal_features)]
102#![feature(core_intrinsics, link_llvm_intrinsics)]
103#![no_std]
104
105#[cfg(feature = "alloc")]
106extern crate alloc;
107
108#[cfg(feature = "alloc")]
109use core::alloc::{GlobalAlloc, Layout};
110use core::ffi;
111
/// Prints to the standard output.
///
/// Formats all arguments to [`format!`](alloc::format!).
#[cfg(feature = "print")]
#[macro_export]
macro_rules! print {
    ($($arg:tt)*) => {
        // `format!` yields a `String`; `crate::print` takes `&str`, so pass a
        // reference (a `String` argument would not coerce and fail to compile).
        $crate::print(&::alloc::format!($($arg)*));
    };
}
122
/// Prints to the standard output, with a newline.
///
/// Formats all arguments to [`format!`](alloc::format!) and appends a `\n`.
#[cfg(feature = "print")]
#[macro_export]
macro_rules! println {
    // Double braces: the expansion is a single block expression, so the macro
    // is usable in expression position and the temporary `s` cannot leak into
    // the caller's scope (same pattern as std's `println!`).
    ($($arg:tt)*) => {{
        let mut s = ::alloc::format!($($arg)*);
        s.push('\n');
        $crate::print(&s);
    }};
}
135
136pub mod intrinsics;
137
/// Prelude for functions that are generally useful when writing kernels.
///
/// Use as
/// ```rust
/// # #![no_std]
/// use amdgpu_device_libs::prelude::*;
/// ```
///
/// Contains `print!`, `println!`, intrinsics to get workitem and workgroup id among others.
pub mod prelude {
    // Only available with the helper library, which provides the underlying call.
    #[cfg(feature = "device_libs")]
    pub use crate::dispatch_ptr;
    pub use crate::intrinsics::{
        s_barrier, workgroup_id_x, workgroup_id_y, workgroup_id_z, workitem_id_x, workitem_id_y,
        workitem_id_z,
    };
    // The macros are `#[macro_export]`ed at the crate root; re-export them so a
    // glob import of the prelude brings them into scope as well.
    #[cfg(feature = "print")]
    pub use print;
    #[cfg(feature = "print")]
    pub use println;
}
159
// Foreign functions: `__ockl_*` comes from the ROCm device-libs bitcode,
// `__amdgpu_util_*` from the pre-compiled helper library of this crate.
#[cfg(feature = "device_libs")]
unsafe extern "C" {
    /// ockl hostcall service: invoke a function pointer on the host CPU with
    /// up to seven 64-bit arguments; the two 64-bit results come back packed
    /// in a `u128`.
    ///
    /// NOTE(review): `u128` is not FFI-safe in general (hence
    /// `allow(improper_ctypes)`); assumes the ABI matches what ockl's
    /// services.cl produces — confirm against the device-libs sources.
    #[cfg(feature = "hostcall")]
    #[allow(improper_ctypes)]
    fn __ockl_call_host_function(
        fptr: ffi::c_ulong,
        arg0: ffi::c_ulong,
        arg1: ffi::c_ulong,
        arg2: ffi::c_ulong,
        arg3: ffi::c_ulong,
        arg4: ffi::c_ulong,
        arg5: ffi::c_ulong,
        arg6: ffi::c_ulong,
    ) -> u128;

    // Functions implemented in HIP
    fn __amdgpu_util_alloc(size: ffi::c_ulong) -> *mut ffi::c_void;
    fn __amdgpu_util_dealloc(addr: *mut ffi::c_void);
    fn __amdgpu_util_print_stdout(s: *const ffi::c_char, size: ffi::c_int);

    // Intrinsics that return special addrspaces and therefore cannot be pure rust at the moment
    // Declared `safe`: callable without an `unsafe` block (Rust 2024 `unsafe extern`).
    safe fn __amdgpu_util_dispatch_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_queue_ptr() -> *mut ffi::c_void;
    safe fn __amdgpu_util_kernarg_segment_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_implicitarg_ptr() -> *const ffi::c_void;
}
186
/// Handle to an HSA signal.
///
/// `#[repr(C)]` wrapper around the raw 64-bit handle, matching the HSA ABI.
#[cfg(feature = "device_libs")]
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaSignal {
    /// The internal representation of an HSA signal.
    /// A value of 0 means "no signal" (see `completion_signal` usage).
    pub handle: u64,
}
195
/// HSA packet to dispatch a kernel.
///
/// A pointer to the packet that was used to dispatch the currently running kernel can be obtained with [`dispatch_ptr`].
///
/// `#[repr(C)]`: the layout must match the HSA runtime's
/// `hsa_kernel_dispatch_packet_t` exactly.
#[cfg(feature = "device_libs")]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaKernelDispatchPacket {
    /// Packet header. Used to configure multiple packet parameters such as the
    /// packet type. The parameters are described by hsa_packet_header_t.
    pub header: u16,
    /// Dispatch setup parameters. Used to configure kernel dispatch parameters
    /// such as the number of dimensions in the grid. The parameters are described
    /// by hsa_kernel_dispatch_packet_setup_t.
    pub setup: u16,
    /// X dimension of work-group, in work-items. Must be greater than 0.
    pub workgroup_size_x: u16,
    /// Y dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 dimension, the only valid value is 1.
    pub workgroup_size_y: u16,
    /// Z dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
    pub workgroup_size_z: u16,
    /// Reserved. Must be 0.
    pub reserved0: u16,
    /// X dimension of grid, in work-items. Must be greater than 0. Must
    /// not be smaller than @a workgroup_size_x.
    pub grid_size_x: u32,
    /// Y dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 dimension, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_y.
    pub grid_size_y: u32,
    /// Z dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_z.
    pub grid_size_z: u32,
    /// Size in bytes of private memory allocation request (per work-item).
    pub private_segment_size: u32,
    /// Size in bytes of group memory allocation request (per work-group). Must not
    /// be less than the sum of the group memory used by the kernel (and the
    /// functions it calls directly or indirectly) and the dynamically allocated
    /// group segment variables.
    pub group_segment_size: u32,
    /// Opaque handle to a code object that includes an implementation-defined
    /// executable code for the kernel.
    pub kernel_object: u64,
    /// Pointer to the kernel arguments.
    pub kernarg_address: *mut ffi::c_void,
    /// Reserved. Must be 0.
    pub reserved2: u64,
    /// Signal used to indicate completion of the job. The application can use the
    /// special signal handle 0 to indicate that no signal is used.
    pub completion_signal: HsaSignal,
}
249
/// Panic handler.
///
/// Prints the panic message if the `print` feature is enabled.
/// Aborts the kernel.
#[cfg(feature = "panic_handler")]
#[cfg_attr(not(feature = "print"), allow(unused_variables))]
#[panic_handler]
#[inline]
fn panic(panic_info: &core::panic::PanicInfo) -> ! {
    #[cfg(feature = "print")]
    {
        use prelude::*;
        // Identify the faulting workgroup/workitem, then append the payload:
        // workgroup x thread y panicked at …
        let (wg_x, wg_y, wg_z) = (workgroup_id_x(), workgroup_id_y(), workgroup_id_z());
        let (it_x, it_y, it_z) = (workitem_id_x(), workitem_id_y(), workitem_id_z());
        println!(
            "workgroup {},{},{} thread {},{},{} {panic_info}",
            wg_x, wg_y, wg_z, it_x, it_y, it_z
        );
    }

    core::intrinsics::abort();
}
276
/// The memory allocator of `device-libs`.
///
/// Allocates memory from the host through hostcalls to the HIP runtime in larger chunks and the subdivides them on the GPU.
///
/// Unit struct: all state lives in the helper library, not in this type.
#[cfg(feature = "alloc")]
pub struct Allocator;
282
#[cfg(feature = "alloc")]
unsafe impl GlobalAlloc for Allocator {
    // Allocate `layout.size()` bytes from the device-libs heap.
    //
    // NOTE(review): `layout.align()` is ignored here. This is only sound if
    // `__amdgpu_util_alloc` guarantees an alignment at least as large as any
    // alignment requested through this allocator — confirm against the helper
    // library's implementation.
    #[inline]
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        unsafe { __amdgpu_util_alloc(layout.size() as ffi::c_ulong) as *mut _ }
    }

    // Free a pointer previously returned by `alloc`; the layout is not needed
    // by the underlying deallocator.
    #[inline]
    unsafe fn dealloc(&self, ptr: *mut u8, _: Layout) {
        unsafe { __amdgpu_util_dealloc(ptr as *mut _) };
    }
}
295
/// The bare print function underlying the `print!` macros.
///
/// Sends a string to the host console using the `printf` support of the HIP runtime.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// amdgpu_device_libs::print("Printed on the host terminal\n");
/// # }
/// ```
#[cfg(feature = "print")]
#[inline]
pub fn print(s: &str) {
    // The helper takes a C int length; refuse strings that do not fit.
    let len: ffi::c_int = s.len().try_into().expect("String too long to print");
    // SAFETY: `s` points to `len` initialized bytes for the duration of the call.
    unsafe { __amdgpu_util_print_stdout(s.as_ptr().cast(), len) };
}
318
/// Get the packet for this dispatch.
///
/// Get a reference to the packet that was used to dispatch this kernel.
/// The dispatch packet contains information like the workgroup size and dispatch size.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # extern crate alloc;
/// # fn main() {
/// use amdgpu_device_libs::prelude::*;
///
/// let dispatch = dispatch_ptr();
/// println!("Workgroup size {}x{}x{}", dispatch.workgroup_size_x, dispatch.workgroup_size_y, dispatch.workgroup_size_z);
/// # }
/// ```
#[cfg(feature = "device_libs")]
#[inline]
pub fn dispatch_ptr() -> &'static HsaKernelDispatchPacket {
    let packet = __amdgpu_util_dispatch_ptr().cast::<HsaKernelDispatchPacket>();
    // SAFETY: the pointer is provided by the runtime; assumed valid and
    // unmodified for the whole kernel execution, justifying the 'static borrow.
    unsafe { &*packet }
}
341
/// Call a function on the host.
///
/// This allows calling functions on the CPU from the GPU.
/// `function` must be the address of a function on the CPU.
/// Up to 7 64-bit arguments can be passed; two 64-bit values are returned,
/// packed into the `u128` result.
///
/// The signature of the CPU function must be `fn(output: *mut u64, input: *const u64)`:
/// `output` points to two `u64` return slots and `input` to the seven `u64` arguments.
///
/// The `function` pointer must be passed to the GPU through some mechanism like kernel arguments.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// # let host_func = core::panic!();
/// // Get host_func from somewhere, e.g. arguments.
/// let arg0 = 42;
/// unsafe {
///     amdgpu_device_libs::call_host_function(host_func, arg0, 0, 0, 0, 0, 0, 0);
/// }
/// # }
/// ```
///
/// # Additional information
///
/// The CPU side is implemented here (`SERVICE_FUNCTION_CALL`): [ROCm/clr/rocclr/device/devhostcall.cpp](https://github.com/ROCm/clr/blob/f5b2516f5d8a44b06ad1907594db1be25a9fe57b/rocclr/device/devhostcall.cpp)
/// The GPU side here: [ROCm/llvm-project/amd/device-libs/ockl/src/services.cl](https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/amd/device-libs/ockl/src/services.cl)
#[cfg(feature = "hostcall")]
#[allow(clippy::missing_safety_doc, clippy::too_many_arguments)]
pub unsafe fn call_host_function(
    function: u64,
    arg0: u64,
    arg1: u64,
    arg2: u64,
    arg3: u64,
    arg4: u64,
    arg5: u64,
    arg6: u64,
) -> u128 {
    // Thin wrapper: forward everything to the ockl hostcall service.
    unsafe { __ockl_call_host_function(function, arg0, arg1, arg2, arg3, arg4, arg5, arg6) }
}
385
// Define here, otherwise we may get undefined symbols.
// TODO implement in LLVM?
//
// C-style `memcmp`: compares `n` bytes at `s1` and `s2` and returns 0 on
// equality, otherwise the signed difference of the first differing byte pair.
#[unsafe(no_mangle)]
#[inline]
extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
    let mut idx = 0;
    while idx < n {
        // SAFETY: the caller guarantees both pointers are valid for `n` bytes.
        let (lhs, rhs) = unsafe { (s1.add(idx).read(), s2.add(idx).read()) };
        if lhs != rhs {
            return i32::from(lhs) - i32::from(rhs);
        }
        idx += 1;
    }
    0
}
399
/// Define global allocator.
///
/// Routes all `alloc` allocations (`Vec`, `String`, `format!`, …) through
/// [`Allocator`]; gated so users can opt out and supply their own.
#[cfg(feature = "global_allocator")]
#[global_allocator]
static HEAP: Allocator = Allocator;