amdgpu-device-libs 0.1.0

Support library for the amdgpu target, linking to device-libs
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
// Update:
// cargo readme > README.md
// cargo readme --template ../README.tpl > ../README.md

//! Support library for the amdgpu target.
//!
//! By default, the amdgpu target supports `core`, but not `std`.
//! `alloc` is supported when a global allocator is specified.
//!
//! `amdgpu-device-libs` brings some std-like features to the amdgpu target:
//!
//! - `print!()` and `println!()` macros for printing on the host stdout
//! - A global allocator to support `alloc`
//! - A panic handler
//! - Access to more intrinsics and device-libs functions
//!
//! All these features are enabled by default, but can be turned on selectively with `default-features = false, features = […]`.
//!
//! `amdgpu-device-libs` works by linking to the [ROCm device-libs](https://github.com/ROCm/llvm-project/tree/amd-staging/amd/device-libs) and a pre-compiled helper library.
//! The libraries are linked from a ROCm installation.
//! To make sure the libraries are found, set the environment variable `ROCM_PATH` or `ROCM_DEVICE_LIB_PATH` (higher priority if it is set).
//! It looks for `amdgcn/bitcode/*.bc` files in this path.
//!
//! ## Usage
//!
//! Create a new cargo library project and change it to compile a cdylib:
//! ```toml
//! # Cargo.toml
//! # Force lto
//! [profile.dev]
//! lto = true
//! [profile.release]
//! lto = true
//!
//! [lib]
//! # Compile a cdylib
//! crate-type = ["cdylib"]
//!
//! [build-dependencies]
//! # Used in build script to specify linker flags and link in device-libs
//! amdgpu-device-libs-build = { path = "../../amdgpu-device-libs-build" }
//!
//! [dependencies]
//! amdgpu-device-libs = { path = "../../amdgpu-device-libs" }
//! ```
//!
//! Add extra flags in `.cargo/config.toml`:
//! ```toml
//! # .cargo/config.toml
//! [build]
//! target = "amdgcn-amd-amdhsa"
//! # Enable linker-plugin-lto and workarounds
//! # Either add -Ctarget-cpu=gfx<version> here or specify it in CARGO_BUILD_RUSTFLAGS='-Ctarget-cpu=gfx<version>'
//! rustflags = ["-Clinker-plugin-lto", "-Zemit-thin-lto=no"]
//!
//! [unstable]
//! build-std = ["core", "alloc"]
//! ```
//!
//! And add a `build.rs` build script that links to the required libraries:
//! ```rust,ignore
//! // build.rs
//! fn main() {
//!     amdgpu_device_libs_build::build();
//! }
//! ```
//!
//! ## Example
//!
//! Minimal usage sample, see [`examples/println`](https://github.com/Flakebi/amdgpu-rs/tree/main/examples/println) for the full code.
//! ```rust
//! #![feature(abi_gpu_kernel)]
//! #![no_std]
//!
//! extern crate alloc;
//!
//! use alloc::vec::Vec;
//!
//! use amdgpu_device_libs::prelude::*;
//!
//! #[unsafe(no_mangle)]
//! pub extern "gpu-kernel" fn kernel(output: *mut u32) {
//!     let wg_id = workgroup_id_x();
//!     let id = workitem_id_x();
//!     let dispatch = dispatch_ptr();
//!     let complete_id = wg_id as usize * dispatch.workgroup_size_x as usize + id as usize;
//!     
//!     println!("Hello world from the GPU! (thread {wg_id}-{id})");
//!     
//!     let mut v = Vec::<u32>::new();
//!     for i in 0..100 {
//!         v.push(100 + i);
//!     }
//!     
//!     unsafe {
//!         *output.add(complete_id) = v[complete_id];
//!     }
//! }
//! ```
#![deny(missing_docs)]
#![allow(internal_features)]
#![feature(core_intrinsics, link_llvm_intrinsics)]
#![no_std]

#[cfg(feature = "alloc")]
extern crate alloc;

#[cfg(feature = "alloc")]
use core::alloc::{GlobalAlloc, Layout};
use core::ffi;

/// Prints to the standard output.
///
/// Formats all arguments to [`format!`](alloc::format!).
#[cfg(feature = "print")]
#[macro_export]
macro_rules! print {
    ($($arg:tt)*) => {
        $crate::print(::alloc::format!($($arg)*));
    };
}

/// Prints to the standard output, with a newline.
///
/// Formats all arguments to [`format!`](alloc::format!) and appends a `\n`.
#[cfg(feature = "print")]
#[macro_export]
macro_rules! println {
    ($($arg:tt)*) => {
        let mut s = ::alloc::format!($($arg)*);
        s.push('\n');
        $crate::print(&s);
    };
}

pub mod intrinsics;

/// Prelude for functions that are generally useful when writing kernels.
///
/// Use as
/// ```rust
/// # #![no_std]
/// use amdgpu_device_libs::prelude::*;
/// ```
///
/// Contains `print!`, `println!`, intrinsics to get workitem and workgroup id among others.
pub mod prelude {
    #[cfg(feature = "device_libs")]
    pub use crate::dispatch_ptr;
    pub use crate::intrinsics::{
        s_barrier, workgroup_id_x, workgroup_id_y, workgroup_id_z, workitem_id_x, workitem_id_y,
        workitem_id_z,
    };
    #[cfg(feature = "print")]
    pub use print;
    #[cfg(feature = "print")]
    pub use println;
}

#[cfg(feature = "device_libs")]
unsafe extern "C" {
    #[cfg(feature = "hostcall")]
    #[allow(improper_ctypes)]
    fn __ockl_call_host_function(
        fptr: ffi::c_ulong,
        arg0: ffi::c_ulong,
        arg1: ffi::c_ulong,
        arg2: ffi::c_ulong,
        arg3: ffi::c_ulong,
        arg4: ffi::c_ulong,
        arg5: ffi::c_ulong,
        arg6: ffi::c_ulong,
    ) -> u128;

    // Functions implemented in HIP
    fn __amdgpu_util_alloc(size: ffi::c_ulong) -> *mut ffi::c_void;
    fn __amdgpu_util_dealloc(addr: *mut ffi::c_void);
    fn __amdgpu_util_print_stdout(s: *const ffi::c_char, size: ffi::c_int);

    // Intrinsics that return special addrspaces and therefore cannot be pure rust at the moment
    safe fn __amdgpu_util_dispatch_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_queue_ptr() -> *mut ffi::c_void;
    safe fn __amdgpu_util_kernarg_segment_ptr() -> *const ffi::c_void;
    safe fn __amdgpu_util_implicitarg_ptr() -> *const ffi::c_void;
}

/// Handle to an HSA signal.
#[cfg(feature = "device_libs")]
#[derive(Copy, Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaSignal {
    /// The internal representation of an HSA signal.
    pub handle: u64,
}

/// HSA packet to dispatch a kernel.
///
/// A pointer to the packet that was used to dispatch the currently running kernel can be obtained with [`dispatch_ptr`].
#[cfg(feature = "device_libs")]
#[derive(Clone, Eq, PartialEq, Ord, PartialOrd, Hash, Debug)]
#[repr(C)]
pub struct HsaKernelDispatchPacket {
    /// Packet header. Used to configure multiple packet parameters such as the
    /// packet type. The parameters are described by hsa_packet_header_t.
    pub header: u16,
    /// Dispatch setup parameters. Used to configure kernel dispatch parameters
    /// such as the number of dimensions in the grid. The parameters are described
    /// by hsa_kernel_dispatch_packet_setup_t.
    pub setup: u16,
    /// X dimension of work-group, in work-items. Must be greater than 0.
    pub workgroup_size_x: u16,
    /// Y dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 dimension, the only valid value is 1.
    pub workgroup_size_y: u16,
    /// Z dimension of work-group, in work-items. Must be greater than
    /// 0. If the grid has 1 or 2 dimensions, the only valid value is 1.
    pub workgroup_size_z: u16,
    /// Reserved. Must be 0.
    pub reserved0: u16,
    /// X dimension of grid, in work-items. Must be greater than 0. Must
    /// not be smaller than @a workgroup_size_x.
    pub grid_size_x: u32,
    /// Y dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 dimension, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_y.
    pub grid_size_y: u32,
    /// Z dimension of grid, in work-items. Must be greater than 0. If the grid has
    /// 1 or 2 dimensions, the only valid value is 1. Must not be smaller than @a
    /// workgroup_size_z.
    pub grid_size_z: u32,
    /// Size in bytes of private memory allocation request (per work-item).
    pub private_segment_size: u32,
    /// Size in bytes of group memory allocation request (per work-group). Must not
    /// be less than the sum of the group memory used by the kernel (and the
    /// functions it calls directly or indirectly) and the dynamically allocated
    /// group segment variables.
    pub group_segment_size: u32,
    /// Opaque handle to a code object that includes an implementation-defined
    /// executable code for the kernel.
    pub kernel_object: u64,
    /// Pointer to the kernel arguments.
    pub kernarg_address: *mut ffi::c_void,
    /// Reserved. Must be 0.
    pub reserved2: u64,
    /// Signal used to indicate completion of the job. The application can use the
    /// special signal handle 0 to indicate that no signal is used.
    pub completion_signal: HsaSignal,
}

/// Panic handler.
///
/// Prints the panic message if the `print` feature is enabled.
/// Aborts the kernel.
#[cfg(feature = "panic_handler")]
#[cfg_attr(not(feature = "print"), allow(unused_variables))]
#[panic_handler]
#[inline]
fn panic(panic_info: &core::panic::PanicInfo) -> ! {
    #[cfg(feature = "print")]
    {
        use prelude::*;
        // workgroup x thread y panicked at …
        println!(
            "workgroup {},{},{} thread {},{},{} {panic_info}",
            workgroup_id_x(),
            workgroup_id_y(),
            workgroup_id_z(),
            workitem_id_x(),
            workitem_id_y(),
            workitem_id_z()
        );
    }

    core::intrinsics::abort();
}

/// The memory allocator of `device-libs`.
///
/// Allocates memory from the host through hostcalls to the HIP runtime in larger chunks and the subdivides them on the GPU.
#[cfg(feature = "alloc")]
pub struct Allocator;

#[cfg(feature = "alloc")]
unsafe impl GlobalAlloc for Allocator {
    #[inline]
    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
        unsafe { __amdgpu_util_alloc(layout.size() as ffi::c_ulong) as *mut _ }
    }

    #[inline]
    unsafe fn dealloc(&self, ptr: *mut u8, _: Layout) {
        unsafe { __amdgpu_util_dealloc(ptr as *mut _) };
    }
}

/// The bare print function underlying the `print!` macros.
///
/// Sends a string to the host console using the `printf` support of the HIP runtime.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// amdgpu_device_libs::print("Printed on the host terminal\n");
/// # }
/// ```
#[cfg(feature = "print")]
#[inline]
pub fn print(s: &str) {
    unsafe {
        __amdgpu_util_print_stdout(
            s.as_ptr() as *const ffi::c_char,
            s.len().try_into().expect("String too long to print"),
        );
    }
}

/// Get the packet for this dispatch.
///
/// Get a reference to the packet that was used to dispatch this kernel.
/// The dispatch packet contains information like the workgroup size and dispatch size.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # extern crate alloc;
/// # fn main() {
/// use amdgpu_device_libs::prelude::*;
///
/// let dispatch = dispatch_ptr();
/// println!("Workgroup size {}x{}x{}", dispatch.workgroup_size_x, dispatch.workgroup_size_y, dispatch.workgroup_size_z);
/// # }
/// ```
#[cfg(feature = "device_libs")]
#[inline]
pub fn dispatch_ptr() -> &'static HsaKernelDispatchPacket {
    unsafe { &*(__amdgpu_util_dispatch_ptr() as *const HsaKernelDispatchPacket) }
}

/// Call a function on the host.
///
/// This allows calling functions on the CPU from the GPU.
/// `function` must be the address of a function on the CPU.
/// Up to 7 64-bit arguments can be passed and two 64-bit values are returned.
///
/// The signature of the CPU function must be `fn(output: *mut u64, input: *const u64)`.
/// `output` points to two `u64` values for the return value and `input` points to seven `u64` for the function arguments.
///
/// The `function` pointer must be passed to the GPU through some mechanism like kernel arguments.
///
/// # Example
///
/// ```rust
/// # #![no_std]
/// # fn main() {
/// # let host_func = core::panic!();
/// // Get host_func from somewhere, e.g. arguments.
/// let arg0 = 42;
/// unsafe {
///     amdgpu_device_libs::call_host_function(host_func, arg0, 0, 0, 0, 0, 0, 0);
/// }
/// # }
/// ```
///
/// # Additional information
///
/// The CPU side is implemented here (`SERVICE_FUNCTION_CALL`): [ROCm/clr/rocclr/device/devhostcall.cpp](https://github.com/ROCm/clr/blob/f5b2516f5d8a44b06ad1907594db1be25a9fe57b/rocclr/device/devhostcall.cpp)  
/// The GPU side here: [ROCm/llvm-project/amd/device-libs/ockl/src/services.cl](https://github.com/ROCm/llvm-project/blob/656552edc693e2bb4abc9258399c39d190fce2b3/amd/device-libs/ockl/src/services.cl)
#[cfg(feature = "hostcall")]
#[allow(clippy::missing_safety_doc, clippy::too_many_arguments)]
pub unsafe fn call_host_function(
    function: u64,
    arg0: u64,
    arg1: u64,
    arg2: u64,
    arg3: u64,
    arg4: u64,
    arg5: u64,
    arg6: u64,
) -> u128 {
    unsafe { __ockl_call_host_function(function, arg0, arg1, arg2, arg3, arg4, arg5, arg6) }
}

// Define here, otherwise we may get undefined symbols.
// TODO implement in LLVM?
#[unsafe(no_mangle)]
#[inline]
extern "C" fn memcmp(s1: *const u8, s2: *const u8, n: usize) -> i32 {
    for i in 0..n {
        let diff = unsafe { i32::from(s1.add(i).read()) - i32::from(s2.add(i).read()) };
        if diff != 0 {
            return diff;
        }
    }
    0
}

/// Define global allocator.
#[cfg(feature = "global_allocator")]
#[global_allocator]
static HEAP: Allocator = Allocator;