/**
* @brief Shared definitions for the StringZilla CUDA library.
* @file types.cuh
* @author Ash Vardanian
*
* The goal of this header is to provide an absolutely minimal set of types and forward-declarations for
* the CUDA backends of higher-level templated algorithms implemented outside of the C layer, like:
*
* - `unified_alloc` - a custom allocator that uses CUDA Unified Memory for allocation.
* - `gpu_specs_t` - a structure that contains the GPU specifications, like number of SMs, VRAM size, etc.
* - `cuda_status_t` - a composite of the CUDA status, error code, and elapsed kernel execution time.
*/
#ifndef STRINGZILLAS_TYPES_CUH_
#define STRINGZILLAS_TYPES_CUH_
#include "stringzilla/types.hpp"
#include <cuda_runtime.h> // `cudaMallocManaged`, `cudaFree`, `cudaSuccess`, `cudaGetErrorString`
#include <optional> // `std::optional`
#include <algorithm> // `std::sort`, `std::partition`, `std::find_if`
namespace ashvardanian {
namespace stringzillas {
using namespace stringzilla;
/**
* @brief A custom allocator that uses CUDA Unified Memory for allocation.
*/
template <typename value_type_>
struct unified_alloc {
using value_type = value_type_;
using pointer = value_type *;
using size_type = sz_size_t;
using difference_type = sz_ssize_t;
/* Those are needed for compatibility with our custom containers.
* @see https://en.cppreference.com/w/cpp/memory/allocator_traits
*/
using propagate_on_container_move_assignment = std::true_type;
using propagate_on_container_copy_assignment = std::false_type;
template <typename other_value_type_>
struct rebind {
using other = unified_alloc<other_value_type_>;
};
constexpr unified_alloc() noexcept = default;
constexpr unified_alloc(unified_alloc const &) noexcept = default;
template <typename other_value_type_>
constexpr unified_alloc(unified_alloc<other_value_type_> const &) noexcept {}
value_type *allocate(size_type n) const noexcept {
value_type *result = nullptr;
auto error = cudaMallocManaged((value_type **)&result, n * sizeof(value_type));
if (error != cudaSuccess) return nullptr;
return result;
}
void deallocate(pointer p, size_type) const noexcept {
if (!p) return;
cudaFree(p);
}
template <typename other_type_>
bool operator==(unified_alloc<other_type_> const &) const noexcept {
return true;
}
template <typename other_type_>
bool operator!=(unified_alloc<other_type_> const &) const noexcept {
return false;
}
};
using unified_alloc_t = unified_alloc<char>;
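// A minimal usage sketch (illustrative, not part of the library API): the allocator plugs into
// standard containers, so the same buffer is reachable from host and device code without copies.
// The `device_kernel` name below is a hypothetical placeholder for any kernel reading the buffer.
//
//     std::vector<char, unified_alloc_t> text(1024, 'a');  // Host-side fill into Unified Memory
//     device_kernel<<<1, 256>>>(text.data(), text.size()); // Device reads the same pointer
//     cudaDeviceSynchronize();                             // Sync before touching it on the host again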
/** @brief Returns `true` if the pointer refers to device-accessible memory (Device or Managed/Unified). */
inline bool is_device_accessible_memory(void const *ptr) noexcept {
if (!ptr) return true; // ? A null pointer is trivially accessible
cudaPointerAttributes attr;
cudaError_t err = cudaPointerGetAttributes(&attr, ptr);
if (err != cudaSuccess) return false;
#if defined(CUDART_VERSION) && (CUDART_VERSION >= 10000) // Modern CUDA: use `type`
return attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged;
#else // Legacy CUDA: `memoryType` and `isManaged`
return attr.memoryType == cudaMemoryTypeDevice || attr.isManaged;
#endif
}
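// A possible guard before launching kernels on user-provided buffers (illustrative only;
// `text_ptr` is a hypothetical argument):
//
//     if (!is_device_accessible_memory(text_ptr)) { /* fall back to a host-side code path */ }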
struct cuda_status_t {
status_t status = status_t::success_k;
cudaError_t cuda_error = cudaSuccess;
float elapsed_milliseconds = 0.0;
inline operator status_t() const noexcept { return status; }
};
inline cuda_status_t gpu_specs_fetch(gpu_specs_t &specs, int device_id = 0) noexcept {
cudaDeviceProp prop;
cudaError_t cuda_error = cudaGetDeviceProperties(&prop, device_id);
// Distinguish between "no GPU available" vs other CUDA errors for clearer handling upstream.
if (cuda_error != cudaSuccess) {
status_t status = status_t::unknown_k;
if (cuda_error == cudaErrorNoDevice) status = status_t::missing_gpu_k;
if (cuda_error == cudaErrorInvalidDevice) status = status_t::missing_gpu_k;
if (cuda_error == cudaErrorInsufficientDriver) status = status_t::missing_gpu_k;
return {status, cuda_error};
}
// Set the GPU specs
specs.streaming_multiprocessors = prop.multiProcessorCount;
specs.constant_memory_bytes = prop.totalConstMem;
specs.vram_bytes = prop.totalGlobalMem;
specs.warp_size = prop.warpSize;
// Infer other global settings that CUDA doesn't expose directly
specs.shared_memory_bytes = prop.sharedMemPerMultiprocessor * prop.multiProcessorCount;
specs.sm_code = gpu_specs_t::pack_sm_code(prop.major, prop.minor);
specs.cuda_cores = gpu_specs_t::cores_per_multiprocessor(specs.sm_code) * specs.streaming_multiprocessors;
// Scheduling-related constants
specs.max_blocks_per_multiprocessor = prop.maxBlocksPerMultiProcessor;
specs.reserved_memory_per_block = prop.reservedSharedMemPerBlock;
return {status_t::success_k, cudaSuccess};
}
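// A typical call pattern (sketch): fetch the specs once at startup and branch on the status.
//
//     gpu_specs_t specs;
//     cuda_status_t fetched = gpu_specs_fetch(specs);
//     if (fetched.status == status_t::missing_gpu_k) { /* fall back to CPU backends */ }
//     else if (fetched.status != status_t::success_k) { /* report `cudaGetErrorString(fetched.cuda_error)` */ }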
class cuda_executor_t {
cudaStream_t stream_ = 0;
int device_id_ = 0;
public:
constexpr cuda_executor_t() noexcept = default;
constexpr cuda_executor_t(cuda_executor_t const &) noexcept = default;
constexpr cuda_executor_t &operator=(cuda_executor_t const &) noexcept = default;
cuda_status_t try_scheduling(int device_id) noexcept {
device_id_ = -1; // ? Invalid device ID
cudaError_t switching_error = cudaSetDevice(device_id);
if (switching_error != cudaSuccess) return {status_t::unknown_k, switching_error};
cudaError_t creation_error = cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking);
if (creation_error != cudaSuccess) return {status_t::unknown_k, creation_error};
device_id_ = device_id;
return {status_t::success_k, cudaSuccess};
}
explicit operator bool() const noexcept { return device_id_ >= 0; }
inline cudaStream_t stream() const noexcept { return stream_; }
inline int device_id() const noexcept { return device_id_; }
};
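// A minimal scheduling sketch (illustrative): bind the executor to a device and reuse its
// non-blocking stream for kernel launches; `some_kernel`, `grid`, and `block` are hypothetical.
//
//     cuda_executor_t executor;
//     if (executor.try_scheduling(0).status != status_t::success_k) { /* no usable GPU, bail out */ }
//     some_kernel<<<grid, block, 0, executor.stream()>>>(/* ... */);
//     cudaStreamSynchronize(executor.stream());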
/**
* @brief Loads 32 bits from an unaligned address using the well-known @b `prmt` trick.
* @see https://stackoverflow.com/a/40198552/2766161
*/
__forceinline__ __device__ sz_u32_vec_t sz_u32_load_unaligned(void const *ptr) noexcept {
// In reality we load 64 bits, and then, with `.f4e`, we forward-extract
// four consecutive bytes into a 32-bit register.
sz_u32_vec_t result;
asm("{\n\t"
" .reg .b64 aligned_ptr;\n\t"
" .reg .b32 low, high, alignment;\n\t"
" and.b64 aligned_ptr, %1, 0xfffffffffffffffc;\n\t"
" ld.u32 low, [aligned_ptr];\n\t"
" ld.u32 high, [aligned_ptr+4];\n\t"
" cvt.u32.u64 alignment, %1;\n\t"
" prmt.b32.f4e %0, low, high, alignment;\n\t"
"}"
: "=r"(result.u32)
: "l"(ptr));
return result;
}
/** @brief Number of threads per warp on the GPU. */
enum warp_size_t : unsigned {
warp_size_nvidia_k = 32, // ? NVIDIA GPUs use 32 threads per warp
warp_size_amd_k = 64, // ? AMD GPUs use 64 threads per wave
};
/**
* @brief Defines the upper bound on the number of warps per multiprocessor we may theoretically
* be able to run as part of one or many blocks. Generally this number depends on the amount
* of shared memory available on the device, and the amount of reserved memory per block.
*/
enum warp_tasks_density_t : unsigned {
warps_working_together_k = 0,
one_warp_per_multiprocessor_k = 1,
two_warps_per_multiprocessor_k = 2,
four_warps_per_multiprocessor_k = 4,
eight_warps_per_multiprocessor_k = 8,
sixteen_warps_per_multiprocessor_k = 16,
thirty_two_warps_per_multiprocessor_k = 32,
sixty_four_warps_per_multiprocessor_k = 64,
infinite_warps_per_multiprocessor_k = 0xFFFFFFFF
};
inline warp_tasks_density_t warp_tasks_density(size_t task_memory_requirement, gpu_specs_t const &specs) noexcept {
std::initializer_list<warp_tasks_density_t> densities {
sixty_four_warps_per_multiprocessor_k, thirty_two_warps_per_multiprocessor_k,
sixteen_warps_per_multiprocessor_k, eight_warps_per_multiprocessor_k,
four_warps_per_multiprocessor_k, two_warps_per_multiprocessor_k,
one_warp_per_multiprocessor_k,
};
if (task_memory_requirement == 0) return infinite_warps_per_multiprocessor_k;
for (auto density : densities) {
if (density > specs.max_blocks_per_multiprocessor) continue;
size_t required_block_memory = task_memory_requirement * density + specs.reserved_memory_per_block * density;
if (required_block_memory < specs.shared_memory_per_multiprocessor()) return density;
}
return warps_working_together_k;
}
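// A rough worked example (given `specs` populated by `gpu_specs_fetch`, and assuming Hopper-class
// numbers: ~228 KB of shared memory per SM, ~1 KB reserved per block, up to 32 blocks per SM):
// a task needing 12 KB of shared memory fits 16 copies per multiprocessor, since 16 * (12 + 1) KB < 228 KB,
// so the call below would return `sixteen_warps_per_multiprocessor_k`; a 200 KB task degrades to
// `one_warp_per_multiprocessor_k`, and anything that doesn't fit at all maps to `warps_working_together_k`.
//
//     warp_tasks_density_t density = warp_tasks_density(12 * 1024, specs);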
struct speculative_warp_tasks_density_t {
warp_tasks_density_t density = warps_working_together_k;
size_t speculative_factor = 0;
};
/**
* @brief Multiple warps can run concurrently on the same multiprocessor, which helps hide the latency
* of memory operations. That only happens if there is enough shared memory, so we may want to reduce
* the task density in proportion to the speculation factor chosen for the current GPU.
*/
inline speculative_warp_tasks_density_t speculation_friendly_density(warp_tasks_density_t maximum_density) noexcept {
// if (maximum_density >= 16) return {static_cast<warp_tasks_density_t>(maximum_density / 16), 16};
// if (maximum_density >= 8) return {static_cast<warp_tasks_density_t>(maximum_density / 8), 8};
if (maximum_density >= 4) return {static_cast<warp_tasks_density_t>(maximum_density / 4), 4};
if (maximum_density >= 2) return {static_cast<warp_tasks_density_t>(maximum_density / 2), 2};
return {maximum_density, 1};
}
template <typename task_type_>
struct warp_tasks_groups {
span<task_type_> device_level_tasks;
span<task_type_> warp_level_tasks;
span<task_type_> empty_tasks;
};
/**
* Let's say you have a list of GPU tasks of a similar nature, but each requires a different amount of
* shared memory for efficiency. Ideally, we want each warp to receive its own task (independent of the
* others), and to have enough shared memory for more than one warp per multiprocessor.
*
* To optimize the scheduling we will look into several factors of individual tasks.
*
* @p `tasks[i].bytes_per_cell` - defines the actual kernel template to be used and serves as the natural boundary,
* as different kernels *kind of* can't run concurrently.
*
* Strictly speaking, that isn't true: on paper, several CUDA kernels can run concurrently on the same physical
* device, which makes sense given the Multi-Instance GPU (MIG) virtualization feature on such devices. But even
* when pulling `cudaGetDeviceProperties` and logging @b `cudaDeviceProp::concurrentKernels` for an H200 GPU with
* 140 GB of VRAM, it states 1. 🤯
*
* @p `tasks[i].density` - the max number of warps per multiprocessor executing simultaneously. It's inferred from
* the amount of shared memory supported by the device, which is on the order of 100-200 KB, minus the reserved memory per block.
*
* Theoretically that's true, but in practice, to address more than 48 KB of shared memory from a single block,
* you need to override the @b `cudaFuncAttributeMaxDynamicSharedMemorySize` setting for each individual kernel,
* also subtracting the reserved memory for the expected number of blocks. 🤯
* Moreover, calling @b `cudaFuncSetAttribute` is a synchronous operation, so say goodbye to asynchronous kernel
* launches and latency hiding. To reduce the overall runtime, we thus need as few individual kernel launches
* as possible.
*
* @p `specs.shared_memory_per_multiprocessor()` and @p `specs.reserved_memory_per_block` - our memory limits.
* @p `specs.streaming_multiprocessors` - the actual number of physical multiprocessors on the device. It defines
* the reasonable lower bound for a single kernel launch. On H100 it would be 132, so if a particular group of inputs
* can have 2 simultaneous warps per multiprocessor, but has fewer than (132 / 2 = 66) blocks, it would be better to
* merge it with a more memory-hungry group to avoid the overhead of additional kernel launches.
*
* It's possible to have a very memory-efficient task that allows for 32 warps per multiprocessor, in which case it
* would be best to schedule them with the largest possible block size - 1024 threads.
*
* @throws `std::bad_alloc` if the `std::sort` or `std::partition` fails to allocate memory.
*
* @post The @p tasks are sorted in-place, first containing the returned number of device-wide tasks,
* and then the warp-wide tasks grouped by ( @p bytes_per_cell, @p density ) pairs. Use @b `group_by`
* to navigate the output.
*/
template <typename task_type_>
warp_tasks_groups<task_type_> warp_tasks_grouping(span<task_type_> tasks, gpu_specs_t const &specs) noexcept(false) {
using task_t = task_type_;
warp_tasks_groups<task_t> result;
// Determine if there are tasks that require the whole device memory.
size_t const device_level_tasks =
std::partition(tasks.begin(), tasks.end(),
[](task_t const &task) { return task.density == warps_working_together_k; }) -
tasks.begin();
// Determine the number of empty tasks and put them aside.
auto const warp_tasks_begin = tasks.begin() + device_level_tasks;
size_t const non_empty_tasks =
std::partition(warp_tasks_begin, tasks.end(),
[](task_t const &task) { return task.density != infinite_warps_per_multiprocessor_k; }) -
warp_tasks_begin;
// The remaining tasks will be sorted from smallest memory consumption to largest.
auto const warp_tasks_end = warp_tasks_begin + non_empty_tasks;
std::sort(warp_tasks_begin, warp_tasks_end, [](task_t const &lhs, task_t const &rhs) {
return lhs.bytes_per_cell == rhs.bytes_per_cell ? lhs.density < rhs.density
: lhs.bytes_per_cell < rhs.bytes_per_cell;
});
result.device_level_tasks = {tasks.begin(), warp_tasks_begin};
result.warp_level_tasks = {warp_tasks_begin, warp_tasks_end};
result.empty_tasks = {warp_tasks_end, tasks.end()};
// The naive next step would be to simply group them by their memory requirements,
// but our high-level goal isn't maximum utilization of the GPU, but rather
// the fastest execution time. Given the scheduling & synchronization costs,
// we may want to combine consecutive groups of tasks to ensure they are large enough.
size_t tasks_remaining = result.warp_level_tasks.size();
while (tasks_remaining > 1) { // 1 task or less ~ nothing to merge
size_t const first_task_index = device_level_tasks + (result.warp_level_tasks.size() - tasks_remaining); // Index into the full `tasks` span
task_t &indicative_task = tasks[first_task_index];
size_t const tasks_with_same_density =
std::find_if(&indicative_task, result.warp_level_tasks.end(),
[&](task_t const &task) {
return task.bytes_per_cell != indicative_task.bytes_per_cell ||
task.density != indicative_task.density;
}) -
&indicative_task;
size_t const following_tasks = tasks_remaining - tasks_with_same_density;
if (!following_tasks) break; // No more tasks to merge
// Check if we have enough tasks to keep all the warps busy.
size_t const possible_warps = size_t(indicative_task.density) * specs.streaming_multiprocessors;
if (tasks_with_same_density > possible_warps) {
tasks_remaining -= tasks_with_same_density;
continue; // Jump to the next group
}
// If the next task has a different cell size, we can't merge them.
task_t &next_indicative_task = tasks[first_task_index + tasks_with_same_density];
if (indicative_task.bytes_per_cell != next_indicative_task.bytes_per_cell) {
tasks_remaining -= tasks_with_same_density;
continue; // Jump to the next group
}
// Update all the operations in the current group to have the same "sparser" density
// as the next group.
for (size_t i = 0; i < tasks_with_same_density; ++i) {
task_t &task = tasks[first_task_index + i];
task.density = next_indicative_task.density;
}
}
return result;
}
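// A minimal calling sketch (the `my_task_t` type and `make_tasks` factory are hypothetical; only the
// `bytes_per_cell` and `density` fields are required by this function, and the `span` is assumed to be
// constructible from a pointer and a length):
//
//     struct my_task_t {
//         size_t bytes_per_cell;        // Selects the kernel template
//         warp_tasks_density_t density; // Pre-computed via `warp_tasks_density`
//     };
//     std::vector<my_task_t, unified_alloc<my_task_t>> tasks = make_tasks();
//     warp_tasks_groups<my_task_t> groups = warp_tasks_grouping<my_task_t>({tasks.data(), tasks.size()}, specs);
//     // `groups.device_level_tasks` run one at a time across the whole device,
//     // `groups.warp_level_tasks` are grouped by (bytes_per_cell, density) for batched launches,
//     // `groups.empty_tasks` need no GPU work at all.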
} // namespace stringzillas
} // namespace ashvardanian
#endif // STRINGZILLAS_TYPES_CUH_