// Copyright Supranational LLC
// Licensed under the Apache License, Version 2.0, see LICENSE for details.
// SPDX-License-Identifier: Apache-2.0
#ifndef __SPPARK_UTIL_GPU_T_CUH__
#define __SPPARK_UTIL_GPU_T_CUH__
#ifndef __CUDACC__
# include <cuda_runtime.h>
#endif
#include "thread_pool_t.hpp"
#include "exception.cuh"
#include "slice_t.hpp"
#ifndef WARP_SZ
# define WARP_SZ 32
#endif
class gpu_t;
size_t ngpus();
const gpu_t& select_gpu(int id = 0);
const cudaDeviceProp& gpu_props(int id = 0);
const std::vector<const gpu_t*>& all_gpus();
extern "C" bool cuda_available();
class event_t {
cudaEvent_t event;
public:
event_t() : event(nullptr)
{ CUDA_UNWRAP_SPPARK(cudaEventCreate(&event, cudaEventDisableTiming)); }
event_t(cudaStream_t stream) : event(nullptr)
{
CUDA_UNWRAP_SPPARK(cudaEventCreate(&event, cudaEventDisableTiming));
CUDA_UNWRAP_SPPARK(cudaEventRecord(event, stream));
}
~event_t()
{ if (event) cudaEventDestroy(event); }
inline operator decltype(event)() const
{ return event; }
inline void record(cudaStream_t stream)
{ CUDA_UNWRAP_SPPARK(cudaEventRecord(event, stream)); }
inline void wait(cudaStream_t stream)
{ CUDA_UNWRAP_SPPARK(cudaStreamWaitEvent(stream, event)); }
};
struct launch_params_t {
dim3 gridDim, blockDim;
size_t shared;
launch_params_t(dim3 g, dim3 b, size_t sz = 0) : gridDim(g), blockDim(b), shared(sz) {}
launch_params_t(int g, int b, size_t sz = 0) : gridDim(g>0 ? g : 0), blockDim(b), shared(sz) {}
launch_params_t(unsigned int g, unsigned int b, size_t sz = 0) : gridDim(g), blockDim(b), shared(sz) {}
};
class stream_t {
cudaStream_t stream;
const int gpu_id;
public:
stream_t(int id) : gpu_id(id)
{ CUDA_UNWRAP_SPPARK(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking)); }
~stream_t()
{ cudaStreamDestroy(stream); }
inline operator decltype(stream)() const { return stream; }
inline int id() const { return gpu_id; }
inline operator int() const { return gpu_id; }
inline int sm_count() const
{ return gpu_props(gpu_id).multiProcessorCount; }
inline void* Dmalloc(size_t sz) const
{ void *d_ptr;
CUDA_UNWRAP_SPPARK(cudaMallocAsync(&d_ptr, sz, stream));
return d_ptr;
}
inline void Dfree(void* d_ptr) const
{ CUDA_UNWRAP_SPPARK(cudaFreeAsync(d_ptr, stream)); }
template<typename T>
inline void HtoD(T* dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ if (sz == sizeof(T))
CUDA_UNWRAP_SPPARK(cudaMemcpyAsync(dst, src, nelems*sizeof(T),
cudaMemcpyHostToDevice, stream));
else
CUDA_UNWRAP_SPPARK(cudaMemcpy2DAsync(dst, sizeof(T), src, sz,
std::min(sizeof(T), sz), nelems,
cudaMemcpyHostToDevice, stream));
}
template<typename T>
inline void HtoD(T& dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ HtoD(&dst, src, nelems, sz); }
template<typename T, typename U>
inline void HtoD(T& dst, const std::vector<U>& src,
size_t sz = sizeof(T)) const
{ HtoD(&dst, &src[0], src.size(), sz); }
template<typename T, typename U>
inline void HtoD(T* dst, const std::vector<U>& src,
size_t sz = sizeof(T)) const
{ HtoD(dst, &src[0], src.size(), sz); }
template<typename T, typename U>
inline void HtoD(T& dst, slice_t<U> src, size_t sz = sizeof(T)) const
{ HtoD(&dst, &src[0], src.size(), sz); }
template<typename T, typename U>
inline void HtoD(T* dst, slice_t<U> src, size_t sz = sizeof(T)) const
{ HtoD(dst, &src[0], src.size(), sz); }
template<typename... Types>
inline void launch_coop(void(*f)(Types...), dim3 gridDim, dim3 blockDim,
size_t shared_sz,
Types... args) const
{
if (gpu_props(gpu_id).sharedMemPerBlock < shared_sz)
CUDA_UNWRAP_SPPARK(cudaFuncSetAttribute(f, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_sz));
if (gridDim.x == 0 || blockDim.x == 0) {
int blockSize, minGridSize;
CUDA_UNWRAP_SPPARK(cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, f));
if (blockDim.x == 0) blockDim.x = blockSize;
if (gridDim.x == 0) gridDim.x = minGridSize;
}
void* va_args[sizeof...(args)] = { &args... };
CUDA_UNWRAP_SPPARK(cudaLaunchCooperativeKernel((const void*)f, gridDim, blockDim,
va_args, shared_sz, stream));
}
template<typename... Types>
inline void launch_coop(void(*f)(Types...), const launch_params_t& lps,
Types... args) const
{ launch_coop(f, lps.gridDim, lps.blockDim, lps.shared, args...); }
template<typename T>
inline void DtoH(T* dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ if (sz == sizeof(T))
CUDA_UNWRAP_SPPARK(cudaMemcpyAsync(dst, src, nelems*sizeof(T),
cudaMemcpyDeviceToHost, stream));
else
CUDA_UNWRAP_SPPARK(cudaMemcpy2DAsync(dst, sizeof(T), src, sz,
std::min(sizeof(T), sz), nelems,
cudaMemcpyDeviceToHost, stream));
}
template<typename T>
inline void DtoH(T& dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ DtoH(&dst, src, nelems, sz); }
template<typename T>
inline void DtoH(std::vector<T>& dst, const void* src,
size_t sz = sizeof(T)) const
{ DtoH(&dst[0], src, dst.size(), sz); }
inline void sync() const
{ CUDA_UNWRAP_SPPARK(cudaStreamSynchronize(stream)); }
inline void notify(cudaHostFn_t cb, void* data)
{ CUDA_UNWRAP_SPPARK(cudaLaunchHostFunc(stream, cb, data)); }
template<class T>
inline void notify(T& sema)
{ notify([](void* s) { reinterpret_cast<T*>(s)->notify(); }, &sema); }
inline void record(cudaEvent_t event)
{ CUDA_UNWRAP_SPPARK(cudaEventRecord(event, stream)); }
inline void wait(cudaEvent_t event)
{ CUDA_UNWRAP_SPPARK(cudaStreamWaitEvent(stream, event)); }
};
class gpu_t {
public:
static const size_t FLIP_FLOP = 3;
private:
int gpu_id, cuda_id;
cudaDeviceProp prop;
size_t total_mem;
mutable stream_t zero = {gpu_id};
mutable stream_t flipflop[FLIP_FLOP] = {gpu_id, gpu_id, gpu_id};
mutable thread_pool_t pool{"SPPARK_GPU_T_AFFINITY"};
public:
gpu_t(int id, int real_id, const cudaDeviceProp& p)
: gpu_id(id), cuda_id(real_id), prop(p)
{ size_t freeMem;
CUDA_UNWRAP_SPPARK(cudaMemGetInfo(&freeMem, &total_mem));
}
inline int cid() const { return cuda_id; }
inline int id() const { return gpu_id; }
inline operator int() const { return gpu_id; }
inline const auto& props() const { return prop; }
inline int sm_count() const { return prop.multiProcessorCount; }
inline void select() const { cudaSetDevice(cuda_id); }
stream_t& operator[](size_t i) const { return flipflop[i%FLIP_FLOP]; }
inline operator stream_t&() const { return zero; }
inline operator cudaStream_t() const { return zero; }
inline size_t ncpus() const { return pool.size(); }
template<class Workable>
inline void spawn(Workable work) const { pool.spawn(work); }
template<class Workable>
inline void par_map(size_t num_items, size_t stride, Workable work,
size_t max_workers = 0) const
{ pool.par_map(num_items, stride, work, max_workers); }
inline void* Dmalloc(size_t sz) const
{ void *d_ptr = zero.Dmalloc(sz);
zero.sync();
return d_ptr;
}
inline void Dfree(void* d_ptr) const
{ zero.Dfree(d_ptr);
zero.sync();
}
template<typename T>
inline void HtoD(T* dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ zero.HtoD(dst, src, nelems, sz); }
template<typename T>
inline void HtoD(T& dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ HtoD(&dst, src, nelems, sz); }
template<typename T, typename U>
inline void HtoD(T& dst, const std::vector<U>& src,
size_t sz = sizeof(T)) const
{ HtoD(&dst, &src[0], src.size(), sz); }
template<typename... Types>
inline void launch_coop(void(*f)(Types...), dim3 gridDim, dim3 blockDim,
size_t shared_sz,
Types... args) const
{ zero.launch_coop(f, gridDim, blockDim, shared_sz,
args...);
}
template<typename... Types>
inline void launch_coop(void(*f)(Types...), const launch_params_t& lps,
Types... args) const
{ zero.launch_coop(f, lps, args...); }
template<typename T>
inline void DtoH(T* dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ zero.DtoH(dst, src, nelems, sz); }
template<typename T>
inline void DtoH(T& dst, const void* src, size_t nelems,
size_t sz = sizeof(T)) const
{ DtoH(&dst, src, nelems, sz); }
template<typename T>
inline void DtoH(std::vector<T>& dst, const void* src,
size_t sz = sizeof(T)) const
{ DtoH(&dst[0], src, dst.size(), sz); }
inline void sync() const
{
zero.sync();
for (auto& f : flipflop)
f.sync();
}
};
template<typename T> class gpu_ptr_t {
struct inner {
T* ptr;
std::atomic<size_t> ref_cnt;
int real_id;
inline inner(T* p) : ptr(p), ref_cnt(1)
{ cudaGetDevice(&real_id); }
};
inner *ptr;
public:
gpu_ptr_t() : ptr(nullptr) {}
gpu_ptr_t(T* p) { ptr = new inner(p); }
gpu_ptr_t(const gpu_ptr_t& r) { *this = r; }
~gpu_ptr_t()
{
if (ptr && ptr->ref_cnt.fetch_sub(1, std::memory_order_seq_cst) == 1) {
int current_id;
cudaGetDevice(¤t_id);
if (current_id != ptr->real_id)
cudaSetDevice(ptr->real_id);
cudaFree(ptr->ptr);
if (current_id != ptr->real_id)
cudaSetDevice(current_id);
delete ptr;
}
}
gpu_ptr_t& operator=(const gpu_ptr_t& r)
{
if (this != &r)
(ptr = r.ptr)->ref_cnt.fetch_add(1, std::memory_order_relaxed);
return *this;
}
gpu_ptr_t& operator=(gpu_ptr_t&& r) noexcept
{
if (this != &r) {
ptr = r.ptr;
r.ptr = nullptr;
}
return *this;
}
inline operator T*() const { return ptr->ptr; }
// facilitate return by value through FFI, as gpu_ptr_t<T>::by_value.
using by_value = struct { inner *ptr; };
operator by_value() const
{ ptr->ref_cnt.fetch_add(1, std::memory_order_relaxed); return {ptr}; }
gpu_ptr_t(by_value v) { ptr = v.ptr; }
};
// A simple way to allocate a temporary device pointer without having to
// care about freeing it.
template<typename T> class dev_ptr_t {
T* d_ptr;
cudaStream_t stream;
public:
dev_ptr_t(size_t nelems) : d_ptr(nullptr), stream(nullptr)
{
if (nelems) {
size_t n = (nelems+WARP_SZ-1) & ((size_t)0-WARP_SZ);
CUDA_UNWRAP_SPPARK(cudaMalloc(&d_ptr, n * sizeof(T)));
}
}
dev_ptr_t(size_t nelems, const cudaStream_t s) : d_ptr(nullptr), stream(s)
{
if (nelems) {
size_t n = (nelems+WARP_SZ-1) & ((size_t)0-WARP_SZ);
CUDA_UNWRAP_SPPARK(cudaMallocAsync(&d_ptr, n * sizeof(T), stream));
}
}
dev_ptr_t(size_t nelems, stream_t& s) : d_ptr(nullptr), stream(s)
{
if (nelems) {
size_t n = (nelems+WARP_SZ-1) & ((size_t)0-WARP_SZ);
CUDA_UNWRAP_SPPARK(cudaMallocAsync(&d_ptr, n * sizeof(T), s));
}
}
dev_ptr_t(const dev_ptr_t& r) = delete;
dev_ptr_t& operator=(const dev_ptr_t& r) = delete;
~dev_ptr_t()
{
if (d_ptr) {
if (stream) (void)cudaFreeAsync((void*)d_ptr, stream);
else (void)cudaFree((void*)d_ptr);
}
}
inline operator const T*() const { return d_ptr; }
inline operator T*() const { return d_ptr; }
inline operator void*() const { return (void*)d_ptr; }
inline const T& operator[](size_t i) const { return d_ptr[i]; }
inline T& operator[](size_t i) { return d_ptr[i]; }
};
#endif