#pragma once
#include <unistd.h>
#include <cstdio>
#undef CEILING
#define CEILING(x, y) (((x) + (y)-1) / (y))
#define CUDA_SAFECALL(call) \
{ \
call; \
cudaError err = cudaGetLastError(); \
if (cudaSuccess != err) { \
fprintf(stderr, \
"Cuda error in function '%s' file '%s' in line %i : %s.\n", \
#call, __FILE__, __LINE__, cudaGetErrorString(err)); \
fflush(stderr); \
_exit(EXIT_FAILURE); \
} \
}
__device__ __forceinline__ unsigned int get_smid(void) {
unsigned int ret;
asm("mov.u32 %0, %smid;" : "=r"(ret));
return ret;
}
__device__ __forceinline__ unsigned int get_warpid(void) {
unsigned int ret;
asm("mov.u32 %0, %warpid;" : "=r"(ret));
return ret;
}
__device__ __forceinline__ unsigned int get_laneid(void) {
unsigned int laneid;
asm volatile("mov.u32 %0, %laneid;" : "=r"(laneid));
return laneid;
}
__device__ __forceinline__ int get_global_warp_id() {
int block_id =
blockIdx.x + blockIdx.y * gridDim.x + gridDim.x * gridDim.y * blockIdx.z;
int l_thread_id = (threadIdx.z * (blockDim.x * blockDim.y)) +
(threadIdx.y * blockDim.x) + threadIdx.x;
int l_warp_id = l_thread_id / 32;
int n_warps = CEILING(blockDim.x * blockDim.y * blockDim.z, 32);
int g_warp_id = block_id * n_warps + l_warp_id;
return g_warp_id;
}
__device__ __forceinline__ int4 get_ctaid(void) {
int4 ret;
asm("mov.u32 %0, %ctaid.x;" : "=r"(ret.x));
asm("mov.u32 %0, %ctaid.y;" : "=r"(ret.y));
asm("mov.u32 %0, %ctaid.z;" : "=r"(ret.z));
return ret;
}
__device__ __forceinline__ int4 get_nctaid(void) {
int4 ret;
asm("mov.u32 %0, %nctaid.x;" : "=r"(ret.x));
asm("mov.u32 %0, %nctaid.y;" : "=r"(ret.y));
asm("mov.u32 %0, %nctaid.z;" : "=r"(ret.z));
return ret;
}
__device__ __forceinline__ void csleep(uint64_t clock_count) {
if (clock_count == 0)
return;
clock_t start_clock = clock64();
clock_t clock_offset = 0;
while (clock_offset < clock_count) {
clock_offset = clock64() - start_clock;
}
}
class Managed {
public:
void *operator new(size_t len) {
printf("managed: malloc\n");
void *ptr;
cudaMallocManaged(&ptr, len);
return ptr;
}
void operator delete(void *ptr) {
printf("managed: free\n");
cudaFree(ptr);
}
void *operator new[](size_t len) {
printf("managed: malloc\n");
void *ptr;
cudaMallocManaged(&ptr, len);
return ptr;
}
void operator delete[](void *ptr) {
printf("managed: free\n");
cudaFree(ptr);
}
};