
Constant COPY_ASYNC 

Source
pub const COPY_ASYNC: &str = r#"// Wrapper for https://docs.nvidia.com/cuda/parallel-thread-execution/#parallel-synchronization-and-communication-instructions-cp-async-mbarrier-arrive
inline __device__ void
__cp_async_arrive(::cuda::barrier<::cuda::thread_scope_block> &__bar) {
  uint64 *mbar_ptr = ::cuda::device::barrier_native_handle(__bar);

  uint32 smem_int_mbar =
      static_cast<uint32>(__cvta_generic_to_shared(mbar_ptr));
  asm volatile("cp.async.mbarrier.arrive.shared::cta.b64 [%0];\n"
               :: "r"(smem_int_mbar));
}

// Wrappers for https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async

// Only 16 byte size allows the `cg` modifier, so this is a more general version with `ca`. Executes
// a partial copy of `src_size` bytes.
template <size_t _Copy_size>
inline __device__ void
__cp_async_shared_global(void const *src_ptr, void *smem_ptr, int32 const &src_size) {
  static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16,
                "cp.async.shared.global requires a copy size of 4, 8, or 16.");

  uint64 gmem_int_desc = reinterpret_cast<uint64>(src_ptr);
  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("cp.async.ca.shared::cta.global [%0], [%1], %2, %3;\n"
               :: "r"(smem_int_ptr), "l"(gmem_int_desc), "n"(_Copy_size), "r"(src_size) : "memory");
}

// Specialized on copy size 16 to use `cg`. Executes a partial copy of `src_size` bytes.
template <>
inline __device__ void
__cp_async_shared_global<16>(void const *src_ptr, void *smem_ptr, int32 const &src_size) {
  uint64 gmem_int_desc = reinterpret_cast<uint64>(src_ptr);
  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("cp.async.cg.shared::cta.global [%0], [%1], %2, %3;\n"
               :: "r"(smem_int_ptr), "l"(gmem_int_desc), "n"(16), "r"(src_size) : "memory");
}

// Only 16 byte size allows the `cg` modifier, so this is a more general version with `ca`. Executes
// a full copy.
template <size_t _Copy_size>
inline __device__ void
__cp_async_shared_global(void const *src_ptr, void *smem_ptr) {
  static_assert(_Copy_size == 4 || _Copy_size == 8 || _Copy_size == 16,
                "cp.async.shared.global requires a copy size of 4, 8, or 16.");

  uint64 gmem_int_desc = reinterpret_cast<uint64>(src_ptr);
  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("cp.async.ca.shared::cta.global [%0], [%1], %2, %2;\n"
               :: "r"(smem_int_ptr), "l"(gmem_int_desc), "n"(_Copy_size) : "memory");
}

// Specialized on copy size 16 to use `cg`. Executes a full copy.
template <>
inline __device__ void
__cp_async_shared_global<16>(void const *src_ptr, void *smem_ptr) {
  uint64 gmem_int_desc = reinterpret_cast<uint64>(src_ptr);
  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));
  asm volatile("cp.async.cg.shared::cta.global [%0], [%1], %2, %2;\n"
               :: "r"(smem_int_ptr), "l"(gmem_int_desc), "n"(16) : "memory");
}"#;
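
The sketch below shows one way these wrappers are typically combined with a block-scoped cuda::barrier: each thread issues a 16-byte cp.async copy into shared memory, registers its pending copies on the barrier with __cp_async_arrive, and then waits on the barrier before reading the data. This is an illustration, not part of the constant: the kernel name, tile size, launch configuration, and the uint32/uint64/int32 typedefs are assumptions, and it presumes the COPY_ASYNC source above is pasted into the kernel source ahead of the kernel. cp.async requires compute capability 8.0 (sm_80) or newer.

// Illustrative sketch only. Assumes the COPY_ASYNC wrappers above precede this
// kernel in the same translation unit, together with typedefs they rely on, e.g.:
//   typedef unsigned int       uint32;
//   typedef unsigned long long uint64;
//   typedef int                int32;
// Compile for sm_80 or newer.
#include <cuda/barrier>

// Copies a 256-float tile (launched with 64 threads, 4 floats = 16 bytes per thread)
// from global to shared memory asynchronously, then writes doubled values to `out`.
extern "C" __global__ void copy_tile_demo(float const *in, float *out) {
  __shared__ __align__(16) float tile[256];
  __shared__ ::cuda::barrier<::cuda::thread_scope_block> bar;

  if (threadIdx.x == 0) {
    init(&bar, blockDim.x); // one expected arrival per thread
  }
  __syncthreads();

  // Issue one 16-byte asynchronous global -> shared copy per thread.
  int idx = threadIdx.x * 4;
  __cp_async_shared_global<16>(in + idx, tile + idx);

  // Have the barrier track this thread's pending cp.async operations, then
  // arrive and wait until all threads (and their copies) have completed.
  __cp_async_arrive(bar);
  bar.arrive_and_wait();

  // Shared memory is now populated; use it.
  for (int i = 0; i < 4; ++i) {
    out[idx + i] = 2.0f * tile[idx + i];
  }
}

The full-copy specialization is used here because the source and destination are 16-byte aligned and a whole 16 bytes is copied; the partial-copy overloads taking src_size cover tails where fewer valid bytes are read while still zero-filling the full cp-size in shared memory.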