TMA_LOAD_IM2COL

Constant `TMA_LOAD_IM2COL`: a `&str` holding CUDA C++ source for TMA im2col bulk-load device helpers.

Source
/// CUDA C++ source for Tensor Memory Accelerator (TMA) im2col bulk-copy
/// helpers, embedded as a string (for splicing into generated kernel code).
///
/// The string defines three `inline __device__` functions —
/// `tma_load_im2col_3d`, `tma_load_im2col_4d`, and `tma_load_im2col_5d` —
/// each issuing a `cp.async.bulk.tensor.{3,4,5}d.shared::cluster.global
/// .im2col.mbarrier::complete_tx::bytes` PTX instruction via inline asm.
/// Each copies one tile described by a TMA descriptor from global memory
/// into `shared::cluster` memory, signalling completion through the caller's
/// `::cuda::barrier<::cuda::thread_scope_block>` (converted to a raw
/// mbarrier with `::cuda::device::barrier_native_handle`). Generic pointers
/// are lowered to 32-bit shared-space addresses with
/// `__cvta_generic_to_shared` before being handed to the asm.
///
/// Parameters shared by all three functions: the TMA descriptor pointer
/// (`desc_ptr`, bound with the `"l"` 64-bit constraint), the block-scope
/// barrier, the destination shared-memory pointer, per-dimension signed
/// coordinates (`coord_c`, `coord_w`, `coord_h`, `coord_d`, `coord_n`, as
/// applicable; `"r"` constraint), and 16-bit im2col offsets (`offset_w`,
/// `offset_h`, `offset_d`; `"h"` constraint).
///
/// NOTE(review): `cp.async.bulk.tensor` is a TMA instruction — presumably
/// this requires Hopper-class hardware (sm_90+); confirm against the
/// target architecture of the generated kernels.
/// NOTE(review): the width-suffixed type names (`int32`, `uint32`,
/// `uint64`, `uint16`) are not standard CUDA/C++ spellings — assumed to be
/// typedefs emitted elsewhere in the generated translation unit; verify.
pub const TMA_LOAD_IM2COL: &str = "inline __device__ void\ntma_load_im2col_3d(void const *desc_ptr,\n                   ::cuda::barrier<::cuda::thread_scope_block> &__bar,\n                   void *smem_ptr, int32 const &coord_c, int32 const &coord_w,\n                   int32 const &coord_n, uint16 const &offset_w) {\n  uint64 *mbar_ptr = ::cuda::device::barrier_native_handle(__bar);\n\n  uint64 gmem_int_desc = reinterpret_cast<uint64>(desc_ptr);\n  uint32 smem_int_mbar =\n      static_cast<uint32>(__cvta_generic_to_shared(mbar_ptr));\n  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));\n  // Copy from global to shared::cluster.\n  asm volatile(\"cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier:\"\n               \":complete_tx::bytes\"\n               \" [%0], [%1, {%3, %4, %5}], [%2], {%6};\"\n               :\n               : \"r\"(smem_int_ptr), \"l\"(gmem_int_desc), \"r\"(smem_int_mbar),\n                 \"r\"(coord_c), \"r\"(coord_w), \"r\"(coord_n), \"h\"(offset_w)\n               : \"memory\");\n}\n\ninline __device__ void\ntma_load_im2col_4d(void const *desc_ptr,\n                   ::cuda::barrier<::cuda::thread_scope_block> &__bar,\n                   void *smem_ptr, int32 const &coord_c, int32 const &coord_w,\n                   int32 const &coord_h, int32 const &coord_n,\n                   uint16 const &offset_w, uint16 const &offset_h) {\n  uint64 *mbar_ptr = ::cuda::device::barrier_native_handle(__bar);\n\n  uint64 gmem_int_desc = reinterpret_cast<uint64>(desc_ptr);\n  uint32 smem_int_mbar =\n      static_cast<uint32>(__cvta_generic_to_shared(mbar_ptr));\n  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));\n\n  // Copy from global to shared::cluster.\n  asm volatile(\"cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier:\"\n               \":complete_tx::bytes\"\n               \" [%0], [%1, {%3, %4, %5, %6}], [%2], {%7, %8};\"\n               :\n               : 
\"r\"(smem_int_ptr), \"l\"(gmem_int_desc), \"r\"(smem_int_mbar),\n                 \"r\"(coord_c), \"r\"(coord_w), \"r\"(coord_h), \"r\"(coord_n),\n                 \"h\"(offset_w), \"h\"(offset_h)\n               : \"memory\");\n}\n\ninline __device__ void tma_load_im2col_5d(\n    void const *desc_ptr, ::cuda::barrier<::cuda::thread_scope_block> &__bar,\n    void *smem_ptr, int32 const &coord_c, int32 const &coord_w,\n    int32 const &coord_h, int32 const &coord_d, int32 const &coord_n,\n    uint16 const &offset_w, uint16 const &offset_h, uint16 const &offset_d) {\n  uint64 *mbar_ptr = ::cuda::device::barrier_native_handle(__bar);\n\n  uint64 gmem_int_desc = reinterpret_cast<uint64>(desc_ptr);\n  uint32 smem_int_mbar =\n      static_cast<uint32>(__cvta_generic_to_shared(mbar_ptr));\n  uint32 smem_int_ptr = static_cast<uint32>(__cvta_generic_to_shared(smem_ptr));\n  // Copy from global to shared::cluster.\n  asm volatile(\"cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier:\"\n               \":complete_tx::bytes\"\n               \" [%0], [%1, {%3, %4, %5, %6, %7}], [%2], {%8, %9, %10};\"\n               :\n               : \"r\"(smem_int_ptr), \"l\"(gmem_int_desc), \"r\"(smem_int_mbar),\n                 \"r\"(coord_c), \"r\"(coord_w), \"r\"(coord_h), \"r\"(coord_d),\n                 \"r\"(coord_n), \"h\"(offset_w), \"h\"(offset_h), \"h\"(offset_d)\n               : \"memory\");\n}";