//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-29920130
// Cuda compilation tools, release 11.3, V11.3.109
// Based on NVVM 7.0.1
//
.version 7.3
.target sm_86
.address_size 64
// .globl matrixMul_bs16_64bit
// _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As has been demoted
// _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs has been demoted
// _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As has been demoted
// _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs has been demoted
// Kernel matrixMul_bs16_64bit: each thread accumulates one f64 dot product
// from 16x16 tiles staged in shared memory and stores it to param_0.
// All element offsets are computed in 64-bit arithmetic.
// The mangled names of the shared arrays below decode to
//   matrixMul<16, unsigned long>(double*, double*, double*,
//                                unsigned long, unsigned long)::As / ::Bs
//
// Parameters (roles as used by the code below):
//   param_0  generic pointer; one f64 result is stored per thread
//   param_1  generic pointer; f64 input staged into the As tile
//   param_2  generic pointer; f64 input staged into the Bs tile
//   param_3  u64 row stride, in elements, used to index param_1
//            (presumably the width of the first matrix; confirm with host code)
//   param_4  u64 row stride, in elements, used to index param_2 and param_0
//            (presumably the width of the second matrix; confirm with host code)
//
// NOTE(review): no load or store below is predicated on a bounds check, and
// the shared-memory addressing uses tid.x / tid.y directly. This presumably
// relies on a 16x16 thread block and dimensions that are multiples of 16;
// confirm against the launch code.
.visible .entry matrixMul_bs16_64bit(
.param .u64 matrixMul_bs16_64bit_param_0,
.param .u64 matrixMul_bs16_64bit_param_1,
.param .u64 matrixMul_bs16_64bit_param_2,
.param .u64 matrixMul_bs16_64bit_param_3,
.param .u64 matrixMul_bs16_64bit_param_4
)
{
// Virtual register declarations.
.reg .pred %p<3>;
.reg .b32 %r<16>;
.reg .f64 %fd<57>;
.reg .b64 %rd<41>;
// demoted variable
// As tile: 2048 bytes = 16 rows x 16 f64.
.shared .align 8 .b8 _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As[2048];
// demoted variable
// Bs tile: 2048 bytes = 16 rows x 16 f64.
.shared .align 8 .b8 _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs[2048];
// Load the five kernel parameters.
ld.param.u64 %rd18, [matrixMul_bs16_64bit_param_0];
ld.param.u64 %rd19, [matrixMul_bs16_64bit_param_1];
ld.param.u64 %rd20, [matrixMul_bs16_64bit_param_2];
ld.param.u64 %rd21, [matrixMul_bs16_64bit_param_3];
ld.param.u64 %rd22, [matrixMul_bs16_64bit_param_4];
// Block and thread indices:
//   %r5 = ctaid.x, %rd1 = ctaid.y, %rd2 = tid.x, %rd3 = tid.y (zero-extended).
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ctaid.y;
cvt.u64.u32 %rd1, %r6;
mov.u32 %r7, %tid.x;
cvt.u64.u32 %rd2, %r7;
mov.u32 %r8, %tid.y;
cvt.u64.u32 %rd3, %r8;
// %rd40 = 16 * param_3 * ctaid.y: element offset of the first param_1 tile
// for this block row. It is also the loop counter.
mul.lo.s64 %rd23, %rd21, %rd1;
shl.b64 %rd40, %rd23, 4;
// %rd5 = %rd40 + param_3 - 1: largest offset at which the loop still runs.
add.s64 %rd24, %rd21, -1;
add.s64 %rd5, %rd24, %rd40;
// %p1 is set only when the unsigned addition above wrapped around
// (%rd5 < %rd24); in that case the tile loop is skipped entirely.
setp.lt.u64 %p1, %rd5, %rd24;
// %rd6 = 16 * ctaid.x: column offset of this block's tile.
mul.wide.u32 %rd6, %r5, 16;
// %rd7 = 16 * param_4: element distance between consecutive 16-row bands.
shl.b64 %rd7, %rd22, 4;
// %rd8 = tid.y * param_4: this thread's row offset for param_2 / param_0.
mul.lo.s64 %rd8, %rd3, %rd22;
// Accumulator %fd56 starts at 0.0.
mov.f64 %fd56, 0d0000000000000000;
@%p1 bra $L__BB0_3;
// ---- Loop preheader: values that do not change across iterations ----
cvt.u32.u64 %r9, %rd3;
// %rd9 = tid.y * param_3 + tid.x: per-thread offset inside a param_1 tile.
mul.lo.s64 %rd25, %rd3, %rd21;
add.s64 %rd9, %rd25, %rd2;
cvt.u32.u64 %r10, %rd2;
// %r11 = tid.y * 128: byte offset of shared row tid.y (16 f64 per row).
shl.b32 %r11, %r9, 7;
mov.u32 %r12, _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As;
// %r3 = address of As[tid.y][0] (start of the row this thread reads).
add.s32 %r3, %r12, %r11;
// %r13 = tid.x * 8: byte offset of column tid.x.
shl.b32 %r13, %r10, 3;
// %r1 = address of As[tid.y][tid.x] (slot this thread fills).
add.s32 %r1, %r3, %r13;
// %rd10 = tid.y * param_4 + tid.x: per-thread offset inside a param_2 tile.
add.s64 %rd10, %rd8, %rd2;
mov.u32 %r14, _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs;
add.s32 %r15, %r14, %r11;
// %r2 = address of Bs[tid.y][tid.x] (slot this thread fills).
add.s32 %r2, %r15, %r13;
// %r4 = address of Bs[0][tid.x] (top of the column this thread reads).
add.s32 %r4, %r14, %r13;
// Convert the two generic input pointers to global-space addresses.
cvta.to.global.u64 %rd12, %rd19;
cvta.to.global.u64 %rd13, %rd20;
// Repeats the zero initialisation of %fd56 done before the branch.
mov.f64 %fd56, 0d0000000000000000;
// %rd39 = running element offset into param_2, starting at 16 * ctaid.x.
mov.u64 %rd39, %rd6;
// ---- Tile loop: one 16x16 tile of each input per iteration ----
$L__BB0_2:
// As[tid.y][tid.x] = param_1[%rd40 + tid.y * param_3 + tid.x]
add.s64 %rd26, %rd9, %rd40;
shl.b64 %rd27, %rd26, 3;
add.s64 %rd28, %rd12, %rd27;
ld.global.f64 %fd6, [%rd28];
st.shared.f64 [%r1], %fd6;
// Bs[tid.y][tid.x] = param_2[%rd39 + tid.y * param_4 + tid.x]
add.s64 %rd29, %rd10, %rd39;
shl.b64 %rd30, %rd29, 3;
add.s64 %rd31, %rd13, %rd30;
ld.global.f64 %fd7, [%rd31];
st.shared.f64 [%r2], %fd7;
// Barrier: both tiles must be fully written before any thread reads them.
bar.sync 0;
// Fully unrolled inner product, k = 0..15:
//   acc = fma(As[tid.y][k], Bs[k][tid.x], acc)
// The As address advances 8 bytes per step (next column of the row);
// the Bs address advances 128 bytes per step (next row of the column).
// The chain is strictly sequential, so the rounding order is fixed.
ld.shared.f64 %fd8, [%r4];
ld.shared.f64 %fd9, [%r3];
fma.rn.f64 %fd10, %fd9, %fd8, %fd56;
ld.shared.f64 %fd11, [%r4+128];
ld.shared.f64 %fd12, [%r3+8];
fma.rn.f64 %fd13, %fd12, %fd11, %fd10;
ld.shared.f64 %fd14, [%r4+256];
ld.shared.f64 %fd15, [%r3+16];
fma.rn.f64 %fd16, %fd15, %fd14, %fd13;
ld.shared.f64 %fd17, [%r4+384];
ld.shared.f64 %fd18, [%r3+24];
fma.rn.f64 %fd19, %fd18, %fd17, %fd16;
ld.shared.f64 %fd20, [%r4+512];
ld.shared.f64 %fd21, [%r3+32];
fma.rn.f64 %fd22, %fd21, %fd20, %fd19;
ld.shared.f64 %fd23, [%r4+640];
ld.shared.f64 %fd24, [%r3+40];
fma.rn.f64 %fd25, %fd24, %fd23, %fd22;
ld.shared.f64 %fd26, [%r4+768];
ld.shared.f64 %fd27, [%r3+48];
fma.rn.f64 %fd28, %fd27, %fd26, %fd25;
ld.shared.f64 %fd29, [%r4+896];
ld.shared.f64 %fd30, [%r3+56];
fma.rn.f64 %fd31, %fd30, %fd29, %fd28;
ld.shared.f64 %fd32, [%r4+1024];
ld.shared.f64 %fd33, [%r3+64];
fma.rn.f64 %fd34, %fd33, %fd32, %fd31;
ld.shared.f64 %fd35, [%r4+1152];
ld.shared.f64 %fd36, [%r3+72];
fma.rn.f64 %fd37, %fd36, %fd35, %fd34;
ld.shared.f64 %fd38, [%r4+1280];
ld.shared.f64 %fd39, [%r3+80];
fma.rn.f64 %fd40, %fd39, %fd38, %fd37;
ld.shared.f64 %fd41, [%r4+1408];
ld.shared.f64 %fd42, [%r3+88];
fma.rn.f64 %fd43, %fd42, %fd41, %fd40;
ld.shared.f64 %fd44, [%r4+1536];
ld.shared.f64 %fd45, [%r3+96];
fma.rn.f64 %fd46, %fd45, %fd44, %fd43;
ld.shared.f64 %fd47, [%r4+1664];
ld.shared.f64 %fd48, [%r3+104];
fma.rn.f64 %fd49, %fd48, %fd47, %fd46;
ld.shared.f64 %fd50, [%r4+1792];
ld.shared.f64 %fd51, [%r3+112];
fma.rn.f64 %fd52, %fd51, %fd50, %fd49;
ld.shared.f64 %fd53, [%r4+1920];
ld.shared.f64 %fd54, [%r3+120];
// Last step writes back into the loop-carried accumulator %fd56.
fma.rn.f64 %fd56, %fd54, %fd53, %fd52;
// Barrier: all reads of the tiles must finish before the next iteration
// overwrites them.
bar.sync 0;
// Advance param_2 offset by 16 rows and param_1 offset by 16 columns.
add.s64 %rd39, %rd39, %rd7;
add.s64 %rd40, %rd40, 16;
// Continue while the param_1 offset has not passed %rd5 (unsigned compare).
setp.le.u64 %p2, %rd40, %rd5;
@%p2 bra $L__BB0_2;
// ---- Epilogue: store this thread's result ----
$L__BB0_3:
// Output element index =
//   16 * ctaid.x + tid.x + tid.y * param_4 + 16 * param_4 * ctaid.y
add.s64 %rd32, %rd6, %rd2;
add.s64 %rd33, %rd32, %rd8;
mul.lo.s64 %rd34, %rd7, %rd1;
add.s64 %rd35, %rd33, %rd34;
cvta.to.global.u64 %rd36, %rd18;
// Scale the element index by 8 bytes per f64.
shl.b64 %rd37, %rd35, 3;
add.s64 %rd38, %rd36, %rd37;
st.global.f64 [%rd38], %fd56;
ret;
}
// .globl matrixMul_bs32_64bit
// Kernel matrixMul_bs32_64bit: each thread accumulates one f64 dot product
// from 32x32 tiles staged in shared memory and stores it to param_0.
// All element offsets are computed in 64-bit arithmetic.
// The mangled names of the shared arrays below decode to
//   matrixMul<32, unsigned long>(double*, double*, double*,
//                                unsigned long, unsigned long)::As / ::Bs
//
// Parameters (roles as used by the code below):
//   param_0  generic pointer; one f64 result is stored per thread
//   param_1  generic pointer; f64 input staged into the As tile
//   param_2  generic pointer; f64 input staged into the Bs tile
//   param_3  u64 row stride, in elements, used to index param_1
//            (presumably the width of the first matrix; confirm with host code)
//   param_4  u64 row stride, in elements, used to index param_2 and param_0
//            (presumably the width of the second matrix; confirm with host code)
//
// NOTE(review): no load or store below is predicated on a bounds check, and
// the shared-memory addressing uses tid.x / tid.y directly. This presumably
// relies on a 32x32 thread block and dimensions that are multiples of 32;
// confirm against the launch code.
.visible .entry matrixMul_bs32_64bit(
.param .u64 matrixMul_bs32_64bit_param_0,
.param .u64 matrixMul_bs32_64bit_param_1,
.param .u64 matrixMul_bs32_64bit_param_2,
.param .u64 matrixMul_bs32_64bit_param_3,
.param .u64 matrixMul_bs32_64bit_param_4
)
{
// Virtual register declarations.
.reg .pred %p<3>;
.reg .b32 %r<16>;
.reg .f64 %fd<105>;
.reg .b64 %rd<41>;
// demoted variable
// As tile: 8192 bytes = 32 rows x 32 f64.
.shared .align 8 .b8 _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As[8192];
// demoted variable
// Bs tile: 8192 bytes = 32 rows x 32 f64.
.shared .align 8 .b8 _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs[8192];
// Load the five kernel parameters.
ld.param.u64 %rd18, [matrixMul_bs32_64bit_param_0];
ld.param.u64 %rd19, [matrixMul_bs32_64bit_param_1];
ld.param.u64 %rd20, [matrixMul_bs32_64bit_param_2];
ld.param.u64 %rd21, [matrixMul_bs32_64bit_param_3];
ld.param.u64 %rd22, [matrixMul_bs32_64bit_param_4];
// Block and thread indices:
//   %r5 = ctaid.x, %rd1 = ctaid.y, %rd2 = tid.x, %rd3 = tid.y (zero-extended).
mov.u32 %r5, %ctaid.x;
mov.u32 %r6, %ctaid.y;
cvt.u64.u32 %rd1, %r6;
mov.u32 %r7, %tid.x;
cvt.u64.u32 %rd2, %r7;
mov.u32 %r8, %tid.y;
cvt.u64.u32 %rd3, %r8;
// %rd40 = 32 * param_3 * ctaid.y: element offset of the first param_1 tile
// for this block row. It is also the loop counter.
mul.lo.s64 %rd23, %rd21, %rd1;
shl.b64 %rd40, %rd23, 5;
// %rd5 = %rd40 + param_3 - 1: largest offset at which the loop still runs.
add.s64 %rd24, %rd21, -1;
add.s64 %rd5, %rd24, %rd40;
// %p1 is set only when the unsigned addition above wrapped around
// (%rd5 < %rd24); in that case the tile loop is skipped entirely.
setp.lt.u64 %p1, %rd5, %rd24;
// %rd6 = 32 * ctaid.x: column offset of this block's tile.
mul.wide.u32 %rd6, %r5, 32;
// %rd7 = 32 * param_4: element distance between consecutive 32-row bands.
shl.b64 %rd7, %rd22, 5;
// %rd8 = tid.y * param_4: this thread's row offset for param_2 / param_0.
mul.lo.s64 %rd8, %rd3, %rd22;
// Accumulator %fd104 starts at 0.0.
mov.f64 %fd104, 0d0000000000000000;
@%p1 bra $L__BB1_3;
// ---- Loop preheader: values that do not change across iterations ----
cvt.u32.u64 %r9, %rd3;
// %rd9 = tid.y * param_3 + tid.x: per-thread offset inside a param_1 tile.
mul.lo.s64 %rd25, %rd3, %rd21;
add.s64 %rd9, %rd25, %rd2;
cvt.u32.u64 %r10, %rd2;
// %r11 = tid.y * 256: byte offset of shared row tid.y (32 f64 per row).
shl.b32 %r11, %r9, 8;
mov.u32 %r12, _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As;
// %r3 = address of As[tid.y][0] (start of the row this thread reads).
add.s32 %r3, %r12, %r11;
// %r13 = tid.x * 8: byte offset of column tid.x.
shl.b32 %r13, %r10, 3;
// %r1 = address of As[tid.y][tid.x] (slot this thread fills).
add.s32 %r1, %r3, %r13;
// %rd10 = tid.y * param_4 + tid.x: per-thread offset inside a param_2 tile.
add.s64 %rd10, %rd8, %rd2;
mov.u32 %r14, _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs;
add.s32 %r15, %r14, %r11;
// %r2 = address of Bs[tid.y][tid.x] (slot this thread fills).
add.s32 %r2, %r15, %r13;
// %r4 = address of Bs[0][tid.x] (top of the column this thread reads).
add.s32 %r4, %r14, %r13;
// Convert the two generic input pointers to global-space addresses.
cvta.to.global.u64 %rd12, %rd19;
cvta.to.global.u64 %rd13, %rd20;
// Repeats the zero initialisation of %fd104 done before the branch.
mov.f64 %fd104, 0d0000000000000000;
// %rd39 = running element offset into param_2, starting at 32 * ctaid.x.
mov.u64 %rd39, %rd6;
// ---- Tile loop: one 32x32 tile of each input per iteration ----
$L__BB1_2:
// As[tid.y][tid.x] = param_1[%rd40 + tid.y * param_3 + tid.x]
add.s64 %rd26, %rd9, %rd40;
shl.b64 %rd27, %rd26, 3;
add.s64 %rd28, %rd12, %rd27;
ld.global.f64 %fd6, [%rd28];
st.shared.f64 [%r1], %fd6;
// Bs[tid.y][tid.x] = param_2[%rd39 + tid.y * param_4 + tid.x]
add.s64 %rd29, %rd10, %rd39;
shl.b64 %rd30, %rd29, 3;
add.s64 %rd31, %rd13, %rd30;
ld.global.f64 %fd7, [%rd31];
st.shared.f64 [%r2], %fd7;
// Barrier: both tiles must be fully written before any thread reads them.
bar.sync 0;
// Fully unrolled inner product, k = 0..31:
//   acc = fma(As[tid.y][k], Bs[k][tid.x], acc)
// The As address advances 8 bytes per step (next column of the row);
// the Bs address advances 256 bytes per step (next row of the column).
// The chain is strictly sequential, so the rounding order is fixed.
ld.shared.f64 %fd8, [%r4];
ld.shared.f64 %fd9, [%r3];
fma.rn.f64 %fd10, %fd9, %fd8, %fd104;
ld.shared.f64 %fd11, [%r4+256];
ld.shared.f64 %fd12, [%r3+8];
fma.rn.f64 %fd13, %fd12, %fd11, %fd10;
ld.shared.f64 %fd14, [%r4+512];
ld.shared.f64 %fd15, [%r3+16];
fma.rn.f64 %fd16, %fd15, %fd14, %fd13;
ld.shared.f64 %fd17, [%r4+768];
ld.shared.f64 %fd18, [%r3+24];
fma.rn.f64 %fd19, %fd18, %fd17, %fd16;
ld.shared.f64 %fd20, [%r4+1024];
ld.shared.f64 %fd21, [%r3+32];
fma.rn.f64 %fd22, %fd21, %fd20, %fd19;
ld.shared.f64 %fd23, [%r4+1280];
ld.shared.f64 %fd24, [%r3+40];
fma.rn.f64 %fd25, %fd24, %fd23, %fd22;
ld.shared.f64 %fd26, [%r4+1536];
ld.shared.f64 %fd27, [%r3+48];
fma.rn.f64 %fd28, %fd27, %fd26, %fd25;
ld.shared.f64 %fd29, [%r4+1792];
ld.shared.f64 %fd30, [%r3+56];
fma.rn.f64 %fd31, %fd30, %fd29, %fd28;
ld.shared.f64 %fd32, [%r4+2048];
ld.shared.f64 %fd33, [%r3+64];
fma.rn.f64 %fd34, %fd33, %fd32, %fd31;
ld.shared.f64 %fd35, [%r4+2304];
ld.shared.f64 %fd36, [%r3+72];
fma.rn.f64 %fd37, %fd36, %fd35, %fd34;
ld.shared.f64 %fd38, [%r4+2560];
ld.shared.f64 %fd39, [%r3+80];
fma.rn.f64 %fd40, %fd39, %fd38, %fd37;
ld.shared.f64 %fd41, [%r4+2816];
ld.shared.f64 %fd42, [%r3+88];
fma.rn.f64 %fd43, %fd42, %fd41, %fd40;
ld.shared.f64 %fd44, [%r4+3072];
ld.shared.f64 %fd45, [%r3+96];
fma.rn.f64 %fd46, %fd45, %fd44, %fd43;
ld.shared.f64 %fd47, [%r4+3328];
ld.shared.f64 %fd48, [%r3+104];
fma.rn.f64 %fd49, %fd48, %fd47, %fd46;
ld.shared.f64 %fd50, [%r4+3584];
ld.shared.f64 %fd51, [%r3+112];
fma.rn.f64 %fd52, %fd51, %fd50, %fd49;
ld.shared.f64 %fd53, [%r4+3840];
ld.shared.f64 %fd54, [%r3+120];
fma.rn.f64 %fd55, %fd54, %fd53, %fd52;
ld.shared.f64 %fd56, [%r4+4096];
ld.shared.f64 %fd57, [%r3+128];
fma.rn.f64 %fd58, %fd57, %fd56, %fd55;
ld.shared.f64 %fd59, [%r4+4352];
ld.shared.f64 %fd60, [%r3+136];
fma.rn.f64 %fd61, %fd60, %fd59, %fd58;
ld.shared.f64 %fd62, [%r4+4608];
ld.shared.f64 %fd63, [%r3+144];
fma.rn.f64 %fd64, %fd63, %fd62, %fd61;
ld.shared.f64 %fd65, [%r4+4864];
ld.shared.f64 %fd66, [%r3+152];
fma.rn.f64 %fd67, %fd66, %fd65, %fd64;
ld.shared.f64 %fd68, [%r4+5120];
ld.shared.f64 %fd69, [%r3+160];
fma.rn.f64 %fd70, %fd69, %fd68, %fd67;
ld.shared.f64 %fd71, [%r4+5376];
ld.shared.f64 %fd72, [%r3+168];
fma.rn.f64 %fd73, %fd72, %fd71, %fd70;
ld.shared.f64 %fd74, [%r4+5632];
ld.shared.f64 %fd75, [%r3+176];
fma.rn.f64 %fd76, %fd75, %fd74, %fd73;
ld.shared.f64 %fd77, [%r4+5888];
ld.shared.f64 %fd78, [%r3+184];
fma.rn.f64 %fd79, %fd78, %fd77, %fd76;
ld.shared.f64 %fd80, [%r4+6144];
ld.shared.f64 %fd81, [%r3+192];
fma.rn.f64 %fd82, %fd81, %fd80, %fd79;
ld.shared.f64 %fd83, [%r4+6400];
ld.shared.f64 %fd84, [%r3+200];
fma.rn.f64 %fd85, %fd84, %fd83, %fd82;
ld.shared.f64 %fd86, [%r4+6656];
ld.shared.f64 %fd87, [%r3+208];
fma.rn.f64 %fd88, %fd87, %fd86, %fd85;
ld.shared.f64 %fd89, [%r4+6912];
ld.shared.f64 %fd90, [%r3+216];
fma.rn.f64 %fd91, %fd90, %fd89, %fd88;
ld.shared.f64 %fd92, [%r4+7168];
ld.shared.f64 %fd93, [%r3+224];
fma.rn.f64 %fd94, %fd93, %fd92, %fd91;
ld.shared.f64 %fd95, [%r4+7424];
ld.shared.f64 %fd96, [%r3+232];
fma.rn.f64 %fd97, %fd96, %fd95, %fd94;
ld.shared.f64 %fd98, [%r4+7680];
ld.shared.f64 %fd99, [%r3+240];
fma.rn.f64 %fd100, %fd99, %fd98, %fd97;
ld.shared.f64 %fd101, [%r4+7936];
ld.shared.f64 %fd102, [%r3+248];
// Last step writes back into the loop-carried accumulator %fd104.
fma.rn.f64 %fd104, %fd102, %fd101, %fd100;
// Barrier: all reads of the tiles must finish before the next iteration
// overwrites them.
bar.sync 0;
// Advance param_2 offset by 32 rows and param_1 offset by 32 columns.
add.s64 %rd39, %rd39, %rd7;
add.s64 %rd40, %rd40, 32;
// Continue while the param_1 offset has not passed %rd5 (unsigned compare).
setp.le.u64 %p2, %rd40, %rd5;
@%p2 bra $L__BB1_2;
// ---- Epilogue: store this thread's result ----
$L__BB1_3:
// Output element index =
//   32 * ctaid.x + tid.x + tid.y * param_4 + 32 * param_4 * ctaid.y
add.s64 %rd32, %rd6, %rd2;
add.s64 %rd33, %rd32, %rd8;
mul.lo.s64 %rd34, %rd7, %rd1;
add.s64 %rd35, %rd33, %rd34;
cvta.to.global.u64 %rd36, %rd18;
// Scale the element index by 8 bytes per f64.
shl.b64 %rd37, %rd35, 3;
add.s64 %rd38, %rd36, %rd37;
st.global.f64 [%rd38], %fd104;
ret;
}