cuda-oxide 0.4.0

cuda-oxide provides a high-level, rusty wrapper over CUDA. It aims to offer as much safety as is practical when working directly with GPU hardware.
Documentation
//
// Generated by NVIDIA NVVM Compiler
//
// Compiler Build ID: CL-29920130
// Cuda compilation tools, release 11.3, V11.3.109
// Based on NVVM 7.0.1
//

.version 7.3
.target sm_86
.address_size 64

	// .globl	matrixMul_bs16_64bit
// _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As has been demoted
// _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs has been demoted
// _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As has been demoted
// _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs has been demoted

// Tiled double-precision matrix multiply, BLOCK_SIZE = 16 (one 16x16 output
// tile of C per CTA, one element per thread).
// The mangled shared-array symbols below demangle to the instantiation
// matrixMul<16, unsigned long>(double*, double*, double*, ...), i.e. the
// classic shared-memory-tiled matrixMul kernel template.
// Parameter roles as used in this body:
//   param_0 = C   (output; written exactly once per thread at the end)
//   param_1 = A   (streamed into the As tile)
//   param_2 = B   (streamed into the Bs tile)
//   param_3 = wA  (width / row stride of A, in elements)
//   param_4 = wB  (width / row stride of B and C, in elements)
// NOTE(review): roles inferred from the loads/stores below — confirm against
// the launching host code.
.visible .entry matrixMul_bs16_64bit(
	.param .u64 matrixMul_bs16_64bit_param_0,
	.param .u64 matrixMul_bs16_64bit_param_1,
	.param .u64 matrixMul_bs16_64bit_param_2,
	.param .u64 matrixMul_bs16_64bit_param_3,
	.param .u64 matrixMul_bs16_64bit_param_4
)
{
	.reg .pred 	%p<3>;		// %p1 = zero-trip guard, %p2 = loop back-edge
	.reg .b32 	%r<16>;
	.reg .f64 	%fd<57>;
	.reg .b64 	%rd<41>;
	// demoted variable — 16x16 f64 tile of A: 16*16*8 = 2048 bytes
	.shared .align 8 .b8 _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As[2048];
	// demoted variable — 16x16 f64 tile of B: 16*16*8 = 2048 bytes
	.shared .align 8 .b8 _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs[2048];

	ld.param.u64 	%rd18, [matrixMul_bs16_64bit_param_0];	// C base
	ld.param.u64 	%rd19, [matrixMul_bs16_64bit_param_1];	// A base
	ld.param.u64 	%rd20, [matrixMul_bs16_64bit_param_2];	// B base
	ld.param.u64 	%rd21, [matrixMul_bs16_64bit_param_3];	// wA
	ld.param.u64 	%rd22, [matrixMul_bs16_64bit_param_4];	// wB
	mov.u32 	%r5, %ctaid.x;		// bx
	mov.u32 	%r6, %ctaid.y;		// by
	cvt.u64.u32 	%rd1, %r6;
	mov.u32 	%r7, %tid.x;		// tx
	cvt.u64.u32 	%rd2, %r7;
	mov.u32 	%r8, %tid.y;		// ty
	cvt.u64.u32 	%rd3, %r8;
	mul.lo.s64 	%rd23, %rd21, %rd1;
	shl.b64 	%rd40, %rd23, 4;	// aBegin = wA * 16 * by (first A element index for this CTA row)
	add.s64 	%rd24, %rd21, -1;	// wA - 1
	add.s64 	%rd5, %rd24, %rd40;	// aEnd = aBegin + wA - 1
	setp.lt.u64 	%p1, %rd5, %rd24;	// guard: skip tile loop if aEnd wrapped (unsigned) below wA-1
	mul.wide.u32 	%rd6, %r5, 16;		// bBegin = 16 * bx
	shl.b64 	%rd7, %rd22, 4;		// bStep = 16 * wB (B advances one tile-row per iteration)
	mul.lo.s64 	%rd8, %rd3, %rd22;	// ty * wB (row offset inside the B/C tile)
	mov.f64 	%fd56, 0d0000000000000000;	// Csub = 0.0
	@%p1 bra 	$L__BB0_3;

	// Precompute the per-thread global indices and shared-memory byte addresses
	// that stay invariant across the tile loop.
	cvt.u32.u64 	%r9, %rd3;
	mul.lo.s64 	%rd25, %rd3, %rd21;
	add.s64 	%rd9, %rd25, %rd2;	// ty * wA + tx (this thread's slot inside the A tile)
	cvt.u32.u64 	%r10, %rd2;
	shl.b32 	%r11, %r9, 7;		// ty * 128 B (one shared row = 16 doubles)
	mov.u32 	%r12, _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2As;
	add.s32 	%r3, %r12, %r11;	// &As[ty][0]
	shl.b32 	%r13, %r10, 3;		// tx * 8 B
	add.s32 	%r1, %r3, %r13;		// &As[ty][tx]
	add.s64 	%rd10, %rd8, %rd2;	// ty * wB + tx (this thread's slot inside the B tile)
	mov.u32 	%r14, _ZZ9matrixMulILi16EmEvPdS0_S0_T0_S1_E2Bs;
	add.s32 	%r15, %r14, %r11;
	add.s32 	%r2, %r15, %r13;	// &Bs[ty][tx]
	add.s32 	%r4, %r14, %r13;	// &Bs[0][tx] (column base walked by the inner product)
	cvta.to.global.u64 	%rd12, %rd19;	// A -> global address space
	cvta.to.global.u64 	%rd13, %rd20;	// B -> global address space
	mov.f64 	%fd56, 0d0000000000000000;	// Csub = 0.0 (loop-carried accumulator)
	mov.u64 	%rd39, %rd6;		// b = bBegin

$L__BB0_2:
	// Tile iteration: each thread stages one element of A and one of B into
	// shared memory, the CTA synchronizes, then each thread accumulates the
	// fully unrolled 16-term partial dot product.
	add.s64 	%rd26, %rd9, %rd40;	// a + ty*wA + tx
	shl.b64 	%rd27, %rd26, 3;	// element index -> byte offset (f64 = 8 B)
	add.s64 	%rd28, %rd12, %rd27;
	ld.global.f64 	%fd6, [%rd28];
	st.shared.f64 	[%r1], %fd6;		// As[ty][tx] = A[a + wA*ty + tx]
	add.s64 	%rd29, %rd10, %rd39;	// b + ty*wB + tx
	shl.b64 	%rd30, %rd29, 3;
	add.s64 	%rd31, %rd13, %rd30;
	ld.global.f64 	%fd7, [%rd31];
	st.shared.f64 	[%r2], %fd7;		// Bs[ty][tx] = B[b + wB*ty + tx]
	bar.sync 	0;			// both tiles fully populated before any reads
	// Csub += As[ty][k] * Bs[k][tx] for k = 0..15 (unrolled;
	// As stride = 8 B per k, Bs stride = 128 B per k).
	ld.shared.f64 	%fd8, [%r4];
	ld.shared.f64 	%fd9, [%r3];
	fma.rn.f64 	%fd10, %fd9, %fd8, %fd56;
	ld.shared.f64 	%fd11, [%r4+128];
	ld.shared.f64 	%fd12, [%r3+8];
	fma.rn.f64 	%fd13, %fd12, %fd11, %fd10;
	ld.shared.f64 	%fd14, [%r4+256];
	ld.shared.f64 	%fd15, [%r3+16];
	fma.rn.f64 	%fd16, %fd15, %fd14, %fd13;
	ld.shared.f64 	%fd17, [%r4+384];
	ld.shared.f64 	%fd18, [%r3+24];
	fma.rn.f64 	%fd19, %fd18, %fd17, %fd16;
	ld.shared.f64 	%fd20, [%r4+512];
	ld.shared.f64 	%fd21, [%r3+32];
	fma.rn.f64 	%fd22, %fd21, %fd20, %fd19;
	ld.shared.f64 	%fd23, [%r4+640];
	ld.shared.f64 	%fd24, [%r3+40];
	fma.rn.f64 	%fd25, %fd24, %fd23, %fd22;
	ld.shared.f64 	%fd26, [%r4+768];
	ld.shared.f64 	%fd27, [%r3+48];
	fma.rn.f64 	%fd28, %fd27, %fd26, %fd25;
	ld.shared.f64 	%fd29, [%r4+896];
	ld.shared.f64 	%fd30, [%r3+56];
	fma.rn.f64 	%fd31, %fd30, %fd29, %fd28;
	ld.shared.f64 	%fd32, [%r4+1024];
	ld.shared.f64 	%fd33, [%r3+64];
	fma.rn.f64 	%fd34, %fd33, %fd32, %fd31;
	ld.shared.f64 	%fd35, [%r4+1152];
	ld.shared.f64 	%fd36, [%r3+72];
	fma.rn.f64 	%fd37, %fd36, %fd35, %fd34;
	ld.shared.f64 	%fd38, [%r4+1280];
	ld.shared.f64 	%fd39, [%r3+80];
	fma.rn.f64 	%fd40, %fd39, %fd38, %fd37;
	ld.shared.f64 	%fd41, [%r4+1408];
	ld.shared.f64 	%fd42, [%r3+88];
	fma.rn.f64 	%fd43, %fd42, %fd41, %fd40;
	ld.shared.f64 	%fd44, [%r4+1536];
	ld.shared.f64 	%fd45, [%r3+96];
	fma.rn.f64 	%fd46, %fd45, %fd44, %fd43;
	ld.shared.f64 	%fd47, [%r4+1664];
	ld.shared.f64 	%fd48, [%r3+104];
	fma.rn.f64 	%fd49, %fd48, %fd47, %fd46;
	ld.shared.f64 	%fd50, [%r4+1792];
	ld.shared.f64 	%fd51, [%r3+112];
	fma.rn.f64 	%fd52, %fd51, %fd50, %fd49;
	ld.shared.f64 	%fd53, [%r4+1920];
	ld.shared.f64 	%fd54, [%r3+120];
	fma.rn.f64 	%fd56, %fd54, %fd53, %fd52;	// final FMA writes the loop-carried Csub
	bar.sync 	0;			// all reads finished before the next iteration overwrites the tiles
	add.s64 	%rd39, %rd39, %rd7;	// b += bStep (16 * wB)
	add.s64 	%rd40, %rd40, 16;	// a += aStep (16 elements)
	setp.le.u64 	%p2, %rd40, %rd5;
	@%p2 bra 	$L__BB0_2;		// loop while a <= aEnd

$L__BB0_3:
	// C[wB*16*by + 16*bx + wB*ty + tx] = Csub
	add.s64 	%rd32, %rd6, %rd2;	// 16*bx + tx
	add.s64 	%rd33, %rd32, %rd8;	// + wB*ty
	mul.lo.s64 	%rd34, %rd7, %rd1;	// + wB*16*by
	add.s64 	%rd35, %rd33, %rd34;
	cvta.to.global.u64 	%rd36, %rd18;	// C -> global address space
	shl.b64 	%rd37, %rd35, 3;	// element index -> byte offset
	add.s64 	%rd38, %rd36, %rd37;
	st.global.f64 	[%rd38], %fd56;
	ret;

}
	// .globl	matrixMul_bs32_64bit
// Tiled double-precision matrix multiply, BLOCK_SIZE = 32 (one 32x32 output
// tile of C per CTA, one element per thread). Identical structure to the
// BLOCK_SIZE=16 variant above, instantiated as matrixMul<32, unsigned long>
// per the mangled shared-array symbols.
// Parameter roles as used in this body (same as the 16-wide variant):
//   param_0 = C (output), param_1 = A, param_2 = B,
//   param_3 = wA, param_4 = wB.
// NOTE(review): roles inferred from the loads/stores below — confirm against
// the launching host code.
.visible .entry matrixMul_bs32_64bit(
	.param .u64 matrixMul_bs32_64bit_param_0,
	.param .u64 matrixMul_bs32_64bit_param_1,
	.param .u64 matrixMul_bs32_64bit_param_2,
	.param .u64 matrixMul_bs32_64bit_param_3,
	.param .u64 matrixMul_bs32_64bit_param_4
)
{
	.reg .pred 	%p<3>;		// %p1 = zero-trip guard, %p2 = loop back-edge
	.reg .b32 	%r<16>;
	.reg .f64 	%fd<105>;
	.reg .b64 	%rd<41>;
	// demoted variable — 32x32 f64 tile of A: 32*32*8 = 8192 bytes
	.shared .align 8 .b8 _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As[8192];
	// demoted variable — 32x32 f64 tile of B: 32*32*8 = 8192 bytes
	.shared .align 8 .b8 _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs[8192];

	ld.param.u64 	%rd18, [matrixMul_bs32_64bit_param_0];	// C base
	ld.param.u64 	%rd19, [matrixMul_bs32_64bit_param_1];	// A base
	ld.param.u64 	%rd20, [matrixMul_bs32_64bit_param_2];	// B base
	ld.param.u64 	%rd21, [matrixMul_bs32_64bit_param_3];	// wA
	ld.param.u64 	%rd22, [matrixMul_bs32_64bit_param_4];	// wB
	mov.u32 	%r5, %ctaid.x;		// bx
	mov.u32 	%r6, %ctaid.y;		// by
	cvt.u64.u32 	%rd1, %r6;
	mov.u32 	%r7, %tid.x;		// tx
	cvt.u64.u32 	%rd2, %r7;
	mov.u32 	%r8, %tid.y;		// ty
	cvt.u64.u32 	%rd3, %r8;
	mul.lo.s64 	%rd23, %rd21, %rd1;
	shl.b64 	%rd40, %rd23, 5;	// aBegin = wA * 32 * by
	add.s64 	%rd24, %rd21, -1;	// wA - 1
	add.s64 	%rd5, %rd24, %rd40;	// aEnd = aBegin + wA - 1
	setp.lt.u64 	%p1, %rd5, %rd24;	// guard: skip tile loop if aEnd wrapped (unsigned) below wA-1
	mul.wide.u32 	%rd6, %r5, 32;		// bBegin = 32 * bx
	shl.b64 	%rd7, %rd22, 5;		// bStep = 32 * wB
	mul.lo.s64 	%rd8, %rd3, %rd22;	// ty * wB
	mov.f64 	%fd104, 0d0000000000000000;	// Csub = 0.0
	@%p1 bra 	$L__BB1_3;

	// Precompute loop-invariant global indices and shared-memory addresses.
	cvt.u32.u64 	%r9, %rd3;
	mul.lo.s64 	%rd25, %rd3, %rd21;
	add.s64 	%rd9, %rd25, %rd2;	// ty * wA + tx
	cvt.u32.u64 	%r10, %rd2;
	shl.b32 	%r11, %r9, 8;		// ty * 256 B (one shared row = 32 doubles)
	mov.u32 	%r12, _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2As;
	add.s32 	%r3, %r12, %r11;	// &As[ty][0]
	shl.b32 	%r13, %r10, 3;		// tx * 8 B
	add.s32 	%r1, %r3, %r13;		// &As[ty][tx]
	add.s64 	%rd10, %rd8, %rd2;	// ty * wB + tx
	mov.u32 	%r14, _ZZ9matrixMulILi32EmEvPdS0_S0_T0_S1_E2Bs;
	add.s32 	%r15, %r14, %r11;
	add.s32 	%r2, %r15, %r13;	// &Bs[ty][tx]
	add.s32 	%r4, %r14, %r13;	// &Bs[0][tx]
	cvta.to.global.u64 	%rd12, %rd19;	// A -> global address space
	cvta.to.global.u64 	%rd13, %rd20;	// B -> global address space
	mov.f64 	%fd104, 0d0000000000000000;	// Csub = 0.0 (loop-carried accumulator)
	mov.u64 	%rd39, %rd6;		// b = bBegin

$L__BB1_2:
	// Tile iteration: stage one A and one B element per thread into shared
	// memory, sync, then accumulate the fully unrolled 32-term dot product.
	add.s64 	%rd26, %rd9, %rd40;	// a + ty*wA + tx
	shl.b64 	%rd27, %rd26, 3;	// element index -> byte offset (f64 = 8 B)
	add.s64 	%rd28, %rd12, %rd27;
	ld.global.f64 	%fd6, [%rd28];
	st.shared.f64 	[%r1], %fd6;		// As[ty][tx] = A[a + wA*ty + tx]
	add.s64 	%rd29, %rd10, %rd39;	// b + ty*wB + tx
	shl.b64 	%rd30, %rd29, 3;
	add.s64 	%rd31, %rd13, %rd30;
	ld.global.f64 	%fd7, [%rd31];
	st.shared.f64 	[%r2], %fd7;		// Bs[ty][tx] = B[b + wB*ty + tx]
	bar.sync 	0;			// both tiles fully populated before any reads
	// Csub += As[ty][k] * Bs[k][tx] for k = 0..31 (unrolled;
	// As stride = 8 B per k, Bs stride = 256 B per k).
	ld.shared.f64 	%fd8, [%r4];
	ld.shared.f64 	%fd9, [%r3];
	fma.rn.f64 	%fd10, %fd9, %fd8, %fd104;
	ld.shared.f64 	%fd11, [%r4+256];
	ld.shared.f64 	%fd12, [%r3+8];
	fma.rn.f64 	%fd13, %fd12, %fd11, %fd10;
	ld.shared.f64 	%fd14, [%r4+512];
	ld.shared.f64 	%fd15, [%r3+16];
	fma.rn.f64 	%fd16, %fd15, %fd14, %fd13;
	ld.shared.f64 	%fd17, [%r4+768];
	ld.shared.f64 	%fd18, [%r3+24];
	fma.rn.f64 	%fd19, %fd18, %fd17, %fd16;
	ld.shared.f64 	%fd20, [%r4+1024];
	ld.shared.f64 	%fd21, [%r3+32];
	fma.rn.f64 	%fd22, %fd21, %fd20, %fd19;
	ld.shared.f64 	%fd23, [%r4+1280];
	ld.shared.f64 	%fd24, [%r3+40];
	fma.rn.f64 	%fd25, %fd24, %fd23, %fd22;
	ld.shared.f64 	%fd26, [%r4+1536];
	ld.shared.f64 	%fd27, [%r3+48];
	fma.rn.f64 	%fd28, %fd27, %fd26, %fd25;
	ld.shared.f64 	%fd29, [%r4+1792];
	ld.shared.f64 	%fd30, [%r3+56];
	fma.rn.f64 	%fd31, %fd30, %fd29, %fd28;
	ld.shared.f64 	%fd32, [%r4+2048];
	ld.shared.f64 	%fd33, [%r3+64];
	fma.rn.f64 	%fd34, %fd33, %fd32, %fd31;
	ld.shared.f64 	%fd35, [%r4+2304];
	ld.shared.f64 	%fd36, [%r3+72];
	fma.rn.f64 	%fd37, %fd36, %fd35, %fd34;
	ld.shared.f64 	%fd38, [%r4+2560];
	ld.shared.f64 	%fd39, [%r3+80];
	fma.rn.f64 	%fd40, %fd39, %fd38, %fd37;
	ld.shared.f64 	%fd41, [%r4+2816];
	ld.shared.f64 	%fd42, [%r3+88];
	fma.rn.f64 	%fd43, %fd42, %fd41, %fd40;
	ld.shared.f64 	%fd44, [%r4+3072];
	ld.shared.f64 	%fd45, [%r3+96];
	fma.rn.f64 	%fd46, %fd45, %fd44, %fd43;
	ld.shared.f64 	%fd47, [%r4+3328];
	ld.shared.f64 	%fd48, [%r3+104];
	fma.rn.f64 	%fd49, %fd48, %fd47, %fd46;
	ld.shared.f64 	%fd50, [%r4+3584];
	ld.shared.f64 	%fd51, [%r3+112];
	fma.rn.f64 	%fd52, %fd51, %fd50, %fd49;
	ld.shared.f64 	%fd53, [%r4+3840];
	ld.shared.f64 	%fd54, [%r3+120];
	fma.rn.f64 	%fd55, %fd54, %fd53, %fd52;
	ld.shared.f64 	%fd56, [%r4+4096];
	ld.shared.f64 	%fd57, [%r3+128];
	fma.rn.f64 	%fd58, %fd57, %fd56, %fd55;
	ld.shared.f64 	%fd59, [%r4+4352];
	ld.shared.f64 	%fd60, [%r3+136];
	fma.rn.f64 	%fd61, %fd60, %fd59, %fd58;
	ld.shared.f64 	%fd62, [%r4+4608];
	ld.shared.f64 	%fd63, [%r3+144];
	fma.rn.f64 	%fd64, %fd63, %fd62, %fd61;
	ld.shared.f64 	%fd65, [%r4+4864];
	ld.shared.f64 	%fd66, [%r3+152];
	fma.rn.f64 	%fd67, %fd66, %fd65, %fd64;
	ld.shared.f64 	%fd68, [%r4+5120];
	ld.shared.f64 	%fd69, [%r3+160];
	fma.rn.f64 	%fd70, %fd69, %fd68, %fd67;
	ld.shared.f64 	%fd71, [%r4+5376];
	ld.shared.f64 	%fd72, [%r3+168];
	fma.rn.f64 	%fd73, %fd72, %fd71, %fd70;
	ld.shared.f64 	%fd74, [%r4+5632];
	ld.shared.f64 	%fd75, [%r3+176];
	fma.rn.f64 	%fd76, %fd75, %fd74, %fd73;
	ld.shared.f64 	%fd77, [%r4+5888];
	ld.shared.f64 	%fd78, [%r3+184];
	fma.rn.f64 	%fd79, %fd78, %fd77, %fd76;
	ld.shared.f64 	%fd80, [%r4+6144];
	ld.shared.f64 	%fd81, [%r3+192];
	fma.rn.f64 	%fd82, %fd81, %fd80, %fd79;
	ld.shared.f64 	%fd83, [%r4+6400];
	ld.shared.f64 	%fd84, [%r3+200];
	fma.rn.f64 	%fd85, %fd84, %fd83, %fd82;
	ld.shared.f64 	%fd86, [%r4+6656];
	ld.shared.f64 	%fd87, [%r3+208];
	fma.rn.f64 	%fd88, %fd87, %fd86, %fd85;
	ld.shared.f64 	%fd89, [%r4+6912];
	ld.shared.f64 	%fd90, [%r3+216];
	fma.rn.f64 	%fd91, %fd90, %fd89, %fd88;
	ld.shared.f64 	%fd92, [%r4+7168];
	ld.shared.f64 	%fd93, [%r3+224];
	fma.rn.f64 	%fd94, %fd93, %fd92, %fd91;
	ld.shared.f64 	%fd95, [%r4+7424];
	ld.shared.f64 	%fd96, [%r3+232];
	fma.rn.f64 	%fd97, %fd96, %fd95, %fd94;
	ld.shared.f64 	%fd98, [%r4+7680];
	ld.shared.f64 	%fd99, [%r3+240];
	fma.rn.f64 	%fd100, %fd99, %fd98, %fd97;
	ld.shared.f64 	%fd101, [%r4+7936];
	ld.shared.f64 	%fd102, [%r3+248];
	fma.rn.f64 	%fd104, %fd102, %fd101, %fd100;	// final FMA writes the loop-carried Csub
	bar.sync 	0;			// all reads finished before the next iteration overwrites the tiles
	add.s64 	%rd39, %rd39, %rd7;	// b += bStep (32 * wB)
	add.s64 	%rd40, %rd40, 32;	// a += aStep (32 elements)
	setp.le.u64 	%p2, %rd40, %rd5;
	@%p2 bra 	$L__BB1_2;		// loop while a <= aEnd

$L__BB1_3:
	// C[wB*32*by + 32*bx + wB*ty + tx] = Csub
	add.s64 	%rd32, %rd6, %rd2;	// 32*bx + tx
	add.s64 	%rd33, %rd32, %rd8;	// + wB*ty
	mul.lo.s64 	%rd34, %rd7, %rd1;	// + wB*32*by
	add.s64 	%rd35, %rd33, %rd34;
	cvta.to.global.u64 	%rd36, %rd18;	// C -> global address space
	shl.b64 	%rd37, %rd35, 3;	// element index -> byte offset
	add.s64 	%rd38, %rd36, %rd37;
	st.global.f64 	[%rd38], %fd104;
	ret;

}