//
// Same `vector_add` kernel as baracuda-driver's fixture, duplicated here so
// the runtime integration test is self-contained. Regenerate via:
// nvcc --ptx -arch=sm_50 vector_add.cu -o vector_add.ptx
// or the bundled helper `cargo xtask build-kernels`.
//
// PTX ISA version this module is written against.
.version 7.4
// Target architecture (same value as the -arch flag in the regenerate command above).
.target sm_50
// Addresses are 64 bits wide throughout this module.
.address_size 64
//
// vector_add: each thread handles one f32 element, storing lhs[i] + rhs[i]
// into out[i], where i = %ctaid.x * %ntid.x + %tid.x.
//
//   vector_add_param_0  u64  address of the first f32 operand array (lhs)
//   vector_add_param_1  u64  address of the second f32 operand array (rhs)
//   vector_add_param_2  u64  address of the f32 result array (out)
//   vector_add_param_3  u32  index bound; a thread whose index is >= this
//                            value returns without touching memory
//
.visible .entry vector_add(
    .param .u64 vector_add_param_0,
    .param .u64 vector_add_param_1,
    .param .u64 vector_add_param_2,
    .param .u32 vector_add_param_3
)
{
    .reg .pred  %past_end;
    .reg .b32   %limit, %block_size, %block_id, %thread_id, %index;
    .reg .f32   %lhs, %rhs, %sum;
    .reg .b64   %arg_lhs, %arg_rhs, %arg_out;
    .reg .b64   %base_lhs, %base_rhs, %base_out;
    .reg .b64   %byte_off, %addr_lhs, %addr_rhs, %addr_out;

    // index = block id * block size + thread id (x dimension only).
    mov.u32             %block_size, %ntid.x;
    mov.u32             %block_id, %ctaid.x;
    mov.u32             %thread_id, %tid.x;
    mad.lo.s32          %index, %block_id, %block_size, %thread_id;

    // Bounds guard: unsigned compare against the element bound.
    ld.param.u32        %limit, [vector_add_param_3];
    setp.ge.u32         %past_end, %index, %limit;
    @%past_end bra      $L__done;

    // Fetch the three pointer arguments and convert them to global-space addresses.
    ld.param.u64        %arg_lhs, [vector_add_param_0];
    ld.param.u64        %arg_rhs, [vector_add_param_1];
    ld.param.u64        %arg_out, [vector_add_param_2];
    cvta.to.global.u64  %base_lhs, %arg_lhs;
    cvta.to.global.u64  %base_rhs, %arg_rhs;
    cvta.to.global.u64  %base_out, %arg_out;

    // All three arrays share the same byte offset: index * 4, widened to 64 bits.
    mul.wide.u32        %byte_off, %index, 4;
    add.s64             %addr_lhs, %base_lhs, %byte_off;
    add.s64             %addr_rhs, %base_rhs, %byte_off;
    add.s64             %addr_out, %base_out, %byte_off;

    // Both loads are issued before the store, as in the generated code.
    ld.global.f32       %lhs, [%addr_lhs];
    ld.global.f32       %rhs, [%addr_rhs];
    add.f32             %sum, %lhs, %rhs;
    st.global.f32       [%addr_out], %sum;

$L__done:
    ret;
}