// baracuda-runtime 0.0.1-alpha.33
//
// Safe Rust wrappers for the CUDA Runtime API (devices, streams, events, managed memory, kernel launch via the library API).
// Documentation
//
// Same `vector_add` kernel as baracuda-driver's fixture, duplicated here so
// the runtime integration test is self-contained. Regenerate via:
//   nvcc --ptx -arch=sm_50 vector_add.cu -o vector_add.ptx
// or the bundled helper `cargo xtask build-kernels`.
//

// PTX ISA version this module is written against.
.version 7.4
// Target architecture; matches the `-arch=sm_50` flag in the regenerate command.
.target sm_50
// Addresses are 64 bits wide, so the pointer parameters below are `.u64`.
.address_size 64

// vector_add: each thread handles at most one f32 element.
//
//   vector_add_param_0  address of the first f32 operand array  (read)
//   vector_add_param_1  address of the second f32 operand array (read)
//   vector_add_param_2  address of the f32 result array         (written)
//   vector_add_param_3  element count, compared as unsigned 32-bit
//
// The element index is ctaid.x * ntid.x + tid.x; only the x dimension is
// consulted. A thread whose index is >= the count returns without touching
// memory. All three addresses are converted to the global state space
// before they are dereferenced.
.visible .entry vector_add(
    .param .u64 vector_add_param_0,
    .param .u64 vector_add_param_1,
    .param .u64 vector_add_param_2,
    .param .u32 vector_add_param_3
)
{
    .reg .pred %past_end;
    .reg .b32  %count, %block_dim, %block_id, %thread_id, %gid;
    .reg .f32  %lhs, %rhs, %sum;
    .reg .b64  %ptr_a, %ptr_b, %ptr_out;
    .reg .b64  %gbl_a, %gbl_b, %gbl_out;
    .reg .b64  %byte_off, %addr_a, %addr_b, %addr_out;

    // gid = ctaid.x * ntid.x + tid.x (low 32 bits of the product).
    mov.u32       %block_id,  %ctaid.x;
    mov.u32       %block_dim, %ntid.x;
    mov.u32       %thread_id, %tid.x;
    mad.lo.s32    %gid, %block_id, %block_dim, %thread_id;

    // Bounds check: unsigned compare against the element count.
    ld.param.u32  %count, [vector_add_param_3];
    setp.ge.u32   %past_end, %gid, %count;
    @%past_end bra $L_done;

    // Only in-range threads need the array addresses.
    ld.param.u64  %ptr_a,   [vector_add_param_0];
    ld.param.u64  %ptr_b,   [vector_add_param_1];
    ld.param.u64  %ptr_out, [vector_add_param_2];
    cvta.to.global.u64 %gbl_a,   %ptr_a;
    cvta.to.global.u64 %gbl_b,   %ptr_b;
    cvta.to.global.u64 %gbl_out, %ptr_out;

    // Byte offset of element gid: 4 bytes per f32, widened to 64 bits.
    mul.wide.u32  %byte_off, %gid, 4;
    add.s64       %addr_a,   %gbl_a,   %byte_off;
    add.s64       %addr_b,   %gbl_b,   %byte_off;
    add.s64       %addr_out, %gbl_out, %byte_off;

    // result[gid] = a[gid] + b[gid]
    ld.global.f32 %lhs, [%addr_a];
    ld.global.f32 %rhs, [%addr_b];
    add.f32       %sum, %lhs, %rhs;
    st.global.f32 [%addr_out], %sum;

$L_done:
    ret;
}