// ringkernel-codegen 0.4.2
//
// Code generation for RingKernel - generates GPU kernel source code
// Documentation
//
// RingKernel CUDA PTX Template
// Kernel ID: {{KERNEL_ID}}
// Generated by ringkernel-codegen
//

.version 7.0
.target sm_70
.address_size 64

// Control block structure offsets (byte offsets from the control_block base).
//
// NOTE(review): these are variables in the .const state space, not
// assembler-time symbols, so they cannot be written as the immediate part of
// an address operand. The entry point in this file therefore repeats the
// values 0, 4 and 8 as literals; keep both places in sync if the layout
// changes.
// Polled by the entry point through [%cb].
.const .u32 CB_IS_ACTIVE = 0;
// Polled by the entry point through [%cb + 4].
.const .u32 CB_SHOULD_TERMINATE = 4;
// Written with 1 by the entry point through [%cb + 8] just before it returns.
.const .u32 CB_HAS_TERMINATED = 8;
// The three offsets below are not referenced by any code visible in this file.
.const .u32 CB_MESSAGES_PROCESSED = 16;
.const .u32 CB_INPUT_HEAD = 32;
.const .u32 CB_INPUT_TAIL = 40;

// Persistent ring kernel entry point.
//
// All threads of a CTA loop until the should-terminate word of the control
// block (byte offset 4) is observed non-zero. On each iteration in which the
// is-active word (byte offset 0) is observed non-zero, the injected user code
// runs once, followed by a CTA barrier. On exit every thread stores 1 to the
// has-terminated word (byte offset 8).
//
// Parameters:
//   control_block - address of the control block; accessed with ld/st.global,
//                   so it is assumed to be usable as a .global address
//   input_queue   - loaded into %in_q; not otherwise used by the template
//   output_queue  - loaded into %out_q; not otherwise used by the template
//   user_state    - loaded into %state; not otherwise used by the template
//
// Registers set up before the loop and left untouched by the template:
//   %cb, %in_q, %out_q, %state      - the four parameters
//   %tid_x, %bid, %ntid_x, %nctaid_x - x components of %tid, %ctaid, %ntid
//                                      and %nctaid
.visible .entry {{KERNEL_ID}}_main(
    .param .u64 control_block,
    .param .u64 input_queue,
    .param .u64 output_queue,
    .param .u64 user_state
) {
    // Register declarations.
    // %tid, %ntid and %nctaid are predefined special registers, so the
    // scalar copies carry an _x suffix instead of reusing those names.
    .reg .u32 %tid_x, %bid, %ntid_x, %nctaid_x;
    .reg .u64 %cb, %in_q, %out_q, %state;
    .reg .u32 %active, %terminate, %one;
    .reg .pred %p_active, %p_term;

    // Load parameters
    ld.param.u64 %cb, [control_block];
    ld.param.u64 %in_q, [input_queue];
    ld.param.u64 %out_q, [output_queue];
    ld.param.u64 %state, [user_state];

    // Get thread/block IDs
    mov.u32 %tid_x, %tid.x;
    mov.u32 %bid, %ctaid.x;
    mov.u32 %ntid_x, %ntid.x;
    mov.u32 %nctaid_x, %nctaid.x;

MAIN_LOOP:
    // Check termination flag (control block offset 4).
    // Each thread polls on its own, so two threads of the same CTA can see
    // different values. bar.red.or combines the per-thread results across the
    // CTA so that every thread takes the same branch and nobody is left
    // waiting at a barrier that other threads will never reach.
    ld.global.acquire.gpu.u32 %terminate, [%cb + 4];
    setp.ne.u32 %p_term, %terminate, 0;
    bar.red.or.pred %p_term, 0, %p_term;
    @%p_term bra EXIT;

    // Check if active (control block offset 0), again as a CTA-wide decision:
    // the iteration runs if any thread observed a non-zero flag.
    ld.global.acquire.gpu.u32 %active, [%cb];
    setp.ne.u32 %p_active, %active, 0;
    bar.red.or.pred %p_active, 0, %p_active;
    @!%p_active bra MAIN_LOOP;

    // The bar.red above already acts as the CTA barrier before processing,
    // so no separate bar.sync is needed here.

    // === User kernel code ===
    {{USER_CODE}}
    // === End user code ===

    // Synchronize after processing
    bar.sync 0;

    // Continue processing loop
    bra MAIN_LOOP;

EXIT:
    // Mark kernel as terminated (control block offset 8).
    mov.u32 %one, 1;
    st.global.release.gpu.u32 [%cb + 8], %one;
    ret;
}

// Helper: Atomic add u64
//
// Atomically adds %val to the 64-bit word at [%ptr] and returns the value
// that word held before the addition.
//   %ptr - address of the target word; the instruction is atom.global, so
//          this must be a .global-space address
//   %val - addend
//   %old - (return value) contents of [%ptr] prior to the add
//
// NOTE(review): this definition appears after the entry point; confirm that
// any caller placed earlier in the module sees a prototype first.
.func (.reg .u64 %old) atomic_add_u64(
    .reg .u64 %ptr,
    .reg .u64 %val
) {
    atom.global.add.u64 %old, [%ptr], %val;
    ret;
}

// Helper: Memory fence
//
// Issues a CTA-level memory barrier (membar.cta) and returns.
// Takes no arguments and returns no value.
.func memory_fence_block() {
    membar.cta;
    ret;
}

// Helper: issues a device-level memory barrier (membar.gl) and returns.
// Takes no arguments and returns no value.
.func memory_fence_device() {
    membar.gl;
    ret;
}

// Helper: issues a system-level memory barrier (membar.sys) and returns.
// Takes no arguments and returns no value.
.func memory_fence_system() {
    membar.sys;
    ret;
}