//
// RingKernel CUDA PTX Template
// Kernel ID: {{KERNEL_ID}}
// Generated by ringkernel-codegen
//
.version 7.0
.target sm_70
.address_size 64
// Control block structure offsets
.const .u32 CB_IS_ACTIVE = 0;
.const .u32 CB_SHOULD_TERMINATE = 4;
.const .u32 CB_HAS_TERMINATED = 8;
.const .u32 CB_MESSAGES_PROCESSED = 16;
.const .u32 CB_INPUT_HEAD = 32;
.const .u32 CB_INPUT_TAIL = 40;
// Persistent ring kernel entry point.
//
// Spins until the host sets CB_SHOULD_TERMINATE (offset 4). While
// CB_IS_ACTIVE (offset 0) is non-zero, the templated user code runs
// between two block-wide barriers. On exit, 1 is stored to
// CB_HAS_TERMINATED (offset 8) with release semantics so the host's
// acquire load observes all prior device writes.
.visible .entry {{KERNEL_ID}}_main(
.param .u64 control_block,
.param .u64 input_queue,
.param .u64 output_queue,
.param .u64 user_state
) {
// Register declarations.
// BUGFIX: %tid, %ntid and %nctaid are predefined PTX special registers
// and must not be redeclared as .reg variables — ptxas rejects the
// redefinition (and `mov.u32 %tid, %tid.x` would be self-conflicting).
// Distinct scratch names are used instead. If {{USER_CODE}} expects the
// old names, update its template accordingly.
.reg .u32 %r_tid, %r_bid, %r_ntid, %r_nctaid;
.reg .u64 %cb, %in_q, %out_q, %state;
.reg .u32 %active, %terminate, %one;
.reg .pred %p_active, %p_term;
// Load kernel parameters (generic 64-bit pointers).
ld.param.u64 %cb, [control_block];
ld.param.u64 %in_q, [input_queue];
ld.param.u64 %out_q, [output_queue];
ld.param.u64 %state, [user_state];
// Cache thread/block geometry for the user code below.
mov.u32 %r_tid, %tid.x;
mov.u32 %r_bid, %ctaid.x;
mov.u32 %r_ntid, %ntid.x;
mov.u32 %r_nctaid, %nctaid.x;
MAIN_LOOP:
// Check termination flag (control block offset 4 = CB_SHOULD_TERMINATE).
// Acquire ordering pairs with the host's release store of the flag.
ld.global.acquire.gpu.u32 %terminate, [%cb + 4];
setp.ne.u32 %p_term, %terminate, 0;
@%p_term bra EXIT;
// Check if active (offset 0 = CB_IS_ACTIVE); busy-wait while paused.
ld.global.acquire.gpu.u32 %active, [%cb];
setp.eq.u32 %p_active, %active, 0;
@%p_active bra MAIN_LOOP;
// NOTE(review): every thread loads the flags independently. If the host
// flips a flag mid-iteration, threads can observe different values and
// diverge — some reaching bar.sync while others exit — deadlocking the
// CTA. Consider having thread 0 broadcast the flags through shared
// memory before the branches. TODO confirm against host-side protocol.
// Synchronize threads before processing.
bar.sync 0;
// === User kernel code ===
{{USER_CODE}}
// === End user code ===
// Synchronize after processing.
bar.sync 0;
// Continue processing loop.
bra MAIN_LOOP;
EXIT:
// Publish shutdown (offset 8 = CB_HAS_TERMINATED); release ordering
// makes all prior writes visible before the host sees the flag.
mov.u32 %one, 1;
st.global.release.gpu.u32 [%cb + 8], %one;
ret;
}
// Helper: atomically add %amount to the 64-bit counter at global
// address %addr and return (in %prev) the value stored there before
// the add. Callers bind arguments positionally, so the internal
// register names are local to this function.
.func (.reg .u64 %prev) atomic_add_u64(
.reg .u64 %addr,
.reg .u64 %amount
) {
atom.global.add.u64 %prev, [%addr], %amount;
ret;
}
// Helper: Memory fence
// Orders this thread's memory operations as observed by other threads
// in the same CTA (thread block). membar.cta is the legacy spelling of
// a block-scoped sequentially-consistent fence.
.func memory_fence_block() {
membar.cta;
ret;
}
// Orders this thread's memory operations as observed by all threads on
// the device (GPU-wide scope); legacy spelling of a gpu-scoped fence.
.func memory_fence_device() {
membar.gl;
ret;
}
// Orders this thread's memory operations as observed by the whole
// system, including the host CPU and peer devices (system scope).
.func memory_fence_system() {
membar.sys;
ret;
}