Skip to main content

kaio_core/ir/
kernel.rs

1//! PTX kernel — a single `.entry` function in a PTX module.
2
3use super::instruction::PtxInstruction;
4use super::param::PtxParam;
5use super::register::Register;
6use crate::instr::ArithOp;
7use crate::instr::control::ControlOp;
8use crate::instr::memory::MemoryOp;
9use crate::instr::tensor_core::TensorCoreOp;
10use crate::types::RegKind;
11
/// A shared memory allocation declared in a PTX kernel's preamble.
///
/// Emitted as `.shared .align {align} .b8 {name}[{size_bytes}];` after
/// register declarations.
///
/// Two declarations compare equal only when their name, alignment, and
/// size all match.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SharedDecl {
    /// Symbol name of the allocation (e.g., `"sdata"`).
    pub name: String,
    /// Alignment in bytes (4 for f32, 8 for f64).
    pub align: u32,
    /// Total size of the allocation in bytes.
    pub size_bytes: u32,
}
25
26/// A PTX kernel function (`.visible .entry`).
27///
28/// Built by constructing parameters, allocating registers, and pushing
29/// instructions. Call [`set_registers`](Self::set_registers) with the
30/// allocator's output before emission so the kernel knows which `.reg`
31/// declarations to emit.
32#[derive(Debug, Clone)]
33pub struct PtxKernel {
34    /// Kernel entry point name.
35    pub name: String,
36    /// Declared parameters (in signature order).
37    pub params: Vec<PtxParam>,
38    /// Instruction body.
39    pub body: Vec<PtxInstruction>,
40    /// All registers used, for `.reg` declaration emission.
41    pub registers: Vec<Register>,
42    /// Shared memory declarations (emitted after register declarations).
43    pub shared_decls: Vec<SharedDecl>,
44}
45
46impl PtxKernel {
47    /// Create a new empty kernel with the given name.
48    pub fn new(name: &str) -> Self {
49        Self {
50            name: name.to_string(),
51            params: Vec::new(),
52            body: Vec::new(),
53            registers: Vec::new(),
54            shared_decls: Vec::new(),
55        }
56    }
57
58    /// Add a parameter to the kernel signature.
59    pub fn add_param(&mut self, param: PtxParam) {
60        self.params.push(param);
61    }
62
63    /// Append an instruction to the kernel body.
64    pub fn push(&mut self, instr: PtxInstruction) {
65        self.body.push(instr);
66    }
67
68    /// Set the register list (from [`super::register::RegisterAllocator::into_allocated`]).
69    pub fn set_registers(&mut self, regs: Vec<Register>) {
70        self.registers = regs;
71    }
72
73    /// Add a shared memory declaration to the kernel preamble.
74    pub fn add_shared_decl(&mut self, decl: SharedDecl) {
75        self.shared_decls.push(decl);
76    }
77
78    /// Compute structural statistics about this kernel's emitted PTX.
79    ///
80    /// Walks the instruction body and counts instruction types, registers
81    /// by kind, and declared shared memory. Useful for inspection and
82    /// comparison between kernel variants.
83    ///
84    /// These are **not** runtime profiling data — final hardware register
85    /// allocation and occupancy may differ after CUDA driver compilation.
86    pub fn stats(&self) -> KernelStats {
87        let mut s = KernelStats::default();
88
89        for instr in &self.body {
90            match instr {
91                PtxInstruction::Arith(op) => {
92                    s.total_instructions += 1;
93                    if matches!(op, ArithOp::Fma { .. }) {
94                        s.fma += 1;
95                    } else {
96                        s.arith_other += 1;
97                    }
98                }
99                PtxInstruction::Memory(op) => {
100                    s.total_instructions += 1;
101                    match op {
102                        MemoryOp::LdGlobal { .. } => s.ld_global += 1,
103                        MemoryOp::StGlobal { .. } => s.st_global += 1,
104                        MemoryOp::LdShared { .. } => s.ld_shared += 1,
105                        MemoryOp::StShared { .. } => s.st_shared += 1,
106                        MemoryOp::CpAsyncCaSharedGlobal { .. } => s.cp_async += 1,
107                        MemoryOp::CpAsyncCommitGroup => s.cp_async_commit += 1,
108                        MemoryOp::CpAsyncWaitGroup { .. } => s.cp_async_wait += 1,
109                        _ => {}
110                    }
111                }
112                PtxInstruction::TensorCore(op) => {
113                    s.total_instructions += 1;
114                    match op {
115                        TensorCoreOp::MmaSync { .. } => s.mma += 1,
116                    }
117                }
118                PtxInstruction::Control(op) => {
119                    s.total_instructions += 1;
120                    match op {
121                        ControlOp::BarSync { .. } => s.bar_sync += 1,
122                        ControlOp::BraPred { .. } | ControlOp::Bra { .. } => s.branches += 1,
123                        ControlOp::SetP { .. } => s.setp += 1,
124                        _ => {}
125                    }
126                }
127                PtxInstruction::Mov { .. } => {
128                    s.total_instructions += 1;
129                    s.mov += 1;
130                }
131                PtxInstruction::Cvt { .. } => {
132                    s.total_instructions += 1;
133                    s.cvt += 1;
134                }
135                PtxInstruction::Label(_) | PtxInstruction::Comment(_) => {}
136            }
137        }
138
139        for reg in &self.registers {
140            match reg.kind {
141                RegKind::R => s.registers_r += 1,
142                RegKind::Rd => s.registers_rd += 1,
143                RegKind::F => s.registers_f += 1,
144                RegKind::Fd => s.registers_fd += 1,
145                RegKind::P => s.registers_p += 1,
146                RegKind::H => s.registers_h += 1,
147                RegKind::Hb => s.registers_hb += 1,
148            }
149        }
150
151        s.shared_bytes = self.shared_decls.iter().map(|d| d.size_bytes).sum();
152
153        s
154    }
155}
156
/// Structural statistics about a compiled kernel's emitted PTX.
///
/// These describe the instruction mix and declared resource usage in
/// KAIO's generated PTX — useful for inspection and comparison between
/// kernel variants, but **not** a substitute for runtime profiling.
/// Final hardware register allocation and occupancy may differ from
/// these counts after the CUDA driver's backend compilation (PTX → SASS).
///
/// Every field is a plain counter, so the struct is `Clone`: a snapshot
/// can be duplicated and kept for comparison against another kernel's
/// stats.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct KernelStats {
    /// Total instructions (excludes labels and comments).
    pub total_instructions: usize,
    /// `ld.global` count.
    pub ld_global: usize,
    /// `st.global` count.
    pub st_global: usize,
    /// `ld.shared` count.
    pub ld_shared: usize,
    /// `st.shared` count.
    pub st_shared: usize,
    /// `bar.sync` count.
    pub bar_sync: usize,
    /// `mma.sync` instruction count (all tensor-core shapes).
    pub mma: usize,
    /// `cp.async.ca.shared.global` instruction count.
    pub cp_async: usize,
    /// `cp.async.commit_group` instruction count.
    pub cp_async_commit: usize,
    /// `cp.async.wait_group` instruction count.
    pub cp_async_wait: usize,
    /// `fma` instruction count.
    pub fma: usize,
    /// Non-FMA arithmetic instructions (add, mul, sub, etc.).
    pub arith_other: usize,
    /// `mov` instruction count.
    pub mov: usize,
    /// `cvt` instruction count.
    pub cvt: usize,
    /// Branch instructions (`bra`, `@pred bra`).
    pub branches: usize,
    /// `setp` comparison-to-predicate instructions.
    pub setp: usize,
    /// `%r` registers (32-bit integer).
    pub registers_r: u32,
    /// `%rd` registers (64-bit integer).
    pub registers_rd: u32,
    /// `%f` registers (f32).
    pub registers_f: u32,
    /// `%fd` registers (f64).
    pub registers_fd: u32,
    /// `%p` registers (predicate).
    pub registers_p: u32,
    /// `%h` registers (f16).
    pub registers_h: u32,
    /// `%hb` registers (bf16).
    pub registers_hb: u32,
    /// Total declared shared memory in bytes.
    pub shared_bytes: u32,
}
215
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::Operand;
    use crate::types::PtxType;

    /// Build a register directly from its three fields.
    fn reg(kind: RegKind, index: u32, ptx_type: PtxType) -> Register {
        Register {
            kind,
            index,
            ptx_type,
        }
    }

    /// `%f{index}` typed as f32.
    fn f32_reg(index: u32) -> Register {
        reg(RegKind::F, index, PtxType::F32)
    }

    /// `%r{index}` typed as u32.
    fn u32_reg(index: u32) -> Register {
        reg(RegKind::R, index, PtxType::U32)
    }

    /// `%rd{index}` typed as u64.
    fn u64_reg(index: u32) -> Register {
        reg(RegKind::Rd, index, PtxType::U64)
    }

    /// `%p{index}` predicate register.
    fn pred_reg(index: u32) -> Register {
        reg(RegKind::P, index, PtxType::Pred)
    }

    #[test]
    fn stats_empty_kernel() {
        // A freshly created kernel must report all-zero statistics.
        assert_eq!(PtxKernel::new("empty").stats(), KernelStats::default());
    }

    #[test]
    fn stats_counts_instruction_types() {
        use crate::instr::control::CmpOp;

        let fma = || {
            PtxInstruction::Arith(ArithOp::Fma {
                dst: f32_reg(0),
                a: Operand::Reg(f32_reg(1)),
                b: Operand::Reg(f32_reg(2)),
                c: Operand::Reg(f32_reg(3)),
                ty: PtxType::F32,
            })
        };

        let instructions = vec![
            // Two FMAs.
            fma(),
            fma(),
            // One add, which lands in `arith_other`.
            PtxInstruction::Arith(ArithOp::Add {
                dst: u32_reg(0),
                lhs: Operand::Reg(u32_reg(1)),
                rhs: Operand::ImmU32(1),
                ty: PtxType::U32,
            }),
            // Global load and store.
            PtxInstruction::Memory(MemoryOp::LdGlobal {
                dst: f32_reg(0),
                addr: u64_reg(0),
                ty: PtxType::F32,
            }),
            PtxInstruction::Memory(MemoryOp::StGlobal {
                addr: u64_reg(0),
                src: f32_reg(0),
                ty: PtxType::F32,
            }),
            // Shared load and store.
            PtxInstruction::Memory(MemoryOp::LdShared {
                dst: f32_reg(0),
                addr: u32_reg(0),
                ty: PtxType::F32,
            }),
            PtxInstruction::Memory(MemoryOp::StShared {
                addr: u32_reg(0),
                src: f32_reg(0),
                ty: PtxType::F32,
            }),
            // ld.param has no dedicated counter; it only adds to the total.
            PtxInstruction::Memory(MemoryOp::LdParam {
                dst: u64_reg(0),
                param_name: "p0".to_string(),
                ty: PtxType::U64,
            }),
            // Barrier.
            PtxInstruction::Control(ControlOp::BarSync { barrier_id: 0 }),
            // Predicated branch and the setp feeding it.
            PtxInstruction::Control(ControlOp::BraPred {
                pred: pred_reg(0),
                target: "L0".to_string(),
                negate: false,
            }),
            PtxInstruction::Control(ControlOp::SetP {
                dst: pred_reg(0),
                cmp_op: CmpOp::Lt,
                lhs: Operand::Reg(u32_reg(0)),
                rhs: Operand::ImmU32(10),
                ty: PtxType::U32,
            }),
            // mov and cvt.
            PtxInstruction::Mov {
                dst: u32_reg(0),
                src: Operand::ImmU32(0),
                ty: PtxType::U32,
            },
            PtxInstruction::Cvt {
                dst: f32_reg(0),
                src: u32_reg(0),
                dst_ty: PtxType::F32,
                src_ty: PtxType::U32,
            },
            // ret only adds to the total.
            PtxInstruction::Control(ControlOp::Ret),
            // Neither a label nor a comment may be counted.
            PtxInstruction::Label("L0".to_string()),
            PtxInstruction::Comment("test".to_string()),
        ];

        let mut kernel = PtxKernel::new("test");
        for instruction in instructions {
            kernel.push(instruction);
        }

        let stats = kernel.stats();
        // 2 fma + 1 add + 4 global/shared accesses + 1 ld.param + 1 bar.sync
        // + 1 branch + 1 setp + 1 mov + 1 cvt + 1 ret = 14
        assert_eq!(stats.total_instructions, 14);
        assert_eq!(stats.fma, 2);
        assert_eq!(stats.arith_other, 1);
        assert_eq!(stats.ld_global, 1);
        assert_eq!(stats.st_global, 1);
        assert_eq!(stats.ld_shared, 1);
        assert_eq!(stats.st_shared, 1);
        assert_eq!(stats.bar_sync, 1);
        assert_eq!(stats.branches, 1);
        assert_eq!(stats.setp, 1);
        assert_eq!(stats.mov, 1);
        assert_eq!(stats.cvt, 1);
    }

    #[test]
    fn stats_counts_registers_by_kind() {
        let registers = vec![
            u32_reg(0),
            reg(RegKind::R, 1, PtxType::S32),
            u32_reg(2),
            u64_reg(0),
            f32_reg(0),
            f32_reg(1),
            reg(RegKind::Fd, 0, PtxType::F64),
            pred_reg(0),
            pred_reg(1),
        ];

        let mut kernel = PtxKernel::new("test");
        kernel.set_registers(registers);
        let stats = kernel.stats();

        // (actual, expected) per register kind: R, Rd, F, Fd, P.
        let checks = [
            (stats.registers_r, 3),
            (stats.registers_rd, 1),
            (stats.registers_f, 2),
            (stats.registers_fd, 1),
            (stats.registers_p, 2),
        ];
        for (actual, expected) in checks {
            assert_eq!(actual, expected);
        }
    }

    #[test]
    fn stats_counts_tensor_core_and_cp_async() {
        use crate::fragment::{alloc_a, alloc_b, alloc_c};
        use crate::instr::MmaShape;
        use crate::ir::RegisterAllocator;

        let mut alloc = RegisterAllocator::new();
        let mut kernel = PtxKernel::new("tc_stats_test");

        // Two mma.sync instructions, each with freshly allocated fragments.
        for _ in 0..2 {
            let mma = crate::instr::TensorCoreOp::MmaSync {
                d: alloc_c(&mut alloc),
                a: alloc_a(&mut alloc),
                b: alloc_b(&mut alloc),
                c: alloc_c(&mut alloc),
                shape: MmaShape::M16N8K16,
                d_ty: PtxType::F32,
                a_ty: PtxType::F16,
                b_ty: PtxType::F16,
                c_ty: PtxType::F32,
            };
            kernel.push(PtxInstruction::TensorCore(mma));
        }

        // Three cp.async copies followed by one commit and one wait.
        let dst_shared = u32_reg(0);
        let src_global = u64_reg(0);
        for _ in 0..3 {
            let copy = MemoryOp::new_cp_async_ca(dst_shared, src_global, 16);
            kernel.push(PtxInstruction::Memory(copy));
        }
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncCommitGroup));
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncWaitGroup { n: 0 }));

        let stats = kernel.stats();
        assert_eq!(stats.mma, 2);
        assert_eq!(stats.cp_async, 3);
        assert_eq!(stats.cp_async_commit, 1);
        assert_eq!(stats.cp_async_wait, 1);
        // 2 mma + 3 cp.async + 1 commit + 1 wait = 7 total
        assert_eq!(stats.total_instructions, 7);
    }

    #[test]
    fn stats_counts_shared_bytes() {
        // (name, size in bytes): 64 * 17 * 4 and 16 * 65 * 4.
        let tiles = [("tile_a", 4352), ("tile_b", 4160)];

        let mut kernel = PtxKernel::new("test");
        for (name, size_bytes) in tiles {
            kernel.add_shared_decl(SharedDecl {
                name: name.to_string(),
                align: 4,
                size_bytes,
            });
        }

        assert_eq!(kernel.stats().shared_bytes, 4352 + 4160);
    }
}