Skip to main content

kaio_core/ir/
kernel.rs

1//! PTX kernel — a single `.entry` function in a PTX module.
2
3use super::instruction::PtxInstruction;
4use super::param::PtxParam;
5use super::register::Register;
6use crate::instr::ArithOp;
7use crate::instr::control::ControlOp;
8use crate::instr::memory::MemoryOp;
9use crate::instr::tensor_core::TensorCoreOp;
10use crate::types::RegKind;
11
/// Shared memory declaration in a PTX kernel preamble.
///
/// Emitted as `.shared .align {align} .b8 {name}[{size_bytes}];` after
/// register declarations.
///
/// Implements `PartialEq`/`Eq` so declarations can be compared directly
/// (for example in `assert_eq!`) instead of field by field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SharedDecl {
    /// Name of the shared memory allocation (e.g., `"sdata"`).
    pub name: String,
    /// Alignment in bytes (4 for f32, 8 for f64).
    pub align: u32,
    /// Total allocation size in bytes.
    pub size_bytes: u32,
}
25
26/// A PTX kernel function (`.visible .entry`).
27///
28/// Built by constructing parameters, allocating registers, and pushing
29/// instructions. Call [`set_registers`](Self::set_registers) with the
30/// allocator's output before emission so the kernel knows which `.reg`
31/// declarations to emit.
32#[derive(Debug, Clone)]
33pub struct PtxKernel {
34    /// Kernel entry point name.
35    pub name: String,
36    /// Declared parameters (in signature order).
37    pub params: Vec<PtxParam>,
38    /// Instruction body.
39    pub body: Vec<PtxInstruction>,
40    /// All registers used, for `.reg` declaration emission.
41    pub registers: Vec<Register>,
42    /// Shared memory declarations (emitted after register declarations).
43    pub shared_decls: Vec<SharedDecl>,
44}
45
46impl PtxKernel {
47    /// Create a new empty kernel with the given name.
48    pub fn new(name: &str) -> Self {
49        Self {
50            name: name.to_string(),
51            params: Vec::new(),
52            body: Vec::new(),
53            registers: Vec::new(),
54            shared_decls: Vec::new(),
55        }
56    }
57
58    /// Add a parameter to the kernel signature.
59    pub fn add_param(&mut self, param: PtxParam) {
60        self.params.push(param);
61    }
62
63    /// Append an instruction to the kernel body.
64    pub fn push(&mut self, instr: PtxInstruction) {
65        self.body.push(instr);
66    }
67
68    /// Set the register list (from [`super::register::RegisterAllocator::into_allocated`]).
69    pub fn set_registers(&mut self, regs: Vec<Register>) {
70        self.registers = regs;
71    }
72
73    /// Add a shared memory declaration to the kernel preamble.
74    pub fn add_shared_decl(&mut self, decl: SharedDecl) {
75        self.shared_decls.push(decl);
76    }
77
78    /// Compute structural statistics about this kernel's emitted PTX.
79    ///
80    /// Walks the instruction body and counts instruction types, registers
81    /// by kind, and declared shared memory. Useful for inspection and
82    /// comparison between kernel variants.
83    ///
84    /// These are **not** runtime profiling data — final hardware register
85    /// allocation and occupancy may differ after CUDA driver compilation.
86    pub fn stats(&self) -> KernelStats {
87        let mut s = KernelStats::default();
88
89        for instr in &self.body {
90            match instr {
91                PtxInstruction::Arith(op) => {
92                    s.total_instructions += 1;
93                    if matches!(op, ArithOp::Fma { .. }) {
94                        s.fma += 1;
95                    } else {
96                        s.arith_other += 1;
97                    }
98                }
99                PtxInstruction::Memory(op) => {
100                    s.total_instructions += 1;
101                    match op {
102                        MemoryOp::LdGlobal { .. } => s.ld_global += 1,
103                        MemoryOp::StGlobal { .. } => s.st_global += 1,
104                        MemoryOp::LdShared { .. } => s.ld_shared += 1,
105                        MemoryOp::StShared { .. } => s.st_shared += 1,
106                        MemoryOp::CpAsyncCaSharedGlobal { .. } => s.cp_async += 1,
107                        MemoryOp::CpAsyncCommitGroup => s.cp_async_commit += 1,
108                        MemoryOp::CpAsyncWaitGroup { .. } => s.cp_async_wait += 1,
109                        _ => {}
110                    }
111                }
112                PtxInstruction::TensorCore(op) => {
113                    s.total_instructions += 1;
114                    match op {
115                        TensorCoreOp::MmaSync { .. } | TensorCoreOp::MmaSyncInt8 { .. } => {
116                            s.mma += 1
117                        }
118                    }
119                }
120                PtxInstruction::Control(op) => {
121                    s.total_instructions += 1;
122                    match op {
123                        ControlOp::BarSync { .. } => s.bar_sync += 1,
124                        ControlOp::BraPred { .. } | ControlOp::Bra { .. } => s.branches += 1,
125                        ControlOp::SetP { .. } => s.setp += 1,
126                        _ => {}
127                    }
128                }
129                PtxInstruction::Mov { .. } => {
130                    s.total_instructions += 1;
131                    s.mov += 1;
132                }
133                PtxInstruction::Cvt { .. } => {
134                    s.total_instructions += 1;
135                    s.cvt += 1;
136                }
137                PtxInstruction::Label(_) | PtxInstruction::Comment(_) => {}
138            }
139        }
140
141        for reg in &self.registers {
142            match reg.kind {
143                RegKind::R => s.registers_r += 1,
144                RegKind::Rd => s.registers_rd += 1,
145                RegKind::F => s.registers_f += 1,
146                RegKind::Fd => s.registers_fd += 1,
147                RegKind::P => s.registers_p += 1,
148                RegKind::H => s.registers_h += 1,
149                RegKind::Hb => s.registers_hb += 1,
150            }
151        }
152
153        s.shared_bytes = self.shared_decls.iter().map(|d| d.size_bytes).sum();
154
155        s
156    }
157}
158
/// Structural statistics about a compiled kernel's emitted PTX.
///
/// These describe the instruction mix and declared resource usage in
/// KAIO's generated PTX — useful for inspection and comparison between
/// kernel variants, but **not** a substitute for runtime profiling.
/// Final hardware register allocation and occupancy may differ from
/// these counts after the CUDA driver's backend compilation (PTX → SASS).
///
/// Every field is a plain counter, so the type is `Clone`: a snapshot can
/// be kept around and compared (`PartialEq`) against a later one.
/// `Default` yields all-zero counts.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct KernelStats {
    /// Total instructions (excludes labels and comments).
    pub total_instructions: usize,
    /// `ld.global` count.
    pub ld_global: usize,
    /// `st.global` count.
    pub st_global: usize,
    /// `ld.shared` count.
    pub ld_shared: usize,
    /// `st.shared` count.
    pub st_shared: usize,
    /// `bar.sync` count.
    pub bar_sync: usize,
    /// `mma.sync` instruction count (all tensor-core shapes).
    pub mma: usize,
    /// `cp.async.ca.shared.global` instruction count.
    pub cp_async: usize,
    /// `cp.async.commit_group` instruction count.
    pub cp_async_commit: usize,
    /// `cp.async.wait_group` instruction count.
    pub cp_async_wait: usize,
    /// `fma` instruction count.
    pub fma: usize,
    /// Non-FMA arithmetic instructions (add, mul, sub, etc.).
    pub arith_other: usize,
    /// `mov` instruction count.
    pub mov: usize,
    /// `cvt` instruction count.
    pub cvt: usize,
    /// Branch instructions (`bra`, `@pred bra`).
    pub branches: usize,
    /// `setp` comparison-to-predicate instructions.
    pub setp: usize,
    /// `%r` registers (32-bit integer).
    pub registers_r: u32,
    /// `%rd` registers (64-bit integer).
    pub registers_rd: u32,
    /// `%f` registers (f32).
    pub registers_f: u32,
    /// `%fd` registers (f64).
    pub registers_fd: u32,
    /// `%p` registers (predicate).
    pub registers_p: u32,
    /// `%h` registers (f16).
    pub registers_h: u32,
    /// `%hb` registers (bf16).
    pub registers_hb: u32,
    /// Total declared shared memory in bytes.
    pub shared_bytes: u32,
}
217
#[cfg(test)]
mod tests {
    //! Unit tests for [`PtxKernel::stats`].
    //!
    //! Each test compares the *whole* [`KernelStats`] against an expected
    //! value built with struct-update syntax, so a counter that is bumped
    //! by mistake fails the test just like one that is missed.

    use super::*;
    use crate::ir::Operand;
    use crate::types::PtxType;

    /// Shorthand for building a [`Register`] literal.
    fn reg(kind: RegKind, index: u32, ptx_type: PtxType) -> Register {
        Register {
            kind,
            index,
            ptx_type,
        }
    }

    #[test]
    fn stats_empty_kernel() {
        let kernel = PtxKernel::new("empty");
        let s = kernel.stats();
        assert_eq!(s, KernelStats::default());
    }

    #[test]
    fn stats_counts_instruction_types() {
        let mut kernel = PtxKernel::new("test");

        // 2 FMA
        for _ in 0..2 {
            kernel.push(PtxInstruction::Arith(ArithOp::Fma {
                dst: reg(RegKind::F, 0, PtxType::F32),
                a: Operand::Reg(reg(RegKind::F, 1, PtxType::F32)),
                b: Operand::Reg(reg(RegKind::F, 2, PtxType::F32)),
                c: Operand::Reg(reg(RegKind::F, 3, PtxType::F32)),
                ty: PtxType::F32,
            }));
        }
        // 1 Add (arith_other)
        kernel.push(PtxInstruction::Arith(ArithOp::Add {
            dst: reg(RegKind::R, 0, PtxType::U32),
            lhs: Operand::Reg(reg(RegKind::R, 1, PtxType::U32)),
            rhs: Operand::ImmU32(1),
            ty: PtxType::U32,
        }));
        // 1 ld.global + 1 st.global
        kernel.push(PtxInstruction::Memory(MemoryOp::LdGlobal {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StGlobal {
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        // 1 ld.shared + 1 st.shared
        kernel.push(PtxInstruction::Memory(MemoryOp::LdShared {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::R, 0, PtxType::U32),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StShared {
            addr: reg(RegKind::R, 0, PtxType::U32),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        // 1 ld.param (memory, total-only)
        kernel.push(PtxInstruction::Memory(MemoryOp::LdParam {
            dst: reg(RegKind::Rd, 0, PtxType::U64),
            param_name: "p0".to_string(),
            ty: PtxType::U64,
        }));
        // 1 bar.sync
        kernel.push(PtxInstruction::Control(ControlOp::BarSync {
            barrier_id: 0,
        }));
        // 1 branch + 1 setp
        kernel.push(PtxInstruction::Control(ControlOp::BraPred {
            pred: reg(RegKind::P, 0, PtxType::Pred),
            target: "L0".to_string(),
            negate: false,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::SetP {
            dst: reg(RegKind::P, 0, PtxType::Pred),
            cmp_op: crate::instr::control::CmpOp::Lt,
            lhs: Operand::Reg(reg(RegKind::R, 0, PtxType::U32)),
            rhs: Operand::ImmU32(10),
            ty: PtxType::U32,
        }));
        // 1 mov + 1 cvt
        kernel.push(PtxInstruction::Mov {
            dst: reg(RegKind::R, 0, PtxType::U32),
            src: Operand::ImmU32(0),
            ty: PtxType::U32,
        });
        kernel.push(PtxInstruction::Cvt {
            dst: reg(RegKind::F, 0, PtxType::F32),
            src: reg(RegKind::R, 0, PtxType::U32),
            dst_ty: PtxType::F32,
            src_ty: PtxType::U32,
        });
        // 1 ret (control, total-only)
        kernel.push(PtxInstruction::Control(ControlOp::Ret));
        // Label + Comment — should not count
        kernel.push(PtxInstruction::Label("L0".to_string()));
        kernel.push(PtxInstruction::Comment("test".to_string()));

        // 2 fma + 1 add + 1 ld.global + 1 st.global + 1 ld.shared +
        // 1 st.shared + 1 ld.param + 1 bar.sync + 1 branch + 1 setp +
        // 1 mov + 1 cvt + 1 ret = 14
        //
        // Everything not listed (mma, cp.async*, register and shared
        // memory counters) must remain zero.
        let expected = KernelStats {
            total_instructions: 14,
            fma: 2,
            arith_other: 1,
            ld_global: 1,
            st_global: 1,
            ld_shared: 1,
            st_shared: 1,
            bar_sync: 1,
            branches: 1,
            setp: 1,
            mov: 1,
            cvt: 1,
            ..KernelStats::default()
        };
        assert_eq!(kernel.stats(), expected);
    }

    #[test]
    fn stats_counts_registers_by_kind() {
        let mut kernel = PtxKernel::new("test");
        kernel.set_registers(vec![
            reg(RegKind::R, 0, PtxType::U32),
            reg(RegKind::R, 1, PtxType::S32),
            reg(RegKind::R, 2, PtxType::U32),
            reg(RegKind::Rd, 0, PtxType::U64),
            reg(RegKind::F, 0, PtxType::F32),
            reg(RegKind::F, 1, PtxType::F32),
            reg(RegKind::Fd, 0, PtxType::F64),
            reg(RegKind::P, 0, PtxType::Pred),
            reg(RegKind::P, 1, PtxType::Pred),
            reg(RegKind::H, 0, PtxType::F16),
        ]);

        // No `%hb` register is declared, so `registers_hb` stays at zero,
        // as do all instruction and shared memory counters.
        let expected = KernelStats {
            registers_r: 3,
            registers_rd: 1,
            registers_f: 2,
            registers_fd: 1,
            registers_p: 2,
            registers_h: 1,
            ..KernelStats::default()
        };
        assert_eq!(kernel.stats(), expected);
    }

    #[test]
    fn stats_counts_tensor_core_and_cp_async() {
        use crate::fragment::{alloc_a, alloc_b, alloc_c};
        use crate::instr::MmaShape;
        use crate::ir::RegisterAllocator;

        let mut alloc = RegisterAllocator::new();
        let mut kernel = PtxKernel::new("tc_stats_test");

        // 2 mma.sync
        for _ in 0..2 {
            kernel.push(PtxInstruction::TensorCore(
                crate::instr::TensorCoreOp::MmaSync {
                    d: alloc_c(&mut alloc),
                    a: alloc_a(&mut alloc),
                    b: alloc_b(&mut alloc),
                    c: alloc_c(&mut alloc),
                    shape: MmaShape::M16N8K16,
                    d_ty: PtxType::F32,
                    a_ty: PtxType::F16,
                    b_ty: PtxType::F16,
                    c_ty: PtxType::F32,
                },
            ));
        }

        // 3 cp.async loads, 1 commit, 1 wait
        let dst_shared = reg(RegKind::R, 0, PtxType::U32);
        let src_global = reg(RegKind::Rd, 0, PtxType::U64);
        for _ in 0..3 {
            kernel.push(PtxInstruction::Memory(MemoryOp::new_cp_async_ca(
                dst_shared, src_global, 16,
            )));
        }
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncCommitGroup));
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncWaitGroup { n: 0 }));

        // 2 mma + 3 cp.async + 1 commit + 1 wait = 7 total.
        // `set_registers` is never called, so the register counters stay
        // at zero even though the allocator handed out registers.
        let expected = KernelStats {
            total_instructions: 7,
            mma: 2,
            cp_async: 3,
            cp_async_commit: 1,
            cp_async_wait: 1,
            ..KernelStats::default()
        };
        assert_eq!(kernel.stats(), expected);
    }

    #[test]
    fn stats_counts_shared_bytes() {
        let mut kernel = PtxKernel::new("test");
        kernel.add_shared_decl(SharedDecl {
            name: "tile_a".to_string(),
            align: 4,
            size_bytes: 4352, // 64 * 17 * 4
        });
        kernel.add_shared_decl(SharedDecl {
            name: "tile_b".to_string(),
            align: 4,
            size_bytes: 4160, // 16 * 65 * 4
        });

        let expected = KernelStats {
            shared_bytes: 4352 + 4160,
            ..KernelStats::default()
        };
        assert_eq!(kernel.stats(), expected);
    }
}