1use super::instruction::PtxInstruction;
4use super::param::PtxParam;
5use super::register::Register;
6use crate::instr::ArithOp;
7use crate::instr::control::ControlOp;
8use crate::instr::memory::MemoryOp;
9use crate::instr::tensor_core::TensorCoreOp;
10use crate::types::RegKind;
11
/// One shared-memory buffer declared by a kernel (presumably emitted as a
/// PTX `.shared` directive — confirm against the emitter).
#[derive(Debug, Clone)]
pub struct SharedDecl {
    /// Symbol name of the shared-memory buffer.
    pub name: String,
    /// Alignment in bytes.
    pub align: u32,
    /// Total buffer size in bytes; summed into [`KernelStats::shared_bytes`].
    pub size_bytes: u32,
}
25
/// A single PTX kernel: its name, formal parameters, instruction body,
/// declared registers, and shared-memory declarations.
#[derive(Debug, Clone)]
pub struct PtxKernel {
    /// Kernel entry-point name.
    pub name: String,
    /// Formal kernel parameters, in declaration order.
    pub params: Vec<PtxParam>,
    /// Instruction stream, including labels and comments.
    pub body: Vec<PtxInstruction>,
    /// Registers declared for this kernel.
    pub registers: Vec<Register>,
    /// Shared-memory buffers declared for this kernel.
    pub shared_decls: Vec<SharedDecl>,
}
45
46impl PtxKernel {
47 pub fn new(name: &str) -> Self {
49 Self {
50 name: name.to_string(),
51 params: Vec::new(),
52 body: Vec::new(),
53 registers: Vec::new(),
54 shared_decls: Vec::new(),
55 }
56 }
57
58 pub fn add_param(&mut self, param: PtxParam) {
60 self.params.push(param);
61 }
62
63 pub fn push(&mut self, instr: PtxInstruction) {
65 self.body.push(instr);
66 }
67
68 pub fn set_registers(&mut self, regs: Vec<Register>) {
70 self.registers = regs;
71 }
72
73 pub fn add_shared_decl(&mut self, decl: SharedDecl) {
75 self.shared_decls.push(decl);
76 }
77
78 pub fn stats(&self) -> KernelStats {
87 let mut s = KernelStats::default();
88
89 for instr in &self.body {
90 match instr {
91 PtxInstruction::Arith(op) => {
92 s.total_instructions += 1;
93 if matches!(op, ArithOp::Fma { .. }) {
94 s.fma += 1;
95 } else {
96 s.arith_other += 1;
97 }
98 }
99 PtxInstruction::Memory(op) => {
100 s.total_instructions += 1;
101 match op {
102 MemoryOp::LdGlobal { .. } => s.ld_global += 1,
103 MemoryOp::StGlobal { .. } => s.st_global += 1,
104 MemoryOp::LdShared { .. } => s.ld_shared += 1,
105 MemoryOp::StShared { .. } => s.st_shared += 1,
106 MemoryOp::CpAsyncCaSharedGlobal { .. } => s.cp_async += 1,
107 MemoryOp::CpAsyncCommitGroup => s.cp_async_commit += 1,
108 MemoryOp::CpAsyncWaitGroup { .. } => s.cp_async_wait += 1,
109 _ => {}
110 }
111 }
112 PtxInstruction::TensorCore(op) => {
113 s.total_instructions += 1;
114 match op {
115 TensorCoreOp::MmaSync { .. } | TensorCoreOp::MmaSyncInt8 { .. } => {
116 s.mma += 1
117 }
118 }
119 }
120 PtxInstruction::Control(op) => {
121 s.total_instructions += 1;
122 match op {
123 ControlOp::BarSync { .. } => s.bar_sync += 1,
124 ControlOp::BraPred { .. } | ControlOp::Bra { .. } => s.branches += 1,
125 ControlOp::SetP { .. } => s.setp += 1,
126 _ => {}
127 }
128 }
129 PtxInstruction::Mov { .. } => {
130 s.total_instructions += 1;
131 s.mov += 1;
132 }
133 PtxInstruction::Cvt { .. } => {
134 s.total_instructions += 1;
135 s.cvt += 1;
136 }
137 PtxInstruction::Label(_) | PtxInstruction::Comment(_) => {}
138 }
139 }
140
141 for reg in &self.registers {
142 match reg.kind {
143 RegKind::R => s.registers_r += 1,
144 RegKind::Rd => s.registers_rd += 1,
145 RegKind::F => s.registers_f += 1,
146 RegKind::Fd => s.registers_fd += 1,
147 RegKind::P => s.registers_p += 1,
148 RegKind::H => s.registers_h += 1,
149 RegKind::Hb => s.registers_hb += 1,
150 }
151 }
152
153 s.shared_bytes = self.shared_decls.iter().map(|d| d.size_bytes).sum();
154
155 s
156 }
157}
158
/// Aggregate statistics for one [`PtxKernel`], produced by
/// [`PtxKernel::stats`]. All counters default to zero.
#[derive(Debug, Default, PartialEq, Eq)]
pub struct KernelStats {
    /// Every body entry except labels and comments.
    pub total_instructions: usize,
    /// Global-memory loads.
    pub ld_global: usize,
    /// Global-memory stores.
    pub st_global: usize,
    /// Shared-memory loads.
    pub ld_shared: usize,
    /// Shared-memory stores.
    pub st_shared: usize,
    /// Barrier synchronizations.
    pub bar_sync: usize,
    /// Tensor-core MMA instructions (both float and int8 variants).
    pub mma: usize,
    /// Async global-to-shared copies.
    pub cp_async: usize,
    /// Async-copy group commits.
    pub cp_async_commit: usize,
    /// Async-copy group waits.
    pub cp_async_wait: usize,
    /// Fused multiply-add arithmetic instructions.
    pub fma: usize,
    /// All arithmetic instructions other than FMA.
    pub arith_other: usize,
    /// Register moves.
    pub mov: usize,
    /// Type conversions.
    pub cvt: usize,
    /// Branches, both predicated and unconditional.
    pub branches: usize,
    /// Predicate-setting comparisons.
    pub setp: usize,
    /// Register counts, bucketed by [`RegKind`].
    pub registers_r: u32,
    pub registers_rd: u32,
    pub registers_f: u32,
    pub registers_fd: u32,
    pub registers_p: u32,
    pub registers_h: u32,
    pub registers_hb: u32,
    /// Total declared shared memory, summed over all [`SharedDecl`]s.
    pub shared_bytes: u32,
}
217
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::Operand;
    use crate::types::PtxType;

    // Shorthand constructor for a Register literal.
    fn reg(kind: RegKind, index: u32, ptx_type: PtxType) -> Register {
        Register {
            kind,
            index,
            ptx_type,
        }
    }

    // A kernel with no body/registers/shared decls must yield all-zero stats.
    #[test]
    fn stats_empty_kernel() {
        let kernel = PtxKernel::new("empty");
        let s = kernel.stats();
        assert_eq!(s, KernelStats::default());
    }

    // One instruction of each counted category; verifies per-category
    // counters and that labels/comments are excluded from the total.
    #[test]
    fn stats_counts_instruction_types() {
        let mut kernel = PtxKernel::new("test");

        for _ in 0..2 {
            kernel.push(PtxInstruction::Arith(ArithOp::Fma {
                dst: reg(RegKind::F, 0, PtxType::F32),
                a: Operand::Reg(reg(RegKind::F, 1, PtxType::F32)),
                b: Operand::Reg(reg(RegKind::F, 2, PtxType::F32)),
                c: Operand::Reg(reg(RegKind::F, 3, PtxType::F32)),
                ty: PtxType::F32,
            }));
        }
        // Non-FMA arithmetic lands in arith_other.
        kernel.push(PtxInstruction::Arith(ArithOp::Add {
            dst: reg(RegKind::R, 0, PtxType::U32),
            lhs: Operand::Reg(reg(RegKind::R, 1, PtxType::U32)),
            rhs: Operand::ImmU32(1),
            ty: PtxType::U32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::LdGlobal {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StGlobal {
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::LdShared {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::R, 0, PtxType::U32),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StShared {
            addr: reg(RegKind::R, 0, PtxType::U32),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        // LdParam has no dedicated counter; it only bumps the total.
        kernel.push(PtxInstruction::Memory(MemoryOp::LdParam {
            dst: reg(RegKind::Rd, 0, PtxType::U64),
            param_name: "p0".to_string(),
            ty: PtxType::U64,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::BarSync {
            barrier_id: 0,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::BraPred {
            pred: reg(RegKind::P, 0, PtxType::Pred),
            target: "L0".to_string(),
            negate: false,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::SetP {
            dst: reg(RegKind::P, 0, PtxType::Pred),
            cmp_op: crate::instr::control::CmpOp::Lt,
            lhs: Operand::Reg(reg(RegKind::R, 0, PtxType::U32)),
            rhs: Operand::ImmU32(10),
            ty: PtxType::U32,
        }));
        kernel.push(PtxInstruction::Mov {
            dst: reg(RegKind::R, 0, PtxType::U32),
            src: Operand::ImmU32(0),
            ty: PtxType::U32,
        });
        kernel.push(PtxInstruction::Cvt {
            dst: reg(RegKind::F, 0, PtxType::F32),
            src: reg(RegKind::R, 0, PtxType::U32),
            dst_ty: PtxType::F32,
            src_ty: PtxType::U32,
        });
        // Ret has no dedicated counter; it only bumps the total.
        kernel.push(PtxInstruction::Control(ControlOp::Ret));
        // Label and comment must NOT be counted at all.
        kernel.push(PtxInstruction::Label("L0".to_string()));
        kernel.push(PtxInstruction::Comment("test".to_string()));

        let s = kernel.stats();
        // 2 fma + 1 add + 5 memory + 3 control + mov + cvt + ret = 14;
        // label and comment excluded.
        assert_eq!(s.total_instructions, 14);
        assert_eq!(s.fma, 2);
        assert_eq!(s.arith_other, 1);
        assert_eq!(s.ld_global, 1);
        assert_eq!(s.st_global, 1);
        assert_eq!(s.ld_shared, 1);
        assert_eq!(s.st_shared, 1);
        assert_eq!(s.bar_sync, 1);
        assert_eq!(s.branches, 1);
        assert_eq!(s.setp, 1);
        assert_eq!(s.mov, 1);
        assert_eq!(s.cvt, 1);
    }

    // Register tallies are bucketed by RegKind, independent of PtxType.
    #[test]
    fn stats_counts_registers_by_kind() {
        let mut kernel = PtxKernel::new("test");
        kernel.set_registers(vec![
            reg(RegKind::R, 0, PtxType::U32),
            reg(RegKind::R, 1, PtxType::S32),
            reg(RegKind::R, 2, PtxType::U32),
            reg(RegKind::Rd, 0, PtxType::U64),
            reg(RegKind::F, 0, PtxType::F32),
            reg(RegKind::F, 1, PtxType::F32),
            reg(RegKind::Fd, 0, PtxType::F64),
            reg(RegKind::P, 0, PtxType::Pred),
            reg(RegKind::P, 1, PtxType::Pred),
        ]);

        let s = kernel.stats();
        assert_eq!(s.registers_r, 3);
        assert_eq!(s.registers_rd, 1);
        assert_eq!(s.registers_f, 2);
        assert_eq!(s.registers_fd, 1);
        assert_eq!(s.registers_p, 2);
    }

    // Tensor-core MMA and the three cp.async-style ops each have their own
    // counter and also bump the total.
    #[test]
    fn stats_counts_tensor_core_and_cp_async() {
        use crate::fragment::{alloc_a, alloc_b, alloc_c};
        use crate::instr::MmaShape;
        use crate::ir::RegisterAllocator;

        let mut alloc = RegisterAllocator::new();
        let mut kernel = PtxKernel::new("tc_stats_test");

        for _ in 0..2 {
            kernel.push(PtxInstruction::TensorCore(
                crate::instr::TensorCoreOp::MmaSync {
                    d: alloc_c(&mut alloc),
                    a: alloc_a(&mut alloc),
                    b: alloc_b(&mut alloc),
                    c: alloc_c(&mut alloc),
                    shape: MmaShape::M16N8K16,
                    d_ty: PtxType::F32,
                    a_ty: PtxType::F16,
                    b_ty: PtxType::F16,
                    c_ty: PtxType::F32,
                },
            ));
        }

        let dst_shared = reg(RegKind::R, 0, PtxType::U32);
        let src_global = reg(RegKind::Rd, 0, PtxType::U64);
        for _ in 0..3 {
            kernel.push(PtxInstruction::Memory(MemoryOp::new_cp_async_ca(
                dst_shared, src_global, 16,
            )));
        }
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncCommitGroup));
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncWaitGroup { n: 0 }));

        let s = kernel.stats();
        assert_eq!(s.mma, 2);
        assert_eq!(s.cp_async, 3);
        assert_eq!(s.cp_async_commit, 1);
        assert_eq!(s.cp_async_wait, 1);
        // 2 mma + 3 cp.async + commit + wait = 7.
        assert_eq!(s.total_instructions, 7);
    }

    // shared_bytes is the plain sum of all declared buffer sizes.
    #[test]
    fn stats_counts_shared_bytes() {
        let mut kernel = PtxKernel::new("test");
        kernel.add_shared_decl(SharedDecl {
            name: "tile_a".to_string(),
            align: 4,
            size_bytes: 4352,
        });
        kernel.add_shared_decl(SharedDecl {
            name: "tile_b".to_string(),
            align: 4,
            size_bytes: 4160,
        });

        let s = kernel.stats();
        assert_eq!(s.shared_bytes, 4352 + 4160);
    }
}