1use super::instruction::PtxInstruction;
4use super::param::PtxParam;
5use super::register::Register;
6use crate::instr::ArithOp;
7use crate::instr::control::ControlOp;
8use crate::instr::memory::MemoryOp;
9use crate::instr::tensor_core::TensorCoreOp;
10use crate::types::RegKind;
11
/// A single shared-memory (`.shared`) declaration belonging to a kernel.
///
/// The declared sizes are summed into [`KernelStats::shared_bytes`] by
/// `PtxKernel::stats`.
#[derive(Debug, Clone)]
pub struct SharedDecl {
    /// Name of the shared buffer.
    pub name: String,
    /// Alignment in bytes.
    /// NOTE(review): presumably a power of two as PTX requires — confirm at
    /// construction sites; nothing here enforces it.
    pub align: u32,
    /// Total size of the buffer in bytes.
    pub size_bytes: u32,
}
25
/// In-memory representation of one PTX kernel: its name, formal parameters,
/// instruction body, register declarations, and shared-memory declarations.
#[derive(Debug, Clone)]
pub struct PtxKernel {
    /// Kernel entry-point name.
    pub name: String,
    /// Formal parameters, in the order they were added.
    pub params: Vec<PtxParam>,
    /// Instruction stream, in the order it was pushed.
    pub body: Vec<PtxInstruction>,
    /// Declared registers; tallied per [`RegKind`] by `stats`.
    pub registers: Vec<Register>,
    /// Shared-memory declarations; sizes are summed by `stats`.
    pub shared_decls: Vec<SharedDecl>,
}
45
46impl PtxKernel {
47 pub fn new(name: &str) -> Self {
49 Self {
50 name: name.to_string(),
51 params: Vec::new(),
52 body: Vec::new(),
53 registers: Vec::new(),
54 shared_decls: Vec::new(),
55 }
56 }
57
58 pub fn add_param(&mut self, param: PtxParam) {
60 self.params.push(param);
61 }
62
63 pub fn push(&mut self, instr: PtxInstruction) {
65 self.body.push(instr);
66 }
67
68 pub fn set_registers(&mut self, regs: Vec<Register>) {
70 self.registers = regs;
71 }
72
73 pub fn add_shared_decl(&mut self, decl: SharedDecl) {
75 self.shared_decls.push(decl);
76 }
77
78 pub fn stats(&self) -> KernelStats {
87 let mut s = KernelStats::default();
88
89 for instr in &self.body {
90 match instr {
91 PtxInstruction::Arith(op) => {
92 s.total_instructions += 1;
93 if matches!(op, ArithOp::Fma { .. }) {
94 s.fma += 1;
95 } else {
96 s.arith_other += 1;
97 }
98 }
99 PtxInstruction::Memory(op) => {
100 s.total_instructions += 1;
101 match op {
102 MemoryOp::LdGlobal { .. } => s.ld_global += 1,
103 MemoryOp::StGlobal { .. } => s.st_global += 1,
104 MemoryOp::LdShared { .. } => s.ld_shared += 1,
105 MemoryOp::StShared { .. } => s.st_shared += 1,
106 MemoryOp::CpAsyncCaSharedGlobal { .. } => s.cp_async += 1,
107 MemoryOp::CpAsyncCommitGroup => s.cp_async_commit += 1,
108 MemoryOp::CpAsyncWaitGroup { .. } => s.cp_async_wait += 1,
109 _ => {}
110 }
111 }
112 PtxInstruction::TensorCore(op) => {
113 s.total_instructions += 1;
114 match op {
115 TensorCoreOp::MmaSync { .. } => s.mma += 1,
116 }
117 }
118 PtxInstruction::Control(op) => {
119 s.total_instructions += 1;
120 match op {
121 ControlOp::BarSync { .. } => s.bar_sync += 1,
122 ControlOp::BraPred { .. } | ControlOp::Bra { .. } => s.branches += 1,
123 ControlOp::SetP { .. } => s.setp += 1,
124 _ => {}
125 }
126 }
127 PtxInstruction::Mov { .. } => {
128 s.total_instructions += 1;
129 s.mov += 1;
130 }
131 PtxInstruction::Cvt { .. } => {
132 s.total_instructions += 1;
133 s.cvt += 1;
134 }
135 PtxInstruction::Label(_) | PtxInstruction::Comment(_) => {}
136 }
137 }
138
139 for reg in &self.registers {
140 match reg.kind {
141 RegKind::R => s.registers_r += 1,
142 RegKind::Rd => s.registers_rd += 1,
143 RegKind::F => s.registers_f += 1,
144 RegKind::Fd => s.registers_fd += 1,
145 RegKind::P => s.registers_p += 1,
146 RegKind::H => s.registers_h += 1,
147 RegKind::Hb => s.registers_hb += 1,
148 }
149 }
150
151 s.shared_bytes = self.shared_decls.iter().map(|d| d.size_bytes).sum();
152
153 s
154 }
155}
156
/// Aggregate counts produced by `PtxKernel::stats`.
///
/// All counters default to zero. Since every field is a plain integer, the
/// struct derives `Clone` and `Copy`, so callers can freely pass snapshots
/// around instead of recomputing stats or moving the value.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct KernelStats {
    /// Executable instructions in the body (labels and comments excluded).
    pub total_instructions: usize,
    /// Global-memory loads.
    pub ld_global: usize,
    /// Global-memory stores.
    pub st_global: usize,
    /// Shared-memory loads.
    pub ld_shared: usize,
    /// Shared-memory stores.
    pub st_shared: usize,
    /// Barrier synchronizations (`bar.sync`).
    pub bar_sync: usize,
    /// Tensor-core MMA operations.
    pub mma: usize,
    /// Asynchronous global-to-shared copies.
    pub cp_async: usize,
    /// `cp.async` commit-group operations.
    pub cp_async_commit: usize,
    /// `cp.async` wait-group operations.
    pub cp_async_wait: usize,
    /// Fused multiply-add arithmetic ops.
    pub fma: usize,
    /// All other arithmetic ops.
    pub arith_other: usize,
    /// Register moves.
    pub mov: usize,
    /// Type conversions.
    pub cvt: usize,
    /// Branches, both conditional and unconditional.
    pub branches: usize,
    /// Predicate-setting comparisons.
    pub setp: usize,
    /// Declared 32-bit integer registers.
    pub registers_r: u32,
    /// Declared 64-bit integer registers.
    pub registers_rd: u32,
    /// Declared 32-bit float registers.
    pub registers_f: u32,
    /// Declared 64-bit float registers.
    pub registers_fd: u32,
    /// Declared predicate registers.
    pub registers_p: u32,
    /// Declared half-precision registers.
    pub registers_h: u32,
    /// Declared half-precision pair registers.
    pub registers_hb: u32,
    /// Total declared shared memory, in bytes.
    pub shared_bytes: u32,
}
215
#[cfg(test)]
mod tests {
    use super::*;
    use crate::ir::Operand;
    use crate::types::PtxType;

    /// Shorthand constructor for a `Register` literal used throughout.
    fn reg(kind: RegKind, index: u32, ptx_type: PtxType) -> Register {
        Register {
            kind,
            index,
            ptx_type,
        }
    }

    /// A freshly created kernel must report all-zero stats.
    #[test]
    fn stats_empty_kernel() {
        let kernel = PtxKernel::new("empty");
        let s = kernel.stats();
        assert_eq!(s, KernelStats::default());
    }

    /// Pushes one instruction of each tracked category and checks every
    /// per-category counter plus the overall total. Labels and comments are
    /// pushed too and must be excluded from `total_instructions`.
    #[test]
    fn stats_counts_instruction_types() {
        let mut kernel = PtxKernel::new("test");

        // Two FMAs -> fma == 2.
        for _ in 0..2 {
            kernel.push(PtxInstruction::Arith(ArithOp::Fma {
                dst: reg(RegKind::F, 0, PtxType::F32),
                a: Operand::Reg(reg(RegKind::F, 1, PtxType::F32)),
                b: Operand::Reg(reg(RegKind::F, 2, PtxType::F32)),
                c: Operand::Reg(reg(RegKind::F, 3, PtxType::F32)),
                ty: PtxType::F32,
            }));
        }
        // Non-FMA arithmetic lands in `arith_other`.
        kernel.push(PtxInstruction::Arith(ArithOp::Add {
            dst: reg(RegKind::R, 0, PtxType::U32),
            lhs: Operand::Reg(reg(RegKind::R, 1, PtxType::U32)),
            rhs: Operand::ImmU32(1),
            ty: PtxType::U32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::LdGlobal {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StGlobal {
            addr: reg(RegKind::Rd, 0, PtxType::U64),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::LdShared {
            dst: reg(RegKind::F, 0, PtxType::F32),
            addr: reg(RegKind::R, 0, PtxType::U32),
            ty: PtxType::F32,
        }));
        kernel.push(PtxInstruction::Memory(MemoryOp::StShared {
            addr: reg(RegKind::R, 0, PtxType::U32),
            src: reg(RegKind::F, 0, PtxType::F32),
            ty: PtxType::F32,
        }));
        // LdParam has no dedicated counter: it contributes to the total only.
        kernel.push(PtxInstruction::Memory(MemoryOp::LdParam {
            dst: reg(RegKind::Rd, 0, PtxType::U64),
            param_name: "p0".to_string(),
            ty: PtxType::U64,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::BarSync {
            barrier_id: 0,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::BraPred {
            pred: reg(RegKind::P, 0, PtxType::Pred),
            target: "L0".to_string(),
            negate: false,
        }));
        kernel.push(PtxInstruction::Control(ControlOp::SetP {
            dst: reg(RegKind::P, 0, PtxType::Pred),
            cmp_op: crate::instr::control::CmpOp::Lt,
            lhs: Operand::Reg(reg(RegKind::R, 0, PtxType::U32)),
            rhs: Operand::ImmU32(10),
            ty: PtxType::U32,
        }));
        kernel.push(PtxInstruction::Mov {
            dst: reg(RegKind::R, 0, PtxType::U32),
            src: Operand::ImmU32(0),
            ty: PtxType::U32,
        });
        kernel.push(PtxInstruction::Cvt {
            dst: reg(RegKind::F, 0, PtxType::F32),
            src: reg(RegKind::R, 0, PtxType::U32),
            dst_ty: PtxType::F32,
            src_ty: PtxType::U32,
        });
        // Ret has no dedicated counter: total only.
        kernel.push(PtxInstruction::Control(ControlOp::Ret));
        // Label and Comment must not be counted at all.
        kernel.push(PtxInstruction::Label("L0".to_string()));
        kernel.push(PtxInstruction::Comment("test".to_string()));

        let s = kernel.stats();
        // 2 fma + 1 add + 5 memory + 3 control + mov + cvt + ret = 14;
        // the label and comment are excluded.
        assert_eq!(s.total_instructions, 14);
        assert_eq!(s.fma, 2);
        assert_eq!(s.arith_other, 1);
        assert_eq!(s.ld_global, 1);
        assert_eq!(s.st_global, 1);
        assert_eq!(s.ld_shared, 1);
        assert_eq!(s.st_shared, 1);
        assert_eq!(s.bar_sync, 1);
        assert_eq!(s.branches, 1);
        assert_eq!(s.setp, 1);
        assert_eq!(s.mov, 1);
        assert_eq!(s.cvt, 1);
    }

    /// Registers must be tallied per kind, independent of their PTX type.
    #[test]
    fn stats_counts_registers_by_kind() {
        let mut kernel = PtxKernel::new("test");
        kernel.set_registers(vec![
            reg(RegKind::R, 0, PtxType::U32),
            reg(RegKind::R, 1, PtxType::S32),
            reg(RegKind::R, 2, PtxType::U32),
            reg(RegKind::Rd, 0, PtxType::U64),
            reg(RegKind::F, 0, PtxType::F32),
            reg(RegKind::F, 1, PtxType::F32),
            reg(RegKind::Fd, 0, PtxType::F64),
            reg(RegKind::P, 0, PtxType::Pred),
            reg(RegKind::P, 1, PtxType::Pred),
        ]);

        let s = kernel.stats();
        assert_eq!(s.registers_r, 3);
        assert_eq!(s.registers_rd, 1);
        assert_eq!(s.registers_f, 2);
        assert_eq!(s.registers_fd, 1);
        assert_eq!(s.registers_p, 2);
    }

    /// Tensor-core and cp.async instructions get their own counters and
    /// still contribute to the overall total.
    #[test]
    fn stats_counts_tensor_core_and_cp_async() {
        use crate::fragment::{alloc_a, alloc_b, alloc_c};
        use crate::instr::MmaShape;
        use crate::ir::RegisterAllocator;

        let mut alloc = RegisterAllocator::new();
        let mut kernel = PtxKernel::new("tc_stats_test");

        // Two MMA ops built via the fragment allocators.
        for _ in 0..2 {
            kernel.push(PtxInstruction::TensorCore(
                crate::instr::TensorCoreOp::MmaSync {
                    d: alloc_c(&mut alloc),
                    a: alloc_a(&mut alloc),
                    b: alloc_b(&mut alloc),
                    c: alloc_c(&mut alloc),
                    shape: MmaShape::M16N8K16,
                    d_ty: PtxType::F32,
                    a_ty: PtxType::F16,
                    b_ty: PtxType::F16,
                    c_ty: PtxType::F32,
                },
            ));
        }

        // Three async copies plus one commit and one wait.
        let dst_shared = reg(RegKind::R, 0, PtxType::U32);
        let src_global = reg(RegKind::Rd, 0, PtxType::U64);
        for _ in 0..3 {
            kernel.push(PtxInstruction::Memory(MemoryOp::new_cp_async_ca(
                dst_shared, src_global, 16,
            )));
        }
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncCommitGroup));
        kernel.push(PtxInstruction::Memory(MemoryOp::CpAsyncWaitGroup { n: 0 }));

        let s = kernel.stats();
        assert_eq!(s.mma, 2);
        assert_eq!(s.cp_async, 3);
        assert_eq!(s.cp_async_commit, 1);
        assert_eq!(s.cp_async_wait, 1);
        // 2 mma + 3 cp.async + commit + wait = 7.
        assert_eq!(s.total_instructions, 7);
    }

    /// `shared_bytes` is the sum of all shared-declaration sizes.
    #[test]
    fn stats_counts_shared_bytes() {
        let mut kernel = PtxKernel::new("test");
        kernel.add_shared_decl(SharedDecl {
            name: "tile_a".to_string(),
            align: 4,
            size_bytes: 4352,
        });
        kernel.add_shared_decl(SharedDecl {
            name: "tile_b".to_string(),
            align: 4,
            size_bytes: 4160,
        });

        let s = kernel.stats();
        assert_eq!(s.shared_bytes, 4352 + 4160);
    }
}