use std::fmt;

/// Hardware limits of one GPU architecture, as consumed by the occupancy
/// calculations.
///
/// The `*_per_sm` fields describe a single SM. The `cdna3` preset fills the
/// same fields for an AMD target (NOTE(review): presumably per compute unit —
/// confirm against the vendor documentation).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct GpuArchSpec {
    /// Architecture identifier; the presets use target names such as
    /// `"sm_90"` or `"gfx942"`.
    pub name: String,
    /// Maximum number of threads resident on one SM.
    pub max_threads_per_sm: u32,
    /// Maximum number of blocks resident on one SM.
    pub max_blocks_per_sm: u32,
    /// Maximum number of warps resident on one SM.
    pub max_warps_per_sm: u32,
    /// Threads per warp (32 in the NVIDIA presets, 64 in the `cdna3` preset).
    pub warp_size: u32,
    /// Number of registers available on one SM.
    pub registers_per_sm: u32,
    /// Register allocation unit: register demand is rounded up to a multiple
    /// of this value.
    pub register_alloc_granularity: u32,
    /// Shared memory available on one SM. The presets write it as
    /// `KiB * 1024`, i.e. presumably bytes.
    pub shared_memory_per_sm: u32,
    /// Shared-memory allocation unit: a block's demand is rounded up to a
    /// multiple of this value.
    pub shared_memory_alloc_granularity: u32,
    /// Number of SMs on the device.
    pub sm_count: u32,
}

impl GpuArchSpec {
    /// Preset named `"sm_90"` (Hopper).
    pub fn hopper() -> Self {
        Self {
            name: "sm_90".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 64,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 228 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 132,
        }
    }

    /// Preset named `"sm_89"` (Ada Lovelace).
    pub fn ada_lovelace() -> Self {
        Self {
            name: "sm_89".into(),
            max_threads_per_sm: 1536,
            max_blocks_per_sm: 24,
            max_warps_per_sm: 48,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 100 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 128,
        }
    }

    /// Preset named `"sm_80"` (Ampere).
    pub fn ampere() -> Self {
        Self {
            name: "sm_80".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 64,
            warp_size: 32,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 164 * 1024,
            shared_memory_alloc_granularity: 128,
            sm_count: 108,
        }
    }

    /// Preset named `"gfx942"` (CDNA3); the only preset with 64-wide warps.
    pub fn cdna3() -> Self {
        Self {
            name: "gfx942".into(),
            max_threads_per_sm: 2048,
            max_blocks_per_sm: 32,
            max_warps_per_sm: 32,
            warp_size: 64,
            registers_per_sm: 65536,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 64 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 304,
        }
    }

    /// Preset named `"generic"`: smaller limits than the vendor presets and an
    /// `sm_count` of 1.
    pub fn generic() -> Self {
        Self {
            name: "generic".into(),
            max_threads_per_sm: 1024,
            max_blocks_per_sm: 16,
            max_warps_per_sm: 32,
            warp_size: 32,
            registers_per_sm: 32768,
            register_alloc_granularity: 256,
            shared_memory_per_sm: 48 * 1024,
            shared_memory_alloc_granularity: 256,
            sm_count: 1,
        }
    }
}
119
/// Per-block resource demands of a kernel.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct KernelResources {
    /// Number of threads in one block.
    pub threads_per_block: u32,
    /// Number of registers used by each thread.
    pub registers_per_thread: u32,
    /// Statically allocated shared memory per block.
    pub shared_memory_static: u32,
    /// Dynamically allocated shared memory per block; `new` leaves it at 0.
    pub shared_memory_dynamic: u32,
}

impl KernelResources {
    /// Creates a description whose whole `shared_memory` demand is counted as
    /// static; set `shared_memory_dynamic` afterwards to add a dynamic part.
    pub fn new(threads_per_block: u32, registers_per_thread: u32, shared_memory: u32) -> Self {
        Self {
            threads_per_block,
            registers_per_thread,
            shared_memory_static: shared_memory,
            shared_memory_dynamic: 0,
        }
    }

    /// Returns the static plus dynamic shared-memory demand of one block.
    ///
    /// The sum saturates at `u32::MAX`: a plain `+` would panic in debug
    /// builds and wrap around in release builds, and a wrapped (small) value
    /// would make an impossible kernel look cheap.
    pub fn total_shared_memory(&self) -> u32 {
        self.shared_memory_static
            .saturating_add(self.shared_memory_dynamic)
    }
}
149
150#[derive(Debug, Clone)]
152pub struct OccupancyResult {
153 pub active_blocks_per_sm: u32,
155 pub active_warps_per_sm: u32,
157 pub max_warps_per_sm: u32,
159 pub occupancy: f64,
161 pub limiting_factor: LimitingFactor,
163 pub blocks_limited_by_threads: u32,
165 pub blocks_limited_by_registers: u32,
167 pub blocks_limited_by_smem: u32,
169 pub blocks_limited_by_max_blocks: u32,
171}
172
/// The resource that caps the number of resident blocks per SM.
///
/// The `Debug` names of the variants are user-visible: the `Display`
/// implementation of `OccupancyResult` prints them verbatim.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum LimitingFactor {
    /// The SM's warp (thread) capacity.
    Threads,
    /// The SM's register file.
    Registers,
    /// The SM's shared memory.
    SharedMemory,
    /// The architecture's hard cap on resident blocks per SM.
    MaxBlocksPerSm,
}
181
182pub fn calculate_occupancy(arch: &GpuArchSpec, kernel: &KernelResources) -> OccupancyResult {
184 let warp_size = arch.warp_size;
185
186 let warps_per_block = (kernel.threads_per_block + warp_size - 1) / warp_size;
188
189 let blocks_by_threads = if warps_per_block > 0 {
191 arch.max_warps_per_sm / warps_per_block
192 } else {
193 0
194 };
195
196 let regs_per_warp = kernel.registers_per_thread * warp_size;
198 let regs_per_warp_aligned = round_up(regs_per_warp, arch.register_alloc_granularity);
199 let regs_per_block = regs_per_warp_aligned * warps_per_block;
200 let blocks_by_registers = if regs_per_block > 0 {
201 arch.registers_per_sm / regs_per_block
202 } else {
203 arch.max_blocks_per_sm
204 };
205
206 let smem_per_block = kernel.total_shared_memory();
208 let smem_aligned = round_up(smem_per_block, arch.shared_memory_alloc_granularity);
209 let blocks_by_smem = if smem_aligned > 0 {
210 arch.shared_memory_per_sm / smem_aligned
211 } else {
212 arch.max_blocks_per_sm
213 };
214
215 let blocks_by_max = arch.max_blocks_per_sm;
217
218 let active_blocks = blocks_by_threads
220 .min(blocks_by_registers)
221 .min(blocks_by_smem)
222 .min(blocks_by_max);
223
224 let active_warps = active_blocks * warps_per_block;
225 let occupancy = active_warps as f64 / arch.max_warps_per_sm as f64;
226
227 let limiting_factor = if active_blocks == blocks_by_threads {
228 LimitingFactor::Threads
229 } else if active_blocks == blocks_by_registers {
230 LimitingFactor::Registers
231 } else if active_blocks == blocks_by_smem {
232 LimitingFactor::SharedMemory
233 } else {
234 LimitingFactor::MaxBlocksPerSm
235 };
236
237 OccupancyResult {
238 active_blocks_per_sm: active_blocks,
239 active_warps_per_sm: active_warps,
240 max_warps_per_sm: arch.max_warps_per_sm,
241 occupancy,
242 limiting_factor,
243 blocks_limited_by_threads: blocks_by_threads,
244 blocks_limited_by_registers: blocks_by_registers,
245 blocks_limited_by_smem: blocks_by_smem,
246 blocks_limited_by_max_blocks: blocks_by_max,
247 }
248}
249
250pub fn suggest_block_size(arch: &GpuArchSpec, registers_per_thread: u32, shared_memory: u32) -> BlockSizeSuggestion {
252 let mut best_occupancy = 0.0;
253 let mut best_block_size = arch.warp_size;
254 let mut results = Vec::new();
255
256 let max_block = arch.max_threads_per_sm.min(1024);
258 let mut block_size = arch.warp_size;
259
260 while block_size <= max_block {
261 let kernel = KernelResources::new(block_size, registers_per_thread, shared_memory);
262 let result = calculate_occupancy(arch, &kernel);
263
264 results.push((block_size, result.occupancy));
265
266 if result.occupancy > best_occupancy {
267 best_occupancy = result.occupancy;
268 best_block_size = block_size;
269 }
270 block_size += arch.warp_size;
271 }
272
273 BlockSizeSuggestion {
274 optimal_block_size: best_block_size,
275 max_occupancy: best_occupancy,
276 all_results: results,
277 }
278}
279
/// Result of a block-size sweep performed by `suggest_block_size`.
#[derive(Debug, Clone, PartialEq)]
pub struct BlockSizeSuggestion {
    /// Block size (in threads) selected by the sweep.
    pub optimal_block_size: u32,
    /// Occupancy achieved by `optimal_block_size`, as a fraction.
    pub max_occupancy: f64,
    /// Every evaluated candidate as `(block_size, occupancy)`, in the order
    /// the sweep visited them.
    pub all_results: Vec<(u32, f64)>,
}
287
/// Rounds `value` up to the next multiple of `granularity`.
///
/// A `granularity` of 0 means "no alignment" and returns `value` unchanged.
/// If the next multiple does not fit in a `u32`, the result saturates at
/// `u32::MAX`; the previous `value + granularity - 1` form overflowed for
/// such inputs (panic in debug builds, silent wrap-around in release builds).
fn round_up(value: u32, granularity: u32) -> u32 {
    if granularity == 0 {
        return value;
    }
    match value % granularity {
        0 => value,
        remainder => value.saturating_add(granularity - remainder),
    }
}
292
293impl fmt::Display for OccupancyResult {
294 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
295 write!(f, "Occupancy: {:.1}% ({}/{} warps, {} blocks/SM, limited by {:?})",
296 self.occupancy * 100.0,
297 self.active_warps_per_sm,
298 self.max_warps_per_sm,
299 self.active_blocks_per_sm,
300 self.limiting_factor)
301 }
302}
303
#[cfg(test)]
mod tests {
    use super::*;

    /// Compares two occupancy fractions without relying on exact float
    /// equality.
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < 1e-12,
            "expected {expected}, got {actual}"
        );
    }

    #[test]
    fn test_occupancy_basic() {
        // 256 threads = 8 warps per block; 64 warps per SM allow 8 blocks.
        let arch = GpuArchSpec::ampere();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 8);
        assert_eq!(result.active_warps_per_sm, 64);
        assert_close(result.occupancy, 1.0);
        assert_eq!(result.limiting_factor, LimitingFactor::Threads);
    }

    #[test]
    fn test_occupancy_register_limited() {
        // 128 registers * 32 lanes * 8 warps = 32768 registers per block,
        // so only two blocks fit into the 65536-register file.
        let arch = GpuArchSpec::ampere();
        let kernel = KernelResources::new(256, 128, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 2);
        assert_eq!(result.blocks_limited_by_registers, 2);
        assert_close(result.occupancy, 0.25);
        assert_eq!(result.limiting_factor, LimitingFactor::Registers);
    }

    #[test]
    fn test_occupancy_smem_limited() {
        // 164 KiB per SM / 48 KiB per block leaves room for three blocks.
        let arch = GpuArchSpec::ampere();
        let kernel = KernelResources::new(256, 32, 48 * 1024);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 3);
        assert_eq!(result.blocks_limited_by_smem, 3);
        assert_close(result.occupancy, 0.375);
        assert_eq!(result.limiting_factor, LimitingFactor::SharedMemory);
    }

    #[test]
    fn test_occupancy_full() {
        // 32 blocks of 2 warps each fill all 64 warp slots.
        let arch = GpuArchSpec::ampere();
        let kernel = KernelResources::new(64, 16, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 32);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_suggest_block_size() {
        let arch = GpuArchSpec::ampere();
        let suggestion = suggest_block_size(&arch, 32, 0);

        // 32 threads only reach 50% (block cap); 64 is the first size at 100%.
        assert_eq!(suggestion.optimal_block_size, 64);
        assert_close(suggestion.max_occupancy, 1.0);
        // One candidate per warp multiple from 32 to 1024.
        assert_eq!(suggestion.all_results.len(), 32);
        assert_eq!(suggestion.all_results[0].0, 32);
        assert_close(suggestion.all_results[0].1, 0.5);
    }

    #[test]
    fn test_hopper_arch() {
        let arch = GpuArchSpec::hopper();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 8);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_amd_cdna3() {
        // 64-wide warps: 256 threads = 4 warps, 32 warp slots allow 8 blocks.
        let arch = GpuArchSpec::cdna3();
        let kernel = KernelResources::new(256, 32, 0);
        let result = calculate_occupancy(&arch, &kernel);

        assert_eq!(result.active_blocks_per_sm, 8);
        assert_eq!(result.active_warps_per_sm, 32);
        assert_close(result.occupancy, 1.0);
    }

    #[test]
    fn test_occupancy_display() {
        let result = OccupancyResult {
            active_blocks_per_sm: 8,
            active_warps_per_sm: 64,
            max_warps_per_sm: 64,
            occupancy: 1.0,
            limiting_factor: LimitingFactor::Threads,
            blocks_limited_by_threads: 8,
            blocks_limited_by_registers: 16,
            blocks_limited_by_smem: 32,
            blocks_limited_by_max_blocks: 32,
        };
        assert_eq!(
            result.to_string(),
            "Occupancy: 100.0% (64/64 warps, 8 blocks/SM, limited by Threads)"
        );
    }

    #[test]
    fn test_dynamic_shared_memory() {
        let mut kernel = KernelResources::new(256, 32, 1024);
        kernel.shared_memory_dynamic = 2048;
        assert_eq!(kernel.total_shared_memory(), 3072);

        // 3 KiB per block allows 54 blocks, so the warp limit (8) still wins.
        let arch = GpuArchSpec::ampere();
        let result = calculate_occupancy(&arch, &kernel);
        assert_eq!(result.blocks_limited_by_smem, 54);
        assert_eq!(result.active_blocks_per_sm, 8);
        assert_close(result.occupancy, 1.0);
    }
}