1use std::collections::HashMap;
27use std::fmt;
28use std::time::Instant;
29
/// GPU streaming-multiprocessor (SM) targets whose per-SM resource limits are known here.
///
/// The limits returned by the inherent methods are the inputs of
/// [`estimate_occupancy`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SmVersion {
    /// The `sm_75` target.
    Sm75,
    /// The `sm_80` target.
    Sm80,
    /// The `sm_86` target.
    Sm86,
    /// The `sm_89` target.
    Sm89,
    /// The `sm_90` target.
    Sm90,
    /// The `sm_100` target.
    Sm100,
    /// The `sm_120` target.
    Sm120,
}

impl SmVersion {
    /// Maximum number of warps that can be resident on one SM.
    #[must_use]
    pub const fn max_warps_per_sm(self) -> u32 {
        match self {
            Self::Sm75 => 32,
            // The CUDA compute-capability tables list 1536 resident threads
            // (48 warps) per SM for 8.6, 8.9 and 12.0.
            Self::Sm86 | Self::Sm89 | Self::Sm120 => 48,
            Self::Sm80 | Self::Sm90 | Self::Sm100 => 64,
        }
    }

    /// Maximum number of thread blocks that can be resident on one SM.
    #[must_use]
    pub const fn max_blocks_per_sm(self) -> u32 {
        match self {
            Self::Sm75 | Self::Sm86 => 16,
            Self::Sm89 | Self::Sm120 => 24,
            Self::Sm80 | Self::Sm90 | Self::Sm100 => 32,
        }
    }

    /// Number of 32-bit registers available on one SM.
    #[must_use]
    pub const fn registers_per_sm(self) -> u32 {
        65536
    }

    /// Maximum number of registers a single thread may use.
    #[must_use]
    pub const fn max_registers_per_thread(self) -> u32 {
        255
    }

    /// Shared-memory budget, in bytes, used as the per-SM limit by the
    /// occupancy estimate.
    #[must_use]
    pub const fn max_shared_mem_per_sm(self) -> u32 {
        match self {
            Self::Sm75 => 65_536,
            // NOTE(review): 160 KiB does not match the 164 KB per-SM / 163 KB
            // per-block figures in the CUDA tables; left unchanged, confirm.
            Self::Sm80 => 163_840,
            // 8.6, 8.9 and 12.0 share the same ~100 KB class of shared memory.
            Self::Sm86 | Self::Sm89 | Self::Sm120 => 101_376,
            Self::Sm90 | Self::Sm100 => 232_448,
        }
    }

    /// Number of threads in a warp.
    #[must_use]
    pub const fn warp_size(self) -> u32 {
        32
    }

    /// Granularity to which a thread's register count is rounded up when
    /// registers are allocated.
    #[must_use]
    pub const fn register_alloc_granularity(self) -> u32 {
        8
    }

    /// Granularity, in bytes, to which a block's shared-memory request is
    /// rounded up.
    #[must_use]
    pub const fn shared_mem_alloc_granularity(self) -> u32 {
        match self {
            Self::Sm75 | Self::Sm80 | Self::Sm86 | Self::Sm89 => 256,
            Self::Sm90 | Self::Sm100 | Self::Sm120 => 128,
        }
    }
}

impl fmt::Display for SmVersion {
    /// Writes the lowercase target name, e.g. `sm_80`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_str(match self {
            Self::Sm75 => "sm_75",
            Self::Sm80 => "sm_80",
            Self::Sm86 => "sm_86",
            Self::Sm89 => "sm_89",
            Self::Sm90 => "sm_90",
            Self::Sm100 => "sm_100",
            Self::Sm120 => "sm_120",
        })
    }
}
142
/// Telemetry captured for a single kernel launch.
///
/// Mandatory launch geometry is supplied to [`LaunchTelemetry::new`]; the
/// optional measurements are attached afterwards with the `with_*` builders.
#[derive(Debug, Clone)]
pub struct LaunchTelemetry {
    /// Name of the launched kernel.
    pub kernel_name: String,
    /// Grid dimensions as `(x, y, z)`.
    pub grid_dim: (u32, u32, u32),
    /// Block dimensions as `(x, y, z)`.
    pub block_dim: (u32, u32, u32),
    /// Shared memory requested for the launch, in bytes.
    pub shared_memory_bytes: u32,
    /// Registers used per thread, if known.
    pub register_count: Option<u32>,
    /// Measured execution time in milliseconds, if known.
    pub elapsed_ms: Option<f64>,
    /// Achieved occupancy as a fraction (displayed as a percentage), if known.
    pub achieved_occupancy: Option<f64>,
    /// Theoretical occupancy as a fraction, if known.
    pub theoretical_occupancy: Option<f64>,
    /// Instant at which this record was created.
    pub timestamp: Instant,
}

impl LaunchTelemetry {
    /// Creates a record for `kernel_name` with the given launch geometry.
    ///
    /// Shared memory starts at zero, every optional measurement starts as
    /// `None`, and `timestamp` is set to the current instant.
    #[must_use]
    pub fn new(kernel_name: &str, grid_dim: (u32, u32, u32), block_dim: (u32, u32, u32)) -> Self {
        Self {
            kernel_name: kernel_name.to_owned(),
            grid_dim,
            block_dim,
            shared_memory_bytes: 0,
            register_count: None,
            elapsed_ms: None,
            achieved_occupancy: None,
            theoretical_occupancy: None,
            timestamp: Instant::now(),
        }
    }

    /// Sets the shared memory used by the launch, in bytes.
    #[must_use]
    pub fn with_shared_memory(mut self, bytes: u32) -> Self {
        self.shared_memory_bytes = bytes;
        self
    }

    /// Sets the per-thread register count.
    #[must_use]
    pub fn with_register_count(mut self, count: u32) -> Self {
        self.register_count = Some(count);
        self
    }

    /// Sets the measured execution time in milliseconds.
    #[must_use]
    pub fn with_elapsed_ms(mut self, ms: f64) -> Self {
        self.elapsed_ms = Some(ms);
        self
    }

    /// Sets the achieved occupancy (a fraction, e.g. `0.75`).
    #[must_use]
    pub fn with_achieved_occupancy(mut self, occ: f64) -> Self {
        self.achieved_occupancy = Some(occ);
        self
    }

    /// Sets the theoretical occupancy (a fraction, e.g. `0.80`).
    #[must_use]
    pub fn with_theoretical_occupancy(mut self, occ: f64) -> Self {
        self.theoretical_occupancy = Some(occ);
        self
    }

    /// Total number of threads in the launch: the product of all grid and
    /// block dimensions.
    ///
    /// Six `u32` factors can exceed `u64::MAX`, so the product saturates at
    /// `u64::MAX` instead of overflowing. A zero dimension always yields `0`.
    #[must_use]
    pub fn total_threads(&self) -> u64 {
        let (gx, gy, gz) = self.grid_dim;
        let (bx, by, bz) = self.block_dim;
        [gx, gy, gz, bx, by, bz]
            .iter()
            .fold(1_u64, |product, &dim| product.saturating_mul(u64::from(dim)))
    }
}

impl fmt::Display for LaunchTelemetry {
    /// Writes a one-line description: name, geometry and shared memory, then
    /// the register count, elapsed time and achieved occupancy when present.
    /// Theoretical occupancy is not part of the output.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let (gx, gy, gz) = self.grid_dim;
        let (bx, by, bz) = self.block_dim;
        write!(
            f,
            "Kernel '{}': grid=({gx},{gy},{gz}), block=({bx},{by},{bz}), smem={}B",
            self.kernel_name, self.shared_memory_bytes,
        )?;
        if let Some(regs) = self.register_count {
            write!(f, ", regs={regs}")?;
        }
        if let Some(ms) = self.elapsed_ms {
            write!(f, ", time={ms:.3}ms")?;
        }
        if let Some(occ) = self.achieved_occupancy {
            write!(f, ", occupancy={:.1}%", occ * 100.0)?;
        }
        Ok(())
    }
}
265
/// Aggregated statistics for all recorded launches of one kernel.
#[derive(Debug, Clone)]
pub struct KernelStats {
    /// Name of the kernel these statistics describe.
    pub kernel_name: String,
    /// Number of recorded launches.
    pub launch_count: u32,
    /// Sum of the recorded elapsed times, in milliseconds.
    pub total_time_ms: f64,
    /// Average elapsed time, in milliseconds.
    pub avg_time_ms: f64,
    /// Smallest recorded elapsed time, in milliseconds.
    pub min_time_ms: f64,
    /// Largest recorded elapsed time, in milliseconds.
    pub max_time_ms: f64,
    /// Average achieved occupancy as a fraction (displayed as a percentage).
    pub avg_occupancy: f64,
}

impl fmt::Display for KernelStats {
    /// Writes a single line with the launch count, the timing figures (three
    /// decimals) and the average occupancy as a percentage (one decimal).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            kernel_name,
            launch_count,
            total_time_ms,
            avg_time_ms,
            min_time_ms,
            max_time_ms,
            avg_occupancy,
        } = self;
        let occupancy_pct = avg_occupancy * 100.0;
        write!(
            f,
            "{kernel_name}: {launch_count} launches, total={total_time_ms:.3}ms, avg={avg_time_ms:.3}ms, min={min_time_ms:.3}ms, max={max_time_ms:.3}ms, occ={occupancy_pct:.1}%",
        )
    }
}
304
305#[derive(Debug, Clone)]
314pub struct TelemetrySummary {
315 pub total_launches: usize,
317 pub total_gpu_time_ms: f64,
319 pub avg_gpu_time_ms: f64,
321 pub min_gpu_time_ms: f64,
323 pub max_gpu_time_ms: f64,
325 pub avg_occupancy: f64,
327 pub hottest_kernel: Option<String>,
329 pub per_kernel_stats: Vec<KernelStats>,
331}
332
333impl fmt::Display for TelemetrySummary {
334 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335 writeln!(f, "=== Telemetry Summary ===")?;
336 writeln!(f, "Total launches: {}", self.total_launches)?;
337 writeln!(f, "Total GPU time: {:.3} ms", self.total_gpu_time_ms)?;
338 writeln!(f, "Avg GPU time: {:.3} ms", self.avg_gpu_time_ms)?;
339 writeln!(f, "Min GPU time: {:.3} ms", self.min_gpu_time_ms)?;
340 writeln!(f, "Max GPU time: {:.3} ms", self.max_gpu_time_ms)?;
341 writeln!(f, "Avg occupancy: {:.1}%", self.avg_occupancy * 100.0)?;
342 if let Some(ref hot) = self.hottest_kernel {
343 writeln!(f, "Hottest kernel: {hot}")?;
344 }
345 if !self.per_kernel_stats.is_empty() {
346 writeln!(f, "--- Per-kernel ---")?;
347 for ks in &self.per_kernel_stats {
348 writeln!(f, " {ks}")?;
349 }
350 }
351 Ok(())
352 }
353}
354
355#[derive(Debug)]
364pub struct TelemetryCollector {
365 entries: Vec<LaunchTelemetry>,
366 enabled: bool,
367 max_entries: usize,
368}
369
370impl TelemetryCollector {
371 #[must_use]
373 pub fn new(max_entries: usize) -> Self {
374 Self {
375 entries: Vec::new(),
376 enabled: true,
377 max_entries,
378 }
379 }
380
381 pub fn record(&mut self, telemetry: LaunchTelemetry) {
386 if !self.enabled {
387 return;
388 }
389 if self.entries.len() >= self.max_entries {
390 return;
391 }
392 self.entries.push(telemetry);
393 }
394
395 pub fn enable(&mut self) {
397 self.enabled = true;
398 }
399
400 pub fn disable(&mut self) {
402 self.enabled = false;
403 }
404
405 #[must_use]
407 pub fn is_enabled(&self) -> bool {
408 self.enabled
409 }
410
411 pub fn clear(&mut self) {
413 self.entries.clear();
414 }
415
416 #[must_use]
418 pub fn entries(&self) -> &[LaunchTelemetry] {
419 &self.entries
420 }
421
422 #[must_use]
424 pub fn len(&self) -> usize {
425 self.entries.len()
426 }
427
428 #[must_use]
430 pub fn is_empty(&self) -> bool {
431 self.entries.is_empty()
432 }
433
434 #[must_use]
438 pub fn summary(&self) -> TelemetrySummary {
439 compute_summary(&self.entries)
440 }
441}
442
443fn compute_summary(entries: &[LaunchTelemetry]) -> TelemetrySummary {
445 if entries.is_empty() {
446 return TelemetrySummary {
447 total_launches: 0,
448 total_gpu_time_ms: 0.0,
449 avg_gpu_time_ms: 0.0,
450 min_gpu_time_ms: 0.0,
451 max_gpu_time_ms: 0.0,
452 avg_occupancy: 0.0,
453 hottest_kernel: None,
454 per_kernel_stats: Vec::new(),
455 };
456 }
457
458 let mut total_time = 0.0_f64;
459 let mut min_time = f64::MAX;
460 let mut max_time = f64::MIN;
461 let mut time_count = 0usize;
462 let mut total_occ = 0.0_f64;
463 let mut occ_count = 0usize;
464
465 struct KernelAccum {
467 count: u32,
468 total_time: f64,
469 min_time: f64,
470 max_time: f64,
471 total_occ: f64,
472 occ_count: u32,
473 }
474
475 let mut per_kernel: HashMap<String, KernelAccum> = HashMap::new();
476
477 for entry in entries {
478 if let Some(ms) = entry.elapsed_ms {
479 total_time += ms;
480 if ms < min_time {
481 min_time = ms;
482 }
483 if ms > max_time {
484 max_time = ms;
485 }
486 time_count += 1;
487 }
488 if let Some(occ) = entry.achieved_occupancy {
489 total_occ += occ;
490 occ_count += 1;
491 }
492
493 let acc = per_kernel
494 .entry(entry.kernel_name.clone())
495 .or_insert(KernelAccum {
496 count: 0,
497 total_time: 0.0,
498 min_time: f64::MAX,
499 max_time: f64::MIN,
500 total_occ: 0.0,
501 occ_count: 0,
502 });
503 acc.count += 1;
504 if let Some(ms) = entry.elapsed_ms {
505 acc.total_time += ms;
506 if ms < acc.min_time {
507 acc.min_time = ms;
508 }
509 if ms > acc.max_time {
510 acc.max_time = ms;
511 }
512 }
513 if let Some(occ) = entry.achieved_occupancy {
514 acc.total_occ += occ;
515 acc.occ_count += 1;
516 }
517 }
518
519 if time_count == 0 {
521 min_time = 0.0;
522 max_time = 0.0;
523 }
524
525 let mut per_kernel_stats: Vec<KernelStats> = per_kernel
527 .into_iter()
528 .map(|(name, acc)| {
529 let min_t = if acc.min_time == f64::MAX {
530 0.0
531 } else {
532 acc.min_time
533 };
534 let max_t = if acc.max_time == f64::MIN {
535 0.0
536 } else {
537 acc.max_time
538 };
539 let avg_t = if acc.count > 0 {
540 acc.total_time / f64::from(acc.count)
541 } else {
542 0.0
543 };
544 let avg_o = if acc.occ_count > 0 {
545 acc.total_occ / f64::from(acc.occ_count)
546 } else {
547 0.0
548 };
549 KernelStats {
550 kernel_name: name,
551 launch_count: acc.count,
552 total_time_ms: acc.total_time,
553 avg_time_ms: avg_t,
554 min_time_ms: min_t,
555 max_time_ms: max_t,
556 avg_occupancy: avg_o,
557 }
558 })
559 .collect();
560
561 per_kernel_stats.sort_by(|a, b| {
563 b.total_time_ms
564 .partial_cmp(&a.total_time_ms)
565 .unwrap_or(std::cmp::Ordering::Equal)
566 });
567
568 let hottest_kernel = per_kernel_stats.first().map(|ks| ks.kernel_name.clone());
569
570 let avg_gpu_time = if time_count > 0 {
571 total_time / time_count as f64
572 } else {
573 0.0
574 };
575 let avg_occ = if occ_count > 0 {
576 total_occ / occ_count as f64
577 } else {
578 0.0
579 };
580
581 TelemetrySummary {
582 total_launches: entries.len(),
583 total_gpu_time_ms: total_time,
584 avg_gpu_time_ms: avg_gpu_time,
585 min_gpu_time_ms: min_time,
586 max_gpu_time_ms: max_time,
587 avg_occupancy: avg_occ,
588 hottest_kernel,
589 per_kernel_stats,
590 }
591}
592
593pub struct TelemetryExporter;
601
602impl TelemetryExporter {
603 #[must_use]
608 pub fn to_json(entries: &[LaunchTelemetry]) -> String {
609 let mut out = String::from("[\n");
610 for (i, e) in entries.iter().enumerate() {
611 out.push_str(" {\n");
612 json_field_str(&mut out, "kernel_name", &e.kernel_name);
613 out.push_str(&format!(
614 " \"grid_dim\": [{}, {}, {}],\n",
615 e.grid_dim.0, e.grid_dim.1, e.grid_dim.2
616 ));
617 out.push_str(&format!(
618 " \"block_dim\": [{}, {}, {}],\n",
619 e.block_dim.0, e.block_dim.1, e.block_dim.2
620 ));
621 out.push_str(&format!(
622 " \"shared_memory_bytes\": {},\n",
623 e.shared_memory_bytes
624 ));
625 json_field_opt_u32(&mut out, "register_count", e.register_count);
626 json_field_opt_f64(&mut out, "elapsed_ms", e.elapsed_ms);
627 json_field_opt_f64(&mut out, "achieved_occupancy", e.achieved_occupancy);
628 json_field_opt_f64_last(&mut out, "theoretical_occupancy", e.theoretical_occupancy);
629 out.push_str(" }");
630 if i + 1 < entries.len() {
631 out.push(',');
632 }
633 out.push('\n');
634 }
635 out.push(']');
636 out
637 }
638
639 #[must_use]
643 pub fn to_csv(entries: &[LaunchTelemetry]) -> String {
644 let mut out = String::from(
645 "kernel_name,grid_x,grid_y,grid_z,block_x,block_y,block_z,\
646 shared_memory_bytes,register_count,elapsed_ms,\
647 achieved_occupancy,theoretical_occupancy\n",
648 );
649 for e in entries {
650 out.push_str(&csv_escape(&e.kernel_name));
651 out.push(',');
652 out.push_str(&format!(
653 "{},{},{},{},{},{},{},",
654 e.grid_dim.0,
655 e.grid_dim.1,
656 e.grid_dim.2,
657 e.block_dim.0,
658 e.block_dim.1,
659 e.block_dim.2,
660 e.shared_memory_bytes,
661 ));
662 csv_opt_u32(&mut out, e.register_count);
663 out.push(',');
664 csv_opt_f64(&mut out, e.elapsed_ms);
665 out.push(',');
666 csv_opt_f64(&mut out, e.achieved_occupancy);
667 out.push(',');
668 csv_opt_f64(&mut out, e.theoretical_occupancy);
669 out.push('\n');
670 }
671 out
672 }
673
674 #[must_use]
679 pub fn to_chrome_trace(entries: &[LaunchTelemetry]) -> String {
680 let mut out = String::from("{\"traceEvents\":[\n");
681 let mut ts_us = 0.0_f64; for (i, e) in entries.iter().enumerate() {
683 let dur_us = e.elapsed_ms.unwrap_or(0.0) * 1000.0;
684 out.push_str(&format!(
685 " {{\"name\":\"{}\",\"cat\":\"gpu\",\"ph\":\"X\",\
686 \"ts\":{:.3},\"dur\":{:.3},\"pid\":1,\"tid\":1,\
687 \"args\":{{\"grid\":\"{},{},{}\",\"block\":\"{},{},{}\",\
688 \"smem\":{}",
689 json_escape_str(&e.kernel_name),
690 ts_us,
691 dur_us,
692 e.grid_dim.0,
693 e.grid_dim.1,
694 e.grid_dim.2,
695 e.block_dim.0,
696 e.block_dim.1,
697 e.block_dim.2,
698 e.shared_memory_bytes,
699 ));
700 if let Some(regs) = e.register_count {
701 out.push_str(&format!(",\"regs\":{regs}"));
702 }
703 if let Some(occ) = e.achieved_occupancy {
704 out.push_str(&format!(",\"occupancy\":{occ:.4}"));
705 }
706 out.push_str("}}");
707 if i + 1 < entries.len() {
708 out.push(',');
709 }
710 out.push('\n');
711 ts_us += dur_us;
712 }
713 out.push_str("]}\n");
714 out
715 }
716}
717
/// Escapes `s` so it can be placed between double quotes in a JSON document.
///
/// Backslash and double quote are backslash-escaped, and every control
/// character below U+0020 is escaped as well (short forms for `\b`, `\f`,
/// `\n`, `\r`, `\t`, `\u00XX` otherwise), as JSON requires. All other
/// characters are copied unchanged.
fn json_escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            '\u{08}' => escaped.push_str("\\b"),
            '\u{0c}' => escaped.push_str("\\f"),
            c if (c as u32) < 0x20 => escaped.push_str(&format!("\\u{:04x}", c as u32)),
            c => escaped.push(c),
        }
    }
    escaped
}
729
730fn json_field_str(out: &mut String, key: &str, val: &str) {
731 out.push_str(&format!(" \"{key}\": \"{}\",\n", json_escape_str(val)));
732}
733
/// Appends a `"key": <number>,` line to `out`, writing `null` when `val` is `None`.
///
/// The line ends with a comma, so it must not be the last field of an object.
fn json_field_opt_u32(out: &mut String, key: &str, val: Option<u32>) {
    let rendered = val.map_or_else(|| String::from("null"), |v| v.to_string());
    out.push_str(" \"");
    out.push_str(key);
    out.push_str("\": ");
    out.push_str(&rendered);
    out.push_str(",\n");
}
740
/// Appends a `"key": <number>,` line to `out`.
///
/// `None` is written as `null`. NaN and the infinities are written as `null`
/// too, because JSON has no representation for them. The line ends with a
/// comma, so it must not be the last field of an object.
fn json_field_opt_f64(out: &mut String, key: &str, val: Option<f64>) {
    match val {
        Some(v) if v.is_finite() => out.push_str(&format!(" \"{key}\": {v},\n")),
        _ => out.push_str(&format!(" \"{key}\": null,\n")),
    }
}
747
/// Appends a `"key": <number>` line to `out` without a trailing comma, for
/// the final field of a JSON object.
///
/// `None` is written as `null`. NaN and the infinities are written as `null`
/// too, because JSON has no representation for them.
fn json_field_opt_f64_last(out: &mut String, key: &str, val: Option<f64>) {
    match val {
        Some(v) if v.is_finite() => out.push_str(&format!(" \"{key}\": {v}\n")),
        _ => out.push_str(&format!(" \"{key}\": null\n")),
    }
}
754
/// Returns `s` as a CSV field.
///
/// A field containing a comma, a double quote, a line feed or a carriage
/// return is wrapped in double quotes with embedded quotes doubled; any other
/// field is returned unchanged.
fn csv_escape(s: &str) -> String {
    // A bare carriage return splits the record in many CSV readers, so it
    // needs quoting just like a line feed.
    let needs_quoting = s.contains(|c: char| matches!(c, ',' | '"' | '\n' | '\r'));
    if needs_quoting {
        format!("\"{}\"", s.replace('"', "\"\""))
    } else {
        s.to_owned()
    }
}
762
/// Appends the decimal form of `val` to `out`; `None` appends nothing, which
/// leaves the CSV field empty.
fn csv_opt_u32(out: &mut String, val: Option<u32>) {
    match val {
        Some(number) => out.push_str(&number.to_string()),
        None => {}
    }
}
768
/// Appends the `Display` form of `val` to `out`; `None` appends nothing,
/// which leaves the CSV field empty.
fn csv_opt_f64(out: &mut String, val: Option<f64>) {
    match val {
        Some(number) => out.push_str(&number.to_string()),
        None => {}
    }
}
774
775#[must_use]
800pub fn estimate_occupancy(
801 block_size: u32,
802 registers_per_thread: u32,
803 shared_mem: u32,
804 sm_version: SmVersion,
805) -> f64 {
806 if block_size == 0 {
807 return 0.0;
808 }
809
810 let warp_size = sm_version.warp_size();
811 let max_warps = sm_version.max_warps_per_sm();
812 let max_blocks = sm_version.max_blocks_per_sm();
813 let regs_per_sm = sm_version.registers_per_sm();
814 let max_smem = sm_version.max_shared_mem_per_sm();
815 let reg_granularity = sm_version.register_alloc_granularity();
816 let smem_granularity = sm_version.shared_mem_alloc_granularity();
817
818 let warps_per_block = block_size.div_ceil(warp_size);
820
821 let regs_per_thread = if registers_per_thread == 0 {
823 1 } else {
825 registers_per_thread
826 };
827 let regs_alloc = regs_per_thread.div_ceil(reg_granularity) * reg_granularity;
829 let regs_per_warp = regs_alloc * warp_size;
830 let warps_limited_by_regs = regs_per_sm.checked_div(regs_per_warp).unwrap_or(max_warps);
831
832 let smem_per_block = if shared_mem == 0 {
834 0
835 } else {
836 shared_mem.div_ceil(smem_granularity) * smem_granularity
837 };
838 let blocks_limited_by_smem = max_smem.checked_div(smem_per_block).unwrap_or(max_blocks);
839
840 let blocks_by_warps = warps_limited_by_regs
842 .checked_div(warps_per_block)
843 .unwrap_or(max_blocks);
844
845 let active_blocks = max_blocks.min(blocks_by_warps).min(blocks_limited_by_smem);
846
847 let active_warps = active_blocks * warps_per_block;
848 let occupancy = active_warps as f64 / max_warps as f64;
849
850 occupancy.clamp(0.0, 1.0)
851}
852
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that two floats differ by less than `f64::EPSILON`.
    fn assert_close(actual: f64, expected: f64) {
        assert!(
            (actual - expected).abs() < f64::EPSILON,
            "expected {expected}, got {actual}"
        );
    }

    /// Builds a launch with a 1x1x1 grid and a one-dimensional block.
    fn launch(name: &str, block_x: u32) -> LaunchTelemetry {
        LaunchTelemetry::new(name, (1, 1, 1), (block_x, 1, 1))
    }

    // --- LaunchTelemetry ---

    #[test]
    fn telemetry_new_defaults() {
        let record = LaunchTelemetry::new("kern", (4, 1, 1), (256, 1, 1));
        assert_eq!(record.kernel_name, "kern");
        assert_eq!(record.grid_dim, (4, 1, 1));
        assert_eq!(record.block_dim, (256, 1, 1));
        assert_eq!(record.shared_memory_bytes, 0);
        assert_eq!(record.register_count, None);
        assert!(record.elapsed_ms.is_none());
        assert!(record.achieved_occupancy.is_none());
        assert!(record.theoretical_occupancy.is_none());
    }

    #[test]
    fn telemetry_builder_methods() {
        let record = launch("kern", 128)
            .with_shared_memory(4096)
            .with_register_count(32)
            .with_elapsed_ms(1.5)
            .with_achieved_occupancy(0.75)
            .with_theoretical_occupancy(0.80);

        assert_eq!(record.shared_memory_bytes, 4096);
        assert_eq!(record.register_count, Some(32));
        assert_close(record.elapsed_ms.unwrap_or(0.0), 1.5);
        assert_close(record.achieved_occupancy.unwrap_or(0.0), 0.75);
        assert_close(record.theoretical_occupancy.unwrap_or(0.0), 0.80);
    }

    #[test]
    fn telemetry_total_threads() {
        let record = LaunchTelemetry::new("k", (4, 2, 1), (16, 16, 1));
        assert_eq!(record.total_threads(), 4 * 2 * 16 * 16);
    }

    #[test]
    fn telemetry_display() {
        let rendered = LaunchTelemetry::new("add", (4, 1, 1), (256, 1, 1))
            .with_elapsed_ms(0.5)
            .with_register_count(24)
            .with_achieved_occupancy(0.85)
            .to_string();
        for needle in ["add", "0.500ms", "regs=24", "85.0%"].iter() {
            assert!(rendered.contains(needle), "missing {needle:?} in {rendered:?}");
        }
    }

    // --- TelemetryCollector ---

    #[test]
    fn collector_record_and_len() {
        let mut collector = TelemetryCollector::new(100);
        assert!(collector.is_empty());
        collector.record(launch("k", 64));
        assert_eq!(collector.len(), 1);
        assert!(!collector.is_empty());
    }

    #[test]
    fn collector_enable_disable() {
        let mut collector = TelemetryCollector::new(100);
        assert!(collector.is_enabled());

        // While disabled, records are dropped.
        collector.disable();
        assert!(!collector.is_enabled());
        collector.record(launch("k", 64));
        assert_eq!(collector.len(), 0);

        // Once re-enabled, records are stored again.
        collector.enable();
        collector.record(launch("k", 64));
        assert_eq!(collector.len(), 1);
    }

    #[test]
    fn collector_max_entries_cap() {
        let mut collector = TelemetryCollector::new(3);
        (0..10).for_each(|_| collector.record(launch("k", 64)));
        assert_eq!(collector.len(), 3);
    }

    #[test]
    fn collector_clear() {
        let mut collector = TelemetryCollector::new(100);
        collector.record(launch("k", 64));
        collector.clear();
        assert!(collector.is_empty());
    }

    // --- Summary ---

    #[test]
    fn summary_empty() {
        let summary = TelemetryCollector::new(100).summary();
        assert_eq!(summary.total_launches, 0);
        assert!(summary.total_gpu_time_ms.abs() < f64::EPSILON);
        assert!(summary.hottest_kernel.is_none());
        assert!(summary.per_kernel_stats.is_empty());
    }

    #[test]
    fn summary_single_kernel() {
        let mut collector = TelemetryCollector::new(100);
        for &(ms, occ) in [(1.0, 0.8), (3.0, 0.9)].iter() {
            collector.record(
                launch("add", 256)
                    .with_elapsed_ms(ms)
                    .with_achieved_occupancy(occ),
            );
        }

        let summary = collector.summary();
        assert_eq!(summary.total_launches, 2);
        assert_close(summary.total_gpu_time_ms, 4.0);
        assert_close(summary.avg_gpu_time_ms, 2.0);
        assert_close(summary.min_gpu_time_ms, 1.0);
        assert_close(summary.max_gpu_time_ms, 3.0);
        assert!((summary.avg_occupancy - 0.85).abs() < 1e-9);
        assert_eq!(summary.hottest_kernel.as_deref(), Some("add"));
        assert_eq!(summary.per_kernel_stats.len(), 1);
        assert_eq!(summary.per_kernel_stats[0].launch_count, 2);
    }

    #[test]
    fn summary_per_kernel_aggregation() {
        let mut collector = TelemetryCollector::new(100);
        collector.record(launch("matmul", 256).with_elapsed_ms(10.0));
        collector.record(launch("add", 128).with_elapsed_ms(1.0));
        collector.record(launch("matmul", 256).with_elapsed_ms(12.0));

        let summary = collector.summary();
        assert_eq!(summary.total_launches, 3);
        assert_eq!(summary.hottest_kernel.as_deref(), Some("matmul"));
        assert_eq!(summary.per_kernel_stats.len(), 2);

        // The kernel with the largest total time is listed first.
        let top = &summary.per_kernel_stats[0];
        assert_eq!(top.kernel_name, "matmul");
        assert_eq!(top.launch_count, 2);
        assert_close(top.total_time_ms, 22.0);
    }

    #[test]
    fn summary_display() {
        let mut collector = TelemetryCollector::new(100);
        collector.record(
            launch("k", 256)
                .with_elapsed_ms(2.0)
                .with_achieved_occupancy(0.5),
        );
        let text = collector.summary().to_string();
        assert!(text.contains("Telemetry Summary"));
        assert!(text.contains("Total launches: 1"));
        assert!(text.contains("50.0%"));
    }

    // --- Exporters ---

    #[test]
    fn export_json() {
        let entries = vec![
            LaunchTelemetry::new("kern", (4, 1, 1), (256, 1, 1))
                .with_elapsed_ms(0.5)
                .with_register_count(32),
        ];
        let json = TelemetryExporter::to_json(&entries);
        assert!(json.starts_with('['));
        let expected_fragments = [
            "\"kernel_name\": \"kern\"",
            "\"grid_dim\": [4, 1, 1]",
            "\"elapsed_ms\": 0.5",
            "\"register_count\": 32",
            "\"achieved_occupancy\": null",
        ];
        for fragment in expected_fragments.iter() {
            assert!(json.contains(fragment), "missing {fragment:?} in {json:?}");
        }
    }

    #[test]
    fn export_csv() {
        let entries =
            vec![LaunchTelemetry::new("kern", (2, 1, 1), (128, 1, 1)).with_elapsed_ms(1.0)];
        let csv = TelemetryExporter::to_csv(&entries);
        let lines: Vec<&str> = csv.lines().collect();
        // One header line plus one data line.
        assert_eq!(lines.len(), 2);
        assert!(lines[0].starts_with("kernel_name,"));
        assert!(lines[1].starts_with("kern,"));
        assert!(lines[1].contains("128"));
    }

    #[test]
    fn export_chrome_trace() {
        let entries = vec![
            launch("k1", 256).with_elapsed_ms(1.0),
            LaunchTelemetry::new("k2", (2, 1, 1), (128, 1, 1)).with_elapsed_ms(2.0),
        ];
        let trace = TelemetryExporter::to_chrome_trace(&entries);
        let expected_fragments = [
            "\"traceEvents\"",
            "\"name\":\"k1\"",
            "\"name\":\"k2\"",
            "\"ph\":\"X\"",
            "\"cat\":\"gpu\"",
        ];
        for fragment in expected_fragments.iter() {
            assert!(trace.contains(fragment), "missing {fragment:?} in {trace:?}");
        }
    }

    // --- Occupancy ---

    #[test]
    fn occupancy_basic() {
        let occupancy = estimate_occupancy(256, 32, 0, SmVersion::Sm80);
        assert!(occupancy > 0.0);
        assert!(occupancy <= 1.0);
    }

    #[test]
    fn occupancy_zero_block() {
        let occupancy = estimate_occupancy(0, 32, 0, SmVersion::Sm80);
        assert!(occupancy.abs() < f64::EPSILON);
    }

    #[test]
    fn occupancy_high_registers_lowers_occupancy() {
        let with_many_regs = estimate_occupancy(256, 128, 0, SmVersion::Sm80);
        let with_few_regs = estimate_occupancy(256, 16, 0, SmVersion::Sm80);
        assert!(with_many_regs < with_few_regs);
    }

    #[test]
    fn occupancy_large_shared_mem_lowers_occupancy() {
        let with_large_smem = estimate_occupancy(256, 32, 100_000, SmVersion::Sm80);
        let without_smem = estimate_occupancy(256, 32, 0, SmVersion::Sm80);
        assert!(with_large_smem <= without_smem);
    }

    #[test]
    fn occupancy_sm_versions() {
        let all_versions = [
            SmVersion::Sm75,
            SmVersion::Sm80,
            SmVersion::Sm86,
            SmVersion::Sm89,
            SmVersion::Sm90,
            SmVersion::Sm100,
            SmVersion::Sm120,
        ];
        for &sm in all_versions.iter() {
            let occupancy = estimate_occupancy(128, 32, 0, sm);
            assert!(occupancy > 0.0, "occupancy should be positive for {sm}");
            assert!(occupancy <= 1.0, "occupancy should be <= 1.0 for {sm}");
        }
    }

    // --- Display implementations ---

    #[test]
    fn sm_version_display() {
        assert_eq!(SmVersion::Sm80.to_string(), "sm_80");
        assert_eq!(SmVersion::Sm90.to_string(), "sm_90");
    }

    #[test]
    fn kernel_stats_display() {
        let stats = KernelStats {
            kernel_name: "matmul".to_owned(),
            launch_count: 5,
            total_time_ms: 10.0,
            avg_time_ms: 2.0,
            min_time_ms: 1.0,
            max_time_ms: 4.0,
            avg_occupancy: 0.75,
        };
        let rendered = stats.to_string();
        assert!(rendered.contains("matmul"));
        assert!(rendered.contains("5 launches"));
        assert!(rendered.contains("75.0%"));
    }
}