Skip to main content

oxicuda_launch/
telemetry.rs

1//! Launch telemetry: timing, occupancy, and register usage reporting.
2//!
3//! This module provides post-launch diagnostics for GPU kernel execution.
4//! After a kernel launch, [`LaunchTelemetry`] captures grid/block dimensions,
5//! GPU-side timing, achieved occupancy, and register usage. A
6//! [`TelemetryCollector`] accumulates entries and produces a
7//! [`TelemetrySummary`] with per-kernel aggregation.
8//!
9//! Telemetry can be exported to JSON, CSV, or Chrome trace format via
10//! [`TelemetryExporter`].
11//!
12//! # Example
13//!
14//! ```
15//! use oxicuda_launch::telemetry::{LaunchTelemetry, TelemetryCollector};
16//!
17//! let mut collector = TelemetryCollector::new(1000);
18//! let entry = LaunchTelemetry::new("vector_add", (4, 1, 1), (256, 1, 1))
19//!     .with_elapsed_ms(0.5)
20//!     .with_achieved_occupancy(0.85);
21//! collector.record(entry);
22//! let summary = collector.summary();
23//! assert_eq!(summary.total_launches, 1);
24//! ```
25
26use std::collections::HashMap;
27use std::fmt;
28use std::time::Instant;
29
30// ---------------------------------------------------------------------------
31// SmVersion (local, avoids oxicuda-ptx dependency)
32// ---------------------------------------------------------------------------
33
/// GPU architecture version for occupancy estimation.
///
/// This is a local copy that avoids a dependency on `oxicuda-ptx`.
/// Each variant selects the per-SM resource limits (resident warps and
/// blocks, register file size, shared memory) returned by the methods below.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SmVersion {
    /// Turing (compute capability 7.5).
    Sm75,
    /// Ampere GA100 (compute capability 8.0).
    Sm80,
    /// Ampere GA10x (compute capability 8.6).
    Sm86,
    /// Ada Lovelace (compute capability 8.9).
    Sm89,
    /// Hopper (compute capability 9.0).
    Sm90,
    /// Blackwell data-center parts such as B100/B200 (compute capability 10.0).
    Sm100,
    /// Blackwell consumer/workstation parts (compute capability 12.0).
    Sm120,
}

impl SmVersion {
    /// Maximum number of warps that can reside on a single SM.
    ///
    /// Compute capabilities 8.6, 8.9 and 12.0 hold at most 1536 resident
    /// threads (48 warps) per SM; 8.0, 9.0 and 10.0 hold 2048 (64 warps).
    #[must_use]
    pub const fn max_warps_per_sm(self) -> u32 {
        match self {
            Self::Sm75 => 32,
            Self::Sm86 | Self::Sm89 | Self::Sm120 => 48,
            Self::Sm80 | Self::Sm90 | Self::Sm100 => 64,
        }
    }

    /// Maximum number of thread blocks that can reside on a single SM.
    #[must_use]
    pub const fn max_blocks_per_sm(self) -> u32 {
        match self {
            Self::Sm75 | Self::Sm86 => 16,
            // NOTE(review): 24 for sm_120 follows NVIDIA's Blackwell tuning
            // guide; confirm against the current programming-guide table.
            Self::Sm89 | Self::Sm120 => 24,
            Self::Sm80 | Self::Sm90 | Self::Sm100 => 32,
        }
    }

    /// Total number of 32-bit registers available per SM.
    #[must_use]
    pub const fn registers_per_sm(self) -> u32 {
        65536
    }

    /// Maximum number of registers a single thread can use.
    #[must_use]
    pub const fn max_registers_per_thread(self) -> u32 {
        255
    }

    /// Maximum shared memory per SM in bytes.
    ///
    /// GA10x (8.6) and consumer Blackwell (12.0) share the 100 KB-class
    /// budget of Ada (8.9) rather than the larger data-center budgets.
    #[must_use]
    pub const fn max_shared_mem_per_sm(self) -> u32 {
        // NOTE(review): 101_376 (99 KiB) and 232_448 (227 KiB) match the
        // documented per-block opt-in maxima, and 163_840 is 160 KiB; NVIDIA
        // documents 100/228/164 KB per SM. Confirm which convention is wanted.
        match self {
            Self::Sm75 => 65_536,
            Self::Sm80 => 163_840,
            Self::Sm86 | Self::Sm89 | Self::Sm120 => 101_376,
            Self::Sm90 | Self::Sm100 => 232_448,
        }
    }

    /// Warp size (always 32 for NVIDIA GPUs).
    #[must_use]
    pub const fn warp_size(self) -> u32 {
        32
    }

    /// Register allocation granularity, expressed as registers per thread.
    ///
    /// A kernel's per-thread register count is rounded up to the nearest
    /// multiple of this value before computing how many warps fit on an SM.
    #[must_use]
    pub const fn register_alloc_granularity(self) -> u32 {
        // Registers are handed out to warps in units of 256
        // (8 registers per thread * 32 threads), so the per-thread
        // count rounds up to a multiple of 8.
        8
    }

    /// Shared memory allocation granularity in bytes.
    #[must_use]
    pub const fn shared_mem_alloc_granularity(self) -> u32 {
        match self {
            Self::Sm75 | Self::Sm80 | Self::Sm86 | Self::Sm89 => 256,
            Self::Sm90 | Self::Sm100 | Self::Sm120 => 128,
        }
    }
}
127
128impl fmt::Display for SmVersion {
129    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
130        let s = match self {
131            Self::Sm75 => "sm_75",
132            Self::Sm80 => "sm_80",
133            Self::Sm86 => "sm_86",
134            Self::Sm89 => "sm_89",
135            Self::Sm90 => "sm_90",
136            Self::Sm100 => "sm_100",
137            Self::Sm120 => "sm_120",
138        };
139        f.write_str(s)
140    }
141}
142
143// ---------------------------------------------------------------------------
144// LaunchTelemetry
145// ---------------------------------------------------------------------------
146
/// Telemetry data collected after a single kernel launch.
///
/// Records dimensions, timing, occupancy, and register usage.
/// Use the builder methods (`with_*`) to set optional fields after
/// constructing with [`LaunchTelemetry::new`].
#[derive(Debug, Clone)]
pub struct LaunchTelemetry {
    /// Name of the launched kernel.
    pub kernel_name: String,
    /// Grid dimensions `(x, y, z)`.
    pub grid_dim: (u32, u32, u32),
    /// Block dimensions `(x, y, z)`.
    pub block_dim: (u32, u32, u32),
    /// Dynamic shared memory allocated in bytes.
    pub shared_memory_bytes: u32,
    /// Number of registers used per thread, if known.
    pub register_count: Option<u32>,
    /// GPU-side elapsed time in milliseconds, if measured.
    pub elapsed_ms: Option<f64>,
    /// Achieved occupancy (0.0..=1.0), if measured.
    pub achieved_occupancy: Option<f64>,
    /// Theoretical occupancy (0.0..=1.0), if computed.
    pub theoretical_occupancy: Option<f64>,
    /// Monotonic timestamp captured when this entry was constructed.
    pub timestamp: Instant,
}

impl LaunchTelemetry {
    /// Creates a new telemetry entry with the given kernel name and dimensions.
    ///
    /// Shared memory defaults to `0` and every optional field to `None`;
    /// `timestamp` is set to `Instant::now()`. Use the `with_*` builder
    /// methods to fill in the rest.
    #[must_use]
    pub fn new(kernel_name: &str, grid_dim: (u32, u32, u32), block_dim: (u32, u32, u32)) -> Self {
        Self {
            kernel_name: kernel_name.to_owned(),
            grid_dim,
            block_dim,
            shared_memory_bytes: 0,
            register_count: None,
            elapsed_ms: None,
            achieved_occupancy: None,
            theoretical_occupancy: None,
            timestamp: Instant::now(),
        }
    }

    /// Sets the dynamic shared memory allocation in bytes.
    #[must_use]
    pub fn with_shared_memory(mut self, bytes: u32) -> Self {
        self.shared_memory_bytes = bytes;
        self
    }

    /// Sets the register count per thread.
    #[must_use]
    pub fn with_register_count(mut self, count: u32) -> Self {
        self.register_count = Some(count);
        self
    }

    /// Sets the GPU-side elapsed time in milliseconds.
    #[must_use]
    pub fn with_elapsed_ms(mut self, ms: f64) -> Self {
        self.elapsed_ms = Some(ms);
        self
    }

    /// Sets the achieved occupancy (0.0..=1.0). The value is stored as given.
    #[must_use]
    pub fn with_achieved_occupancy(mut self, occ: f64) -> Self {
        self.achieved_occupancy = Some(occ);
        self
    }

    /// Sets the theoretical occupancy (0.0..=1.0). The value is stored as given.
    #[must_use]
    pub fn with_theoretical_occupancy(mut self, occ: f64) -> Self {
        self.theoretical_occupancy = Some(occ);
        self
    }

    /// Total number of threads launched (grid volume times block volume).
    ///
    /// The six `u32` extents can multiply past `u64::MAX`, so the product
    /// saturates at `u64::MAX` instead of overflowing.
    #[must_use]
    pub fn total_threads(&self) -> u64 {
        let (gx, gy, gz) = self.grid_dim;
        let (bx, by, bz) = self.block_dim;
        [gx, gy, gz, bx, by, bz]
            .into_iter()
            .map(u64::from)
            .fold(1_u64, u64::saturating_mul)
    }
}
238
239impl fmt::Display for LaunchTelemetry {
240    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
241        write!(
242            f,
243            "Kernel '{}': grid=({},{},{}), block=({},{},{}), smem={}B",
244            self.kernel_name,
245            self.grid_dim.0,
246            self.grid_dim.1,
247            self.grid_dim.2,
248            self.block_dim.0,
249            self.block_dim.1,
250            self.block_dim.2,
251            self.shared_memory_bytes,
252        )?;
253        if let Some(regs) = self.register_count {
254            write!(f, ", regs={regs}")?;
255        }
256        if let Some(ms) = self.elapsed_ms {
257            write!(f, ", time={ms:.3}ms")?;
258        }
259        if let Some(occ) = self.achieved_occupancy {
260            write!(f, ", occupancy={:.1}%", occ * 100.0)?;
261        }
262        Ok(())
263    }
264}
265
266// ---------------------------------------------------------------------------
267// KernelStats
268// ---------------------------------------------------------------------------
269
/// Per-kernel statistics aggregated over every recorded launch of that kernel.
#[derive(Debug, Clone)]
pub struct KernelStats {
    /// Kernel these statistics belong to.
    pub kernel_name: String,
    /// How many launches of this kernel were recorded.
    pub launch_count: u32,
    /// Sum of GPU time over all launches, in milliseconds.
    pub total_time_ms: f64,
    /// Mean GPU time per launch, in milliseconds.
    pub avg_time_ms: f64,
    /// Shortest GPU time observed, in milliseconds.
    pub min_time_ms: f64,
    /// Longest GPU time observed, in milliseconds.
    pub max_time_ms: f64,
    /// Mean achieved occupancy over launches, as a fraction.
    pub avg_occupancy: f64,
}

/// Single-line rendering; times use three decimals, occupancy is a percentage.
impl fmt::Display for KernelStats {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            kernel_name,
            launch_count,
            total_time_ms,
            avg_time_ms,
            min_time_ms,
            max_time_ms,
            avg_occupancy,
        } = self;
        let occ_percent = avg_occupancy * 100.0;
        write!(
            f,
            "{kernel_name}: {launch_count} launches, total={total_time_ms:.3}ms, \
             avg={avg_time_ms:.3}ms, min={min_time_ms:.3}ms, max={max_time_ms:.3}ms, \
             occ={occ_percent:.1}%"
        )
    }
}
304
305// ---------------------------------------------------------------------------
306// TelemetrySummary
307// ---------------------------------------------------------------------------
308
309/// Summary of all collected telemetry data.
310///
311/// Provides aggregate statistics across all recorded kernel launches
312/// and per-kernel breakdowns via [`KernelStats`].
313#[derive(Debug, Clone)]
314pub struct TelemetrySummary {
315    /// Total number of kernel launches recorded.
316    pub total_launches: usize,
317    /// Total GPU time across all launches in milliseconds.
318    pub total_gpu_time_ms: f64,
319    /// Average GPU time per launch in milliseconds.
320    pub avg_gpu_time_ms: f64,
321    /// Minimum GPU time observed across all launches.
322    pub min_gpu_time_ms: f64,
323    /// Maximum GPU time observed across all launches.
324    pub max_gpu_time_ms: f64,
325    /// Average achieved occupancy across all launches.
326    pub avg_occupancy: f64,
327    /// Kernel with the most cumulative GPU time.
328    pub hottest_kernel: Option<String>,
329    /// Per-kernel aggregated statistics.
330    pub per_kernel_stats: Vec<KernelStats>,
331}
332
333impl fmt::Display for TelemetrySummary {
334    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
335        writeln!(f, "=== Telemetry Summary ===")?;
336        writeln!(f, "Total launches: {}", self.total_launches)?;
337        writeln!(f, "Total GPU time: {:.3} ms", self.total_gpu_time_ms)?;
338        writeln!(f, "Avg GPU time:   {:.3} ms", self.avg_gpu_time_ms)?;
339        writeln!(f, "Min GPU time:   {:.3} ms", self.min_gpu_time_ms)?;
340        writeln!(f, "Max GPU time:   {:.3} ms", self.max_gpu_time_ms)?;
341        writeln!(f, "Avg occupancy:  {:.1}%", self.avg_occupancy * 100.0)?;
342        if let Some(ref hot) = self.hottest_kernel {
343            writeln!(f, "Hottest kernel: {hot}")?;
344        }
345        if !self.per_kernel_stats.is_empty() {
346            writeln!(f, "--- Per-kernel ---")?;
347            for ks in &self.per_kernel_stats {
348                writeln!(f, "  {ks}")?;
349            }
350        }
351        Ok(())
352    }
353}
354
355// ---------------------------------------------------------------------------
356// TelemetryCollector
357// ---------------------------------------------------------------------------
358
359/// Accumulates [`LaunchTelemetry`] entries and produces summaries.
360///
361/// The collector can be enabled/disabled at runtime and caps the number
362/// of stored entries to `max_entries` to bound memory usage.
363#[derive(Debug)]
364pub struct TelemetryCollector {
365    entries: Vec<LaunchTelemetry>,
366    enabled: bool,
367    max_entries: usize,
368}
369
370impl TelemetryCollector {
371    /// Creates a new collector that stores up to `max_entries` telemetry records.
372    #[must_use]
373    pub fn new(max_entries: usize) -> Self {
374        Self {
375            entries: Vec::new(),
376            enabled: true,
377            max_entries,
378        }
379    }
380
381    /// Records a telemetry entry.
382    ///
383    /// If the collector is disabled or the entry count has reached
384    /// `max_entries`, the entry is silently dropped.
385    pub fn record(&mut self, telemetry: LaunchTelemetry) {
386        if !self.enabled {
387            return;
388        }
389        if self.entries.len() >= self.max_entries {
390            return;
391        }
392        self.entries.push(telemetry);
393    }
394
395    /// Enables telemetry recording.
396    pub fn enable(&mut self) {
397        self.enabled = true;
398    }
399
400    /// Disables telemetry recording. Existing entries are preserved.
401    pub fn disable(&mut self) {
402        self.enabled = false;
403    }
404
405    /// Returns whether the collector is currently enabled.
406    #[must_use]
407    pub fn is_enabled(&self) -> bool {
408        self.enabled
409    }
410
411    /// Clears all recorded entries.
412    pub fn clear(&mut self) {
413        self.entries.clear();
414    }
415
416    /// Returns a reference to all recorded entries.
417    #[must_use]
418    pub fn entries(&self) -> &[LaunchTelemetry] {
419        &self.entries
420    }
421
422    /// Returns the number of recorded entries.
423    #[must_use]
424    pub fn len(&self) -> usize {
425        self.entries.len()
426    }
427
428    /// Returns `true` if no entries have been recorded.
429    #[must_use]
430    pub fn is_empty(&self) -> bool {
431        self.entries.is_empty()
432    }
433
434    /// Computes a summary of all recorded telemetry.
435    ///
436    /// If no entries have been recorded, returns a zeroed summary.
437    #[must_use]
438    pub fn summary(&self) -> TelemetrySummary {
439        compute_summary(&self.entries)
440    }
441}
442
443/// Computes a [`TelemetrySummary`] from a slice of telemetry entries.
444fn compute_summary(entries: &[LaunchTelemetry]) -> TelemetrySummary {
445    if entries.is_empty() {
446        return TelemetrySummary {
447            total_launches: 0,
448            total_gpu_time_ms: 0.0,
449            avg_gpu_time_ms: 0.0,
450            min_gpu_time_ms: 0.0,
451            max_gpu_time_ms: 0.0,
452            avg_occupancy: 0.0,
453            hottest_kernel: None,
454            per_kernel_stats: Vec::new(),
455        };
456    }
457
458    let mut total_time = 0.0_f64;
459    let mut min_time = f64::MAX;
460    let mut max_time = f64::MIN;
461    let mut time_count = 0usize;
462    let mut total_occ = 0.0_f64;
463    let mut occ_count = 0usize;
464
465    // Per-kernel accumulators
466    struct KernelAccum {
467        count: u32,
468        total_time: f64,
469        min_time: f64,
470        max_time: f64,
471        total_occ: f64,
472        occ_count: u32,
473    }
474
475    let mut per_kernel: HashMap<String, KernelAccum> = HashMap::new();
476
477    for entry in entries {
478        if let Some(ms) = entry.elapsed_ms {
479            total_time += ms;
480            if ms < min_time {
481                min_time = ms;
482            }
483            if ms > max_time {
484                max_time = ms;
485            }
486            time_count += 1;
487        }
488        if let Some(occ) = entry.achieved_occupancy {
489            total_occ += occ;
490            occ_count += 1;
491        }
492
493        let acc = per_kernel
494            .entry(entry.kernel_name.clone())
495            .or_insert(KernelAccum {
496                count: 0,
497                total_time: 0.0,
498                min_time: f64::MAX,
499                max_time: f64::MIN,
500                total_occ: 0.0,
501                occ_count: 0,
502            });
503        acc.count += 1;
504        if let Some(ms) = entry.elapsed_ms {
505            acc.total_time += ms;
506            if ms < acc.min_time {
507                acc.min_time = ms;
508            }
509            if ms > acc.max_time {
510                acc.max_time = ms;
511            }
512        }
513        if let Some(occ) = entry.achieved_occupancy {
514            acc.total_occ += occ;
515            acc.occ_count += 1;
516        }
517    }
518
519    // Fix sentinel values when no timing data was present
520    if time_count == 0 {
521        min_time = 0.0;
522        max_time = 0.0;
523    }
524
525    // Build per-kernel stats
526    let mut per_kernel_stats: Vec<KernelStats> = per_kernel
527        .into_iter()
528        .map(|(name, acc)| {
529            let min_t = if acc.min_time == f64::MAX {
530                0.0
531            } else {
532                acc.min_time
533            };
534            let max_t = if acc.max_time == f64::MIN {
535                0.0
536            } else {
537                acc.max_time
538            };
539            let avg_t = if acc.count > 0 {
540                acc.total_time / f64::from(acc.count)
541            } else {
542                0.0
543            };
544            let avg_o = if acc.occ_count > 0 {
545                acc.total_occ / f64::from(acc.occ_count)
546            } else {
547                0.0
548            };
549            KernelStats {
550                kernel_name: name,
551                launch_count: acc.count,
552                total_time_ms: acc.total_time,
553                avg_time_ms: avg_t,
554                min_time_ms: min_t,
555                max_time_ms: max_t,
556                avg_occupancy: avg_o,
557            }
558        })
559        .collect();
560
561    // Sort by total time descending for deterministic output
562    per_kernel_stats.sort_by(|a, b| {
563        b.total_time_ms
564            .partial_cmp(&a.total_time_ms)
565            .unwrap_or(std::cmp::Ordering::Equal)
566    });
567
568    let hottest_kernel = per_kernel_stats.first().map(|ks| ks.kernel_name.clone());
569
570    let avg_gpu_time = if time_count > 0 {
571        total_time / time_count as f64
572    } else {
573        0.0
574    };
575    let avg_occ = if occ_count > 0 {
576        total_occ / occ_count as f64
577    } else {
578        0.0
579    };
580
581    TelemetrySummary {
582        total_launches: entries.len(),
583        total_gpu_time_ms: total_time,
584        avg_gpu_time_ms: avg_gpu_time,
585        min_gpu_time_ms: min_time,
586        max_gpu_time_ms: max_time,
587        avg_occupancy: avg_occ,
588        hottest_kernel,
589        per_kernel_stats,
590    }
591}
592
593// ---------------------------------------------------------------------------
594// TelemetryExporter
595// ---------------------------------------------------------------------------
596
597/// Export telemetry data in various formats.
598///
599/// All methods are stateless and operate on slices of [`LaunchTelemetry`].
600pub struct TelemetryExporter;
601
602impl TelemetryExporter {
603    /// Exports telemetry entries as a JSON array.
604    ///
605    /// Each entry becomes a JSON object with all fields. `None` values
606    /// are serialized as `null`.
607    #[must_use]
608    pub fn to_json(entries: &[LaunchTelemetry]) -> String {
609        let mut out = String::from("[\n");
610        for (i, e) in entries.iter().enumerate() {
611            out.push_str("  {\n");
612            json_field_str(&mut out, "kernel_name", &e.kernel_name);
613            out.push_str(&format!(
614                "    \"grid_dim\": [{}, {}, {}],\n",
615                e.grid_dim.0, e.grid_dim.1, e.grid_dim.2
616            ));
617            out.push_str(&format!(
618                "    \"block_dim\": [{}, {}, {}],\n",
619                e.block_dim.0, e.block_dim.1, e.block_dim.2
620            ));
621            out.push_str(&format!(
622                "    \"shared_memory_bytes\": {},\n",
623                e.shared_memory_bytes
624            ));
625            json_field_opt_u32(&mut out, "register_count", e.register_count);
626            json_field_opt_f64(&mut out, "elapsed_ms", e.elapsed_ms);
627            json_field_opt_f64(&mut out, "achieved_occupancy", e.achieved_occupancy);
628            json_field_opt_f64_last(&mut out, "theoretical_occupancy", e.theoretical_occupancy);
629            out.push_str("  }");
630            if i + 1 < entries.len() {
631                out.push(',');
632            }
633            out.push('\n');
634        }
635        out.push(']');
636        out
637    }
638
639    /// Exports telemetry entries as CSV.
640    ///
641    /// The first line is a header row. Missing values are empty cells.
642    #[must_use]
643    pub fn to_csv(entries: &[LaunchTelemetry]) -> String {
644        let mut out = String::from(
645            "kernel_name,grid_x,grid_y,grid_z,block_x,block_y,block_z,\
646             shared_memory_bytes,register_count,elapsed_ms,\
647             achieved_occupancy,theoretical_occupancy\n",
648        );
649        for e in entries {
650            out.push_str(&csv_escape(&e.kernel_name));
651            out.push(',');
652            out.push_str(&format!(
653                "{},{},{},{},{},{},{},",
654                e.grid_dim.0,
655                e.grid_dim.1,
656                e.grid_dim.2,
657                e.block_dim.0,
658                e.block_dim.1,
659                e.block_dim.2,
660                e.shared_memory_bytes,
661            ));
662            csv_opt_u32(&mut out, e.register_count);
663            out.push(',');
664            csv_opt_f64(&mut out, e.elapsed_ms);
665            out.push(',');
666            csv_opt_f64(&mut out, e.achieved_occupancy);
667            out.push(',');
668            csv_opt_f64(&mut out, e.theoretical_occupancy);
669            out.push('\n');
670        }
671        out
672    }
673
674    /// Exports telemetry entries in Chrome `chrome://tracing` JSON format.
675    ///
676    /// Each kernel launch becomes a duration event (`ph: "X"`).
677    /// Launches without timing data use a duration of 0.
678    #[must_use]
679    pub fn to_chrome_trace(entries: &[LaunchTelemetry]) -> String {
680        let mut out = String::from("{\"traceEvents\":[\n");
681        let mut ts_us = 0.0_f64; // cumulative timestamp in microseconds
682        for (i, e) in entries.iter().enumerate() {
683            let dur_us = e.elapsed_ms.unwrap_or(0.0) * 1000.0;
684            out.push_str(&format!(
685                "  {{\"name\":\"{}\",\"cat\":\"gpu\",\"ph\":\"X\",\
686                 \"ts\":{:.3},\"dur\":{:.3},\"pid\":1,\"tid\":1,\
687                 \"args\":{{\"grid\":\"{},{},{}\",\"block\":\"{},{},{}\",\
688                 \"smem\":{}",
689                json_escape_str(&e.kernel_name),
690                ts_us,
691                dur_us,
692                e.grid_dim.0,
693                e.grid_dim.1,
694                e.grid_dim.2,
695                e.block_dim.0,
696                e.block_dim.1,
697                e.block_dim.2,
698                e.shared_memory_bytes,
699            ));
700            if let Some(regs) = e.register_count {
701                out.push_str(&format!(",\"regs\":{regs}"));
702            }
703            if let Some(occ) = e.achieved_occupancy {
704                out.push_str(&format!(",\"occupancy\":{occ:.4}"));
705            }
706            out.push_str("}}");
707            if i + 1 < entries.len() {
708                out.push(',');
709            }
710            out.push('\n');
711            ts_us += dur_us;
712        }
713        out.push_str("]}\n");
714        out
715    }
716}
717
718// ---------------------------------------------------------------------------
719// JSON / CSV helpers
720// ---------------------------------------------------------------------------
721
/// Escapes `s` for use inside a JSON string literal (without the quotes).
///
/// Backslash and double quote are backslash-escaped, the common control
/// characters use their short forms (`\n`, `\r`, `\t`, `\b`, `\f`), and any
/// other character below U+0020 is written as `\u00XX`, since JSON forbids
/// raw control characters inside strings. All other characters pass through.
fn json_escape_str(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\\' => escaped.push_str("\\\\"),
            '"' => escaped.push_str("\\\""),
            '\n' => escaped.push_str("\\n"),
            '\r' => escaped.push_str("\\r"),
            '\t' => escaped.push_str("\\t"),
            '\u{08}' => escaped.push_str("\\b"),
            '\u{0C}' => escaped.push_str("\\f"),
            c if c < '\u{20}' => escaped.push_str(&format!("\\u{:04x}", u32::from(c))),
            c => escaped.push(c),
        }
    }
    escaped
}
729
730fn json_field_str(out: &mut String, key: &str, val: &str) {
731    out.push_str(&format!("    \"{key}\": \"{}\",\n", json_escape_str(val)));
732}
733
/// Appends an indented `"key": <number|null>,` line to `out`.
///
/// `None` is rendered as the JSON literal `null`; the line ends with a
/// comma and newline.
fn json_field_opt_u32(out: &mut String, key: &str, val: Option<u32>) {
    let rendered = val.map_or_else(|| String::from("null"), |number| number.to_string());
    out.push_str(&format!("    \"{key}\": {rendered},\n"));
}
740
/// Appends an indented `"key": <number|null>,` line to `out`.
///
/// `None` is rendered as `null`. Non-finite values (NaN, ±infinity) are also
/// rendered as `null`, because JSON has no token for them and writing
/// `NaN`/`inf` would make the whole document unparseable. The line ends with
/// a comma and newline.
fn json_field_opt_f64(out: &mut String, key: &str, val: Option<f64>) {
    match val.filter(|v| v.is_finite()) {
        Some(v) => out.push_str(&format!("    \"{key}\": {v},\n")),
        None => out.push_str(&format!("    \"{key}\": null,\n")),
    }
}
747
/// Appends an indented `"key": <number|null>` line to `out` with no trailing
/// comma, for the final field of a JSON object.
///
/// `None` is rendered as `null`. Non-finite values (NaN, ±infinity) are also
/// rendered as `null`, because JSON has no token for them.
fn json_field_opt_f64_last(out: &mut String, key: &str, val: Option<f64>) {
    match val.filter(|v| v.is_finite()) {
        Some(v) => out.push_str(&format!("    \"{key}\": {v}\n")),
        None => out.push_str(&format!("    \"{key}\": null\n")),
    }
}
754
/// Returns `s` as a CSV cell, quoting it when necessary.
///
/// A value containing a comma, a double quote, or a line break (`\n` or
/// `\r`) is wrapped in double quotes with embedded quotes doubled; any other
/// value is returned unchanged. Carriage returns are included because a bare
/// `\r` is treated as a record separator by many CSV readers.
fn csv_escape(s: &str) -> String {
    if s.contains([',', '"', '\n', '\r']) {
        format!("\"{}\"", s.replace('"', "\"\""))
    } else {
        s.to_owned()
    }
}
762
/// Appends the decimal form of `val` to `out`; `None` appends nothing,
/// leaving an empty CSV cell.
fn csv_opt_u32(out: &mut String, val: Option<u32>) {
    out.extend(val.map(|number| number.to_string()));
}
768
/// Appends the `Display` form of `val` to `out`; `None` appends nothing,
/// leaving an empty CSV cell.
fn csv_opt_f64(out: &mut String, val: Option<f64>) {
    out.extend(val.map(|number| number.to_string()));
}
774
775// ---------------------------------------------------------------------------
776// Occupancy estimation
777// ---------------------------------------------------------------------------
778
779/// Estimates theoretical occupancy for a kernel launch configuration.
780///
781/// The occupancy is the ratio of active warps to the maximum possible
782/// warps on a streaming multiprocessor. This depends on:
783///
784/// - `block_size`: threads per block
785/// - `registers_per_thread`: registers consumed by each thread
786/// - `shared_mem`: dynamic shared memory per block in bytes
787/// - `sm_version`: target GPU architecture
788///
789/// Returns a value in the range `0.0..=1.0`.
790///
791/// # Example
792///
793/// ```
794/// use oxicuda_launch::telemetry::{estimate_occupancy, SmVersion};
795///
796/// let occ = estimate_occupancy(256, 32, 0, SmVersion::Sm80);
797/// assert!(occ > 0.0 && occ <= 1.0);
798/// ```
799#[must_use]
800pub fn estimate_occupancy(
801    block_size: u32,
802    registers_per_thread: u32,
803    shared_mem: u32,
804    sm_version: SmVersion,
805) -> f64 {
806    if block_size == 0 {
807        return 0.0;
808    }
809
810    let warp_size = sm_version.warp_size();
811    let max_warps = sm_version.max_warps_per_sm();
812    let max_blocks = sm_version.max_blocks_per_sm();
813    let regs_per_sm = sm_version.registers_per_sm();
814    let max_smem = sm_version.max_shared_mem_per_sm();
815    let reg_granularity = sm_version.register_alloc_granularity();
816    let smem_granularity = sm_version.shared_mem_alloc_granularity();
817
818    // Warps per block
819    let warps_per_block = block_size.div_ceil(warp_size);
820
821    // --- Register limit ---
822    let regs_per_thread = if registers_per_thread == 0 {
823        1 // must use at least 1 register
824    } else {
825        registers_per_thread
826    };
827    // Round up to allocation granularity
828    let regs_alloc = regs_per_thread.div_ceil(reg_granularity) * reg_granularity;
829    let regs_per_warp = regs_alloc * warp_size;
830    let warps_limited_by_regs = regs_per_sm.checked_div(regs_per_warp).unwrap_or(max_warps);
831
832    // --- Shared memory limit ---
833    let smem_per_block = if shared_mem == 0 {
834        0
835    } else {
836        shared_mem.div_ceil(smem_granularity) * smem_granularity
837    };
838    let blocks_limited_by_smem = max_smem.checked_div(smem_per_block).unwrap_or(max_blocks);
839
840    // --- Block limit ---
841    let blocks_by_warps = warps_limited_by_regs
842        .checked_div(warps_per_block)
843        .unwrap_or(max_blocks);
844
845    let active_blocks = max_blocks.min(blocks_by_warps).min(blocks_limited_by_smem);
846
847    let active_warps = active_blocks * warps_per_block;
848    let occupancy = active_warps as f64 / max_warps as f64;
849
850    occupancy.clamp(0.0, 1.0)
851}
852
853// ---------------------------------------------------------------------------
854// Tests
855// ---------------------------------------------------------------------------
856
#[cfg(test)]
mod tests {
    //! Unit tests for the telemetry module: record construction, the
    //! collector, summary aggregation, the exporters, occupancy estimation,
    //! and the `Display` implementations.

    use super::*;

    /// Returns `true` when `actual` is strictly within `f64::EPSILON` of `expected`.
    fn approx(actual: f64, expected: f64) -> bool {
        (actual - expected).abs() < f64::EPSILON
    }

    /// Builds the minimal `"k"` launch (grid `(1, 1, 1)`, block `(64, 1, 1)`)
    /// that the collector tests record repeatedly.
    fn tiny_launch() -> LaunchTelemetry {
        LaunchTelemetry::new("k", (1, 1, 1), (64, 1, 1))
    }

    // -- LaunchTelemetry construction and builder --

    /// `new` stores the name and dimensions, leaves shared memory at zero,
    /// and leaves every optional metric unset.
    #[test]
    fn telemetry_new_defaults() {
        let launch = LaunchTelemetry::new("kern", (4, 1, 1), (256, 1, 1));

        assert_eq!(launch.kernel_name, "kern");
        assert_eq!(launch.grid_dim, (4, 1, 1));
        assert_eq!(launch.block_dim, (256, 1, 1));
        assert_eq!(launch.shared_memory_bytes, 0);

        assert!(launch.register_count.is_none());
        assert!(launch.elapsed_ms.is_none());
        assert!(launch.achieved_occupancy.is_none());
        assert!(launch.theoretical_occupancy.is_none());
    }

    /// Each `with_*` builder stores the value it was given.
    #[test]
    fn telemetry_builder_methods() {
        let launch = LaunchTelemetry::new("kern", (1, 1, 1), (128, 1, 1))
            .with_shared_memory(4096)
            .with_register_count(32)
            .with_elapsed_ms(1.5)
            .with_achieved_occupancy(0.75)
            .with_theoretical_occupancy(0.80);

        assert_eq!(launch.shared_memory_bytes, 4096);
        assert_eq!(launch.register_count, Some(32));
        assert!(approx(launch.elapsed_ms.unwrap_or(0.0), 1.5));
        assert!(approx(launch.achieved_occupancy.unwrap_or(0.0), 0.75));
        assert!(approx(launch.theoretical_occupancy.unwrap_or(0.0), 0.80));
    }

    /// `total_threads` is the product of all grid and block dimensions.
    #[test]
    fn telemetry_total_threads() {
        let launch = LaunchTelemetry::new("k", (4, 2, 1), (16, 16, 1));
        // 4 * 2 blocks, each of 16 * 16 threads.
        assert_eq!(launch.total_threads(), 2048);
    }

    /// The `Display` output mentions the name, elapsed time, register count,
    /// and achieved occupancy as a percentage.
    #[test]
    fn telemetry_display() {
        let rendered = LaunchTelemetry::new("add", (4, 1, 1), (256, 1, 1))
            .with_elapsed_ms(0.5)
            .with_register_count(24)
            .with_achieved_occupancy(0.85)
            .to_string();

        assert!(rendered.contains("add"));
        assert!(rendered.contains("0.500ms"));
        assert!(rendered.contains("regs=24"));
        assert!(rendered.contains("85.0%"));
    }

    // -- TelemetryCollector --

    /// A new collector is empty; recording one entry makes `len` 1.
    #[test]
    fn collector_record_and_len() {
        let mut collector = TelemetryCollector::new(100);
        assert!(collector.is_empty());

        collector.record(tiny_launch());
        assert_eq!(collector.len(), 1);
        assert!(!collector.is_empty());
    }

    /// Collectors start enabled; entries recorded while disabled are not
    /// stored, and recording works again after `enable`.
    #[test]
    fn collector_enable_disable() {
        let mut collector = TelemetryCollector::new(100);
        assert!(collector.is_enabled());

        collector.disable();
        assert!(!collector.is_enabled());
        collector.record(tiny_launch());
        // Nothing is stored while the collector is disabled.
        assert_eq!(collector.len(), 0);

        collector.enable();
        collector.record(tiny_launch());
        assert_eq!(collector.len(), 1);
    }

    /// Recording ten entries into a collector created with a limit of three
    /// leaves exactly three stored.
    #[test]
    fn collector_max_entries_cap() {
        let mut collector = TelemetryCollector::new(3);
        for launch in std::iter::repeat_with(tiny_launch).take(10) {
            collector.record(launch);
        }
        assert_eq!(collector.len(), 3);
    }

    /// `clear` empties the collector.
    #[test]
    fn collector_clear() {
        let mut collector = TelemetryCollector::new(100);
        collector.record(tiny_launch());
        collector.clear();
        assert!(collector.is_empty());
    }

    // -- TelemetrySummary --

    /// Summarising an empty collector yields zero launches, zero total time,
    /// no hottest kernel, and no per-kernel stats.
    #[test]
    fn summary_empty() {
        let summary = TelemetryCollector::new(100).summary();

        assert_eq!(summary.total_launches, 0);
        assert!(approx(summary.total_gpu_time_ms, 0.0));
        assert!(summary.hottest_kernel.is_none());
        assert!(summary.per_kernel_stats.is_empty());
    }

    /// Two launches of the same kernel are aggregated into a single stats
    /// entry with the expected total/avg/min/max time and average occupancy.
    #[test]
    fn summary_single_kernel() {
        let mut collector = TelemetryCollector::new(100);

        let first = LaunchTelemetry::new("add", (1, 1, 1), (256, 1, 1))
            .with_elapsed_ms(1.0)
            .with_achieved_occupancy(0.8);
        collector.record(first);

        let second = LaunchTelemetry::new("add", (1, 1, 1), (256, 1, 1))
            .with_elapsed_ms(3.0)
            .with_achieved_occupancy(0.9);
        collector.record(second);

        let summary = collector.summary();
        assert_eq!(summary.total_launches, 2);
        assert!(approx(summary.total_gpu_time_ms, 4.0));
        assert!(approx(summary.avg_gpu_time_ms, 2.0));
        assert!(approx(summary.min_gpu_time_ms, 1.0));
        assert!(approx(summary.max_gpu_time_ms, 3.0));
        // Looser tolerance here: the mean of 0.8 and 0.9 is not exactly 0.85.
        assert!((summary.avg_occupancy - 0.85).abs() < 1e-9);
        assert_eq!(summary.hottest_kernel.as_deref(), Some("add"));
        assert_eq!(summary.per_kernel_stats.len(), 1);

        let only = &summary.per_kernel_stats[0];
        assert_eq!(only.launch_count, 2);
    }

    /// Launches are grouped by kernel name; the kernel with the larger total
    /// time is reported as hottest and appears first in the stats.
    #[test]
    fn summary_per_kernel_aggregation() {
        let mut collector = TelemetryCollector::new(100);
        let matmul_a = LaunchTelemetry::new("matmul", (1, 1, 1), (256, 1, 1)).with_elapsed_ms(10.0);
        let add = LaunchTelemetry::new("add", (1, 1, 1), (128, 1, 1)).with_elapsed_ms(1.0);
        let matmul_b = LaunchTelemetry::new("matmul", (1, 1, 1), (256, 1, 1)).with_elapsed_ms(12.0);
        collector.record(matmul_a);
        collector.record(add);
        collector.record(matmul_b);

        let summary = collector.summary();
        assert_eq!(summary.total_launches, 3);
        // matmul accumulates 22ms against 1ms for add, so it is the hottest.
        assert_eq!(summary.hottest_kernel.as_deref(), Some("matmul"));
        assert_eq!(summary.per_kernel_stats.len(), 2);

        // Stats are expected in descending order of total time: matmul first.
        let top = &summary.per_kernel_stats[0];
        assert_eq!(top.kernel_name, "matmul");
        assert_eq!(top.launch_count, 2);
        assert!(approx(top.total_time_ms, 22.0));
    }

    /// The summary's `Display` output includes its heading, the launch
    /// count, and the occupancy as a percentage.
    #[test]
    fn summary_display() {
        let mut collector = TelemetryCollector::new(100);
        let launch = LaunchTelemetry::new("k", (1, 1, 1), (256, 1, 1))
            .with_elapsed_ms(2.0)
            .with_achieved_occupancy(0.5);
        collector.record(launch);

        let rendered = collector.summary().to_string();
        assert!(rendered.contains("Telemetry Summary"));
        assert!(rendered.contains("Total launches: 1"));
        assert!(rendered.contains("50.0%"));
    }

    // -- TelemetryExporter: JSON --

    /// JSON export starts with an array bracket, writes set fields with
    /// their values, and writes an unset metric as `null`.
    #[test]
    fn export_json() {
        let launch = LaunchTelemetry::new("kern", (4, 1, 1), (256, 1, 1))
            .with_elapsed_ms(0.5)
            .with_register_count(32);
        let entries = vec![launch];

        let json = TelemetryExporter::to_json(&entries);
        assert!(json.starts_with('['));
        assert!(json.contains("\"kernel_name\": \"kern\""));
        assert!(json.contains("\"grid_dim\": [4, 1, 1]"));
        assert!(json.contains("\"elapsed_ms\": 0.5"));
        assert!(json.contains("\"register_count\": 32"));
        assert!(json.contains("\"achieved_occupancy\": null"));
    }

    // -- TelemetryExporter: CSV --

    /// CSV export of one entry produces a header line plus one data row that
    /// begins with the kernel name.
    #[test]
    fn export_csv() {
        let launch = LaunchTelemetry::new("kern", (2, 1, 1), (128, 1, 1)).with_elapsed_ms(1.0);
        let entries = vec![launch];

        let csv = TelemetryExporter::to_csv(&entries);
        let rows = csv.lines().collect::<Vec<&str>>();
        assert_eq!(rows.len(), 2); // header + 1 data row

        let (header, data) = (rows[0], rows[1]);
        assert!(header.starts_with("kernel_name,"));
        assert!(data.starts_with("kern,"));
        assert!(data.contains("128"));
    }

    // -- TelemetryExporter: Chrome trace --

    /// Chrome trace export contains the `traceEvents` key, one named event
    /// per kernel, and the `"X"` phase and `"gpu"` category markers.
    #[test]
    fn export_chrome_trace() {
        let first = LaunchTelemetry::new("k1", (1, 1, 1), (256, 1, 1)).with_elapsed_ms(1.0);
        let second = LaunchTelemetry::new("k2", (2, 1, 1), (128, 1, 1)).with_elapsed_ms(2.0);
        let entries = vec![first, second];

        let trace = TelemetryExporter::to_chrome_trace(&entries);
        assert!(trace.contains("\"traceEvents\""));
        assert!(trace.contains("\"name\":\"k1\""));
        assert!(trace.contains("\"name\":\"k2\""));
        assert!(trace.contains("\"ph\":\"X\""));
        assert!(trace.contains("\"cat\":\"gpu\""));
    }

    // -- Occupancy estimation --

    /// A typical configuration yields an occupancy in `(0, 1]`.
    #[test]
    fn occupancy_basic() {
        let estimate = estimate_occupancy(256, 32, 0, SmVersion::Sm80);
        assert!(estimate > 0.0);
        assert!(estimate <= 1.0);
    }

    /// A block size of zero yields an occupancy of zero.
    #[test]
    fn occupancy_zero_block() {
        let estimate = estimate_occupancy(0, 32, 0, SmVersion::Sm80);
        assert!(approx(estimate, 0.0));
    }

    /// With everything else equal, 128 registers gives a strictly lower
    /// estimate than 16 registers.
    #[test]
    fn occupancy_high_registers_lowers_occupancy() {
        let heavy = estimate_occupancy(256, 128, 0, SmVersion::Sm80);
        let light = estimate_occupancy(256, 16, 0, SmVersion::Sm80);
        assert!(light > heavy);
    }

    /// With everything else equal, a large shared-memory request never gives
    /// a higher estimate than requesting none.
    #[test]
    fn occupancy_large_shared_mem_lowers_occupancy() {
        let with_smem = estimate_occupancy(256, 32, 100_000, SmVersion::Sm80);
        let without_smem = estimate_occupancy(256, 32, 0, SmVersion::Sm80);
        assert!(without_smem >= with_smem);
    }

    /// Every listed architecture yields an estimate in `(0, 1]` for the same
    /// launch configuration.
    #[test]
    fn occupancy_sm_versions() {
        let architectures = [
            SmVersion::Sm75,
            SmVersion::Sm80,
            SmVersion::Sm86,
            SmVersion::Sm89,
            SmVersion::Sm90,
            SmVersion::Sm100,
            SmVersion::Sm120,
        ];

        for sm in architectures {
            let occ = estimate_occupancy(128, 32, 0, sm);
            assert!(occ > 0.0, "occupancy should be positive for {sm}");
            assert!(occ <= 1.0, "occupancy should be <= 1.0 for {sm}");
        }
    }

    // -- SmVersion --

    /// `SmVersion` displays as `sm_<nn>`.
    #[test]
    fn sm_version_display() {
        assert_eq!(SmVersion::Sm80.to_string(), "sm_80");
        assert_eq!(SmVersion::Sm90.to_string(), "sm_90");
    }

    // -- KernelStats Display --

    /// The `KernelStats` display output includes the kernel name, the launch
    /// count, and the average occupancy as a percentage.
    #[test]
    fn kernel_stats_display() {
        let stats = KernelStats {
            kernel_name: "matmul".to_owned(),
            launch_count: 5,
            total_time_ms: 10.0,
            avg_time_ms: 2.0,
            min_time_ms: 1.0,
            max_time_ms: 4.0,
            avg_occupancy: 0.75,
        };

        let rendered = stats.to_string();
        assert!(rendered.contains("matmul"));
        assert!(rendered.contains("5 launches"));
        assert!(rendered.contains("75.0%"));
    }
}