Skip to main content

gam_gpu/
profile.rs

1use std::collections::VecDeque;
2use std::sync::{Mutex, OnceLock};
3
/// Upper bound on the number of kernel stats retained in the ring buffer
/// (1024 entries); older entries are evicted once this many are stored.
const MAX_STATS: usize = 1 << 10;
5
/// One recorded kernel sample: a static label plus caller-supplied size,
/// cost-estimate and timing fields.
///
/// Every field is filled in by the caller; this type performs no validation.
#[derive(Debug, Clone, Default)]
pub struct KernelStat {
    /// Static label identifying the kernel.
    pub name: &'static str,
    /// Caller-supplied problem dimension.
    /// NOTE(review): exact meaning is kernel-specific — confirm at call sites.
    pub n: usize,
    /// Caller-supplied problem dimension (see note on `n`).
    pub p: usize,
    /// Caller-supplied problem dimension (see note on `n`).
    pub k: usize,
    /// Presumably the number of non-zero entries — TODO confirm.
    pub nnz: usize,
    /// Estimated operation count, per the field name; units are not
    /// established in this file.
    pub flops_est: usize,
    /// Estimated bytes moved, per the field name.
    pub bytes_est: usize,
    /// CPU-side time; presumably milliseconds, per the `_ms` suffix.
    pub cpu_ms: f64,
    /// GPU-side time when one was supplied, otherwise `None`.
    pub gpu_ms: Option<f64>,
}
18
19#[derive(Clone, Debug, Default)]
20pub struct KernelStatsSnapshot {
21    pub stats: Vec<KernelStat>,
22}
23
24static STATS: OnceLock<Mutex<VecDeque<KernelStat>>> = OnceLock::new();
25
26fn stats() -> &'static Mutex<VecDeque<KernelStat>> {
27    STATS.get_or_init(|| Mutex::new(VecDeque::with_capacity(MAX_STATS)))
28}
29
30pub fn record(stat: KernelStat) {
31    if let Ok(mut guard) = stats().lock() {
32        if guard.len() == MAX_STATS {
33            guard.pop_front();
34        }
35        guard.push_back(stat);
36    }
37}
38
39pub fn snapshot() -> KernelStatsSnapshot {
40    if let Ok(guard) = stats().lock() {
41        KernelStatsSnapshot {
42            stats: guard.iter().cloned().collect(),
43        }
44    } else {
45        KernelStatsSnapshot::default()
46    }
47}
48
49pub fn clear() {
50    if let Ok(mut guard) = stats().lock() {
51        guard.clear();
52    }
53}
54
55// ---------------------------------------------------------------------------
56// GPU execution telemetry (issue #1017).
57//
58// The original `used_device: bool` could report `true` while the device had
59// silently declined the workload and the solve ran on the CPU. A boolean
60// cannot expose that: it carries no count of handles created, factorizations
61// run, kernels launched, or — critically — CPU fallbacks taken and why. These
62// per-thread counters make the resident solver's actual device activity
63// auditable, so a silent fallback shows up as `cpu_fallback_count > 0` with a
64// recorded reason rather than a lie. They are observability only and never
65// change any numerical result.
66// ---------------------------------------------------------------------------
67
68use std::cell::RefCell;
69
/// Per-thread record of what the GPU-resident solver actually did.
///
/// The numeric fields only grow between resets. Read the current values with
/// [`telemetry_snapshot`] and zero them with [`telemetry_reset`].
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct GpuExecutionTelemetry {
    /// Total bytes uploaded from host to device.
    pub h2d_bytes: usize,
    /// Total bytes read back from device to host.
    pub d2h_bytes: usize,
    /// Number of Cholesky / Schur factorizations performed on the device.
    pub factorization_count: usize,
    /// Number of cuBLAS / cuSOLVER / stream handles created.
    pub handle_creation_count: usize,
    /// Number of device kernel launches (per-row + border solves).
    pub kernel_launch_count: usize,
    /// Number of times a path that intended to use the device ran on the CPU
    /// instead.
    pub cpu_fallback_count: usize,
    /// Human-readable reason for each CPU fallback, in the order recorded.
    pub cpu_fallback_reasons: Vec<String>,
    /// Opaque identifier of the device context this thread last touched
    /// (e.g. the CUDA device ordinal); `0` when no device was used.
    ///
    /// NOTE(review): ordinal `0` is also a valid CUDA device, so `0` alone
    /// does not prove that no device was used — consult
    /// `handle_creation_count` as well.
    pub context_id: usize,
}
93
94thread_local! {
95    static EXECUTION_TELEMETRY: RefCell<GpuExecutionTelemetry> =
96        RefCell::new(GpuExecutionTelemetry::default());
97}
98
99/// Mutate the calling thread's execution telemetry in place.
100#[inline]
101pub fn telemetry_with<R>(f: impl FnOnce(&mut GpuExecutionTelemetry) -> R) -> R {
102    EXECUTION_TELEMETRY.with(|cell| f(&mut cell.borrow_mut()))
103}
104
105/// Record a host→device upload of `bytes`.
106#[inline]
107pub fn telemetry_record_h2d(bytes: usize) {
108    telemetry_with(|t| t.h2d_bytes += bytes);
109}
110
111/// Record a device→host readback of `bytes`.
112#[inline]
113pub fn telemetry_record_d2h(bytes: usize) {
114    telemetry_with(|t| t.d2h_bytes += bytes);
115}
116
117/// Record a device factorization (POTRF / Schur factor).
118#[inline]
119pub fn telemetry_record_factorization() {
120    telemetry_with(|t| t.factorization_count += 1);
121}
122
123/// Record creation of a device handle/stream and the context it bound.
124#[inline]
125pub fn telemetry_record_handle_creation(context_id: usize) {
126    telemetry_with(|t| {
127        t.handle_creation_count += 1;
128        t.context_id = context_id;
129    });
130}
131
132/// Record a device kernel launch.
133#[inline]
134pub fn telemetry_record_kernel_launch() {
135    telemetry_with(|t| t.kernel_launch_count += 1);
136}
137
138/// Record a CPU fallback together with the reason it happened. This is the
139/// counter that would have exposed the original silent-fallback bug.
140#[inline]
141pub fn telemetry_record_cpu_fallback(reason: impl Into<String>) {
142    telemetry_with(|t| {
143        t.cpu_fallback_count += 1;
144        t.cpu_fallback_reasons.push(reason.into());
145    });
146}
147
148/// Snapshot the calling thread's execution telemetry.
149#[must_use]
150pub fn telemetry_snapshot() -> GpuExecutionTelemetry {
151    telemetry_with(|t| t.clone())
152}
153
154/// Reset the calling thread's execution telemetry to zero.
155pub fn telemetry_reset() {
156    telemetry_with(|t| *t = GpuExecutionTelemetry::default());
157}