1use std::collections::VecDeque;
2use std::sync::{Mutex, OnceLock};
3
/// Upper bound on the number of [`KernelStat`] entries kept in the global
/// buffer. When the buffer already holds this many entries, `record` evicts
/// the oldest one before appending the new one.
const MAX_STATS: usize = 1024;
5
/// One recorded measurement for a single kernel invocation.
///
/// The meaning of the size fields is defined by whoever fills the struct in;
/// nothing in this module interprets them.
#[derive(Debug, Default, Clone)]
pub struct KernelStat {
    /// Static label identifying the kernel.
    pub name: &'static str,
    /// Size parameter `n` (presumably a problem dimension; confirm with callers).
    pub n: usize,
    /// Size parameter `p` (presumably a problem dimension; confirm with callers).
    pub p: usize,
    /// Size parameter `k` (presumably a problem dimension; confirm with callers).
    pub k: usize,
    /// Presumably the number of non-zero entries involved; confirm with callers.
    pub nnz: usize,
    /// Estimated floating-point operation count, as supplied by the caller.
    pub flops_est: usize,
    /// Estimated number of bytes moved, as supplied by the caller.
    pub bytes_est: usize,
    /// CPU-side time, in milliseconds judging by the field name.
    pub cpu_ms: f64,
    /// GPU-side time (milliseconds judging by the field name), or `None` when
    /// no GPU timing was captured.
    pub gpu_ms: Option<f64>,
}
18
19#[derive(Clone, Debug, Default)]
20pub struct KernelStatsSnapshot {
21 pub stats: Vec<KernelStat>,
22}
23
24static STATS: OnceLock<Mutex<VecDeque<KernelStat>>> = OnceLock::new();
25
26fn stats() -> &'static Mutex<VecDeque<KernelStat>> {
27 STATS.get_or_init(|| Mutex::new(VecDeque::with_capacity(MAX_STATS)))
28}
29
30pub fn record(stat: KernelStat) {
31 if let Ok(mut guard) = stats().lock() {
32 if guard.len() == MAX_STATS {
33 guard.pop_front();
34 }
35 guard.push_back(stat);
36 }
37}
38
39pub fn snapshot() -> KernelStatsSnapshot {
40 if let Ok(guard) = stats().lock() {
41 KernelStatsSnapshot {
42 stats: guard.iter().cloned().collect(),
43 }
44 } else {
45 KernelStatsSnapshot::default()
46 }
47}
48
49pub fn clear() {
50 if let Ok(mut guard) = stats().lock() {
51 guard.clear();
52 }
53}
54
55use std::cell::RefCell;
69
/// Counters describing GPU-related activity, accumulated through the
/// `telemetry_record_*` functions in this file.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
pub struct GpuExecutionTelemetry {
    /// Sum of the byte counts passed to `telemetry_record_h2d`
    /// (presumably host-to-device transfers).
    pub h2d_bytes: usize,
    /// Sum of the byte counts passed to `telemetry_record_d2h`
    /// (presumably device-to-host transfers).
    pub d2h_bytes: usize,
    /// Number of calls to `telemetry_record_factorization`.
    pub factorization_count: usize,
    /// Number of calls to `telemetry_record_handle_creation`.
    pub handle_creation_count: usize,
    /// Number of calls to `telemetry_record_kernel_launch`.
    pub kernel_launch_count: usize,
    /// Number of calls to `telemetry_record_cpu_fallback`.
    pub cpu_fallback_count: usize,
    /// Reason strings passed to `telemetry_record_cpu_fallback`, in call order.
    pub cpu_fallback_reasons: Vec<String>,
    /// Context id supplied by the most recent `telemetry_record_handle_creation` call.
    pub context_id: usize,
}
93
94thread_local! {
95 static EXECUTION_TELEMETRY: RefCell<GpuExecutionTelemetry> =
96 RefCell::new(GpuExecutionTelemetry::default());
97}
98
99#[inline]
101pub fn telemetry_with<R>(f: impl FnOnce(&mut GpuExecutionTelemetry) -> R) -> R {
102 EXECUTION_TELEMETRY.with(|cell| f(&mut cell.borrow_mut()))
103}
104
105#[inline]
107pub fn telemetry_record_h2d(bytes: usize) {
108 telemetry_with(|t| t.h2d_bytes += bytes);
109}
110
111#[inline]
113pub fn telemetry_record_d2h(bytes: usize) {
114 telemetry_with(|t| t.d2h_bytes += bytes);
115}
116
117#[inline]
119pub fn telemetry_record_factorization() {
120 telemetry_with(|t| t.factorization_count += 1);
121}
122
123#[inline]
125pub fn telemetry_record_handle_creation(context_id: usize) {
126 telemetry_with(|t| {
127 t.handle_creation_count += 1;
128 t.context_id = context_id;
129 });
130}
131
132#[inline]
134pub fn telemetry_record_kernel_launch() {
135 telemetry_with(|t| t.kernel_launch_count += 1);
136}
137
138#[inline]
141pub fn telemetry_record_cpu_fallback(reason: impl Into<String>) {
142 telemetry_with(|t| {
143 t.cpu_fallback_count += 1;
144 t.cpu_fallback_reasons.push(reason.into());
145 });
146}
147
148#[must_use]
150pub fn telemetry_snapshot() -> GpuExecutionTelemetry {
151 telemetry_with(|t| t.clone())
152}
153
154pub fn telemetry_reset() {
156 telemetry_with(|t| *t = GpuExecutionTelemetry::default());
157}