Skip to main content

vyre_driver/
residency.rs

1//! Backend-neutral resident-resource reuse telemetry.
2//!
3//! Resident graph reuse is a cross-backend performance invariant, not a CUDA
4//! detail. CUDA planners, WGPU resident caches, and higher-level users need
5//! to report cold uploads and warm resident reuses with the same vocabulary
6//! so upload pressure can be compared without backend-specific adapters.
7
/// Running tally of resident-graph cache outcomes: how often a graph had to
/// be uploaded cold versus reused warm, and the byte volume on each side.
///
/// The type is a plain `Copy` value; `Default` yields the all-zero tally.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct ResidentGraphReuseTelemetry {
    /// Number of cache misses, each of which forced a host-to-device upload.
    pub cold_uploads: u64,
    /// Number of cache hits served by a device graph that was already live.
    pub warm_reuses: u64,
    /// Total graph bytes transferred because of cold misses.
    pub upload_bytes: u64,
    /// Total graph upload bytes that warm reuses made unnecessary.
    pub avoided_upload_bytes: u64,
}
20
21impl ResidentGraphReuseTelemetry {
22    /// Build telemetry from explicit counters.
23    #[must_use]
24    pub const fn from_counters(
25        cold_uploads: u64,
26        warm_reuses: u64,
27        upload_bytes: u64,
28        avoided_upload_bytes: u64,
29    ) -> Self {
30        Self {
31            cold_uploads,
32            warm_reuses,
33            upload_bytes,
34            avoided_upload_bytes,
35        }
36    }
37
38    /// Telemetry for one cold graph upload.
39    #[must_use]
40    pub const fn cold_upload(upload_bytes: u64) -> Self {
41        Self {
42            cold_uploads: 1,
43            warm_reuses: 0,
44            upload_bytes,
45            avoided_upload_bytes: 0,
46        }
47    }
48
49    /// Telemetry for one warm resident graph reuse.
50    #[must_use]
51    pub const fn warm_reuse(avoided_upload_bytes: u64) -> Self {
52        Self {
53            cold_uploads: 0,
54            warm_reuses: 1,
55            upload_bytes: 0,
56            avoided_upload_bytes,
57        }
58    }
59
60    /// Return true when no resident-graph reuse event has been recorded.
61    #[must_use]
62    pub const fn is_empty(self) -> bool {
63        self.cold_uploads == 0
64            && self.warm_reuses == 0
65            && self.upload_bytes == 0
66            && self.avoided_upload_bytes == 0
67    }
68
69    /// Merge two telemetry snapshots with checked arithmetic.
70    pub fn checked_add(self, rhs: Self) -> Result<Self, ResidentGraphReuseTelemetryError> {
71        Ok(Self {
72            cold_uploads: crate::accounting::checked_add_u64_value(
73                self.cold_uploads,
74                rhs.cold_uploads,
75                ResidentGraphReuseTelemetryError::CounterOverflow {
76                    counter: "cold_uploads",
77                },
78            )?,
79            warm_reuses: crate::accounting::checked_add_u64_value(
80                self.warm_reuses,
81                rhs.warm_reuses,
82                ResidentGraphReuseTelemetryError::CounterOverflow {
83                    counter: "warm_reuses",
84                },
85            )?,
86            upload_bytes: crate::accounting::checked_add_u64_value(
87                self.upload_bytes,
88                rhs.upload_bytes,
89                ResidentGraphReuseTelemetryError::ByteCounterOverflow {
90                    counter: "upload_bytes",
91                },
92            )?,
93            avoided_upload_bytes: crate::accounting::checked_add_u64_value(
94                self.avoided_upload_bytes,
95                rhs.avoided_upload_bytes,
96                ResidentGraphReuseTelemetryError::ByteCounterOverflow {
97                    counter: "avoided_upload_bytes",
98                },
99            )?,
100        })
101    }
102
103    /// Return the telemetry delta observed after an earlier monotonic snapshot.
104    pub fn checked_delta_since(
105        self,
106        earlier: Self,
107    ) -> Result<Self, ResidentGraphReuseTelemetryError> {
108        Ok(Self {
109            cold_uploads: crate::accounting::checked_sub_u64_value(
110                self.cold_uploads,
111                earlier.cold_uploads,
112                ResidentGraphReuseTelemetryError::CounterUnderflow {
113                    counter: "cold_uploads",
114                },
115            )?,
116            warm_reuses: crate::accounting::checked_sub_u64_value(
117                self.warm_reuses,
118                earlier.warm_reuses,
119                ResidentGraphReuseTelemetryError::CounterUnderflow {
120                    counter: "warm_reuses",
121                },
122            )?,
123            upload_bytes: crate::accounting::checked_sub_u64_value(
124                self.upload_bytes,
125                earlier.upload_bytes,
126                ResidentGraphReuseTelemetryError::ByteCounterUnderflow {
127                    counter: "upload_bytes",
128                },
129            )?,
130            avoided_upload_bytes: crate::accounting::checked_sub_u64_value(
131                self.avoided_upload_bytes,
132                earlier.avoided_upload_bytes,
133                ResidentGraphReuseTelemetryError::ByteCounterUnderflow {
134                    counter: "avoided_upload_bytes",
135                },
136            )?,
137        })
138    }
139
140    /// Record one cold graph upload in place.
141    pub fn record_cold_upload(
142        &mut self,
143        upload_bytes: u64,
144    ) -> Result<(), ResidentGraphReuseTelemetryError> {
145        *self = (*self).checked_add(Self::cold_upload(upload_bytes))?;
146        Ok(())
147    }
148
149    /// Record one warm resident graph reuse in place.
150    pub fn record_warm_reuse(
151        &mut self,
152        avoided_upload_bytes: u64,
153    ) -> Result<(), ResidentGraphReuseTelemetryError> {
154        *self = (*self).checked_add(Self::warm_reuse(avoided_upload_bytes))?;
155        Ok(())
156    }
157
158    /// Record several cold graph uploads in place.
159    pub fn record_cold_uploads(
160        &mut self,
161        cold_uploads: u64,
162        upload_bytes: u64,
163    ) -> Result<(), ResidentGraphReuseTelemetryError> {
164        *self = (*self).checked_add(Self::from_counters(cold_uploads, 0, upload_bytes, 0))?;
165        Ok(())
166    }
167
168    /// Record several warm resident graph reuses in place.
169    pub fn record_warm_reuses(
170        &mut self,
171        warm_reuses: u64,
172        avoided_upload_bytes: u64,
173    ) -> Result<(), ResidentGraphReuseTelemetryError> {
174        *self =
175            (*self).checked_add(Self::from_counters(0, warm_reuses, 0, avoided_upload_bytes))?;
176        Ok(())
177    }
178}
179
/// Failure modes of resident graph reuse telemetry arithmetic.
///
/// Every variant carries the name of the field that could not be updated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ResidentGraphReuseTelemetryError {
    /// An event-count field would no longer fit in `u64`.
    CounterOverflow {
        /// Name of the count field that overflowed.
        counter: &'static str,
    },
    /// An event-count field was smaller in the later of two monotonic snapshots.
    CounterUnderflow {
        /// Name of the count field that moved backward.
        counter: &'static str,
    },
    /// A byte-total field would no longer fit in `u64`.
    ByteCounterOverflow {
        /// Name of the byte field that overflowed.
        counter: &'static str,
    },
    /// A byte-total field was smaller in the later of two monotonic snapshots.
    ByteCounterUnderflow {
        /// Name of the byte field that moved backward.
        counter: &'static str,
    },
}
204
205impl std::fmt::Display for ResidentGraphReuseTelemetryError {
206    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
207        match self {
208            Self::CounterOverflow { counter } => write!(
209                f,
210                "resident graph reuse telemetry counter {counter} overflowed u64. Fix: rotate the telemetry window before resident graph reuse accounting saturates."
211            ),
212            Self::CounterUnderflow { counter } => write!(
213                f,
214                "resident graph reuse telemetry counter {counter} moved backward between snapshots. Fix: rebuild the resident owner; cache telemetry must be monotonic."
215            ),
216            Self::ByteCounterOverflow { counter } => write!(
217                f,
218                "resident graph reuse telemetry byte counter {counter} overflowed u64. Fix: shard the resident graph workload or rotate the telemetry window before byte accounting saturates."
219            ),
220            Self::ByteCounterUnderflow { counter } => write!(
221                f,
222                "resident graph reuse telemetry byte counter {counter} moved backward between snapshots. Fix: rebuild the resident owner; cache telemetry must be monotonic."
223            ),
224        }
225    }
226}
227
// Marker impl: every `Error` method keeps its default, so `source()` is `None`.
impl std::error::Error for ResidentGraphReuseTelemetryError {}
229
#[cfg(test)]
mod tests {
    //! Unit tests for snapshot merging, snapshot deltas, the in-place
    //! recorders, and the error messages.

    use super::{ResidentGraphReuseTelemetry, ResidentGraphReuseTelemetryError};

    #[test]
    fn checked_delta_since_returns_monotonic_snapshot_delta() {
        let earlier = ResidentGraphReuseTelemetry::from_counters(1, 2, 64, 128);
        let later = ResidentGraphReuseTelemetry::from_counters(4, 7, 256, 1_024);

        assert_eq!(
            later.checked_delta_since(earlier),
            Ok(ResidentGraphReuseTelemetry::from_counters(3, 5, 192, 896))
        );
    }

    #[test]
    fn checked_delta_since_rejects_counter_regression() {
        let earlier = ResidentGraphReuseTelemetry::from_counters(2, 2, 64, 128);
        let later = ResidentGraphReuseTelemetry::from_counters(1, 2, 64, 128);

        assert_eq!(
            later.checked_delta_since(earlier),
            Err(ResidentGraphReuseTelemetryError::CounterUnderflow {
                counter: "cold_uploads"
            })
        );
    }

    #[test]
    fn checked_delta_since_rejects_byte_counter_regression() {
        let earlier = ResidentGraphReuseTelemetry::from_counters(2, 2, 128, 128);
        let later = ResidentGraphReuseTelemetry::from_counters(2, 2, 64, 128);

        assert_eq!(
            later.checked_delta_since(earlier),
            Err(ResidentGraphReuseTelemetryError::ByteCounterUnderflow {
                counter: "upload_bytes"
            })
        );
    }

    #[test]
    fn checked_add_merges_snapshots_field_by_field() {
        let lhs = ResidentGraphReuseTelemetry::from_counters(1, 2, 64, 128);
        let rhs = ResidentGraphReuseTelemetry::from_counters(3, 5, 192, 896);

        assert_eq!(
            lhs.checked_add(rhs),
            Ok(ResidentGraphReuseTelemetry::from_counters(4, 7, 256, 1_024))
        );
    }

    #[test]
    fn checked_add_rejects_counter_overflow() {
        let saturated = ResidentGraphReuseTelemetry::from_counters(u64::MAX, 0, 0, 0);

        assert_eq!(
            saturated.checked_add(ResidentGraphReuseTelemetry::cold_upload(0)),
            Err(ResidentGraphReuseTelemetryError::CounterOverflow {
                counter: "cold_uploads"
            })
        );
    }

    #[test]
    fn checked_add_rejects_byte_counter_overflow() {
        let saturated = ResidentGraphReuseTelemetry::from_counters(0, 0, 0, u64::MAX);

        assert_eq!(
            saturated.checked_add(ResidentGraphReuseTelemetry::warm_reuse(1)),
            Err(ResidentGraphReuseTelemetryError::ByteCounterOverflow {
                counter: "avoided_upload_bytes"
            })
        );
    }

    #[test]
    fn record_helpers_accumulate_in_place() {
        let mut telemetry = ResidentGraphReuseTelemetry::default();
        assert!(telemetry.is_empty());

        assert_eq!(telemetry.record_cold_upload(64), Ok(()));
        assert_eq!(telemetry.record_warm_reuse(64), Ok(()));
        assert_eq!(telemetry.record_cold_uploads(2, 128), Ok(()));
        assert_eq!(telemetry.record_warm_reuses(3, 192), Ok(()));

        assert!(!telemetry.is_empty());
        assert_eq!(
            telemetry,
            ResidentGraphReuseTelemetry::from_counters(3, 4, 192, 256)
        );
    }

    #[test]
    fn failed_record_leaves_telemetry_unchanged() {
        let before = ResidentGraphReuseTelemetry::from_counters(1, 0, u64::MAX, 0);
        let mut telemetry = before;

        assert_eq!(
            telemetry.record_cold_upload(1),
            Err(ResidentGraphReuseTelemetryError::ByteCounterOverflow {
                counter: "upload_bytes"
            })
        );
        // The failed merge must not have been committed.
        assert_eq!(telemetry, before);
    }

    #[test]
    fn display_names_counter_and_remediation() {
        let message = ResidentGraphReuseTelemetryError::CounterOverflow {
            counter: "warm_reuses",
        }
        .to_string();

        assert!(message.contains("counter warm_reuses overflowed u64"));
        assert!(message.contains("Fix:"));
    }
}