Skip to main content

entrenar/gpu/
guard.rs

//! VRAM Guard (GPU-SHARE-002).
//!
//! Pre-allocation check enforcing Contract C-VRAM-001:
//! `CudaTrainer::new()` MUST NOT allocate if budget exceeds available VRAM.
//!
//! # Usage
//!
//! ```ignore
//! let mut guard = VramGuard::acquire(budget_mb, "qlora-7b")?;
//! // ... create CudaTrainer, allocate GPU memory ...
//! guard.update_actual(actual_mb)?;
//! // guard releases on Drop
//! ```
14
15use crate::trace::{TraceStep, TRACER};
16
17use super::error::GpuError;
18use super::ledger::VramLedger;
19use super::profiler::GpuProfiler;
20use super::wait::{self, WaitConfig};
21
22/// VRAM reservation guard.
23///
24/// Acquires a VRAM reservation on creation and releases it on drop.
25/// Enforces C-VRAM-001: no allocation beyond budget.
26pub struct VramGuard {
27    ledger: VramLedger,
28    budget_mb: usize,
29}
30
31impl VramGuard {
32    /// Acquire a VRAM reservation.
33    ///
34    /// Checks the ledger and reserves `budget_mb` of VRAM.
35    /// Returns `GpuError::InsufficientMemory` if not enough VRAM available.
36    pub fn acquire(budget_mb: usize, task: &str) -> Result<Self, GpuError> {
37        TRACER.span(TraceStep::VramQuery, format!("guard_acquire budget={budget_mb}MB"), || {
38            let mut ledger = super::ledger::auto_ledger();
39            ledger.try_reserve(budget_mb, task)?;
40            Ok(Self { ledger, budget_mb })
41        })
42    }
43
44    /// Acquire with waiting: poll until VRAM is available or timeout.
45    pub fn acquire_wait(budget_mb: usize, task: &str, timeout_secs: u64) -> Result<Self, GpuError> {
46        TRACER.span(
47            TraceStep::WaitPoll,
48            format!("guard_wait budget={budget_mb}MB timeout={timeout_secs}s"),
49            || {
50                let mut ledger = super::ledger::auto_ledger();
51                let config = WaitConfig::with_timeout_secs(timeout_secs);
52                let mut profiler = GpuProfiler::disabled();
53                wait::wait_for_vram(&mut ledger, budget_mb, task, &config, &mut profiler)?;
54                Ok(Self { ledger, budget_mb })
55            },
56        )
57    }
58
59    /// Update the actual measured VRAM after GPU initialization.
60    ///
61    /// Call this after `CudaTrainer::new()` + weight upload to record
62    /// the real VRAM usage (may be less than budgeted).
63    pub fn update_actual(&mut self, actual_mb: usize) -> Result<(), GpuError> {
64        self.ledger.update_actual(actual_mb)
65    }
66
67    /// Budget that was reserved (MB).
68    pub fn budget_mb(&self) -> usize {
69        self.budget_mb
70    }
71
72    /// GPU UUID this guard is for.
73    pub fn gpu_uuid(&self) -> &str {
74        &self.ledger.gpu_uuid
75    }
76
77    /// Read the current GPU status display.
78    pub fn status(&self) -> Result<String, GpuError> {
79        super::ledger::gpu_status_display(&self.ledger)
80    }
81    /// Check if actual VRAM usage overshoots the budget.
82    ///
83    /// Returns `Some((actual, budget))` if overshoot detected, `None` if within budget.
84    ///
85    /// Contract: vram-guard-v1 / budget_overshoot
86    pub fn check_overshoot(&self) -> Option<(usize, usize)> {
87        let actual = self
88            .ledger
89            .read_reservations()
90            .ok()?
91            .iter()
92            .filter(|r| r.pid == std::process::id())
93            .find_map(|r| r.actual_mb)?;
94        if actual > self.budget_mb {
95            Some((actual, self.budget_mb))
96        } else {
97            None
98        }
99    }
100
101    /// Auto-estimate the VRAM budget for a model based on parameter count.
102    ///
103    /// Uses rule of thumb: ~2 bytes per parameter for f16 weights + 20% overhead.
104    ///
105    /// Contract: vram-guard-v1 / auto_budget_estimate
106    pub fn auto_estimate_budget(param_count: usize) -> usize {
107        let weight_mb = (param_count * 2) / (1024 * 1024);
108        let overhead = weight_mb / 5; // 20% overhead for activations + KV cache
109        weight_mb + overhead
110    }
111}
112
#[cfg(test)]
mod tests {
    use super::*;

    use std::sync::atomic::{AtomicU32, Ordering};

    /// Per-process sequence number that keeps every ledger file name unique.
    static TEST_COUNTER: AtomicU32 = AtomicU32::new(0);

    /// GPU UUID given to every ledger built by these tests.
    const TEST_GPU_UUID: &str = "GPU-test-guard";

    /// Build a fresh, unique ledger file path under the shared temp directory.
    fn unique_ledger_path(stem: &str) -> std::path::PathBuf {
        let seq = TEST_COUNTER.fetch_add(1, Ordering::Relaxed);
        let dir = std::env::temp_dir().join("entrenar-guard-test");
        std::fs::create_dir_all(&dir).expect("dir creation should succeed");
        dir.join(format!("{stem}-{seq}-{}.json", std::process::id()))
    }

    /// Ledger for a GPU with `total_mb` of VRAM and a 0.85 headroom factor,
    /// backed by its own file.
    fn test_guard_ledger(total_mb: usize) -> VramLedger {
        let path = unique_ledger_path("guard");
        VramLedger::new(TEST_GPU_UUID.into(), total_mb, 0.85).with_path(path)
    }

    /// Wrap a ledger in a guard without going through `acquire`.
    fn guard_over(ledger: VramLedger, budget_mb: usize) -> VramGuard {
        VramGuard { ledger, budget_mb }
    }

    #[test]
    fn test_guard_direct_acquire() {
        let mut ledger = test_guard_ledger(24000);
        ledger.try_reserve(5000, "guard-test").expect("should succeed");
        let reserved = ledger.total_reserved().expect("should succeed");
        assert_eq!(reserved, 5000);
        // Dropping the ledger releases the reservation.
        drop(ledger);
    }

    #[test]
    fn test_guard_update_actual() {
        let mut ledger = test_guard_ledger(24000);
        ledger.try_reserve(8000, "guard-actual").expect("should succeed");
        ledger.update_actual(7200).expect("should succeed");
        let reserved = ledger.total_reserved().expect("should succeed");
        assert_eq!(reserved, 7200);
    }

    #[test]
    fn test_guard_rejects_over_budget() {
        let mut ledger = test_guard_ledger(10000);
        assert!(ledger.try_reserve(9000, "too-big").is_err());
    }

    #[test]
    fn test_guard_budget_mb() {
        let guard = guard_over(test_guard_ledger(24000), 8000);
        assert_eq!(guard.budget_mb(), 8000);
    }

    #[test]
    fn test_guard_gpu_uuid() {
        let guard = guard_over(test_guard_ledger(24000), 5000);
        assert_eq!(guard.gpu_uuid(), TEST_GPU_UUID);
    }

    #[test]
    fn test_guard_status() {
        let guard = guard_over(test_guard_ledger(24000), 5000);
        // status() reads the ledger file. Either outcome is acceptable here;
        // the only requirement is that the call does not panic.
        let _ = guard.status();
    }

    #[test]
    fn test_guard_update_actual_without_reservation() {
        let mut guard = guard_over(test_guard_ledger(24000), 5000);
        // Nothing was reserved, so update_actual is expected to be a no-op.
        assert!(guard.update_actual(4000).is_ok());
    }

    #[test]
    fn test_guard_multiple_reservations_sequential() {
        let mut first = test_guard_ledger(24000);
        first.try_reserve(3000, "task-1").expect("should succeed");
        assert_eq!(first.total_reserved().expect("should succeed"), 3000);

        // Dropping the ledger should release its reservation.
        drop(first);
    }

    #[test]
    fn test_guard_zero_budget() {
        let mut ledger = test_guard_ledger(24000);
        // A 0 MB reservation is valid.
        assert!(ledger.try_reserve(0, "zero-budget").is_ok());
    }

    #[test]
    fn test_guard_exact_budget() {
        // Total 10000 MB with a 0.85 headroom factor leaves 8500 MB usable.
        let mut ledger = test_guard_ledger(10000);
        // A request close to (but under) the headroom limit must be accepted.
        assert!(ledger.try_reserve(8000, "near-limit").is_ok());
    }

    #[test]
    fn test_guard_update_actual_reduces_reserved() {
        let mut ledger = test_guard_ledger(24000);
        ledger.try_reserve(8000, "actual-test").expect("should succeed");
        let before = ledger.total_reserved().expect("should succeed");
        assert_eq!(before, 8000);
        ledger.update_actual(6000).expect("should succeed");
        let after = ledger.total_reserved().expect("should succeed");
        assert_eq!(after, 6000);
    }

    // ── Additional coverage tests ──

    #[test]
    fn test_guard_struct_fields() {
        let guard = guard_over(test_guard_ledger(16000), 4000);
        assert_eq!(guard.budget_mb(), 4000);
        assert_eq!(guard.gpu_uuid(), TEST_GPU_UUID);
    }

    #[test]
    fn test_guard_status_returns_string() {
        let mut ledger = test_guard_ledger(24000);
        ledger.try_reserve(5000, "status-test").expect("should succeed");
        let guard = guard_over(ledger, 5000);
        let status = guard.status();
        assert!(status.is_ok());
        let text = status.unwrap();
        assert!(text.contains(TEST_GPU_UUID));
        assert!(text.contains("5000 MB budget"));
    }

    #[test]
    fn test_guard_status_empty_ledger() {
        let guard = guard_over(test_guard_ledger(24000), 0);
        let status = guard.status();
        assert!(status.is_ok());
        let text = status.unwrap();
        assert!(text.contains("none") || text.contains("Reservations"));
    }

    #[test]
    fn test_guard_update_actual_with_active_reservation() {
        let mut ledger = test_guard_ledger(24000);
        ledger.try_reserve(10000, "update-actual").expect("should succeed");
        let mut guard = guard_over(ledger, 10000);
        assert!(guard.update_actual(9500).is_ok());
    }

    #[test]
    fn test_guard_small_gpu() {
        // Small GPU (e.g. an embedded part): capacity = 2048 * 0.85 = 1740.
        let mut ledger = test_guard_ledger(2048);
        let fills_capacity = ledger.try_reserve(1740, "small-gpu");
        assert!(fills_capacity.is_ok());
        // With capacity exhausted, even a single extra MB must be refused.
        let one_more = ledger.try_reserve(1, "overflow");
        assert!(one_more.is_err());
    }

    #[test]
    fn test_guard_capacity_calculation() {
        // 10000 * 0.85 = 8500
        let capacity = test_guard_ledger(10000).capacity_mb();
        assert_eq!(capacity, 8500);
    }

    #[test]
    fn test_guard_available_mb_after_reserve() {
        // 20000 MB total gives a capacity of 17000 MB.
        let mut ledger = test_guard_ledger(20000);
        ledger.try_reserve(7000, "test").expect("should succeed");
        assert_eq!(ledger.available_mb().expect("should succeed"), 10000);
    }

    #[test]
    fn test_guard_multiple_sequential_reserve_release() {
        let mut ledger = test_guard_ledger(24000);

        // First reservation.
        ledger.try_reserve(5000, "first").expect("ok");
        let after_first = ledger.total_reserved().expect("ok");
        assert_eq!(after_first, 5000);

        // Release it again.
        ledger.release().expect("ok");
        let after_release = ledger.total_reserved().expect("ok");
        assert_eq!(after_release, 0);

        // A second reservation starts from a clean slate.
        ledger.try_reserve(8000, "second").expect("ok");
        let after_second = ledger.total_reserved().expect("ok");
        assert_eq!(after_second, 8000);
    }

    #[test]
    fn test_guard_profiler_report_accessible() {
        let guard = guard_over(test_guard_ledger(24000), 0);
        // The profiler report is reachable through the guard's ledger.
        assert!(guard.ledger.profiler_report().contains("No operations recorded"));
    }

    #[test]
    fn test_guard_drop_does_not_panic() {
        // Dropping a guard that never reserved anything must not panic.
        drop(guard_over(test_guard_ledger(24000), 3000));
    }

    #[test]
    fn test_guard_drop_with_reservation_releases() {
        let path = unique_ledger_path("guard-drop");

        {
            let mut ledger =
                VramLedger::new(TEST_GPU_UUID.into(), 24000, 0.85).with_path(path.clone());
            ledger.try_reserve(5000, "drop-reserve").expect("ok");
            // Dropping the guard is expected to release the reservation.
            drop(guard_over(ledger, 5000));
        }

        // A second ledger on the same file must see no reservation left behind.
        let observer = VramLedger::new(TEST_GPU_UUID.into(), 24000, 0.85).with_path(path);
        assert_eq!(observer.total_reserved().expect("ok"), 0);
    }
}