Skip to main content

entrenar/gpu/
ledger.rs

1//! VRAM Reservation Ledger (GPU-SHARE-001).
2//!
3//! Uses flock for mutual exclusion and atomic write (write-to-temp, rename)
4//! for crash safety.
5//!
6//! # Contract C-VRAM-001
7//!
8//! `CudaTrainer::new()` MUST NOT allocate if
9//! `ledger.total_reserved() + budget > total_mb × reserve_factor`.
10//!
11//! # Protocol
12//!
13//! 1. Acquire `flock(LOCK_EX)` on ledger file
14//! 2. Read reservations, prune dead PIDs + expired leases
15//! 3. Check capacity: `sum(active.budget_mb) + my_budget <= total_mb × reserve_factor`
16//! 4. Write reservation via atomic rename (write tmp → rename)
17//! 5. Release lock (close fd / drop)
18//! 6. On exit: best-effort cleanup via `Drop`
19
20use std::fs::{self, File, OpenOptions};
21use std::io::{Read as _, Write as _};
22use std::path::{Path, PathBuf};
23
24use chrono::{DateTime, Utc};
25use fs4::fs_std::FileExt;
26use serde::{Deserialize, Serialize};
27
28use super::error::GpuError;
29use super::profiler::GpuProfiler;
30use crate::trace::{TraceStep, TRACER};
31
32/// Default ledger location: `~/.cache/entrenar/gpu-ledger.json`.
33fn default_ledger_path() -> PathBuf {
34    dirs::cache_dir()
35        .unwrap_or_else(|| PathBuf::from("/tmp"))
36        .join("entrenar")
37        .join("gpu-ledger.json")
38}
39
/// Reserve factor for discrete GPUs: 85% of total VRAM may be reserved,
/// leaving 15% headroom.
pub const RESERVE_FACTOR_DISCRETE: f32 = 0.85;

/// Reserve factor for unified memory: 60% of total memory may be reserved,
/// leaving 40% headroom for the OS.
pub const RESERVE_FACTOR_UNIFIED: f32 = 0.60;

/// Default lease duration, in hours (24 hours).
pub const DEFAULT_LEASE_HOURS: i64 = 24;
48
/// A single VRAM reservation.
///
/// One entry of the ledger's reservation list; (de)serialized with serde as
/// part of the ledger file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Reservation {
    /// Unique reservation ID.
    pub id: u64,
    /// Process ID of the holder (used for the liveness check when pruning).
    pub pid: u32,
    /// Budgeted VRAM in MB.
    pub budget_mb: usize,
    /// Actual measured VRAM in MB (updated post-init).
    ///
    /// When present, this value is counted instead of `budget_mb` when
    /// reserved VRAM is summed.
    pub actual_mb: Option<usize>,
    /// Human-readable task description.
    pub task: String,
    /// GPU UUID this reservation is for.
    pub gpu_uuid: String,
    /// When the reservation was created (UTC).
    pub started: DateTime<Utc>,
    /// When the lease automatically expires (UTC).
    pub lease_expires: DateTime<Utc>,
}
69
70impl Reservation {
71    /// Whether this reservation's lease has expired.
72    pub fn is_expired(&self) -> bool {
73        Utc::now() > self.lease_expires
74    }
75
76    /// Whether the holding process is still alive (Linux /proc check).
77    pub fn is_alive(&self) -> bool {
78        Path::new(&format!("/proc/{}/stat", self.pid)).exists()
79    }
80
81    /// Whether this reservation should be pruned.
82    pub fn should_prune(&self) -> bool {
83        self.is_expired() || !self.is_alive()
84    }
85}
86
/// Ledger file contents.
///
/// Serialized as a single object with one `reservations` list. `Default`
/// yields an empty ledger (no reservations).
#[derive(Debug, Default, Serialize, Deserialize)]
struct LedgerData {
    /// Every reservation recorded in the file, for all GPUs.
    reservations: Vec<Reservation>,
}
92
93impl LedgerData {
94    /// Remove dead PIDs and expired leases in-place.
95    fn prune_dead(&mut self) {
96        self.reservations.retain(|r| !r.should_prune());
97    }
98
99    /// Sum of reserved VRAM for a specific GPU.
100    fn total_reserved_for(&self, gpu_uuid: &str) -> usize {
101        self.reservations
102            .iter()
103            .filter(|r| r.gpu_uuid == gpu_uuid)
104            .map(|r| r.actual_mb.unwrap_or(r.budget_mb))
105            .sum()
106    }
107}
108
/// VRAM reservation ledger with flock-based mutual exclusion.
///
/// A handle is bound to one GPU (`gpu_uuid`) and one ledger file (`path`).
/// It tracks at most one reservation of its own (`our_reservation_id`),
/// which is released on a best-effort basis when the handle is dropped.
pub struct VramLedger {
    /// Location of the ledger file on disk.
    path: PathBuf,
    /// GPU UUID (from nvidia-smi -L).
    pub gpu_uuid: String,
    /// Total GPU memory in MB.
    pub total_mb: usize,
    /// Fraction of total usable (0.85 discrete, 0.60 unified).
    pub reserve_factor: f32,
    /// Lease duration, in hours, applied to reservations created by this handle.
    lease_hours: i64,
    /// Profiler for brick-phase timing.
    profiler: GpuProfiler,
    /// Our reservation ID if we hold one.
    pub our_reservation_id: Option<u64>,
}
124
125impl VramLedger {
126    /// Create a new ledger for the specified GPU.
127    pub fn new(gpu_uuid: String, total_mb: usize, reserve_factor: f32) -> Self {
128        Self {
129            path: default_ledger_path(),
130            gpu_uuid,
131            total_mb,
132            reserve_factor,
133            lease_hours: DEFAULT_LEASE_HOURS,
134            profiler: GpuProfiler::disabled(),
135            our_reservation_id: None,
136        }
137    }
138
139    /// Create a ledger at a custom path (for testing).
140    pub fn with_path(mut self, path: PathBuf) -> Self {
141        self.path = path;
142        self
143    }
144
145    /// Enable profiling.
146    pub fn with_profiling(mut self, enabled: bool) -> Self {
147        self.profiler = GpuProfiler::new(enabled);
148        self
149    }
150
151    /// Set custom lease duration.
152    pub fn with_lease_hours(mut self, hours: i64) -> Self {
153        self.lease_hours = hours;
154        self
155    }
156
157    /// Usable VRAM capacity in MB.
158    pub fn capacity_mb(&self) -> usize {
159        (self.total_mb as f32 * self.reserve_factor) as usize
160    }
161
162    /// Total reserved VRAM across all active reservations for our GPU.
163    pub fn total_reserved(&self) -> Result<usize, GpuError> {
164        let gpu_uuid = self.gpu_uuid.clone();
165        self.with_lock_read(|data| {
166            data.reservations
167                .iter()
168                .filter(|r| r.gpu_uuid == gpu_uuid && !r.should_prune())
169                .map(|r| r.actual_mb.unwrap_or(r.budget_mb))
170                .sum()
171        })
172    }
173
174    /// Available VRAM for new reservations (capacity - reserved).
175    pub fn available_mb(&self) -> Result<usize, GpuError> {
176        let reserved = self.total_reserved()?;
177        Ok(self.capacity_mb().saturating_sub(reserved))
178    }
179
180    /// Try to reserve VRAM. Returns reservation ID on success.
181    ///
182    /// # Contract C-VRAM-001
183    ///
184    /// Fails with `GpuError::InsufficientMemory` if
185    /// `total_reserved + budget_mb > capacity_mb`.
186    pub fn try_reserve(&mut self, budget_mb: usize, task: &str) -> Result<u64, GpuError> {
187        TRACER.span(
188            TraceStep::LedgerReserve,
189            format!("ledger_reserve budget={budget_mb}MB gpu={}", self.gpu_uuid),
190            || self.try_reserve_inner(budget_mb, task),
191        )
192    }
193
194    fn try_reserve_inner(&mut self, budget_mb: usize, task: &str) -> Result<u64, GpuError> {
195        let gpu_uuid = self.gpu_uuid.clone();
196        let lease_hours = self.lease_hours;
197        let capacity = self.capacity_mb();
198        let total_mb = self.total_mb;
199
200        let id = self.with_lock_write(|data| {
201            data.prune_dead();
202
203            let reserved = data.total_reserved_for(&gpu_uuid);
204
205            if reserved + budget_mb > capacity {
206                return Err(GpuError::InsufficientMemory {
207                    budget_mb,
208                    available_mb: capacity.saturating_sub(reserved),
209                    reserved_mb: reserved,
210                    total_mb,
211                });
212            }
213
214            let now = Utc::now();
215            let id = reservation_id(&gpu_uuid, std::process::id(), now);
216            let reservation = Reservation {
217                id,
218                pid: std::process::id(),
219                budget_mb,
220                actual_mb: None,
221                task: task.to_string(),
222                gpu_uuid: gpu_uuid.clone(),
223                started: now,
224                lease_expires: now + chrono::Duration::hours(lease_hours),
225            };
226
227            data.reservations.push(reservation);
228            Ok(id)
229        })?;
230
231        self.our_reservation_id = Some(id);
232        self.profiler.finish_op();
233        Ok(id)
234    }
235
236    /// Update the actual measured VRAM for our reservation.
237    pub fn update_actual(&mut self, actual_mb: usize) -> Result<(), GpuError> {
238        let Some(our_id) = self.our_reservation_id else {
239            return Ok(());
240        };
241
242        self.with_lock_write(|data| {
243            if let Some(r) = data.reservations.iter_mut().find(|r| r.id == our_id) {
244                r.actual_mb = Some(actual_mb);
245            }
246            Ok(())
247        })
248    }
249
250    /// Release our reservation.
251    pub fn release(&mut self) -> Result<(), GpuError> {
252        let Some(our_id) = self.our_reservation_id.take() else {
253            return Ok(());
254        };
255
256        TRACER.span(TraceStep::LedgerRelease, format!("ledger_release id={our_id}"), || {
257            self.with_lock_write(|data| {
258                data.reservations.retain(|r| r.id != our_id);
259                Ok(())
260            })
261        })
262    }
263
264    /// Read all reservations for our GPU (pruned).
265    pub fn read_reservations(&self) -> Result<Vec<Reservation>, GpuError> {
266        let gpu_uuid = self.gpu_uuid.clone();
267        self.with_lock_read(|data| {
268            data.reservations
269                .iter()
270                .filter(|r| r.gpu_uuid == gpu_uuid && !r.should_prune())
271                .cloned()
272                .collect()
273        })
274    }
275
276    /// Get profiler report.
277    pub fn profiler_report(&self) -> String {
278        self.profiler.report()
279    }
280
281    // ── flock + atomic read/write ──
282
283    /// Execute a read-only operation under flock.
284    fn with_lock_read<F, T>(&self, f: F) -> Result<T, GpuError>
285    where
286        F: FnOnce(&LedgerData) -> T,
287    {
288        ensure_parent_dir(&self.path)?;
289
290        let file = OpenOptions::new()
291            .read(true)
292            .write(true)
293            .create(true)
294            .truncate(false)
295            .open(&self.path)?;
296
297        file.lock_exclusive()
298            .map_err(|e| GpuError::Io(std::io::Error::other(format!("flock: {e}"))))?;
299
300        let data = read_ledger(&file)?;
301        let result = f(&data);
302
303        #[allow(clippy::incompatible_msrv)]
304        file.unlock().map_err(|e| GpuError::Io(std::io::Error::other(format!("funlock: {e}"))))?;
305
306        Ok(result)
307    }
308
309    /// Execute a read-modify-write operation under flock with atomic write.
310    fn with_lock_write<F, T>(&mut self, f: F) -> Result<T, GpuError>
311    where
312        F: FnOnce(&mut LedgerData) -> Result<T, GpuError>,
313    {
314        ensure_parent_dir(&self.path)?;
315
316        let file = OpenOptions::new()
317            .read(true)
318            .write(true)
319            .create(true)
320            .truncate(false)
321            .open(&self.path)?;
322
323        // Phase: lock_acq
324        self.profiler.begin(GpuProfiler::LOCK_ACQ);
325        file.lock_exclusive()
326            .map_err(|e| GpuError::Io(std::io::Error::other(format!("flock: {e}"))))?;
327        self.profiler.end(GpuProfiler::LOCK_ACQ);
328
329        // Phase: ledger_rd
330        self.profiler.begin(GpuProfiler::LEDGER_RD);
331        let mut data = read_ledger(&file)?;
332        self.profiler.end(GpuProfiler::LEDGER_RD);
333
334        let result = f(&mut data)?;
335
336        // Atomic write-ahead: file → fsync → rename
337        self.profiler.begin(GpuProfiler::LEDGER_WR);
338        atomic_write_ledger(&self.path, &data)?;
339        self.profiler.end(GpuProfiler::LEDGER_WR);
340
341        // Phase: lock_rel
342        self.profiler.begin(GpuProfiler::LOCK_REL);
343        #[allow(clippy::incompatible_msrv)]
344        file.unlock().map_err(|e| GpuError::Io(std::io::Error::other(format!("funlock: {e}"))))?;
345        self.profiler.end(GpuProfiler::LOCK_REL);
346
347        Ok(result)
348    }
349}
350
351impl Drop for VramLedger {
352    fn drop(&mut self) {
353        let _ = self.release();
354    }
355}
356
357// ── Helper functions ──
358
359/// Generate a deterministic reservation ID.
360fn reservation_id(gpu_uuid: &str, pid: u32, time: DateTime<Utc>) -> u64 {
361    use std::hash::{Hash, Hasher};
362    let mut hasher = std::collections::hash_map::DefaultHasher::new();
363    gpu_uuid.hash(&mut hasher);
364    pid.hash(&mut hasher);
365    time.timestamp_nanos_opt().unwrap_or(0).hash(&mut hasher);
366    hasher.finish()
367}
368
369/// Read ledger JSON from an open file. Returns empty data if file is empty.
370fn read_ledger(file: &File) -> Result<LedgerData, GpuError> {
371    let mut contents = String::new();
372    let mut reader = file;
373    if reader.read_to_string(&mut contents).is_err() || contents.trim().is_empty() {
374        return Ok(LedgerData::default());
375    }
376    // Graceful recovery: if ledger is corrupted (e.g., concurrent write race),
377    // treat as empty rather than failing — lost reservations are self-healing
378    // since processes re-register on next operation.
379    serde_json::from_str(&contents).or_else(|_| Ok(LedgerData::default()))
380}
381
382/// Atomic write: write to temp file, fsync, rename over ledger.
383fn atomic_write_ledger(path: &Path, data: &LedgerData) -> Result<(), GpuError> {
384    let tmp_path = path.with_extension("tmp");
385    let json = serde_json::to_string_pretty(data)
386        .map_err(|e| GpuError::LedgerCorrupt(format!("JSON serialize: {e}")))?;
387
388    let mut tmp_file = File::create(&tmp_path)?;
389    tmp_file.write_all(json.as_bytes())?;
390    tmp_file.sync_all()?;
391
392    fs::rename(&tmp_path, path)?;
393    Ok(())
394}
395
396/// Ensure parent directory exists.
397fn ensure_parent_dir(path: &Path) -> Result<(), GpuError> {
398    if let Some(parent) = path.parent() {
399        fs::create_dir_all(parent)?;
400    }
401    Ok(())
402}
403
/// Detect the GPU UUID by shelling out to `nvidia-smi -L`.
///
/// Returns the text between `UUID: ` and the next `)` on the first output
/// line that contains both. Falls back to `"GPU-unknown"` when the command
/// cannot be run or no such line is found.
pub fn detect_gpu_uuid() -> String {
    let Ok(output) = std::process::Command::new("nvidia-smi").args(["-L"]).output() else {
        return "GPU-unknown".to_string();
    };

    let listing = String::from_utf8_lossy(&output.stdout);
    listing
        .lines()
        .find_map(|line| {
            let (_, after_label) = line.split_once("UUID: ")?;
            let (uuid, _) = after_label.split_once(')')?;
            Some(uuid.to_string())
        })
        .unwrap_or_else(|| "GPU-unknown".to_string())
}
421
422/// Detect total GPU memory in MB via `nvidia-smi`.
423pub fn detect_total_memory_mb() -> usize {
424    let gpu_mb = std::process::Command::new("nvidia-smi")
425        .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
426        .output()
427        .ok()
428        .and_then(|out| {
429            let stdout = String::from_utf8_lossy(&out.stdout);
430            stdout.trim().lines().next()?.trim().parse::<usize>().ok()
431        })
432        .unwrap_or(0);
433
434    if gpu_mb > 0 {
435        return gpu_mb;
436    }
437
438    // GB10 / DIGITS unified memory: nvidia-smi reports [N/A] for memory.total.
439    // Use system RAM as VRAM (unified memory architecture).
440    if detect_memory_type() == MemoryType::Unified {
441        let sys_mb = sys_total_memory_mb();
442        if sys_mb > 0 {
443            eprintln!("[GPU-SHARE] Unified memory: using system RAM ({sys_mb} MB) as VRAM");
444            return sys_mb;
445        }
446    }
447
448    0
449}
450
/// Total system memory in MB, taken from the `MemTotal:` line of
/// `/proc/meminfo` (Linux).
///
/// The second whitespace-separated field of that line is read as a kB count
/// and divided by 1024. Returns `0` when the file cannot be read, the line is
/// missing, or the value does not parse.
fn sys_total_memory_mb() -> usize {
    let Ok(meminfo) = std::fs::read_to_string("/proc/meminfo") else {
        return 0;
    };
    let Some(total_line) = meminfo.lines().find(|line| line.starts_with("MemTotal:")) else {
        return 0;
    };

    match total_line.split_whitespace().nth(1).map(str::parse::<usize>) {
        Some(Ok(kb)) => kb / 1024,
        _ => 0,
    }
}
462
463/// Detect whether GPU has unified memory (Jetson) vs discrete.
464pub fn detect_memory_type() -> MemoryType {
465    std::process::Command::new("nvidia-smi")
466        .args(["--query-gpu=name", "--format=csv,noheader"])
467        .output()
468        .ok()
469        .map_or(MemoryType::Discrete, |out| {
470            let name = String::from_utf8_lossy(&out.stdout).to_lowercase();
471            // Unified memory: Jetson/Orin/Tegra + Project DIGITS GB10
472            if name.contains("jetson")
473                || name.contains("orin")
474                || name.contains("tegra")
475                || name.contains("gb10")
476                || name.contains("digits")
477            {
478                MemoryType::Unified
479            } else {
480                MemoryType::Discrete
481            }
482        })
483}
484
/// GPU memory type.
///
/// Determines which reserve factor applies to the GPU's total memory.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryType {
    /// Discrete GPU (PCIe, e.g., RTX 4090). Reserve factor: 0.85.
    Discrete,
    /// Unified memory (e.g., Jetson Orin). Reserve factor: 0.60.
    Unified,
}
493
494impl MemoryType {
495    /// Reserve factor for this memory type.
496    pub fn reserve_factor(self) -> f32 {
497        match self {
498            Self::Discrete => RESERVE_FACTOR_DISCRETE,
499            Self::Unified => RESERVE_FACTOR_UNIFIED,
500        }
501    }
502}
503
504/// Create a ledger auto-detecting GPU properties.
505pub fn auto_ledger() -> VramLedger {
506    let uuid = detect_gpu_uuid();
507    let total_mb = detect_total_memory_mb();
508    let mem_type = detect_memory_type();
509    VramLedger::new(uuid, total_mb, mem_type.reserve_factor())
510}
511
512/// Human-readable GPU status display.
513pub fn gpu_status_display(ledger: &VramLedger) -> Result<String, GpuError> {
514    let reservations = ledger.read_reservations()?;
515    let reserved: usize = reservations.iter().map(|r| r.actual_mb.unwrap_or(r.budget_mb)).sum();
516
517    let mut out = String::new();
518    out.push_str(&format!(
519        "{}: {} MB total, {:.0}% reserve factor\n",
520        ledger.gpu_uuid,
521        ledger.total_mb,
522        ledger.reserve_factor * 100.0
523    ));
524    out.push_str(&format!(
525        "  Capacity: {} MB usable ({} MB reserved, {} MB available)\n",
526        ledger.capacity_mb(),
527        reserved,
528        ledger.capacity_mb().saturating_sub(reserved),
529    ));
530
531    if reservations.is_empty() {
532        out.push_str("  Reservations: none\n");
533    } else {
534        out.push_str(&format!("  Reservations: {}\n", reservations.len()));
535        for r in &reservations {
536            let actual = r
537                .actual_mb
538                .map_or_else(|| "measuring...".to_string(), |a| format!("{a} MB actual"));
539            let elapsed = Utc::now().signed_duration_since(r.started);
540            let hours = elapsed.num_hours();
541            let mins = elapsed.num_minutes() % 60;
542            out.push_str(&format!(
543                "    PID {}: {} MB budget / {} ({}) — {}h {}m\n",
544                r.pid, r.budget_mb, actual, r.task, hours, mins
545            ));
546        }
547    }
548
549    Ok(out)
550}
551
552#[cfg(test)]
553mod tests {
554    use super::*;
555    use std::sync::atomic::{AtomicU32, Ordering};
556    use std::time::Duration;
557
558    static TEST_COUNTER: AtomicU32 = AtomicU32::new(0);
559
560    fn test_ledger_path() -> PathBuf {
561        let n = TEST_COUNTER.fetch_add(1, Ordering::Relaxed);
562        let dir = std::env::temp_dir().join("entrenar-ledger-test");
563        fs::create_dir_all(&dir).expect("test dir creation should succeed");
564        dir.join(format!("test-ledger-{n}-{}.json", std::process::id()))
565    }
566
567    fn cleanup(path: &Path) {
568        let _ = fs::remove_file(path);
569        let _ = fs::remove_file(path.with_extension("tmp"));
570    }
571
572    #[test]
573    fn test_empty_ledger_has_full_capacity() {
574        let path = test_ledger_path();
575        let ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
576
577        assert_eq!(ledger.capacity_mb(), 20400);
578        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
579        assert_eq!(ledger.available_mb().expect("should succeed"), 20400);
580
581        cleanup(&path);
582    }
583
584    #[test]
585    fn test_reserve_and_release() {
586        let path = test_ledger_path();
587        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
588
589        let id = ledger.try_reserve(8000, "test-job").expect("should succeed");
590        assert!(id != 0);
591        assert_eq!(ledger.total_reserved().expect("should succeed"), 8000);
592        assert_eq!(ledger.available_mb().expect("should succeed"), 12400);
593
594        ledger.release().expect("should succeed");
595        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
596
597        cleanup(&path);
598    }
599
600    #[test]
601    fn test_capacity_invariant_prevents_overallocation() {
602        let path = test_ledger_path();
603        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
604
605        ledger.try_reserve(15000, "job-1").expect("should succeed");
606
607        let result = ledger.try_reserve(10000, "job-2");
608        assert!(result.is_err());
609        match result.expect_err("should be InsufficientMemory") {
610            GpuError::InsufficientMemory { budget_mb, available_mb, .. } => {
611                assert_eq!(budget_mb, 10000);
612                assert_eq!(available_mb, 5400);
613            }
614            other => panic!("expected InsufficientMemory, got {other}"),
615        }
616
617        cleanup(&path);
618    }
619
620    #[test]
621    fn test_reserve_factor_limits_total() {
622        let path = test_ledger_path();
623        let mut ledger = VramLedger::new("GPU-test".into(), 10000, 0.85).with_path(path.clone());
624
625        let result = ledger.try_reserve(9000, "too-big");
626        assert!(result.is_err());
627
628        cleanup(&path);
629    }
630
631    #[test]
632    fn test_update_actual() {
633        let path = test_ledger_path();
634        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
635
636        ledger.try_reserve(8000, "test-job").expect("should succeed");
637        ledger.update_actual(7300).expect("should succeed");
638
639        assert_eq!(ledger.total_reserved().expect("should succeed"), 7300);
640
641        cleanup(&path);
642    }
643
644    #[test]
645    fn test_expired_lease_pruned() {
646        let path = test_ledger_path();
647        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85)
648            .with_path(path.clone())
649            .with_lease_hours(0);
650
651        ledger.try_reserve(8000, "expiring-job").expect("should succeed");
652
653        std::thread::sleep(Duration::from_millis(10));
654
655        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
656
657        cleanup(&path);
658    }
659
660    #[test]
661    fn test_atomic_write_produces_valid_json() {
662        let path = test_ledger_path();
663        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
664
665        ledger.try_reserve(5000, "json-test").expect("should succeed");
666
667        let contents = fs::read_to_string(&path).expect("should read");
668        let data: LedgerData = serde_json::from_str(&contents).expect("should parse");
669        assert_eq!(data.reservations.len(), 1);
670        assert_eq!(data.reservations[0].budget_mb, 5000);
671
672        cleanup(&path);
673    }
674
675    #[test]
676    fn test_gpu_status_display() {
677        let path = test_ledger_path();
678        let mut ledger =
679            VramLedger::new("GPU-test-display".into(), 24000, 0.85).with_path(path.clone());
680
681        ledger.try_reserve(7000, "display-test").expect("should succeed");
682
683        let status = gpu_status_display(&ledger).expect("should succeed");
684        assert!(status.contains("GPU-test-display"));
685        assert!(status.contains("24000 MB total"));
686        assert!(status.contains("7000 MB budget"));
687        assert!(status.contains("display-test"));
688
689        cleanup(&path);
690    }
691
692    #[test]
693    fn test_memory_type_reserve_factors() {
694        assert!((MemoryType::Discrete.reserve_factor() - 0.85).abs() < f32::EPSILON);
695        assert!((MemoryType::Unified.reserve_factor() - 0.60).abs() < f32::EPSILON);
696    }
697
698    #[test]
699    fn test_reservation_id_deterministic() {
700        let now = Utc::now();
701        let id1 = reservation_id("GPU-abc", 1234, now);
702        let id2 = reservation_id("GPU-abc", 1234, now);
703        assert_eq!(id1, id2);
704    }
705
706    #[test]
707    fn test_reservation_id_varies_with_input() {
708        let now = Utc::now();
709        let id1 = reservation_id("GPU-abc", 1234, now);
710        let id2 = reservation_id("GPU-xyz", 1234, now);
711        assert_ne!(id1, id2);
712    }
713
714    #[test]
715    fn test_profiling_disabled_by_default() {
716        let path = test_ledger_path();
717        let ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
718
719        assert!(!ledger.profiler.is_enabled());
720        let report = ledger.profiler_report();
721        assert!(report.contains("No operations recorded"));
722
723        cleanup(&path);
724    }
725
726    #[test]
727    fn test_profiling_enabled_records_phases() {
728        let path = test_ledger_path();
729        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85)
730            .with_path(path.clone())
731            .with_profiling(true);
732
733        ledger.try_reserve(5000, "profiled-job").expect("should succeed");
734
735        let report = ledger.profiler_report();
736        assert!(report.contains("lock_acq"));
737        assert!(report.contains("ledger_rd"));
738        assert!(report.contains("ledger_wr"));
739
740        cleanup(&path);
741    }
742
743    // ── Additional coverage tests ──
744
745    #[test]
746    fn test_capacity_mb_discrete() {
747        let path = test_ledger_path();
748        let ledger = VramLedger::new("GPU-test".into(), 24000, RESERVE_FACTOR_DISCRETE)
749            .with_path(path.clone());
750        // 24000 * 0.85 = 20400
751        assert_eq!(ledger.capacity_mb(), 20400);
752        cleanup(&path);
753    }
754
755    #[test]
756    fn test_capacity_mb_unified() {
757        let path = test_ledger_path();
758        let ledger = VramLedger::new("GPU-test".into(), 8192, RESERVE_FACTOR_UNIFIED)
759            .with_path(path.clone());
760        // 8192 * 0.60 = 4915.2 -> 4915
761        assert_eq!(ledger.capacity_mb(), 4915);
762        cleanup(&path);
763    }
764
765    #[test]
766    fn test_with_lease_hours_custom() {
767        let path = test_ledger_path();
768        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85)
769            .with_path(path.clone())
770            .with_lease_hours(48);
771
772        let id = ledger.try_reserve(1000, "long-lease").expect("should succeed");
773        assert!(id != 0);
774        // After 10ms with 48h lease, reservation should still be active
775        std::thread::sleep(Duration::from_millis(10));
776        assert_eq!(ledger.total_reserved().expect("should succeed"), 1000);
777        cleanup(&path);
778    }
779
780    #[test]
781    fn test_multiple_reservations_same_gpu() {
782        let path = test_ledger_path();
783        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
784
785        ledger.try_reserve(5000, "job-1").expect("should succeed");
786        // Reserved should be 5000, available should be capacity - 5000
787        assert_eq!(ledger.total_reserved().expect("ok"), 5000);
788        assert_eq!(ledger.available_mb().expect("ok"), 15400);
789
790        cleanup(&path);
791    }
792
793    #[test]
794    fn test_read_reservations_returns_our_gpu_only() {
795        let path = test_ledger_path();
796        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
797
798        ledger.try_reserve(3000, "gpu-test-job").expect("should succeed");
799        let reservations = ledger.read_reservations().expect("should succeed");
800        assert_eq!(reservations.len(), 1);
801        assert_eq!(reservations[0].budget_mb, 3000);
802        assert_eq!(reservations[0].gpu_uuid, "GPU-test");
803        assert_eq!(reservations[0].task, "gpu-test-job");
804
805        cleanup(&path);
806    }
807
808    #[test]
809    fn test_update_actual_without_reservation_is_noop() {
810        let path = test_ledger_path();
811        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
812
813        // No reservation, update_actual is a no-op
814        let result = ledger.update_actual(5000);
815        assert!(result.is_ok());
816        assert_eq!(ledger.total_reserved().expect("ok"), 0);
817
818        cleanup(&path);
819    }
820
821    #[test]
822    fn test_release_without_reservation_is_noop() {
823        let path = test_ledger_path();
824        let mut ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
825
826        // No reservation, release is a no-op
827        let result = ledger.release();
828        assert!(result.is_ok());
829
830        cleanup(&path);
831    }
832
833    #[test]
834    fn test_drop_releases_reservation() {
835        let path = test_ledger_path();
836        {
837            let mut ledger =
838                VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
839            ledger.try_reserve(5000, "drop-test").expect("should succeed");
840            assert_eq!(ledger.total_reserved().expect("ok"), 5000);
841            // Drop happens here
842        }
843
844        // After drop, a new ledger should show the reservation gone
845        // (since the process is still alive, the reservation may or may not be pruned
846        //  depending on timing, but the explicit release in Drop should have removed it)
847        let ledger = VramLedger::new("GPU-test".into(), 24000, 0.85).with_path(path.clone());
848        let reserved = ledger.total_reserved().expect("ok");
849        assert_eq!(reserved, 0);
850
851        cleanup(&path);
852    }
853
854    #[test]
855    fn test_reservation_is_expired_zero_lease() {
856        let now = chrono::Utc::now();
857        let reservation = Reservation {
858            id: 123,
859            pid: std::process::id(),
860            budget_mb: 1000,
861            actual_mb: None,
862            task: "test".to_string(),
863            gpu_uuid: "GPU-test".to_string(),
864            started: now - chrono::Duration::seconds(10),
865            lease_expires: now - chrono::Duration::seconds(1), // already expired
866        };
867        assert!(reservation.is_expired());
868        assert!(reservation.should_prune());
869    }
870
871    #[test]
872    fn test_reservation_is_alive_current_process() {
873        let now = chrono::Utc::now();
874        let reservation = Reservation {
875            id: 123,
876            pid: std::process::id(), // current process is alive
877            budget_mb: 1000,
878            actual_mb: None,
879            task: "test".to_string(),
880            gpu_uuid: "GPU-test".to_string(),
881            started: now,
882            lease_expires: now + chrono::Duration::hours(24),
883        };
884        assert!(reservation.is_alive());
885        assert!(!reservation.is_expired());
886        assert!(!reservation.should_prune());
887    }
888
889    #[test]
890    fn test_reservation_is_alive_dead_process() {
891        let now = chrono::Utc::now();
892        let reservation = Reservation {
893            id: 123,
894            pid: u32::MAX, // extremely unlikely to be a real PID
895            budget_mb: 1000,
896            actual_mb: None,
897            task: "dead-process".to_string(),
898            gpu_uuid: "GPU-test".to_string(),
899            started: now,
900            lease_expires: now + chrono::Duration::hours(24),
901        };
902        assert!(!reservation.is_alive());
903        assert!(reservation.should_prune());
904    }
905
906    #[test]
907    fn test_ledger_data_total_reserved_for() {
908        let now = chrono::Utc::now();
909        let data = LedgerData {
910            reservations: vec![
911                Reservation {
912                    id: 1,
913                    pid: std::process::id(),
914                    budget_mb: 3000,
915                    actual_mb: None,
916                    task: "a".to_string(),
917                    gpu_uuid: "GPU-A".to_string(),
918                    started: now,
919                    lease_expires: now + chrono::Duration::hours(1),
920                },
921                Reservation {
922                    id: 2,
923                    pid: std::process::id(),
924                    budget_mb: 5000,
925                    actual_mb: Some(4500),
926                    task: "b".to_string(),
927                    gpu_uuid: "GPU-A".to_string(),
928                    started: now,
929                    lease_expires: now + chrono::Duration::hours(1),
930                },
931                Reservation {
932                    id: 3,
933                    pid: std::process::id(),
934                    budget_mb: 2000,
935                    actual_mb: None,
936                    task: "c".to_string(),
937                    gpu_uuid: "GPU-B".to_string(),
938                    started: now,
939                    lease_expires: now + chrono::Duration::hours(1),
940                },
941            ],
942        };
943        // GPU-A: 3000 (budget, no actual) + 4500 (actual) = 7500
944        assert_eq!(data.total_reserved_for("GPU-A"), 7500);
945        // GPU-B: 2000
946        assert_eq!(data.total_reserved_for("GPU-B"), 2000);
947        // GPU-C: 0
948        assert_eq!(data.total_reserved_for("GPU-C"), 0);
949    }
950
951    #[test]
952    fn test_ledger_data_prune_dead() {
953        let now = chrono::Utc::now();
954        let mut data = LedgerData {
955            reservations: vec![
956                // This one is expired
957                Reservation {
958                    id: 1,
959                    pid: std::process::id(),
960                    budget_mb: 1000,
961                    actual_mb: None,
962                    task: "expired".to_string(),
963                    gpu_uuid: "GPU-A".to_string(),
964                    started: now - chrono::Duration::hours(2),
965                    lease_expires: now - chrono::Duration::seconds(1),
966                },
967                // This one is alive and not expired
968                Reservation {
969                    id: 2,
970                    pid: std::process::id(),
971                    budget_mb: 2000,
972                    actual_mb: None,
973                    task: "alive".to_string(),
974                    gpu_uuid: "GPU-A".to_string(),
975                    started: now,
976                    lease_expires: now + chrono::Duration::hours(24),
977                },
978                // This one has a dead PID
979                Reservation {
980                    id: 3,
981                    pid: u32::MAX,
982                    budget_mb: 3000,
983                    actual_mb: None,
984                    task: "dead".to_string(),
985                    gpu_uuid: "GPU-A".to_string(),
986                    started: now,
987                    lease_expires: now + chrono::Duration::hours(24),
988                },
989            ],
990        };
991        data.prune_dead();
992        assert_eq!(data.reservations.len(), 1);
993        assert_eq!(data.reservations[0].task, "alive");
994    }
995
996    #[test]
997    fn test_reservation_id_varies_with_pid() {
998        let now = chrono::Utc::now();
999        let id1 = reservation_id("GPU-abc", 100, now);
1000        let id2 = reservation_id("GPU-abc", 200, now);
1001        assert_ne!(id1, id2);
1002    }
1003
1004    #[test]
1005    fn test_reservation_id_varies_with_time() {
1006        let now = chrono::Utc::now();
1007        let later = now + chrono::Duration::seconds(1);
1008        let id1 = reservation_id("GPU-abc", 100, now);
1009        let id2 = reservation_id("GPU-abc", 100, later);
1010        assert_ne!(id1, id2);
1011    }
1012
1013    #[test]
1014    fn test_gpu_status_display_no_reservations() {
1015        let path = test_ledger_path();
1016        let ledger = VramLedger::new("GPU-no-res".into(), 16000, 0.85).with_path(path.clone());
1017
1018        let status = gpu_status_display(&ledger).expect("should succeed");
1019        assert!(status.contains("GPU-no-res"));
1020        assert!(status.contains("16000 MB total"));
1021        assert!(status.contains("Reservations: none"));
1022
1023        cleanup(&path);
1024    }
1025
1026    #[test]
1027    fn test_gpu_status_display_with_actual_mb() {
1028        let path = test_ledger_path();
1029        let mut ledger = VramLedger::new("GPU-act".into(), 24000, 0.85).with_path(path.clone());
1030
1031        ledger.try_reserve(8000, "actual-test").expect("should succeed");
1032        ledger.update_actual(7500).expect("should succeed");
1033
1034        let status = gpu_status_display(&ledger).expect("should succeed");
1035        assert!(status.contains("7500 MB actual"));
1036        assert!(status.contains("actual-test"));
1037
1038        cleanup(&path);
1039    }
1040
1041    #[test]
1042    fn test_memory_type_reserve_factor_values() {
1043        assert_eq!(MemoryType::Discrete.reserve_factor(), RESERVE_FACTOR_DISCRETE);
1044        assert_eq!(MemoryType::Unified.reserve_factor(), RESERVE_FACTOR_UNIFIED);
1045    }
1046
1047    #[test]
1048    fn test_memory_type_equality() {
1049        assert_eq!(MemoryType::Discrete, MemoryType::Discrete);
1050        assert_eq!(MemoryType::Unified, MemoryType::Unified);
1051        assert_ne!(MemoryType::Discrete, MemoryType::Unified);
1052    }
1053
1054    #[test]
1055    fn test_ensure_parent_dir_existing() {
1056        let dir = tempfile::tempdir().expect("ok");
1057        let path = dir.path().join("subdir").join("ledger.json");
1058        ensure_parent_dir(&path).expect("should succeed");
1059        assert!(path.parent().expect("ok").exists());
1060    }
1061
1062    #[test]
1063    fn test_reserve_exact_capacity() {
1064        let path = test_ledger_path();
1065        let mut ledger = VramLedger::new("GPU-test".into(), 10000, 0.85).with_path(path.clone());
1066        // capacity = 8500
1067        let result = ledger.try_reserve(8500, "exact-fit");
1068        assert!(result.is_ok());
1069        assert_eq!(ledger.available_mb().expect("ok"), 0);
1070
1071        cleanup(&path);
1072    }
1073
1074    #[test]
1075    fn test_reserve_one_over_capacity() {
1076        let path = test_ledger_path();
1077        let mut ledger = VramLedger::new("GPU-test".into(), 10000, 0.85).with_path(path.clone());
1078        // capacity = 8500, request 8501 should fail
1079        let result = ledger.try_reserve(8501, "too-big");
1080        assert!(result.is_err());
1081
1082        cleanup(&path);
1083    }
1084
1085    #[test]
1086    fn test_ledger_default_path() {
1087        let ledger = VramLedger::new("GPU-test".into(), 24000, 0.85);
1088        // Default path should be under cache dir
1089        let path_str = format!("{}", ledger.path.display());
1090        assert!(path_str.contains("gpu-ledger.json"));
1091    }
1092
1093    #[test]
1094    fn test_reservation_serde_roundtrip() {
1095        let now = chrono::Utc::now();
1096        let reservation = Reservation {
1097            id: 42,
1098            pid: 12345,
1099            budget_mb: 8000,
1100            actual_mb: Some(7500),
1101            task: "serde-test".to_string(),
1102            gpu_uuid: "GPU-0000".to_string(),
1103            started: now,
1104            lease_expires: now + chrono::Duration::hours(24),
1105        };
1106        let json = serde_json::to_string(&reservation).expect("serialize");
1107        let restored: Reservation = serde_json::from_str(&json).expect("deserialize");
1108        assert_eq!(restored.id, 42);
1109        assert_eq!(restored.pid, 12345);
1110        assert_eq!(restored.budget_mb, 8000);
1111        assert_eq!(restored.actual_mb, Some(7500));
1112        assert_eq!(restored.task, "serde-test");
1113    }
1114}