1use std::fs::{self, File, OpenOptions};
21use std::io::{Read as _, Write as _};
22use std::path::{Path, PathBuf};
23
24use chrono::{DateTime, Utc};
25use fs4::fs_std::FileExt;
26use serde::{Deserialize, Serialize};
27
28use super::error::GpuError;
29use super::profiler::GpuProfiler;
30use crate::trace::{TraceStep, TRACER};
31
32fn default_ledger_path() -> PathBuf {
34 dirs::cache_dir()
35 .unwrap_or_else(|| PathBuf::from("/tmp"))
36 .join("entrenar")
37 .join("gpu-ledger.json")
38}
39
/// Reserve factor returned by [`MemoryType::reserve_factor`] for discrete GPUs.
/// A ledger's usable capacity is `total_mb * reserve_factor`.
pub const RESERVE_FACTOR_DISCRETE: f32 = 0.85;

/// Reserve factor returned by [`MemoryType::reserve_factor`] for unified-memory devices.
pub const RESERVE_FACTOR_UNIFIED: f32 = 0.60;

/// Lease length, in hours, that [`VramLedger::new`] assigns by default.
/// A reservation is treated as expired once `started + lease` has passed.
pub const DEFAULT_LEASE_HOURS: i64 = 24;
48
/// One VRAM reservation as it is stored (serialized to JSON) in the ledger file.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Reservation {
    /// Identifier derived by hashing the GPU UUID, the owner's PID and the creation time.
    pub id: u64,
    /// PID of the process that created the reservation.
    pub pid: u32,
    /// Amount requested when the reservation was made, in MB.
    pub budget_mb: usize,
    /// Measured usage in MB once reported through `VramLedger::update_actual`;
    /// when present it is counted instead of `budget_mb`.
    pub actual_mb: Option<usize>,
    /// Free-form label for the job holding the reservation.
    pub task: String,
    /// UUID of the GPU this reservation counts against.
    pub gpu_uuid: String,
    /// Time the reservation was created.
    pub started: DateTime<Utc>,
    /// Time after which the reservation is considered expired.
    pub lease_expires: DateTime<Utc>,
}
69
70impl Reservation {
71 pub fn is_expired(&self) -> bool {
73 Utc::now() > self.lease_expires
74 }
75
76 pub fn is_alive(&self) -> bool {
78 Path::new(&format!("/proc/{}/stat", self.pid)).exists()
79 }
80
81 pub fn should_prune(&self) -> bool {
83 self.is_expired() || !self.is_alive()
84 }
85}
86
/// In-memory form of the ledger file: the full list of reservations across all GPUs.
/// `Default` is the empty ledger.
#[derive(Debug, Default, Serialize, Deserialize)]
struct LedgerData {
    /// Every reservation currently recorded, regardless of GPU.
    reservations: Vec<Reservation>,
}
92
93impl LedgerData {
94 fn prune_dead(&mut self) {
96 self.reservations.retain(|r| !r.should_prune());
97 }
98
99 fn total_reserved_for(&self, gpu_uuid: &str) -> usize {
101 self.reservations
102 .iter()
103 .filter(|r| r.gpu_uuid == gpu_uuid)
104 .map(|r| r.actual_mb.unwrap_or(r.budget_mb))
105 .sum()
106 }
107}
108
/// Handle onto the shared, file-backed VRAM reservation ledger for a single GPU.
///
/// A handle tracks at most one reservation of its own (`our_reservation_id`).
pub struct VramLedger {
    /// Location of the JSON ledger file.
    path: PathBuf,
    /// UUID of the GPU this handle accounts for.
    pub gpu_uuid: String,
    /// Total memory of that GPU, in MB.
    pub total_mb: usize,
    /// Fraction of `total_mb` that may be handed out as reservations.
    pub reserve_factor: f32,
    /// Lease length, in hours, applied to reservations made through this handle.
    lease_hours: i64,
    /// Phase profiler used around lock/read/write steps of ledger updates.
    profiler: GpuProfiler,
    /// Id of the reservation made through this handle, if any.
    pub our_reservation_id: Option<u64>,
}
124
125impl VramLedger {
126 pub fn new(gpu_uuid: String, total_mb: usize, reserve_factor: f32) -> Self {
128 Self {
129 path: default_ledger_path(),
130 gpu_uuid,
131 total_mb,
132 reserve_factor,
133 lease_hours: DEFAULT_LEASE_HOURS,
134 profiler: GpuProfiler::disabled(),
135 our_reservation_id: None,
136 }
137 }
138
139 pub fn with_path(mut self, path: PathBuf) -> Self {
141 self.path = path;
142 self
143 }
144
145 pub fn with_profiling(mut self, enabled: bool) -> Self {
147 self.profiler = GpuProfiler::new(enabled);
148 self
149 }
150
151 pub fn with_lease_hours(mut self, hours: i64) -> Self {
153 self.lease_hours = hours;
154 self
155 }
156
157 pub fn capacity_mb(&self) -> usize {
159 (self.total_mb as f32 * self.reserve_factor) as usize
160 }
161
162 pub fn total_reserved(&self) -> Result<usize, GpuError> {
164 let gpu_uuid = self.gpu_uuid.clone();
165 self.with_lock_read(|data| {
166 data.reservations
167 .iter()
168 .filter(|r| r.gpu_uuid == gpu_uuid && !r.should_prune())
169 .map(|r| r.actual_mb.unwrap_or(r.budget_mb))
170 .sum()
171 })
172 }
173
174 pub fn available_mb(&self) -> Result<usize, GpuError> {
176 let reserved = self.total_reserved()?;
177 Ok(self.capacity_mb().saturating_sub(reserved))
178 }
179
180 pub fn try_reserve(&mut self, budget_mb: usize, task: &str) -> Result<u64, GpuError> {
187 TRACER.span(
188 TraceStep::LedgerReserve,
189 format!("ledger_reserve budget={budget_mb}MB gpu={}", self.gpu_uuid),
190 || self.try_reserve_inner(budget_mb, task),
191 )
192 }
193
194 fn try_reserve_inner(&mut self, budget_mb: usize, task: &str) -> Result<u64, GpuError> {
195 let gpu_uuid = self.gpu_uuid.clone();
196 let lease_hours = self.lease_hours;
197 let capacity = self.capacity_mb();
198 let total_mb = self.total_mb;
199
200 let id = self.with_lock_write(|data| {
201 data.prune_dead();
202
203 let reserved = data.total_reserved_for(&gpu_uuid);
204
205 if reserved + budget_mb > capacity {
206 return Err(GpuError::InsufficientMemory {
207 budget_mb,
208 available_mb: capacity.saturating_sub(reserved),
209 reserved_mb: reserved,
210 total_mb,
211 });
212 }
213
214 let now = Utc::now();
215 let id = reservation_id(&gpu_uuid, std::process::id(), now);
216 let reservation = Reservation {
217 id,
218 pid: std::process::id(),
219 budget_mb,
220 actual_mb: None,
221 task: task.to_string(),
222 gpu_uuid: gpu_uuid.clone(),
223 started: now,
224 lease_expires: now + chrono::Duration::hours(lease_hours),
225 };
226
227 data.reservations.push(reservation);
228 Ok(id)
229 })?;
230
231 self.our_reservation_id = Some(id);
232 self.profiler.finish_op();
233 Ok(id)
234 }
235
236 pub fn update_actual(&mut self, actual_mb: usize) -> Result<(), GpuError> {
238 let Some(our_id) = self.our_reservation_id else {
239 return Ok(());
240 };
241
242 self.with_lock_write(|data| {
243 if let Some(r) = data.reservations.iter_mut().find(|r| r.id == our_id) {
244 r.actual_mb = Some(actual_mb);
245 }
246 Ok(())
247 })
248 }
249
250 pub fn release(&mut self) -> Result<(), GpuError> {
252 let Some(our_id) = self.our_reservation_id.take() else {
253 return Ok(());
254 };
255
256 TRACER.span(TraceStep::LedgerRelease, format!("ledger_release id={our_id}"), || {
257 self.with_lock_write(|data| {
258 data.reservations.retain(|r| r.id != our_id);
259 Ok(())
260 })
261 })
262 }
263
264 pub fn read_reservations(&self) -> Result<Vec<Reservation>, GpuError> {
266 let gpu_uuid = self.gpu_uuid.clone();
267 self.with_lock_read(|data| {
268 data.reservations
269 .iter()
270 .filter(|r| r.gpu_uuid == gpu_uuid && !r.should_prune())
271 .cloned()
272 .collect()
273 })
274 }
275
276 pub fn profiler_report(&self) -> String {
278 self.profiler.report()
279 }
280
281 fn with_lock_read<F, T>(&self, f: F) -> Result<T, GpuError>
285 where
286 F: FnOnce(&LedgerData) -> T,
287 {
288 ensure_parent_dir(&self.path)?;
289
290 let file = OpenOptions::new()
291 .read(true)
292 .write(true)
293 .create(true)
294 .truncate(false)
295 .open(&self.path)?;
296
297 file.lock_exclusive()
298 .map_err(|e| GpuError::Io(std::io::Error::other(format!("flock: {e}"))))?;
299
300 let data = read_ledger(&file)?;
301 let result = f(&data);
302
303 #[allow(clippy::incompatible_msrv)]
304 file.unlock().map_err(|e| GpuError::Io(std::io::Error::other(format!("funlock: {e}"))))?;
305
306 Ok(result)
307 }
308
309 fn with_lock_write<F, T>(&mut self, f: F) -> Result<T, GpuError>
311 where
312 F: FnOnce(&mut LedgerData) -> Result<T, GpuError>,
313 {
314 ensure_parent_dir(&self.path)?;
315
316 let file = OpenOptions::new()
317 .read(true)
318 .write(true)
319 .create(true)
320 .truncate(false)
321 .open(&self.path)?;
322
323 self.profiler.begin(GpuProfiler::LOCK_ACQ);
325 file.lock_exclusive()
326 .map_err(|e| GpuError::Io(std::io::Error::other(format!("flock: {e}"))))?;
327 self.profiler.end(GpuProfiler::LOCK_ACQ);
328
329 self.profiler.begin(GpuProfiler::LEDGER_RD);
331 let mut data = read_ledger(&file)?;
332 self.profiler.end(GpuProfiler::LEDGER_RD);
333
334 let result = f(&mut data)?;
335
336 self.profiler.begin(GpuProfiler::LEDGER_WR);
338 atomic_write_ledger(&self.path, &data)?;
339 self.profiler.end(GpuProfiler::LEDGER_WR);
340
341 self.profiler.begin(GpuProfiler::LOCK_REL);
343 #[allow(clippy::incompatible_msrv)]
344 file.unlock().map_err(|e| GpuError::Io(std::io::Error::other(format!("funlock: {e}"))))?;
345 self.profiler.end(GpuProfiler::LOCK_REL);
346
347 Ok(result)
348 }
349}
350
351impl Drop for VramLedger {
352 fn drop(&mut self) {
353 let _ = self.release();
354 }
355}
356
357fn reservation_id(gpu_uuid: &str, pid: u32, time: DateTime<Utc>) -> u64 {
361 use std::hash::{Hash, Hasher};
362 let mut hasher = std::collections::hash_map::DefaultHasher::new();
363 gpu_uuid.hash(&mut hasher);
364 pid.hash(&mut hasher);
365 time.timestamp_nanos_opt().unwrap_or(0).hash(&mut hasher);
366 hasher.finish()
367}
368
369fn read_ledger(file: &File) -> Result<LedgerData, GpuError> {
371 let mut contents = String::new();
372 let mut reader = file;
373 if reader.read_to_string(&mut contents).is_err() || contents.trim().is_empty() {
374 return Ok(LedgerData::default());
375 }
376 serde_json::from_str(&contents).or_else(|_| Ok(LedgerData::default()))
380}
381
382fn atomic_write_ledger(path: &Path, data: &LedgerData) -> Result<(), GpuError> {
384 let tmp_path = path.with_extension("tmp");
385 let json = serde_json::to_string_pretty(data)
386 .map_err(|e| GpuError::LedgerCorrupt(format!("JSON serialize: {e}")))?;
387
388 let mut tmp_file = File::create(&tmp_path)?;
389 tmp_file.write_all(json.as_bytes())?;
390 tmp_file.sync_all()?;
391
392 fs::rename(&tmp_path, path)?;
393 Ok(())
394}
395
396fn ensure_parent_dir(path: &Path) -> Result<(), GpuError> {
398 if let Some(parent) = path.parent() {
399 fs::create_dir_all(parent)?;
400 }
401 Ok(())
402}
403
/// Runs `nvidia-smi -L` and returns the text between `UUID: ` and the next `)`
/// on the first output line that contains both.
///
/// Returns `"GPU-unknown"` when the command cannot be run or no line matches.
pub fn detect_gpu_uuid() -> String {
    const MARKER: &str = "UUID: ";

    let Ok(output) = std::process::Command::new("nvidia-smi").arg("-L").output() else {
        return "GPU-unknown".to_string();
    };

    let listing = String::from_utf8_lossy(&output.stdout);
    for line in listing.lines() {
        let Some(marker_at) = line.find(MARKER) else {
            continue;
        };
        let rest = &line[marker_at + MARKER.len()..];
        if let Some(close) = rest.find(')') {
            return rest[..close].to_string();
        }
    }

    "GPU-unknown".to_string()
}
421
422pub fn detect_total_memory_mb() -> usize {
424 let gpu_mb = std::process::Command::new("nvidia-smi")
425 .args(["--query-gpu=memory.total", "--format=csv,noheader,nounits"])
426 .output()
427 .ok()
428 .and_then(|out| {
429 let stdout = String::from_utf8_lossy(&out.stdout);
430 stdout.trim().lines().next()?.trim().parse::<usize>().ok()
431 })
432 .unwrap_or(0);
433
434 if gpu_mb > 0 {
435 return gpu_mb;
436 }
437
438 if detect_memory_type() == MemoryType::Unified {
441 let sys_mb = sys_total_memory_mb();
442 if sys_mb > 0 {
443 eprintln!("[GPU-SHARE] Unified memory: using system RAM ({sys_mb} MB) as VRAM");
444 return sys_mb;
445 }
446 }
447
448 0
449}
450
/// Returns total system memory in MB, taken from the `MemTotal:` line of
/// `/proc/meminfo` (its second whitespace-separated field, divided by 1024).
///
/// Returns `0` when the file cannot be read, the line is missing, or the value
/// does not parse.
fn sys_total_memory_mb() -> usize {
    let Ok(meminfo) = std::fs::read_to_string("/proc/meminfo") else {
        return 0;
    };
    let Some(total_line) = meminfo.lines().find(|line| line.starts_with("MemTotal:")) else {
        return 0;
    };

    match total_line.split_whitespace().nth(1).map(str::parse::<usize>) {
        Some(Ok(kb)) => kb / 1024,
        _ => 0,
    }
}
462
463pub fn detect_memory_type() -> MemoryType {
465 std::process::Command::new("nvidia-smi")
466 .args(["--query-gpu=name", "--format=csv,noheader"])
467 .output()
468 .ok()
469 .map_or(MemoryType::Discrete, |out| {
470 let name = String::from_utf8_lossy(&out.stdout).to_lowercase();
471 if name.contains("jetson")
473 || name.contains("orin")
474 || name.contains("tegra")
475 || name.contains("gb10")
476 || name.contains("digits")
477 {
478 MemoryType::Unified
479 } else {
480 MemoryType::Discrete
481 }
482 })
483}
484
/// Memory architecture of the detected GPU; selects the reserve factor.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MemoryType {
    /// The default classification; uses `RESERVE_FACTOR_DISCRETE`.
    Discrete,
    /// Chosen when the GPU name matches one of the unified-memory device names;
    /// uses `RESERVE_FACTOR_UNIFIED`.
    Unified,
}
493
494impl MemoryType {
495 pub fn reserve_factor(self) -> f32 {
497 match self {
498 Self::Discrete => RESERVE_FACTOR_DISCRETE,
499 Self::Unified => RESERVE_FACTOR_UNIFIED,
500 }
501 }
502}
503
504pub fn auto_ledger() -> VramLedger {
506 let uuid = detect_gpu_uuid();
507 let total_mb = detect_total_memory_mb();
508 let mem_type = detect_memory_type();
509 VramLedger::new(uuid, total_mb, mem_type.reserve_factor())
510}
511
512pub fn gpu_status_display(ledger: &VramLedger) -> Result<String, GpuError> {
514 let reservations = ledger.read_reservations()?;
515 let reserved: usize = reservations.iter().map(|r| r.actual_mb.unwrap_or(r.budget_mb)).sum();
516
517 let mut out = String::new();
518 out.push_str(&format!(
519 "{}: {} MB total, {:.0}% reserve factor\n",
520 ledger.gpu_uuid,
521 ledger.total_mb,
522 ledger.reserve_factor * 100.0
523 ));
524 out.push_str(&format!(
525 " Capacity: {} MB usable ({} MB reserved, {} MB available)\n",
526 ledger.capacity_mb(),
527 reserved,
528 ledger.capacity_mb().saturating_sub(reserved),
529 ));
530
531 if reservations.is_empty() {
532 out.push_str(" Reservations: none\n");
533 } else {
534 out.push_str(&format!(" Reservations: {}\n", reservations.len()));
535 for r in &reservations {
536 let actual = r
537 .actual_mb
538 .map_or_else(|| "measuring...".to_string(), |a| format!("{a} MB actual"));
539 let elapsed = Utc::now().signed_duration_since(r.started);
540 let hours = elapsed.num_hours();
541 let mins = elapsed.num_minutes() % 60;
542 out.push_str(&format!(
543 " PID {}: {} MB budget / {} ({}) — {}h {}m\n",
544 r.pid, r.budget_mb, actual, r.task, hours, mins
545 ));
546 }
547 }
548
549 Ok(out)
550}
551
#[cfg(test)]
mod tests {
    use super::*;
    use std::time::Duration;

    /// Creates a private temporary directory and a ledger path inside it.
    ///
    /// The directory (and everything in it) is removed when the returned guard is
    /// dropped. Bind the guard *before* creating any ledger: locals drop in
    /// reverse order, so a ledger that still holds a reservation rewrites the
    /// file in its `Drop` first, and only then is the directory removed.
    fn scratch() -> (tempfile::TempDir, PathBuf) {
        let dir = tempfile::tempdir().expect("temp dir creation should succeed");
        let path = dir.path().join("gpu-ledger.json");
        (dir, path)
    }

    /// Ledger with the 0.85 reserve factor, stored at `path`.
    fn ledger_at(path: &Path, gpu_uuid: &str, total_mb: usize) -> VramLedger {
        VramLedger::new(gpu_uuid.into(), total_mb, 0.85).with_path(path.to_path_buf())
    }

    /// Reservation fixture that starts now and expires `lease` from now
    /// (a negative `lease` yields an already-expired entry).
    fn reservation(
        id: u64,
        pid: u32,
        budget_mb: usize,
        actual_mb: Option<usize>,
        task: &str,
        gpu_uuid: &str,
        lease: chrono::Duration,
    ) -> Reservation {
        let now = Utc::now();
        Reservation {
            id,
            pid,
            budget_mb,
            actual_mb,
            task: task.to_string(),
            gpu_uuid: gpu_uuid.to_string(),
            started: now,
            lease_expires: now + lease,
        }
    }

    #[test]
    fn test_empty_ledger_has_full_capacity() {
        let (_dir, path) = scratch();
        let ledger = ledger_at(&path, "GPU-test", 24000);

        assert_eq!(ledger.capacity_mb(), 20400);
        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
        assert_eq!(ledger.available_mb().expect("should succeed"), 20400);
    }

    #[test]
    fn test_reserve_and_release() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        let id = ledger.try_reserve(8000, "test-job").expect("should succeed");
        assert_ne!(id, 0);
        assert_eq!(ledger.total_reserved().expect("should succeed"), 8000);
        assert_eq!(ledger.available_mb().expect("should succeed"), 12400);

        ledger.release().expect("should succeed");
        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
    }

    #[test]
    fn test_capacity_invariant_prevents_overallocation() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        ledger.try_reserve(15000, "job-1").expect("should succeed");

        let result = ledger.try_reserve(10000, "job-2");
        assert!(result.is_err());
        match result.expect_err("should be InsufficientMemory") {
            GpuError::InsufficientMemory { budget_mb, available_mb, .. } => {
                assert_eq!(budget_mb, 10000);
                assert_eq!(available_mb, 5400);
            }
            other => panic!("expected InsufficientMemory, got {other}"),
        }
    }

    #[test]
    fn test_reserve_factor_limits_total() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 10000);

        assert!(ledger.try_reserve(9000, "too-big").is_err());
    }

    #[test]
    fn test_update_actual() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        ledger.try_reserve(8000, "test-job").expect("should succeed");
        ledger.update_actual(7300).expect("should succeed");

        assert_eq!(ledger.total_reserved().expect("should succeed"), 7300);
    }

    #[test]
    fn test_expired_lease_pruned() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000).with_lease_hours(0);

        ledger.try_reserve(8000, "expiring-job").expect("should succeed");

        // A zero-hour lease expires at creation time; wait so "now" is strictly later.
        std::thread::sleep(Duration::from_millis(10));

        assert_eq!(ledger.total_reserved().expect("should succeed"), 0);
    }

    #[test]
    fn test_atomic_write_produces_valid_json() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        ledger.try_reserve(5000, "json-test").expect("should succeed");

        let contents = fs::read_to_string(&path).expect("should read");
        let data: LedgerData = serde_json::from_str(&contents).expect("should parse");
        assert_eq!(data.reservations.len(), 1);
        assert_eq!(data.reservations[0].budget_mb, 5000);
    }

    #[test]
    fn test_gpu_status_display() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test-display", 24000);

        ledger.try_reserve(7000, "display-test").expect("should succeed");

        let status = gpu_status_display(&ledger).expect("should succeed");
        assert!(status.contains("GPU-test-display"));
        assert!(status.contains("24000 MB total"));
        assert!(status.contains("7000 MB budget"));
        assert!(status.contains("display-test"));
    }

    #[test]
    fn test_memory_type_reserve_factors() {
        assert!((MemoryType::Discrete.reserve_factor() - 0.85).abs() < f32::EPSILON);
        assert!((MemoryType::Unified.reserve_factor() - 0.60).abs() < f32::EPSILON);
    }

    #[test]
    fn test_reservation_id_deterministic() {
        let now = Utc::now();
        assert_eq!(reservation_id("GPU-abc", 1234, now), reservation_id("GPU-abc", 1234, now));
    }

    #[test]
    fn test_reservation_id_varies_with_input() {
        let now = Utc::now();
        assert_ne!(reservation_id("GPU-abc", 1234, now), reservation_id("GPU-xyz", 1234, now));
    }

    #[test]
    fn test_profiling_disabled_by_default() {
        let (_dir, path) = scratch();
        let ledger = ledger_at(&path, "GPU-test", 24000);

        assert!(!ledger.profiler.is_enabled());
        assert!(ledger.profiler_report().contains("No operations recorded"));
    }

    #[test]
    fn test_profiling_enabled_records_phases() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000).with_profiling(true);

        ledger.try_reserve(5000, "profiled-job").expect("should succeed");

        let report = ledger.profiler_report();
        assert!(report.contains("lock_acq"));
        assert!(report.contains("ledger_rd"));
        assert!(report.contains("ledger_wr"));
    }

    #[test]
    fn test_capacity_mb_discrete() {
        let (_dir, path) = scratch();
        let ledger =
            VramLedger::new("GPU-test".into(), 24000, RESERVE_FACTOR_DISCRETE).with_path(path);
        assert_eq!(ledger.capacity_mb(), 20400);
    }

    #[test]
    fn test_capacity_mb_unified() {
        let (_dir, path) = scratch();
        let ledger =
            VramLedger::new("GPU-test".into(), 8192, RESERVE_FACTOR_UNIFIED).with_path(path);
        assert_eq!(ledger.capacity_mb(), 4915);
    }

    #[test]
    fn test_with_lease_hours_custom() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000).with_lease_hours(48);

        let id = ledger.try_reserve(1000, "long-lease").expect("should succeed");
        assert_ne!(id, 0);
        std::thread::sleep(Duration::from_millis(10));
        assert_eq!(ledger.total_reserved().expect("should succeed"), 1000);
    }

    #[test]
    fn test_multiple_reservations_same_gpu() {
        let (_dir, path) = scratch();
        // Each handle tracks one reservation, so two handles share the ledger file.
        let mut first = ledger_at(&path, "GPU-test", 24000);
        let mut second = ledger_at(&path, "GPU-test", 24000);

        first.try_reserve(5000, "job-1").expect("should succeed");
        assert_eq!(first.total_reserved().expect("ok"), 5000);
        assert_eq!(first.available_mb().expect("ok"), 15400);

        second.try_reserve(3000, "job-2").expect("should succeed");
        assert_eq!(first.total_reserved().expect("ok"), 8000);
        assert_eq!(second.available_mb().expect("ok"), 12400);
    }

    #[test]
    fn test_read_reservations_returns_our_gpu_only() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);
        let mut other_gpu = ledger_at(&path, "GPU-other", 24000);

        ledger.try_reserve(3000, "gpu-test-job").expect("should succeed");
        other_gpu.try_reserve(2000, "gpu-other-job").expect("should succeed");

        let reservations = ledger.read_reservations().expect("should succeed");
        assert_eq!(reservations.len(), 1);
        assert_eq!(reservations[0].budget_mb, 3000);
        assert_eq!(reservations[0].gpu_uuid, "GPU-test");
        assert_eq!(reservations[0].task, "gpu-test-job");
    }

    #[test]
    fn test_update_actual_without_reservation_is_noop() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        assert!(ledger.update_actual(5000).is_ok());
        assert_eq!(ledger.total_reserved().expect("ok"), 0);
    }

    #[test]
    fn test_release_without_reservation_is_noop() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 24000);

        assert!(ledger.release().is_ok());
    }

    #[test]
    fn test_drop_releases_reservation() {
        let (_dir, path) = scratch();
        {
            let mut ledger = ledger_at(&path, "GPU-test", 24000);
            ledger.try_reserve(5000, "drop-test").expect("should succeed");
            assert_eq!(ledger.total_reserved().expect("ok"), 5000);
        }

        let ledger = ledger_at(&path, "GPU-test", 24000);
        assert_eq!(ledger.total_reserved().expect("ok"), 0);
    }

    #[test]
    fn test_reservation_is_expired_zero_lease() {
        let expired = reservation(
            123,
            std::process::id(),
            1000,
            None,
            "test",
            "GPU-test",
            chrono::Duration::seconds(-1),
        );
        assert!(expired.is_expired());
        assert!(expired.should_prune());
    }

    #[test]
    fn test_reservation_is_alive_current_process() {
        let ours = reservation(
            123,
            std::process::id(),
            1000,
            None,
            "test",
            "GPU-test",
            chrono::Duration::hours(24),
        );
        assert!(ours.is_alive());
        assert!(!ours.is_expired());
        assert!(!ours.should_prune());
    }

    #[test]
    fn test_reservation_is_alive_dead_process() {
        // u32::MAX is used as a PID that no running process has.
        let orphan = reservation(
            123,
            u32::MAX,
            1000,
            None,
            "dead-process",
            "GPU-test",
            chrono::Duration::hours(24),
        );
        assert!(!orphan.is_alive());
        assert!(orphan.should_prune());
    }

    #[test]
    fn test_ledger_data_total_reserved_for() {
        let pid = std::process::id();
        let hour = chrono::Duration::hours(1);
        let data = LedgerData {
            reservations: vec![
                reservation(1, pid, 3000, None, "a", "GPU-A", hour),
                reservation(2, pid, 5000, Some(4500), "b", "GPU-A", hour),
                reservation(3, pid, 2000, None, "c", "GPU-B", hour),
            ],
        };
        // 3000 (budget) + 4500 (actual overrides the 5000 budget).
        assert_eq!(data.total_reserved_for("GPU-A"), 7500);
        assert_eq!(data.total_reserved_for("GPU-B"), 2000);
        assert_eq!(data.total_reserved_for("GPU-C"), 0);
    }

    #[test]
    fn test_ledger_data_prune_dead() {
        let pid = std::process::id();
        let day = chrono::Duration::hours(24);
        let mut data = LedgerData {
            reservations: vec![
                reservation(1, pid, 1000, None, "expired", "GPU-A", chrono::Duration::seconds(-1)),
                reservation(2, pid, 2000, None, "alive", "GPU-A", day),
                reservation(3, u32::MAX, 3000, None, "dead", "GPU-A", day),
            ],
        };
        data.prune_dead();
        assert_eq!(data.reservations.len(), 1);
        assert_eq!(data.reservations[0].task, "alive");
    }

    #[test]
    fn test_reservation_id_varies_with_pid() {
        let now = Utc::now();
        assert_ne!(reservation_id("GPU-abc", 100, now), reservation_id("GPU-abc", 200, now));
    }

    #[test]
    fn test_reservation_id_varies_with_time() {
        let now = Utc::now();
        let later = now + chrono::Duration::seconds(1);
        assert_ne!(reservation_id("GPU-abc", 100, now), reservation_id("GPU-abc", 100, later));
    }

    #[test]
    fn test_gpu_status_display_no_reservations() {
        let (_dir, path) = scratch();
        let ledger = ledger_at(&path, "GPU-no-res", 16000);

        let status = gpu_status_display(&ledger).expect("should succeed");
        assert!(status.contains("GPU-no-res"));
        assert!(status.contains("16000 MB total"));
        assert!(status.contains("Reservations: none"));
    }

    #[test]
    fn test_gpu_status_display_with_actual_mb() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-act", 24000);

        ledger.try_reserve(8000, "actual-test").expect("should succeed");
        ledger.update_actual(7500).expect("should succeed");

        let status = gpu_status_display(&ledger).expect("should succeed");
        assert!(status.contains("7500 MB actual"));
        assert!(status.contains("actual-test"));
    }

    #[test]
    fn test_memory_type_reserve_factor_values() {
        assert_eq!(MemoryType::Discrete.reserve_factor(), RESERVE_FACTOR_DISCRETE);
        assert_eq!(MemoryType::Unified.reserve_factor(), RESERVE_FACTOR_UNIFIED);
    }

    #[test]
    fn test_memory_type_equality() {
        assert_eq!(MemoryType::Discrete, MemoryType::Discrete);
        assert_eq!(MemoryType::Unified, MemoryType::Unified);
        assert_ne!(MemoryType::Discrete, MemoryType::Unified);
    }

    #[test]
    fn test_ensure_parent_dir_existing() {
        let dir = tempfile::tempdir().expect("ok");
        let path = dir.path().join("subdir").join("ledger.json");
        ensure_parent_dir(&path).expect("should succeed");
        assert!(path.parent().expect("ok").exists());
    }

    #[test]
    fn test_reserve_exact_capacity() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 10000);

        assert!(ledger.try_reserve(8500, "exact-fit").is_ok());
        assert_eq!(ledger.available_mb().expect("ok"), 0);
    }

    #[test]
    fn test_reserve_one_over_capacity() {
        let (_dir, path) = scratch();
        let mut ledger = ledger_at(&path, "GPU-test", 10000);

        assert!(ledger.try_reserve(8501, "too-big").is_err());
    }

    #[test]
    fn test_ledger_default_path() {
        let ledger = VramLedger::new("GPU-test".into(), 24000, 0.85);
        let path_str = ledger.path.display().to_string();
        assert!(path_str.contains("gpu-ledger.json"));
    }

    #[test]
    fn test_reservation_serde_roundtrip() {
        let original = reservation(
            42,
            12345,
            8000,
            Some(7500),
            "serde-test",
            "GPU-0000",
            chrono::Duration::hours(24),
        );
        let json = serde_json::to_string(&original).expect("serialize");
        let restored: Reservation = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(restored.id, 42);
        assert_eq!(restored.pid, 12345);
        assert_eq!(restored.budget_mb, 8000);
        assert_eq!(restored.actual_mb, Some(7500));
        assert_eq!(restored.task, "serde-test");
    }
}