Skip to main content

hpc_node/
mount.rs

//! Mount management conventions and trait.
//!
//! Defines refcounted mount management for uenv `SquashFS` images.
//! Multiple allocations can share one mount. Unmounting is lazy: a
//! configurable hold time keeps an unused mount around for cache locality.
6
7use serde::{Deserialize, Serialize};
8
/// Well-known filesystem locations for mounts.
///
/// Every base directory defined here lives under `/run/pact/`.
pub mod paths {
    /// Root under which data staging mounts (NFS, S3) are placed.
    pub const DATA_STAGE_BASE: &str = "/run/pact/data";
    /// Root under which uenv `SquashFS` images are mounted.
    pub const UENV_MOUNT_BASE: &str = "/run/pact/uenv";
    /// Root under which allocation working directories are placed.
    pub const WORKDIR_BASE: &str = "/run/pact/workdir";
}
18
19/// Handle to an acquired mount.
20///
21/// Returned by [`MountManager::acquire_mount`]. The holder must call
22/// [`MountManager::release_mount`] when the allocation no longer needs
23/// the mount.
24#[derive(Debug, Clone, Serialize, Deserialize)]
25pub struct MountHandle {
26    /// Path to the source image (e.g., `/images/pytorch-2.5.sqfs`).
27    pub image_path: String,
28    /// Where the image is mounted (e.g., `/run/pact/uenv/pytorch-2.5`).
29    pub mount_point: String,
30}
31
/// Default hold time, in seconds, between a mount's refcount reaching zero
/// and the mount actually being unmounted (60 s, i.e. one minute).
pub const DEFAULT_HOLD_TIME_SECS: u64 = 60;
34
35/// Trait for refcounted mount management.
36///
37/// Both pact (as init) and lattice (standalone mode) implement this.
38///
39/// # Invariants
40///
41/// - WI2: refcount exactly equals active allocations using the mount.
42///   Refcount going negative is a bug — implementations must assert.
43/// - WI3: lazy unmount with configurable hold time. Emergency `--force`
44///   overrides the hold timer.
45/// - WI6: on agent restart, `reconstruct_state` rebuilds refcounts
46///   from the kernel mount table + active allocations.
47pub trait MountManager: Send + Sync {
48    /// Acquire a reference to a uenv mount.
49    ///
50    /// If this is the first reference, the `SquashFS` image is mounted.
51    /// Otherwise, the refcount is incremented and a bind-mount is
52    /// prepared for the allocation's mount namespace.
53    fn acquire_mount(&self, image_path: &str) -> Result<MountHandle, MountError>;
54
55    /// Release a reference to a mount.
56    ///
57    /// Decrements the refcount. When refcount reaches zero, starts
58    /// the cache hold timer. The mount is not unmounted until the
59    /// timer expires (or emergency force-unmount).
60    fn release_mount(&self, handle: &MountHandle) -> Result<(), MountError>;
61
62    /// Force-unmount regardless of refcount or hold timer.
63    ///
64    /// Only allowed during emergency mode (RI3). Cancels any running
65    /// hold timer and unmounts immediately.
66    fn force_unmount(&self, image_path: &str) -> Result<(), MountError>;
67
68    /// Reconstruct refcounts from kernel mount table and active allocations.
69    ///
70    /// Called on agent restart (WI6). Scans `/proc/mounts` and correlates
71    /// with the provided list of active allocation IDs (from journal state).
72    /// Mounts without matching allocations get refcount=0 and start hold timers.
73    fn reconstruct_state(&self, active_allocations: &[String]) -> Result<(), MountError>;
74}
75
76/// Errors from mount operations.
77#[derive(Debug, thiserror::Error)]
78pub enum MountError {
79    #[error("mount failed for {image_path}: {reason}")]
80    MountFailed { image_path: String, reason: String },
81
82    #[error("unmount failed for {mount_point}: {reason}")]
83    UnmountFailed { mount_point: String, reason: String },
84
85    #[error("refcount inconsistency for {image_path}: {detail}")]
86    RefcountInconsistency { image_path: String, detail: String },
87
88    #[error("mount I/O error: {0}")]
89    Io(#[from] std::io::Error),
90}
91
#[cfg(test)]
mod tests {
    use super::*;

    /// A handle must come back from a JSON round trip with both paths intact.
    #[test]
    fn mount_handle_serialization() {
        const IMAGE: &str = "/images/pytorch-2.5.sqfs";
        const MOUNT_POINT: &str = "/run/pact/uenv/pytorch-2.5";

        let original = MountHandle {
            image_path: String::from(IMAGE),
            mount_point: String::from(MOUNT_POINT),
        };

        let encoded = serde_json::to_string(&original).unwrap();
        let MountHandle {
            image_path,
            mount_point,
        } = serde_json::from_str::<MountHandle>(&encoded).unwrap();

        assert_eq!(image_path, IMAGE);
        assert_eq!(mount_point, MOUNT_POINT);
    }

    /// Every well-known base directory must live under `/run/pact/`.
    #[test]
    fn well_known_paths() {
        let bases = [
            paths::UENV_MOUNT_BASE,
            paths::WORKDIR_BASE,
            paths::DATA_STAGE_BASE,
        ];
        for base in bases {
            assert!(
                base.starts_with("/run/pact/"),
                "{} is not under /run/pact/",
                base
            );
        }
    }
}