Skip to main content

nucleus/checkpoint/
metadata.rs

1use crate::container::{ContainerState, ContainerStateManager};
2use crate::error::{NucleusError, Result};
3use crate::resources::{IoDeviceLimit, ResourceLimits};
4use serde::{Deserialize, Serialize};
5use std::fs;
6use std::fs::OpenOptions;
7use std::io::Write;
8use std::os::unix::fs::OpenOptionsExt;
9use std::path::Path;
10use std::time::SystemTime;
11
/// Resource limits captured from the original cgroup at checkpoint time.
///
/// Each field mirrors one cgroup interface file read by `from_cgroup_dir`.
/// For the `Option<u64>` fields, `None` means the file held the literal
/// `max` rather than a number.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct CheckpointResourceLimits {
    /// Value read from `memory.max`.
    pub memory_bytes: Option<u64>,
    /// Value read from `memory.high`.
    pub memory_high: Option<u64>,
    /// Value read from `memory.swap.max`.
    pub memory_swap_max: Option<u64>,
    /// First token of `cpu.max` (the quota); `None` when that token is `max`.
    pub cpu_quota_us: Option<u64>,
    /// Second token of `cpu.max` (the period). A zero period is rejected when
    /// the value is read from a cgroup.
    pub cpu_period_us: u64,
    /// Value read from `cpu.weight`.
    pub cpu_weight: Option<u64>,
    /// Value read from `pids.max`.
    pub pids_max: Option<u64>,
    /// One entry per non-blank line of `io.max`; empty when the file is absent.
    pub io_limits: Vec<IoDeviceLimit>,
}
24
25impl CheckpointResourceLimits {
26    fn from_cgroup_dir(cgroup_path: &Path) -> Result<Self> {
27        let (cpu_quota_us, cpu_period_us) = Self::read_cpu_quota(cgroup_path.join("cpu.max"))?;
28        Ok(Self {
29            memory_bytes: Self::read_optional_u64(cgroup_path.join("memory.max"))?,
30            memory_high: Self::read_optional_u64(cgroup_path.join("memory.high"))?,
31            memory_swap_max: Self::read_optional_u64(cgroup_path.join("memory.swap.max"))?,
32            cpu_quota_us,
33            cpu_period_us,
34            cpu_weight: Self::read_optional_u64(cgroup_path.join("cpu.weight"))?,
35            pids_max: Self::read_optional_u64(cgroup_path.join("pids.max"))?,
36            io_limits: Self::read_io_limits(cgroup_path.join("io.max"))?,
37        })
38    }
39
40    pub fn to_resource_limits(&self) -> ResourceLimits {
41        ResourceLimits {
42            memory_bytes: self.memory_bytes,
43            memory_high: self.memory_high,
44            memory_swap_max: self.memory_swap_max,
45            cpu_quota_us: self.cpu_quota_us,
46            cpu_period_us: self.cpu_period_us,
47            cpu_weight: self.cpu_weight,
48            pids_max: self.pids_max,
49            io_limits: self.io_limits.clone(),
50            memlock_bytes: None,
51        }
52    }
53
54    pub fn validate(&self) -> Result<()> {
55        self.to_resource_limits()
56            .validate_runtime_sanity()
57            .map_err(|e| {
58                NucleusError::CheckpointError(format!("Invalid checkpoint resource limits: {}", e))
59            })
60    }
61
62    pub fn cpu_limit_millicores(&self) -> Option<u64> {
63        if self.cpu_period_us == 0 {
64            return None;
65        }
66        self.cpu_quota_us
67            .map(|quota| quota.saturating_mul(1000) / self.cpu_period_us)
68    }
69
70    fn read_optional_u64(path: impl AsRef<Path>) -> Result<Option<u64>> {
71        let path = path.as_ref();
72        let content = fs::read_to_string(path).map_err(|e| {
73            NucleusError::CheckpointError(format!(
74                "Failed to read cgroup limit file {:?}: {}",
75                path, e
76            ))
77        })?;
78        let value = content.trim();
79        if value == "max" {
80            return Ok(None);
81        }
82        value.parse::<u64>().map(Some).map_err(|e| {
83            NucleusError::CheckpointError(format!(
84                "Failed to parse cgroup limit file {:?}: {}",
85                path, e
86            ))
87        })
88    }
89
90    fn read_cpu_quota(path: impl AsRef<Path>) -> Result<(Option<u64>, u64)> {
91        let path = path.as_ref();
92        let content = fs::read_to_string(path).map_err(|e| {
93            NucleusError::CheckpointError(format!("Failed to read {:?}: {}", path, e))
94        })?;
95        let mut parts = content.split_whitespace();
96        let quota = parts.next().ok_or_else(|| {
97            NucleusError::CheckpointError(format!("Invalid cpu.max format in {:?}", path))
98        })?;
99        let period = parts.next().ok_or_else(|| {
100            NucleusError::CheckpointError(format!("Missing cpu.max period in {:?}", path))
101        })?;
102        if parts.next().is_some() {
103            return Err(NucleusError::CheckpointError(format!(
104                "Invalid cpu.max format in {:?}",
105                path
106            )));
107        }
108
109        let cpu_quota_us = if quota == "max" {
110            None
111        } else {
112            Some(quota.parse::<u64>().map_err(|e| {
113                NucleusError::CheckpointError(format!("Failed to parse cpu.max quota: {}", e))
114            })?)
115        };
116        let cpu_period_us = period.parse::<u64>().map_err(|e| {
117            NucleusError::CheckpointError(format!("Failed to parse cpu.max period: {}", e))
118        })?;
119        if cpu_period_us == 0 {
120            return Err(NucleusError::CheckpointError(format!(
121                "Invalid cpu.max period in {:?}: period must be greater than 0",
122                path
123            )));
124        }
125
126        Ok((cpu_quota_us, cpu_period_us))
127    }
128
129    fn read_io_limits(path: impl AsRef<Path>) -> Result<Vec<IoDeviceLimit>> {
130        let path = path.as_ref();
131        let content = match fs::read_to_string(path) {
132            Ok(content) => content,
133            Err(e) if e.kind() == std::io::ErrorKind::NotFound => return Ok(Vec::new()),
134            Err(e) => {
135                return Err(NucleusError::CheckpointError(format!(
136                    "Failed to read {:?}: {}",
137                    path, e
138                )))
139            }
140        };
141
142        content
143            .lines()
144            .filter(|line| !line.trim().is_empty())
145            .map(IoDeviceLimit::parse)
146            .collect()
147    }
148}
149
/// Metadata stored alongside checkpoint images.
///
/// Serialized as pretty-printed JSON to `metadata.json` in the checkpoint
/// directory by `save` and read back by `load`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CheckpointMetadata {
    /// Container ID
    pub container_id: String,

    /// Container name
    pub container_name: String,

    /// Original PID
    pub original_pid: u32,

    /// Command that was running
    pub command: Vec<String>,

    /// Timestamp of checkpoint, in whole seconds since the Unix epoch.
    pub checkpoint_at: u64,

    /// Nucleus version (`CARGO_PKG_VERSION` of the build that wrote the
    /// checkpoint).
    pub version: String,

    /// Whether container was using gVisor
    pub using_gvisor: bool,

    /// Whether container was rootless
    pub rootless: bool,

    /// Original cgroup path, if the container was tracked in a cgroup.
    ///
    /// `#[serde(default)]` lets metadata that lacks this key deserialize
    /// with `None`.
    #[serde(default)]
    pub cgroup_path: Option<String>,

    /// Resource limits captured from the original cgroup.
    ///
    /// `#[serde(default)]` lets metadata that lacks this key deserialize
    /// with `None`.
    #[serde(default)]
    pub resource_limits: Option<CheckpointResourceLimits>,
}
185
186impl CheckpointMetadata {
187    /// Create metadata from current container state
188    pub fn from_state(state: &ContainerState) -> Result<Self> {
189        let checkpoint_at = SystemTime::now()
190            .duration_since(SystemTime::UNIX_EPOCH)
191            .unwrap_or_default()
192            .as_secs();
193        let resource_limits = state
194            .cgroup_path
195            .as_deref()
196            .map(|path| CheckpointResourceLimits::from_cgroup_dir(Path::new(path)))
197            .transpose()?;
198
199        Ok(Self {
200            container_id: state.id.clone(),
201            container_name: state.name.clone(),
202            original_pid: state.pid,
203            command: state.command.clone(),
204            checkpoint_at,
205            version: env!("CARGO_PKG_VERSION").to_string(),
206            using_gvisor: state.using_gvisor,
207            rootless: state.rootless,
208            cgroup_path: state.cgroup_path.clone(),
209            resource_limits,
210        })
211    }
212
213    /// Save metadata to checkpoint directory
214    pub fn save(&self, dir: &Path) -> Result<()> {
215        let path = dir.join("metadata.json");
216        let tmp_path = dir.join("metadata.json.tmp");
217        let json = serde_json::to_string_pretty(self).map_err(|e| {
218            NucleusError::CheckpointError(format!("Failed to serialize metadata: {}", e))
219        })?;
220
221        if tmp_path.exists() {
222            let meta = fs::symlink_metadata(&tmp_path).map_err(|e| {
223                NucleusError::CheckpointError(format!(
224                    "Failed to inspect temp metadata file {:?}: {}",
225                    tmp_path, e
226                ))
227            })?;
228            if meta.file_type().is_symlink() {
229                return Err(NucleusError::CheckpointError(format!(
230                    "Refusing symlink temp metadata file {:?}",
231                    tmp_path
232                )));
233            }
234            fs::remove_file(&tmp_path).map_err(|e| {
235                NucleusError::CheckpointError(format!(
236                    "Failed to remove stale temp metadata file {:?}: {}",
237                    tmp_path, e
238                ))
239            })?;
240        }
241
242        let mut file = OpenOptions::new()
243            .create_new(true)
244            .write(true)
245            .mode(0o600)
246            .custom_flags(libc::O_NOFOLLOW)
247            .open(&tmp_path)
248            .map_err(|e| {
249                NucleusError::CheckpointError(format!(
250                    "Failed to open temp metadata file {:?}: {}",
251                    tmp_path, e
252                ))
253            })?;
254
255        file.write_all(json.as_bytes()).map_err(|e| {
256            NucleusError::CheckpointError(format!(
257                "Failed to write metadata file {:?}: {}",
258                tmp_path, e
259            ))
260        })?;
261        file.sync_all().map_err(|e| {
262            NucleusError::CheckpointError(format!(
263                "Failed to sync metadata file {:?}: {}",
264                tmp_path, e
265            ))
266        })?;
267
268        fs::rename(&tmp_path, &path).map_err(|e| {
269            NucleusError::CheckpointError(format!(
270                "Failed to atomically replace metadata file {:?}: {}",
271                path, e
272            ))
273        })?;
274        Ok(())
275    }
276
277    /// Load metadata from checkpoint directory
278    pub fn load(dir: &Path) -> Result<Self> {
279        let path = dir.join("metadata.json");
280        let json = ContainerStateManager::read_file_nofollow(&path).map_err(|e| {
281            NucleusError::CheckpointError(format!("Failed to read metadata {:?}: {}", path, e))
282        })?;
283        let metadata: Self = serde_json::from_str(&json).map_err(|e| {
284            NucleusError::CheckpointError(format!("Failed to parse metadata: {}", e))
285        })?;
286        if let Some(resource_limits) = metadata.resource_limits.as_ref() {
287            resource_limits.validate()?;
288        }
289        Ok(metadata)
290    }
291}
292
#[cfg(test)]
mod tests {
    use super::*;
    use std::os::unix::fs as unix_fs;

    /// Minimal metadata fixture with no cgroup information attached.
    fn sample_metadata() -> CheckpointMetadata {
        CheckpointMetadata {
            container_id: "test-id".to_string(),
            container_name: "test".to_string(),
            original_pid: 1,
            command: vec!["/bin/sh".to_string()],
            checkpoint_at: 0,
            version: "0.0.0".to_string(),
            using_gvisor: false,
            rootless: false,
            cgroup_path: None,
            resource_limits: None,
        }
    }

    /// BUG-11 regression: `save()` must refuse to write through a symlink
    /// planted at the temp file path.
    #[test]
    fn test_save_rejects_symlink_target() {
        let workdir = tempfile::tempdir().unwrap();

        // File the attacker would like `save()` to overwrite.
        let decoy = workdir.path().join("attacker-owned-file");
        std::fs::write(&decoy, "").unwrap();

        // Symlink occupying the exact path `save()` uses for its temp file.
        let planted_link = workdir.path().join("metadata.json.tmp");
        unix_fs::symlink(&decoy, &planted_link).unwrap();

        let outcome = sample_metadata().save(workdir.path());
        assert!(
            outcome.is_err(),
            "save() must reject symlink at temp file path (O_NOFOLLOW / symlink check)"
        );
    }

    /// A directory populated with cgroup-style files is parsed field by field.
    #[test]
    fn test_checkpoint_resource_limits_from_cgroup_dir() {
        let cgroup = tempfile::tempdir().unwrap();
        let fixtures = [
            ("memory.max", "536870912\n"),
            ("memory.high", "483183820\n"),
            ("memory.swap.max", "0\n"),
            ("cpu.max", "50000 100000\n"),
            ("cpu.weight", "100\n"),
            ("pids.max", "256\n"),
            ("io.max", "8:0 rbps=1048576 wbps=2097152\n"),
        ];
        for &(file_name, body) in fixtures.iter() {
            std::fs::write(cgroup.path().join(file_name), body).unwrap();
        }

        let parsed = CheckpointResourceLimits::from_cgroup_dir(cgroup.path()).unwrap();

        assert_eq!(parsed.memory_bytes, Some(536_870_912));
        assert_eq!(parsed.memory_high, Some(483_183_820));
        assert_eq!(parsed.memory_swap_max, Some(0));
        assert_eq!(parsed.cpu_quota_us, Some(50_000));
        assert_eq!(parsed.cpu_period_us, 100_000);
        assert_eq!(parsed.cpu_weight, Some(100));
        assert_eq!(parsed.pids_max, Some(256));
        assert_eq!(parsed.io_limits.len(), 1);
        assert_eq!(parsed.cpu_limit_millicores(), Some(500));
    }

    /// Metadata whose stored limits carry a zero CPU period must fail
    /// validation when loaded.
    #[test]
    fn test_load_rejects_zero_cpu_period_in_metadata() {
        let zero_period_json = r#"{
  "container_id": "test-id",
  "container_name": "test",
  "original_pid": 1,
  "command": ["/bin/sh"],
  "checkpoint_at": 0,
  "version": "0.0.0",
  "using_gvisor": false,
  "rootless": false,
  "cgroup_path": "/sys/fs/cgroup/nucleus-test",
  "resource_limits": {
    "memory_bytes": null,
    "memory_high": null,
    "memory_swap_max": null,
    "cpu_quota_us": 50000,
    "cpu_period_us": 0,
    "cpu_weight": null,
    "pids_max": 256,
    "io_limits": []
  }
}"#;
        let checkpoint_dir = tempfile::tempdir().unwrap();
        std::fs::write(checkpoint_dir.path().join("metadata.json"), zero_period_json).unwrap();

        let failure = CheckpointMetadata::load(checkpoint_dir.path()).unwrap_err();
        let message = failure.to_string();
        assert!(message.contains("Invalid checkpoint resource limits"));
    }
}
388}