// nucleus/checkpoint/criu.rs

1use crate::checkpoint::metadata::CheckpointMetadata;
2use crate::checkpoint::state::CheckpointState;
3use crate::container::ContainerState;
4use crate::error::{NucleusError, Result, StateTransition};
5use nix::unistd::Uid;
6use std::fs;
7use std::os::unix::fs::PermissionsExt;
8use std::path::{Path, PathBuf};
9use std::process::Command;
10use tempfile::Builder;
11use tracing::info;
12
/// CRIU runtime for checkpoint/restore
///
/// Follows the same pattern as GVisorRuntime: find binary, validate, invoke via Command.
pub struct CriuRuntime {
    /// Path of the `criu` executable, resolved by `find_binary` when the
    /// runtime is constructed in `new`.
    binary_path: PathBuf,
    /// Checkpoint lifecycle state. Starts as `CheckpointState::None` and is
    /// advanced (or rolled back) by `checkpoint` and `restore`.
    state: CheckpointState,
}
20
21impl CriuRuntime {
22    /// Create a new CRIU runtime, finding the criu binary
23    pub fn new() -> Result<Self> {
24        let binary_path = Self::find_binary()?;
25
26        // Validate binary works
27        let output = Command::new(&binary_path)
28            .arg("--version")
29            .output()
30            .map_err(|e| NucleusError::CheckpointError(format!("Failed to execute criu: {}", e)))?;
31
32        if !output.status.success() {
33            return Err(NucleusError::CheckpointError(
34                "criu --version failed".to_string(),
35            ));
36        }
37
38        let version = String::from_utf8_lossy(&output.stdout);
39        info!("Found CRIU: {}", version.trim());
40
41        Ok(Self {
42            binary_path,
43            state: CheckpointState::None,
44        })
45    }
46
47    /// Validate a binary path for safe execution.
48    ///
49    /// Checks permissions (not world/group-writable) and ownership (must be
50    /// owned by root or the effective UID) to prevent execution of tampered
51    /// binaries.
52    fn validate_binary(path: &Path) -> Result<()> {
53        use std::os::unix::fs::MetadataExt;
54
55        let metadata = fs::metadata(path).map_err(|e| {
56            NucleusError::CheckpointError(format!("Cannot stat criu binary {:?}: {}", path, e))
57        })?;
58        let mode = metadata.permissions().mode();
59        if mode & 0o022 != 0 {
60            return Err(NucleusError::CheckpointError(format!(
61                "criu binary {:?} is writable by group/others (mode {:o}), refusing to execute",
62                path, mode
63            )));
64        }
65        if mode & 0o111 == 0 {
66            return Err(NucleusError::CheckpointError(format!(
67                "criu binary {:?} is not executable",
68                path
69            )));
70        }
71        let owner_uid = metadata.uid();
72        let euid = nix::unistd::Uid::effective().as_raw();
73        if owner_uid != 0 && owner_uid != euid {
74            return Err(NucleusError::CheckpointError(format!(
75                "criu binary {:?} is owned by UID {} (expected root or euid {}), refusing to execute",
76                path, owner_uid, euid
77            )));
78        }
79        Ok(())
80    }
81
82    fn find_binary() -> Result<PathBuf> {
83        // Check common locations
84        for path in &["/usr/sbin/criu", "/usr/bin/criu", "/usr/local/sbin/criu"] {
85            let p = PathBuf::from(path);
86            if p.exists() {
87                Self::validate_binary(&p)?;
88                return Ok(p);
89            }
90        }
91
92        // For privileged execution, do not resolve runtime binaries via PATH.
93        // This avoids environment-based binary hijacking when running as root.
94        if Uid::effective().is_root() {
95            return Err(NucleusError::CheckpointError(
96                "CRIU binary not found in trusted system paths".to_string(),
97            ));
98        }
99
100        // Try PATH for unprivileged execution.
101        if let Some(path_var) = std::env::var_os("PATH") {
102            for dir in std::env::split_paths(&path_var) {
103                let candidate = dir.join("criu");
104                if candidate.exists() {
105                    Self::validate_binary(&candidate)?;
106                    return Ok(candidate);
107                }
108            }
109        }
110
111        Err(NucleusError::CheckpointError(
112            "CRIU binary not found. Install criu to use checkpoint/restore.".to_string(),
113        ))
114    }
115
116    /// Checkpoint a running container
117    ///
118    /// State transitions: None -> Dumping -> Dumped (or Dumping -> None on failure)
119    pub fn checkpoint(
120        &mut self,
121        state: &ContainerState,
122        output_dir: &Path,
123        leave_running: bool,
124    ) -> Result<()> {
125        // Requires root
126        if !nix::unistd::Uid::effective().is_root() {
127            return Err(NucleusError::CheckpointError(
128                "Checkpoint requires root (CRIU needs CAP_SYS_PTRACE)".to_string(),
129            ));
130        }
131
132        if !state.is_running() {
133            return Err(NucleusError::CheckpointError(format!(
134                "Container {} is not running",
135                state.id
136            )));
137        }
138
139        // State transition: None -> Dumping
140        self.state = self.state.transition(CheckpointState::Dumping)?;
141
142        let images_dir = Self::prepare_checkpoint_dir(output_dir)?;
143
144        // Run criu dump
145        let mut cmd = Command::new(&self.binary_path);
146        cmd.arg("dump")
147            .arg("--tree")
148            .arg(state.pid.to_string())
149            .arg("--images-dir")
150            .arg(&images_dir)
151            .arg("--shell-job");
152
153        if leave_running {
154            cmd.arg("--leave-running");
155        }
156
157        info!(
158            "Checkpointing container {} (PID {}) to {:?}",
159            state.id, state.pid, output_dir
160        );
161
162        let output = cmd.output().map_err(|e| {
163            // Abort: Dumping -> None
164            self.state = self
165                .state
166                .transition(CheckpointState::None)
167                .unwrap_or(self.state);
168            NucleusError::CheckpointError(format!("Failed to run criu dump: {}", e))
169        })?;
170
171        if !output.status.success() {
172            // Abort: Dumping -> None
173            self.state = self
174                .state
175                .transition(CheckpointState::None)
176                .unwrap_or(self.state);
177            let stderr = String::from_utf8_lossy(&output.stderr);
178            return Err(NucleusError::CheckpointError(format!(
179                "criu dump failed: {}",
180                stderr
181            )));
182        }
183
184        // Write metadata
185        let metadata = CheckpointMetadata::from_state(state);
186        metadata.save(output_dir)?;
187
188        // State transition: Dumping -> Dumped
189        self.state = self.state.transition(CheckpointState::Dumped)?;
190
191        info!("Checkpoint complete: {:?}", output_dir);
192        Ok(())
193    }
194
195    /// Restore a container from checkpoint
196    ///
197    /// State transitions: None -> Restoring -> Restored (or Restoring -> None on failure)
198    pub fn restore(&mut self, input_dir: &Path) -> Result<u32> {
199        // Requires root
200        if !nix::unistd::Uid::effective().is_root() {
201            return Err(NucleusError::CheckpointError(
202                "Restore requires root (CRIU needs CAP_SYS_PTRACE)".to_string(),
203            ));
204        }
205
206        // Load and validate metadata
207        let metadata = CheckpointMetadata::load(input_dir)?;
208        info!(
209            "Restoring container {} from checkpoint (originally PID {})",
210            metadata.container_id, metadata.original_pid
211        );
212
213        let images_dir = input_dir.join("images");
214        if !images_dir.exists() {
215            return Err(NucleusError::CheckpointError(format!(
216                "Images directory not found: {:?}",
217                images_dir
218            )));
219        }
220
221        // H8: Verify checkpoint image integrity via HMAC if available
222        let hmac_path = input_dir.join("checkpoint.hmac");
223        if hmac_path.exists() {
224            info!("Verifying checkpoint HMAC integrity");
225            // HMAC is present — verify it
226            let expected = std::fs::read_to_string(&hmac_path).map_err(|e| {
227                NucleusError::CheckpointError(format!("Failed to read checkpoint HMAC: {}", e))
228            })?;
229            let expected = expected.trim();
230
231            // Compute HMAC over the metadata file
232            let metadata_path = input_dir.join("metadata.json");
233            let metadata_content = std::fs::read(&metadata_path).map_err(|e| {
234                NucleusError::CheckpointError(format!(
235                    "Failed to read checkpoint metadata for HMAC: {}",
236                    e
237                ))
238            })?;
239            let actual = crate::security::sha256_hex(&metadata_content);
240            if actual != expected {
241                return Err(NucleusError::CheckpointError(format!(
242                    "Checkpoint integrity verification failed: hash mismatch (expected {}, got {})",
243                    expected, actual
244                )));
245            }
246            info!("Checkpoint integrity verified");
247        } else {
248            tracing::warn!(
249                "No checkpoint HMAC found at {:?}; skipping integrity verification. \
250                 Consider generating HMACs during checkpoint for tamper detection.",
251                hmac_path
252            );
253        }
254
255        // State transition: None -> Restoring
256        self.state = self.state.transition(CheckpointState::Restoring)?;
257
258        // Capture the restored init PID explicitly.
259        let pidfile = Builder::new()
260            .prefix("nucleus-criu-restore-")
261            .tempfile()
262            .map_err(|e| {
263                NucleusError::CheckpointError(format!("Failed to create CRIU pidfile: {}", e))
264            })?;
265        let pidfile_path = pidfile.path().to_path_buf();
266
267        // Run criu restore
268        let output = Command::new(&self.binary_path)
269            .arg("restore")
270            .arg("--images-dir")
271            .arg(&images_dir)
272            .arg("--shell-job")
273            .arg("--pidfile")
274            .arg(&pidfile_path)
275            .output()
276            .map_err(|e| {
277                // Abort: Restoring -> None
278                self.state = self
279                    .state
280                    .transition(CheckpointState::None)
281                    .unwrap_or(self.state);
282                NucleusError::CheckpointError(format!("Failed to run criu restore: {}", e))
283            })?;
284
285        if !output.status.success() {
286            // Abort: Restoring -> None
287            self.state = self
288                .state
289                .transition(CheckpointState::None)
290                .unwrap_or(self.state);
291            let stderr = String::from_utf8_lossy(&output.stderr);
292            return Err(NucleusError::CheckpointError(format!(
293                "criu restore failed: {}",
294                stderr
295            )));
296        }
297
298        // State transition: Restoring -> Restored
299        self.state = self.state.transition(CheckpointState::Restored)?;
300
301        // Parse restored PID from pidfile, with output fallback for compatibility.
302        let pid_text = fs::read_to_string(&pidfile_path).unwrap_or_default();
303        if let Some(pid) = Self::parse_pidfile(&pid_text) {
304            info!("Restore complete, new PID: {}", pid);
305            return Ok(pid);
306        }
307
308        let stdout = String::from_utf8_lossy(&output.stdout);
309        if let Some(pid) = Self::parse_pid_text(&stdout) {
310            info!("Restore complete, new PID: {}", pid);
311            return Ok(pid);
312        }
313
314        let stderr = String::from_utf8_lossy(&output.stderr);
315        if let Some(pid) = Self::parse_pid_text(&stderr) {
316            info!("Restore complete, new PID: {}", pid);
317            return Ok(pid);
318        }
319
320        Err(NucleusError::CheckpointError(format!(
321            "Failed to parse restored PID from CRIU output (pidfile='{}', stdout='{}', stderr='{}')",
322            pid_text.trim(),
323            stdout.trim(),
324            stderr.trim()
325        )))
326    }
327
328    fn parse_pid_text(text: &str) -> Option<u32> {
329        text.split(|c: char| !c.is_ascii_digit())
330            .filter(|tok| !tok.is_empty())
331            .find_map(|tok| tok.parse::<u32>().ok())
332    }
333
334    fn parse_pidfile(text: &str) -> Option<u32> {
335        let trimmed = text.trim();
336        if trimmed.is_empty() || !trimmed.chars().all(|c| c.is_ascii_digit()) {
337            return None;
338        }
339        trimmed.parse::<u32>().ok()
340    }
341
342    fn prepare_checkpoint_dir(output_dir: &Path) -> Result<PathBuf> {
343        Self::ensure_secure_dir(output_dir, "checkpoint directory")?;
344        let images_dir = output_dir.join("images");
345        Self::ensure_secure_dir(&images_dir, "checkpoint images directory")?;
346        Ok(images_dir)
347    }
348
349    fn ensure_secure_dir(path: &Path, label: &str) -> Result<()> {
350        Self::reject_symlink_path(path, label)?;
351
352        if path.exists() {
353            if !path.is_dir() {
354                return Err(NucleusError::CheckpointError(format!(
355                    "{} {:?} is not a directory",
356                    label, path
357                )));
358            }
359        } else {
360            fs::create_dir_all(path).map_err(|e| {
361                NucleusError::CheckpointError(format!(
362                    "Failed to create {} {:?}: {}",
363                    label, path, e
364                ))
365            })?;
366        }
367
368        Self::reject_symlink_path(path, label)?;
369        fs::set_permissions(path, fs::Permissions::from_mode(0o700)).map_err(|e| {
370            NucleusError::CheckpointError(format!(
371                "Failed to set {} permissions {:?}: {}",
372                label, path, e
373            ))
374        })?;
375
376        Ok(())
377    }
378
379    fn reject_symlink_path(path: &Path, label: &str) -> Result<()> {
380        match fs::symlink_metadata(path) {
381            Ok(metadata) if metadata.file_type().is_symlink() => Err(
382                NucleusError::CheckpointError(format!("Refusing symlink {} {:?}", label, path)),
383            ),
384            Ok(_) | Err(_) => Ok(()),
385        }
386    }
387}
388
#[cfg(test)]
mod tests {
    use super::CriuRuntime;
    use std::fs;
    use std::os::unix::fs::{symlink, PermissionsExt};
    use std::path::{Path, PathBuf};
    use tempfile::TempDir;

    /// Write a stub `criu` shell script with the given mode into a fresh
    /// temporary directory. The `TempDir` is returned so the file stays alive
    /// for the duration of the test.
    fn stub_binary(mode: u32) -> (TempDir, PathBuf) {
        let tmp = TempDir::new().unwrap();
        let bin = tmp.path().join("criu");
        fs::write(&bin, "#!/bin/sh\n").unwrap();
        fs::set_permissions(&bin, fs::Permissions::from_mode(mode)).unwrap();
        (tmp, bin)
    }

    /// Lower nine permission bits of `path`.
    fn permission_bits(path: &Path) -> u32 {
        fs::metadata(path).unwrap().permissions().mode() & 0o777
    }

    #[test]
    fn test_parse_pid_text_plain() {
        let parsed = CriuRuntime::parse_pid_text("1234\n");
        assert_eq!(parsed, Some(1234));
    }

    #[test]
    fn test_parse_pid_text_embedded() {
        let parsed = CriuRuntime::parse_pid_text("restored successfully pid=5678");
        assert_eq!(parsed, Some(5678));
    }

    #[test]
    fn test_parse_pid_text_missing() {
        let parsed = CriuRuntime::parse_pid_text("no pid here");
        assert_eq!(parsed, None);
    }

    #[test]
    fn test_parse_pidfile_strict() {
        // BUG-22: parse_pid_text must prefer strict pidfile parsing
        // A pidfile should contain just a number, not extract first number from error messages
        let cases: [(&str, Option<u32>); 6] = [
            ("1234\n", Some(1234)),
            ("  5678  \n", Some(5678)),
            // Error messages should NOT parse as PIDs
            ("Error code: 255 (EPERM)", None),
            ("restored successfully pid=5678", None),
            ("", None),
            ("no pid here", None),
        ];
        for (input, expected) in cases {
            assert_eq!(CriuRuntime::parse_pidfile(input), expected);
        }
    }

    #[test]
    fn test_prepare_checkpoint_dir_rejects_symlinked_images_dir() {
        let tmp = TempDir::new().unwrap();
        let target = tmp.path().join("target");
        fs::create_dir(&target).unwrap();
        symlink(&target, tmp.path().join("images")).unwrap();

        let err = CriuRuntime::prepare_checkpoint_dir(tmp.path()).unwrap_err();
        assert!(
            err.to_string().contains("symlink"),
            "expected symlink rejection, got: {err}"
        );
    }

    #[test]
    fn test_prepare_checkpoint_dir_creates_images_subdir() {
        let tmp = TempDir::new().unwrap();
        let images = CriuRuntime::prepare_checkpoint_dir(tmp.path()).unwrap();
        assert_eq!(images, tmp.path().join("images"));
        assert!(images.is_dir());

        // Verify permissions are 0o700
        let mode = permission_bits(&images);
        assert_eq!(mode, 0o700, "images dir should be mode 700, got {:o}", mode);
    }

    #[test]
    fn test_prepare_checkpoint_dir_rejects_file_as_output_dir() {
        let tmp = TempDir::new().unwrap();
        let file_path = tmp.path().join("not-a-dir");
        fs::write(&file_path, "").unwrap();

        let err = CriuRuntime::prepare_checkpoint_dir(&file_path).unwrap_err();
        assert!(
            err.to_string().contains("not a directory"),
            "expected 'not a directory' error, got: {err}"
        );
    }

    #[test]
    fn test_prepare_checkpoint_dir_rejects_symlinked_output_dir() {
        let tmp = TempDir::new().unwrap();
        let real_dir = tmp.path().join("real");
        fs::create_dir(&real_dir).unwrap();
        let link = tmp.path().join("link");
        symlink(&real_dir, &link).unwrap();

        let err = CriuRuntime::prepare_checkpoint_dir(&link).unwrap_err();
        assert!(
            err.to_string().contains("symlink"),
            "expected symlink rejection, got: {err}"
        );
    }

    #[test]
    fn test_validate_binary_rejects_group_writable() {
        let (_tmp, bin) = stub_binary(0o775);

        let err = CriuRuntime::validate_binary(&bin).unwrap_err();
        assert!(
            err.to_string().contains("writable by group/others"),
            "expected group-writable rejection, got: {err}"
        );
    }

    #[test]
    fn test_validate_binary_rejects_world_writable() {
        let (_tmp, bin) = stub_binary(0o757);

        let err = CriuRuntime::validate_binary(&bin).unwrap_err();
        assert!(
            err.to_string().contains("writable by group/others"),
            "expected world-writable rejection, got: {err}"
        );
    }

    #[test]
    fn test_validate_binary_rejects_non_executable() {
        let (_tmp, bin) = stub_binary(0o600);

        let err = CriuRuntime::validate_binary(&bin).unwrap_err();
        assert!(
            err.to_string().contains("not executable"),
            "expected non-executable rejection, got: {err}"
        );
    }

    #[test]
    fn test_validate_binary_accepts_secure_binary() {
        let (_tmp, bin) = stub_binary(0o755);

        CriuRuntime::validate_binary(&bin).expect("should accept mode 0755");
    }

    #[test]
    fn test_validate_binary_accepts_owner_only_executable() {
        let (_tmp, bin) = stub_binary(0o700);

        CriuRuntime::validate_binary(&bin).expect("should accept mode 0700");
    }

    #[test]
    fn test_validate_binary_rejects_nonexistent() {
        let tmp = TempDir::new().unwrap();
        let missing = tmp.path().join("nonexistent");
        assert!(CriuRuntime::validate_binary(&missing).is_err());
    }

    #[test]
    fn test_checkpoint_state_transitions() {
        use crate::checkpoint::state::CheckpointState;
        use crate::error::StateTransition;

        // Valid forward transitions
        assert!(CheckpointState::None.can_transition_to(&CheckpointState::Dumping));
        assert!(CheckpointState::Dumping.can_transition_to(&CheckpointState::Dumped));
        assert!(CheckpointState::None.can_transition_to(&CheckpointState::Restoring));
        assert!(CheckpointState::Restoring.can_transition_to(&CheckpointState::Restored));

        // Valid abort transitions
        assert!(CheckpointState::Dumping.can_transition_to(&CheckpointState::None));
        assert!(CheckpointState::Restoring.can_transition_to(&CheckpointState::None));

        // Invalid transitions
        assert!(!CheckpointState::None.can_transition_to(&CheckpointState::Dumped));
        assert!(!CheckpointState::None.can_transition_to(&CheckpointState::Restored));
        assert!(!CheckpointState::Dumped.can_transition_to(&CheckpointState::Restoring));
        assert!(!CheckpointState::Restored.can_transition_to(&CheckpointState::Dumping));
    }

    #[test]
    fn test_prepare_checkpoint_dir_sets_secure_permissions() {
        let tmp = TempDir::new().unwrap();
        CriuRuntime::prepare_checkpoint_dir(tmp.path()).unwrap();

        // Both output dir and images subdir should be 0700
        let output_mode = permission_bits(tmp.path());
        let images_mode = permission_bits(&tmp.path().join("images"));
        assert_eq!(output_mode, 0o700);
        assert_eq!(images_mode, 0o700);
    }
}