Skip to main content

nucleus/security/
gvisor.rs

1use crate::error::{NucleusError, Result};
2use crate::oci::OciBundle;
3use nix::unistd::Uid;
4use std::ffi::CString;
5use std::os::unix::fs::PermissionsExt;
6use std::path::Path;
7use std::process::Command;
8use tracing::{debug, info};
9
/// Selects how a gVisor sandbox is attached to the network.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum GVisorNetworkMode {
    /// The sandbox gets no networking at all (fully isolated). This is the
    /// default choice for agent workloads.
    None,
    /// Traffic goes through gVisor's user-space network stack. Intended for
    /// networked production services that still want gVisor isolation.
    Sandbox,
    /// The sandbox shares the host network namespace. Use with caution.
    Host,
}
21
22/// Platform backend for gVisor's Sentry.
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, clap::ValueEnum)]
24pub enum GVisorPlatform {
25    /// systrap backend, the current default and most broadly compatible option.
26    #[default]
27    Systrap,
28    /// KVM-backed sandboxing for the Sentry itself.
29    Kvm,
30    /// ptrace backend for maximal compatibility where systrap/KVM are unavailable.
31    Ptrace,
32}
33
34impl GVisorPlatform {
35    pub fn as_flag(self) -> &'static str {
36        match self {
37            Self::Systrap => "systrap",
38            Self::Kvm => "kvm",
39            Self::Ptrace => "ptrace",
40        }
41    }
42}
43
/// Manager for running workloads through the gVisor `runsc` binary.
///
/// Implements the gVisor state machine from
/// NucleusSecurity_GVisor_GVisorRuntime.tla
pub struct GVisorRuntime {
    /// Path of the `runsc` binary that this runtime executes.
    runsc_path: String,
}
51
52impl GVisorRuntime {
53    /// Create a new GVisor runtime manager
54    ///
55    /// This checks for runsc binary availability
56    pub fn new() -> Result<Self> {
57        let runsc_path = Self::find_runsc()?;
58        info!("Found runsc at: {}", runsc_path);
59        Ok(Self { runsc_path })
60    }
61
62    /// Create a GVisor runtime with a pre-resolved runsc path.
63    ///
64    /// Use this when the path was resolved before privilege changes
65    /// (e.g. before entering a user namespace where UID 0 would block
66    /// PATH-based lookup).
67    pub fn with_path(runsc_path: String) -> Self {
68        Self { runsc_path }
69    }
70
71    /// Resolve the runsc path without constructing a full runtime.
72    /// Call this before fork/unshare so the path is resolved while
73    /// still unprivileged.
74    pub fn resolve_path() -> Result<String> {
75        Self::find_runsc()
76    }
77
78    /// Find the runsc binary
79    fn find_runsc() -> Result<String> {
80        // Try common locations
81        let paths = vec![
82            "/usr/local/bin/runsc",
83            "/usr/bin/runsc",
84            "/opt/gvisor/runsc",
85        ];
86
87        for path in &paths {
88            if let Some(validated) = Self::validate_runsc_path(Path::new(path))? {
89                return Ok(validated);
90            }
91        }
92
93        // For privileged execution, do not resolve runtime binaries via PATH.
94        // This avoids environment-based binary hijacking when running as root.
95        if Uid::effective().is_root() {
96            return Err(NucleusError::GVisorError(
97                "runsc binary not found in trusted system paths".to_string(),
98            ));
99        }
100
101        // Try to find in PATH without invoking a shell command.
102        if let Some(path_var) = std::env::var_os("PATH") {
103            for dir in std::env::split_paths(&path_var) {
104                let candidate = dir.join("runsc");
105                if let Some(validated) = Self::validate_runsc_path(&candidate)? {
106                    return Ok(validated);
107                }
108            }
109        }
110
111        Err(NucleusError::GVisorError(
112            "runsc binary not found. Please install gVisor.".to_string(),
113        ))
114    }
115
116    fn validate_runsc_path(path: &Path) -> Result<Option<String>> {
117        if !path.exists() {
118            return Ok(None);
119        }
120        if !path.is_file() {
121            return Ok(None);
122        }
123
124        let canonical = std::fs::canonicalize(path).map_err(|e| {
125            NucleusError::GVisorError(format!(
126                "Failed to canonicalize runsc path {:?}: {}",
127                path, e
128            ))
129        })?;
130
131        // If the candidate is a shell wrapper script (common on NixOS where
132        // nix wraps binaries to inject PATH), look for the real ELF binary
133        // next to it.  runsc's gofer subprocess re-execs via /proc/self/exe,
134        // which must point to the real binary – not a bash wrapper.
135        let resolved = Self::unwrap_nix_wrapper(&canonical).unwrap_or_else(|| canonical.clone());
136
137        let metadata = std::fs::metadata(&resolved).map_err(|e| {
138            NucleusError::GVisorError(format!("Failed to stat runsc path {:?}: {}", resolved, e))
139        })?;
140
141        let mode = metadata.permissions().mode();
142        if mode & 0o022 != 0 {
143            return Err(NucleusError::GVisorError(format!(
144                "Refusing insecure runsc binary permissions at {:?} (mode {:o})",
145                resolved, mode
146            )));
147        }
148        if mode & 0o111 == 0 {
149            return Ok(None);
150        }
151
152        // Reject binaries owned by other non-root users – a malicious user
153        // could place a trojan runsc earlier in PATH.
154        use std::os::unix::fs::MetadataExt;
155        let owner = metadata.uid();
156        let current_uid = nix::unistd::Uid::effective().as_raw();
157        if !Self::is_trusted_runsc_owner(&resolved, owner, current_uid) {
158            return Err(NucleusError::GVisorError(format!(
159                "Refusing runsc binary at {:?} owned by uid {} (expected root, current user {}, or immutable /nix/store artifact)",
160                resolved, owner, current_uid
161            )));
162        }
163
164        Ok(Some(resolved.to_string_lossy().to_string()))
165    }
166
167    fn is_trusted_runsc_owner(path: &Path, owner: u32, current_uid: u32) -> bool {
168        if owner == 0 || owner == current_uid {
169            return true;
170        }
171
172        // Nix store artifacts are immutable content-addressed paths and are
173        // commonly owned by `nobody` rather than root/current user.
174        // Extra hardening: verify the binary is not writable by *anyone* and
175        // the parent directory is also not writable, to guard against a
176        // compromised or mutable store.
177        if path.starts_with("/nix/store") {
178            if let Ok(meta) = std::fs::metadata(path) {
179                let mode = meta.permissions().mode();
180                // Reject if owner-writable (group/other already checked by caller)
181                if mode & 0o200 != 0 {
182                    return false;
183                }
184            } else {
185                return false;
186            }
187            // Verify the immediate parent directory is not writable
188            if let Some(parent) = path.parent() {
189                if let Ok(parent_meta) = std::fs::metadata(parent) {
190                    let parent_mode = parent_meta.permissions().mode();
191                    if parent_mode & 0o222 != 0 {
192                        return false;
193                    }
194                } else {
195                    return false;
196                }
197            }
198            return true;
199        }
200
201        false
202    }
203
204    /// If `path` is a Nix wrapper script, extract the real binary path.
205    ///
206    /// Nix wrapper scripts end with a line like:
207    ///   exec -a "$0" "/nix/store/…/.runsc-wrapped"  "$@"
208    /// We parse that to find the actual ELF binary.
209    fn unwrap_nix_wrapper(path: &Path) -> Option<std::path::PathBuf> {
210        let content = std::fs::read_to_string(path).ok()?;
211        // Only process short scripts (wrapper scripts are small)
212        if content.len() > 4096 || !content.starts_with("#!") {
213            return None;
214        }
215        // Look for the exec line that references the wrapped binary
216        for line in content.lines().rev() {
217            let trimmed = line.trim();
218            if trimmed.starts_with("exec ") {
219                // Parse: exec -a "$0" "/nix/store/.../bin/.runsc-wrapped"  "$@"
220                // or:    exec "/nix/store/.../bin/.runsc-wrapped"  "$@"
221                for token in trimmed.split_whitespace() {
222                    let unquoted = token.trim_matches('"');
223                    if unquoted.starts_with('/') && unquoted.contains("runsc") {
224                        let candidate = std::path::PathBuf::from(unquoted);
225                        if candidate.exists() && candidate.is_file() {
226                            debug!("Resolved Nix wrapper {:?} → {:?}", path, candidate);
227                            return Some(candidate);
228                        }
229                    }
230                }
231            }
232        }
233        None
234    }
235
236    /// Execute using gVisor with an OCI bundle
237    ///
238    /// This is the OCI-compliant way to run containers with gVisor.
239    /// The `network_mode` parameter controls gVisor's --network flag:
240    /// - `GVisorNetworkMode::None` → `--network none` (fully isolated, original behavior)
241    /// - `GVisorNetworkMode::Sandbox` → `--network sandbox` (gVisor user-space network stack)
242    /// - `GVisorNetworkMode::Host` → `--network host` (share host network namespace)
243    pub fn exec_with_oci_bundle(&self, container_id: &str, bundle: &OciBundle) -> Result<()> {
244        self.exec_with_oci_bundle_network(
245            container_id,
246            bundle,
247            GVisorNetworkMode::None,
248            false,
249            GVisorPlatform::Systrap,
250        )
251    }
252
253    /// Execute using gVisor with an OCI bundle and explicit network mode.
254    ///
255    /// When `rootless` is true, the OCI spec is expected to carry explicit
256    /// user namespace mappings. In that mode we do not pass runsc's CLI
257    /// `--rootless` flag, because gVisor documents that flag as the
258    /// `runsc do`-oriented path rather than the OCI `run` path. We still skip runsc's
259    /// internal cgroup configuration because Nucleus already manages cgroups
260    /// externally and unprivileged callers cannot configure them directly.
261    pub fn exec_with_oci_bundle_network(
262        &self,
263        container_id: &str,
264        bundle: &OciBundle,
265        network_mode: GVisorNetworkMode,
266        rootless: bool,
267        platform: GVisorPlatform,
268    ) -> Result<()> {
269        info!(
270            "Executing with gVisor using OCI bundle at {:?} (network: {:?}, platform: {:?})",
271            bundle.bundle_path(),
272            network_mode,
273            platform,
274        );
275
276        let network_flag = match network_mode {
277            GVisorNetworkMode::None => "none",
278            GVisorNetworkMode::Sandbox => "sandbox",
279            GVisorNetworkMode::Host => "host",
280        };
281
282        // Create a per-container root directory for runsc state.
283        // By default runsc uses /var/run/runsc which requires root privileges.
284        // We place it next to the OCI bundle so it is cleaned up together.
285        let runsc_root = bundle
286            .bundle_path()
287            .parent()
288            .unwrap_or(bundle.bundle_path())
289            .join("runsc-root");
290        std::fs::create_dir_all(&runsc_root).map_err(|e| {
291            NucleusError::GVisorError(format!("Failed to create runsc root directory: {}", e))
292        })?;
293        std::fs::set_permissions(&runsc_root, std::fs::Permissions::from_mode(0o700)).map_err(
294            |e| {
295                NucleusError::GVisorError(format!(
296                    "Failed to secure runsc root directory permissions: {}",
297                    e
298                ))
299            },
300        )?;
301
302        let runsc_runtime_dir = runsc_root.join("runtime");
303        std::fs::create_dir_all(&runsc_runtime_dir).map_err(|e| {
304            NucleusError::GVisorError(format!("Failed to create runsc runtime directory: {}", e))
305        })?;
306        std::fs::set_permissions(&runsc_runtime_dir, std::fs::Permissions::from_mode(0o700))
307            .map_err(|e| {
308                NucleusError::GVisorError(format!(
309                    "Failed to secure runsc runtime directory permissions: {}",
310                    e
311                ))
312            })?;
313
314        // Build runsc command with OCI bundle.
315        // Global flags (--root, --network, --platform) must come BEFORE the subcommand.
316        // runsc --root <dir> --network <mode> --platform <plat> run --bundle <path> <id>
317        let args = self.build_oci_run_args(
318            container_id,
319            bundle,
320            &runsc_root,
321            network_flag,
322            rootless,
323            platform,
324        );
325
326        debug!("runsc OCI args: {:?}", args);
327
328        // Convert to CStrings for exec
329        let program = CString::new(self.runsc_path.as_str())
330            .map_err(|e| NucleusError::GVisorError(format!("Invalid runsc path: {}", e)))?;
331
332        let c_args: Result<Vec<CString>> = args
333            .iter()
334            .map(|arg| {
335                CString::new(arg.as_str())
336                    .map_err(|e| NucleusError::GVisorError(format!("Invalid argument: {}", e)))
337            })
338            .collect();
339        let c_args = c_args?;
340
341        let c_env = self.exec_environment(&runsc_runtime_dir)?;
342
343        // Defense-in-depth: even though gVisor provides its own sandboxing,
344        // apply PR_SET_NO_NEW_PRIVS so the runsc process (and anything it
345        // spawns) cannot gain privileges via setuid/setgid binaries.
346        //
347        // PR_SET_NO_NEW_PRIVS only affects the calling thread. Verify we are
348        // single-threaded so no sibling thread can race to exec a setuid binary.
349        let thread_count = std::fs::read_to_string("/proc/self/status")
350            .ok()
351            .and_then(|s| {
352                s.lines()
353                    .find(|l| l.starts_with("Threads:"))
354                    .and_then(|l| l.split_whitespace().nth(1))
355                    .and_then(|n| n.parse::<u32>().ok())
356            });
357        if thread_count != Some(1) {
358            return Err(NucleusError::GVisorError(format!(
359                "PR_SET_NO_NEW_PRIVS requires single-threaded process, found {:?} threads",
360                thread_count
361            )));
362        }
363        // SAFETY: PR_SET_NO_NEW_PRIVS with arg 1 is always safe; we verified
364        // single-threaded above so no other thread can race this prctl.
365        let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
366        if ret != 0 {
367            return Err(NucleusError::GVisorError(format!(
368                "Failed to set PR_SET_NO_NEW_PRIVS before gVisor exec: {}",
369                std::io::Error::last_os_error()
370            )));
371        }
372        info!("PR_SET_NO_NEW_PRIVS applied before gVisor exec (defense-in-depth)");
373
374        // execve - this replaces the current process with runsc
375        nix::unistd::execve::<std::ffi::CString, std::ffi::CString>(&program, &c_args, &c_env)?;
376
377        // Should never reach here
378        Ok(())
379    }
380
381    /// Check if gVisor is available on this system
382    pub fn is_available() -> bool {
383        Self::find_runsc().is_ok()
384    }
385
386    /// Get runsc version
387    pub fn version(&self) -> Result<String> {
388        let output = Command::new(&self.runsc_path)
389            .arg("--version")
390            .output()
391            .map_err(|e| NucleusError::GVisorError(format!("Failed to get version: {}", e)))?;
392
393        if !output.status.success() {
394            return Err(NucleusError::GVisorError(
395                "Failed to get runsc version".to_string(),
396            ));
397        }
398
399        let version = String::from_utf8_lossy(&output.stdout).to_string();
400        Ok(version.trim().to_string())
401    }
402
403    fn exec_environment(&self, runtime_dir: &Path) -> Result<Vec<CString>> {
404        let mut env = Vec::new();
405        let mut push = |key: &str, value: String| -> Result<()> {
406            env.push(
407                CString::new(format!("{}={}", key, value))
408                    .map_err(|e| NucleusError::GVisorError(format!("Invalid {}: {}", key, e)))?,
409            );
410            Ok(())
411        };
412
413        // Use a hardcoded PATH for the runsc supervisor process to prevent
414        // host PATH from leaking into the gVisor environment.
415        push(
416            "PATH",
417            "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
418        )?;
419        let runtime_dir = runtime_dir.to_string_lossy().to_string();
420        push("TMPDIR", runtime_dir.clone())?;
421        push("XDG_RUNTIME_DIR", runtime_dir)?;
422
423        // Hardcode safe values instead of leaking host identity/paths.
424        // HOME could point to an attacker-controlled directory; USER/LOGNAME
425        // leak host identity information – none of which gVisor needs.
426        push("HOME", "/root".to_string())?;
427        push("USER", "root".to_string())?;
428        push("LOGNAME", "root".to_string())?;
429
430        Ok(env)
431    }
432
433    fn build_oci_run_args(
434        &self,
435        container_id: &str,
436        bundle: &OciBundle,
437        runsc_root: &Path,
438        network_flag: &str,
439        rootless: bool,
440        platform: GVisorPlatform,
441    ) -> Vec<String> {
442        let mut args = vec![
443            self.runsc_path.clone(),
444            "--root".to_string(),
445            runsc_root.to_string_lossy().to_string(),
446        ];
447
448        // Rootless OCI mode relies on user namespace mappings in config.json.
449        // We intentionally do not pass runsc's CLI `--rootless` flag here.
450        if rootless {
451            args.push("--ignore-cgroups".to_string());
452        }
453
454        args.extend([
455            "--network".to_string(),
456            network_flag.to_string(),
457            "--platform".to_string(),
458            platform.as_flag().to_string(),
459            "run".to_string(),
460            "--bundle".to_string(),
461            bundle.bundle_path().to_string_lossy().to_string(),
462            container_id.to_string(),
463        ]);
464
465        args
466    }
467}
468
469#[cfg(test)]
470mod tests {
471    use super::*;
472    use std::path::Path;
473
474    #[test]
475    fn test_gvisor_availability() {
476        // This test just checks if we can determine availability
477        // It may pass or fail depending on whether gVisor is installed
478        let available = GVisorRuntime::is_available();
479        println!("gVisor available: {}", available);
480    }
481
482    #[test]
483    fn test_gvisor_new() {
484        let runtime = GVisorRuntime::new();
485        if let Ok(rt) = runtime {
486            println!("Found runsc at: {}", rt.runsc_path);
487            if let Ok(version) = rt.version() {
488                println!("runsc version: {}", version);
489            }
490        }
491    }
492
493    #[test]
494    fn test_find_runsc() {
495        // Test that find_runsc either succeeds or returns appropriate error
496        match GVisorRuntime::find_runsc() {
497            Ok(path) => {
498                println!("Found runsc at: {}", path);
499                assert!(!path.is_empty());
500            }
501            Err(e) => {
502                println!("runsc not found (expected if gVisor not installed): {}", e);
503            }
504        }
505    }
506
507    #[test]
508    fn test_validate_runsc_rejects_world_writable() {
509        let dir = tempfile::tempdir().unwrap();
510        let fake_runsc = dir.path().join("runsc");
511        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
512        // Make world-writable
513        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o777)).unwrap();
514
515        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
516        assert!(
517            result.is_err(),
518            "validate_runsc_path must reject world-writable binaries"
519        );
520    }
521
522    #[test]
523    fn test_validate_runsc_rejects_group_writable() {
524        let dir = tempfile::tempdir().unwrap();
525        let fake_runsc = dir.path().join("runsc");
526        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
527        // Make group-writable
528        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(0o775)).unwrap();
529
530        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
531        assert!(
532            result.is_err(),
533            "validate_runsc_path must reject group-writable binaries"
534        );
535    }
536
537    #[test]
538    fn test_runsc_owner_accepts_nix_store_artifact_owner() {
539        // Use a real Nix store binary so the metadata/permission checks pass.
540        // The /nix/store contents are read-only and content-addressed, so any
541        // existing file with mode 555 works.
542        let nix_binary = std::fs::read_dir("/nix/store")
543            .ok()
544            .and_then(|mut entries| {
545                entries.find_map(|e| {
546                    let dir = e.ok()?.path();
547                    let candidate = dir.join("bin/runsc");
548                    if candidate.exists() {
549                        Some(candidate)
550                    } else {
551                        None
552                    }
553                })
554            });
555
556        let path = match nix_binary {
557            Some(p) => p,
558            None => {
559                eprintln!("skipping: no runsc binary found in /nix/store");
560                return;
561            }
562        };
563
564        assert!(GVisorRuntime::is_trusted_runsc_owner(&path, 65534, 1000));
565    }
566
567    #[test]
568    fn test_exec_environment_uses_hardcoded_path() {
569        // The gVisor supervisor must NOT inherit the host PATH, to prevent
570        // host filesystem layout leaking into the container environment.
571        // Verify by setting a distinctive PATH and checking exec_environment
572        // returns a hardcoded value instead.
573        std::env::set_var("PATH", "/tmp/evil-inject/bin:/opt/attacker/sbin");
574        let rt = GVisorRuntime::with_path("/fake/runsc".to_string());
575        let tmp = tempfile::tempdir().unwrap();
576        let env = rt.exec_environment(tmp.path()).unwrap();
577        let path_entry = env
578            .iter()
579            .find(|e| e.to_str().is_ok_and(|s| s.starts_with("PATH=")))
580            .expect("exec_environment must set PATH");
581        let path_val = path_entry.to_str().unwrap();
582        assert!(
583            !path_val.contains("evil-inject") && !path_val.contains("attacker"),
584            "exec_environment must use hardcoded PATH, not host PATH. Got: {}",
585            path_val
586        );
587        assert_eq!(
588            path_val, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
589            "exec_environment PATH must be the standard hardcoded value"
590        );
591    }
592
593    #[test]
594    fn test_runsc_owner_rejects_untrusted_non_store_owner() {
595        assert!(!GVisorRuntime::is_trusted_runsc_owner(
596            Path::new("/tmp/runsc"),
597            4242,
598            1000
599        ));
600    }
601}