Skip to main content

nucleus/security/
gvisor.rs

1use crate::error::{NucleusError, Result};
2use crate::oci::OciBundle;
3use nix::unistd::Uid;
4use std::ffi::CString;
5use std::os::unix::fs::PermissionsExt;
6use std::path::Path;
7use std::process::Command;
8use tracing::{debug, info};
9
/// Selects how a gVisor sandbox is attached to the network.
///
/// Each variant corresponds to one value of runsc's `--network` flag.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum GVisorNetworkMode {
    /// Fully isolated: the sandbox gets no networking at all.
    /// This is the default for agent workloads.
    None,
    /// Networking through gVisor's user-space network stack. Intended for
    /// networked production services that still need gVisor isolation.
    Sandbox,
    /// Shares the host's network namespace. Use with caution.
    Host,
}
21
22/// Platform backend for gVisor's Sentry.
23#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, clap::ValueEnum)]
24pub enum GVisorPlatform {
25    /// systrap backend, the current default and most broadly compatible option.
26    #[default]
27    Systrap,
28    /// KVM-backed sandboxing for the Sentry itself.
29    Kvm,
30    /// ptrace backend for maximal compatibility where systrap/KVM are unavailable.
31    Ptrace,
32}
33
34impl GVisorPlatform {
35    pub fn as_flag(self) -> &'static str {
36        match self {
37            Self::Systrap => "systrap",
38            Self::Kvm => "kvm",
39            Self::Ptrace => "ptrace",
40        }
41    }
42}
43
/// Manager for launching workloads under gVisor's `runsc`.
///
/// Implements the gVisor state machine specified in
/// NucleusSecurity_GVisor_GVisorRuntime.tla
pub struct GVisorRuntime {
    /// Path of the `runsc` executable this runtime invokes.
    runsc_path: String,
}
51
52impl GVisorRuntime {
53    /// Create a new GVisor runtime manager
54    ///
55    /// This checks for runsc binary availability
56    pub fn new() -> Result<Self> {
57        let runsc_path = Self::find_runsc()?;
58        info!("Found runsc at: {}", runsc_path);
59        Ok(Self { runsc_path })
60    }
61
62    /// Create a GVisor runtime with a pre-resolved runsc path.
63    ///
64    /// Use this when the path was resolved before privilege changes
65    /// (e.g. before entering a user namespace where UID 0 would block
66    /// PATH-based lookup).
67    pub fn with_path(runsc_path: String) -> Self {
68        Self { runsc_path }
69    }
70
71    /// Resolve the runsc path without constructing a full runtime.
72    /// Call this before fork/unshare so the path is resolved while
73    /// still unprivileged.
74    pub fn resolve_path() -> Result<String> {
75        Self::find_runsc()
76    }
77
78    /// Find the runsc binary
79    fn find_runsc() -> Result<String> {
80        // Try common locations
81        let paths = vec![
82            "/usr/local/bin/runsc",
83            "/usr/bin/runsc",
84            "/opt/gvisor/runsc",
85        ];
86
87        for path in &paths {
88            if let Some(validated) = Self::validate_runsc_path(Path::new(path))? {
89                return Ok(validated);
90            }
91        }
92
93        // For privileged execution, do not resolve runtime binaries via PATH.
94        // This avoids environment-based binary hijacking when running as root.
95        if Uid::effective().is_root() {
96            return Err(NucleusError::GVisorError(
97                "runsc binary not found in trusted system paths".to_string(),
98            ));
99        }
100
101        // Try to find in PATH without invoking a shell command.
102        if let Some(path_var) = std::env::var_os("PATH") {
103            for dir in std::env::split_paths(&path_var) {
104                let candidate = dir.join("runsc");
105                if let Some(validated) = Self::validate_runsc_path(&candidate)? {
106                    return Ok(validated);
107                }
108            }
109        }
110
111        Err(NucleusError::GVisorError(
112            "runsc binary not found. Please install gVisor.".to_string(),
113        ))
114    }
115
116    fn validate_runsc_path(path: &Path) -> Result<Option<String>> {
117        if !path.exists() {
118            return Ok(None);
119        }
120        if !path.is_file() {
121            return Ok(None);
122        }
123
124        let canonical = std::fs::canonicalize(path).map_err(|e| {
125            NucleusError::GVisorError(format!(
126                "Failed to canonicalize runsc path {:?}: {}",
127                path, e
128            ))
129        })?;
130
131        // If the candidate is a shell wrapper script (common on NixOS where
132        // nix wraps binaries to inject PATH), look for the real ELF binary
133        // next to it.  runsc's gofer subprocess re-execs via /proc/self/exe,
134        // which must point to the real binary — not a bash wrapper.
135        let resolved = Self::unwrap_nix_wrapper(&canonical).unwrap_or_else(|| canonical.clone());
136
137        let metadata = std::fs::metadata(&resolved).map_err(|e| {
138            NucleusError::GVisorError(format!("Failed to stat runsc path {:?}: {}", resolved, e))
139        })?;
140
141        let mode = metadata.permissions().mode();
142        if mode & 0o022 != 0 {
143            return Err(NucleusError::GVisorError(format!(
144                "Refusing insecure runsc binary permissions at {:?} (mode {:o})",
145                resolved, mode
146            )));
147        }
148        if mode & 0o111 == 0 {
149            return Ok(None);
150        }
151
152        // Reject binaries owned by other non-root users — a malicious user
153        // could place a trojan runsc earlier in PATH.
154        use std::os::unix::fs::MetadataExt;
155        let owner = metadata.uid();
156        let current_uid = nix::unistd::Uid::effective().as_raw();
157        if !Self::is_trusted_runsc_owner(&resolved, owner, current_uid) {
158            return Err(NucleusError::GVisorError(format!(
159                "Refusing runsc binary at {:?} owned by uid {} (expected root, current user {}, or immutable /nix/store artifact)",
160                resolved, owner, current_uid
161            )));
162        }
163
164        Ok(Some(resolved.to_string_lossy().to_string()))
165    }
166
167    fn is_trusted_runsc_owner(path: &Path, owner: u32, current_uid: u32) -> bool {
168        if owner == 0 || owner == current_uid {
169            return true;
170        }
171
172        // Nix store artifacts are immutable content-addressed paths and are
173        // commonly owned by `nobody` rather than root/current user.
174        // Extra hardening: verify the binary is not writable by *anyone* and
175        // the parent directory is also not writable, to guard against a
176        // compromised or mutable store.
177        if path.starts_with("/nix/store") {
178            if let Ok(meta) = std::fs::metadata(path) {
179                let mode = meta.permissions().mode();
180                // Reject if owner-writable (group/other already checked by caller)
181                if mode & 0o200 != 0 {
182                    return false;
183                }
184            } else {
185                return false;
186            }
187            // Verify the immediate parent directory is not writable
188            if let Some(parent) = path.parent() {
189                if let Ok(parent_meta) = std::fs::metadata(parent) {
190                    let parent_mode = parent_meta.permissions().mode();
191                    if parent_mode & 0o222 != 0 {
192                        return false;
193                    }
194                } else {
195                    return false;
196                }
197            }
198            return true;
199        }
200
201        false
202    }
203
204    /// If `path` is a Nix wrapper script, extract the real binary path.
205    ///
206    /// Nix wrapper scripts end with a line like:
207    ///   exec -a "$0" "/nix/store/…/.runsc-wrapped"  "$@"
208    /// We parse that to find the actual ELF binary.
209    fn unwrap_nix_wrapper(path: &Path) -> Option<std::path::PathBuf> {
210        let content = std::fs::read_to_string(path).ok()?;
211        // Only process short scripts (wrapper scripts are small)
212        if content.len() > 4096 || !content.starts_with("#!") {
213            return None;
214        }
215        // Look for the exec line that references the wrapped binary
216        for line in content.lines().rev() {
217            let trimmed = line.trim();
218            if trimmed.starts_with("exec ") {
219                // Parse: exec -a "$0" "/nix/store/.../bin/.runsc-wrapped"  "$@"
220                // or:    exec "/nix/store/.../bin/.runsc-wrapped"  "$@"
221                for token in trimmed.split_whitespace() {
222                    let unquoted = token.trim_matches('"');
223                    if unquoted.starts_with('/') && unquoted.contains("runsc") {
224                        let candidate = std::path::PathBuf::from(unquoted);
225                        if candidate.exists() && candidate.is_file() {
226                            debug!("Resolved Nix wrapper {:?} → {:?}", path, candidate);
227                            return Some(candidate);
228                        }
229                    }
230                }
231            }
232        }
233        None
234    }
235
236    /// Execute using gVisor with an OCI bundle
237    ///
238    /// This is the OCI-compliant way to run containers with gVisor.
239    /// The `network_mode` parameter controls gVisor's --network flag:
240    /// - `GVisorNetworkMode::None` → `--network none` (fully isolated, original behavior)
241    /// - `GVisorNetworkMode::Sandbox` → `--network sandbox` (gVisor user-space network stack)
242    /// - `GVisorNetworkMode::Host` → `--network host` (share host network namespace)
243    pub fn exec_with_oci_bundle(&self, container_id: &str, bundle: &OciBundle) -> Result<()> {
244        self.exec_with_oci_bundle_network(
245            container_id,
246            bundle,
247            GVisorNetworkMode::None,
248            false,
249            GVisorPlatform::Systrap,
250        )
251    }
252
253    /// Execute using gVisor with an OCI bundle and explicit network mode.
254    ///
255    /// When `rootless` is true, the OCI spec is expected to carry explicit
256    /// user namespace mappings. In that mode we do not pass runsc's CLI
257    /// `--rootless` flag, because gVisor documents that flag as the
258    /// `runsc do`-oriented path rather than the OCI `run` path. We still skip runsc's
259    /// internal cgroup configuration because Nucleus already manages cgroups
260    /// externally and unprivileged callers cannot configure them directly.
261    pub fn exec_with_oci_bundle_network(
262        &self,
263        container_id: &str,
264        bundle: &OciBundle,
265        network_mode: GVisorNetworkMode,
266        rootless: bool,
267        platform: GVisorPlatform,
268    ) -> Result<()> {
269        info!(
270            "Executing with gVisor using OCI bundle at {:?} (network: {:?}, platform: {:?})",
271            bundle.bundle_path(),
272            network_mode,
273            platform,
274        );
275
276        let network_flag = match network_mode {
277            GVisorNetworkMode::None => "none",
278            GVisorNetworkMode::Sandbox => "sandbox",
279            GVisorNetworkMode::Host => "host",
280        };
281
282        // Create a per-container root directory for runsc state.
283        // By default runsc uses /var/run/runsc which requires root privileges.
284        // We place it next to the OCI bundle so it is cleaned up together.
285        let runsc_root = bundle
286            .bundle_path()
287            .parent()
288            .unwrap_or(bundle.bundle_path())
289            .join("runsc-root");
290        std::fs::create_dir_all(&runsc_root).map_err(|e| {
291            NucleusError::GVisorError(format!("Failed to create runsc root directory: {}", e))
292        })?;
293        std::fs::set_permissions(&runsc_root, std::fs::Permissions::from_mode(0o700)).map_err(
294            |e| {
295                NucleusError::GVisorError(format!(
296                    "Failed to secure runsc root directory permissions: {}",
297                    e
298                ))
299            },
300        )?;
301
302        let runsc_runtime_dir = runsc_root.join("runtime");
303        std::fs::create_dir_all(&runsc_runtime_dir).map_err(|e| {
304            NucleusError::GVisorError(format!("Failed to create runsc runtime directory: {}", e))
305        })?;
306        std::fs::set_permissions(&runsc_runtime_dir, std::fs::Permissions::from_mode(0o700))
307            .map_err(|e| {
308                NucleusError::GVisorError(format!(
309                    "Failed to secure runsc runtime directory permissions: {}",
310                    e
311                ))
312            })?;
313
314        // Build runsc command with OCI bundle.
315        // Global flags (--root, --network, --platform) must come BEFORE the subcommand.
316        // runsc --root <dir> --network <mode> --platform <plat> run --bundle <path> <id>
317        let mut args = vec![
318            self.runsc_path.clone(),
319            "--root".to_string(),
320            runsc_root.to_string_lossy().to_string(),
321        ];
322
323        // Rootless OCI mode relies on user namespace mappings in config.json.
324        // We intentionally do not pass runsc's CLI `--rootless` flag here.
325        if rootless {
326            args.push("--ignore-cgroups".to_string());
327        }
328
329        args.extend([
330            "--network".to_string(),
331            network_flag.to_string(),
332            "--platform".to_string(),
333            platform.as_flag().to_string(),
334            "run".to_string(),
335            "--bundle".to_string(),
336            bundle.bundle_path().to_string_lossy().to_string(),
337            container_id.to_string(),
338        ]);
339
340        debug!("runsc OCI args: {:?}", args);
341
342        // Convert to CStrings for exec
343        let program = CString::new(self.runsc_path.as_str())
344            .map_err(|e| NucleusError::GVisorError(format!("Invalid runsc path: {}", e)))?;
345
346        let c_args: Result<Vec<CString>> = args
347            .iter()
348            .map(|arg| {
349                CString::new(arg.as_str())
350                    .map_err(|e| NucleusError::GVisorError(format!("Invalid argument: {}", e)))
351            })
352            .collect();
353        let c_args = c_args?;
354
355        let c_env = self.exec_environment(&runsc_runtime_dir)?;
356
357        // Defense-in-depth: even though gVisor provides its own sandboxing,
358        // apply PR_SET_NO_NEW_PRIVS so the runsc process (and anything it
359        // spawns) cannot gain privileges via setuid/setgid binaries.
360        //
361        // PR_SET_NO_NEW_PRIVS only affects the calling thread. Verify we are
362        // single-threaded so no sibling thread can race to exec a setuid binary.
363        let thread_count = std::fs::read_to_string("/proc/self/status")
364            .ok()
365            .and_then(|s| {
366                s.lines()
367                    .find(|l| l.starts_with("Threads:"))
368                    .and_then(|l| l.split_whitespace().nth(1))
369                    .and_then(|n| n.parse::<u32>().ok())
370            });
371        if thread_count != Some(1) {
372            return Err(NucleusError::GVisorError(format!(
373                "PR_SET_NO_NEW_PRIVS requires single-threaded process, found {:?} threads",
374                thread_count
375            )));
376        }
377        let ret = unsafe { libc::prctl(libc::PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) };
378        if ret != 0 {
379            return Err(NucleusError::GVisorError(format!(
380                "Failed to set PR_SET_NO_NEW_PRIVS before gVisor exec: {}",
381                std::io::Error::last_os_error()
382            )));
383        }
384        info!("PR_SET_NO_NEW_PRIVS applied before gVisor exec (defense-in-depth)");
385
386        // execve - this replaces the current process with runsc
387        nix::unistd::execve::<std::ffi::CString, std::ffi::CString>(&program, &c_args, &c_env)?;
388
389        // Should never reach here
390        Ok(())
391    }
392
393    /// Check if gVisor is available on this system
394    pub fn is_available() -> bool {
395        Self::find_runsc().is_ok()
396    }
397
398    /// Get runsc version
399    pub fn version(&self) -> Result<String> {
400        let output = Command::new(&self.runsc_path)
401            .arg("--version")
402            .output()
403            .map_err(|e| NucleusError::GVisorError(format!("Failed to get version: {}", e)))?;
404
405        if !output.status.success() {
406            return Err(NucleusError::GVisorError(
407                "Failed to get runsc version".to_string(),
408            ));
409        }
410
411        let version = String::from_utf8_lossy(&output.stdout).to_string();
412        Ok(version.trim().to_string())
413    }
414
415    fn exec_environment(&self, runtime_dir: &Path) -> Result<Vec<CString>> {
416        let mut env = Vec::new();
417        let mut push = |key: &str, value: String| -> Result<()> {
418            env.push(
419                CString::new(format!("{}={}", key, value))
420                    .map_err(|e| NucleusError::GVisorError(format!("Invalid {}: {}", key, e)))?,
421            );
422            Ok(())
423        };
424
425        // Use a hardcoded PATH for the runsc supervisor process to prevent
426        // host PATH from leaking into the gVisor environment.
427        push(
428            "PATH",
429            "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin".to_string(),
430        )?;
431        let runtime_dir = runtime_dir.to_string_lossy().to_string();
432        push("TMPDIR", runtime_dir.clone())?;
433        push("XDG_RUNTIME_DIR", runtime_dir)?;
434
435        // Hardcode safe values instead of leaking host identity/paths.
436        // HOME could point to an attacker-controlled directory; USER/LOGNAME
437        // leak host identity information — none of which gVisor needs.
438        push("HOME", "/root".to_string())?;
439        push("USER", "root".to_string())?;
440        push("LOGNAME", "root".to_string())?;
441
442        Ok(env)
443    }
444}
445
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::Path;

    /// Restores the process-wide `PATH` to its captured value when dropped,
    /// so a test that overrides `PATH` does not leak the override into tests
    /// that run later in the same process — even if an assertion panics.
    struct PathGuard(Option<std::ffi::OsString>);

    impl PathGuard {
        /// Capture the current `PATH` (or its absence).
        fn capture() -> Self {
            Self(std::env::var_os("PATH"))
        }
    }

    impl Drop for PathGuard {
        fn drop(&mut self) {
            match self.0.take() {
                Some(original) => std::env::set_var("PATH", original),
                None => std::env::remove_var("PATH"),
            }
        }
    }

    /// Write a fake `runsc` script with the given mode into a fresh temp dir.
    /// The returned `TempDir` must be kept alive for as long as the path is used.
    fn fake_runsc_with_mode(mode: u32) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().unwrap();
        let fake_runsc = dir.path().join("runsc");
        std::fs::write(&fake_runsc, "#!/bin/sh\necho fake").unwrap();
        std::fs::set_permissions(&fake_runsc, std::fs::Permissions::from_mode(mode)).unwrap();
        (dir, fake_runsc)
    }

    #[test]
    fn test_gvisor_availability() {
        // This test just checks if we can determine availability
        // It may pass or fail depending on whether gVisor is installed
        let available = GVisorRuntime::is_available();
        println!("gVisor available: {}", available);
    }

    #[test]
    fn test_gvisor_new() {
        // Only exercised further when gVisor is actually installed.
        if let Ok(rt) = GVisorRuntime::new() {
            println!("Found runsc at: {}", rt.runsc_path);
            if let Ok(version) = rt.version() {
                println!("runsc version: {}", version);
            }
        }
    }

    #[test]
    fn test_find_runsc() {
        // Test that find_runsc either succeeds or returns appropriate error
        match GVisorRuntime::find_runsc() {
            Ok(path) => {
                println!("Found runsc at: {}", path);
                assert!(!path.is_empty());
            }
            Err(e) => {
                println!("runsc not found (expected if gVisor not installed): {}", e);
            }
        }
    }

    #[test]
    fn test_validate_runsc_rejects_world_writable() {
        let (_dir, fake_runsc) = fake_runsc_with_mode(0o777);

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject world-writable binaries"
        );
    }

    #[test]
    fn test_validate_runsc_rejects_group_writable() {
        let (_dir, fake_runsc) = fake_runsc_with_mode(0o775);

        let result = GVisorRuntime::validate_runsc_path(&fake_runsc);
        assert!(
            result.is_err(),
            "validate_runsc_path must reject group-writable binaries"
        );
    }

    #[test]
    fn test_runsc_owner_accepts_nix_store_artifact_owner() {
        // Use a real Nix store binary so the metadata/permission checks pass.
        // The /nix/store contents are read-only and content-addressed, so any
        // existing file with mode 555 works.
        let nix_binary = std::fs::read_dir("/nix/store")
            .ok()
            .and_then(|mut entries| {
                entries.find_map(|entry| {
                    let candidate = entry.ok()?.path().join("bin/runsc");
                    candidate.exists().then_some(candidate)
                })
            });

        let Some(path) = nix_binary else {
            eprintln!("skipping: no runsc binary found in /nix/store");
            return;
        };

        assert!(GVisorRuntime::is_trusted_runsc_owner(&path, 65534, 1000));
    }

    #[test]
    fn test_exec_environment_uses_hardcoded_path() {
        // The gVisor supervisor must NOT inherit the host PATH, to prevent
        // host filesystem layout leaking into the container environment.
        // Verify by setting a distinctive PATH and checking exec_environment
        // returns a hardcoded value instead.
        //
        // The guard puts the real PATH back when this test ends (pass or
        // fail) so the hostile value does not bleed into other tests.
        let _restore_path = PathGuard::capture();
        std::env::set_var("PATH", "/tmp/evil-inject/bin:/opt/attacker/sbin");

        let rt = GVisorRuntime::with_path("/fake/runsc".to_string());
        let tmp = tempfile::tempdir().unwrap();
        let env = rt.exec_environment(tmp.path()).unwrap();
        let path_entry = env
            .iter()
            .find(|e| e.to_str().is_ok_and(|s| s.starts_with("PATH=")))
            .expect("exec_environment must set PATH");
        let path_val = path_entry.to_str().unwrap();
        assert!(
            !path_val.contains("evil-inject") && !path_val.contains("attacker"),
            "exec_environment must use hardcoded PATH, not host PATH. Got: {}",
            path_val
        );
        assert_eq!(
            path_val, "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
            "exec_environment PATH must be the standard hardcoded value"
        );
    }

    #[test]
    fn test_runsc_owner_rejects_untrusted_non_store_owner() {
        assert!(!GVisorRuntime::is_trusted_runsc_owner(
            Path::new("/tmp/runsc"),
            4242,
            1000
        ));
    }
}